Extract flickr_mail package with Mapped models and shared utilities

Move from JSON file storage to SQLite database using SQLAlchemy with
Mapped type hints. Deduplicate URL utility functions into shared
flickr_mail package.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Edward Betts 2026-02-07 13:07:23 +00:00
parent ac1b01ea68
commit 9f0fb01878
11 changed files with 1129 additions and 300 deletions

View file

@ -1,6 +1,6 @@
#!/usr/bin/env python3
"""
Find UploadWizard contributions that are from Flickr and add them to flickr_uploads.json.
Find UploadWizard contributions that are from Flickr and add them to the database.
For contributions with comment 'User created page with UploadWizard', queries the
Commons API to check if the image source is Flickr (by checking the Credit field).
@ -9,12 +9,13 @@ Commons API to check if the image source is Flickr (by checking the Credit field
import json
import re
import time
from pathlib import Path
import requests
CONTRIBUTIONS_FILE = Path("commons_contributions/contributions.json")
FLICKR_UPLOADS_FILE = Path("commons_contributions/flickr_uploads.json")
from flickr_mail.database import init_db, get_session
from flickr_mail.models import Contribution, FlickrUpload, SentMessage
from flickr_mail.url_utils import normalize_flickr_url
COMMONS_API = "https://commons.wikimedia.org/w/api.php"
USER_AGENT = "FlickrMail/1.0 (https://edwardbetts.com/flickr_mail/; edward@4angle.com)"
@ -75,99 +76,101 @@ def clean_artist_name(artist_html: str) -> str:
def main():
    """Scan UploadWizard contributions for Flickr-sourced images and store them.

    Queries the local database for contributions whose edit comment is
    'User created page with UploadWizard' on File: pages, fetches their
    Commons metadata in batches of 50, extracts a Flickr source URL from
    the Credit field, and inserts any previously-unseen uploads as
    FlickrUpload rows (linked to a matching SentMessage when one exists).

    Side effects: initialises the database, inserts rows, commits per
    batch, prints progress, and sleeps 0.5s between API batches.
    Rolls back and re-raises on any error; always closes the session.
    """
    print("Loading contributions...")
    init_db()
    session = get_session()

    try:
        # Normalized Flickr URLs already recorded, used for dedup.
        # (Skip empty/NULL values so `normalized in existing_urls` is meaningful.)
        existing_urls = {
            r[0] for r in session.query(FlickrUpload.normalized_flickr_url).all()
            if r[0]
        }
        print(f"Existing uploads: {session.query(FlickrUpload).count()}")
        print(f"Existing flickr URLs: {len(existing_urls)}")

        # Build an index of outbound (non-reply) sent messages keyed by
        # normalized Flickr URL, for FK linking of new uploads.
        sent_messages = (
            session.query(SentMessage)
            .filter(SentMessage.normalized_flickr_url != "")
            .filter(~SentMessage.subject.startswith("Re:"))
            .all()
        )
        url_to_message = {msg.normalized_flickr_url: msg for msg in sent_messages}

        # Find UploadWizard contributions (page creations only, File: namespace).
        upload_wizard = (
            session.query(Contribution)
            .filter(Contribution.comment == "User created page with UploadWizard")
            .filter(Contribution.title.startswith("File:"))
            .all()
        )
        print(f"UploadWizard contributions to check: {len(upload_wizard)}")

        # Process in batches of 50 (the Commons API title limit per request).
        new_count = 0
        batch_size = 50

        for i in range(0, len(upload_wizard), batch_size):
            batch = upload_wizard[i : i + batch_size]
            titles = [c.title for c in batch]
            print(
                f"Processing batch {i // batch_size + 1}/"
                f"{(len(upload_wizard) + batch_size - 1) // batch_size}..."
            )

            metadata = get_image_metadata(titles)

            for c in batch:
                meta = metadata.get(c.title, {})
                credit = meta.get("credit", "")
                artist = meta.get("artist", "")

                flickr_url = extract_flickr_url_from_credit(credit)
                if not flickr_url:
                    continue

                normalized = normalize_flickr_url(flickr_url)
                if normalized in existing_urls:
                    continue

                creator = clean_artist_name(artist) if artist else None

                # Look up a previously-sent message for FK linking.
                msg = url_to_message.get(normalized) if normalized else None

                session.add(FlickrUpload(
                    pageid=c.pageid,
                    revid=c.revid,
                    title=c.title,
                    timestamp=c.timestamp,
                    flickr_url=flickr_url,
                    normalized_flickr_url=normalized,
                    creator=creator,
                    wikipedia_url=msg.wikipedia_url if msg else "",
                    creator_profile_url=msg.creator_profile_url if msg else "",
                    sent_message_id=msg.message_id if msg else None,
                ))
                new_count += 1
                # Track in-memory too so duplicates within this run are skipped.
                existing_urls.add(normalized)
                print(f"  Found: {c.title[:50]} -> {flickr_url}")

            # Commit each batch so progress survives an interruption.
            session.commit()

            # Rate limiting: be polite to the Commons API between batches.
            if i + batch_size < len(upload_wizard):
                time.sleep(0.5)

        total = session.query(FlickrUpload).count()
        print(f"\nFound {new_count} new Flickr uploads")
        print(f"Total: {total} uploads in database")
    except Exception:
        # Roll back any uncommitted batch, then propagate the error.
        session.rollback()
        raise
    finally:
        session.close()
if __name__ == "__main__":