Extract flickr_mail package with Mapped models and shared utilities
Move from JSON file storage to SQLite database using SQLAlchemy with Mapped type hints. Deduplicate URL utility functions into shared flickr_mail package. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
ac1b01ea68
commit
9f0fb01878
11 changed files with 1129 additions and 300 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -2,3 +2,4 @@
|
||||||
__pycache__
|
__pycache__
|
||||||
commons_contributions/thumbnail_cache.json
|
commons_contributions/thumbnail_cache.json
|
||||||
commons_contributions/sent_mail_index.json
|
commons_contributions/sent_mail_index.json
|
||||||
|
flickr_mail.db
|
||||||
|
|
|
||||||
147
download_commons_contributions.py
Executable file
147
download_commons_contributions.py
Executable file
|
|
@ -0,0 +1,147 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Download Wikimedia Commons contributions for a user."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from flickr_mail.database import init_db, get_session
|
||||||
|
from flickr_mail.models import Contribution
|
||||||
|
|
||||||
|
|
||||||
|
API_URL = "https://commons.wikimedia.org/w/api.php"
|
||||||
|
USERNAME = "Edward"
|
||||||
|
|
||||||
|
# Identify ourselves properly to Wikimedia
|
||||||
|
USER_AGENT = "CommonsContributionsDownloader/0.1 (edward@4angle.com)"
|
||||||
|
|
||||||
|
SESSION = requests.Session()
|
||||||
|
SESSION.headers.update({"User-Agent": USER_AGENT})
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_contributions(
    continue_token: str | None = None,
) -> tuple[list[dict], str | None]:
    """Fetch one batch of contributions from the MediaWiki API.

    Args:
        continue_token: opaque ``uccontinue`` cursor from a previous call,
            or None to start from the newest contributions.

    Returns:
        A ``(contributions, next_token)`` pair; ``next_token`` is None
        when the API reports no further pages.
    """
    query = {
        "action": "query",
        "list": "usercontribs",
        "ucuser": USERNAME,
        "uclimit": "500",
        "ucprop": "ids|title|timestamp|comment|size|sizediff|flags|tags",
        "format": "json",
    }
    if continue_token:
        query["uccontinue"] = continue_token

    resp = SESSION.get(API_URL, params=query)
    resp.raise_for_status()
    payload = resp.json()

    batch = payload.get("query", {}).get("usercontribs", [])
    # Continuation token is only present while more results are available.
    next_token = payload.get("continue", {}).get("uccontinue")
    return batch, next_token
|
||||||
|
|
||||||
|
|
||||||
|
def upsert_contribution(session, c: dict) -> None:
    """Insert a contribution row keyed by revid, skipping duplicates.

    Despite the name, existing rows are never modified: if a row with the
    same revid is already present the function returns without touching it.
    """
    already_stored = (
        session.query(Contribution).filter_by(revid=c["revid"]).first()
    )
    if already_stored is not None:
        return  # Already have this revision

    # All scalar API fields map 1:1 onto model columns.
    scalar_fields = (
        "userid", "user", "pageid", "revid", "parentid", "ns", "title",
        "timestamp", "minor", "top", "comment", "size", "sizediff",
    )
    values = {name: c.get(name) for name in scalar_fields}
    # tags arrives as a list; the column stores it as a JSON string.
    values["tags"] = json.dumps(c.get("tags", []))
    session.add(Contribution(**values))
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Sync Wikimedia Commons contributions into the local database.

    Pages through the usercontribs API and inserts rows until either the
    API is exhausted or an entire batch already exists locally — at which
    point earlier runs are assumed to cover the remainder.
    """
    init_db()
    session = get_session()

    try:
        existing_count = session.query(Contribution).count()

        # Get the latest timestamp to know where to resume from
        latest = (
            session.query(Contribution)
            .order_by(Contribution.timestamp.desc())
            .first()
        )

        if existing_count > 0 and latest:
            print(f"Database has {existing_count} contributions")
            print(f"Latest: {latest.timestamp}")
            print("Fetching new contributions...")
        else:
            print(f"Downloading contributions for user: {USERNAME}")

        batch_num = 0
        new_count = 0
        continue_token = None  # opaque API continuation cursor

        while True:
            batch_num += 1
            print(f" Fetching batch {batch_num}...", end=" ", flush=True)

            contributions, continue_token = fetch_contributions(continue_token)

            if not contributions:
                print("no results")
                break

            batch_new = 0
            for c in contributions:
                # Stop if we've reached contributions we already have
                existing = session.query(Contribution).filter_by(revid=c["revid"]).first()
                if existing:
                    continue
                upsert_contribution(session, c)
                batch_new += 1

            new_count += batch_new
            print(f"got {len(contributions)}, {batch_new} new")

            # Commit per batch so progress survives a crash mid-run.
            session.commit()

            if batch_new == 0:
                # All contributions in this batch already exist, we're caught up
                print(" Caught up with existing data")
                break

            if not continue_token:
                break

            # Be polite to the API
            time.sleep(0.5)

        total = session.query(Contribution).count()
        print(f"\nDone! {new_count} new contributions, {total} total in database")

    except Exception:
        session.rollback()
        raise
    finally:
        session.close()


if __name__ == "__main__":
    main()
|
||||||
246
download_sent_mail.py
Executable file
246
download_sent_mail.py
Executable file
|
|
@ -0,0 +1,246 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Download sent FlickrMail messages for backup."""
|
||||||
|
|
||||||
|
import time
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from flickr_mail.database import init_db, get_session
|
||||||
|
from flickr_mail.models import SentMessage
|
||||||
|
from flickr_mail.url_utils import (
|
||||||
|
creator_profile_from_flickr_url,
|
||||||
|
extract_urls_from_message,
|
||||||
|
normalize_flickr_url,
|
||||||
|
)
|
||||||
|
|
||||||
|
BASE_URL = "https://www.flickr.com"
|
||||||
|
SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}"
|
||||||
|
MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}"
|
||||||
|
|
||||||
|
HEADERS = {
|
||||||
|
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0",
|
||||||
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||||
|
"Accept-Language": "en-GB,en;q=0.9",
|
||||||
|
"Accept-Encoding": "gzip, deflate, br, zstd",
|
||||||
|
"DNT": "1",
|
||||||
|
"Connection": "keep-alive",
|
||||||
|
"Upgrade-Insecure-Requests": "1",
|
||||||
|
"Sec-Fetch-Dest": "document",
|
||||||
|
"Sec-Fetch-Mode": "navigate",
|
||||||
|
"Sec-Fetch-Site": "same-origin",
|
||||||
|
"Sec-Fetch-User": "?1",
|
||||||
|
"Priority": "u=0, i",
|
||||||
|
}
|
||||||
|
|
||||||
|
COOKIES_STR = """ccc=%7B%22needsConsent%22%3Atrue%2C%22managed%22%3A0%2C%22changed%22%3A0%2C%22info%22%3A%7B%22cookieBlock%22%3A%7B%22level%22%3A2%2C%22blockRan%22%3A1%7D%7D%7D; _sp_ses.df80=*; _sp_id.df80=968931de-089d-4576-b729-6662c2c13a65.1770187027.1.1770187129..adf2374b-b85c-4899-afb7-63c2203d0c44..9422de57-9cdf-49c9-ac54-183eaa1ec457.1770187027101.24; TAsessionID=7f373c97-e9f8-46cb-bc1a-cb4f164ce46b|NEW; notice_behavior=expressed,eu; usprivacy=1---; acstring=3~550.1942.3126.3005.3077.1329.196.1725.1092; euconsent-v2=CQfGXgAQfGXgAAvACDENCQFsAP_gAEPgAAAALktB9G5cSSFBYCJVYbtEYAQDwFhg4oAhAgABEwAATBoAoIwGBGAoIAiAICACAAAAIARAIAEECAAAQAAAIIABAAAMAEAAIAACIAAACAABAgAACEAIAAggWAAAAEBEAFQAgAAAQBIACFAAAgABAUABAAAAAACAAQAAACAgQAAAAAAAAAAAkAhAAAAAAAAAABAMAAABIAAAAAAAAAAAAAAAAAAABAAAAICBAAAAQAAAAAAAAAAAAAAAAAAAAgqY0H0blxJIUFgIFVhu0QgBBPAWADigCEAAAEDAABMGgCgjAIUYCAgSIAgIAAAAAAgBEAgAQAIAABAAAAAgAEAAAwAQAAgAAAAAAAAAAECAAAAQAgACCBYAAAAQEQAVACBAABAEgAIUAAAAAEBQAEAAAAAAIABAAAAICBAAAAAAAAAAACQCEAAAAAAAAAAEAwBAAEgAAAAAAAAAAAAAAAAAAAEABAAgIEAAABAA.YAAAAAAAAAAA.ILktB9G5cSSFBYCJVYbtEYAQTwFhg4oAhAgABEwAATBoAoIwGFGAoIEiAICACAAAAIARAIAEECAAAQAAAIIABAAAMAEAAIAACIAAACAABAgAACEAIAAggWAAAAEBEAFQAgQAAQBIACFAAAgABAUABAAAAAACAAQAAACAgQAAAAAAAAAAAkAhAAAAAAAAAABAMAQABIAAAAAAAAAAAAAAAAAAABAAQAICBAAAAQAAAAAAAAAAAAAAAAAAAAgA; notice_preferences=2:; notice_gdpr_prefs=0,1,2:; cmapi_gtm_bl=; cmapi_cookie_privacy=permit 1,2,3; AMCV_48E815355BFE96970A495CD0%40AdobeOrg=281789898%7CMCMID%7C44859851125632937290373504988866174366%7CMCOPTOUT-1770194232s%7CNONE%7CvVersion%7C4.1.0; AMCVS_48E815355BFE96970A495CD0%40AdobeOrg=1; xb=646693; localization=en-us%3Buk%3Bgb; flrbp=1770187037-cfbf3914859af9ef68992c8389162e65e81c86c4; flrbgrp=1770187037-8e700fa7d73b4f2d43550f40513e7c6f507fd20f; flrbgdrp=1770187037-9af21cc74000b5f3f0943243608b4284d5f60ffd; flrbgmrp=1770187037-53f7bfff110731954be6bdfb2f587d59a8305670; flrbrst=1770187037-440e42fcee9b4e8e81ba8bc3eb3d0fc8b62e7083; 
flrtags=1770187037-7b50035cb956b9216a2f3372f498f7008d8e26a8; flrbrp=1770187037-c0195dc99caa020d4e32b39556131add862f26a0; flrb=34; session_id=2693fb01-87a0-42b1-a426-74642807b534; cookie_session=834645%3A29f2a9722d8bac88553ea1baf7ea11b4; cookie_accid=834645; cookie_epass=29f2a9722d8bac88553ea1baf7ea11b4; sa=1775371036%3A79962317%40N00%3A8fb60f4760b4840f37af3ebc90a8cb57; vp=2075%2C1177%2C1%2C0; flrbfd=1770187037-88a4e436729c9c5551794483fbd9c80e9dac2354; flrbpap=1770187037-18adaacf3a389df4a7bdc05cd471e492c54ef841; liqpw=2075; liqph=672"""
|
||||||
|
|
||||||
|
|
||||||
|
def parse_cookies(cookie_str: str) -> dict[str, str]:
    """Parse a Cookie header string into a name -> value dict.

    Splits on ``;`` and strips surrounding whitespace from each pair.
    Fixes a bug in the original ``split("; ")`` approach: COOKIES_STR is a
    multi-line triple-quoted string, so the pair following the line break
    ended up with a leading newline embedded in its cookie *name*.
    Items without ``=`` are ignored; values keep any embedded ``=``.
    """
    cookies: dict[str, str] = {}
    for item in cookie_str.split(";"):
        item = item.strip()
        if "=" in item:
            key, value = item.split("=", 1)
            cookies[key.strip()] = value
    return cookies
|
||||||
|
|
||||||
|
|
||||||
|
def create_session() -> requests.Session:
    """Build a requests session preloaded with browser headers and cookies.

    NOTE(review): COOKIES_STR embeds live Flickr session credentials in
    source — consider loading them from an untracked file or environment.
    """
    http = requests.Session()
    http.headers.update(HEADERS)
    http.cookies.update(parse_cookies(COOKIES_STR))
    return http
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_page(session: requests.Session, url: str) -> BeautifulSoup:
    """GET *url* via *session* and return the parsed HTML document.

    Raises requests.HTTPError for non-2xx responses.
    """
    resp = session.get(url)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")
|
||||||
|
|
||||||
|
|
||||||
|
def extract_messages_from_list_page(soup: BeautifulSoup) -> list[dict]:
    """Extract message metadata from one sent-mail listing page.

    Each ``<tr class="message_row">`` yields a dict containing
    ``message_id`` plus (when present) ``subject``, ``url``, ``recipient``
    and ``date``. Rows whose id attribute does not encode a message id
    are dropped.
    """
    results: list[dict] = []

    for row in soup.select("tr.message_row"):
        # Rows look like <tr class="message_row sent" id="message_row_XXXX">
        row_id = row.get("id", "")
        if not row_id.startswith("message_row_"):
            continue
        entry: dict = {"message_id": row_id.replace("message_row_", "")}

        # Subject + permalink live in the subject cell's anchor.
        subj_cell = row.select_one("td.subj")
        link = subj_cell.find("a") if subj_cell else None
        if link:
            entry["subject"] = link.get_text(strip=True)
            entry["url"] = BASE_URL + link["href"]

        # Recipient is in td.fromto
        fromto_cell = row.select_one("td.fromto")
        if fromto_cell:
            entry["recipient"] = fromto_cell.get_text(strip=True)

        # Date is in td.date
        date_cell = row.select_one("td.date")
        if date_cell:
            entry["date"] = date_cell.get_text(strip=True)

        results.append(entry)

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def extract_message_content(soup: BeautifulSoup) -> dict:
    """Extract full message content from a message page.

    Returns a dict with any of "recipient", "subject", "body" and
    "body_html" that could be parsed; returns an empty dict when the
    expected .ThinCase/table structure is missing.
    """
    content = {}

    # Find the ThinCase div containing the message
    thin_case = soup.select_one(".ThinCase")
    if not thin_case:
        return content

    # Find the table with message content
    table = thin_case.find("table")
    if not table:
        return content

    rows = table.find_all("tr", recursive=False)

    # Row 0: To: <recipient>
    # Row 1: Subject: <subject>
    # Row 2: <empty> <body>
    for row in rows:
        cells = row.find_all("td", recursive=False)
        if len(cells) >= 2:
            header_cell = cells[0]
            value_cell = cells[1]

            header = header_cell.get_text(strip=True).lower()

            if header == "to:":
                # Get recipient username
                username = value_cell.select_one(".username")
                if username:
                    content["recipient"] = username.get_text(strip=True)

            elif header == "subject:":
                # Get subject from h3
                h3 = value_cell.find("h3")
                if h3:
                    content["subject"] = h3.get_text(strip=True)

            elif header == "":
                # This is the message body row (empty header cell)
                # Get the content but exclude the delete form
                form = value_cell.find("form")
                if form:
                    form.decompose()  # mutates the soup in place

                content["body"] = value_cell.get_text(separator="\n", strip=True)
                content["body_html"] = str(value_cell)
                break  # Body found, stop processing

    return content
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Scrape new sent FlickrMail messages into the local database.

    Walks every sent-mail listing page to collect message ids not yet
    stored, then downloads each message page, extracts its content and
    embedded URLs, and commits one SentMessage row per message.
    """
    init_db()
    db_session = get_session()

    try:
        existing_ids = {
            r[0] for r in db_session.query(SentMessage.message_id).all()
        }
        print(f"Database has {len(existing_ids)} messages")

        http_session = create_session()

        # Scrape all pages to find new messages
        # NOTE(review): page count is hard-coded — update when the sent
        # mailbox grows past 29 pages, or derive it from the pagination UI.
        total_pages = 29
        new_messages: list[dict] = []

        print("Fetching message list from all pages...")
        for page in range(1, total_pages + 1):
            url = SENT_MAIL_URL.format(page=page)
            print(f" Fetching page {page}/{total_pages}...")

            try:
                soup = fetch_page(http_session, url)
                page_messages = extract_messages_from_list_page(soup)

                for msg in page_messages:
                    if msg["message_id"] not in existing_ids:
                        new_messages.append(msg)

                time.sleep(1)  # Be polite to the server

            except Exception as e:
                # Best-effort: one bad page should not abort the whole run.
                print(f" Error fetching page {page}: {e}")
                continue

        print(f"Found {len(new_messages)} new messages to download")

        # Download individual messages
        for i, msg in enumerate(new_messages, 1):
            msg_id = msg["message_id"]
            # Fall back to the canonical URL when the listing had no link.
            url = msg.get("url") or MESSAGE_URL.format(message_id=msg_id)

            print(f" [{i}/{len(new_messages)}] Downloading message {msg_id}...")

            try:
                soup = fetch_page(http_session, url)
                content = extract_message_content(soup)

                # Merge with metadata
                full_msg = {**msg, **content}

                body = full_msg.get("body", "")
                flickr_url, wikipedia_url = extract_urls_from_message(body)
                normalized = normalize_flickr_url(flickr_url) if flickr_url else ""
                creator_profile = creator_profile_from_flickr_url(flickr_url) if flickr_url else ""

                db_session.add(SentMessage(
                    message_id=msg_id,
                    subject=full_msg.get("subject", ""),
                    url=full_msg.get("url", ""),
                    recipient=full_msg.get("recipient", ""),
                    date=full_msg.get("date", ""),
                    body=body,
                    body_html=full_msg.get("body_html", ""),
                    flickr_url=flickr_url,
                    normalized_flickr_url=normalized,
                    wikipedia_url=wikipedia_url,
                    creator_profile_url=creator_profile,
                ))
                # Commit per message so progress survives individual errors.
                db_session.commit()

                time.sleep(1)  # Be polite

            except Exception as e:
                db_session.rollback()
                print(f" Error downloading message {msg_id}: {e}")
                continue

        total = db_session.query(SentMessage).count()
        print(f"Done! {total} messages in database")

    except Exception:
        db_session.rollback()
        raise
    finally:
        db_session.close()


if __name__ == "__main__":
    main()
|
||||||
158
extract_flickr_uploads.py
Normal file
158
extract_flickr_uploads.py
Normal file
|
|
@ -0,0 +1,158 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Extract Flickr uploads from Wikimedia Commons contributions.
|
||||||
|
|
||||||
|
Filters contributions where the comment contains a flickr.com URL and extracts:
|
||||||
|
- pageid, revid, title, timestamp
|
||||||
|
- flickr_url: the Flickr photo URL
|
||||||
|
- creator: the photographer/author name
|
||||||
|
|
||||||
|
Links uploads to sent messages via normalized Flickr URL matching.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from flickr_mail.database import init_db, get_session
|
||||||
|
from flickr_mail.models import Contribution, FlickrUpload, SentMessage
|
||||||
|
from flickr_mail.url_utils import normalize_flickr_url
|
||||||
|
|
||||||
|
|
||||||
|
def extract_flickr_url(comment: str) -> str | None:
|
||||||
|
"""Extract the Flickr photo URL from a comment."""
|
||||||
|
# Match URLs like https://www.flickr.com/photos/user/12345/ or http://www.flickr.com/photos/user/12345/
|
||||||
|
# Also handles [http://www.flickr.com/photos/user/12345/ title] wiki markup
|
||||||
|
patterns = [
|
||||||
|
# Plain URL (modern format)
|
||||||
|
r'(https?://(?:www\.)?flickr\.com/photos/[^/\s\]]+/\d+)/?',
|
||||||
|
# URL in wiki markup [url title]
|
||||||
|
r'\[(https?://(?:www\.)?flickr\.com/photos/[^/\s\]]+/\d+)/?[^\]]*\]',
|
||||||
|
]
|
||||||
|
|
||||||
|
for pattern in patterns:
|
||||||
|
match = re.search(pattern, comment)
|
||||||
|
if match:
|
||||||
|
return match.group(1)
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_creator(comment: str) -> str | None:
|
||||||
|
"""Extract the creator/author name from a comment."""
|
||||||
|
# Modern format: "Uploaded a work by {creator} from https://..."
|
||||||
|
modern_match = re.search(r'Uploaded a work by (.+?) from https?://', comment)
|
||||||
|
if modern_match:
|
||||||
|
return modern_match.group(1).strip()
|
||||||
|
|
||||||
|
# Old {{Information}} format: |Author=[http://www.flickr.com/people/... AuthorName] or |Author=[http://... AuthorName] from Location
|
||||||
|
# The author name comes after the URL, before ] or "from"
|
||||||
|
author_match = re.search(r'\|Author=\[https?://[^\s\]]+ ([^\]]+)\]', comment)
|
||||||
|
if author_match:
|
||||||
|
author = author_match.group(1).strip()
|
||||||
|
# Remove trailing location like "from Toronto, Canada"
|
||||||
|
author = re.sub(r'\s+from\s+.+$', '', author)
|
||||||
|
return author
|
||||||
|
|
||||||
|
# Handle truncated comments where Author field is cut off
|
||||||
|
# Pattern: |Author=[http://...flickr.com/people/... AuthorName (may be incomplete)
|
||||||
|
truncated_match = re.search(r'\|Author=\[https?://[^\s\]]+ ([^\]\|]+)$', comment)
|
||||||
|
if truncated_match:
|
||||||
|
author = truncated_match.group(1).strip()
|
||||||
|
if author:
|
||||||
|
return author
|
||||||
|
|
||||||
|
# Sometimes Author field is just plain text without URL
|
||||||
|
author_plain = re.search(r'\|Author=([^\|\}\[\]]+?)(?:\r?\n|\|)', comment)
|
||||||
|
if author_plain:
|
||||||
|
author = author_plain.group(1).strip()
|
||||||
|
# Skip if it looks like a wiki user link
|
||||||
|
if not author.startswith('[[User:') and author:
|
||||||
|
return author
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Process contributions and extract Flickr uploads.

    Scans stored Commons contributions whose edit summary mentions
    flickr.com, extracts the photo URL and creator name, and links each
    upload to a matching sent message via the normalized photo URL.
    """
    init_db()
    session = get_session()

    try:
        # Get existing upload revids to avoid duplicates
        existing_revids = {
            r[0] for r in session.query(FlickrUpload.revid).all()
        }

        # Build sent message index: normalized_flickr_url -> message
        # "Re:" subjects are excluded — presumably so reply threads don't
        # shadow the original outgoing message for a given photo URL.
        sent_messages = (
            session.query(SentMessage)
            .filter(SentMessage.normalized_flickr_url != "")
            .filter(~SentMessage.subject.startswith("Re:"))
            .all()
        )
        url_to_message = {msg.normalized_flickr_url: msg for msg in sent_messages}
        print(f"Sent message index: {len(url_to_message)} entries")

        # Query contributions with flickr.com in comment
        contributions = (
            session.query(Contribution)
            .filter(Contribution.comment.ilike("%flickr.com%"))
            .all()
        )

        print(f"Found {len(contributions)} contributions mentioning flickr.com")

        new_count = 0
        for contrib in contributions:
            if contrib.revid in existing_revids:
                continue

            flickr_url = extract_flickr_url(contrib.comment or "")
            if not flickr_url:
                continue

            creator = extract_creator(contrib.comment or "")
            normalized = normalize_flickr_url(flickr_url)

            # Look up sent message for FK linking
            msg = url_to_message.get(normalized) if normalized else None

            session.add(FlickrUpload(
                pageid=contrib.pageid,
                revid=contrib.revid,
                title=contrib.title,
                timestamp=contrib.timestamp,
                flickr_url=flickr_url,
                normalized_flickr_url=normalized,
                creator=creator,
                wikipedia_url=msg.wikipedia_url if msg else "",
                creator_profile_url=msg.creator_profile_url if msg else "",
                sent_message_id=msg.message_id if msg else None,
            ))
            new_count += 1

        session.commit()

        total = session.query(FlickrUpload).count()
        linked = session.query(FlickrUpload).filter(
            FlickrUpload.sent_message_id.isnot(None)
        ).count()

        print(f"Extracted {new_count} new Flickr uploads")
        print(f"Total: {total} uploads, {linked} linked to sent messages")

        # Show some stats
        with_creator = session.query(FlickrUpload).filter(
            FlickrUpload.creator.isnot(None)
        ).count()
        print(f" - {with_creator} with creator identified")
        print(f" - {total - with_creator} without creator")

    except Exception:
        session.rollback()
        raise
    finally:
        session.close()


if __name__ == '__main__':
    main()
|
||||||
0
flickr_mail/__init__.py
Normal file
0
flickr_mail/__init__.py
Normal file
31
flickr_mail/database.py
Normal file
31
flickr_mail/database.py
Normal file
|
|
@ -0,0 +1,31 @@
|
||||||
|
"""Database engine and session factory for flickr-mail."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from sqlalchemy import create_engine, event
|
||||||
|
from sqlalchemy.orm import Session, sessionmaker
|
||||||
|
|
||||||
|
from flickr_mail.models import Base
|
||||||
|
|
||||||
|
DB_PATH = Path(__file__).parent.parent / "flickr_mail.db"
|
||||||
|
|
||||||
|
engine = create_engine(f"sqlite:///{DB_PATH}")
|
||||||
|
SessionLocal = sessionmaker(bind=engine)
|
||||||
|
|
||||||
|
|
||||||
|
@event.listens_for(engine, "connect")
def set_sqlite_pragma(dbapi_connection, connection_record):
    """Enable WAL mode for concurrent read/write access.

    Runs on every new DBAPI connection; WAL lets readers proceed while a
    writer is active, which matters because several scripts in this repo
    open the same database file.
    """
    cursor = dbapi_connection.cursor()
    cursor.execute("PRAGMA journal_mode=WAL")
    cursor.close()
|
||||||
|
|
||||||
|
|
||||||
|
def init_db() -> None:
    """Create all tables declared on Base.metadata (no-op if they exist)."""
    Base.metadata.create_all(engine)
|
||||||
|
|
||||||
|
|
||||||
|
def get_session() -> Session:
    """Create a new database session bound to the module-level engine.

    The caller is responsible for commit/rollback and close.
    """
    return SessionLocal()
|
||||||
93
flickr_mail/models.py
Normal file
93
flickr_mail/models.py
Normal file
|
|
@ -0,0 +1,93 @@
|
||||||
|
"""SQLAlchemy models for flickr-mail."""
|
||||||
|
|
||||||
|
from sqlalchemy import ForeignKey, Index, Text
|
||||||
|
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
|
||||||
|
|
||||||
|
|
||||||
|
class Base(DeclarativeBase):
    """Declarative base shared by all flickr-mail ORM models."""
    pass
|
||||||
|
|
||||||
|
|
||||||
|
class Contribution(Base):
    """One Wikimedia Commons edit, as returned by list=usercontribs."""

    __tablename__ = "contributions"

    id: Mapped[int] = mapped_column(primary_key=True)  # surrogate key
    userid: Mapped[int | None]
    user: Mapped[str | None]
    pageid: Mapped[int | None]
    # Natural key used for dedup by the downloader/extractor scripts.
    revid: Mapped[int | None] = mapped_column(unique=True)
    parentid: Mapped[int | None]
    ns: Mapped[int | None]  # namespace number
    title: Mapped[str | None]
    timestamp: Mapped[str | None]  # stored as the API's string form
    minor: Mapped[str | None]
    top: Mapped[str | None]
    comment: Mapped[str | None] = mapped_column(Text)  # edit summary
    size: Mapped[int | None]
    sizediff: Mapped[int | None]
    tags: Mapped[str | None] = mapped_column(Text)  # JSON array stored as text

    __table_args__ = (
        Index("ix_contributions_timestamp", "timestamp"),
        Index("ix_contributions_pageid", "pageid"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
class SentMessage(Base):
    """A FlickrMail message scraped from the sent-mail pages."""

    __tablename__ = "sent_messages"

    # Flickr's own message id (taken from the message_row_<id> DOM id).
    message_id: Mapped[str] = mapped_column(primary_key=True)
    subject: Mapped[str | None]
    url: Mapped[str | None]  # message permalink on flickr.com
    recipient: Mapped[str | None]
    date: Mapped[str | None]  # display date string as scraped
    body: Mapped[str | None] = mapped_column(Text)  # plain-text body
    body_html: Mapped[str | None] = mapped_column(Text)  # raw HTML of the body cell
    flickr_url: Mapped[str | None]  # first photo URL found in the body
    normalized_flickr_url: Mapped[str | None]  # join key against flickr_uploads
    wikipedia_url: Mapped[str | None]
    creator_profile_url: Mapped[str | None]

    # Uploads whose normalized photo URL matched this message.
    flickr_uploads: Mapped[list["FlickrUpload"]] = relationship(
        back_populates="sent_message"
    )

    __table_args__ = (
        Index("ix_sent_messages_recipient", "recipient"),
        Index("ix_sent_messages_normalized_flickr_url", "normalized_flickr_url"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
class FlickrUpload(Base):
    """A Commons upload sourced from Flickr, derived from a Contribution."""

    __tablename__ = "flickr_uploads"

    id: Mapped[int] = mapped_column(primary_key=True)
    pageid: Mapped[int | None]
    revid: Mapped[int | None]  # upload revision; used for dedup by the extractor
    title: Mapped[str | None]  # Commons file title
    timestamp: Mapped[str | None]
    flickr_url: Mapped[str | None]  # source photo URL from the edit summary
    normalized_flickr_url: Mapped[str | None]  # join key against sent_messages
    creator: Mapped[str | None]  # photographer name parsed from the summary
    wikipedia_url: Mapped[str | None]  # copied from the linked sent message
    creator_profile_url: Mapped[str | None]
    # Set when a sent FlickrMail message references the same photo.
    sent_message_id: Mapped[str | None] = mapped_column(
        ForeignKey("sent_messages.message_id")
    )

    sent_message: Mapped[SentMessage | None] = relationship(
        back_populates="flickr_uploads"
    )

    __table_args__ = (
        Index("ix_flickr_uploads_normalized_flickr_url", "normalized_flickr_url"),
        Index("ix_flickr_uploads_timestamp", "timestamp"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
class ThumbnailCache(Base):
    """Cached thumbnail URL lookups, keyed by file title."""

    __tablename__ = "thumbnail_cache"

    title: Mapped[str] = mapped_column(primary_key=True)
    # NOTE(review): thumb_url may be None — presumably a failed lookup;
    # confirm against the consumer in main.py.
    thumb_url: Mapped[str | None]
    fetched_at: Mapped[int | None]  # Unix timestamp
|
||||||
52
flickr_mail/url_utils.py
Normal file
52
flickr_mail/url_utils.py
Normal file
|
|
@ -0,0 +1,52 @@
|
||||||
|
"""Shared URL utility functions for flickr-mail."""
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_flickr_url(url: str) -> str:
    """Normalize a Flickr photo URL to 'flickr.com/...' form for comparison.

    Strips the scheme and a leading "www.", plus any trailing slash.
    Returns "" for anything that is not a flickr.com URL.

    Bug fix: the previous str.replace() calls removed "www." (and scheme
    text) anywhere in the string, which could corrupt path segments such
    as a username containing "www."; removeprefix() only touches the start.
    """
    # Remove protocol (only one will ever match; the other is a no-op)
    for prefix in ("https://", "http://"):
        url = url.removeprefix(prefix)
    # Remove leading www.
    url = url.removeprefix("www.")
    # Remove trailing slash
    url = url.rstrip("/")
    # Ensure it starts with flickr.com
    return url if url.startswith("flickr.com") else ""
|
||||||
|
|
||||||
|
|
||||||
|
def extract_urls_from_message(body: str) -> tuple[str, str]:
    """Extract the first Flickr photo URL and first Wikipedia URL from *body*.

    Either element is "" when no match is found; scheme-less matches are
    prefixed with "https://".
    """

    def _first_match(pattern: str) -> str:
        # findall returns whole matches because the patterns use only
        # non-capturing groups.
        hits = re.findall(pattern, body)
        if not hits:
            return ""
        url = hits[0]
        return url if url.startswith("http") else "https://" + url

    # Find flickr photo URLs
    flickr_url = _first_match(
        r"(?:https?://)?(?:www\.)?flickr\.com/photos/[^/\s]+/\d+"
    )
    # Find Wikipedia URLs
    wikipedia_url = _first_match(
        r"(?:https?://)?(?:www\.)?en\.wikipedia\.org/wiki/[^\s<\])]+"
    )
    return flickr_url, wikipedia_url
|
||||||
|
|
||||||
|
|
||||||
|
def creator_profile_from_flickr_url(flickr_url: str) -> str:
    """Derive the photographer's profile URL from a Flickr photo URL.

    e.g. ".../photos/<user>/12345" -> "https://www.flickr.com/photos/<user>".
    Returns "" when no "photos/<user>" segment is present.
    """
    segments = flickr_url.split("/")
    # Drop the final segment from the scan: a trailing "photos" with no
    # username after it cannot yield a profile.
    for idx, segment in enumerate(segments[:-1]):
        if segment == "photos":
            return f"https://www.flickr.com/photos/{segments[idx + 1]}"
    return ""
|
||||||
307
main.py
307
main.py
|
|
@ -9,14 +9,17 @@ import sys
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
import typing
|
import typing
|
||||||
from pathlib import Path
|
|
||||||
from urllib.parse import quote, unquote
|
from urllib.parse import quote, unquote
|
||||||
|
|
||||||
import flask
|
import flask
|
||||||
import requests
|
import requests
|
||||||
import werkzeug
|
import werkzeug
|
||||||
|
from sqlalchemy import func
|
||||||
from werkzeug.debug.tbtools import DebugTraceback
|
from werkzeug.debug.tbtools import DebugTraceback
|
||||||
|
|
||||||
|
from flickr_mail.database import get_session
|
||||||
|
from flickr_mail.models import FlickrUpload, SentMessage, ThumbnailCache
|
||||||
|
from flickr_mail.url_utils import extract_urls_from_message, normalize_flickr_url
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
@ -26,18 +29,6 @@ app.debug = False
|
||||||
|
|
||||||
enwiki = "en.wikipedia.org/wiki/"
|
enwiki = "en.wikipedia.org/wiki/"
|
||||||
|
|
||||||
# Path to Commons contributions data and sent mail
|
|
||||||
COMMONS_UPLOADS_FILE = (
|
|
||||||
Path(__file__).parent / "commons_contributions" / "flickr_uploads.json"
|
|
||||||
)
|
|
||||||
COMMONS_CACHE_FILE = (
|
|
||||||
Path(__file__).parent / "commons_contributions" / "thumbnail_cache.json"
|
|
||||||
)
|
|
||||||
SENT_MAIL_DIR = Path(__file__).parent / "sent_mail" / "messages"
|
|
||||||
SENT_MAIL_INDEX_FILE = Path(__file__).parent / "sent_mail" / "messages_index.json"
|
|
||||||
SENT_MAIL_INDEX_CACHE = (
|
|
||||||
Path(__file__).parent / "commons_contributions" / "sent_mail_index.json"
|
|
||||||
)
|
|
||||||
COMMONS_CACHE_MAX_AGE = 86400 * 7 # Cache for 7 days
|
COMMONS_CACHE_MAX_AGE = 86400 * 7 # Cache for 7 days
|
||||||
RECENT_UPLOADS_COUNT = 24
|
RECENT_UPLOADS_COUNT = 24
|
||||||
|
|
||||||
|
|
@ -165,132 +156,6 @@ class CommonsUpload:
|
||||||
return "Wikidata item" if self.is_wikidata_item else "Wikipedia article"
|
return "Wikidata item" if self.is_wikidata_item else "Wikipedia article"
|
||||||
|
|
||||||
|
|
||||||
def normalize_flickr_url(url: str) -> str:
|
|
||||||
"""Normalize a Flickr photo URL for comparison."""
|
|
||||||
# Remove protocol
|
|
||||||
url = url.replace("https://", "").replace("http://", "")
|
|
||||||
# Remove www.
|
|
||||||
url = url.replace("www.", "")
|
|
||||||
# Remove trailing slash
|
|
||||||
url = url.rstrip("/")
|
|
||||||
# Ensure it starts with flickr.com
|
|
||||||
if not url.startswith("flickr.com"):
|
|
||||||
return ""
|
|
||||||
return url
|
|
||||||
|
|
||||||
|
|
||||||
def extract_urls_from_message(body: str) -> tuple[str, str]:
|
|
||||||
"""Extract flickr URL and Wikipedia URL from message body."""
|
|
||||||
|
|
||||||
flickr_url = ""
|
|
||||||
wikipedia_url = ""
|
|
||||||
|
|
||||||
# Find flickr photo URLs
|
|
||||||
flickr_pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/[^/\s]+/\d+"
|
|
||||||
flickr_matches = re.findall(flickr_pattern, body)
|
|
||||||
if flickr_matches:
|
|
||||||
flickr_url = flickr_matches[0]
|
|
||||||
if not flickr_url.startswith("http"):
|
|
||||||
flickr_url = "https://" + flickr_url
|
|
||||||
|
|
||||||
# Find Wikipedia URLs
|
|
||||||
wiki_pattern = r"(?:https?://)?(?:www\.)?en\.wikipedia\.org/wiki/[^\s<\])]+"
|
|
||||||
wiki_matches = re.findall(wiki_pattern, body)
|
|
||||||
if wiki_matches:
|
|
||||||
wikipedia_url = wiki_matches[0]
|
|
||||||
if not wikipedia_url.startswith("http"):
|
|
||||||
wikipedia_url = "https://" + wikipedia_url
|
|
||||||
|
|
||||||
return flickr_url, wikipedia_url
|
|
||||||
|
|
||||||
|
|
||||||
def build_sent_mail_index() -> dict[str, dict[str, str]]:
|
|
||||||
"""Build an index of sent mail: normalized_flickr_url -> {wikipedia_url, recipient}."""
|
|
||||||
if not SENT_MAIL_DIR.exists():
|
|
||||||
return {}
|
|
||||||
|
|
||||||
# Check if we have a cached index
|
|
||||||
if SENT_MAIL_INDEX_CACHE.exists():
|
|
||||||
try:
|
|
||||||
with open(SENT_MAIL_INDEX_CACHE) as f:
|
|
||||||
cache = json.load(f)
|
|
||||||
# Check if cache is still valid (compare file count)
|
|
||||||
json_files = list(SENT_MAIL_DIR.glob("*.json"))
|
|
||||||
if cache.get("file_count") == len(json_files):
|
|
||||||
return cache.get("index", {})
|
|
||||||
except (json.JSONDecodeError, OSError):
|
|
||||||
pass
|
|
||||||
|
|
||||||
index: dict[str, dict[str, str]] = {}
|
|
||||||
json_files = list(SENT_MAIL_DIR.glob("*.json"))
|
|
||||||
|
|
||||||
for json_file in json_files:
|
|
||||||
try:
|
|
||||||
with open(json_file) as f:
|
|
||||||
message = json.load(f)
|
|
||||||
except (json.JSONDecodeError, OSError):
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Skip replies - we want original requests
|
|
||||||
subject = message.get("subject", "")
|
|
||||||
if subject.startswith("Re:"):
|
|
||||||
continue
|
|
||||||
|
|
||||||
body = message.get("body", "")
|
|
||||||
flickr_url, wikipedia_url = extract_urls_from_message(body)
|
|
||||||
|
|
||||||
if not flickr_url:
|
|
||||||
continue
|
|
||||||
|
|
||||||
normalized = normalize_flickr_url(flickr_url)
|
|
||||||
if not normalized:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Extract creator profile URL from flickr URL
|
|
||||||
# flickr.com/photos/username/12345 -> flickr.com/photos/username
|
|
||||||
parts = flickr_url.split("/")
|
|
||||||
creator_profile = ""
|
|
||||||
for i, part in enumerate(parts):
|
|
||||||
if part == "photos" and i + 1 < len(parts):
|
|
||||||
username = parts[i + 1]
|
|
||||||
creator_profile = f"https://www.flickr.com/photos/{username}"
|
|
||||||
break
|
|
||||||
|
|
||||||
index[normalized] = {
|
|
||||||
"wikipedia_url": wikipedia_url,
|
|
||||||
"creator_profile_url": creator_profile,
|
|
||||||
"recipient": message.get("recipient", ""),
|
|
||||||
}
|
|
||||||
|
|
||||||
# Cache the index
|
|
||||||
try:
|
|
||||||
with open(SENT_MAIL_INDEX_CACHE, "w") as f:
|
|
||||||
json.dump({"file_count": len(json_files), "index": index}, f)
|
|
||||||
except OSError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
return index
|
|
||||||
|
|
||||||
|
|
||||||
def load_commons_thumbnail_cache() -> dict[str, typing.Any]:
|
|
||||||
"""Load the thumbnail cache from disk."""
|
|
||||||
if not COMMONS_CACHE_FILE.exists():
|
|
||||||
return {"timestamp": 0, "thumbnails": {}}
|
|
||||||
try:
|
|
||||||
with open(COMMONS_CACHE_FILE) as f:
|
|
||||||
return typing.cast(dict[str, typing.Any], json.load(f))
|
|
||||||
except (json.JSONDecodeError, OSError):
|
|
||||||
return {"timestamp": 0, "thumbnails": {}}
|
|
||||||
|
|
||||||
|
|
||||||
def save_commons_thumbnail_cache(cache: dict[str, typing.Any]) -> None:
|
|
||||||
"""Save the thumbnail cache to disk."""
|
|
||||||
try:
|
|
||||||
with open(COMMONS_CACHE_FILE, "w") as f:
|
|
||||||
json.dump(cache, f)
|
|
||||||
except OSError:
|
|
||||||
pass # Ignore cache write errors
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_commons_thumbnails(titles: list[str]) -> dict[str, str]:
|
def fetch_commons_thumbnails(titles: list[str]) -> dict[str, str]:
|
||||||
"""Fetch thumbnail URLs from Commons API for the given file titles."""
|
"""Fetch thumbnail URLs from Commons API for the given file titles."""
|
||||||
|
|
@ -340,79 +205,72 @@ def get_recent_commons_uploads() -> tuple[list[CommonsUpload], int]:
|
||||||
Returns a tuple of (uploads_list, total_count) where total_count is the total number
|
Returns a tuple of (uploads_list, total_count) where total_count is the total number
|
||||||
of uploads obtained via Flickr mail (not just the ones returned).
|
of uploads obtained via Flickr mail (not just the ones returned).
|
||||||
"""
|
"""
|
||||||
if not COMMONS_UPLOADS_FILE.exists():
|
session = get_session()
|
||||||
return [], 0
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with open(COMMONS_UPLOADS_FILE) as f:
|
query = (
|
||||||
all_uploads = json.load(f)
|
session.query(FlickrUpload, SentMessage)
|
||||||
except (json.JSONDecodeError, OSError):
|
.join(SentMessage)
|
||||||
return [], 0
|
.order_by(FlickrUpload.timestamp.desc())
|
||||||
|
|
||||||
# Build sent mail index
|
|
||||||
sent_mail_index = build_sent_mail_index()
|
|
||||||
|
|
||||||
# Filter uploads to only those with matching sent mail
|
|
||||||
# Count all matches, but only keep RECENT_UPLOADS_COUNT for display
|
|
||||||
uploads_with_mail: list[dict[str, typing.Any]] = []
|
|
||||||
total_matched = 0
|
|
||||||
for upload in all_uploads:
|
|
||||||
flickr_url = upload.get("flickr_url", "")
|
|
||||||
normalized = normalize_flickr_url(flickr_url)
|
|
||||||
if normalized and normalized in sent_mail_index:
|
|
||||||
total_matched += 1
|
|
||||||
if len(uploads_with_mail) < RECENT_UPLOADS_COUNT:
|
|
||||||
upload["_mail_info"] = sent_mail_index[normalized]
|
|
||||||
uploads_with_mail.append(upload)
|
|
||||||
|
|
||||||
if not uploads_with_mail:
|
|
||||||
return [], 0
|
|
||||||
|
|
||||||
# Load cache and check if it's still valid
|
|
||||||
cache = load_commons_thumbnail_cache()
|
|
||||||
cache_age = time.time() - cache.get("timestamp", 0)
|
|
||||||
cached_thumbs = cache.get("thumbnails", {})
|
|
||||||
|
|
||||||
# Find which titles need fetching
|
|
||||||
titles = [u["title"] for u in uploads_with_mail]
|
|
||||||
titles_to_fetch = [t for t in titles if t not in cached_thumbs]
|
|
||||||
|
|
||||||
# Fetch missing thumbnails or refresh if cache is old
|
|
||||||
if titles_to_fetch or cache_age > COMMONS_CACHE_MAX_AGE:
|
|
||||||
new_thumbs = fetch_commons_thumbnails(
|
|
||||||
titles if cache_age > COMMONS_CACHE_MAX_AGE else titles_to_fetch
|
|
||||||
)
|
)
|
||||||
cached_thumbs.update(new_thumbs)
|
total_matched = query.count()
|
||||||
cache = {"timestamp": time.time(), "thumbnails": cached_thumbs}
|
if total_matched == 0:
|
||||||
save_commons_thumbnail_cache(cache)
|
return [], 0
|
||||||
|
|
||||||
# Build the result list
|
recent = query.limit(RECENT_UPLOADS_COUNT).all()
|
||||||
result: list[CommonsUpload] = []
|
|
||||||
for upload in uploads_with_mail:
|
|
||||||
title = upload["title"]
|
|
||||||
thumb_url = cached_thumbs.get(title, "")
|
|
||||||
if not thumb_url:
|
|
||||||
continue
|
|
||||||
|
|
||||||
mail_info = upload.get("_mail_info", {})
|
# Get thumbnails from cache
|
||||||
|
titles = [upload.title for upload, msg in recent]
|
||||||
|
now = int(time.time())
|
||||||
|
cached = {
|
||||||
|
tc.title: tc
|
||||||
|
for tc in session.query(ThumbnailCache)
|
||||||
|
.filter(ThumbnailCache.title.in_(titles))
|
||||||
|
.all()
|
||||||
|
}
|
||||||
|
|
||||||
# Convert title to Commons URL
|
# Find titles needing fetch (missing or expired)
|
||||||
commons_url = f"https://commons.wikimedia.org/wiki/{title.replace(' ', '_')}"
|
titles_to_fetch = [
|
||||||
|
t for t in titles
|
||||||
|
if t not in cached or (now - (cached[t].fetched_at or 0)) > COMMONS_CACHE_MAX_AGE
|
||||||
|
]
|
||||||
|
|
||||||
result.append(
|
if titles_to_fetch:
|
||||||
CommonsUpload(
|
new_thumbs = fetch_commons_thumbnails(titles_to_fetch)
|
||||||
title=title.replace("File:", "").rsplit(".", 1)[0],
|
for title, thumb_url in new_thumbs.items():
|
||||||
thumb_url=thumb_url,
|
existing = cached.get(title)
|
||||||
commons_url=commons_url,
|
if existing:
|
||||||
flickr_url=upload.get("flickr_url", ""),
|
existing.thumb_url = thumb_url
|
||||||
creator=upload.get("creator") or "Unknown",
|
existing.fetched_at = now
|
||||||
timestamp=upload.get("timestamp", "")[:10],
|
else:
|
||||||
wikipedia_url=mail_info.get("wikipedia_url", ""),
|
tc = ThumbnailCache(title=title, thumb_url=thumb_url, fetched_at=now)
|
||||||
creator_profile_url=mail_info.get("creator_profile_url", ""),
|
session.add(tc)
|
||||||
|
cached[title] = tc
|
||||||
|
session.commit()
|
||||||
|
|
||||||
|
result: list[CommonsUpload] = []
|
||||||
|
for upload, msg in recent:
|
||||||
|
thumb_url = cached[upload.title].thumb_url if upload.title in cached else ""
|
||||||
|
if not thumb_url:
|
||||||
|
continue
|
||||||
|
|
||||||
|
commons_url = f"https://commons.wikimedia.org/wiki/{upload.title.replace(' ', '_')}"
|
||||||
|
|
||||||
|
result.append(
|
||||||
|
CommonsUpload(
|
||||||
|
title=upload.title.replace("File:", "").rsplit(".", 1)[0],
|
||||||
|
thumb_url=thumb_url,
|
||||||
|
commons_url=commons_url,
|
||||||
|
flickr_url=upload.flickr_url or "",
|
||||||
|
creator=upload.creator or "Unknown",
|
||||||
|
timestamp=(upload.timestamp or "")[:10],
|
||||||
|
wikipedia_url=upload.wikipedia_url or "",
|
||||||
|
creator_profile_url=upload.creator_profile_url or "",
|
||||||
|
)
|
||||||
)
|
)
|
||||||
)
|
|
||||||
|
|
||||||
return result, total_matched
|
return result, total_matched
|
||||||
|
finally:
|
||||||
|
session.close()
|
||||||
|
|
||||||
|
|
||||||
def get_previous_messages(flickr_user: str, flickr_username: str) -> list[dict]:
|
def get_previous_messages(flickr_user: str, flickr_username: str) -> list[dict]:
|
||||||
|
|
@ -421,26 +279,33 @@ def get_previous_messages(flickr_user: str, flickr_username: str) -> list[dict]:
|
||||||
Checks both the display name (flickr_user) and username (flickr_username)
|
Checks both the display name (flickr_user) and username (flickr_username)
|
||||||
against the recipient field in the messages index.
|
against the recipient field in the messages index.
|
||||||
"""
|
"""
|
||||||
if not SENT_MAIL_INDEX_FILE.exists():
|
names = set()
|
||||||
|
if flickr_user:
|
||||||
|
names.add(flickr_user.lower())
|
||||||
|
if flickr_username:
|
||||||
|
names.add(flickr_username.lower())
|
||||||
|
if not names:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
session = get_session()
|
||||||
try:
|
try:
|
||||||
with open(SENT_MAIL_INDEX_FILE) as f:
|
messages = (
|
||||||
messages = json.load(f)
|
session.query(SentMessage)
|
||||||
except (json.JSONDecodeError, OSError):
|
.filter(func.lower(SentMessage.recipient).in_(names))
|
||||||
return []
|
.all()
|
||||||
|
)
|
||||||
# Normalize for case-insensitive comparison
|
return [
|
||||||
flickr_user_lower = flickr_user.lower() if flickr_user else ""
|
{
|
||||||
flickr_username_lower = flickr_username.lower() if flickr_username else ""
|
"message_id": m.message_id,
|
||||||
|
"subject": m.subject,
|
||||||
matches = []
|
"url": m.url,
|
||||||
for msg in messages:
|
"recipient": m.recipient,
|
||||||
recipient = msg.get("recipient", "").lower()
|
"date": m.date,
|
||||||
if recipient and (recipient == flickr_user_lower or recipient == flickr_username_lower):
|
}
|
||||||
matches.append(msg)
|
for m in messages
|
||||||
|
]
|
||||||
return matches
|
finally:
|
||||||
|
session.close()
|
||||||
|
|
||||||
|
|
||||||
def parse_category_input(category_input: str) -> str | None:
|
def parse_category_input(category_input: str) -> str | None:
|
||||||
|
|
|
||||||
233
migrate_json_to_db.py
Normal file
233
migrate_json_to_db.py
Normal file
|
|
@ -0,0 +1,233 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""One-time migration from JSON files to SQLite database."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from flickr_mail.database import init_db, get_session
|
||||||
|
from flickr_mail.models import Contribution, FlickrUpload, SentMessage, ThumbnailCache
|
||||||
|
from flickr_mail.url_utils import (
|
||||||
|
creator_profile_from_flickr_url,
|
||||||
|
extract_urls_from_message,
|
||||||
|
normalize_flickr_url,
|
||||||
|
)
|
||||||
|
|
||||||
|
COMMONS_DIR = Path(__file__).parent / "commons_contributions"
|
||||||
|
SENT_MAIL_DIR = Path(__file__).parent / "sent_mail" / "messages"
|
||||||
|
SENT_MAIL_INDEX = Path(__file__).parent / "sent_mail" / "messages_index.json"
|
||||||
|
CONTRIBUTIONS_FILE = COMMONS_DIR / "contributions.json"
|
||||||
|
FLICKR_UPLOADS_FILE = COMMONS_DIR / "flickr_uploads.json"
|
||||||
|
THUMBNAIL_CACHE_FILE = COMMONS_DIR / "thumbnail_cache.json"
|
||||||
|
|
||||||
|
|
||||||
|
def migrate_contributions(session) -> int:
|
||||||
|
"""Migrate contributions.json to contributions table."""
|
||||||
|
if not CONTRIBUTIONS_FILE.exists():
|
||||||
|
print("No contributions.json found, skipping")
|
||||||
|
return 0
|
||||||
|
|
||||||
|
with open(CONTRIBUTIONS_FILE) as f:
|
||||||
|
data = json.load(f)
|
||||||
|
|
||||||
|
contributions = data.get("contributions", [])
|
||||||
|
print(f"Migrating {len(contributions)} contributions...")
|
||||||
|
|
||||||
|
for c in contributions:
|
||||||
|
session.add(Contribution(
|
||||||
|
userid=c.get("userid"),
|
||||||
|
user=c.get("user"),
|
||||||
|
pageid=c.get("pageid"),
|
||||||
|
revid=c.get("revid"),
|
||||||
|
parentid=c.get("parentid"),
|
||||||
|
ns=c.get("ns"),
|
||||||
|
title=c.get("title"),
|
||||||
|
timestamp=c.get("timestamp"),
|
||||||
|
minor=c.get("minor"),
|
||||||
|
top=c.get("top"),
|
||||||
|
comment=c.get("comment"),
|
||||||
|
size=c.get("size"),
|
||||||
|
sizediff=c.get("sizediff"),
|
||||||
|
tags=json.dumps(c.get("tags", [])),
|
||||||
|
))
|
||||||
|
|
||||||
|
session.flush()
|
||||||
|
count = session.query(Contribution).count()
|
||||||
|
print(f" -> {count} contributions migrated")
|
||||||
|
return count
|
||||||
|
|
||||||
|
|
||||||
|
def migrate_sent_messages(session) -> dict[str, str]:
|
||||||
|
"""Migrate sent messages to sent_messages table.
|
||||||
|
|
||||||
|
Returns a dict of normalized_flickr_url -> message_id for FK linking.
|
||||||
|
"""
|
||||||
|
if not SENT_MAIL_INDEX.exists():
|
||||||
|
print("No messages_index.json found, skipping")
|
||||||
|
return {}
|
||||||
|
|
||||||
|
with open(SENT_MAIL_INDEX) as f:
|
||||||
|
index = json.load(f)
|
||||||
|
|
||||||
|
print(f"Migrating {len(index)} sent messages...")
|
||||||
|
|
||||||
|
url_to_message_id: dict[str, str] = {}
|
||||||
|
count = 0
|
||||||
|
|
||||||
|
for msg_meta in index:
|
||||||
|
msg_id = msg_meta.get("message_id", "")
|
||||||
|
if not msg_id:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Load the full message from individual file
|
||||||
|
msg_file = SENT_MAIL_DIR / f"{msg_id}.json"
|
||||||
|
if msg_file.exists():
|
||||||
|
with open(msg_file) as f:
|
||||||
|
msg = json.load(f)
|
||||||
|
else:
|
||||||
|
msg = msg_meta
|
||||||
|
|
||||||
|
body = msg.get("body", "")
|
||||||
|
subject = msg.get("subject", "")
|
||||||
|
|
||||||
|
# Extract URLs from body
|
||||||
|
flickr_url, wikipedia_url = extract_urls_from_message(body)
|
||||||
|
normalized = normalize_flickr_url(flickr_url) if flickr_url else ""
|
||||||
|
|
||||||
|
# Extract creator profile URL
|
||||||
|
creator_profile_url = creator_profile_from_flickr_url(flickr_url) if flickr_url else ""
|
||||||
|
|
||||||
|
session.add(SentMessage(
|
||||||
|
message_id=msg_id,
|
||||||
|
subject=msg.get("subject", ""),
|
||||||
|
url=msg.get("url", ""),
|
||||||
|
recipient=msg.get("recipient", ""),
|
||||||
|
date=msg.get("date", ""),
|
||||||
|
body=body,
|
||||||
|
body_html=msg.get("body_html", ""),
|
||||||
|
flickr_url=flickr_url,
|
||||||
|
normalized_flickr_url=normalized,
|
||||||
|
wikipedia_url=wikipedia_url,
|
||||||
|
creator_profile_url=creator_profile_url,
|
||||||
|
))
|
||||||
|
|
||||||
|
# Build URL -> message_id map for FK linking (skip replies)
|
||||||
|
if normalized and not subject.startswith("Re:"):
|
||||||
|
url_to_message_id[normalized] = msg_id
|
||||||
|
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
session.flush()
|
||||||
|
actual = session.query(SentMessage).count()
|
||||||
|
print(f" -> {actual} sent messages migrated")
|
||||||
|
print(f" -> {len(url_to_message_id)} unique flickr URLs indexed for FK linking")
|
||||||
|
return url_to_message_id
|
||||||
|
|
||||||
|
|
||||||
|
def migrate_flickr_uploads(session, url_to_message_id: dict[str, str]) -> int:
|
||||||
|
"""Migrate flickr_uploads.json to flickr_uploads table with FK linking."""
|
||||||
|
if not FLICKR_UPLOADS_FILE.exists():
|
||||||
|
print("No flickr_uploads.json found, skipping")
|
||||||
|
return 0
|
||||||
|
|
||||||
|
with open(FLICKR_UPLOADS_FILE) as f:
|
||||||
|
uploads = json.load(f)
|
||||||
|
|
||||||
|
print(f"Migrating {len(uploads)} flickr uploads...")
|
||||||
|
|
||||||
|
linked = 0
|
||||||
|
for u in uploads:
|
||||||
|
flickr_url = u.get("flickr_url", "")
|
||||||
|
normalized = normalize_flickr_url(flickr_url)
|
||||||
|
|
||||||
|
# Look up sent message FK
|
||||||
|
sent_message_id = url_to_message_id.get(normalized) if normalized else None
|
||||||
|
if sent_message_id:
|
||||||
|
linked += 1
|
||||||
|
|
||||||
|
# Get wikipedia_url and creator_profile_url from the linked message
|
||||||
|
wikipedia_url = ""
|
||||||
|
creator_profile_url = ""
|
||||||
|
if sent_message_id:
|
||||||
|
msg = session.get(SentMessage, sent_message_id)
|
||||||
|
if msg:
|
||||||
|
wikipedia_url = msg.wikipedia_url or ""
|
||||||
|
creator_profile_url = msg.creator_profile_url or ""
|
||||||
|
|
||||||
|
session.add(FlickrUpload(
|
||||||
|
pageid=u.get("pageid"),
|
||||||
|
revid=u.get("revid"),
|
||||||
|
title=u.get("title"),
|
||||||
|
timestamp=u.get("timestamp"),
|
||||||
|
flickr_url=flickr_url,
|
||||||
|
normalized_flickr_url=normalized,
|
||||||
|
creator=u.get("creator"),
|
||||||
|
wikipedia_url=wikipedia_url,
|
||||||
|
creator_profile_url=creator_profile_url,
|
||||||
|
sent_message_id=sent_message_id,
|
||||||
|
))
|
||||||
|
|
||||||
|
session.flush()
|
||||||
|
count = session.query(FlickrUpload).count()
|
||||||
|
print(f" -> {count} flickr uploads migrated")
|
||||||
|
print(f" -> {linked} linked to sent messages")
|
||||||
|
return count
|
||||||
|
|
||||||
|
|
||||||
|
def migrate_thumbnail_cache(session) -> int:
|
||||||
|
"""Migrate thumbnail_cache.json to thumbnail_cache table."""
|
||||||
|
if not THUMBNAIL_CACHE_FILE.exists():
|
||||||
|
print("No thumbnail_cache.json found, skipping")
|
||||||
|
return 0
|
||||||
|
|
||||||
|
with open(THUMBNAIL_CACHE_FILE) as f:
|
||||||
|
cache = json.load(f)
|
||||||
|
|
||||||
|
thumbnails = cache.get("thumbnails", {})
|
||||||
|
cache_timestamp = int(cache.get("timestamp", 0))
|
||||||
|
|
||||||
|
print(f"Migrating {len(thumbnails)} cached thumbnails...")
|
||||||
|
|
||||||
|
for title, thumb_url in thumbnails.items():
|
||||||
|
session.add(ThumbnailCache(
|
||||||
|
title=title,
|
||||||
|
thumb_url=thumb_url,
|
||||||
|
fetched_at=cache_timestamp,
|
||||||
|
))
|
||||||
|
|
||||||
|
session.flush()
|
||||||
|
count = session.query(ThumbnailCache).count()
|
||||||
|
print(f" -> {count} thumbnail cache entries migrated")
|
||||||
|
return count
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
|
||||||
|
print("Initializing database...")
|
||||||
|
init_db()
|
||||||
|
|
||||||
|
session = get_session()
|
||||||
|
try:
|
||||||
|
# Check if already migrated
|
||||||
|
existing = session.query(Contribution).count()
|
||||||
|
if existing > 0:
|
||||||
|
print(f"Database already contains {existing} contributions. Aborting.")
|
||||||
|
print("Delete flickr_mail.db to re-run migration.")
|
||||||
|
return
|
||||||
|
|
||||||
|
migrate_contributions(session)
|
||||||
|
url_to_message_id = migrate_sent_messages(session)
|
||||||
|
migrate_flickr_uploads(session, url_to_message_id)
|
||||||
|
migrate_thumbnail_cache(session)
|
||||||
|
|
||||||
|
session.commit()
|
||||||
|
print("\nMigration complete!")
|
||||||
|
|
||||||
|
except Exception:
|
||||||
|
session.rollback()
|
||||||
|
raise
|
||||||
|
finally:
|
||||||
|
session.close()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
Find UploadWizard contributions that are from Flickr and add them to flickr_uploads.json.
|
Find UploadWizard contributions that are from Flickr and add them to the database.
|
||||||
|
|
||||||
For contributions with comment 'User created page with UploadWizard', queries the
|
For contributions with comment 'User created page with UploadWizard', queries the
|
||||||
Commons API to check if the image source is Flickr (by checking the Credit field).
|
Commons API to check if the image source is Flickr (by checking the Credit field).
|
||||||
|
|
@ -9,12 +9,13 @@ Commons API to check if the image source is Flickr (by checking the Credit field
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
CONTRIBUTIONS_FILE = Path("commons_contributions/contributions.json")
|
from flickr_mail.database import init_db, get_session
|
||||||
FLICKR_UPLOADS_FILE = Path("commons_contributions/flickr_uploads.json")
|
from flickr_mail.models import Contribution, FlickrUpload, SentMessage
|
||||||
|
from flickr_mail.url_utils import normalize_flickr_url
|
||||||
|
|
||||||
COMMONS_API = "https://commons.wikimedia.org/w/api.php"
|
COMMONS_API = "https://commons.wikimedia.org/w/api.php"
|
||||||
USER_AGENT = "FlickrMail/1.0 (https://edwardbetts.com/flickr_mail/; edward@4angle.com)"
|
USER_AGENT = "FlickrMail/1.0 (https://edwardbetts.com/flickr_mail/; edward@4angle.com)"
|
||||||
|
|
||||||
|
|
@ -75,99 +76,101 @@ def clean_artist_name(artist_html: str) -> str:
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# Load contributions
|
init_db()
|
||||||
print("Loading contributions...")
|
session = get_session()
|
||||||
with open(CONTRIBUTIONS_FILE) as f:
|
|
||||||
data = json.load(f)
|
|
||||||
|
|
||||||
contributions = data.get("contributions", [])
|
try:
|
||||||
|
# Get existing normalized flickr URLs to avoid duplicates
|
||||||
|
existing_urls = {
|
||||||
|
r[0] for r in session.query(FlickrUpload.normalized_flickr_url).all()
|
||||||
|
if r[0]
|
||||||
|
}
|
||||||
|
print(f"Existing uploads: {session.query(FlickrUpload).count()}")
|
||||||
|
print(f"Existing flickr URLs: {len(existing_urls)}")
|
||||||
|
|
||||||
# Load existing flickr uploads
|
# Build sent message index for FK linking
|
||||||
existing_flickr_urls = set()
|
sent_messages = (
|
||||||
existing_uploads = []
|
session.query(SentMessage)
|
||||||
if FLICKR_UPLOADS_FILE.exists():
|
.filter(SentMessage.normalized_flickr_url != "")
|
||||||
with open(FLICKR_UPLOADS_FILE) as f:
|
.filter(~SentMessage.subject.startswith("Re:"))
|
||||||
existing_uploads = json.load(f)
|
.all()
|
||||||
existing_flickr_urls = {u.get("flickr_url", "") for u in existing_uploads}
|
)
|
||||||
# Also normalize existing URLs for comparison
|
url_to_message = {msg.normalized_flickr_url: msg for msg in sent_messages}
|
||||||
for u in existing_uploads:
|
|
||||||
url = u.get("flickr_url", "")
|
|
||||||
normalized = url.replace("https://", "").replace("http://", "").replace("www.", "").rstrip("/")
|
|
||||||
existing_flickr_urls.add(normalized)
|
|
||||||
|
|
||||||
print(f"Existing uploads: {len(existing_uploads)}")
|
# Find UploadWizard contributions (page creations only)
|
||||||
print(f"Existing flickr URLs: {len(existing_flickr_urls)}")
|
upload_wizard = (
|
||||||
|
session.query(Contribution)
|
||||||
|
.filter(Contribution.comment == "User created page with UploadWizard")
|
||||||
|
.filter(Contribution.title.startswith("File:"))
|
||||||
|
.all()
|
||||||
|
)
|
||||||
|
|
||||||
# Find UploadWizard contributions (page creations only)
|
print(f"UploadWizard contributions to check: {len(upload_wizard)}")
|
||||||
upload_wizard_contributions = []
|
|
||||||
for c in contributions:
|
|
||||||
comment = c.get("comment", "")
|
|
||||||
if comment == "User created page with UploadWizard":
|
|
||||||
# Only include if it's a File: page
|
|
||||||
title = c.get("title", "")
|
|
||||||
if title.startswith("File:"):
|
|
||||||
upload_wizard_contributions.append(c)
|
|
||||||
|
|
||||||
print(f"UploadWizard contributions to check: {len(upload_wizard_contributions)}")
|
# Process in batches of 50
|
||||||
|
new_count = 0
|
||||||
|
batch_size = 50
|
||||||
|
|
||||||
# Process in batches of 50
|
for i in range(0, len(upload_wizard), batch_size):
|
||||||
new_uploads = []
|
batch = upload_wizard[i : i + batch_size]
|
||||||
batch_size = 50
|
titles = [c.title for c in batch]
|
||||||
|
|
||||||
for i in range(0, len(upload_wizard_contributions), batch_size):
|
print(
|
||||||
batch = upload_wizard_contributions[i : i + batch_size]
|
f"Processing batch {i // batch_size + 1}/"
|
||||||
titles = [c["title"] for c in batch]
|
f"{(len(upload_wizard) + batch_size - 1) // batch_size}..."
|
||||||
|
)
|
||||||
|
|
||||||
print(f"Processing batch {i // batch_size + 1}/{(len(upload_wizard_contributions) + batch_size - 1) // batch_size}...")
|
metadata = get_image_metadata(titles)
|
||||||
|
|
||||||
metadata = get_image_metadata(titles)
|
for c in batch:
|
||||||
|
meta = metadata.get(c.title, {})
|
||||||
|
credit = meta.get("credit", "")
|
||||||
|
artist = meta.get("artist", "")
|
||||||
|
|
||||||
for c in batch:
|
flickr_url = extract_flickr_url_from_credit(credit)
|
||||||
title = c["title"]
|
if not flickr_url:
|
||||||
meta = metadata.get(title, {})
|
continue
|
||||||
credit = meta.get("credit", "")
|
|
||||||
artist = meta.get("artist", "")
|
|
||||||
|
|
||||||
flickr_url = extract_flickr_url_from_credit(credit)
|
normalized = normalize_flickr_url(flickr_url)
|
||||||
if not flickr_url:
|
if normalized in existing_urls:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Check if we already have this URL
|
creator = clean_artist_name(artist) if artist else None
|
||||||
normalized = flickr_url.replace("https://", "").replace("http://", "").replace("www.", "").rstrip("/")
|
|
||||||
if normalized in existing_flickr_urls or flickr_url in existing_flickr_urls:
|
|
||||||
continue
|
|
||||||
|
|
||||||
creator = clean_artist_name(artist) if artist else None
|
# Look up sent message for FK linking
|
||||||
|
msg = url_to_message.get(normalized) if normalized else None
|
||||||
|
|
||||||
new_upload = {
|
session.add(FlickrUpload(
|
||||||
"pageid": c["pageid"],
|
pageid=c.pageid,
|
||||||
"revid": c["revid"],
|
revid=c.revid,
|
||||||
"title": title,
|
title=c.title,
|
||||||
"timestamp": c["timestamp"],
|
timestamp=c.timestamp,
|
||||||
"flickr_url": flickr_url,
|
flickr_url=flickr_url,
|
||||||
"creator": creator,
|
normalized_flickr_url=normalized,
|
||||||
}
|
creator=creator,
|
||||||
|
wikipedia_url=msg.wikipedia_url if msg else "",
|
||||||
|
creator_profile_url=msg.creator_profile_url if msg else "",
|
||||||
|
sent_message_id=msg.message_id if msg else None,
|
||||||
|
))
|
||||||
|
new_count += 1
|
||||||
|
existing_urls.add(normalized)
|
||||||
|
print(f" Found: {c.title[:50]} -> {flickr_url}")
|
||||||
|
|
||||||
new_uploads.append(new_upload)
|
session.commit()
|
||||||
existing_flickr_urls.add(normalized)
|
|
||||||
print(f" Found: {title[:50]} -> {flickr_url}")
|
|
||||||
|
|
||||||
# Rate limiting
|
# Rate limiting
|
||||||
if i + batch_size < len(upload_wizard_contributions):
|
if i + batch_size < len(upload_wizard):
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
|
|
||||||
print(f"\nFound {len(new_uploads)} new Flickr uploads")
|
total = session.query(FlickrUpload).count()
|
||||||
|
print(f"\nFound {new_count} new Flickr uploads")
|
||||||
|
print(f"Total: {total} uploads in database")
|
||||||
|
|
||||||
if new_uploads:
|
except Exception:
|
||||||
# Merge and sort by timestamp (newest first)
|
session.rollback()
|
||||||
all_uploads = existing_uploads + new_uploads
|
raise
|
||||||
all_uploads.sort(key=lambda x: x.get("timestamp", ""), reverse=True)
|
finally:
|
||||||
|
session.close()
|
||||||
# Save
|
|
||||||
with open(FLICKR_UPLOADS_FILE, "w") as f:
|
|
||||||
json.dump(all_uploads, f, indent=2)
|
|
||||||
|
|
||||||
print(f"Saved {len(all_uploads)} total uploads to {FLICKR_UPLOADS_FILE}")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue