From e072279566f6cf7c6c11ab8f2f95bc16908145df Mon Sep 17 00:00:00 2001
From: Edward Betts <edward@4angle.com>
Date: Sat, 7 Feb 2026 13:17:34 +0000
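Subject: [PATCH] Stop fetching all pages when downloading sent mail

Stop paginating the sent-mail list as soon as a page contains a message
that is already in the database, or yields no messages at all. The old
page count is kept as MAX_SENT_MAIL_PAGES, an upper bound for a full
backfill.

Also look up already-stored revids once per batch in
download_commons_contributions.py instead of once per contribution, and
rename upsert_contribution() to insert_contribution() now that the
caller does the existence check.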
---
 download_commons_contributions.py | 27 +++++++++++++++---------
 download_sent_mail.py             | 34 ++++++++++++++++++++++++-------
 2 files changed, 44 insertions(+), 17 deletions(-)

diff --git a/download_commons_contributions.py b/download_commons_contributions.py
index 1f8f508..1753552 100755
--- a/download_commons_contributions.py
+++ b/download_commons_contributions.py
@@ -48,12 +48,8 @@ def fetch_contributions(
     return contributions, new_continue
 
 
-def upsert_contribution(session, c: dict) -> None:
-    """Insert or update a contribution by revid."""
-    existing = session.query(Contribution).filter_by(revid=c["revid"]).first()
-    if existing:
-        return  # Already have this revision
-
+def insert_contribution(session, c: dict) -> None:
+    """Insert a contribution row (caller must ensure revid is new)."""
     session.add(Contribution(
         userid=c.get("userid"),
         user=c.get("user"),
@@ -108,13 +104,24 @@ def main() -> None:
                 print("no results")
                 break
 
+            # One DB query per batch to identify already-known revisions.
+            revids = [c["revid"] for c in contributions if "revid" in c]
+            existing_revids = {
+                row[0]
+                for row in (
+                    session.query(Contribution.revid)
+                    .filter(Contribution.revid.in_(revids))
+                    .all()
+                )
+            }
+
             batch_new = 0
             for c in contributions:
-                # Stop if we've reached contributions we already have
-                existing = session.query(Contribution).filter_by(revid=c["revid"]).first()
-                if existing:
+                revid = c.get("revid")
+                if revid in existing_revids:
                     continue
-                upsert_contribution(session, c)
+
+                insert_contribution(session, c)
                 batch_new += 1
 
             new_count += batch_new
diff --git a/download_sent_mail.py b/download_sent_mail.py
index c3ac224..0146c00 100755
--- a/download_sent_mail.py
+++ b/download_sent_mail.py
@@ -17,6 +17,7 @@ from flickr_mail.url_utils import (
 BASE_URL = "https://www.flickr.com"
 SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}"
 MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}"
+MAX_SENT_MAIL_PAGES = 29  # Fallback upper bound if we need to backfill everything
 
 HEADERS = {
     "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0",
@@ -166,22 +167,41 @@ def main() -> None:
 
         http_session = create_session()
 
-        # Scrape all pages to find new messages
-        total_pages = 29
         new_messages: list[dict] = []
+        stop_fetching = False
 
-        print("Fetching message list from all pages...")
-        for page in range(1, total_pages + 1):
+        print("Fetching message list until we reach existing messages...")
+        for page in range(1, MAX_SENT_MAIL_PAGES + 1):
             url = SENT_MAIL_URL.format(page=page)
-            print(f"  Fetching page {page}/{total_pages}...")
+            print(f"  Fetching page {page}...")
 
             try:
                 soup = fetch_page(http_session, url)
                 page_messages = extract_messages_from_list_page(soup)
 
+                if not page_messages:
+                    print("  No messages found on this page, stopping")
+                    break
+
+                page_new_messages = 0
                 for msg in page_messages:
-                    if msg["message_id"] not in existing_ids:
-                        new_messages.append(msg)
+                    msg_id = msg.get("message_id")
+                    if not msg_id:
+                        continue
+                    if msg_id in existing_ids:
+                        stop_fetching = True
+                        break
+
+                    new_messages.append(msg)
+                    page_new_messages += 1
+
+                if stop_fetching:
+                    print("  Reached messages already in the database, stopping pagination")
+                    break
+
+                if page_new_messages == 0:
+                    print("  No new messages on this page, stopping pagination")
+                    break
 
                 time.sleep(1)  # Be polite to the server