Stop fetching all pages when downloading sent mail
This commit is contained in:
parent
9f0fb01878
commit
e072279566
2 changed files with 44 additions and 17 deletions
|
|
@@ -48,12 +48,8 @@ def fetch_contributions(
|
|||
return contributions, new_continue
|
||||
|
||||
|
||||
def upsert_contribution(session, c: dict) -> None:
|
||||
"""Insert or update a contribution by revid."""
|
||||
existing = session.query(Contribution).filter_by(revid=c["revid"]).first()
|
||||
if existing:
|
||||
return # Already have this revision
|
||||
|
||||
def insert_contribution(session, c: dict) -> None:
|
||||
"""Insert a contribution row (caller must ensure revid is new)."""
|
||||
session.add(Contribution(
|
||||
userid=c.get("userid"),
|
||||
user=c.get("user"),
|
||||
|
|
@@ -108,13 +104,24 @@ def main() -> None:
|
|||
print("no results")
|
||||
break
|
||||
|
||||
# One DB query per batch to identify already-known revisions.
|
||||
revids = [c["revid"] for c in contributions if "revid" in c]
|
||||
existing_revids = {
|
||||
row[0]
|
||||
for row in (
|
||||
session.query(Contribution.revid)
|
||||
.filter(Contribution.revid.in_(revids))
|
||||
.all()
|
||||
)
|
||||
}
|
||||
|
||||
batch_new = 0
|
||||
for c in contributions:
|
||||
# Stop if we've reached contributions we already have
|
||||
existing = session.query(Contribution).filter_by(revid=c["revid"]).first()
|
||||
if existing:
|
||||
revid = c.get("revid")
|
||||
if revid in existing_revids:
|
||||
continue
|
||||
upsert_contribution(session, c)
|
||||
|
||||
insert_contribution(session, c)
|
||||
batch_new += 1
|
||||
|
||||
new_count += batch_new
|
||||
|
|
|
|||
|
|
@@ -17,6 +17,7 @@ from flickr_mail.url_utils import (
|
|||
BASE_URL = "https://www.flickr.com"
|
||||
SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}"
|
||||
MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}"
|
||||
MAX_SENT_MAIL_PAGES = 29 # Fallback upper bound if we need to backfill everything
|
||||
|
||||
HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0",
|
||||
|
|
@@ -166,22 +167,41 @@ def main() -> None:
|
|||
|
||||
http_session = create_session()
|
||||
|
||||
# Scrape all pages to find new messages
|
||||
total_pages = 29
|
||||
new_messages: list[dict] = []
|
||||
stop_fetching = False
|
||||
|
||||
print("Fetching message list from all pages...")
|
||||
for page in range(1, total_pages + 1):
|
||||
print("Fetching message list until we reach existing messages...")
|
||||
for page in range(1, MAX_SENT_MAIL_PAGES + 1):
|
||||
url = SENT_MAIL_URL.format(page=page)
|
||||
print(f" Fetching page {page}/{total_pages}...")
|
||||
print(f" Fetching page {page}...")
|
||||
|
||||
try:
|
||||
soup = fetch_page(http_session, url)
|
||||
page_messages = extract_messages_from_list_page(soup)
|
||||
|
||||
if not page_messages:
|
||||
print(" No messages found on this page, stopping")
|
||||
break
|
||||
|
||||
page_new_messages = 0
|
||||
for msg in page_messages:
|
||||
if msg["message_id"] not in existing_ids:
|
||||
new_messages.append(msg)
|
||||
msg_id = msg.get("message_id")
|
||||
if not msg_id:
|
||||
continue
|
||||
if msg_id in existing_ids:
|
||||
stop_fetching = True
|
||||
break
|
||||
|
||||
new_messages.append(msg)
|
||||
page_new_messages += 1
|
||||
|
||||
if stop_fetching:
|
||||
print(" Reached messages already in the database, stopping pagination")
|
||||
break
|
||||
|
||||
if page_new_messages == 0:
|
||||
print(" No new messages on this page, stopping pagination")
|
||||
break
|
||||
|
||||
time.sleep(1) # Be polite to the server
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue