Stop fetching all pages when downloading sent mail

2026-02-07 13:17:34 +00:00 · 2026-02-07 13:17:34 +00:00 · e072279566
commit e072279566
parent 9f0fb01878
2 changed files with 44 additions and 17 deletions
--- a/download_commons_contributions.py
+++ b/download_commons_contributions.py
@@ -48,12 +48,8 @@ def fetch_contributions(
    return contributions, new_continue
-def upsert_contribution(session, c: dict) -> None:
+def insert_contribution(session, c: dict) -> None:
-    """Insert or update a contribution by revid."""
+    """Insert a contribution row (caller must ensure revid is new)."""
-    existing = session.query(Contribution).filter_by(revid=c["revid"]).first()
-    if existing:
-        return  # Already have this revision
    session.add(Contribution(
        userid=c.get("userid"),
        user=c.get("user"),
@@ -108,13 +104,24 @@ def main() -> None:
                print("no results")
                break
+            # One DB query per batch to identify already-known revisions.
+            revids = [c["revid"] for c in contributions if "revid" in c]
+            existing_revids = {
+                row[0]
+                for row in (
+                    session.query(Contribution.revid)
+                    .filter(Contribution.revid.in_(revids))
+                    .all()
+                )
+            }
            batch_new = 0
            for c in contributions:
-                # Stop if we've reached contributions we already have
+                revid = c.get("revid")
-                existing = session.query(Contribution).filter_by(revid=c["revid"]).first()
+                if revid in existing_revids:
-                if existing:
                    continue
-                upsert_contribution(session, c)
+
+                insert_contribution(session, c)
                batch_new += 1
            new_count += batch_new
--- a/download_sent_mail.py
+++ b/download_sent_mail.py
@@ -17,6 +17,7 @@ from flickr_mail.url_utils import (
 BASE_URL = "https://www.flickr.com"
 SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}"
 MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}"
+MAX_SENT_MAIL_PAGES = 29  # Fallback upper bound if we need to backfill everything
 HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0",
@@ -166,22 +167,41 @@ def main() -> None:
        http_session = create_session()
        # Scrape all pages to find new messages
-        total_pages = 29
        new_messages: list[dict] = []
+        stop_fetching = False
-        print("Fetching message list from all pages...")
+        print("Fetching message list until we reach existing messages...")
-        for page in range(1, total_pages + 1):
+        for page in range(1, MAX_SENT_MAIL_PAGES + 1):
            url = SENT_MAIL_URL.format(page=page)
-            print(f"  Fetching page {page}/{total_pages}...")
+            print(f"  Fetching page {page}...")
            try:
                soup = fetch_page(http_session, url)
                page_messages = extract_messages_from_list_page(soup)
                if not page_messages:
                    print("  No messages found on this page, stopping")
                    break
+                page_new_messages = 0
                for msg in page_messages:
-                    if msg["message_id"] not in existing_ids:
+                    msg_id = msg.get("message_id")
+                    if not msg_id:
+                        continue
+                    if msg_id in existing_ids:
+                        stop_fetching = True
+                        break
                    new_messages.append(msg)
+                    page_new_messages += 1
+                if stop_fetching:
+                    print("  Reached messages already in the database, stopping pagination")
+                    break
+                if page_new_messages == 0:
+                    print("  No new messages on this page, stopping pagination")
+                    break
                time.sleep(1)  # Be polite to the server