From e072279566f6cf7c6c11ab8f2f95bc16908145df Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sat, 7 Feb 2026 13:17:34 +0000 Subject: [PATCH] Stop fetching all pages when downloading sent mail --- download_commons_contributions.py | 27 +++++++++++++++--------- download_sent_mail.py | 34 ++++++++++++++++++++++++------- 2 files changed, 44 insertions(+), 17 deletions(-) diff --git a/download_commons_contributions.py b/download_commons_contributions.py index 1f8f508..1753552 100755 --- a/download_commons_contributions.py +++ b/download_commons_contributions.py @@ -48,12 +48,8 @@ def fetch_contributions( return contributions, new_continue -def upsert_contribution(session, c: dict) -> None: - """Insert or update a contribution by revid.""" - existing = session.query(Contribution).filter_by(revid=c["revid"]).first() - if existing: - return # Already have this revision - +def insert_contribution(session, c: dict) -> None: + """Insert a contribution row (caller must ensure revid is new).""" session.add(Contribution( userid=c.get("userid"), user=c.get("user"), @@ -108,13 +104,24 @@ def main() -> None: print("no results") break + # One DB query per batch to identify already-known revisions. + revids = [c["revid"] for c in contributions if "revid" in c] + existing_revids = { + row[0] + for row in ( + session.query(Contribution.revid) + .filter(Contribution.revid.in_(revids)) + .all() + ) + } + batch_new = 0 for c in contributions: - # Stop if we've reached contributions we already have - existing = session.query(Contribution).filter_by(revid=c["revid"]).first() - if existing: + revid = c.get("revid") + if revid in existing_revids: continue - upsert_contribution(session, c) + + insert_contribution(session, c) batch_new += 1 new_count += batch_new diff --git a/download_sent_mail.py b/download_sent_mail.py index c3ac224..0146c00 100755 --- a/download_sent_mail.py +++ b/download_sent_mail.py @@ -17,6 +17,7 @@ from flickr_mail.url_utils import ( BASE_URL = "https://www.flickr.com" SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}" MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}" +MAX_SENT_MAIL_PAGES = 29 # Fallback upper bound if we need to backfill everything HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0", @@ -166,22 +167,41 @@ def main() -> None: http_session = create_session() - # Scrape all pages to find new messages - total_pages = 29 new_messages: list[dict] = [] + stop_fetching = False - print("Fetching message list from all pages...") - for page in range(1, total_pages + 1): + print("Fetching message list until we reach existing messages...") + for page in range(1, MAX_SENT_MAIL_PAGES + 1): url = SENT_MAIL_URL.format(page=page) - print(f" Fetching page {page}/{total_pages}...") + print(f" Fetching page {page}...") try: soup = fetch_page(http_session, url) page_messages = extract_messages_from_list_page(soup) + if not page_messages: + print(" No messages found on this page, stopping") + break + + page_new_messages = 0 for msg in page_messages: - if msg["message_id"] not in existing_ids: - new_messages.append(msg) + msg_id = msg.get("message_id") + if not msg_id: + continue + if msg_id in existing_ids: + stop_fetching = True + break + + new_messages.append(msg) + page_new_messages += 1 + + if stop_fetching: + print(" Reached messages already in the database, stopping pagination") + break + + if page_new_messages == 0: + print(" No new messages on this page, stopping pagination") + break time.sleep(1) # Be polite to the server