Stop fetching all pages when downloading sent mail

2026-02-07 13:17:34 +00:00 · 2026-02-07 13:17:34 +00:00 · e072279566
commit e072279566
parent 9f0fb01878
2 changed files with 44 additions and 17 deletions
--- a/download_sent_mail.py
+++ b/download_sent_mail.py
@@ -17,6 +17,7 @@ from flickr_mail.url_utils import (
 BASE_URL = "https://www.flickr.com"
 SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}"
 MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}"
+MAX_SENT_MAIL_PAGES = 29  # Fallback upper bound if we need to backfill everything

 HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0",
@@ -166,22 +167,41 @@ def main() -> None:

        http_session = create_session()

-        # Scrape all pages to find new messages
-        total_pages = 29
        new_messages: list[dict] = []
+        stop_fetching = False

-        print("Fetching message list from all pages...")
-        for page in range(1, total_pages + 1):
+        print("Fetching message list until we reach existing messages...")
+        for page in range(1, MAX_SENT_MAIL_PAGES + 1):
            url = SENT_MAIL_URL.format(page=page)
-            print(f"  Fetching page {page}/{total_pages}...")
+            print(f"  Fetching page {page}...")

            try:
                soup = fetch_page(http_session, url)
                page_messages = extract_messages_from_list_page(soup)

+                if not page_messages:
+                    print("  No messages found on this page, stopping")
+                    break
+
+                page_new_messages = 0
                for msg in page_messages:
-                    if msg["message_id"] not in existing_ids:
-                        new_messages.append(msg)
+                    msg_id = msg.get("message_id")
+                    if not msg_id:
+                        continue
+                    if msg_id in existing_ids:
+                        stop_fetching = True
+                        break
+
+                    new_messages.append(msg)
+                    page_new_messages += 1
+
+                if stop_fetching:
+                    print("  Reached messages already in the database, stopping pagination")
+                    break
+
+                if page_new_messages == 0:
+                    print("  No new messages on this page, stopping pagination")
+                    break

                time.sleep(1)  # Be polite to the server