Stop fetching all pages when downloading sent mail

This commit is contained in:
Edward Betts 2026-02-07 13:17:34 +00:00
parent 9f0fb01878
commit e072279566
2 changed files with 44 additions and 17 deletions

View file

@@ -17,6 +17,7 @@ from flickr_mail.url_utils import (
BASE_URL = "https://www.flickr.com"
SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}"
MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}"
MAX_SENT_MAIL_PAGES = 29 # Fallback upper bound if we need to backfill everything
HEADERS = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0",
@@ -166,22 +167,41 @@ def main() -> None:
http_session = create_session()
# Scrape all pages to find new messages
total_pages = 29
new_messages: list[dict] = []
stop_fetching = False
print("Fetching message list from all pages...")
for page in range(1, total_pages + 1):
print("Fetching message list until we reach existing messages...")
for page in range(1, MAX_SENT_MAIL_PAGES + 1):
url = SENT_MAIL_URL.format(page=page)
print(f" Fetching page {page}/{total_pages}...")
print(f" Fetching page {page}...")
try:
soup = fetch_page(http_session, url)
page_messages = extract_messages_from_list_page(soup)
if not page_messages:
print(" No messages found on this page, stopping")
break
page_new_messages = 0
for msg in page_messages:
if msg["message_id"] not in existing_ids:
new_messages.append(msg)
msg_id = msg.get("message_id")
if not msg_id:
continue
if msg_id in existing_ids:
stop_fetching = True
break
new_messages.append(msg)
page_new_messages += 1
if stop_fetching:
print(" Reached messages already in the database, stopping pagination")
break
if page_new_messages == 0:
print(" No new messages on this page, stopping pagination")
break
time.sleep(1) # Be polite to the server