Stop fetching all pages when downloading sent mail

This commit is contained in:
Edward Betts 2026-02-07 13:17:34 +00:00
parent 9f0fb01878
commit e072279566
2 changed files with 44 additions and 17 deletions

View file

@ -48,12 +48,8 @@ def fetch_contributions(
return contributions, new_continue
def upsert_contribution(session, c: dict) -> None: def insert_contribution(session, c: dict) -> None:
"""Insert or update a contribution by revid.""" """Insert a contribution row (caller must ensure revid is new)."""
existing = session.query(Contribution).filter_by(revid=c["revid"]).first()
if existing:
return # Already have this revision
session.add(Contribution(
userid=c.get("userid"),
user=c.get("user"),
@ -108,13 +104,24 @@ def main() -> None:
print("no results")
break
# One DB query per batch to identify already-known revisions.
revids = [c["revid"] for c in contributions if "revid" in c]
existing_revids = {
row[0]
for row in (
session.query(Contribution.revid)
.filter(Contribution.revid.in_(revids))
.all()
)
}
batch_new = 0 batch_new = 0
for c in contributions:
# Stop if we've reached contributions we already have revid = c.get("revid")
existing = session.query(Contribution).filter_by(revid=c["revid"]).first() if revid in existing_revids:
if existing:
continue
upsert_contribution(session, c)
insert_contribution(session, c)
batch_new += 1
new_count += batch_new

View file

@ -17,6 +17,7 @@ from flickr_mail.url_utils import (
BASE_URL = "https://www.flickr.com"
SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}"
MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}"
MAX_SENT_MAIL_PAGES = 29 # Fallback upper bound if we need to backfill everything
HEADERS = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0",
@ -166,22 +167,41 @@ def main() -> None:
http_session = create_session()
# Scrape all pages to find new messages
total_pages = 29
new_messages: list[dict] = []
stop_fetching = False
print("Fetching message list from all pages...") print("Fetching message list until we reach existing messages...")
for page in range(1, total_pages + 1): for page in range(1, MAX_SENT_MAIL_PAGES + 1):
url = SENT_MAIL_URL.format(page=page)
print(f" Fetching page {page}/{total_pages}...") print(f" Fetching page {page}...")
try:
soup = fetch_page(http_session, url)
page_messages = extract_messages_from_list_page(soup)
if not page_messages:
print(" No messages found on this page, stopping")
break
page_new_messages = 0
for msg in page_messages: for msg in page_messages:
if msg["message_id"] not in existing_ids: msg_id = msg.get("message_id")
if not msg_id:
continue
if msg_id in existing_ids:
stop_fetching = True
break
new_messages.append(msg)
page_new_messages += 1
if stop_fetching:
print(" Reached messages already in the database, stopping pagination")
break
if page_new_messages == 0:
print(" No new messages on this page, stopping pagination")
break
time.sleep(1)  # Be polite to the server