Stop fetching all pages when downloading sent mail
This commit is contained in:
parent
9f0fb01878
commit
e072279566
2 changed files with 44 additions and 17 deletions
|
|
@@ -48,12 +48,8 @@ def fetch_contributions(
|
|||
return contributions, new_continue
|
||||
|
||||
|
||||
def upsert_contribution(session, c: dict) -> None:
|
||||
"""Insert or update a contribution by revid."""
|
||||
existing = session.query(Contribution).filter_by(revid=c["revid"]).first()
|
||||
if existing:
|
||||
return # Already have this revision
|
||||
|
||||
def insert_contribution(session, c: dict) -> None:
|
||||
"""Insert a contribution row (caller must ensure revid is new)."""
|
||||
session.add(Contribution(
|
||||
userid=c.get("userid"),
|
||||
user=c.get("user"),
|
||||
|
|
@@ -108,13 +104,24 @@ def main() -> None:
|
|||
print("no results")
|
||||
break
|
||||
|
||||
# One DB query per batch to identify already-known revisions.
|
||||
revids = [c["revid"] for c in contributions if "revid" in c]
|
||||
existing_revids = {
|
||||
row[0]
|
||||
for row in (
|
||||
session.query(Contribution.revid)
|
||||
.filter(Contribution.revid.in_(revids))
|
||||
.all()
|
||||
)
|
||||
}
|
||||
|
||||
batch_new = 0
|
||||
for c in contributions:
|
||||
# Stop if we've reached contributions we already have
|
||||
existing = session.query(Contribution).filter_by(revid=c["revid"]).first()
|
||||
if existing:
|
||||
revid = c.get("revid")
|
||||
if revid in existing_revids:
|
||||
continue
|
||||
upsert_contribution(session, c)
|
||||
|
||||
insert_contribution(session, c)
|
||||
batch_new += 1
|
||||
|
||||
new_count += batch_new
|
||||
|
|
|
|||
|
|
@@ -17,6 +17,7 @@ from flickr_mail.url_utils import (
|
|||
BASE_URL = "https://www.flickr.com"
|
||||
SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}"
|
||||
MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}"
|
||||
MAX_SENT_MAIL_PAGES = 29 # Fallback upper bound if we need to backfill everything
|
||||
|
||||
HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0",
|
||||
|
|
@@ -166,22 +167,41 @@ def main() -> None:
|
|||
|
||||
http_session = create_session()
|
||||
|
||||
# Scrape all pages to find new messages
|
||||
total_pages = 29
|
||||
new_messages: list[dict] = []
|
||||
stop_fetching = False
|
||||
|
||||
print("Fetching message list from all pages...")
|
||||
for page in range(1, total_pages + 1):
|
||||
print("Fetching message list until we reach existing messages...")
|
||||
for page in range(1, MAX_SENT_MAIL_PAGES + 1):
|
||||
url = SENT_MAIL_URL.format(page=page)
|
||||
print(f" Fetching page {page}/{total_pages}...")
|
||||
print(f" Fetching page {page}...")
|
||||
|
||||
try:
|
||||
soup = fetch_page(http_session, url)
|
||||
page_messages = extract_messages_from_list_page(soup)
|
||||
|
||||
if not page_messages:
|
||||
print(" No messages found on this page, stopping")
|
||||
break
|
||||
|
||||
page_new_messages = 0
|
||||
for msg in page_messages:
|
||||
if msg["message_id"] not in existing_ids:
|
||||
new_messages.append(msg)
|
||||
msg_id = msg.get("message_id")
|
||||
if not msg_id:
|
||||
continue
|
||||
if msg_id in existing_ids:
|
||||
stop_fetching = True
|
||||
break
|
||||
|
||||
new_messages.append(msg)
|
||||
page_new_messages += 1
|
||||
|
||||
if stop_fetching:
|
||||
print(" Reached messages already in the database, stopping pagination")
|
||||
break
|
||||
|
||||
if page_new_messages == 0:
|
||||
print(" No new messages on this page, stopping pagination")
|
||||
break
|
||||
|
||||
time.sleep(1) # Be polite to the server
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue