Stop fetching all pages when downloading sent mail
This commit is contained in:
parent
9f0fb01878
commit
e072279566
2 changed files with 44 additions and 17 deletions
|
|
@ -48,12 +48,8 @@ def fetch_contributions(
|
||||||
return contributions, new_continue
|
return contributions, new_continue
|
||||||
|
|
||||||
|
|
||||||
def upsert_contribution(session, c: dict) -> None:
|
def insert_contribution(session, c: dict) -> None:
|
||||||
"""Insert or update a contribution by revid."""
|
"""Insert a contribution row (caller must ensure revid is new)."""
|
||||||
existing = session.query(Contribution).filter_by(revid=c["revid"]).first()
|
|
||||||
if existing:
|
|
||||||
return # Already have this revision
|
|
||||||
|
|
||||||
session.add(Contribution(
|
session.add(Contribution(
|
||||||
userid=c.get("userid"),
|
userid=c.get("userid"),
|
||||||
user=c.get("user"),
|
user=c.get("user"),
|
||||||
|
|
@ -108,13 +104,24 @@ def main() -> None:
|
||||||
print("no results")
|
print("no results")
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# One DB query per batch to identify already-known revisions.
|
||||||
|
revids = [c["revid"] for c in contributions if "revid" in c]
|
||||||
|
existing_revids = {
|
||||||
|
row[0]
|
||||||
|
for row in (
|
||||||
|
session.query(Contribution.revid)
|
||||||
|
.filter(Contribution.revid.in_(revids))
|
||||||
|
.all()
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
batch_new = 0
|
batch_new = 0
|
||||||
for c in contributions:
|
for c in contributions:
|
||||||
# Stop if we've reached contributions we already have
|
revid = c.get("revid")
|
||||||
existing = session.query(Contribution).filter_by(revid=c["revid"]).first()
|
if revid in existing_revids:
|
||||||
if existing:
|
|
||||||
continue
|
continue
|
||||||
upsert_contribution(session, c)
|
|
||||||
|
insert_contribution(session, c)
|
||||||
batch_new += 1
|
batch_new += 1
|
||||||
|
|
||||||
new_count += batch_new
|
new_count += batch_new
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,7 @@ from flickr_mail.url_utils import (
|
||||||
BASE_URL = "https://www.flickr.com"
|
BASE_URL = "https://www.flickr.com"
|
||||||
SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}"
|
SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}"
|
||||||
MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}"
|
MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}"
|
||||||
|
MAX_SENT_MAIL_PAGES = 29 # Fallback upper bound if we need to backfill everything
|
||||||
|
|
||||||
HEADERS = {
|
HEADERS = {
|
||||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0",
|
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0",
|
||||||
|
|
@ -166,22 +167,41 @@ def main() -> None:
|
||||||
|
|
||||||
http_session = create_session()
|
http_session = create_session()
|
||||||
|
|
||||||
# Scrape all pages to find new messages
|
|
||||||
total_pages = 29
|
|
||||||
new_messages: list[dict] = []
|
new_messages: list[dict] = []
|
||||||
|
stop_fetching = False
|
||||||
|
|
||||||
print("Fetching message list from all pages...")
|
print("Fetching message list until we reach existing messages...")
|
||||||
for page in range(1, total_pages + 1):
|
for page in range(1, MAX_SENT_MAIL_PAGES + 1):
|
||||||
url = SENT_MAIL_URL.format(page=page)
|
url = SENT_MAIL_URL.format(page=page)
|
||||||
print(f" Fetching page {page}/{total_pages}...")
|
print(f" Fetching page {page}...")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
soup = fetch_page(http_session, url)
|
soup = fetch_page(http_session, url)
|
||||||
page_messages = extract_messages_from_list_page(soup)
|
page_messages = extract_messages_from_list_page(soup)
|
||||||
|
|
||||||
|
if not page_messages:
|
||||||
|
print(" No messages found on this page, stopping")
|
||||||
|
break
|
||||||
|
|
||||||
|
page_new_messages = 0
|
||||||
for msg in page_messages:
|
for msg in page_messages:
|
||||||
if msg["message_id"] not in existing_ids:
|
msg_id = msg.get("message_id")
|
||||||
|
if not msg_id:
|
||||||
|
continue
|
||||||
|
if msg_id in existing_ids:
|
||||||
|
stop_fetching = True
|
||||||
|
break
|
||||||
|
|
||||||
new_messages.append(msg)
|
new_messages.append(msg)
|
||||||
|
page_new_messages += 1
|
||||||
|
|
||||||
|
if stop_fetching:
|
||||||
|
print(" Reached messages already in the database, stopping pagination")
|
||||||
|
break
|
||||||
|
|
||||||
|
if page_new_messages == 0:
|
||||||
|
print(" No new messages on this page, stopping pagination")
|
||||||
|
break
|
||||||
|
|
||||||
time.sleep(1) # Be polite to the server
|
time.sleep(1) # Be polite to the server
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue