Stop fetching all pages when downloading sent mail
This commit is contained in:
parent
9f0fb01878
commit
e072279566
2 changed files with 44 additions and 17 deletions
|
|
@ -17,6 +17,7 @@ from flickr_mail.url_utils import (
|
|||
BASE_URL = "https://www.flickr.com"
|
||||
SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}"
|
||||
MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}"
|
||||
MAX_SENT_MAIL_PAGES = 29 # Fallback upper bound if we need to backfill everything
|
||||
|
||||
HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0",
|
||||
|
|
@ -166,22 +167,41 @@ def main() -> None:
|
|||
|
||||
http_session = create_session()
|
||||
|
||||
# Scrape pages until we reach messages that are already in the database
|
||||
total_pages = 29
|
||||
new_messages: list[dict] = []
|
||||
stop_fetching = False
|
||||
|
||||
print("Fetching message list from all pages...")
|
||||
for page in range(1, total_pages + 1):
|
||||
print("Fetching message list until we reach existing messages...")
|
||||
for page in range(1, MAX_SENT_MAIL_PAGES + 1):
|
||||
url = SENT_MAIL_URL.format(page=page)
|
||||
print(f" Fetching page {page}/{total_pages}...")
|
||||
print(f" Fetching page {page}...")
|
||||
|
||||
try:
|
||||
soup = fetch_page(http_session, url)
|
||||
page_messages = extract_messages_from_list_page(soup)
|
||||
|
||||
if not page_messages:
|
||||
print(" No messages found on this page, stopping")
|
||||
break
|
||||
|
||||
page_new_messages = 0
|
||||
for msg in page_messages:
|
||||
if msg["message_id"] not in existing_ids:
|
||||
new_messages.append(msg)
|
||||
msg_id = msg.get("message_id")
|
||||
if not msg_id:
|
||||
continue
|
||||
if msg_id in existing_ids:
|
||||
stop_fetching = True
|
||||
break
|
||||
|
||||
new_messages.append(msg)
|
||||
page_new_messages += 1
|
||||
|
||||
if stop_fetching:
|
||||
print(" Reached messages already in the database, stopping pagination")
|
||||
break
|
||||
|
||||
if page_new_messages == 0:
|
||||
print(" No new messages on this page, stopping pagination")
|
||||
break
|
||||
|
||||
time.sleep(1) # Be polite to the server
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue