#!/usr/bin/env python3
"""Download sent FlickrMail messages for backup.

Scrapes the authenticated "sent mail" pages on flickr.com, walks the
paginated message list until it reaches messages already stored, then
downloads and persists each new message (metadata + body) via the
flickr_mail SQLAlchemy models.
"""
import time

import requests
from bs4 import BeautifulSoup

from flickr_mail.database import init_db, get_session
from flickr_mail.models import SentMessage
from flickr_mail.url_utils import (
    creator_profile_from_flickr_url,
    extract_urls_from_message,
    normalize_flickr_url,
)

BASE_URL = "https://www.flickr.com"
SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}"
MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}"
MAX_SENT_MAIL_PAGES = 29  # Fallback upper bound if we need to backfill everything
REQUEST_TIMEOUT = 30  # seconds; requests has NO default timeout and would hang forever

# Browser-like headers so Flickr serves the regular HTML pages.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-GB,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Priority": "u=0, i",
}

# NOTE(security): this is a live, authenticated browser session pasted verbatim
# into source. Anyone with this file can act as the account until the session
# expires — consider loading it from an env var or untracked file instead.
# The literal spans a physical newline, which parse_cookies() must tolerate.
COOKIES_STR = """ccc=%7B%22needsConsent%22%3Atrue%2C%22managed%22%3A0%2C%22changed%22%3A0%2C%22info%22%3A%7B%22cookieBlock%22%3A%7B%22level%22%3A2%2C%22blockRan%22%3A1%7D%7D%7D; _sp_ses.df80=*; _sp_id.df80=968931de-089d-4576-b729-6662c2c13a65.1770187027.1.1770187129..adf2374b-b85c-4899-afb7-63c2203d0c44..9422de57-9cdf-49c9-ac54-183eaa1ec457.1770187027101.24; TAsessionID=7f373c97-e9f8-46cb-bc1a-cb4f164ce46b|NEW; notice_behavior=expressed,eu; usprivacy=1---; acstring=3~550.1942.3126.3005.3077.1329.196.1725.1092; 
euconsent-v2=CQfGXgAQfGXgAAvACDENCQFsAP_gAEPgAAAALktB9G5cSSFBYCJVYbtEYAQDwFhg4oAhAgABEwAATBoAoIwGBGAoIAiAICACAAAAIARAIAEECAAAQAAAIIABAAAMAEAAIAACIAAACAABAgAACEAIAAggWAAAAEBEAFQAgAAAQBIACFAAAgABAUABAAAAAACAAQAAACAgQAAAAAAAAAAAkAhAAAAAAAAAABAMAAABIAAAAAAAAAAAAAAAAAAABAAAAICBAAAAQAAAAAAAAAAAAAAAAAAAAgqY0H0blxJIUFgIFVhu0QgBBPAWADigCEAAAEDAABMGgCgjAIUYCAgSIAgIAAAAAAgBEAgAQAIAABAAAAAgAEAAAwAQAAgAAAAAAAAAAECAAAAQAgACCBYAAAAQEQAVACBAABAEgAIUAAAAAEBQAEAAAAAAIABAAAAICBAAAAAAAAAAACQCEAAAAAAAAAAEAwBAAEgAAAAAAAAAAAAAAAAAAAEABAAgIEAAABAA.YAAAAAAAAAAA.ILktB9G5cSSFBYCJVYbtEYAQTwFhg4oAhAgABEwAATBoAoIwGFGAoIEiAICACAAAAIARAIAEECAAAQAAAIIABAAAMAEAAIAACIAAACAABAgAACEAIAAggWAAAAEBEAFQAgQAAQBIACFAAAgABAUABAAAAAACAAQAAACAgQAAAAAAAAAAAkAhAAAAAAAAAABAMAQABIAAAAAAAAAAAAAAAAAAABAAQAICBAAAAQAAAAAAAAAAAAAAAAAAAAgA; notice_preferences=2:; notice_gdpr_prefs=0,1,2:; cmapi_gtm_bl=; cmapi_cookie_privacy=permit 1,2,3; AMCV_48E815355BFE96970A495CD0%40AdobeOrg=281789898%7CMCMID%7C44859851125632937290373504988866174366%7CMCOPTOUT-1770194232s%7CNONE%7CvVersion%7C4.1.0; AMCVS_48E815355BFE96970A495CD0%40AdobeOrg=1; xb=646693; localization=en-us%3Buk%3Bgb; flrbp=1770187037-cfbf3914859af9ef68992c8389162e65e81c86c4; flrbgrp=1770187037-8e700fa7d73b4f2d43550f40513e7c6f507fd20f; flrbgdrp=1770187037-9af21cc74000b5f3f0943243608b4284d5f60ffd; flrbgmrp=1770187037-53f7bfff110731954be6bdfb2f587d59a8305670; flrbrst=1770187037-440e42fcee9b4e8e81ba8bc3eb3d0fc8b62e7083; flrtags=1770187037-7b50035cb956b9216a2f3372f498f7008d8e26a8; flrbrp=1770187037-c0195dc99caa020d4e32b39556131add862f26a0; flrb=34; session_id=2693fb01-87a0-42b1-a426-74642807b534; cookie_session=834645%3A29f2a9722d8bac88553ea1baf7ea11b4; cookie_accid=834645; cookie_epass=29f2a9722d8bac88553ea1baf7ea11b4; sa=1775371036%3A79962317%40N00%3A8fb60f4760b4840f37af3ebc90a8cb57; vp=2075%2C1177%2C1%2C0; flrbfd=1770187037-88a4e436729c9c5551794483fbd9c80e9dac2354; flrbpap=1770187037-18adaacf3a389df4a7bdc05cd471e492c54ef841; liqpw=2075; liqph=672"""


def parse_cookies(cookie_str: str) -> dict[str, str]:
    """Parse a browser-style ``name=value; name=value`` string into a dict.

    Items are stripped of surrounding whitespace so that a cookie string
    wrapped across physical lines (as COOKIES_STR above is) does not yield
    keys with embedded newlines; items without ``=`` are ignored.
    """
    cookies = {}
    for item in cookie_str.split("; "):
        # BUGFIX: strip the item — the triple-quoted COOKIES_STR contains a
        # newline mid-string, which previously produced the bogus cookie key
        # "\neuconsent-v2" instead of "euconsent-v2".
        item = item.strip()
        if "=" in item:
            key, value = item.split("=", 1)
            cookies[key.strip()] = value
    return cookies


def create_session() -> requests.Session:
    """Create a requests session carrying the browser headers and auth cookies."""
    session = requests.Session()
    session.headers.update(HEADERS)
    session.cookies.update(parse_cookies(COOKIES_STR))
    return session


def fetch_page(session: requests.Session, url: str) -> BeautifulSoup:
    """Fetch *url* and return the parsed HTML.

    Raises requests.HTTPError on non-2xx responses and requests.Timeout if
    the server stalls past REQUEST_TIMEOUT (previously this could hang
    indefinitely because no timeout was set).
    """
    response = session.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


def extract_messages_from_list_page(soup: BeautifulSoup) -> list[dict]:
    """Extract message metadata from a sent-mail list page.

    Returns one dict per ``tr.message_row`` that has a parseable message id,
    with keys: message_id, and (when present) subject, url, recipient, date.
    """
    messages = []
    # Find all message rows
    mail_rows = soup.select("tr.message_row")
    for row in mail_rows:
        msg = {}
        # Message ID is encoded in the row's id attribute: "message_row_<id>"
        row_id = row.get("id", "")
        if row_id.startswith("message_row_"):
            msg["message_id"] = row_id.replace("message_row_", "")
        # Subject + per-message link live in the subject cell
        subj_cell = row.select_one("td.subj")
        if subj_cell:
            link = subj_cell.find("a")
            if link:
                msg["subject"] = link.get_text(strip=True)
                # Guard against anchors without href (previously a KeyError)
                href = link.get("href")
                if href:
                    msg["url"] = BASE_URL + href
        # Recipient is in td.fromto
        fromto_cell = row.select_one("td.fromto")
        if fromto_cell:
            msg["recipient"] = fromto_cell.get_text(strip=True)
        # Date is in td.date
        date_cell = row.select_one("td.date")
        if date_cell:
            msg["date"] = date_cell.get_text(strip=True)
        # Rows without an id can't be deduplicated against the DB; skip them
        if "message_id" in msg:
            messages.append(msg)
    return messages


def extract_message_content(soup: BeautifulSoup) -> dict:
    """Extract the full message content from a single-message page.

    Returns a dict with any of: recipient, subject, body, body_html.
    Returns {} when the expected ``.ThinCase`` table structure is absent.
    """
    content = {}
    # The message lives inside a div.ThinCase containing a header/body table
    thin_case = soup.select_one(".ThinCase")
    if not thin_case:
        return content
    table = thin_case.find("table")
    if not table:
        return content
    rows = table.find_all("tr", recursive=False)
    # Expected layout — row 0: "To:", row 1: "Subject:", row 2: body
    # (body row has an empty header cell)
    for row in rows:
        cells = row.find_all("td", recursive=False)
        if len(cells) >= 2:
            header_cell = cells[0]
            value_cell = cells[1]
            header = header_cell.get_text(strip=True).lower()
            if header == "to:":
                # Recipient username
                username = value_cell.select_one(".username")
                if username:
                    content["recipient"] = username.get_text(strip=True)
            elif header == "subject:":
                # Subject is rendered inside an h3
                h3 = value_cell.find("h3")
                if h3:
                    content["subject"] = h3.get_text(strip=True)
            elif header == "":
                # Message body row; drop the embedded delete form before
                # serializing so it doesn't pollute body/body_html
                form = value_cell.find("form")
                if form:
                    form.decompose()
                content["body"] = value_cell.get_text(separator="\n", strip=True)
                content["body_html"] = str(value_cell)
                break  # Body found, stop processing
    return content


def main() -> None:
    """Incrementally back up sent FlickrMail messages into the database."""
    init_db()
    db_session = get_session()
    try:
        # IDs already stored — pagination stops as soon as we hit one of these
        existing_ids = {
            r[0] for r in db_session.query(SentMessage.message_id).all()
        }
        print(f"Database has {len(existing_ids)} messages")
        http_session = create_session()

        new_messages: list[dict] = []
        stop_fetching = False
        print("Fetching message list until we reach existing messages...")
        for page in range(1, MAX_SENT_MAIL_PAGES + 1):
            url = SENT_MAIL_URL.format(page=page)
            print(f"  Fetching page {page}...")
            try:
                soup = fetch_page(http_session, url)
                page_messages = extract_messages_from_list_page(soup)
                if not page_messages:
                    print("  No messages found on this page, stopping")
                    break
                page_new_messages = 0
                for msg in page_messages:
                    msg_id = msg.get("message_id")
                    if not msg_id:
                        continue
                    if msg_id in existing_ids:
                        # The list is newest-first, so the first known ID
                        # means everything older is already stored
                        stop_fetching = True
                        break
                    new_messages.append(msg)
                    page_new_messages += 1
                if stop_fetching:
                    print("  Reached messages already in the database, stopping pagination")
                    break
                if page_new_messages == 0:
                    print("  No new messages on this page, stopping pagination")
                    break
                time.sleep(1)  # Be polite to the server
            except Exception as e:
                # Best-effort: log and try the next page rather than abort
                print(f"  Error fetching page {page}: {e}")
                continue

        print(f"Found {len(new_messages)} new messages to download")
        # Download individual messages, committing one at a time so a failure
        # partway through doesn't lose earlier messages
        for i, msg in enumerate(new_messages, 1):
            msg_id = msg["message_id"]
            url = msg.get("url") or MESSAGE_URL.format(message_id=msg_id)
            print(f"  [{i}/{len(new_messages)}] Downloading message {msg_id}...")
            try:
                soup = fetch_page(http_session, url)
                content = extract_message_content(soup)
                # List-page metadata, overridden by the richer message page
                full_msg = {**msg, **content}
                body = full_msg.get("body", "")
                flickr_url, wikipedia_url = extract_urls_from_message(body)
                normalized = normalize_flickr_url(flickr_url) if flickr_url else ""
                creator_profile = creator_profile_from_flickr_url(flickr_url) if flickr_url else ""
                db_session.add(SentMessage(
                    message_id=msg_id,
                    subject=full_msg.get("subject", ""),
                    url=full_msg.get("url", ""),
                    recipient=full_msg.get("recipient", ""),
                    date=full_msg.get("date", ""),
                    body=body,
                    body_html=full_msg.get("body_html", ""),
                    flickr_url=flickr_url,
                    normalized_flickr_url=normalized,
                    wikipedia_url=wikipedia_url,
                    creator_profile_url=creator_profile,
                ))
                db_session.commit()
                time.sleep(1)  # Be polite
            except Exception as e:
                db_session.rollback()
                print(f"  Error downloading message {msg_id}: {e}")
                continue

        total = db_session.query(SentMessage).count()
        print(f"Done! {total} messages in database")
    except Exception:
        db_session.rollback()
        raise
    finally:
        db_session.close()


if __name__ == "__main__":
    main()