#!/usr/bin/env python3
"""Download sent FlickrMail messages for backup."""

import json
import time
from pathlib import Path

import requests
from bs4 import BeautifulSoup

from flickr_mail.database import init_db, get_session
from flickr_mail.models import SentMessage
from flickr_mail.url_utils import (
    creator_profile_from_flickr_url,
    extract_urls_from_message,
    normalize_flickr_url,
)

BASE_URL = "https://www.flickr.com"
SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}"
MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}"
MAX_SENT_MAIL_PAGES = 29  # Fallback upper bound if we need to backfill everything

CONFIG_FILE = Path(__file__).with_name("download_sent_mail.local.json")
EXAMPLE_CONFIG_FILE = Path(__file__).with_name("download_sent_mail.example.json")

# Seconds to wait for any single HTTP request; without this, a stalled
# connection would hang the backup indefinitely (requests has no default).
REQUEST_TIMEOUT = 30

# Browser-like headers so the scrape looks like a normal Firefox session.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-GB,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Priority": "u=0, i",
}


def load_cookie_string() -> str:
    """Load Flickr cookies string from local JSON config.

    Returns:
        The non-empty ``cookies_str`` value from CONFIG_FILE.

    Raises:
        RuntimeError: if the config file is missing, contains invalid JSON,
            or has an empty/absent ``cookies_str`` entry.
    """
    if not CONFIG_FILE.exists():
        raise RuntimeError(
            f"Missing config file: {CONFIG_FILE}. "
            f"Copy {EXAMPLE_CONFIG_FILE.name} to {CONFIG_FILE.name} and set cookies_str."
        )
    try:
        data = json.loads(CONFIG_FILE.read_text())
    except json.JSONDecodeError as exc:
        raise RuntimeError(f"Invalid JSON in {CONFIG_FILE}: {exc}") from exc
    cookie_str = data.get("cookies_str", "").strip()
    if not cookie_str:
        raise RuntimeError(f"{CONFIG_FILE} must contain a non-empty 'cookies_str' value")
    return cookie_str


def parse_cookies(cookie_str: str) -> dict[str, str]:
    """Parse a browser-style cookie header string into a dict.

    Splits on ";" and strips each fragment, so both "a=1; b=2" and
    "a=1;b=2" (and trailing semicolons) parse correctly. The previous
    strict split on "; " silently merged pairs when the space was absent.
    Fragments without "=" are ignored.
    """
    cookies: dict[str, str] = {}
    for item in cookie_str.split(";"):
        item = item.strip()
        if "=" in item:
            key, value = item.split("=", 1)
            cookies[key] = value
    return cookies


def create_session() -> requests.Session:
    """Create a requests session with browser headers and auth cookies."""
    session = requests.Session()
    session.headers.update(HEADERS)
    session.cookies.update(parse_cookies(load_cookie_string()))
    return session


def fetch_page(session: requests.Session, url: str) -> BeautifulSoup:
    """Fetch a page and return parsed HTML.

    Raises:
        requests.HTTPError: on non-2xx responses.
        requests.Timeout: if the server does not respond within REQUEST_TIMEOUT.
    """
    # timeout is mandatory here: requests waits forever by default.
    response = session.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


def extract_messages_from_list_page(soup: BeautifulSoup) -> list[dict]:
    """Extract message metadata from a sent mail list page.

    Each returned dict has at least ``message_id`` and may also contain
    ``subject``, ``url``, ``recipient``, and ``date``. Rows whose id does
    not yield a message_id are skipped.
    """
    messages = []
    # Each sent message is one <tr class="message_row"> in the listing table.
    mail_rows = soup.select("tr.message_row")
    for row in mail_rows:
        msg = {}
        # The numeric message ID is encoded in the row's id attribute,
        # e.g. id="message_row_12345".
        row_id = row.get("id", "")
        if row_id.startswith("message_row_"):
            msg["message_id"] = row_id.replace("message_row_", "")
        # Subject cell carries both the title text and the relative link.
        subj_cell = row.select_one("td.subj")
        if subj_cell:
            link = subj_cell.find("a")
            if link:
                msg["subject"] = link.get_text(strip=True)
                msg["url"] = BASE_URL + link["href"]
        # Recipient is in td.fromto
        fromto_cell = row.select_one("td.fromto")
        if fromto_cell:
            msg["recipient"] = fromto_cell.get_text(strip=True)
        # Date is in td.date
        date_cell = row.select_one("td.date")
        if date_cell:
            msg["date"] = date_cell.get_text(strip=True)
        # Without a message_id we cannot dedupe or fetch the message, so drop it.
        if "message_id" in msg:
            messages.append(msg)
    return messages


def extract_message_content(soup: BeautifulSoup) -> dict:
    """Extract full message content from a message page.

    Returns a dict that may contain ``recipient``, ``subject``, ``body``
    (plain text), and ``body_html``; empty if the page layout is not
    recognized.
    """
    content = {}
    # The message lives inside a div.ThinCase wrapper.
    thin_case = soup.select_one(".ThinCase")
    if not thin_case:
        return content
    # Inside it, a table holds one row per field.
    table = thin_case.find("table")
    if not table:
        return content
    rows = table.find_all("tr", recursive=False)
    # Expected layout: row with "To:", row with "Subject:", then a row with
    # an empty header cell containing the message body.
    for row in rows:
        cells = row.find_all("td", recursive=False)
        if len(cells) >= 2:
            header_cell = cells[0]
            value_cell = cells[1]
            header = header_cell.get_text(strip=True).lower()
            if header == "to:":
                # Recipient username sits in a .username element.
                username = value_cell.select_one(".username")
                if username:
                    content["recipient"] = username.get_text(strip=True)
            elif header == "subject:":
                # Subject is rendered as an <h3>.
                h3 = value_cell.find("h3")
                if h3:
                    content["subject"] = h3.get_text(strip=True)
            elif header == "":
                # This is the message body row (empty header cell).
                # Strip the embedded delete form before capturing the HTML,
                # otherwise it pollutes both body and body_html.
                form = value_cell.find("form")
                if form:
                    form.decompose()
                content["body"] = value_cell.get_text(separator="\n", strip=True)
                content["body_html"] = str(value_cell)
                break  # Body found, stop processing
    return content


def main() -> None:
    """Main entry point.

    Paginates the sent-mail listing newest-first until a message already in
    the database is reached, then downloads and stores each new message,
    committing one row at a time so a mid-run failure loses at most one
    message.
    """
    init_db()
    db_session = get_session()
    try:
        # IDs already backed up; used to stop pagination early.
        existing_ids = {
            r[0] for r in db_session.query(SentMessage.message_id).all()
        }
        print(f"Database has {len(existing_ids)} messages")

        http_session = create_session()

        new_messages: list[dict] = []
        stop_fetching = False

        print("Fetching message list until we reach existing messages...")
        for page in range(1, MAX_SENT_MAIL_PAGES + 1):
            url = SENT_MAIL_URL.format(page=page)
            print(f" Fetching page {page}...")
            try:
                soup = fetch_page(http_session, url)
                page_messages = extract_messages_from_list_page(soup)
                if not page_messages:
                    print(" No messages found on this page, stopping")
                    break
                page_new_messages = 0
                for msg in page_messages:
                    msg_id = msg.get("message_id")
                    if not msg_id:
                        continue
                    # Listing is newest-first, so the first known ID means
                    # everything after it is already stored.
                    if msg_id in existing_ids:
                        stop_fetching = True
                        break
                    new_messages.append(msg)
                    page_new_messages += 1
                if stop_fetching:
                    print(" Reached messages already in the database, stopping pagination")
                    break
                if page_new_messages == 0:
                    print(" No new messages on this page, stopping pagination")
                    break
                time.sleep(1)  # Be polite to the server
            except Exception as e:
                # A single bad page should not abort the whole backfill.
                print(f" Error fetching page {page}: {e}")
                continue

        print(f"Found {len(new_messages)} new messages to download")

        # Download individual messages
        for i, msg in enumerate(new_messages, 1):
            msg_id = msg["message_id"]
            # Prefer the URL scraped from the listing; fall back to the
            # canonical pattern if it was missing.
            url = msg.get("url") or MESSAGE_URL.format(message_id=msg_id)
            print(f" [{i}/{len(new_messages)}] Downloading message {msg_id}...")
            try:
                soup = fetch_page(http_session, url)
                content = extract_message_content(soup)
                # Message-page fields (body, etc.) win over listing metadata.
                full_msg = {**msg, **content}
                body = full_msg.get("body", "")
                flickr_url, wikipedia_url = extract_urls_from_message(body)
                normalized = normalize_flickr_url(flickr_url) if flickr_url else ""
                creator_profile = creator_profile_from_flickr_url(flickr_url) if flickr_url else ""
                db_session.add(SentMessage(
                    message_id=msg_id,
                    subject=full_msg.get("subject", ""),
                    url=full_msg.get("url", ""),
                    recipient=full_msg.get("recipient", ""),
                    date=full_msg.get("date", ""),
                    body=body,
                    body_html=full_msg.get("body_html", ""),
                    flickr_url=flickr_url,
                    normalized_flickr_url=normalized,
                    wikipedia_url=wikipedia_url,
                    creator_profile_url=creator_profile,
                ))
                # Commit per message so progress survives a later failure.
                db_session.commit()
                time.sleep(1)  # Be polite
            except Exception as e:
                db_session.rollback()
                print(f" Error downloading message {msg_id}: {e}")
                continue

        total = db_session.query(SentMessage).count()
        print(f"Done! {total} messages in database")
    except Exception:
        db_session.rollback()
        raise
    finally:
        db_session.close()


if __name__ == "__main__":
    main()