266 lines
11 KiB
Python
Executable file
266 lines
11 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""Download sent FlickrMail messages for backup."""
|
|
|
|
import time
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
from flickr_mail.database import init_db, get_session
|
|
from flickr_mail.models import SentMessage
|
|
from flickr_mail.url_utils import (
|
|
creator_profile_from_flickr_url,
|
|
extract_urls_from_message,
|
|
normalize_flickr_url,
|
|
)
|
|
|
|
# Flickr endpoints.  The doubled braces survive the f-string so that
# {page} / {message_id} remain placeholders filled in later via str.format.
BASE_URL = "https://www.flickr.com"
SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}"
MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}"
MAX_SENT_MAIL_PAGES = 29  # Fallback upper bound if we need to backfill everything

# Browser-imitating request headers so Flickr serves the regular HTML pages
# rather than treating us as an unknown client.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-GB,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Priority": "u=0, i",
}

# NOTE(review): hard-coded logged-in session cookies.  These are credentials:
# they will expire and should not live in source control — consider loading
# them from an environment variable or a git-ignored config file instead.
# Also note the triple-quoted literal spans two source lines, so the string
# contains an embedded newline.
COOKIES_STR = """ccc=%7B%22needsConsent%22%3Atrue%2C%22managed%22%3A0%2C%22changed%22%3A0%2C%22info%22%3A%7B%22cookieBlock%22%3A%7B%22level%22%3A2%2C%22blockRan%22%3A1%7D%7D%7D; _sp_ses.df80=*; _sp_id.df80=968931de-089d-4576-b729-6662c2c13a65.1770187027.1.1770187129..adf2374b-b85c-4899-afb7-63c2203d0c44..9422de57-9cdf-49c9-ac54-183eaa1ec457.1770187027101.24; TAsessionID=7f373c97-e9f8-46cb-bc1a-cb4f164ce46b|NEW; notice_behavior=expressed,eu; usprivacy=1---; acstring=3~550.1942.3126.3005.3077.1329.196.1725.1092; euconsent-v2=CQfGXgAQfGXgAAvACDENCQFsAP_gAEPgAAAALktB9G5cSSFBYCJVYbtEYAQDwFhg4oAhAgABEwAATBoAoIwGBGAoIAiAICACAAAAIARAIAEECAAAQAAAIIABAAAMAEAAIAACIAAACAABAgAACEAIAAggWAAAAEBEAFQAgAAAQBIACFAAAgABAUABAAAAAACAAQAAACAgQAAAAAAAAAAAkAhAAAAAAAAAABAMAAABIAAAAAAAAAAAAAAAAAAABAAAAICBAAAAQAAAAAAAAAAAAAAAAAAAAgqY0H0blxJIUFgIFVhu0QgBBPAWADigCEAAAEDAABMGgCgjAIUYCAgSIAgIAAAAAAgBEAgAQAIAABAAAAAgAEAAAwAQAAgAAAAAAAAAAECAAAAQAgACCBYAAAAQEQAVACBAABAEgAIUAAAAAEBQAEAAAAAAIABAAAAICBAAAAAAAAAAACQCEAAAAAAAAAAEAwBAAEgAAAAAAAAAAAAAAAAAAAEABAAgIEAAABAA.YAAAAAAAAAAA.ILktB9G5cSSFBYCJVYbtEYAQTwFhg4oAhAgABEwAATBoAoIwGFGAoIEiAICACAAAAIARAIAEECAAAQAAAIIABAAAMAEAAIAACIAAACAABAgAACEAIAAggWAAAAEBEAFQAgQAAQBIACFAAAgABAUABAAAAAACAAQAAACAgQAAAAAAAAAAAkAhAAAAAAAAAABAMAQABIAAAAAAAAAAAAAAAAAAABAAQAICBAAAAQAAAAAAAAAAAAAAAAAAAAgA; notice_preferences=2:; notice_gdpr_prefs=0,1,2:; cmapi_gtm_bl=; cmapi_cookie_privacy=permit 1,2,3; AMCV_48E815355BFE96970A495CD0%40AdobeOrg=281789898%7CMCMID%7C44859851125632937290373504988866174366%7CMCOPTOUT-1770194232s%7CNONE%7CvVersion%7C4.1.0; AMCVS_48E815355BFE96970A495CD0%40AdobeOrg=1; xb=646693; localization=en-us%3Buk%3Bgb; flrbp=1770187037-cfbf3914859af9ef68992c8389162e65e81c86c4; flrbgrp=1770187037-8e700fa7d73b4f2d43550f40513e7c6f507fd20f; flrbgdrp=1770187037-9af21cc74000b5f3f0943243608b4284d5f60ffd; flrbgmrp=1770187037-53f7bfff110731954be6bdfb2f587d59a8305670; flrbrst=1770187037-440e42fcee9b4e8e81ba8bc3eb3d0fc8b62e7083; 
flrtags=1770187037-7b50035cb956b9216a2f3372f498f7008d8e26a8; flrbrp=1770187037-c0195dc99caa020d4e32b39556131add862f26a0; flrb=34; session_id=2693fb01-87a0-42b1-a426-74642807b534; cookie_session=834645%3A29f2a9722d8bac88553ea1baf7ea11b4; cookie_accid=834645; cookie_epass=29f2a9722d8bac88553ea1baf7ea11b4; sa=1775371036%3A79962317%40N00%3A8fb60f4760b4840f37af3ebc90a8cb57; vp=2075%2C1177%2C1%2C0; flrbfd=1770187037-88a4e436729c9c5551794483fbd9c80e9dac2354; flrbpap=1770187037-18adaacf3a389df4a7bdc05cd471e492c54ef841; liqpw=2075; liqph=672"""
|
|
|
|
|
|
def parse_cookies(cookie_str: str) -> dict[str, str]:
    """Parse a browser-style cookie header string into a dictionary.

    Splits on "; " and then on the first "=" of each pair.  Each item is
    stripped of surrounding whitespace first, so cookie strings that span
    multiple lines (such as the module-level COOKIES_STR, whose triple-quoted
    literal contains an embedded newline) do not yield keys with leading
    whitespace like "\\nflrtags".

    Args:
        cookie_str: Raw cookie string, e.g. "a=1; b=2".

    Returns:
        Mapping of cookie name to value.  Items without an "=" are skipped.
    """
    cookies: dict[str, str] = {}
    for item in cookie_str.split("; "):
        # Tolerate embedded newlines / stray whitespace around each pair.
        item = item.strip()
        if "=" in item:
            key, value = item.split("=", 1)
            cookies[key] = value
    return cookies
|
|
|
|
|
|
def create_session() -> requests.Session:
    """Create an authenticated HTTP session.

    The session carries the browser-imitating HEADERS and the cookies parsed
    from COOKIES_STR, so subsequent requests act as the logged-in user.
    """
    http = requests.Session()
    http.headers.update(HEADERS)
    http.cookies.update(parse_cookies(COOKIES_STR))
    return http
|
|
|
|
|
|
def fetch_page(session: requests.Session, url: str) -> BeautifulSoup:
    """GET *url* through *session* and return the parsed HTML document.

    Raises:
        requests.HTTPError: if the server responds with an error status.
    """
    resp = session.get(url)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")
|
|
|
|
|
|
def extract_messages_from_list_page(soup: BeautifulSoup) -> list[dict]:
    """Extract message metadata from one sent-mail list page.

    Each sent message is rendered as
    ``<tr class="message_row sent" id="message_row_XXXX">``.  Rows whose id
    does not yield a message id are dropped.

    Returns:
        One dict per message containing (when present in the markup) the
        keys "message_id", "subject", "url", "recipient" and "date".
    """
    row_id_prefix = "message_row_"
    messages: list[dict] = []

    for row in soup.select("tr.message_row"):
        info: dict = {}

        # The numeric message id lives in the row's id attribute.
        row_id = row.get("id", "")
        if row_id.startswith(row_id_prefix):
            info["message_id"] = row_id.replace(row_id_prefix, "")

        # Subject and per-message URL come from the link in td.subj.
        subject_cell = row.select_one("td.subj")
        link = subject_cell.find("a") if subject_cell else None
        if link:
            info["subject"] = link.get_text(strip=True)
            info["url"] = BASE_URL + link["href"]

        # Recipient name is the text of td.fromto.
        recipient_cell = row.select_one("td.fromto")
        if recipient_cell:
            info["recipient"] = recipient_cell.get_text(strip=True)

        # Human-readable send date is the text of td.date.
        date_cell = row.select_one("td.date")
        if date_cell:
            info["date"] = date_cell.get_text(strip=True)

        # Only keep rows we could actually identify.
        if "message_id" in info:
            messages.append(info)

    return messages
|
|
|
|
|
|
def extract_message_content(soup: BeautifulSoup) -> dict:
    """Extract recipient, subject and body from a single message page.

    The message is laid out as a table inside the ``.ThinCase`` div, with
    header/value cell pairs per row: "To:", "Subject:", and finally a row
    with an empty header cell that holds the message body.

    Returns:
        Dict with any of "recipient", "subject", "body" and "body_html"
        that could be found; an empty dict when the expected markup is
        missing entirely.
    """
    result: dict = {}

    # Bail out early if the page doesn't have the expected structure.
    container = soup.select_one(".ThinCase")
    if not container:
        return result
    table = container.find("table")
    if not table:
        return result

    for row in table.find_all("tr", recursive=False):
        cells = row.find_all("td", recursive=False)
        if len(cells) < 2:
            continue

        label = cells[0].get_text(strip=True).lower()
        value_cell = cells[1]

        if label == "to:":
            # Recipient username is tagged with the .username class.
            username = value_cell.select_one(".username")
            if username:
                result["recipient"] = username.get_text(strip=True)
        elif label == "subject:":
            # Subject is rendered inside an <h3>.
            heading = value_cell.find("h3")
            if heading:
                result["subject"] = heading.get_text(strip=True)
        elif label == "":
            # Body row (empty header cell).  Drop the delete form so it
            # doesn't pollute the captured text/HTML.
            delete_form = value_cell.find("form")
            if delete_form:
                delete_form.decompose()

            result["body"] = value_cell.get_text(separator="\n", strip=True)
            result["body_html"] = str(value_cell)
            break  # Body found, nothing further to collect.

    return result
|
|
|
|
|
|
def main() -> None:
    """Main entry point.

    Incrementally backs up sent FlickrMail messages into the local database:
    walks the sent-mail listing page by page until it hits a message id that
    is already stored (assumes the listing is ordered newest-first — TODO
    confirm against the live site), then downloads and persists each new
    message individually, committing one row at a time so a mid-run failure
    loses at most the current message.
    """
    init_db()
    db_session = get_session()

    try:
        # IDs already stored — used to detect where the previous run stopped.
        existing_ids = {
            r[0] for r in db_session.query(SentMessage.message_id).all()
        }
        print(f"Database has {len(existing_ids)} messages")

        http_session = create_session()

        new_messages: list[dict] = []
        stop_fetching = False

        # Phase 1: collect metadata for messages not yet in the database.
        print("Fetching message list until we reach existing messages...")
        for page in range(1, MAX_SENT_MAIL_PAGES + 1):
            url = SENT_MAIL_URL.format(page=page)
            print(f" Fetching page {page}...")

            try:
                soup = fetch_page(http_session, url)
                page_messages = extract_messages_from_list_page(soup)

                # An empty page means we've run past the last listing page.
                if not page_messages:
                    print(" No messages found on this page, stopping")
                    break

                page_new_messages = 0
                for msg in page_messages:
                    msg_id = msg.get("message_id")
                    if not msg_id:
                        continue
                    # First already-known id ends the whole pagination walk.
                    if msg_id in existing_ids:
                        stop_fetching = True
                        break

                    new_messages.append(msg)
                    page_new_messages += 1

                if stop_fetching:
                    print(" Reached messages already in the database, stopping pagination")
                    break

                if page_new_messages == 0:
                    print(" No new messages on this page, stopping pagination")
                    break

                time.sleep(1) # Be polite to the server

            except Exception as e:
                # Best-effort: a single bad page shouldn't abort the backup.
                print(f" Error fetching page {page}: {e}")
                continue

        print(f"Found {len(new_messages)} new messages to download")

        # Phase 2: download individual messages and persist them one by one.
        for i, msg in enumerate(new_messages, 1):
            msg_id = msg["message_id"]
            # Fall back to the canonical URL if the listing link was missing.
            url = msg.get("url") or MESSAGE_URL.format(message_id=msg_id)

            print(f" [{i}/{len(new_messages)}] Downloading message {msg_id}...")

            try:
                soup = fetch_page(http_session, url)
                content = extract_message_content(soup)

                # Merge with metadata (page content wins over listing data).
                full_msg = {**msg, **content}

                # Derive the Flickr / Wikipedia links referenced in the body.
                body = full_msg.get("body", "")
                flickr_url, wikipedia_url = extract_urls_from_message(body)
                normalized = normalize_flickr_url(flickr_url) if flickr_url else ""
                creator_profile = creator_profile_from_flickr_url(flickr_url) if flickr_url else ""

                db_session.add(SentMessage(
                    message_id=msg_id,
                    subject=full_msg.get("subject", ""),
                    url=full_msg.get("url", ""),
                    recipient=full_msg.get("recipient", ""),
                    date=full_msg.get("date", ""),
                    body=body,
                    body_html=full_msg.get("body_html", ""),
                    flickr_url=flickr_url,
                    normalized_flickr_url=normalized,
                    wikipedia_url=wikipedia_url,
                    creator_profile_url=creator_profile,
                ))
                # Commit per message so a later failure can't roll back
                # already-downloaded rows.
                db_session.commit()

                time.sleep(1) # Be polite

            except Exception as e:
                # Undo the partial add for this message only, then move on.
                db_session.rollback()
                print(f" Error downloading message {msg_id}: {e}")
                continue

        total = db_session.query(SentMessage).count()
        print(f"Done! {total} messages in database")

    except Exception:
        # Leave the session clean before propagating the failure.
        db_session.rollback()
        raise
    finally:
        db_session.close()
|
|
|
|
|
|
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|