- Add SearchResult dataclass with pagination metadata - Update search_flickr() to accept page parameter - Parse total results count from Flickr response - Add Bootstrap pagination controls to template - Display total result count in UI - Update documentation Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
368 lines
10 KiB
Python
Executable file
368 lines
10 KiB
Python
Executable file
#!/usr/bin/python3
|
|
"""Find photos on flickr for Wikipedia articles and contact the photographer."""
|
|
|
|
import collections
|
|
import dataclasses
|
|
import inspect
|
|
import json
|
|
import sys
|
|
import traceback
|
|
import typing
|
|
from urllib.parse import quote, unquote
|
|
|
|
import flask
|
|
import requests
|
|
import werkzeug
|
|
from werkzeug.debug.tbtools import DebugTraceback
|
|
|
|
|
|
app = flask.Flask(__name__)
# Debug mode off: errors are rendered by the custom InternalServerError
# handler below rather than werkzeug's interactive debugger.
app.debug = False

# Host + path prefix used to recognise and build English Wikipedia URLs.
enwiki = "en.wikipedia.org/wiki/"
|
|
|
|
# Browser-like headers for Flickr requests.
# Sent with every outgoing Flickr request so responses match what a desktop
# Chrome browser would receive. NOTE(review): presumably required because
# Flickr alters or blocks responses for non-browser user agents — confirm
# before removing any of these.
BROWSER_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Cache-Control": "max-age=0",
}
|
|
|
|
|
|
# Flickr license codes to human-readable names.
# Keys are the numeric license ids found in Flickr's embedded page JSON;
# unknown codes fall back to "License <n>" via FlickrPhoto.license_name.
FLICKR_LICENSES = {
    0: "All Rights Reserved",
    1: "CC BY-NC-SA",
    2: "CC BY-NC",
    3: "CC BY-NC-ND",
    4: "CC BY",
    5: "CC BY-SA",
    6: "CC BY-ND",
    7: "No known copyright",
    8: "US Government",
    9: "CC0",
    10: "Public Domain",
}


# Number of results Flickr shows per search page; used to derive total_pages.
PHOTOS_PER_PAGE = 25
|
|
|
|
|
|
@dataclasses.dataclass
class FlickrPhoto:
    """A single photo extracted from Flickr search results."""

    id: str
    title: str
    path_alias: str
    owner_nsid: str
    username: str
    realname: str
    license: int
    thumb_url: str
    medium_url: str

    @property
    def flickr_url(self) -> str:
        """URL of this photo's page on Flickr."""
        return "https://flickr.com/photos/" + self.path_alias + "/" + self.id

    @property
    def license_name(self) -> str:
        """Human-readable license name, or "License <n>" for unknown codes."""
        try:
            return FLICKR_LICENSES[self.license]
        except KeyError:
            return f"License {self.license}"
|
|
|
|
|
@dataclasses.dataclass
class SearchResult:
    """Flickr search results with pagination metadata."""

    # Photos parsed from the current results page.
    photos: list[FlickrPhoto]
    # Total matches reported by Flickr (may exceed what is fetchable).
    total_photos: int
    # 1-based page number these photos belong to.
    current_page: int
    # Number of fetchable pages (Flickr caps results at 4000).
    total_pages: int
|
|
|
|
|
|
def is_valid_flickr_image_url(url: str) -> bool:
    """Check if URL is a valid Flickr static image URL.

    Accepts https URLs whose host is ``live.staticflickr.com``,
    ``c1``/``c2.staticflickr.com``, or ``farm<N>.staticflickr.com``.

    The host is isolated up to the first path slash before checking, so
    look-alike domains cannot pass by smuggling ``.staticflickr.com`` into
    the path or query string (e.g. ``https://farm.evil.com/x.staticflickr.com/``
    was previously accepted by the unanchored substring check).

    Args:
        url: Candidate image URL from untrusted query-string input.

    Returns:
        True if the URL points at a recognised Flickr static-image host.
    """
    scheme = "https://"
    if not url.startswith(scheme):
        return False

    # Isolate the host: everything between the scheme and the first "/".
    # A URL without any path cannot be a static image, so reject it.
    path_start = url.find("/", len(scheme))
    if path_start == -1:
        return False
    host = url[len(scheme) : path_start]

    if host in (
        "live.staticflickr.com",
        "c1.staticflickr.com",
        "c2.staticflickr.com",
    ):
        return True

    # farm1.staticflickr.com, farm2.staticflickr.com, etc.
    if host.startswith("farm") and host.endswith(".staticflickr.com"):
        farm_number = host[len("farm") : -len(".staticflickr.com")]
        return farm_number.isdigit()

    return False
|
|
|
|
|
|
def search_flickr(search_term: str, page: int = 1) -> SearchResult:
    """Search Flickr for photos matching the search term.

    Args:
        search_term: Term to search for; wrapped in double quotes so Flickr
            performs an exact-phrase match.
        page: 1-based results page to fetch.

    Returns:
        Parsed photos and pagination metadata for the requested page.

    Raises:
        requests.HTTPError: If Flickr responds with an error status.
        requests.Timeout: If Flickr does not respond within the timeout.
    """
    encoded_term = quote(f'"{search_term}"')
    url = f"https://flickr.com/search/?view_all=1&text={encoded_term}&page={page}"

    # An explicit timeout keeps a hung Flickr request from stalling the
    # worker indefinitely — requests has no default timeout.
    response = requests.get(url, headers=BROWSER_HEADERS, timeout=30)
    response.raise_for_status()

    return parse_flickr_search_results(response.text, page)
|
|
|
|
|
|
def _photo_from_entry(photo_entry: "dict | None") -> "FlickrPhoto | None":
    """Build a FlickrPhoto from one ``_data`` list entry, or None if unusable."""
    # Pagination placeholders appear as None entries in the photo list.
    if photo_entry is None:
        return None

    pd = photo_entry.get("data", {})
    if not pd:
        return None

    # Prefer the "q" thumb and "n" medium renditions, falling back to the
    # "sq"/"m" sizes when they are absent.
    sizes = pd.get("sizes", {}).get("data", {})
    thumb_url = sizes.get("q", sizes.get("sq", {})).get("data", {}).get("url", "")
    medium_url = sizes.get("n", sizes.get("m", {})).get("data", {}).get("url", "")

    # Flickr emits protocol-relative URLs ("//live..."); force https.
    if thumb_url.startswith("//"):
        thumb_url = "https:" + thumb_url
    if medium_url.startswith("//"):
        medium_url = "https:" + medium_url

    return FlickrPhoto(
        id=str(pd.get("id", "")),
        title=pd.get("title", ""),
        path_alias=pd.get("pathAlias", ""),
        owner_nsid=pd.get("ownerNsid", ""),
        username=pd.get("username", ""),
        realname=pd.get("realname", ""),
        license=pd.get("license", 0),
        thumb_url=thumb_url,
        medium_url=medium_url,
    )


def parse_flickr_search_results(html: str, page: int = 1) -> SearchResult:
    """Parse Flickr search results HTML and extract photo data.

    The search page embeds its model data as a ``modelExport: {...}`` JSON
    object inside an inline script. Instead of hand-counting braces to find
    the object's extent, decode it in place with
    ``json.JSONDecoder.raw_decode``, which parses exactly one JSON value
    starting at a given index and ignores trailing page text — the same
    technique flickr_usrename_to_nsid already uses.

    Args:
        html: Raw HTML of a Flickr search results page.
        page: 1-based page number this HTML corresponds to.

    Returns:
        A SearchResult with photos and pagination metadata; an empty
        SearchResult when the page cannot be parsed.
    """
    empty_result = SearchResult(photos=[], total_photos=0, current_page=page, total_pages=0)

    marker = "modelExport:"
    marker_pos = html.find(marker)
    if marker_pos == -1:
        return empty_result

    # Skip whitespace after the marker (raw_decode rejects leading spaces).
    # Bounds-checked so a page truncated right after the marker cannot
    # raise IndexError.
    json_start = marker_pos + len(marker)
    while json_start < len(html) and html[json_start].isspace():
        json_start += 1

    try:
        data, _ = json.JSONDecoder().raw_decode(html, json_start)
    except json.JSONDecodeError:
        return empty_result
    if not isinstance(data, dict):
        return empty_result

    photos_models = data.get("main", {}).get("search-photos-lite-models", [])
    if not photos_models:
        return empty_result

    model_data = photos_models[0].get("data", {})
    photos_container = model_data.get("photos", {}).get("data", {})
    photos_data = photos_container.get("_data", [])
    total_photos = photos_container.get("totalItems", 0)

    # Flickr only exposes the first 4000 results; ceiling-divide the capped
    # total into pages of PHOTOS_PER_PAGE.
    capped_total = min(total_photos, 4000)
    total_pages = -(-capped_total // PHOTOS_PER_PAGE)

    photos = [
        photo
        for entry in photos_data
        if (photo := _photo_from_entry(entry)) is not None
    ]

    return SearchResult(
        photos=photos,
        total_photos=total_photos,
        current_page=page,
        total_pages=total_pages,
    )
|
|
|
|
|
|
@app.errorhandler(werkzeug.exceptions.InternalServerError)
def exception_handler(e: werkzeug.exceptions.InternalServerError) -> tuple[str, int]:
    """Handle exception.

    Renders a custom error page (show_error.html) with a werkzeug-formatted
    traceback, the exception summary, and details of the innermost frame.
    Returns the rendered page with HTTP status 500.
    """
    # The handler runs while the original exception is still being handled,
    # so sys.exc_info() yields the real error, not the wrapping 500.
    exec_type, exc_value, current_traceback = sys.exc_info()
    assert exc_value
    tb = DebugTraceback(exc_value)

    # Werkzeug's debugger helper renders the HTML traceback summary.
    summary = tb.render_traceback_html(include_title=False)
    # NOTE(review): tb._te is a private werkzeug attribute (a
    # TracebackException); this may break across werkzeug upgrades.
    exc_lines = "".join(tb._te.format_exception_only())

    # Innermost frame where the exception was raised, plus the argument
    # names of the function it was raised in, for display in the template.
    last_frame = list(traceback.walk_tb(current_traceback))[-1][0]
    last_frame_args = inspect.getargs(last_frame.f_code)

    return (
        flask.render_template(
            "show_error.html",
            plaintext=tb.render_traceback_text(),
            exception=exc_lines,
            exception_type=tb._te.exc_type.__name__,
            summary=summary,
            last_frame=last_frame,
            last_frame_args=last_frame_args,
        ),
        500,
    )
|
|
|
|
|
|
@app.route("/")
def start() -> str:
    """Start form, Flickr search results, or photographer contact message.

    Behaviour depends on query parameters:
      * no ``enwp``           -> render the empty start form
      * ``enwp`` only         -> search Flickr for the article subject
                                 (paginated via ``page``)
      * ``enwp`` + ``flickr`` -> build the message to send to the
                                 photographer of the chosen photo

    Returns:
        Rendered ``combined.html`` for the relevant mode.

    Raises:
        werkzeug.exceptions.BadRequest: If the ``flickr`` parameter is not a
            flickr.com photo URL.
    """
    enwp = flask.request.args.get("enwp")
    if not enwp:
        return flask.render_template("combined.html")
    enwp = enwp.strip()
    if not enwp:
        return flask.render_template("combined.html")

    # Input may be a full article URL or a bare article title.
    input_is = "url" if enwiki in enwp else "title"

    wikipedia_url: str
    wiki_part1: str
    wiki_part2: str
    if input_is == "url":
        # (renamed from "start" to avoid shadowing this function's name)
        title_start = enwp.find(enwiki) + len(enwiki)
        wiki_part2 = unquote(enwp[title_start:])
        name = wiki_part2
        wiki_part1 = enwp[:title_start]
        wikipedia_url = enwp
    else:
        name = enwp
        wiki_part1 = "https://" + enwiki
        wiki_part2 = name.replace(" ", "_")
        wikipedia_url = wiki_part1 + wiki_part2

    # Drop any disambiguation suffix: "Foo_(musician)" -> "Foo".
    if "_(" in name:
        name = name[: name.find("_(")]
    name = name.replace("_", " ")

    flickr_url = flask.request.args.get("flickr")
    if not flickr_url:
        # No photo chosen yet: search Flickr for candidate photos.
        page = flask.request.args.get("page", 1, type=int)
        page = max(1, page)  # Ensure page is at least 1
        search_result = search_flickr(name, page)
        return flask.render_template(
            "combined.html",
            name=name,
            enwp=enwp,
            search_result=search_result,
        )

    # Accept www-prefixed photo URLs by normalising to the canonical host.
    www_prefix = "https://www.flickr.com/"
    if flickr_url.startswith(www_prefix):
        flickr_url = "https://flickr.com/" + flickr_url[len(www_prefix) :]

    # Strip the "/in/..." context suffix Flickr appends to gallery links.
    if "/in/" in flickr_url:
        flickr_url = flickr_url[: flickr_url.find("/in/")]

    flickr_start = "https://flickr.com/photos/"

    # Reject malformed URLs with a 400 instead of asserting: asserts are
    # stripped under "python -O" and would turn user error into a 500.
    if not flickr_url.startswith(flickr_start):
        flask.abort(400, "flickr parameter must be a flickr.com photo URL")
    flickr_username = flickr_url[
        len(flickr_start) : flickr_url.find("/", len(flickr_start))
    ]

    nsid = flickr_usrename_to_nsid(flickr_username)
    if not nsid:
        flask.abort(500, "could not resolve Flickr username to an NSID")

    # Get optional image URL for display, validate it's from Flickr.
    img_url = flask.request.args.get("img")
    if img_url and not is_valid_flickr_image_url(img_url):
        img_url = None

    msg = flask.render_template(
        "message.jinja",
        flickr_url=flickr_url,
        enwp=enwp,
        wikipedia_url=wikipedia_url,
        name=name,
        wiki_part1=wiki_part1,
        wiki_part2=wiki_part2,
    )

    subject = f"Request to use your photo of {name} on Wikipedia"

    # The message template separates paragraphs with blank lines.
    lines = msg.split("\n\n")

    return flask.render_template(
        "combined.html",
        name=name,
        enwp=enwp,
        flickr_url=flickr_url,
        subject=subject,
        lines=lines,
        nsid=nsid,
        img_url=img_url,
    )
|
|
|
|
|
|
def get_params(line_iter: collections.abc.Iterable[str]) -> str:
    """Find and return params from flickr profile page.

    Scans the given lines for the embedded ``params`` marker and returns
    the remainder of that line starting at its first "{".

    Raises:
        StopIteration: If no line contains the marker.
    """
    look_for = 'params: {"isEditingTestimonial":false,'
    for line in line_iter:
        if look_for in line:
            return line[line.find("{") :]
    raise StopIteration
|
|
|
|
|
|
def flickr_usrename_to_nsid(username: str) -> str:
    """Get NSID from flickr username.

    Fetches the user's public profile page and extracts the ``nsid`` field
    from the embedded ``params`` JSON object.

    NOTE(review): the function name contains a typo ("usrename") but is kept
    unchanged for compatibility with existing callers.

    Raises:
        requests.HTTPError: If the profile page request fails.
        requests.Timeout: If Flickr does not respond within the timeout.
        StopIteration: If the params marker is not found in the page.
        KeyError: If the params JSON has no "nsid" entry.
    """
    url = f"https://www.flickr.com/people/{username}/"
    # Explicit timeout: requests has no default, and a hung profile fetch
    # would otherwise stall the worker indefinitely.
    r = requests.get(url, headers=BROWSER_HEADERS, timeout=30)
    # Fail loudly on error pages instead of parsing their HTML, consistent
    # with search_flickr.
    r.raise_for_status()
    params_str = get_params(r.text.splitlines())
    # raw_decode tolerates trailing page content after the JSON object.
    params, _ = json.JSONDecoder().raw_decode(params_str)
    return typing.cast(str, params["nsid"])
|
|
|
|
|
|
if __name__ == "__main__":
    # Development entry point: bind to all interfaces so the dev server is
    # reachable from other hosts (e.g. inside a container). A production
    # deployment should use a proper WSGI server instead.
    app.run(host="0.0.0.0")
|