#!/usr/bin/python3
"""Find photos on flickr for Wikipedia articles and contact the photographer."""

import collections
import collections.abc
import dataclasses
import inspect
import json
import sys
import traceback
import typing
from urllib.parse import quote, unquote

import flask
import requests
import werkzeug
from werkzeug.debug.tbtools import DebugTraceback

app = flask.Flask(__name__)
app.debug = False

enwiki = "en.wikipedia.org/wiki/"

# Seconds to wait for Flickr before giving up; without a timeout a stalled
# connection would block the request handler indefinitely.
REQUEST_TIMEOUT = 10

# Browser-like headers for Flickr requests
# NOTE(review): "br" is advertised in Accept-Encoding; confirm the deployment
# can decode brotli responses, otherwise response.text may be unreadable.
BROWSER_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Cache-Control": "max-age=0",
}

# Flickr license codes to human-readable names
FLICKR_LICENSES = {
    0: "All Rights Reserved",
    1: "CC BY-NC-SA",
    2: "CC BY-NC",
    3: "CC BY-NC-ND",
    4: "CC BY",
    5: "CC BY-SA",
    6: "CC BY-ND",
    7: "No known copyright",
    8: "US Government",
    9: "CC0",
    10: "Public Domain",
}


@dataclasses.dataclass
class FlickrPhoto:
    """Represents a Flickr photo from search results."""

    id: str
    title: str
    path_alias: str
    owner_nsid: str
    username: str
    realname: str
    license: int
    thumb_url: str
    medium_url: str

    @property
    def flickr_url(self) -> str:
        """URL to the photo page on Flickr.

        Falls back to the owner's NSID when no path alias is set, so the URL
        never contains an empty user segment.
        """
        owner = self.path_alias or self.owner_nsid
        return f"https://flickr.com/photos/{owner}/{self.id}"

    @property
    def license_name(self) -> str:
        """Human-readable license name."""
        return FLICKR_LICENSES.get(self.license, f"License {self.license}")


def is_valid_flickr_image_url(url: str) -> bool:
    """Check if URL is a valid Flickr static image URL.

    Only ``https://<host>/...`` URLs are accepted, where ``<host>`` is one of
    the fixed static hosts or ``farm<digits>.staticflickr.com``. The host is
    compared exactly, so a URL that merely contains ".staticflickr.com/"
    somewhere in its path is rejected.
    """
    scheme = "https://"
    if not url.startswith(scheme):
        return False

    # Everything up to the first "/" is the host; a URL with no path
    # separator after the host is not an image URL.
    host, slash, _path = url[len(scheme) :].partition("/")
    if not slash:
        return False

    fixed_hosts = (
        "live.staticflickr.com",
        "c1.staticflickr.com",
        "c2.staticflickr.com",
    )
    if host in fixed_hosts:
        return True

    # farm1.staticflickr.com, farm2.staticflickr.com, etc.
    farm_prefix = "farm"
    farm_suffix = ".staticflickr.com"
    if host.startswith(farm_prefix) and host.endswith(farm_suffix):
        farm_number = host[len(farm_prefix) : -len(farm_suffix)]
        return farm_number.isascii() and farm_number.isdigit()
    return False


def search_flickr(search_term: str) -> list[FlickrPhoto]:
    """Search Flickr for photos matching the search term.

    The term is wrapped in double quotes before being URL-encoded into the
    search URL. Raises ``requests.HTTPError`` on a non-success response and
    ``requests.Timeout`` if Flickr does not answer within REQUEST_TIMEOUT.
    """
    encoded_term = quote(f'"{search_term}"')
    url = f"https://flickr.com/search/?view_all=1&text={encoded_term}"

    response = requests.get(url, headers=BROWSER_HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    return parse_flickr_search_results(response.text)


def parse_flickr_search_results(html: str) -> list[FlickrPhoto]:
    """Parse Flickr search results HTML and extract photo data.

    Looks for the JSON object that follows the ``modelExport:`` marker and
    builds one FlickrPhoto per entry found under
    ``main -> search-photos-lite-models[0] -> data -> photos -> data -> _data``.
    Returns an empty list when the marker is missing, the JSON cannot be
    decoded, or no photo models are present.
    """
    marker = "modelExport:"
    marker_pos = html.find(marker)
    if marker_pos == -1:
        return []

    # Skip whitespace between the marker and the JSON value; the bounds check
    # covers a page that ends right after the marker.
    start = marker_pos + len(marker)
    while start < len(html) and html[start].isspace():
        start += 1

    # raw_decode parses one JSON value starting at `start` and ignores the
    # JavaScript that follows it in the page.
    try:
        data, _end = json.JSONDecoder().raw_decode(html, start)
    except json.JSONDecodeError:
        return []
    if not isinstance(data, dict):
        return []

    # Extract photos from the parsed data
    main = data.get("main", {})
    photos_models = main.get("search-photos-lite-models", [])
    if not photos_models:
        return []

    photos_data = (
        photos_models[0]
        .get("data", {})
        .get("photos", {})
        .get("data", {})
        .get("_data", [])
    )

    photos: list[FlickrPhoto] = []
    for photo_entry in photos_data:
        # Skip placeholder entries (e.g. null) that carry no photo data.
        if not isinstance(photo_entry, dict):
            continue

        photo_info = photo_entry.get("data", {})
        sizes = photo_info.get("sizes", {}).get("data", {})

        # Prefer size "q" for the thumbnail and "n" for the medium image,
        # falling back to "sq" and "m" respectively.
        thumb_data = sizes.get("q", sizes.get("sq", {})).get("data", {})
        medium_data = sizes.get("n", sizes.get("m", {})).get("data", {})

        thumb_url = thumb_data.get("url", "")
        medium_url = medium_data.get("url", "")

        # Ensure URLs have protocol
        if thumb_url.startswith("//"):
            thumb_url = "https:" + thumb_url
        if medium_url.startswith("//"):
            medium_url = "https:" + medium_url

        photos.append(
            FlickrPhoto(
                id=str(photo_info.get("id", "")),
                # `or ""` turns a JSON null into an empty string so the
                # str-typed fields never hold None.
                title=photo_info.get("title") or "",
                path_alias=photo_info.get("pathAlias") or "",
                owner_nsid=photo_info.get("ownerNsid") or "",
                username=photo_info.get("username") or "",
                realname=photo_info.get("realname") or "",
                license=photo_info.get("license", 0),
                thumb_url=thumb_url,
                medium_url=medium_url,
            )
        )

    return photos


@app.errorhandler(werkzeug.exceptions.InternalServerError)
def exception_handler(e: werkzeug.exceptions.InternalServerError) -> tuple[str, int]:
    """Handle exception.

    Renders ``show_error.html`` with the traceback of the exception currently
    being handled (taken from ``sys.exc_info()``) and returns it with a 500
    status code. The ``e`` argument itself is not used.
    """
    _exc_type, exc_value, current_traceback = sys.exc_info()
    assert exc_value

    tb = DebugTraceback(exc_value)
    summary = tb.render_traceback_html(include_title=False)
    # NOTE(review): `_te` is a private attribute of DebugTraceback; this may
    # break on a werkzeug upgrade.
    exc_lines = "".join(tb._te.format_exception_only())

    # Innermost frame of the traceback, i.e. where the exception was raised.
    last_frame = list(traceback.walk_tb(current_traceback))[-1][0]
    last_frame_args = inspect.getargs(last_frame.f_code)

    return (
        flask.render_template(
            "show_error.html",
            plaintext=tb.render_traceback_text(),
            exception=exc_lines,
            exception_type=tb._te.exc_type.__name__,
            summary=summary,
            last_frame=last_frame,
            last_frame_args=last_frame_args,
        ),
        500,
    )


@app.route("/")
def start() -> str:
    """Start form.

    Query parameters:
        enwp: a Wikipedia article title or a URL containing
            "en.wikipedia.org/wiki/". When missing or blank, the empty form
            is rendered.
        flickr: URL of a Flickr photo page. When missing, Flickr is searched
            for the article name and the results are shown.
        img: optional image URL to display; dropped unless it passes
            is_valid_flickr_image_url.

    Responds with HTTP 400 when the flickr URL does not start with
    "https://flickr.com/photos/" or has no username after that prefix.
    """
    enwp = flask.request.args.get("enwp")
    if not enwp:
        return flask.render_template("combined.html")
    enwp = enwp.strip()
    if not enwp:
        return flask.render_template("combined.html")

    wikipedia_url: str
    wiki_part1: str
    wiki_part2: str
    if enwiki in enwp:
        # Input is a URL: split it right after ".../wiki/".
        title_start = enwp.find(enwiki) + len(enwiki)
        wiki_part2 = unquote(enwp[title_start:])
        name = wiki_part2
        wiki_part1 = enwp[:title_start]
        wikipedia_url = enwp
    else:
        # Input is an article title: build the URL from it.
        name = enwp
        wiki_part1 = "https://" + enwiki
        wiki_part2 = name.replace(" ", "_")
        wikipedia_url = wiki_part1 + wiki_part2

    # Drop a trailing "_(...)" qualifier and turn underscores into spaces.
    if "_(" in name:
        name = name[: name.find("_(")]
    name = name.replace("_", " ")

    flickr_url = flask.request.args.get("flickr")
    if not flickr_url:
        # Search Flickr for photos
        photos = search_flickr(name)
        return flask.render_template(
            "combined.html",
            name=name,
            enwp=enwp,
            photos=photos,
        )

    # Strip the "/in/..." context suffix from the photo URL.
    if "/in/" in flickr_url:
        flickr_url = flickr_url[: flickr_url.find("/in/")]

    flickr_start = "https://flickr.com/photos/"
    if not flickr_url.startswith(flickr_start):
        flask.abort(400, f"flickr URL must start with {flickr_start}")

    # partition() keeps the whole username even when no "/" follows it;
    # slicing up to find("/") would cut off the last character in that case.
    flickr_username = flickr_url[len(flickr_start) :].partition("/")[0]
    if not flickr_username:
        flask.abort(400, "flickr URL does not contain a username")

    nsid = flickr_usrename_to_nsid(flickr_username)
    assert nsid
    print(nsid)

    # Get optional image URL for display, validate it's from Flickr
    img_url = flask.request.args.get("img")
    if img_url and not is_valid_flickr_image_url(img_url):
        img_url = None

    msg = flask.render_template(
        "message.jinja",
        flickr_url=flickr_url,
        enwp=enwp,
        wikipedia_url=wikipedia_url,
        name=name,
        wiki_part1=wiki_part1,
        wiki_part2=wiki_part2,
    )

    subject = f"Request to use your photo of {name} on Wikipedia"

    # The message template is split into paragraphs on blank lines.
    lines = msg.split("\n\n")

    return flask.render_template(
        "combined.html",
        name=name,
        enwp=enwp,
        flickr_url=flickr_url,
        subject=subject,
        lines=lines,
        nsid=nsid,
        img_url=img_url,
    )


def get_params(line_iter: collections.abc.Iterable[str]) -> str:
    """Find and return params from flickr profile page.

    Returns the text of the first matching line, starting at the "{" that
    opens the params object and running to the end of that line.

    Raises:
        ValueError: if no line contains the params marker.
    """
    look_for = 'params: {"isEditingTestimonial":false,'
    # Offset from the start of the marker to its opening brace.
    brace_offset = look_for.index("{")
    for line in line_iter:
        marker_pos = line.find(look_for)
        if marker_pos != -1:
            return line[marker_pos + brace_offset :]
    raise ValueError("params not found in flickr profile page")


def flickr_usrename_to_nsid(username: str) -> str:
    """Get NSID from flickr username.

    Fetches the user's profile page and reads the "nsid" key of the params
    object embedded in it.

    Raises:
        requests.HTTPError: on a non-success response from Flickr.
        ValueError: if the params object is not found in the page.
    """
    # Percent-encode the user-supplied name so it stays a single path
    # segment; "@" is left as-is because NSIDs contain it.
    url = f"https://www.flickr.com/people/{quote(username, safe='@')}/"
    r = requests.get(url, headers=BROWSER_HEADERS, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()
    params_str = get_params(r.text.splitlines())
    # raw_decode ignores whatever follows the JSON object on the line.
    params, _ = json.JSONDecoder().raw_decode(params_str)
    return typing.cast(str, params["nsid"])


if __name__ == "__main__":
    app.run(host="0.0.0.0")