- Add SearchResult dataclass with pagination metadata - Update search_flickr() to accept page parameter - Parse total results count from Flickr response - Add Bootstrap pagination controls to template - Display total result count in UI - Update documentation Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
368 lines
10 KiB
Python
Executable file
368 lines
10 KiB
Python
Executable file
#!/usr/bin/python3
|
|
"""Find photos on flickr for Wikipedia articles and contact the photographer."""
|
|
|
|
import collections
|
|
import dataclasses
|
|
import inspect
|
|
import json
|
|
import sys
|
|
import traceback
|
|
import typing
|
|
from urllib.parse import quote, unquote
|
|
|
|
import flask
|
|
import requests
|
|
import werkzeug
|
|
from werkzeug.debug.tbtools import DebugTraceback
|
|
|
|
|
|
app = flask.Flask(__name__)
# Debug mode off: errors are rendered by the custom InternalServerError
# handler below rather than werkzeug's interactive debugger.
app.debug = False

# Host + path prefix used to recognise and build English Wikipedia URLs.
enwiki = "en.wikipedia.org/wiki/"
|
|
|
|
# Browser-like headers for Flickr requests.
# Sent with every outgoing Flickr request so responses match what a desktop
# Chrome browser would receive. NOTE(review): presumably required because
# Flickr alters or blocks responses for non-browser user agents — confirm
# before removing any of these.
BROWSER_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Cache-Control": "max-age=0",
}
|
|
|
|
|
|
# Flickr license codes to human-readable names.
# Keys are the numeric license ids found in Flickr's embedded page JSON;
# unknown codes fall back to "License <n>" via FlickrPhoto.license_name.
FLICKR_LICENSES = {
    0: "All Rights Reserved",
    1: "CC BY-NC-SA",
    2: "CC BY-NC",
    3: "CC BY-NC-ND",
    4: "CC BY",
    5: "CC BY-SA",
    6: "CC BY-ND",
    7: "No known copyright",
    8: "US Government",
    9: "CC0",
    10: "Public Domain",
}


# Number of results Flickr shows per search page; used to derive total_pages.
PHOTOS_PER_PAGE = 25
|
|
|
|
|
|
@dataclasses.dataclass
class FlickrPhoto:
    """A single photo extracted from Flickr search results."""

    id: str
    title: str
    path_alias: str
    owner_nsid: str
    username: str
    realname: str
    license: int
    thumb_url: str
    medium_url: str

    @property
    def flickr_url(self) -> str:
        """URL of this photo's page on Flickr."""
        return "https://flickr.com/photos/" + self.path_alias + "/" + self.id

    @property
    def license_name(self) -> str:
        """Human-readable license name, or "License <n>" for unknown codes."""
        try:
            return FLICKR_LICENSES[self.license]
        except KeyError:
            return f"License {self.license}"
|
|
|
|
|
@dataclasses.dataclass
class SearchResult:
    """Flickr search results with pagination metadata."""

    # Photos parsed from the current results page.
    photos: list[FlickrPhoto]
    # Total matches reported by Flickr (may exceed what is fetchable).
    total_photos: int
    # 1-based page number these photos belong to.
    current_page: int
    # Number of fetchable pages (Flickr caps results at 4000).
    total_pages: int
|
|
|
|
|
|
def is_valid_flickr_image_url(url: str) -> bool:
    """Check if URL is a valid Flickr static image URL.

    Accepts https URLs whose host is ``live.staticflickr.com``,
    ``c1``/``c2.staticflickr.com``, or ``farm<N>.staticflickr.com``.

    The host is isolated up to the first path slash before checking, so
    look-alike domains cannot pass by smuggling ``.staticflickr.com`` into
    the path or query string (e.g. ``https://farm.evil.com/x.staticflickr.com/``
    was previously accepted by the unanchored substring check).

    Args:
        url: Candidate image URL from untrusted query-string input.

    Returns:
        True if the URL points at a recognised Flickr static-image host.
    """
    scheme = "https://"
    if not url.startswith(scheme):
        return False

    # Isolate the host: everything between the scheme and the first "/".
    # A URL without any path cannot be a static image, so reject it.
    path_start = url.find("/", len(scheme))
    if path_start == -1:
        return False
    host = url[len(scheme) : path_start]

    if host in (
        "live.staticflickr.com",
        "c1.staticflickr.com",
        "c2.staticflickr.com",
    ):
        return True

    # farm1.staticflickr.com, farm2.staticflickr.com, etc.
    if host.startswith("farm") and host.endswith(".staticflickr.com"):
        farm_number = host[len("farm") : -len(".staticflickr.com")]
        return farm_number.isdigit()

    return False
|
|
|
|
|
|
def search_flickr(search_term: str, page: int = 1) -> SearchResult:
    """Search Flickr for photos matching the search term.

    Args:
        search_term: Term to search for; wrapped in double quotes so Flickr
            performs an exact-phrase match.
        page: 1-based results page to fetch.

    Returns:
        Parsed photos and pagination metadata for the requested page.

    Raises:
        requests.HTTPError: If Flickr responds with an error status.
        requests.Timeout: If Flickr does not respond within the timeout.
    """
    encoded_term = quote(f'"{search_term}"')
    url = f"https://flickr.com/search/?view_all=1&text={encoded_term}&page={page}"

    # An explicit timeout keeps a hung Flickr request from stalling the
    # worker indefinitely — requests has no default timeout.
    response = requests.get(url, headers=BROWSER_HEADERS, timeout=30)
    response.raise_for_status()

    return parse_flickr_search_results(response.text, page)
|
|
|
|
|
|
def _photo_from_entry(photo_entry: "dict | None") -> "FlickrPhoto | None":
    """Build a FlickrPhoto from one ``_data`` list entry, or None if unusable."""
    # Pagination placeholders appear as None entries in the photo list.
    if photo_entry is None:
        return None

    pd = photo_entry.get("data", {})
    if not pd:
        return None

    # Prefer the "q" thumb and "n" medium renditions, falling back to the
    # "sq"/"m" sizes when they are absent.
    sizes = pd.get("sizes", {}).get("data", {})
    thumb_url = sizes.get("q", sizes.get("sq", {})).get("data", {}).get("url", "")
    medium_url = sizes.get("n", sizes.get("m", {})).get("data", {}).get("url", "")

    # Flickr emits protocol-relative URLs ("//live..."); force https.
    if thumb_url.startswith("//"):
        thumb_url = "https:" + thumb_url
    if medium_url.startswith("//"):
        medium_url = "https:" + medium_url

    return FlickrPhoto(
        id=str(pd.get("id", "")),
        title=pd.get("title", ""),
        path_alias=pd.get("pathAlias", ""),
        owner_nsid=pd.get("ownerNsid", ""),
        username=pd.get("username", ""),
        realname=pd.get("realname", ""),
        license=pd.get("license", 0),
        thumb_url=thumb_url,
        medium_url=medium_url,
    )


def parse_flickr_search_results(html: str, page: int = 1) -> SearchResult:
    """Parse Flickr search results HTML and extract photo data.

    The search page embeds its model data as a ``modelExport: {...}`` JSON
    object inside an inline script. Instead of hand-counting braces to find
    the object's extent, decode it in place with
    ``json.JSONDecoder.raw_decode``, which parses exactly one JSON value
    starting at a given index and ignores trailing page text — the same
    technique flickr_usrename_to_nsid already uses.

    Args:
        html: Raw HTML of a Flickr search results page.
        page: 1-based page number this HTML corresponds to.

    Returns:
        A SearchResult with photos and pagination metadata; an empty
        SearchResult when the page cannot be parsed.
    """
    empty_result = SearchResult(photos=[], total_photos=0, current_page=page, total_pages=0)

    marker = "modelExport:"
    marker_pos = html.find(marker)
    if marker_pos == -1:
        return empty_result

    # Skip whitespace after the marker (raw_decode rejects leading spaces).
    # Bounds-checked so a page truncated right after the marker cannot
    # raise IndexError.
    json_start = marker_pos + len(marker)
    while json_start < len(html) and html[json_start].isspace():
        json_start += 1

    try:
        data, _ = json.JSONDecoder().raw_decode(html, json_start)
    except json.JSONDecodeError:
        return empty_result
    if not isinstance(data, dict):
        return empty_result

    photos_models = data.get("main", {}).get("search-photos-lite-models", [])
    if not photos_models:
        return empty_result

    model_data = photos_models[0].get("data", {})
    photos_container = model_data.get("photos", {}).get("data", {})
    photos_data = photos_container.get("_data", [])
    total_photos = photos_container.get("totalItems", 0)

    # Flickr only exposes the first 4000 results; ceiling-divide the capped
    # total into pages of PHOTOS_PER_PAGE.
    capped_total = min(total_photos, 4000)
    total_pages = -(-capped_total // PHOTOS_PER_PAGE)

    photos = [
        photo
        for entry in photos_data
        if (photo := _photo_from_entry(entry)) is not None
    ]

    return SearchResult(
        photos=photos,
        total_photos=total_photos,
        current_page=page,
        total_pages=total_pages,
    )
|
|
|
|
|
|
@app.errorhandler(werkzeug.exceptions.InternalServerError)
def exception_handler(e: werkzeug.exceptions.InternalServerError) -> tuple[str, int]:
    """Handle exception.

    Renders a custom error page (show_error.html) with a werkzeug-formatted
    traceback, the exception summary, and details of the innermost frame.
    Returns the rendered page with HTTP status 500.
    """
    # The handler runs while the original exception is still being handled,
    # so sys.exc_info() yields the real error, not the wrapping 500.
    exec_type, exc_value, current_traceback = sys.exc_info()
    assert exc_value
    tb = DebugTraceback(exc_value)

    # Werkzeug's debugger helper renders the HTML traceback summary.
    summary = tb.render_traceback_html(include_title=False)
    # NOTE(review): tb._te is a private werkzeug attribute (a
    # TracebackException); this may break across werkzeug upgrades.
    exc_lines = "".join(tb._te.format_exception_only())

    # Innermost frame where the exception was raised, plus the argument
    # names of the function it was raised in, for display in the template.
    last_frame = list(traceback.walk_tb(current_traceback))[-1][0]
    last_frame_args = inspect.getargs(last_frame.f_code)

    return (
        flask.render_template(
            "show_error.html",
            plaintext=tb.render_traceback_text(),
            exception=exc_lines,
            exception_type=tb._te.exc_type.__name__,
            summary=summary,
            last_frame=last_frame,
            last_frame_args=last_frame_args,
        ),
        500,
    )
|
|
|
|
|
|
@app.route("/")
def start() -> str:
    """Start form, Flickr search results, or photographer contact message.

    Behaviour depends on query parameters:
      * no ``enwp``           -> render the empty start form
      * ``enwp`` only         -> search Flickr for the article subject
                                 (paginated via ``page``)
      * ``enwp`` + ``flickr`` -> build the message to send to the
                                 photographer of the chosen photo

    Returns:
        Rendered ``combined.html`` for the relevant mode.

    Raises:
        werkzeug.exceptions.BadRequest: If the ``flickr`` parameter is not a
            flickr.com photo URL.
    """
    enwp = flask.request.args.get("enwp")
    if not enwp:
        return flask.render_template("combined.html")
    enwp = enwp.strip()
    if not enwp:
        return flask.render_template("combined.html")

    # Input may be a full article URL or a bare article title.
    input_is = "url" if enwiki in enwp else "title"

    wikipedia_url: str
    wiki_part1: str
    wiki_part2: str
    if input_is == "url":
        # (renamed from "start" to avoid shadowing this function's name)
        title_start = enwp.find(enwiki) + len(enwiki)
        wiki_part2 = unquote(enwp[title_start:])
        name = wiki_part2
        wiki_part1 = enwp[:title_start]
        wikipedia_url = enwp
    else:
        name = enwp
        wiki_part1 = "https://" + enwiki
        wiki_part2 = name.replace(" ", "_")
        wikipedia_url = wiki_part1 + wiki_part2

    # Drop any disambiguation suffix: "Foo_(musician)" -> "Foo".
    if "_(" in name:
        name = name[: name.find("_(")]
    name = name.replace("_", " ")

    flickr_url = flask.request.args.get("flickr")
    if not flickr_url:
        # No photo chosen yet: search Flickr for candidate photos.
        page = flask.request.args.get("page", 1, type=int)
        page = max(1, page)  # Ensure page is at least 1
        search_result = search_flickr(name, page)
        return flask.render_template(
            "combined.html",
            name=name,
            enwp=enwp,
            search_result=search_result,
        )

    # Accept www-prefixed photo URLs by normalising to the canonical host.
    www_prefix = "https://www.flickr.com/"
    if flickr_url.startswith(www_prefix):
        flickr_url = "https://flickr.com/" + flickr_url[len(www_prefix) :]

    # Strip the "/in/..." context suffix Flickr appends to gallery links.
    if "/in/" in flickr_url:
        flickr_url = flickr_url[: flickr_url.find("/in/")]

    flickr_start = "https://flickr.com/photos/"

    # Reject malformed URLs with a 400 instead of asserting: asserts are
    # stripped under "python -O" and would turn user error into a 500.
    if not flickr_url.startswith(flickr_start):
        flask.abort(400, "flickr parameter must be a flickr.com photo URL")
    flickr_username = flickr_url[
        len(flickr_start) : flickr_url.find("/", len(flickr_start))
    ]

    nsid = flickr_usrename_to_nsid(flickr_username)
    if not nsid:
        flask.abort(500, "could not resolve Flickr username to an NSID")

    # Get optional image URL for display, validate it's from Flickr.
    img_url = flask.request.args.get("img")
    if img_url and not is_valid_flickr_image_url(img_url):
        img_url = None

    msg = flask.render_template(
        "message.jinja",
        flickr_url=flickr_url,
        enwp=enwp,
        wikipedia_url=wikipedia_url,
        name=name,
        wiki_part1=wiki_part1,
        wiki_part2=wiki_part2,
    )

    subject = f"Request to use your photo of {name} on Wikipedia"

    # The message template separates paragraphs with blank lines.
    lines = msg.split("\n\n")

    return flask.render_template(
        "combined.html",
        name=name,
        enwp=enwp,
        flickr_url=flickr_url,
        subject=subject,
        lines=lines,
        nsid=nsid,
        img_url=img_url,
    )
|
|
|
|
|
|
def get_params(line_iter: collections.abc.Iterable[str]) -> str:
    """Find and return params from flickr profile page.

    Scans the given lines for the embedded ``params`` marker and returns
    the remainder of that line starting at its first "{".

    Raises:
        StopIteration: If no line contains the marker.
    """
    look_for = 'params: {"isEditingTestimonial":false,'
    for line in line_iter:
        if look_for in line:
            return line[line.find("{") :]
    raise StopIteration
|
|
|
|
|
|
def flickr_usrename_to_nsid(username: str) -> str:
    """Get NSID from flickr username.

    Fetches the user's public profile page and extracts the ``nsid`` field
    from the embedded ``params`` JSON object.

    NOTE(review): the function name contains a typo ("usrename") but is kept
    unchanged for compatibility with existing callers.

    Raises:
        requests.HTTPError: If the profile page request fails.
        requests.Timeout: If Flickr does not respond within the timeout.
        StopIteration: If the params marker is not found in the page.
        KeyError: If the params JSON has no "nsid" entry.
    """
    url = f"https://www.flickr.com/people/{username}/"
    # Explicit timeout: requests has no default, and a hung profile fetch
    # would otherwise stall the worker indefinitely.
    r = requests.get(url, headers=BROWSER_HEADERS, timeout=30)
    # Fail loudly on error pages instead of parsing their HTML, consistent
    # with search_flickr.
    r.raise_for_status()
    params_str = get_params(r.text.splitlines())
    # raw_decode tolerates trailing page content after the JSON object.
    params, _ = json.JSONDecoder().raw_decode(params_str)
    return typing.cast(str, params["nsid"])
|
|
|
|
|
|
if __name__ == "__main__":
    # Development entry point: bind to all interfaces so the dev server is
    # reachable from other hosts (e.g. inside a container). A production
    # deployment should use a proper WSGI server instead.
    app.run(host="0.0.0.0")
|