Add category search, license handling, and message page improvements

Add /category route to find Wikipedia articles without images in a
category using the MediaWiki API. Filter out non-content images (UI
icons, logos) when checking articles.

Show image license on message page with alternate message for non-free
CC licenses (NC/ND) explaining Wikipedia's restrictions. For photos
with free licenses, show upload options linking to UploadWizard instead
of a message form.

Add Flickr CC 4.0 license codes, user profile links, previous message
detection from sent mail index, and back-navigation between category,
search results, and message pages.

Closes #3

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Edward Betts 2026-02-07 10:22:19 +00:00
parent d59e67b55d
commit c5efd429ce
4 changed files with 403 additions and 19 deletions

270
main.py
View file

@ -34,17 +34,20 @@ COMMONS_CACHE_FILE = (
Path(__file__).parent / "commons_contributions" / "thumbnail_cache.json"
)
SENT_MAIL_DIR = Path(__file__).parent / "sent_mail" / "messages"
SENT_MAIL_INDEX_FILE = Path(__file__).parent / "sent_mail" / "messages_index.json"
SENT_MAIL_INDEX_CACHE = (
Path(__file__).parent / "commons_contributions" / "sent_mail_index.json"
)
COMMONS_CACHE_MAX_AGE = 86400 * 7 # Cache for 7 days
RECENT_UPLOADS_COUNT = 24
# User agent for Commons API requests
COMMONS_USER_AGENT = (
# User agent for Wikimedia API requests
WIKIMEDIA_USER_AGENT = (
"FlickrMail/1.0 (https://edwardbetts.com/flickr_mail/; edward@4angle.com)"
)
WIKIPEDIA_API = "https://en.wikipedia.org/w/api.php"
# Browser-like headers for Flickr requests
BROWSER_HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
@ -64,18 +67,26 @@ BROWSER_HEADERS = {
# Flickr license codes to human-readable names
FLICKR_LICENSES = {
0: "All Rights Reserved",
1: "CC BY-NC-SA",
2: "CC BY-NC",
3: "CC BY-NC-ND",
4: "CC BY",
5: "CC BY-SA",
6: "CC BY-ND",
1: "CC BY-NC-SA 2.0",
2: "CC BY-NC 2.0",
3: "CC BY-NC-ND 2.0",
4: "CC BY 2.0",
5: "CC BY-SA 2.0",
6: "CC BY-ND 2.0",
7: "No known copyright",
8: "US Government",
9: "CC0",
10: "Public Domain",
# CC 4.0 licenses (codes confirmed via Flickr)
16: "CC BY-NC-ND 4.0",
}
# Non-free CC licenses (NC or ND restrictions)
NONFREE_CC_LICENSES = {1, 2, 3, 6, 11, 12, 13, 16}
# Wikipedia-compatible free licenses
FREE_LICENSES = {4, 5, 7, 8, 9, 10, 14, 15}
PHOTOS_PER_PAGE = 25
@ -296,7 +307,7 @@ def fetch_commons_thumbnails(titles: list[str]) -> dict[str, str]:
"format": "json",
}
headers = {"User-Agent": COMMONS_USER_AGENT}
headers = {"User-Agent": WIKIMEDIA_USER_AGENT}
try:
response = requests.get(
@ -404,6 +415,165 @@ def get_recent_commons_uploads() -> tuple[list[CommonsUpload], int]:
return result, total_matched
def get_previous_messages(flickr_user: str, flickr_username: str) -> list[dict]:
    """Return previously sent messages addressed to a Flickr user.

    Each entry of the sent-mail index is matched, case-insensitively, on its
    "recipient" field against both the display name (flickr_user) and the
    username (flickr_username).

    Returns an empty list when the index file is missing, unreadable, not
    valid JSON, or does not contain a JSON list. Entries that are not
    objects, or whose recipient is missing or not a string, are ignored.
    """
    # Opening the file directly covers the "file does not exist" case too
    # (FileNotFoundError is an OSError). ValueError covers both
    # json.JSONDecodeError and UnicodeDecodeError.
    try:
        with open(SENT_MAIL_INDEX_FILE, encoding="utf-8") as f:
            messages = json.load(f)
    except (OSError, ValueError):
        return []

    if not isinstance(messages, list):
        return []

    # Normalize for case-insensitive comparison; empty names never match
    wanted = {name.lower() for name in (flickr_user, flickr_username) if name}
    if not wanted:
        return []

    matches = []
    for msg in messages:
        if not isinstance(msg, dict):
            continue
        recipient = msg.get("recipient")
        # A null or non-string recipient must not crash the lookup
        if isinstance(recipient, str) and recipient.lower() in wanted:
            matches.append(msg)
    return matches
def parse_category_input(category_input: str) -> str | None:
"""Parse category title from URL or direct input.
Returns the category title with 'Category:' prefix, or None if invalid.
"""
category_input = category_input.strip()
# Handle URL format: https://en.wikipedia.org/wiki/Category:Example
if "wikipedia.org" in category_input:
match = re.search(r"/wiki/(Category:[^#?]+)", category_input)
if match:
return unquote(match.group(1)).replace("_", " ")
return None
# Handle direct input - add Category: prefix if missing
if category_input.startswith("Category:"):
return category_input.replace("_", " ")
# Assume it's just the category name
return f"Category:{category_input.replace('_', ' ')}"
@dataclasses.dataclass
class ArticleWithoutImage:
    """Represents a Wikipedia article that needs an image."""

    # Article title in display form (spaces rather than underscores)
    title: str
    # MediaWiki page ID of the article
    pageid: int

    @property
    def wikipedia_url(self) -> str:
        """URL to the Wikipedia article.

        The title is percent-encoded so that characters with a special
        meaning in URLs (such as "?" or "%") stay part of the page name
        instead of being read as a query string or an escape sequence.
        """
        # Keep the punctuation that is legal in a URL path readable; encode
        # everything else, including "?", "%", "#" and non-ASCII characters.
        path = quote(self.title.replace(" ", "_"), safe="/:@!$*(),;")
        return f"https://en.wikipedia.org/wiki/{path}"

    @property
    def search_url(self) -> str:
        """URL to search for this article in Flickr Mail."""
        return f"/?enwp={quote(self.title)}"
# Substrings that mark an image title as page furniture (UI icons, project
# logos, maintenance-template artwork) rather than an illustration of the
# article's subject.
# NOTE(review): matching is by substring anywhere in the title, so a real
# photo whose file name happens to contain one of these (e.g. "Crystal ")
# is also treated as non-content - confirm that is acceptable.
NON_CONTENT_IMAGE_PATTERNS = [
    "OOjs UI icon",
    "Commons-logo",
    "Symbol ",
    "Edit-ltr",
    "Ambox ",
    "Question book",
    "Wiki letter",
    "Text document",
    "Folder ",
    "Crystal ",
    "Nuvola ",
    "Gnome-",
    "Disambig ",
    "DAB ",
]


def has_content_image(images: list[dict]) -> bool:
    """Report whether any image in the list is a real content image.

    An image counts as content when its "title" contains none of the
    NON_CONTENT_IMAGE_PATTERNS substrings. An empty list yields False.
    """

    def looks_like_icon(image: dict) -> bool:
        # True when the image title contains any known icon/logo marker
        name = image.get("title", "")
        for marker in NON_CONTENT_IMAGE_PATTERNS:
            if marker in name:
                return True
        return False

    # The article has a content image unless every image is an icon/logo
    return not all(looks_like_icon(entry) for entry in images)
def get_articles_without_images(
    category: str, limit: int = 100
) -> tuple[list[ArticleWithoutImage], str | None]:
    """Get articles in a category that don't have images.

    Uses generator=categorymembers with prop=images to check a whole batch
    of articles at once. The image lists of a batch can be split over
    several API responses (signalled by "imcontinue" in the "continue"
    object), so the query is repeated with the returned continuation values
    until the image data for the batch is complete. Without this, pages
    whose images had not been delivered yet would be reported as having no
    images.

    Args:
        category: Category title including the "Category:" prefix.
        limit: Maximum number of category members to examine.

    Returns:
        A tuple of (articles_list, continue_token). articles_list is sorted
        by title; continue_token is the "gcmcontinue" value when the category
        has more members, otherwise None. On any API error the result is
        ([], None).
    """
    base_params = {
        "action": "query",
        "generator": "categorymembers",
        "gcmtitle": category,
        "gcmtype": "page",  # Only articles, not subcategories or files
        "gcmnamespace": "0",  # Main namespace only
        "gcmlimit": str(limit),
        "prop": "images",
        "imlimit": "max",
        "format": "json",
    }
    headers = {"User-Agent": WIKIMEDIA_USER_AGENT}

    # Safety cap so an unexpected API response can never loop forever
    max_requests = 20

    titles: dict[int, str] = {}  # pageid -> title
    has_image: dict[int, bool] = {}  # pageid -> content image seen so far
    continuation: dict = {}
    continue_token = None

    for _ in range(max_requests):
        try:
            response = requests.get(
                WIKIPEDIA_API,
                params={**base_params, **continuation},
                headers=headers,
                timeout=30,
            )
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, json.JSONDecodeError) as e:
            print(f"Wikipedia API error: {e}")
            # Partial image data would produce false positives, so give up
            return [], None

        pages = data.get("query", {}).get("pages", {})
        for page in pages.values():
            title = page.get("title", "")
            pageid = page.get("pageid", 0)
            if not (title and pageid):
                continue
            titles[pageid] = title
            # Merge across responses: one content image in any chunk is enough
            has_image[pageid] = has_image.get(pageid, False) or has_content_image(
                page.get("images", [])
            )

        continuation = data.get("continue", {})
        continue_token = continuation.get("gcmcontinue")
        if "imcontinue" not in continuation:
            # Image lists for this batch are complete
            break
    else:
        print(
            "Wikipedia API warning: image data still incomplete after "
            f"{max_requests} requests; results may include articles with images"
        )

    articles_without_images = [
        ArticleWithoutImage(title=title, pageid=pageid)
        for pageid, title in titles.items()
        if not has_image[pageid]
    ]

    # Sort by title for consistent display
    articles_without_images.sort(key=lambda a: a.title)

    return articles_without_images, continue_token
def is_valid_flickr_image_url(url: str) -> bool:
"""Check if URL is a valid Flickr static image URL."""
valid_prefixes = (
@ -608,9 +778,13 @@ def start() -> str:
wiki_part2 = name.replace(" ", "_")
wikipedia_url = wiki_part1 + wiki_part2
if "_(" in name:
name = name[: name.find("_(")]
# Remove disambiguation suffix like "(academic)" for Flickr search
name = name.replace("_", " ")
if " (" in name:
name = name[: name.find(" (")]
# Get category param if coming from category search
cat = flask.request.args.get("cat")
flickr_url = flask.request.args.get("flickr")
if not flickr_url:
@ -623,6 +797,7 @@ def start() -> str:
name=name,
enwp=enwp,
search_result=search_result,
cat=cat,
)
if "/in/" in flickr_url:
@ -644,6 +819,40 @@ def start() -> str:
if img_url and not is_valid_flickr_image_url(img_url):
img_url = None
# Get flickr_user name and build profile URL
flickr_user = flask.request.args.get("flickr_user", "")
flickr_user_url = f"https://www.flickr.com/photos/{flickr_username}/"
# Check for previous messages to this user
previous_messages = get_previous_messages(flickr_user, flickr_username)
# Get license code if provided
license_code = flask.request.args.get("license", type=int)
license_name = (
FLICKR_LICENSES.get(license_code, "") if license_code is not None else ""
)
is_free_license = license_code in FREE_LICENSES
is_nonfree_cc = license_code in NONFREE_CC_LICENSES
# For free licenses, show upload options instead of message
if is_free_license:
return flask.render_template(
"combined.html",
name=name,
enwp=enwp,
flickr_url=flickr_url,
img_url=img_url,
license_code=license_code,
license_name=license_name,
is_free_license=True,
wikipedia_url=wikipedia_url,
flickr_user=flickr_user,
flickr_user_url=flickr_user_url,
cat=cat,
previous_messages=previous_messages,
)
msg = flask.render_template(
"message.jinja",
flickr_url=flickr_url,
@ -652,6 +861,8 @@ def start() -> str:
name=name,
wiki_part1=wiki_part1,
wiki_part2=wiki_part2,
is_nonfree_cc=is_nonfree_cc,
license_name=license_name,
)
subject = f"Request to use your photo of {name} on Wikipedia"
@ -667,6 +878,43 @@ def start() -> str:
lines=lines,
nsid=nsid,
img_url=img_url,
license_code=license_code,
license_name=license_name,
flickr_user=flickr_user,
flickr_user_url=flickr_user_url,
cat=cat,
previous_messages=previous_messages,
)
@app.route("/category")
def category_search() -> str:
    """Find articles in a Wikipedia category that need images.

    Reads the "cat" query parameter (a category name or Wikipedia category
    URL). Without it the empty search form is rendered; with an unparsable
    value the form is rendered again with an error message; otherwise the
    articles lacking images are listed.
    """
    cat = flask.request.args.get("cat", "").strip()
    if not cat:
        return flask.render_template("category.html")

    category = parse_category_input(cat)
    if not category:
        return flask.render_template(
            "category.html",
            error="Invalid category format. Please enter a category name or URL.",
            cat=cat,
        )

    articles, continue_token = get_articles_without_images(category)

    # Display name: drop only the leading namespace prefix, so a category
    # name that itself contains "Category:" is left intact
    category_name = category.removeprefix("Category:")

    return flask.render_template(
        "category.html",
        cat=cat,
        category=category,
        category_name=category_name,
        articles=articles,
        continue_token=continue_token,
    )