Add category search, license handling, and message page improvements
Add /category route to find Wikipedia articles without images in a category using the MediaWiki API. Filter out non-content images (UI icons, logos) when checking articles. Show image license on message page with alternate message for non-free CC licenses (NC/ND) explaining Wikipedia's restrictions. For photos with free licenses, show upload options linking to UploadWizard instead of a message form. Add Flickr CC 4.0 license codes, user profile links, previous message detection from sent mail index, and back-navigation between category, search results, and message pages. Closes #3 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
d59e67b55d
commit
c5efd429ce
4 changed files with 403 additions and 19 deletions
270
main.py
270
main.py
|
|
@ -34,17 +34,20 @@ COMMONS_CACHE_FILE = (
|
|||
Path(__file__).parent / "commons_contributions" / "thumbnail_cache.json"
|
||||
)
|
||||
SENT_MAIL_DIR = Path(__file__).parent / "sent_mail" / "messages"
|
||||
SENT_MAIL_INDEX_FILE = Path(__file__).parent / "sent_mail" / "messages_index.json"
|
||||
SENT_MAIL_INDEX_CACHE = (
|
||||
Path(__file__).parent / "commons_contributions" / "sent_mail_index.json"
|
||||
)
|
||||
COMMONS_CACHE_MAX_AGE = 86400 * 7 # Cache for 7 days
|
||||
RECENT_UPLOADS_COUNT = 24
|
||||
|
||||
# User agent for Commons API requests
|
||||
COMMONS_USER_AGENT = (
|
||||
# User agent for Wikimedia API requests
|
||||
WIKIMEDIA_USER_AGENT = (
|
||||
"FlickrMail/1.0 (https://edwardbetts.com/flickr_mail/; edward@4angle.com)"
|
||||
)
|
||||
|
||||
WIKIPEDIA_API = "https://en.wikipedia.org/w/api.php"
|
||||
|
||||
# Browser-like headers for Flickr requests
|
||||
BROWSER_HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
|
|
@ -64,18 +67,26 @@ BROWSER_HEADERS = {
|
|||
# Flickr license codes to human-readable names
|
||||
FLICKR_LICENSES = {
|
||||
0: "All Rights Reserved",
|
||||
1: "CC BY-NC-SA",
|
||||
2: "CC BY-NC",
|
||||
3: "CC BY-NC-ND",
|
||||
4: "CC BY",
|
||||
5: "CC BY-SA",
|
||||
6: "CC BY-ND",
|
||||
1: "CC BY-NC-SA 2.0",
|
||||
2: "CC BY-NC 2.0",
|
||||
3: "CC BY-NC-ND 2.0",
|
||||
4: "CC BY 2.0",
|
||||
5: "CC BY-SA 2.0",
|
||||
6: "CC BY-ND 2.0",
|
||||
7: "No known copyright",
|
||||
8: "US Government",
|
||||
9: "CC0",
|
||||
10: "Public Domain",
|
||||
# CC 4.0 licenses (codes confirmed via Flickr)
|
||||
16: "CC BY-NC-ND 4.0",
|
||||
}
|
||||
|
||||
# Non-free CC licenses (NC or ND restrictions)
|
||||
NONFREE_CC_LICENSES = {1, 2, 3, 6, 11, 12, 13, 16}
|
||||
|
||||
# Wikipedia-compatible free licenses
|
||||
FREE_LICENSES = {4, 5, 7, 8, 9, 10, 14, 15}
|
||||
|
||||
|
||||
PHOTOS_PER_PAGE = 25
|
||||
|
||||
|
|
@ -296,7 +307,7 @@ def fetch_commons_thumbnails(titles: list[str]) -> dict[str, str]:
|
|||
"format": "json",
|
||||
}
|
||||
|
||||
headers = {"User-Agent": COMMONS_USER_AGENT}
|
||||
headers = {"User-Agent": WIKIMEDIA_USER_AGENT}
|
||||
|
||||
try:
|
||||
response = requests.get(
|
||||
|
|
@ -404,6 +415,165 @@ def get_recent_commons_uploads() -> tuple[list[CommonsUpload], int]:
|
|||
return result, total_matched
|
||||
|
||||
|
||||
def get_previous_messages(flickr_user: str, flickr_username: str) -> list[dict]:
    """Get previous messages sent to a Flickr user.

    Checks both the display name (flickr_user) and username (flickr_username)
    against the recipient field in the messages index. The comparison is
    case-insensitive.

    Returns the matching index entries in file order. Returns an empty list
    when neither name is given, or when the index file is missing, unreadable,
    not valid JSON, or does not contain a JSON list.
    """
    # Empty names must never match a recipient, so drop them up front.
    wanted = {
        name.casefold() for name in (flickr_user, flickr_username) if name
    }
    if not wanted:
        return []

    # Open directly instead of checking exists() first: a missing file is
    # reported as OSError, and this avoids a check-then-open race.
    try:
        with open(SENT_MAIL_INDEX_FILE, encoding="utf-8") as f:
            messages = json.load(f)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return []

    if not isinstance(messages, list):
        return []

    matches = []
    for msg in messages:
        # Tolerate malformed entries (non-dicts, null/missing recipient)
        # rather than failing the whole page.
        if not isinstance(msg, dict):
            continue
        recipient = msg.get("recipient")
        if isinstance(recipient, str) and recipient.casefold() in wanted:
            matches.append(msg)

    return matches
|
||||
|
||||
|
||||
def parse_category_input(category_input: str) -> str | None:
|
||||
"""Parse category title from URL or direct input.
|
||||
|
||||
Returns the category title with 'Category:' prefix, or None if invalid.
|
||||
"""
|
||||
category_input = category_input.strip()
|
||||
|
||||
# Handle URL format: https://en.wikipedia.org/wiki/Category:Example
|
||||
if "wikipedia.org" in category_input:
|
||||
match = re.search(r"/wiki/(Category:[^#?]+)", category_input)
|
||||
if match:
|
||||
return unquote(match.group(1)).replace("_", " ")
|
||||
return None
|
||||
|
||||
# Handle direct input - add Category: prefix if missing
|
||||
if category_input.startswith("Category:"):
|
||||
return category_input.replace("_", " ")
|
||||
|
||||
# Assume it's just the category name
|
||||
return f"Category:{category_input.replace('_', ' ')}"
|
||||
|
||||
|
||||
@dataclasses.dataclass
class ArticleWithoutImage:
    """Represents a Wikipedia article that needs an image."""

    # Article title as returned by the API (spaces, not underscores)
    title: str
    # Page ID of the article
    pageid: int

    @property
    def wikipedia_url(self) -> str:
        """URL to the Wikipedia article."""
        # Percent-encode the title so characters such as "?", "#" and "%"
        # are not interpreted as query/fragment delimiters. The safe set
        # keeps common title punctuation readable in the path.
        slug = quote(self.title.replace(" ", "_"), safe=";@$!*(),/~:")
        return f"https://en.wikipedia.org/wiki/{slug}"

    @property
    def search_url(self) -> str:
        """URL to search for this article in Flickr Mail."""
        return f"/?enwp={quote(self.title)}"
|
||||
|
||||
|
||||
# Title fragments of images that are page furniture (UI icons, logos,
# maintenance-template artwork) rather than article illustrations. An image
# whose title contains any of these fragments is ignored when deciding
# whether an article already has an image.
NON_CONTENT_IMAGE_PATTERNS = [
    "OOjs UI icon",
    "Commons-logo",
    "Symbol ",
    "Edit-ltr",
    "Ambox ",
    "Question book",
    "Wiki letter",
    "Text document",
    "Folder ",
    "Crystal ",
    "Nuvola ",
    "Gnome-",
    "Disambig ",
    "DAB ",
]


def has_content_image(images: list[dict]) -> bool:
    """Return True if any image is a real content image.

    Each entry in *images* is a dict whose "title" value is compared
    (case-sensitive substring match) against NON_CONTENT_IMAGE_PATTERNS.
    An entry that matches none of the patterns counts as content; an entry
    without a "title" key is treated as having an empty title.
    """

    def looks_like_content(entry: dict) -> bool:
        name = entry.get("title", "")
        for fragment in NON_CONTENT_IMAGE_PATTERNS:
            if fragment in name:
                return False
        return True

    return any(looks_like_content(entry) for entry in images)
|
||||
|
||||
|
||||
def get_articles_without_images(
    category: str, limit: int = 100, continue_token: str | None = None
) -> tuple[list[ArticleWithoutImage], str | None]:
    """Get articles in a category that don't have images.

    Uses generator=categorymembers with prop=images to efficiently check
    multiple articles per API request. The image list for one batch of
    pages can be split across several responses (signalled by "imcontinue"
    in the response's "continue" object), so those follow-up requests are
    made and the per-page image lists merged before deciding which articles
    lack a content image.

    Args:
        category: Category title including the "Category:" prefix.
        limit: Maximum number of category members to examine in this batch.
        continue_token: "gcmcontinue" value returned by a previous call, to
            fetch the next batch of category members. None starts from the
            beginning.

    Returns a tuple of (articles_list, continue_token). The list is sorted
    by title. continue_token is the "gcmcontinue" value for the next batch,
    or None when there are no more members. On any request or decoding
    error the result is ([], None).
    """
    params = {
        "action": "query",
        "generator": "categorymembers",
        "gcmtitle": category,
        "gcmtype": "page",  # Only articles, not subcategories or files
        "gcmnamespace": "0",  # Main namespace only
        "gcmlimit": str(limit),
        "prop": "images",
        "imlimit": "max",  # Need enough to check all pages in batch
        "format": "json",
    }
    if continue_token:
        params["gcmcontinue"] = continue_token

    headers = {"User-Agent": WIKIMEDIA_USER_AGENT}

    # Page key -> page info with the image lists of all responses merged.
    merged_pages: dict[str, dict] = {}
    continuation: dict = {}

    # Safety cap on follow-up requests for one batch. If it is ever reached
    # the image data may be incomplete and some articles may be listed even
    # though they have images.
    max_requests = 20

    for _ in range(max_requests):
        try:
            response = requests.get(
                WIKIPEDIA_API,
                # Values from the API's "continue" object must be sent back
                # unchanged alongside the original parameters.
                params={**params, **continuation},
                headers=headers,
                timeout=30,
            )
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, json.JSONDecodeError) as e:
            print(f"Wikipedia API error: {e}")
            return [], None

        pages = data.get("query", {}).get("pages", {})
        for key, page in pages.items():
            entry = merged_pages.setdefault(
                key, {"title": "", "pageid": 0, "images": []}
            )
            entry["title"] = page.get("title", "") or entry["title"]
            entry["pageid"] = page.get("pageid", 0) or entry["pageid"]
            entry["images"].extend(page.get("images", []))

        continuation = data.get("continue", {})
        if "imcontinue" not in continuation:
            # Image data for this batch of pages is complete.
            break

    articles_without_images: list[ArticleWithoutImage] = []

    for page in merged_pages.values():
        # Skip if page has content images (not just UI icons)
        if has_content_image(page["images"]):
            continue

        title = page["title"]
        pageid = page["pageid"]

        if title and pageid:
            articles_without_images.append(
                ArticleWithoutImage(title=title, pageid=pageid)
            )

    # Sort by title for consistent display
    articles_without_images.sort(key=lambda a: a.title)

    # Get continue token if there are more results
    next_token = continuation.get("gcmcontinue")

    return articles_without_images, next_token
|
||||
|
||||
|
||||
def is_valid_flickr_image_url(url: str) -> bool:
|
||||
"""Check if URL is a valid Flickr static image URL."""
|
||||
valid_prefixes = (
|
||||
|
|
@ -608,9 +778,13 @@ def start() -> str:
|
|||
wiki_part2 = name.replace(" ", "_")
|
||||
wikipedia_url = wiki_part1 + wiki_part2
|
||||
|
||||
if "_(" in name:
|
||||
name = name[: name.find("_(")]
|
||||
# Remove disambiguation suffix like "(academic)" for Flickr search
|
||||
name = name.replace("_", " ")
|
||||
if " (" in name:
|
||||
name = name[: name.find(" (")]
|
||||
|
||||
# Get category param if coming from category search
|
||||
cat = flask.request.args.get("cat")
|
||||
|
||||
flickr_url = flask.request.args.get("flickr")
|
||||
if not flickr_url:
|
||||
|
|
@ -623,6 +797,7 @@ def start() -> str:
|
|||
name=name,
|
||||
enwp=enwp,
|
||||
search_result=search_result,
|
||||
cat=cat,
|
||||
)
|
||||
|
||||
if "/in/" in flickr_url:
|
||||
|
|
@ -644,6 +819,40 @@ def start() -> str:
|
|||
if img_url and not is_valid_flickr_image_url(img_url):
|
||||
img_url = None
|
||||
|
||||
# Get flickr_user name and build profile URL
|
||||
flickr_user = flask.request.args.get("flickr_user", "")
|
||||
flickr_user_url = f"https://www.flickr.com/photos/{flickr_username}/"
|
||||
|
||||
# Check for previous messages to this user
|
||||
previous_messages = get_previous_messages(flickr_user, flickr_username)
|
||||
|
||||
# Get license code if provided
|
||||
license_code = flask.request.args.get("license", type=int)
|
||||
license_name = (
|
||||
FLICKR_LICENSES.get(license_code, "") if license_code is not None else ""
|
||||
)
|
||||
|
||||
is_free_license = license_code in FREE_LICENSES
|
||||
is_nonfree_cc = license_code in NONFREE_CC_LICENSES
|
||||
|
||||
# For free licenses, show upload options instead of message
|
||||
if is_free_license:
|
||||
return flask.render_template(
|
||||
"combined.html",
|
||||
name=name,
|
||||
enwp=enwp,
|
||||
flickr_url=flickr_url,
|
||||
img_url=img_url,
|
||||
license_code=license_code,
|
||||
license_name=license_name,
|
||||
is_free_license=True,
|
||||
wikipedia_url=wikipedia_url,
|
||||
flickr_user=flickr_user,
|
||||
flickr_user_url=flickr_user_url,
|
||||
cat=cat,
|
||||
previous_messages=previous_messages,
|
||||
)
|
||||
|
||||
msg = flask.render_template(
|
||||
"message.jinja",
|
||||
flickr_url=flickr_url,
|
||||
|
|
@ -652,6 +861,8 @@ def start() -> str:
|
|||
name=name,
|
||||
wiki_part1=wiki_part1,
|
||||
wiki_part2=wiki_part2,
|
||||
is_nonfree_cc=is_nonfree_cc,
|
||||
license_name=license_name,
|
||||
)
|
||||
|
||||
subject = f"Request to use your photo of {name} on Wikipedia"
|
||||
|
|
@ -667,6 +878,43 @@ def start() -> str:
|
|||
lines=lines,
|
||||
nsid=nsid,
|
||||
img_url=img_url,
|
||||
license_code=license_code,
|
||||
license_name=license_name,
|
||||
flickr_user=flickr_user,
|
||||
flickr_user_url=flickr_user_url,
|
||||
cat=cat,
|
||||
previous_messages=previous_messages,
|
||||
)
|
||||
|
||||
|
||||
@app.route("/category")
def category_search() -> str:
    """Find articles in a Wikipedia category that need images.

    Reads the "cat" query parameter (a category name or Wikipedia category
    URL). Without it the empty search form is rendered; with an
    unparseable value the form is rendered again with an error message;
    otherwise the articles lacking images are looked up and rendered.
    """
    cat = flask.request.args.get("cat", "").strip()

    if not cat:
        return flask.render_template("category.html")

    category = parse_category_input(cat)
    if not category:
        return flask.render_template(
            "category.html",
            error="Invalid category format. Please enter a category name or URL.",
            cat=cat,
        )

    articles, continue_token = get_articles_without_images(category)

    # Get the display name (without Category: prefix). Only the leading
    # namespace prefix is removed; the rest of the title is left untouched
    # even if it happens to contain "Category:" again.
    category_name = category.removeprefix("Category:")

    return flask.render_template(
        "category.html",
        cat=cat,
        category=category,
        category_name=category_name,
        articles=articles,
        continue_token=continue_token,
    )
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue