From c5efd429ce8e27a7fc9eea95a9970b61d18c5bac Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sat, 7 Feb 2026 10:22:19 +0000 Subject: [PATCH] Add category search, license handling, and message page improvements Add /category route to find Wikipedia articles without images in a category using the MediaWiki API. Filter out non-content images (UI icons, logos) when checking articles. Show image license on message page with alternate message for non-free CC licenses (NC/ND) explaining Wikipedia's restrictions. For photos with free licenses, show upload options linking to UploadWizard instead of a message form. Add Flickr CC 4.0 license codes, user profile links, previous message detection from sent mail index, and back-navigation between category, search results, and message pages. Closes #3 Co-Authored-By: Claude Opus 4.6 --- main.py | 270 ++++++++++++++++++++++++++++++++++++++-- templates/category.html | 67 ++++++++++ templates/combined.html | 56 +++++++-- templates/message.jinja | 29 +++++ 4 files changed, 403 insertions(+), 19 deletions(-) create mode 100644 templates/category.html diff --git a/main.py b/main.py index d2cc090..5abe5f6 100755 --- a/main.py +++ b/main.py @@ -34,17 +34,20 @@ COMMONS_CACHE_FILE = ( Path(__file__).parent / "commons_contributions" / "thumbnail_cache.json" ) SENT_MAIL_DIR = Path(__file__).parent / "sent_mail" / "messages" +SENT_MAIL_INDEX_FILE = Path(__file__).parent / "sent_mail" / "messages_index.json" SENT_MAIL_INDEX_CACHE = ( Path(__file__).parent / "commons_contributions" / "sent_mail_index.json" ) COMMONS_CACHE_MAX_AGE = 86400 * 7 # Cache for 7 days RECENT_UPLOADS_COUNT = 24 -# User agent for Commons API requests -COMMONS_USER_AGENT = ( +# User agent for Wikimedia API requests +WIKIMEDIA_USER_AGENT = ( "FlickrMail/1.0 (https://edwardbetts.com/flickr_mail/; edward@4angle.com)" ) +WIKIPEDIA_API = "https://en.wikipedia.org/w/api.php" + # Browser-like headers for Flickr requests BROWSER_HEADERS = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", @@ -64,18 +67,26 @@ BROWSER_HEADERS = { # Flickr license codes to human-readable names FLICKR_LICENSES = { 0: "All Rights Reserved", - 1: "CC BY-NC-SA", - 2: "CC BY-NC", - 3: "CC BY-NC-ND", - 4: "CC BY", - 5: "CC BY-SA", - 6: "CC BY-ND", + 1: "CC BY-NC-SA 2.0", + 2: "CC BY-NC 2.0", + 3: "CC BY-NC-ND 2.0", + 4: "CC BY 2.0", + 5: "CC BY-SA 2.0", + 6: "CC BY-ND 2.0", 7: "No known copyright", 8: "US Government", 9: "CC0", 10: "Public Domain", + # CC 4.0 licenses (codes confirmed via Flickr) + 16: "CC BY-NC-ND 4.0", } +# Non-free CC licenses (NC or ND restrictions) +NONFREE_CC_LICENSES = {1, 2, 3, 6, 11, 12, 13, 16} + +# Wikipedia-compatible free licenses +FREE_LICENSES = {4, 5, 7, 8, 9, 10, 14, 15} + PHOTOS_PER_PAGE = 25 @@ -296,7 +307,7 @@ def fetch_commons_thumbnails(titles: list[str]) -> dict[str, str]: "format": "json", } - headers = {"User-Agent": COMMONS_USER_AGENT} + headers = {"User-Agent": WIKIMEDIA_USER_AGENT} try: response = requests.get( @@ -404,6 +415,165 @@ def get_recent_commons_uploads() -> tuple[list[CommonsUpload], int]: return result, total_matched +def get_previous_messages(flickr_user: str, flickr_username: str) -> list[dict]: + """Get previous messages sent to a Flickr user. + + Checks both the display name (flickr_user) and username (flickr_username) + against the recipient field in the messages index. + """ + if not SENT_MAIL_INDEX_FILE.exists(): + return [] + + try: + with open(SENT_MAIL_INDEX_FILE) as f: + messages = json.load(f) + except (json.JSONDecodeError, OSError): + return [] + + # Normalize for case-insensitive comparison + flickr_user_lower = flickr_user.lower() if flickr_user else "" + flickr_username_lower = flickr_username.lower() if flickr_username else "" + + matches = [] + for msg in messages: + recipient = msg.get("recipient", "").lower() + if recipient and (recipient == flickr_user_lower or recipient == flickr_username_lower): + matches.append(msg) + + return matches + + +def parse_category_input(category_input: str) -> str | None: + """Parse category title from URL or direct input. + + Returns the category title with 'Category:' prefix, or None if invalid. + """ + category_input = category_input.strip() + + # Handle URL format: https://en.wikipedia.org/wiki/Category:Example + if "wikipedia.org" in category_input: + match = re.search(r"/wiki/(Category:[^#?]+)", category_input) + if match: + return unquote(match.group(1)).replace("_", " ") + return None + + # Handle direct input - add Category: prefix if missing + if category_input.startswith("Category:"): + return category_input.replace("_", " ") + + # Assume it's just the category name + return f"Category:{category_input.replace('_', ' ')}" + + +@dataclasses.dataclass +class ArticleWithoutImage: + """Represents a Wikipedia article that needs an image.""" + + title: str + pageid: int + + @property + def wikipedia_url(self) -> str: + """URL to the Wikipedia article.""" + return f"https://en.wikipedia.org/wiki/{self.title.replace(' ', '_')}" + + @property + def search_url(self) -> str: + """URL to search for this article in Flickr Mail.""" + return f"/?enwp={quote(self.title)}" + + +# Common non-content images to ignore when checking if an article has images +NON_CONTENT_IMAGE_PATTERNS = [ + "OOjs UI icon", + "Commons-logo", + "Symbol ", + "Edit-ltr", + "Ambox ", + "Question book", + "Wiki letter", + "Text document", + "Folder ", + "Crystal ", + "Nuvola ", + "Gnome-", + "Disambig ", + "DAB ", +] + + +def has_content_image(images: list[dict]) -> bool: + """Check if an article has a content image (not just UI icons/logos).""" + for img in images: + title = img.get("title", "") + # Skip if it matches any non-content pattern + is_non_content = any(pattern in title for pattern in NON_CONTENT_IMAGE_PATTERNS) + if not is_non_content: + return True + return False + + +def get_articles_without_images( + category: str, limit: int = 100 +) -> tuple[list[ArticleWithoutImage], str | None]: + """Get articles in a category that don't have images. + + Uses generator=categorymembers with prop=images to efficiently check + multiple articles in a single API request. + + Returns a tuple of (articles_list, continue_token). + """ + params = { + "action": "query", + "generator": "categorymembers", + "gcmtitle": category, + "gcmtype": "page", # Only articles, not subcategories or files + "gcmnamespace": "0", # Main namespace only + "gcmlimit": str(limit), + "prop": "images", + "imlimit": "max", # Need enough to check all pages in batch + "format": "json", + } + + headers = {"User-Agent": WIKIMEDIA_USER_AGENT} + + try: + response = requests.get( + WIKIPEDIA_API, params=params, headers=headers, timeout=30 + ) + response.raise_for_status() + data = response.json() + except (requests.RequestException, json.JSONDecodeError) as e: + print(f"Wikipedia API error: {e}") + return [], None + + articles_without_images: list[ArticleWithoutImage] = [] + + pages = data.get("query", {}).get("pages", {}) + for page in pages.values(): + images = page.get("images", []) + + # Skip if page has content images (not just UI icons) + if has_content_image(images): + continue + + title = page.get("title", "") + pageid = page.get("pageid", 0) + + if title and pageid: + articles_without_images.append( + ArticleWithoutImage(title=title, pageid=pageid) + ) + + # Sort by title for consistent display + articles_without_images.sort(key=lambda a: a.title) + + # Get continue token if there are more results + continue_token = data.get("continue", {}).get("gcmcontinue") + + return articles_without_images, continue_token + + def is_valid_flickr_image_url(url: str) -> bool: """Check if URL is a valid Flickr static image URL.""" valid_prefixes = ( @@ -608,9 +778,13 @@ def start() -> str: wiki_part2 = name.replace(" ", "_") wikipedia_url = wiki_part1 + wiki_part2 - if "_(" in name: - name = name[: name.find("_(")] + # Remove disambiguation suffix like "(academic)" for Flickr search name = name.replace("_", " ") + if " (" in name: + name = name[: name.find(" (")] + + # Get category param if coming from category search + cat = flask.request.args.get("cat") flickr_url = flask.request.args.get("flickr") if not flickr_url: @@ -623,6 +797,7 @@ def start() -> str: name=name, enwp=enwp, search_result=search_result, + cat=cat, ) if "/in/" in flickr_url: @@ -644,6 +819,40 @@ def start() -> str: if img_url and not is_valid_flickr_image_url(img_url): img_url = None + # Get flickr_user name and build profile URL + flickr_user = flask.request.args.get("flickr_user", "") + flickr_user_url = f"https://www.flickr.com/photos/{flickr_username}/" + + # Check for previous messages to this user + previous_messages = get_previous_messages(flickr_user, flickr_username) + + # Get license code if provided + license_code = flask.request.args.get("license", type=int) + license_name = ( + FLICKR_LICENSES.get(license_code, "") if license_code is not None else "" + ) + + is_free_license = license_code in FREE_LICENSES + is_nonfree_cc = license_code in NONFREE_CC_LICENSES + + # For free licenses, show upload options instead of message + if is_free_license: + return flask.render_template( + "combined.html", + name=name, + enwp=enwp, + flickr_url=flickr_url, + img_url=img_url, + license_code=license_code, + license_name=license_name, + is_free_license=True, + wikipedia_url=wikipedia_url, + flickr_user=flickr_user, + flickr_user_url=flickr_user_url, + cat=cat, + previous_messages=previous_messages, + ) + msg = flask.render_template( "message.jinja", flickr_url=flickr_url, @@ -652,6 +861,8 @@ def start() -> str: name=name, wiki_part1=wiki_part1, wiki_part2=wiki_part2, + is_nonfree_cc=is_nonfree_cc, + license_name=license_name, ) subject = f"Request to use your photo of {name} on Wikipedia" @@ -667,6 +878,43 @@ def start() -> str: lines=lines, nsid=nsid, img_url=img_url, + license_code=license_code, + license_name=license_name, + flickr_user=flickr_user, + flickr_user_url=flickr_user_url, + cat=cat, + previous_messages=previous_messages, + ) + + +@app.route("/category") +def category_search() -> str: + """Find articles in a Wikipedia category that need images.""" + cat = flask.request.args.get("cat", "").strip() + + if not cat: + return flask.render_template("category.html") + + category = parse_category_input(cat) + if not category: + return flask.render_template( + "category.html", + error="Invalid category format. Please enter a category name or URL.", + cat=cat, + ) + + articles, continue_token = get_articles_without_images(category) + + # Get the display name (without Category: prefix) + category_name = category.replace("Category:", "") + + return flask.render_template( + "category.html", + cat=cat, + category=category, + category_name=category_name, + articles=articles, + continue_token=continue_token, ) diff --git a/templates/category.html b/templates/category.html new file mode 100644 index 0000000..da52980 --- /dev/null +++ b/templates/category.html @@ -0,0 +1,67 @@ +{% extends "base.html" %} + +{% block title %}Category Search - Flickr mail{% endblock %} + +{% block style %} + +{% endblock %} + +{% block content %} +
+
+

Find articles needing images

+

Enter a Wikipedia category to find articles without images

+ +
+
+ + +
+ + Back to main +
+ + {% if error %} +
{{ error }}
+ {% endif %} + + {% if category and articles is defined %} +
+
Articles without images in {{ category_name }}
+ + {% if articles %} +

Found {{ articles | length }} article(s) without images{% if continue_token %} (more available){% endif %}

+ +
+ {% for article in articles %} + + {% endfor %} +
+ + {% if continue_token %} +

Note: Only showing first batch of results. More articles may be available in this category.

+ {% endif %} + + {% else %} +
+ All articles in this category have images! +
+ {% endif %} +
+ {% endif %} + + + +
+
+{% endblock %} diff --git a/templates/combined.html b/templates/combined.html index 522dfaf..54158f0 100644 --- a/templates/combined.html +++ b/templates/combined.html @@ -13,6 +13,7 @@ + Find articles by category {% if recent_uploads is defined and recent_uploads and not name %} @@ -58,6 +59,9 @@ {% if name and search_result is defined and search_result.photos %} + {% if cat %} +

← Back to category

+ {% endif %}

Wikipedia article: {{ name }}

Select a photo to compose a message ({{ search_result.total_photos | default(0) }} results):

@@ -65,12 +69,12 @@ {% for photo in search_result.photos %}
- + {{ photo.title }}

{{ photo.realname or photo.username }}

- {{ photo.license_name }} + {{ photo.license_name }}
@@ -82,7 +86,7 @@