Add category search, license handling, and message page improvements

Add /category route to find Wikipedia articles without images in a category using the MediaWiki API. Filter out non-content images (UI icons, logos) when checking articles. Show image license on message page with alternate message for non-free CC licenses (NC/ND) explaining Wikipedia's restrictions. For photos with free licenses, show upload options linking to UploadWizard instead of a message form. Add Flickr CC 4.0 license codes, user profile links, previous message detection from sent mail index, and back-navigation between category, search results, and message pages.

Closes #3

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-07 10:22:19 +00:00 · 2026-02-07 10:22:19 +00:00 · c5efd429ce
commit c5efd429ce
parent d59e67b55d
4 changed files with 403 additions and 19 deletions
--- a/main.py
+++ b/main.py
@ -34,17 +34,20 @@ COMMONS_CACHE_FILE = (
    Path(__file__).parent / "commons_contributions" / "thumbnail_cache.json"
 )
 SENT_MAIL_DIR = Path(__file__).parent / "sent_mail" / "messages"
+SENT_MAIL_INDEX_FILE = Path(__file__).parent / "sent_mail" / "messages_index.json"
 SENT_MAIL_INDEX_CACHE = (
    Path(__file__).parent / "commons_contributions" / "sent_mail_index.json"
 )
 COMMONS_CACHE_MAX_AGE = 86400 * 7  # Cache for 7 days
 RECENT_UPLOADS_COUNT = 24

-# User agent for Commons API requests
-COMMONS_USER_AGENT = (
+# User agent for Wikimedia API requests
+WIKIMEDIA_USER_AGENT = (
    "FlickrMail/1.0 (https://edwardbetts.com/flickr_mail/; edward@4angle.com)"
 )

+WIKIPEDIA_API = "https://en.wikipedia.org/w/api.php"
+
 # Browser-like headers for Flickr requests
 BROWSER_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
@ -64,18 +67,26 @@ BROWSER_HEADERS = {
 # Flickr license codes to human-readable names
 FLICKR_LICENSES = {
    0: "All Rights Reserved",
-    1: "CC BY-NC-SA",
-    2: "CC BY-NC",
-    3: "CC BY-NC-ND",
-    4: "CC BY",
-    5: "CC BY-SA",
-    6: "CC BY-ND",
+    1: "CC BY-NC-SA 2.0",
+    2: "CC BY-NC 2.0",
+    3: "CC BY-NC-ND 2.0",
+    4: "CC BY 2.0",
+    5: "CC BY-SA 2.0",
+    6: "CC BY-ND 2.0",
    7: "No known copyright",
    8: "US Government",
    9: "CC0",
    10: "Public Domain",
+    # CC 4.0 licenses. Code 16 was confirmed via Flickr; 11-15 are listed in
+    # the order BY, BY-SA, BY-ND, BY-NC, BY-NC-SA -- NOTE(review): re-verify
+    # against the live flickr.photos.licenses.getInfo response.
+    11: "CC BY 4.0",
+    12: "CC BY-SA 4.0",
+    13: "CC BY-ND 4.0",
+    14: "CC BY-NC 4.0",
+    15: "CC BY-NC-SA 4.0",
+    16: "CC BY-NC-ND 4.0",
 }

+# Non-free CC licenses (NC or ND restrictions).
+# Must stay in sync with the names in FLICKR_LICENSES: every code listed here
+# has "NC" or "ND" in its name.
+NONFREE_CC_LICENSES = {1, 2, 3, 6, 13, 14, 15, 16}
+
+# Wikipedia-compatible free licenses (no NC/ND restriction)
+FREE_LICENSES = {4, 5, 7, 8, 9, 10, 11, 12}
+

 PHOTOS_PER_PAGE = 25

@ -296,7 +307,7 @@ def fetch_commons_thumbnails(titles: list[str]) -> dict[str, str]:
        "format": "json",
    }

-    headers = {"User-Agent": COMMONS_USER_AGENT}
+    headers = {"User-Agent": WIKIMEDIA_USER_AGENT}

    try:
        response = requests.get(
@ -404,6 +415,165 @@ def get_recent_commons_uploads() -> tuple[list[CommonsUpload], int]:
    return result, total_matched


+def get_previous_messages(flickr_user: str, flickr_username: str) -> list[dict]:
+    """Get previous messages sent to a Flickr user.
+
+    Compares both the display name (flickr_user) and the username
+    (flickr_username), case-insensitively, against the "recipient" field of
+    each entry in the messages index (SENT_MAIL_INDEX_FILE).
+
+    Returns the matching index entries in file order. Returns an empty list
+    when the index is missing, unreadable, not valid JSON, or not a list.
+    """
+    try:
+        with open(SENT_MAIL_INDEX_FILE, encoding="utf-8") as f:
+            messages = json.load(f)
+    except (OSError, ValueError):
+        # Best effort: a missing or corrupt index just means "no history".
+        # OSError covers a missing file; ValueError covers both invalid JSON
+        # and undecodable bytes.
+        return []
+
+    if not isinstance(messages, list):
+        return []
+
+    # Empty names must never match, so leave them out of the candidate set.
+    wanted = {name.lower() for name in (flickr_user, flickr_username) if name}
+    if not wanted:
+        return []
+
+    matches = []
+    for msg in messages:
+        # Skip malformed entries and null/non-string recipients instead of
+        # raising AttributeError on .get() / .lower().
+        recipient = msg.get("recipient") if isinstance(msg, dict) else None
+        if isinstance(recipient, str) and recipient.lower() in wanted:
+            matches.append(msg)
+
+    return matches
+
+
+def parse_category_input(category_input: str) -> str | None:
+    """Parse a category title from a Wikipedia URL or direct input.
+
+    Accepts a category page URL (the title may be percent-encoded, e.g.
+    ".../wiki/Category%3AFoo"), a title with the "Category:" prefix in any
+    letter case, or a bare category name. Underscores become spaces.
+
+    Returns the title with a canonical "Category:" prefix, or None if the
+    input is a URL that is not a category page or the name is empty.
+    """
+    category_input = category_input.strip()
+
+    # Handle URL format: https://en.wikipedia.org/wiki/Category:Example
+    if "wikipedia.org" in category_input:
+        match = re.search(r"/wiki/([^#?]+)", category_input)
+        if not match:
+            return None
+        # Decode before checking the prefix so an encoded colon (%3A) works;
+        # the query string and fragment were already cut off by the regex.
+        category_input = unquote(match.group(1))
+        if not category_input.lower().startswith("category:"):
+            return None
+
+    name = category_input.replace("_", " ").strip()
+
+    # Drop an existing prefix in any letter case so the canonical spelling
+    # can be added back exactly once below.
+    if name.lower().startswith("category:"):
+        name = name[len("Category:"):].strip()
+
+    if not name:
+        return None
+    return f"Category:{name}"
+
+
+@dataclasses.dataclass
+class ArticleWithoutImage:
+    """Represents a Wikipedia article that needs an image."""
+
+    title: str  # Page title
+    pageid: int  # Page ID
+
+    @property
+    def wikipedia_url(self) -> str:
+        """URL to the Wikipedia article.
+
+        Spaces become underscores and the rest of the title is
+        percent-encoded, so characters such as "?", "#" and "%" stay part of
+        the page name instead of being read as URL syntax.
+        """
+        page = quote(self.title.replace(" ", "_"), safe="/:(),")
+        return f"https://en.wikipedia.org/wiki/{page}"
+
+    @property
+    def search_url(self) -> str:
+        """URL to search for this article in Flickr Mail."""
+        return f"/?enwp={quote(self.title)}"
+
+
+# Title fragments of images that are ignored when deciding whether an
+# article has an image (UI icons, logos and similar non-content files)
+NON_CONTENT_IMAGE_PATTERNS = [
+    "OOjs UI icon",
+    "Commons-logo",
+    "Symbol ",
+    "Edit-ltr",
+    "Ambox ",
+    "Question book",
+    "Wiki letter",
+    "Text document",
+    "Folder ",
+    "Crystal ",
+    "Nuvola ",
+    "Gnome-",
+    "Disambig ",
+    "DAB ",
+]
+
+
+def has_content_image(images: list[dict]) -> bool:
+    """Return True if at least one image is not a known non-content image.
+
+    An image is non-content when its "title" contains any fragment from
+    NON_CONTENT_IMAGE_PATTERNS. An entry without a title matches no fragment
+    and therefore counts as content. An empty list yields False.
+    """
+
+    def matches_non_content(image: dict) -> bool:
+        name = image.get("title", "")
+        return any(fragment in name for fragment in NON_CONTENT_IMAGE_PATTERNS)
+
+    return not all(matches_non_content(image) for image in images)
+
+
+def get_articles_without_images(
+    category: str, limit: int = 100
+) -> tuple[list[ArticleWithoutImage], str | None]:
+    """Get articles in a category that don't have images.
+
+    Uses generator=categorymembers with prop=images so a whole batch of
+    category members is checked per request. The image list of one batch
+    can be split over several responses (the API then returns "imcontinue"
+    without "batchcomplete"), so responses are followed and merged until the
+    batch is complete; otherwise articles whose images had not been listed
+    yet would be reported as having none.
+
+    Returns a tuple of (articles_list, continue_token): the articles sorted
+    by title, and the gcmcontinue value for the next batch or None. On a
+    request or JSON decoding error the result is ([], None).
+    """
+    params = {
+        "action": "query",
+        "generator": "categorymembers",
+        "gcmtitle": category,
+        "gcmtype": "page",  # Only articles, not subcategories or files
+        "gcmnamespace": "0",  # Main namespace only
+        "gcmlimit": str(limit),
+        "prop": "images",
+        "imlimit": "max",  # Fewer follow-up requests per batch
+        "format": "json",
+    }
+
+    headers = {"User-Agent": WIKIMEDIA_USER_AGENT}
+
+    max_requests = 20  # Safety cap on requests spent on a single batch
+    titles: dict[int, str] = {}  # pageid -> title for every page in the batch
+    illustrated: set[int] = set()  # pageids seen with a content image
+    continue_token = None
+    request_params = dict(params)
+
+    for _ in range(max_requests):
+        try:
+            response = requests.get(
+                WIKIPEDIA_API, params=request_params, headers=headers, timeout=30
+            )
+            response.raise_for_status()
+            data = response.json()
+        except (requests.RequestException, json.JSONDecodeError) as e:
+            print(f"Wikipedia API error: {e}")
+            return [], None
+
+        for page in data.get("query", {}).get("pages", {}).values():
+            title = page.get("title", "")
+            pageid = page.get("pageid", 0)
+            if not (title and pageid):
+                continue
+            titles[pageid] = title
+            # Only content images count, not UI icons or logos
+            if has_content_image(page.get("images", [])):
+                illustrated.add(pageid)
+
+        continuation = data.get("continue", {})
+        if "batchcomplete" in data or not continuation:
+            # Batch finished: gcmcontinue (if any) points at the next batch
+            continue_token = continuation.get("gcmcontinue")
+            break
+
+        # Image data is still pending for this batch: repeat the query with
+        # every continuation parameter the API handed back.
+        request_params = {**params, **continuation}
+    else:
+        print(
+            f"Wikipedia API: image data for {category} still incomplete "
+            f"after {max_requests} requests"
+        )
+
+    articles_without_images = [
+        ArticleWithoutImage(title=title, pageid=pageid)
+        for pageid, title in titles.items()
+        if pageid not in illustrated
+    ]
+
+    # Sort by title for consistent display
+    articles_without_images.sort(key=lambda a: a.title)
+
+    return articles_without_images, continue_token
+
+
 def is_valid_flickr_image_url(url: str) -> bool:
    """Check if URL is a valid Flickr static image URL."""
    valid_prefixes = (
@ -608,9 +778,13 @@ def start() -> str:
        wiki_part2 = name.replace(" ", "_")
        wikipedia_url = wiki_part1 + wiki_part2

-    if "_(" in name:
-        name = name[: name.find("_(")]
+    # Remove disambiguation suffix like "(academic)" for Flickr search
    name = name.replace("_", " ")
+    if " (" in name:
+        name = name[: name.find(" (")]
+
+    # Get category param if coming from category search
+    cat = flask.request.args.get("cat")

    flickr_url = flask.request.args.get("flickr")
    if not flickr_url:
@ -623,6 +797,7 @@ def start() -> str:
            name=name,
            enwp=enwp,
            search_result=search_result,
+            cat=cat,
        )

    if "/in/" in flickr_url:
@ -644,6 +819,40 @@ def start() -> str:
    if img_url and not is_valid_flickr_image_url(img_url):
        img_url = None

+    # Get flickr_user name and build profile URL
+    flickr_user = flask.request.args.get("flickr_user", "")
+    flickr_user_url = f"https://www.flickr.com/photos/{flickr_username}/"
+
+    # Check for previous messages to this user
+    previous_messages = get_previous_messages(flickr_user, flickr_username)
+
+    # Get license code if provided
+    license_code = flask.request.args.get("license", type=int)
+    license_name = (
+        FLICKR_LICENSES.get(license_code, "") if license_code is not None else ""
+    )
+
+    is_free_license = license_code in FREE_LICENSES
+    is_nonfree_cc = license_code in NONFREE_CC_LICENSES
+
+    # For free licenses, show upload options instead of message
+    if is_free_license:
+        return flask.render_template(
+            "combined.html",
+            name=name,
+            enwp=enwp,
+            flickr_url=flickr_url,
+            img_url=img_url,
+            license_code=license_code,
+            license_name=license_name,
+            is_free_license=True,
+            wikipedia_url=wikipedia_url,
+            flickr_user=flickr_user,
+            flickr_user_url=flickr_user_url,
+            cat=cat,
+            previous_messages=previous_messages,
+        )
+
    msg = flask.render_template(
        "message.jinja",
        flickr_url=flickr_url,
@ -652,6 +861,8 @@ def start() -> str:
        name=name,
        wiki_part1=wiki_part1,
        wiki_part2=wiki_part2,
+        is_nonfree_cc=is_nonfree_cc,
+        license_name=license_name,
    )

    subject = f"Request to use your photo of {name} on Wikipedia"
@ -667,6 +878,43 @@ def start() -> str:
        lines=lines,
        nsid=nsid,
        img_url=img_url,
+        license_code=license_code,
+        license_name=license_name,
+        flickr_user=flickr_user,
+        flickr_user_url=flickr_user_url,
+        cat=cat,
+        previous_messages=previous_messages,
+    )
+
+
+@app.route("/category")
+def category_search() -> str:
+    """Find articles in a Wikipedia category that need images.
+
+    Reads the category name or URL from the "cat" query parameter. Without
+    it, category.html is rendered with no context. Input that cannot be
+    parsed re-renders the template with an error message. Otherwise the
+    articles without images are looked up and passed to the template
+    together with the continue token.
+    """
+    cat = flask.request.args.get("cat", "").strip()
+
+    if not cat:
+        return flask.render_template("category.html")
+
+    category = parse_category_input(cat)
+    if not category:
+        return flask.render_template(
+            "category.html",
+            error="Invalid category format. Please enter a category name or URL.",
+            cat=cat,
+        )
+
+    articles, continue_token = get_articles_without_images(category)
+
+    # Display name: strip only the leading prefix, so a title that contains
+    # "Category:" again further on is left intact.
+    category_name = category.removeprefix("Category:")
+
+    return flask.render_template(
+        "category.html",
+        cat=cat,
+        category=category,
+        category_name=category_name,
+        articles=articles,
+        continue_token=continue_token,
    )