From e072279566f6cf7c6c11ab8f2f95bc16908145df Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sat, 7 Feb 2026 13:17:34 +0000 Subject: [PATCH 1/8] Stop fetching all pages when downloading sent mail --- download_commons_contributions.py | 27 +++++++++++++++--------- download_sent_mail.py | 34 ++++++++++++++++++++++++------- 2 files changed, 44 insertions(+), 17 deletions(-) diff --git a/download_commons_contributions.py b/download_commons_contributions.py index 1f8f508..1753552 100755 --- a/download_commons_contributions.py +++ b/download_commons_contributions.py @@ -48,12 +48,8 @@ def fetch_contributions( return contributions, new_continue -def upsert_contribution(session, c: dict) -> None: - """Insert or update a contribution by revid.""" - existing = session.query(Contribution).filter_by(revid=c["revid"]).first() - if existing: - return # Already have this revision - +def insert_contribution(session, c: dict) -> None: + """Insert a contribution row (caller must ensure revid is new).""" session.add(Contribution( userid=c.get("userid"), user=c.get("user"), @@ -108,13 +104,24 @@ def main() -> None: print("no results") break + # One DB query per batch to identify already-known revisions. + revids = [c["revid"] for c in contributions if "revid" in c] + existing_revids = { + row[0] + for row in ( + session.query(Contribution.revid) + .filter(Contribution.revid.in_(revids)) + .all() + ) + } + batch_new = 0 for c in contributions: - # Stop if we've reached contributions we already have - existing = session.query(Contribution).filter_by(revid=c["revid"]).first() - if existing: + revid = c.get("revid") + if revid in existing_revids: continue - upsert_contribution(session, c) + + insert_contribution(session, c) batch_new += 1 new_count += batch_new diff --git a/download_sent_mail.py b/download_sent_mail.py index c3ac224..0146c00 100755 --- a/download_sent_mail.py +++ b/download_sent_mail.py @@ -17,6 +17,7 @@ from flickr_mail.url_utils import ( BASE_URL = "https://www.flickr.com" SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}" MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}" +MAX_SENT_MAIL_PAGES = 29 # Fallback upper bound if we need to backfill everything HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0", @@ -166,22 +167,41 @@ def main() -> None: http_session = create_session() - # Scrape all pages to find new messages - total_pages = 29 new_messages: list[dict] = [] + stop_fetching = False - print("Fetching message list from all pages...") - for page in range(1, total_pages + 1): + print("Fetching message list until we reach existing messages...") + for page in range(1, MAX_SENT_MAIL_PAGES + 1): url = SENT_MAIL_URL.format(page=page) - print(f" Fetching page {page}/{total_pages}...") + print(f" Fetching page {page}...") try: soup = fetch_page(http_session, url) page_messages = extract_messages_from_list_page(soup) + if not page_messages: + print(" No messages found on this page, stopping") + break + + page_new_messages = 0 for msg in page_messages: - if msg["message_id"] not in existing_ids: - new_messages.append(msg) + msg_id = msg.get("message_id") + if not msg_id: + continue + if msg_id in existing_ids: + stop_fetching = True + break + + new_messages.append(msg) + page_new_messages += 1 + + if stop_fetching: + print(" Reached messages already in the database, stopping pagination") + break + + if page_new_messages == 0: + print(" No new messages on this page, stopping pagination") + break time.sleep(1) # Be polite to the server From 4f67960fe17e47a08af01ae18d350768757d067a Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sat, 7 Feb 2026 13:34:09 +0000 Subject: [PATCH 2/8] Make commons contributions sync resilient to shallow gaps --- download_commons_contributions.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/download_commons_contributions.py b/download_commons_contributions.py index 1753552..a2dea6e 100755 --- a/download_commons_contributions.py +++ b/download_commons_contributions.py @@ -12,6 +12,7 @@ from flickr_mail.models import Contribution API_URL = "https://commons.wikimedia.org/w/api.php" USERNAME = "Edward" +CONSECUTIVE_KNOWN_BATCHES_TO_STOP = 3 # Identify ourselves properly to Wikimedia USER_AGENT = "CommonsContributionsDownloader/0.1 (edward@4angle.com)" @@ -93,6 +94,7 @@ def main() -> None: batch_num = 0 new_count = 0 continue_token = None + consecutive_known_batches = 0 while True: batch_num += 1 @@ -130,7 +132,18 @@ def main() -> None: session.commit() if batch_new == 0: - # All contributions in this batch already exist, we're caught up + consecutive_known_batches += 1 + print( + " Batch fully known " + f"({consecutive_known_batches}/" + f"{CONSECUTIVE_KNOWN_BATCHES_TO_STOP})" + ) + else: + consecutive_known_batches = 0 + + if consecutive_known_batches >= CONSECUTIVE_KNOWN_BATCHES_TO_STOP: + # Stop after a small overlap window of known-only batches. + # This catches recent historical gaps without full-history scans. print(" Caught up with existing data") break From 2819652afde3c20d2cece435c20deed873240a96 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sat, 7 Feb 2026 13:41:27 +0000 Subject: [PATCH 3/8] Handle modern UploadWizard comments when indexing Flickr uploads --- update_flickr_uploads.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/update_flickr_uploads.py b/update_flickr_uploads.py index 06b85e0..72c70e8 100644 --- a/update_flickr_uploads.py +++ b/update_flickr_uploads.py @@ -2,8 +2,12 @@ """ Find UploadWizard contributions that are from Flickr and add them to the database. -For contributions with comment 'User created page with UploadWizard', queries the -Commons API to check if the image source is Flickr (by checking the Credit field). +Supports both UploadWizard comment styles: +- "User created page with UploadWizard" (older) +- "Uploaded a work by ... with UploadWizard" (newer, often includes Flickr URL) + +If a Flickr URL is not present in the contribution comment, queries Commons API +to check if the image source is Flickr (by checking the Credit field). """ import json @@ -27,6 +31,13 @@ def extract_flickr_url_from_credit(credit: str) -> str | None: return match.group(0) if match else None +def extract_flickr_url_from_comment(comment: str) -> str | None: + """Extract Flickr URL directly from a contribution comment.""" + pattern = r'https?://(?:www\.)?flickr\.com/photos/[^/\s]+/\d+' + match = re.search(pattern, comment or "") + return match.group(0) if match else None + + def get_image_metadata(titles: list[str]) -> dict[str, dict]: """Fetch image metadata from Commons API for multiple titles.""" if not titles: @@ -97,10 +108,12 @@ def main(): ) url_to_message = {msg.normalized_flickr_url: msg for msg in sent_messages} - # Find UploadWizard contributions (page creations only) + # Find UploadWizard file uploads. + # Old format: "User created page with UploadWizard" + # New format: "Uploaded a work by ... with UploadWizard" upload_wizard = ( session.query(Contribution) - .filter(Contribution.comment == "User created page with UploadWizard") + .filter(Contribution.comment.contains("UploadWizard")) .filter(Contribution.title.startswith("File:")) .all() ) @@ -127,7 +140,10 @@ def main(): credit = meta.get("credit", "") artist = meta.get("artist", "") - flickr_url = extract_flickr_url_from_credit(credit) + # Prefer URL directly in comment; fall back to extmetadata Credit. + flickr_url = extract_flickr_url_from_comment(c.comment or "") + if not flickr_url: + flickr_url = extract_flickr_url_from_credit(credit) if not flickr_url: continue From 252a854e768b1fa6f100f737715cdf4078611561 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sat, 7 Feb 2026 14:41:41 +0000 Subject: [PATCH 4/8] Move Flickr sent-mail cookies into local config file --- .gitignore | 1 + AGENTS.md | 47 +++++++++--- README.md | 131 ++++++++++++++++---------------- download_sent_mail.example.json | 3 + download_sent_mail.py | 24 +++++- 5 files changed, 127 insertions(+), 79 deletions(-) create mode 100644 download_sent_mail.example.json diff --git a/.gitignore b/.gitignore index 2bcc7a3..f377651 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ __pycache__ commons_contributions/thumbnail_cache.json commons_contributions/sent_mail_index.json flickr_mail.db +download_sent_mail.local.json diff --git a/AGENTS.md b/AGENTS.md index 60e1614..47b3193 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -85,16 +85,18 @@ for the Flickr mail URL. Scrapes the user's profile page for embedded params. Shows recent Wikimedia Commons uploads on the home page, filtered to only those obtained via Flickr mail requests. -**Data files** (in `commons_contributions/`): -- `flickr_uploads.json`: List of Commons uploads from Flickr with metadata -- `thumbnail_cache.json`: Cached Commons API thumbnail URLs (7-day TTL) -- `sent_mail_index.json`: Index of sent mail messages (flickr_url → wikipedia_url) +**Database tables used by the app**: +- `sent_messages`: downloaded from Flickr sent mail, includes extracted Flickr + URL and Wikipedia URL from message body +- `contributions`: downloaded from Commons `usercontribs` +- `flickr_uploads`: derived table built by `update_flickr_uploads.py` by + matching Commons uploads to Flickr URLs +- `thumbnail_cache`: cached Commons API thumbnail URLs (7-day TTL) **Key functions**: -- `build_sent_mail_index()`: Parses sent mail JSON files, extracts Flickr and - Wikipedia URLs from message bodies, caches the index - `get_recent_commons_uploads()`: Loads uploads, filters by sent mail match, - fetches thumbnails from Commons API + joins `flickr_uploads` with `sent_messages`, and fetches thumbnails from + Commons API - `normalize_flickr_url()`: Normalizes URLs for matching (removes protocol, www, trailing slash) **CommonsUpload dataclass**: @@ -104,9 +106,14 @@ those obtained via Flickr mail requests. - `wiki_link_url`, `wiki_link_label`: Handles Wikidata vs Wikipedia links **Maintenance script** (`update_flickr_uploads.py`): -Run to find Flickr uploads from UploadWizard contributions that don't have -the Flickr URL in the edit comment. Queries Commons API for image metadata -and checks the Credit field for Flickr URLs. +Builds/updates `flickr_uploads` from `contributions` and links to +`sent_messages`. +- Scans file contributions containing `UploadWizard` in the comment +- Supports both comment styles: + - `User created page with UploadWizard` (older) + - `Uploaded a work by ... with UploadWizard` (newer; often includes URL) +- Extracts Flickr URL from contribution comment when present +- Falls back to Commons `extmetadata.Credit` lookup when comment has no URL ### Category Search (`/category` route) @@ -125,7 +132,7 @@ to allow back-navigation to the category. ### Previous Message Detection (`get_previous_messages`) -Checks `sent_mail/messages_index.json` for previous messages to a Flickr user. +Checks the `sent_messages` database table for previous messages to a Flickr user. Matches by both display name and username (case-insensitive). Results shown as an info alert on the message page. @@ -159,6 +166,24 @@ print(f"{len(result.photos)} photos, {result.total_pages} pages") print(result.photos[0].title, result.photos[0].license_name) ``` +## Data Sync Workflow + +To refresh "recent Commons uploads obtained via Flickr mail", run scripts in +this order: + +1. `./download_sent_mail.py` +2. `./download_commons_contributions.py` +3. `./update_flickr_uploads.py` + +Notes: +- `download_sent_mail.py` reads Flickr auth cookies from + `download_sent_mail.local.json` (`cookies_str` key). Copy + `download_sent_mail.example.json` to create local config. +- `main.py` does not populate `flickr_uploads`; it only reads from it. +- `download_commons_contributions.py` intentionally stops after several + consecutive fully-known API batches (overlap window) to avoid full-history + scans while still catching shallow gaps. + ## Potential Improvements - Cache search results to reduce Flickr requests diff --git a/README.md b/README.md index 848e89d..b8d984d 100644 --- a/README.md +++ b/README.md @@ -1,89 +1,88 @@ -# Flickr Photo Finder for Wikipedia Articles +# Flickr Mail Tool lives here: -This tool is designed to help you find photos on Flickr for Wikipedia articles -and contact the photographer. It's a Python application that leverages the Flask -framework for web development. +Flickr Mail is a Flask app that helps find Flickr photos for Wikipedia articles +and contact photographers to request Wikipedia-compatible licensing. -## Table of Contents -- [Introduction](#introduction) -- [Usage](#usage) -- [Error Handling](#error-handling) -- [Running the Application](#running-the-application) +## What It Does -## Introduction +- Searches Flickr from a Wikipedia article title/URL +- Shows license status for each result (free vs non-free CC variants) +- Builds a ready-to-send Flickr message for non-free licenses +- Finds image-less articles in a Wikipedia category +- Shows recent Commons uploads that came from Flickr mail outreach -This tool is developed and maintained by Edward Betts (edward@4angle.com). Its -primary purpose is to simplify the process of discovering and contacting -photographers on Flickr whose photos can be used to enhance Wikipedia articles. +## Project Layout -### Key Features -- **Integrated Flickr search**: Enter a Wikipedia article title and see Flickr - photos directly in the interface - no need to visit Flickr's search page. -- **Photo grid with metadata**: Search results display as a grid of thumbnails - showing the user's name and license for each photo. -- **License handling**: Photos with Wikipedia-compatible licenses (CC BY, - CC BY-SA, CC0, Public Domain) are highlighted with a green badge and link - directly to the Commons UploadWizard. Non-free CC licenses (NC/ND) show a - tailored message explaining Wikipedia's requirements. Supports both CC 2.0 - and CC 4.0 license codes. -- **One-click message composition**: Click any photo to compose a permission - request message with the photo displayed alongside, showing the user's Flickr - profile and current license. -- **Previous message detection**: The message page checks sent mail history and - warns if you have previously contacted the user. -- **Category search**: Find Wikipedia articles without images in a given - category, with links to search Flickr for each article. -- **Pagination**: Browse through thousands of search results with page navigation. -- **Recent uploads showcase**: The home page displays recent Wikimedia Commons - uploads that were obtained via Flickr mail requests, with links to the - Wikipedia article and user's Flickr profile. -- Handle exceptions gracefully and provide detailed error information. +- `main.py`: Flask app routes and core logic +- `templates/`: UI templates +- `download_sent_mail.py`: sync Flickr sent messages into DB +- `download_commons_contributions.py`: sync Commons contributions into DB +- `update_flickr_uploads.py`: derive `flickr_uploads` from contributions/sent mail +- `flickr_mail.db`: SQLite database -## Usage +## Database Pipeline -To use the tool, follow these steps: +The recent uploads section depends on a 3-step pipeline: -1. Start the tool by running the script. -2. Access the tool through a web browser. -3. Enter a Wikipedia article title or URL, or use "Find articles by category" - to discover articles that need images. -4. Browse the Flickr search results displayed in the interface. -5. Click on a photo to select it. If the license is Wikipedia-compatible, you'll - be linked to the Commons UploadWizard. Otherwise, a message is composed to - request a license change. -6. Copy the subject and message, then click "Send message on Flickr" to contact - the user. +1. `./download_sent_mail.py` updates `sent_messages` +2. `./download_commons_contributions.py` updates `contributions` +3. `./update_flickr_uploads.py` builds/updates `flickr_uploads` -## Error Handling +`main.py` only reads `flickr_uploads`; it does not populate it. -The application includes error handling to ensure a smooth user experience. If -an error occurs, it will display a detailed error message with traceback -information. The error handling is designed to provide valuable insights into -any issues that may arise during use. +## UploadWizard Detection -## Running the Application +`update_flickr_uploads.py` supports both Commons UploadWizard comment styles: -To run the application, ensure you have Python 3 installed on your system. You -will also need to install the required Python modules mentioned in the script, -including Flask, requests, and others. +- `User created page with UploadWizard` (older) +- `Uploaded a work by ... with UploadWizard` (newer) -1. Clone this repository to your local machine. -2. Navigate to the project directory. -3. Run the following command to start the application: +It first tries to extract a Flickr URL directly from the contribution comment. +If absent, it falls back to Commons `extmetadata.Credit`. + +## Local Run + +Install dependencies (example): + +```bash +pip install flask requests beautifulsoup4 sqlalchemy +``` + +Start the app: ```bash python3 main.py ``` -4. Access the application by opening a web browser and visiting the provided URL - (usually `http://localhost:5000/`). +Then open: -That's it! You can now use the Flickr Photo Finder tool to streamline the -process of finding and contacting photographers for Wikipedia articles. +- `http://localhost:5000/` -If you encounter any issues or have questions, feel free to contact Edward Betts -(edward@4angle.com). +## Refresh Data -Happy photo hunting! +Run in this order: + +```bash +./download_sent_mail.py +./download_commons_contributions.py +./update_flickr_uploads.py +``` + +Before running `./download_sent_mail.py`, create local auth config: + +```bash +cp download_sent_mail.example.json download_sent_mail.local.json +``` + +Then edit `download_sent_mail.local.json` and set `cookies_str` to your full +Flickr `Cookie` header value. + +## Notes + +- `download_commons_contributions.py` uses an overlap window of known-only + batches before stopping to avoid full-history scans while still catching + shallow gaps. +- If a known Commons upload is missing from `flickr_uploads`, re-run the full + 3-step pipeline above. diff --git a/download_sent_mail.example.json b/download_sent_mail.example.json new file mode 100644 index 0000000..4b305c6 --- /dev/null +++ b/download_sent_mail.example.json @@ -0,0 +1,3 @@ +{ + "cookies_str": "paste your full Flickr Cookie header value here" +} diff --git a/download_sent_mail.py b/download_sent_mail.py index 0146c00..1951882 100755 --- a/download_sent_mail.py +++ b/download_sent_mail.py @@ -1,7 +1,9 @@ #!/usr/bin/env python3 """Download sent FlickrMail messages for backup.""" +import json import time +from pathlib import Path import requests from bs4 import BeautifulSoup @@ -18,6 +20,8 @@ BASE_URL = "https://www.flickr.com" SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}" MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}" MAX_SENT_MAIL_PAGES = 29 # Fallback upper bound if we need to backfill everything +CONFIG_FILE = Path(__file__).with_name("download_sent_mail.local.json") +EXAMPLE_CONFIG_FILE = Path(__file__).with_name("download_sent_mail.example.json") HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0", @@ -34,7 +38,23 @@ HEADERS = { "Priority": "u=0, i", } -COOKIES_STR = """ccc=%7B%22needsConsent%22%3Atrue%2C%22managed%22%3A0%2C%22changed%22%3A0%2C%22info%22%3A%7B%22cookieBlock%22%3A%7B%22level%22%3A2%2C%22blockRan%22%3A1%7D%7D%7D; _sp_ses.df80=*; _sp_id.df80=968931de-089d-4576-b729-6662c2c13a65.1770187027.1.1770187129..adf2374b-b85c-4899-afb7-63c2203d0c44..9422de57-9cdf-49c9-ac54-183eaa1ec457.1770187027101.24; TAsessionID=7f373c97-e9f8-46cb-bc1a-cb4f164ce46b|NEW; notice_behavior=expressed,eu; usprivacy=1---; acstring=3~550.1942.3126.3005.3077.1329.196.1725.1092; euconsent-v2=CQfGXgAQfGXgAAvACDENCQFsAP_gAEPgAAAALktB9G5cSSFBYCJVYbtEYAQDwFhg4oAhAgABEwAATBoAoIwGBGAoIAiAICACAAAAIARAIAEECAAAQAAAIIABAAAMAEAAIAACIAAACAABAgAACEAIAAggWAAAAEBEAFQAgAAAQBIACFAAAgABAUABAAAAAACAAQAAACAgQAAAAAAAAAAAkAhAAAAAAAAAABAMAAABIAAAAAAAAAAAAAAAAAAABAAAAICBAAAAQAAAAAAAAAAAAAAAAAAAAgqY0H0blxJIUFgIFVhu0QgBBPAWADigCEAAAEDAABMGgCgjAIUYCAgSIAgIAAAAAAgBEAgAQAIAABAAAAAgAEAAAwAQAAgAAAAAAAAAAECAAAAQAgACCBYAAAAQEQAVACBAABAEgAIUAAAAAEBQAEAAAAAAIABAAAAICBAAAAAAAAAAACQCEAAAAAAAAAAEAwBAAEgAAAAAAAAAAAAAAAAAAAEABAAgIEAAABAA.YAAAAAAAAAAA.ILktB9G5cSSFBYCJVYbtEYAQTwFhg4oAhAgABEwAATBoAoIwGFGAoIEiAICACAAAAIARAIAEECAAAQAAAIIABAAAMAEAAIAACIAAACAABAgAACEAIAAggWAAAAEBEAFQAgQAAQBIACFAAAgABAUABAAAAAACAAQAAACAgQAAAAAAAAAAAkAhAAAAAAAAAABAMAQABIAAAAAAAAAAAAAAAAAAABAAQAICBAAAAQAAAAAAAAAAAAAAAAAAAAgA; notice_preferences=2:; notice_gdpr_prefs=0,1,2:; cmapi_gtm_bl=; cmapi_cookie_privacy=permit 1,2,3; AMCV_48E815355BFE96970A495CD0%40AdobeOrg=281789898%7CMCMID%7C44859851125632937290373504988866174366%7CMCOPTOUT-1770194232s%7CNONE%7CvVersion%7C4.1.0; AMCVS_48E815355BFE96970A495CD0%40AdobeOrg=1; xb=646693; localization=en-us%3Buk%3Bgb; flrbp=1770187037-cfbf3914859af9ef68992c8389162e65e81c86c4; flrbgrp=1770187037-8e700fa7d73b4f2d43550f40513e7c6f507fd20f; flrbgdrp=1770187037-9af21cc74000b5f3f0943243608b4284d5f60ffd; flrbgmrp=1770187037-53f7bfff110731954be6bdfb2f587d59a8305670; flrbrst=1770187037-440e42fcee9b4e8e81ba8bc3eb3d0fc8b62e7083; flrtags=1770187037-7b50035cb956b9216a2f3372f498f7008d8e26a8; flrbrp=1770187037-c0195dc99caa020d4e32b39556131add862f26a0; flrb=34; session_id=2693fb01-87a0-42b1-a426-74642807b534; cookie_session=834645%3A29f2a9722d8bac88553ea1baf7ea11b4; cookie_accid=834645; cookie_epass=29f2a9722d8bac88553ea1baf7ea11b4; sa=1775371036%3A79962317%40N00%3A8fb60f4760b4840f37af3ebc90a8cb57; vp=2075%2C1177%2C1%2C0; flrbfd=1770187037-88a4e436729c9c5551794483fbd9c80e9dac2354; flrbpap=1770187037-18adaacf3a389df4a7bdc05cd471e492c54ef841; liqpw=2075; liqph=672""" +def load_cookie_string() -> str: + """Load Flickr cookies string from local JSON config.""" + if not CONFIG_FILE.exists(): + raise RuntimeError( + f"Missing config file: {CONFIG_FILE}. " + f"Copy {EXAMPLE_CONFIG_FILE.name} to {CONFIG_FILE.name} and set cookies_str." + ) + + try: + data = json.loads(CONFIG_FILE.read_text()) + except json.JSONDecodeError as exc: + raise RuntimeError(f"Invalid JSON in {CONFIG_FILE}: {exc}") from exc + + cookie_str = data.get("cookies_str", "").strip() + if not cookie_str: + raise RuntimeError(f"{CONFIG_FILE} must contain a non-empty 'cookies_str' value") + return cookie_str def parse_cookies(cookie_str: str) -> dict[str, str]: @@ -51,7 +71,7 @@ def create_session() -> requests.Session: """Create a requests session with authentication.""" session = requests.Session() session.headers.update(HEADERS) - session.cookies.update(parse_cookies(COOKIES_STR)) + session.cookies.update(parse_cookies(load_cookie_string())) return session From 08f5128e8d7fbedb7eee0df1a58c9bd4a5ac3f2b Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sun, 8 Feb 2026 12:34:04 +0000 Subject: [PATCH 5/8] Add interaction logging and tighten model NOT NULL constraints Log searches (article/category) and message-generation events to a new interaction_log table, capturing IP address and User-Agent. Also apply NOT NULL constraints to Contribution, SentMessage, FlickrUpload, and ThumbnailCache fields that are always populated, and remove stale continue_token references from category.html. Co-Authored-By: Claude Sonnet 4.5 --- AGENTS.md | 20 +++++++ README.md | 12 +++++ flickr_mail/models.py | 82 +++++++++++++++++----------- main.py | 116 ++++++++++++++++++++++++++-------------- templates/category.html | 6 +-- 5 files changed, 160 insertions(+), 76 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 47b3193..ef5ddb7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -92,6 +92,8 @@ those obtained via Flickr mail requests. - `flickr_uploads`: derived table built by `update_flickr_uploads.py` by matching Commons uploads to Flickr URLs - `thumbnail_cache`: cached Commons API thumbnail URLs (7-day TTL) +- `interaction_log`: written by the web app to record searches and message + generation events (see below) **Key functions**: - `get_recent_commons_uploads()`: Loads uploads, filters by sent mail match, @@ -115,6 +117,24 @@ Builds/updates `flickr_uploads` from `contributions` and links to - Extracts Flickr URL from contribution comment when present - Falls back to Commons `extmetadata.Credit` lookup when comment has no URL +### Interaction Logging (`log_interaction`) + +The `log_interaction()` helper writes a row to `interaction_log` on each +meaningful user action: + +- `"search_article"` – user submits a Wikipedia article search (page 1 only, + to avoid logging every pagination hit) +- `"search_category"` – user submits a Wikipedia category search +- `"generate_message"` – a non-free CC message is generated after clicking a photo + +Each row captures: Unix `timestamp`, `interaction_type`, `ip_address` +(prefers `X-Forwarded-For` for proxy setups), `user_agent`, `query` (article +title or category name), and optionally `flickr_url` / `wikipedia_url`. + +The table is created by `init_db()` (called via `python3 -c "from +flickr_mail.database import init_db; init_db()"` or any of the maintenance +scripts). The web app never calls `init_db()` itself. + ### Category Search (`/category` route) Finds Wikipedia articles in a category that don't have images. diff --git a/README.md b/README.md index b8d984d..3848a3e 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,18 @@ cp download_sent_mail.example.json download_sent_mail.local.json Then edit `download_sent_mail.local.json` and set `cookies_str` to your full Flickr `Cookie` header value. +## Interaction Logging + +The app logs searches and message generation to the `interaction_log` table: + +- `search_article`: when a user searches for a Wikipedia article title (page 1 only) +- `search_category`: when a user searches a Wikipedia category +- `generate_message`: when a non-free CC message is generated for a photo + +Each row records the timestamp, interaction type, client IP (from +`X-Forwarded-For` if present), User-Agent, query, and (for message events) +the Flickr and Wikipedia URLs. + ## Notes - `download_commons_contributions.py` uses an overlap window of known-only diff --git a/flickr_mail/models.py b/flickr_mail/models.py index 090b37a..0b2bedc 100644 --- a/flickr_mail/models.py +++ b/flickr_mail/models.py @@ -12,20 +12,20 @@ class Contribution(Base): __tablename__ = "contributions" id: Mapped[int] = mapped_column(primary_key=True) - userid: Mapped[int | None] - user: Mapped[str | None] - pageid: Mapped[int | None] - revid: Mapped[int | None] = mapped_column(unique=True) - parentid: Mapped[int | None] - ns: Mapped[int | None] - title: Mapped[str | None] - timestamp: Mapped[str | None] + userid: Mapped[int] + user: Mapped[str] + pageid: Mapped[int] + revid: Mapped[int] = mapped_column(unique=True) + parentid: Mapped[int] + ns: Mapped[int] + title: Mapped[str] + timestamp: Mapped[str] minor: Mapped[str | None] top: Mapped[str | None] - comment: Mapped[str | None] = mapped_column(Text) - size: Mapped[int | None] - sizediff: Mapped[int | None] - tags: Mapped[str | None] = mapped_column(Text) # JSON array stored as text + comment: Mapped[str] = mapped_column(Text) + size: Mapped[int] + sizediff: Mapped[int] + tags: Mapped[str] = mapped_column(Text) # JSON array stored as text __table_args__ = ( Index("ix_contributions_timestamp", "timestamp"), @@ -37,16 +37,16 @@ class SentMessage(Base): __tablename__ = "sent_messages" message_id: Mapped[str] = mapped_column(primary_key=True) - subject: Mapped[str | None] - url: Mapped[str | None] - recipient: Mapped[str | None] - date: Mapped[str | None] - body: Mapped[str | None] = mapped_column(Text) - body_html: Mapped[str | None] = mapped_column(Text) - flickr_url: Mapped[str | None] - normalized_flickr_url: Mapped[str | None] - wikipedia_url: Mapped[str | None] - creator_profile_url: Mapped[str | None] + subject: Mapped[str] + url: Mapped[str] + recipient: Mapped[str] + date: Mapped[str] + body: Mapped[str] = mapped_column(Text) + body_html: Mapped[str] = mapped_column(Text) + flickr_url: Mapped[str] + normalized_flickr_url: Mapped[str] + wikipedia_url: Mapped[str] + creator_profile_url: Mapped[str] flickr_uploads: Mapped[list["FlickrUpload"]] = relationship( back_populates="sent_message" @@ -62,15 +62,15 @@ class FlickrUpload(Base): __tablename__ = "flickr_uploads" id: Mapped[int] = mapped_column(primary_key=True) - pageid: Mapped[int | None] - revid: Mapped[int | None] - title: Mapped[str | None] - timestamp: Mapped[str | None] - flickr_url: Mapped[str | None] - normalized_flickr_url: Mapped[str | None] + pageid: Mapped[int] + revid: Mapped[int] + title: Mapped[str] + timestamp: Mapped[str] + flickr_url: Mapped[str] + normalized_flickr_url: Mapped[str] creator: Mapped[str | None] - wikipedia_url: Mapped[str | None] - creator_profile_url: Mapped[str | None] + wikipedia_url: Mapped[str] + creator_profile_url: Mapped[str] sent_message_id: Mapped[str | None] = mapped_column( ForeignKey("sent_messages.message_id") ) @@ -89,5 +89,23 @@ class ThumbnailCache(Base): __tablename__ = "thumbnail_cache" title: Mapped[str] = mapped_column(primary_key=True) - thumb_url: Mapped[str | None] - fetched_at: Mapped[int | None] # Unix timestamp + thumb_url: Mapped[str] + fetched_at: Mapped[int] # Unix timestamp + + +class InteractionLog(Base): + __tablename__ = "interaction_log" + + id: Mapped[int] = mapped_column(primary_key=True) + timestamp: Mapped[int] # Unix timestamp + interaction_type: Mapped[str] # "search_article", "search_category", "generate_message" + ip_address: Mapped[str | None] + user_agent: Mapped[str | None] = mapped_column(Text) + query: Mapped[str | None] # search term or category name + flickr_url: Mapped[str | None] + wikipedia_url: Mapped[str | None] + + __table_args__ = ( + Index("ix_interaction_log_timestamp", "timestamp"), + Index("ix_interaction_log_type", "interaction_type"), + ) diff --git a/main.py b/main.py index 25279ff..3e7c336 100755 --- a/main.py +++ b/main.py @@ -18,7 +18,7 @@ from sqlalchemy import func from werkzeug.debug.tbtools import DebugTraceback from flickr_mail.database import get_session -from flickr_mail.models import FlickrUpload, SentMessage, ThumbnailCache +from flickr_mail.models import FlickrUpload, InteractionLog, SentMessage, ThumbnailCache from flickr_mail.url_utils import extract_urls_from_message, normalize_flickr_url import re @@ -378,15 +378,12 @@ def has_content_image(images: list[dict]) -> bool: return False -def get_articles_without_images( - category: str, limit: int = 100 -) -> tuple[list[ArticleWithoutImage], str | None]: +def get_articles_without_images(category: str) -> list[ArticleWithoutImage]: """Get articles in a category that don't have images. Uses generator=categorymembers with prop=images to efficiently check - multiple articles in a single API request. - - Returns a tuple of (articles_list, continue_token). + multiple articles in a single API request, following continuation until + all category members have been processed. """ params = { "action": "query", @@ -394,49 +391,54 @@ def get_articles_without_images( "gcmtitle": category, "gcmtype": "page", # Only articles, not subcategories or files "gcmnamespace": "0", # Main namespace only - "gcmlimit": str(limit), + "gcmlimit": "max", "prop": "images", "imlimit": "max", # Need enough to check all pages in batch "format": "json", } headers = {"User-Agent": WIKIMEDIA_USER_AGENT} - - try: - response = requests.get( - WIKIPEDIA_API, params=params, headers=headers, timeout=30 - ) - response.raise_for_status() - data = response.json() - except (requests.RequestException, json.JSONDecodeError) as e: - print(f"Wikipedia API error: {e}") - return [], None - articles_without_images: list[ArticleWithoutImage] = [] + continue_token: str | None = None - pages = data.get("query", {}).get("pages", {}) - for page in pages.values(): - images = page.get("images", []) + while True: + request_params = params.copy() + if continue_token: + request_params["gcmcontinue"] = continue_token - # Skip if page has content images (not just UI icons) - if has_content_image(images): - continue - - title = page.get("title", "") - pageid = page.get("pageid", 0) - - if title and pageid: - articles_without_images.append( - ArticleWithoutImage(title=title, pageid=pageid) + try: + response = requests.get( + WIKIPEDIA_API, params=request_params, headers=headers, timeout=30 ) + response.raise_for_status() + data = response.json() + except (requests.RequestException, json.JSONDecodeError) as e: + print(f"Wikipedia API error: {e}") + break + + pages = data.get("query", {}).get("pages", {}) + for page in pages.values(): + images = page.get("images", []) + + # Skip if page has content images (not just UI icons) + if has_content_image(images): + continue + + title = page.get("title", "") + pageid = page.get("pageid", 0) + + if title and pageid: + articles_without_images.append( + ArticleWithoutImage(title=title, pageid=pageid) + ) + + continue_token = data.get("continue", {}).get("gcmcontinue") + if not continue_token: + break # Sort by title for consistent display articles_without_images.sort(key=lambda a: a.title) - - # Get continue token if there are more results - continue_token = data.get("continue", {}).get("gcmcontinue") - - return articles_without_images, continue_token + return articles_without_images def is_valid_flickr_image_url(url: str) -> bool: @@ -583,6 +585,33 @@ def parse_flickr_search_results(html: str, page: int = 1) -> SearchResult: ) +def log_interaction( + interaction_type: str, + query: str | None = None, + flickr_url: str | None = None, + wikipedia_url: str | None = None, +) -> None: + """Log a user interaction to the database.""" + forwarded_for = flask.request.headers.get("X-Forwarded-For") + ip_address = forwarded_for.split(",")[0].strip() if forwarded_for else flask.request.remote_addr + user_agent = flask.request.headers.get("User-Agent") + session = get_session() + try: + entry = InteractionLog( + timestamp=int(time.time()), + interaction_type=interaction_type, + ip_address=ip_address, + user_agent=user_agent, + query=query, + flickr_url=flickr_url, + wikipedia_url=wikipedia_url, + ) + session.add(entry) + session.commit() + finally: + session.close() + + @app.errorhandler(werkzeug.exceptions.InternalServerError) def exception_handler(e: werkzeug.exceptions.InternalServerError) -> tuple[str, int]: """Handle exception.""" @@ -656,6 +685,8 @@ def start() -> str: # Search Flickr for photos page = flask.request.args.get("page", 1, type=int) page = max(1, page) # Ensure page is at least 1 + if page == 1: + log_interaction("search_article", query=name, wikipedia_url=wikipedia_url) search_result = search_flickr(name, page) return flask.render_template( "combined.html", @@ -718,6 +749,13 @@ def start() -> str: previous_messages=previous_messages, ) + log_interaction( + "generate_message", + query=name, + flickr_url=flickr_url, + wikipedia_url=wikipedia_url, + ) + msg = flask.render_template( "message.jinja", flickr_url=flickr_url, @@ -768,7 +806,8 @@ def category_search() -> str: cat=cat, ) - articles, continue_token = get_articles_without_images(category) + log_interaction("search_category", query=category) + articles = get_articles_without_images(category) # Get the display name (without Category: prefix) category_name = category.replace("Category:", "") @@ -779,7 +818,6 @@ def category_search() -> str: category=category, category_name=category_name, articles=articles, - continue_token=continue_token, ) diff --git a/templates/category.html b/templates/category.html index da52980..31319b6 100644 --- a/templates/category.html +++ b/templates/category.html @@ -33,7 +33,7 @@
Articles without images in {{ category_name }}
{% if articles %} -

Found {{ articles | length }} article(s) without images{% if continue_token %} (more available){% endif %}

+

Found {{ articles | length }} article(s) without images

{% for article in articles %} @@ -44,10 +44,6 @@ {% endfor %}
- {% if continue_token %} -

Note: Only showing first batch of results. More articles may be available in this category.

- {% endif %} - {% else %}
All articles in this category have images! From ab012f9cf3c352aa2d211232ad9cb3218158360f Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sun, 8 Feb 2026 12:34:50 +0000 Subject: [PATCH 6/8] Change submit button to say Search and use btn-primary styling Match the style of the category search page button. Co-Authored-By: Claude Sonnet 4.5 --- templates/combined.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/combined.html b/templates/combined.html index 54158f0..55f96e4 100644 --- a/templates/combined.html +++ b/templates/combined.html @@ -12,7 +12,7 @@
- + Find articles by category From 57b2e474df9c1a49dfadec7f19bcd04de2891126 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sun, 8 Feb 2026 13:32:56 +0000 Subject: [PATCH 7/8] Add pagination to category search for large categories Large categories like "Living people" (900k+ articles) were impractical because the code tried to download all members before displaying results. Now stops after collecting ~200 articles and provides a "Next page" link. Also fixes the MediaWiki API continuation protocol: passes the full continue dict (not just gcmcontinue) so imcontinue responses are handled properly, and reduces gcmlimit from "max" to 50 so each batch's images fit in one API response. Co-Authored-By: Claude Opus 4.6 --- main.py | 63 +++++++++++++++++++++++++++++++---------- templates/category.html | 6 ++++ 2 files changed, 54 insertions(+), 15 deletions(-) diff --git a/main.py b/main.py index 3e7c336..fa5bb9a 100755 --- a/main.py +++ b/main.py @@ -348,6 +348,14 @@ class ArticleWithoutImage: return f"/?enwp={quote(self.title)}" +@dataclasses.dataclass +class CategoryResult: + """Result of a paginated category search.""" + + articles: list[ArticleWithoutImage] + gcmcontinue: str | None + + # Common non-content images to ignore when checking if an article has images NON_CONTENT_IMAGE_PATTERNS = [ "OOjs UI icon", @@ -378,12 +386,16 @@ def has_content_image(images: list[dict]) -> bool: return False -def get_articles_without_images(category: str) -> list[ArticleWithoutImage]: +def get_articles_without_images( + category: str, + limit: int = 200, + gcmcontinue: str | None = None, +) -> CategoryResult: """Get articles in a category that don't have images. Uses generator=categorymembers with prop=images to efficiently check multiple articles in a single API request, following continuation until - all category members have been processed. + the limit is reached or all category members have been processed. """ params = { "action": "query", @@ -391,20 +403,25 @@ def get_articles_without_images(category: str) -> list[ArticleWithoutImage]: "gcmtitle": category, "gcmtype": "page", # Only articles, not subcategories or files "gcmnamespace": "0", # Main namespace only - "gcmlimit": "max", + "gcmlimit": "50", # Small batches so images fit in one response "prop": "images", - "imlimit": "max", # Need enough to check all pages in batch + "imlimit": "max", "format": "json", } headers = {"User-Agent": WIKIMEDIA_USER_AGENT} articles_without_images: list[ArticleWithoutImage] = [] - continue_token: str | None = None + seen_pageids: set[int] = set() + next_gcmcontinue: str | None = None + + # Build initial continue params from the external pagination token + continue_params: dict[str, str] = {} + if gcmcontinue: + continue_params = {"gcmcontinue": gcmcontinue, "continue": "gcmcontinue||"} while True: request_params = params.copy() - if continue_token: - request_params["gcmcontinue"] = continue_token + request_params.update(continue_params) try: response = requests.get( @@ -418,6 +435,11 @@ def get_articles_without_images(category: str) -> list[ArticleWithoutImage]: pages = data.get("query", {}).get("pages", {}) for page in pages.values(): + pageid = page.get("pageid", 0) + if not pageid or pageid in seen_pageids: + continue + seen_pageids.add(pageid) + images = page.get("images", []) # Skip if page has content images (not just UI icons) @@ -425,20 +447,29 @@ def get_articles_without_images(category: str) -> list[ArticleWithoutImage]: continue title = page.get("title", "") - pageid = page.get("pageid", 0) - - if title and pageid: + if title: articles_without_images.append( ArticleWithoutImage(title=title, pageid=pageid) ) - continue_token = data.get("continue", {}).get("gcmcontinue") - if not continue_token: + api_continue = data.get("continue") + if not api_continue: break + # Only stop at generator boundaries where we have a resumable token + gcmc = api_continue.get("gcmcontinue") + if gcmc and len(articles_without_images) >= limit: + next_gcmcontinue = gcmc + break + + continue_params = api_continue + # Sort by title for consistent display articles_without_images.sort(key=lambda a: a.title) - return articles_without_images + return CategoryResult( + articles=articles_without_images, + gcmcontinue=next_gcmcontinue, + ) def is_valid_flickr_image_url(url: str) -> bool: @@ -807,7 +838,8 @@ def category_search() -> str: ) log_interaction("search_category", query=category) - articles = get_articles_without_images(category) + gcmcontinue = flask.request.args.get("gcmcontinue") or None + result = get_articles_without_images(category, gcmcontinue=gcmcontinue) # Get the display name (without Category: prefix) category_name = category.replace("Category:", "") @@ -817,7 +849,8 @@ def category_search() -> str: cat=cat, category=category, category_name=category_name, - articles=articles, + articles=result.articles, + gcmcontinue=result.gcmcontinue, ) diff --git a/templates/category.html b/templates/category.html index 31319b6..62fcb1a 100644 --- a/templates/category.html +++ b/templates/category.html @@ -44,6 +44,12 @@ {% endfor %} + {% if gcmcontinue %} + + {% endif %} + {% else %}
All articles in this category have images! From 7b741e951f6787ae6e49fc201fe7df6d3009dffc Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sun, 8 Feb 2026 13:42:18 +0000 Subject: [PATCH 8/8] Add Flickr search term override field Allow users to edit the Flickr search query without changing the Wikipedia article. Shows a text field with the current search term (including quotes for phrase search) that can be modified and re-submitted. The search term persists across pagination and photo selection. Co-Authored-By: Claude Opus 4.6 --- main.py | 12 +++++++++--- templates/combined.html | 32 +++++++++++++++++++++++--------- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/main.py b/main.py index fa5bb9a..2a6d238 100755 --- a/main.py +++ b/main.py @@ -491,7 +491,7 @@ def is_valid_flickr_image_url(url: str) -> bool: def search_flickr(search_term: str, page: int = 1) -> SearchResult: """Search Flickr for photos matching the search term.""" - encoded_term = quote(f'"{search_term}"') + encoded_term = quote(search_term) url = f"https://flickr.com/search/?view_all=1&text={encoded_term}&page={page}" response = requests.get(url, headers=BROWSER_HEADERS) @@ -711,20 +711,24 @@ def start() -> str: # Get category param if coming from category search cat = flask.request.args.get("cat") + # Allow overriding the Flickr search term (default includes quotes for phrase search) + flickr_search = flask.request.args.get("flickr_search") or f'"{name}"' + flickr_url = flask.request.args.get("flickr") if not flickr_url: # Search Flickr for photos page = flask.request.args.get("page", 1, type=int) page = max(1, page) # Ensure page is at least 1 if page == 1: - log_interaction("search_article", query=name, wikipedia_url=wikipedia_url) - search_result = search_flickr(name, page) + log_interaction("search_article", query=flickr_search, wikipedia_url=wikipedia_url) + search_result = search_flickr(flickr_search, page) return flask.render_template( "combined.html", name=name, enwp=enwp, search_result=search_result, cat=cat, + flickr_search=flickr_search, ) if "/in/" in flickr_url: @@ -778,6 +782,7 @@ def start() -> str: flickr_user_url=flickr_user_url, cat=cat, previous_messages=previous_messages, + flickr_search=flickr_search, ) log_interaction( @@ -818,6 +823,7 @@ def start() -> str: flickr_user_url=flickr_user_url, cat=cat, previous_messages=previous_messages, + flickr_search=flickr_search, ) diff --git a/templates/combined.html b/templates/combined.html index 55f96e4..394327b 100644 --- a/templates/combined.html +++ b/templates/combined.html @@ -63,13 +63,20 @@

← Back to category

{% endif %}

Wikipedia article: {{ name }}

+
+ + {% if cat %}{% endif %} + + + +

Select a photo to compose a message ({{ search_result.total_photos | default(0) }} results):

{% for photo in search_result.photos %}
- + {{ photo.title }}
@@ -86,7 +93,7 @@
{% endif %}

- ← Back to search results + ← Back to search results