diff --git a/.gitignore b/.gitignore
index 2bcc7a3..abfca28 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,3 @@
__pycache__
commons_contributions/thumbnail_cache.json
commons_contributions/sent_mail_index.json
-flickr_mail.db
diff --git a/AGENTS.md b/AGENTS.md
index 60e1614..6fe3cca 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -14,9 +14,7 @@ licensing.
- **templates/**: Jinja2 templates using Bootstrap 5 for styling
- `base.html`: Base template with Bootstrap CSS/JS
- `combined.html`: Main UI template for search, results, and message composition
- - `message.jinja`: Template for the permission request message body (with
- alternate text for non-free CC licenses)
- - `category.html`: Category search page with visited link styling
+ - `message.jinja`: Template for the permission request message body
- `show_error.html`: Error display template
## Key Components
@@ -50,22 +48,10 @@ Represents a photo with:
### License Codes
-Flickr uses numeric codes for licenses. Codes 1-6 are CC 2.0, codes 11-16 are
-CC 4.0 equivalents.
+Wikipedia-compatible licenses (can be used): 4 (CC BY), 5 (CC BY-SA), 7 (No
+known copyright), 8 (US Government), 9 (CC0), 10 (Public Domain).
-Wikipedia-compatible (`FREE_LICENSES`): 4 (CC BY 2.0), 5 (CC BY-SA 2.0),
-7 (No known copyright), 8 (US Government), 9 (CC0), 10 (Public Domain),
-14 (CC BY 4.0), 15 (CC BY-SA 4.0).
-
-Non-free CC (`NONFREE_CC_LICENSES`): 1 (CC BY-NC-SA 2.0), 2 (CC BY-NC 2.0),
-3 (CC BY-NC-ND 2.0), 6 (CC BY-ND 2.0), 11-13 (4.0 NC variants),
-16 (CC BY-ND 4.0).
-
-Not compatible: 0 (All Rights Reserved).
-
-For free licenses, the message page shows an UploadWizard link instead of a
-message. For non-free CC licenses, a tailored message explains which
-restrictions (NC/ND) prevent Wikipedia use.
+Not compatible: 0 (All Rights Reserved), 1-3 (NC variants), 6 (ND).
### URL Validation (`is_valid_flickr_image_url`)
@@ -108,40 +94,15 @@ Run to find Flickr uploads from UploadWizard contributions that don't have
the Flickr URL in the edit comment. Queries Commons API for image metadata
and checks the Credit field for Flickr URLs.
-### Category Search (`/category` route)
-
-Finds Wikipedia articles in a category that don't have images.
-
-**Key functions**:
-- `parse_category_input()`: Accepts category name, `Category:` prefix, or full
- Wikipedia URL
-- `get_articles_without_images()`: Uses MediaWiki API with
- `generator=categorymembers` and `prop=images` for efficient batch queries
-- `has_content_image()`: Filters out non-content images (UI icons, logos) using
- `NON_CONTENT_IMAGE_PATTERNS`
-
-The `cat` URL parameter is preserved through search results and message pages
-to allow back-navigation to the category.
-
-### Previous Message Detection (`get_previous_messages`)
-
-Checks `sent_mail/messages_index.json` for previous messages to a Flickr user.
-Matches by both display name and username (case-insensitive). Results shown as
-an info alert on the message page.
-
## Request Flow
-1. User enters Wikipedia article title/URL → `start()` extracts article name.
- Alternatively, user searches by category via `/category` route.
-2. `search_flickr()` fetches and parses Flickr search results.
- Disambiguation suffixes like "(academic)" are removed for the search.
-3. Results displayed as clickable photo grid with license badges.
-4. User clicks photo → page reloads with `flickr`, `img`, `license`, and
- `flickr_user` params.
-5. If license is Wikipedia-compatible: show UploadWizard link.
-6. Otherwise: `flickr_usrename_to_nsid()` looks up the user's NSID, previous
- messages are checked, and the appropriate message template is rendered.
-7. User copies message and clicks link to Flickr's mail compose page.
+1. User enters Wikipedia article title/URL → `start()` extracts article name
+2. `search_flickr()` fetches and parses Flickr search results
+3. Results displayed as clickable photo grid with license badges
+4. User clicks photo → page reloads with `flickr` and `img` params
+5. `flickr_usrename_to_nsid()` looks up the photographer's NSID
+6. Message template rendered with photo details
+7. User copies message and clicks link to Flickr's mail compose page
## Testing Changes
@@ -162,8 +123,6 @@ print(result.photos[0].title, result.photos[0].license_name)
## Potential Improvements
- Cache search results to reduce Flickr requests
-- Add filtering by license type in search results
+- Add filtering by license type
- Handle Flickr rate limiting/blocks more gracefully
- Add tests for the parsing logic
-- Add pagination for category search (continue token is already returned)
-- Confirm CC 4.0 license codes 11-15 (only 16 confirmed so far)
diff --git a/README.md b/README.md
index 848e89d..1df6a66 100644
--- a/README.md
+++ b/README.md
@@ -22,23 +22,16 @@ photographers on Flickr whose photos can be used to enhance Wikipedia articles.
- **Integrated Flickr search**: Enter a Wikipedia article title and see Flickr
photos directly in the interface - no need to visit Flickr's search page.
- **Photo grid with metadata**: Search results display as a grid of thumbnails
- showing the user's name and license for each photo.
-- **License handling**: Photos with Wikipedia-compatible licenses (CC BY,
- CC BY-SA, CC0, Public Domain) are highlighted with a green badge and link
- directly to the Commons UploadWizard. Non-free CC licenses (NC/ND) show a
- tailored message explaining Wikipedia's requirements. Supports both CC 2.0
- and CC 4.0 license codes.
+ showing the photographer's name and license for each photo.
+- **License highlighting**: Photos with Wikipedia-compatible licenses (CC BY,
+ CC BY-SA, CC0, Public Domain) are highlighted with a green badge.
- **One-click message composition**: Click any photo to compose a permission
- request message with the photo displayed alongside, showing the user's Flickr
- profile and current license.
-- **Previous message detection**: The message page checks sent mail history and
- warns if you have previously contacted the user.
-- **Category search**: Find Wikipedia articles without images in a given
- category, with links to search Flickr for each article.
+ request message with the photo displayed alongside.
- **Pagination**: Browse through thousands of search results with page navigation.
- **Recent uploads showcase**: The home page displays recent Wikimedia Commons
uploads that were obtained via Flickr mail requests, with links to the
- Wikipedia article and user's Flickr profile.
+ Wikipedia article and photographer's Flickr profile.
+- Generate messages to request permission to use photos on Wikipedia.
- Handle exceptions gracefully and provide detailed error information.
## Usage
@@ -47,14 +40,11 @@ To use the tool, follow these steps:
1. Start the tool by running the script.
2. Access the tool through a web browser.
-3. Enter a Wikipedia article title or URL, or use "Find articles by category"
- to discover articles that need images.
+3. Enter the Wikipedia article title or URL.
4. Browse the Flickr search results displayed in the interface.
-5. Click on a photo to select it. If the license is Wikipedia-compatible, you'll
- be linked to the Commons UploadWizard. Otherwise, a message is composed to
- request a license change.
+5. Click on a photo to select it and compose a permission request message.
6. Copy the subject and message, then click "Send message on Flickr" to contact
- the user.
+ the photographer.
## Error Handling
diff --git a/download_commons_contributions.py b/download_commons_contributions.py
deleted file mode 100755
index 1f8f508..0000000
--- a/download_commons_contributions.py
+++ /dev/null
@@ -1,147 +0,0 @@
-#!/usr/bin/env python3
-"""Download Wikimedia Commons contributions for a user."""
-
-import json
-import time
-
-import requests
-
-from flickr_mail.database import init_db, get_session
-from flickr_mail.models import Contribution
-
-
-API_URL = "https://commons.wikimedia.org/w/api.php"
-USERNAME = "Edward"
-
-# Identify ourselves properly to Wikimedia
-USER_AGENT = "CommonsContributionsDownloader/0.1 (edward@4angle.com)"
-
-SESSION = requests.Session()
-SESSION.headers.update({"User-Agent": USER_AGENT})
-
-
-def fetch_contributions(
- continue_token: str | None = None,
-) -> tuple[list[dict], str | None]:
- """Fetch a batch of contributions from the API."""
- params = {
- "action": "query",
- "list": "usercontribs",
- "ucuser": USERNAME,
- "uclimit": "500",
- "ucprop": "ids|title|timestamp|comment|size|sizediff|flags|tags",
- "format": "json",
- }
-
- if continue_token:
- params["uccontinue"] = continue_token
-
- response = SESSION.get(API_URL, params=params)
- response.raise_for_status()
- data = response.json()
-
- contributions = data.get("query", {}).get("usercontribs", [])
-
- # Get continuation token if more results available
- new_continue = data.get("continue", {}).get("uccontinue")
-
- return contributions, new_continue
-
-
-def upsert_contribution(session, c: dict) -> None:
- """Insert or update a contribution by revid."""
- existing = session.query(Contribution).filter_by(revid=c["revid"]).first()
- if existing:
- return # Already have this revision
-
- session.add(Contribution(
- userid=c.get("userid"),
- user=c.get("user"),
- pageid=c.get("pageid"),
- revid=c.get("revid"),
- parentid=c.get("parentid"),
- ns=c.get("ns"),
- title=c.get("title"),
- timestamp=c.get("timestamp"),
- minor=c.get("minor"),
- top=c.get("top"),
- comment=c.get("comment"),
- size=c.get("size"),
- sizediff=c.get("sizediff"),
- tags=json.dumps(c.get("tags", [])),
- ))
-
-
-def main() -> None:
- """Main entry point."""
- init_db()
- session = get_session()
-
- try:
- existing_count = session.query(Contribution).count()
-
- # Get the latest timestamp to know where to resume from
- latest = (
- session.query(Contribution)
- .order_by(Contribution.timestamp.desc())
- .first()
- )
-
- if existing_count > 0 and latest:
- print(f"Database has {existing_count} contributions")
- print(f"Latest: {latest.timestamp}")
- print("Fetching new contributions...")
- else:
- print(f"Downloading contributions for user: {USERNAME}")
-
- batch_num = 0
- new_count = 0
- continue_token = None
-
- while True:
- batch_num += 1
- print(f" Fetching batch {batch_num}...", end=" ", flush=True)
-
- contributions, continue_token = fetch_contributions(continue_token)
-
- if not contributions:
- print("no results")
- break
-
- batch_new = 0
- for c in contributions:
- # Stop if we've reached contributions we already have
- existing = session.query(Contribution).filter_by(revid=c["revid"]).first()
- if existing:
- continue
- upsert_contribution(session, c)
- batch_new += 1
-
- new_count += batch_new
- print(f"got {len(contributions)}, {batch_new} new")
-
- session.commit()
-
- if batch_new == 0:
- # All contributions in this batch already exist, we're caught up
- print(" Caught up with existing data")
- break
-
- if not continue_token:
- break
-
- # Be polite to the API
- time.sleep(0.5)
-
- total = session.query(Contribution).count()
- print(f"\nDone! {new_count} new contributions, {total} total in database")
-
- except Exception:
- session.rollback()
- raise
- finally:
- session.close()
-
-
-if __name__ == "__main__":
- main()
diff --git a/download_sent_mail.py b/download_sent_mail.py
deleted file mode 100755
index c3ac224..0000000
--- a/download_sent_mail.py
+++ /dev/null
@@ -1,246 +0,0 @@
-#!/usr/bin/env python3
-"""Download sent FlickrMail messages for backup."""
-
-import time
-
-import requests
-from bs4 import BeautifulSoup
-
-from flickr_mail.database import init_db, get_session
-from flickr_mail.models import SentMessage
-from flickr_mail.url_utils import (
- creator_profile_from_flickr_url,
- extract_urls_from_message,
- normalize_flickr_url,
-)
-
-BASE_URL = "https://www.flickr.com"
-SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}"
-MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}"
-
-HEADERS = {
- "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0",
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
- "Accept-Language": "en-GB,en;q=0.9",
- "Accept-Encoding": "gzip, deflate, br, zstd",
- "DNT": "1",
- "Connection": "keep-alive",
- "Upgrade-Insecure-Requests": "1",
- "Sec-Fetch-Dest": "document",
- "Sec-Fetch-Mode": "navigate",
- "Sec-Fetch-Site": "same-origin",
- "Sec-Fetch-User": "?1",
- "Priority": "u=0, i",
-}
-
-COOKIES_STR = """ccc=%7B%22needsConsent%22%3Atrue%2C%22managed%22%3A0%2C%22changed%22%3A0%2C%22info%22%3A%7B%22cookieBlock%22%3A%7B%22level%22%3A2%2C%22blockRan%22%3A1%7D%7D%7D; _sp_ses.df80=*; _sp_id.df80=968931de-089d-4576-b729-6662c2c13a65.1770187027.1.1770187129..adf2374b-b85c-4899-afb7-63c2203d0c44..9422de57-9cdf-49c9-ac54-183eaa1ec457.1770187027101.24; TAsessionID=7f373c97-e9f8-46cb-bc1a-cb4f164ce46b|NEW; notice_behavior=expressed,eu; usprivacy=1---; acstring=3~550.1942.3126.3005.3077.1329.196.1725.1092; euconsent-v2=CQfGXgAQfGXgAAvACDENCQFsAP_gAEPgAAAALktB9G5cSSFBYCJVYbtEYAQDwFhg4oAhAgABEwAATBoAoIwGBGAoIAiAICACAAAAIARAIAEECAAAQAAAIIABAAAMAEAAIAACIAAACAABAgAACEAIAAggWAAAAEBEAFQAgAAAQBIACFAAAgABAUABAAAAAACAAQAAACAgQAAAAAAAAAAAkAhAAAAAAAAAABAMAAABIAAAAAAAAAAAAAAAAAAABAAAAICBAAAAQAAAAAAAAAAAAAAAAAAAAgqY0H0blxJIUFgIFVhu0QgBBPAWADigCEAAAEDAABMGgCgjAIUYCAgSIAgIAAAAAAgBEAgAQAIAABAAAAAgAEAAAwAQAAgAAAAAAAAAAECAAAAQAgACCBYAAAAQEQAVACBAABAEgAIUAAAAAEBQAEAAAAAAIABAAAAICBAAAAAAAAAAACQCEAAAAAAAAAAEAwBAAEgAAAAAAAAAAAAAAAAAAAEABAAgIEAAABAA.YAAAAAAAAAAA.ILktB9G5cSSFBYCJVYbtEYAQTwFhg4oAhAgABEwAATBoAoIwGFGAoIEiAICACAAAAIARAIAEECAAAQAAAIIABAAAMAEAAIAACIAAACAABAgAACEAIAAggWAAAAEBEAFQAgQAAQBIACFAAAgABAUABAAAAAACAAQAAACAgQAAAAAAAAAAAkAhAAAAAAAAAABAMAQABIAAAAAAAAAAAAAAAAAAABAAQAICBAAAAQAAAAAAAAAAAAAAAAAAAAgA; notice_preferences=2:; notice_gdpr_prefs=0,1,2:; cmapi_gtm_bl=; cmapi_cookie_privacy=permit 1,2,3; AMCV_48E815355BFE96970A495CD0%40AdobeOrg=281789898%7CMCMID%7C44859851125632937290373504988866174366%7CMCOPTOUT-1770194232s%7CNONE%7CvVersion%7C4.1.0; AMCVS_48E815355BFE96970A495CD0%40AdobeOrg=1; xb=646693; localization=en-us%3Buk%3Bgb; flrbp=1770187037-cfbf3914859af9ef68992c8389162e65e81c86c4; flrbgrp=1770187037-8e700fa7d73b4f2d43550f40513e7c6f507fd20f; flrbgdrp=1770187037-9af21cc74000b5f3f0943243608b4284d5f60ffd; flrbgmrp=1770187037-53f7bfff110731954be6bdfb2f587d59a8305670; flrbrst=1770187037-440e42fcee9b4e8e81ba8bc3eb3d0fc8b62e7083; flrtags=1770187037-7b50035cb956b9216a2f3372f498f7008d8e26a8; flrbrp=1770187037-c0195dc99caa020d4e32b39556131add862f26a0; flrb=34; session_id=2693fb01-87a0-42b1-a426-74642807b534; cookie_session=834645%3A29f2a9722d8bac88553ea1baf7ea11b4; cookie_accid=834645; cookie_epass=29f2a9722d8bac88553ea1baf7ea11b4; sa=1775371036%3A79962317%40N00%3A8fb60f4760b4840f37af3ebc90a8cb57; vp=2075%2C1177%2C1%2C0; flrbfd=1770187037-88a4e436729c9c5551794483fbd9c80e9dac2354; flrbpap=1770187037-18adaacf3a389df4a7bdc05cd471e492c54ef841; liqpw=2075; liqph=672"""
-
-
-def parse_cookies(cookie_str: str) -> dict[str, str]:
- """Parse cookie string into dictionary."""
- cookies = {}
- for item in cookie_str.split("; "):
- if "=" in item:
- key, value = item.split("=", 1)
- cookies[key] = value
- return cookies
-
-
-def create_session() -> requests.Session:
- """Create a requests session with authentication."""
- session = requests.Session()
- session.headers.update(HEADERS)
- session.cookies.update(parse_cookies(COOKIES_STR))
- return session
-
-
-def fetch_page(session: requests.Session, url: str) -> BeautifulSoup:
- """Fetch a page and return parsed HTML."""
- response = session.get(url)
- response.raise_for_status()
- return BeautifulSoup(response.text, "html.parser")
-
-
-def extract_messages_from_list_page(soup: BeautifulSoup) -> list[dict]:
- """Extract message metadata from a sent mail list page."""
- messages = []
-
- # Find all message rows:
- mail_rows = soup.select("tr.message_row")
-
- for row in mail_rows:
- msg = {}
-
- # Get message ID from the row id attribute
- row_id = row.get("id", "")
- if row_id.startswith("message_row_"):
- msg["message_id"] = row_id.replace("message_row_", "")
-
- # Find message link in the subject cell
- subj_cell = row.select_one("td.subj")
- if subj_cell:
- link = subj_cell.find("a")
- if link:
- msg["subject"] = link.get_text(strip=True)
- msg["url"] = BASE_URL + link["href"]
-
- # Recipient is in td.fromto
- fromto_cell = row.select_one("td.fromto")
- if fromto_cell:
- msg["recipient"] = fromto_cell.get_text(strip=True)
-
- # Date is in td.date
- date_cell = row.select_one("td.date")
- if date_cell:
- msg["date"] = date_cell.get_text(strip=True)
-
- if "message_id" in msg:
- messages.append(msg)
-
- return messages
-
-
-def extract_message_content(soup: BeautifulSoup) -> dict:
- """Extract full message content from a message page."""
- content = {}
-
- # Find the ThinCase div containing the message
- thin_case = soup.select_one(".ThinCase")
- if not thin_case:
- return content
-
- # Find the table with message content
- table = thin_case.find("table")
- if not table:
- return content
-
- rows = table.find_all("tr", recursive=False)
-
- # Row 0: To:
- # Row 1: Subject:
- # Row 2:
- for row in rows:
- cells = row.find_all("td", recursive=False)
- if len(cells) >= 2:
- header_cell = cells[0]
- value_cell = cells[1]
-
- header = header_cell.get_text(strip=True).lower()
-
- if header == "to:":
- # Get recipient username
- username = value_cell.select_one(".username")
- if username:
- content["recipient"] = username.get_text(strip=True)
-
- elif header == "subject:":
- # Get subject from h3
- h3 = value_cell.find("h3")
- if h3:
- content["subject"] = h3.get_text(strip=True)
-
- elif header == "":
- # This is the message body row (empty header cell)
- # Get the content but exclude the delete form
- form = value_cell.find("form")
- if form:
- form.decompose()
-
- content["body"] = value_cell.get_text(separator="\n", strip=True)
- content["body_html"] = str(value_cell)
- break # Body found, stop processing
-
- return content
-
-
-def main() -> None:
- """Main entry point."""
- init_db()
- db_session = get_session()
-
- try:
- existing_ids = {
- r[0] for r in db_session.query(SentMessage.message_id).all()
- }
- print(f"Database has {len(existing_ids)} messages")
-
- http_session = create_session()
-
- # Scrape all pages to find new messages
- total_pages = 29
- new_messages: list[dict] = []
-
- print("Fetching message list from all pages...")
- for page in range(1, total_pages + 1):
- url = SENT_MAIL_URL.format(page=page)
- print(f" Fetching page {page}/{total_pages}...")
-
- try:
- soup = fetch_page(http_session, url)
- page_messages = extract_messages_from_list_page(soup)
-
- for msg in page_messages:
- if msg["message_id"] not in existing_ids:
- new_messages.append(msg)
-
- time.sleep(1) # Be polite to the server
-
- except Exception as e:
- print(f" Error fetching page {page}: {e}")
- continue
-
- print(f"Found {len(new_messages)} new messages to download")
-
- # Download individual messages
- for i, msg in enumerate(new_messages, 1):
- msg_id = msg["message_id"]
- url = msg.get("url") or MESSAGE_URL.format(message_id=msg_id)
-
- print(f" [{i}/{len(new_messages)}] Downloading message {msg_id}...")
-
- try:
- soup = fetch_page(http_session, url)
- content = extract_message_content(soup)
-
- # Merge with metadata
- full_msg = {**msg, **content}
-
- body = full_msg.get("body", "")
- flickr_url, wikipedia_url = extract_urls_from_message(body)
- normalized = normalize_flickr_url(flickr_url) if flickr_url else ""
- creator_profile = creator_profile_from_flickr_url(flickr_url) if flickr_url else ""
-
- db_session.add(SentMessage(
- message_id=msg_id,
- subject=full_msg.get("subject", ""),
- url=full_msg.get("url", ""),
- recipient=full_msg.get("recipient", ""),
- date=full_msg.get("date", ""),
- body=body,
- body_html=full_msg.get("body_html", ""),
- flickr_url=flickr_url,
- normalized_flickr_url=normalized,
- wikipedia_url=wikipedia_url,
- creator_profile_url=creator_profile,
- ))
- db_session.commit()
-
- time.sleep(1) # Be polite
-
- except Exception as e:
- db_session.rollback()
- print(f" Error downloading message {msg_id}: {e}")
- continue
-
- total = db_session.query(SentMessage).count()
- print(f"Done! {total} messages in database")
-
- except Exception:
- db_session.rollback()
- raise
- finally:
- db_session.close()
-
-
-if __name__ == "__main__":
- main()
diff --git a/extract_flickr_uploads.py b/extract_flickr_uploads.py
deleted file mode 100644
index 25f2fec..0000000
--- a/extract_flickr_uploads.py
+++ /dev/null
@@ -1,158 +0,0 @@
-#!/usr/bin/env python3
-"""
-Extract Flickr uploads from Wikimedia Commons contributions.
-
-Filters contributions where the comment contains a flickr.com URL and extracts:
-- pageid, revid, title, timestamp
-- flickr_url: the Flickr photo URL
-- creator: the photographer/author name
-
-Links uploads to sent messages via normalized Flickr URL matching.
-"""
-
-import re
-
-from flickr_mail.database import init_db, get_session
-from flickr_mail.models import Contribution, FlickrUpload, SentMessage
-from flickr_mail.url_utils import normalize_flickr_url
-
-
-def extract_flickr_url(comment: str) -> str | None:
- """Extract the Flickr photo URL from a comment."""
- # Match URLs like https://www.flickr.com/photos/user/12345/ or http://www.flickr.com/photos/user/12345/
- # Also handles [http://www.flickr.com/photos/user/12345/ title] wiki markup
- patterns = [
- # Plain URL (modern format)
- r'(https?://(?:www\.)?flickr\.com/photos/[^/\s\]]+/\d+)/?',
- # URL in wiki markup [url title]
- r'\[(https?://(?:www\.)?flickr\.com/photos/[^/\s\]]+/\d+)/?[^\]]*\]',
- ]
-
- for pattern in patterns:
- match = re.search(pattern, comment)
- if match:
- return match.group(1)
-
- return None
-
-
-def extract_creator(comment: str) -> str | None:
- """Extract the creator/author name from a comment."""
- # Modern format: "Uploaded a work by {creator} from https://..."
- modern_match = re.search(r'Uploaded a work by (.+?) from https?://', comment)
- if modern_match:
- return modern_match.group(1).strip()
-
- # Old {{Information}} format: |Author=[http://www.flickr.com/people/... AuthorName] or |Author=[http://... AuthorName] from Location
- # The author name comes after the URL, before ] or "from"
- author_match = re.search(r'\|Author=\[https?://[^\s\]]+ ([^\]]+)\]', comment)
- if author_match:
- author = author_match.group(1).strip()
- # Remove trailing location like "from Toronto, Canada"
- author = re.sub(r'\s+from\s+.+$', '', author)
- return author
-
- # Handle truncated comments where Author field is cut off
- # Pattern: |Author=[http://...flickr.com/people/... AuthorName (may be incomplete)
- truncated_match = re.search(r'\|Author=\[https?://[^\s\]]+ ([^\]\|]+)$', comment)
- if truncated_match:
- author = truncated_match.group(1).strip()
- if author:
- return author
-
- # Sometimes Author field is just plain text without URL
- author_plain = re.search(r'\|Author=([^\|\}\[\]]+?)(?:\r?\n|\|)', comment)
- if author_plain:
- author = author_plain.group(1).strip()
- # Skip if it looks like a wiki user link
- if not author.startswith('[[User:') and author:
- return author
-
- return None
-
-
-def main() -> None:
- """Process contributions and extract Flickr uploads."""
- init_db()
- session = get_session()
-
- try:
- # Get existing upload revids to avoid duplicates
- existing_revids = {
- r[0] for r in session.query(FlickrUpload.revid).all()
- }
-
- # Build sent message index: normalized_flickr_url -> message
- sent_messages = (
- session.query(SentMessage)
- .filter(SentMessage.normalized_flickr_url != "")
- .filter(~SentMessage.subject.startswith("Re:"))
- .all()
- )
- url_to_message = {msg.normalized_flickr_url: msg for msg in sent_messages}
- print(f"Sent message index: {len(url_to_message)} entries")
-
- # Query contributions with flickr.com in comment
- contributions = (
- session.query(Contribution)
- .filter(Contribution.comment.ilike("%flickr.com%"))
- .all()
- )
-
- print(f"Found {len(contributions)} contributions mentioning flickr.com")
-
- new_count = 0
- for contrib in contributions:
- if contrib.revid in existing_revids:
- continue
-
- flickr_url = extract_flickr_url(contrib.comment or "")
- if not flickr_url:
- continue
-
- creator = extract_creator(contrib.comment or "")
- normalized = normalize_flickr_url(flickr_url)
-
- # Look up sent message for FK linking
- msg = url_to_message.get(normalized) if normalized else None
-
- session.add(FlickrUpload(
- pageid=contrib.pageid,
- revid=contrib.revid,
- title=contrib.title,
- timestamp=contrib.timestamp,
- flickr_url=flickr_url,
- normalized_flickr_url=normalized,
- creator=creator,
- wikipedia_url=msg.wikipedia_url if msg else "",
- creator_profile_url=msg.creator_profile_url if msg else "",
- sent_message_id=msg.message_id if msg else None,
- ))
- new_count += 1
-
- session.commit()
-
- total = session.query(FlickrUpload).count()
- linked = session.query(FlickrUpload).filter(
- FlickrUpload.sent_message_id.isnot(None)
- ).count()
-
- print(f"Extracted {new_count} new Flickr uploads")
- print(f"Total: {total} uploads, {linked} linked to sent messages")
-
- # Show some stats
- with_creator = session.query(FlickrUpload).filter(
- FlickrUpload.creator.isnot(None)
- ).count()
- print(f" - {with_creator} with creator identified")
- print(f" - {total - with_creator} without creator")
-
- except Exception:
- session.rollback()
- raise
- finally:
- session.close()
-
-
-if __name__ == '__main__':
- main()
diff --git a/flickr_mail/__init__.py b/flickr_mail/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/flickr_mail/database.py b/flickr_mail/database.py
deleted file mode 100644
index 0015669..0000000
--- a/flickr_mail/database.py
+++ /dev/null
@@ -1,31 +0,0 @@
-"""Database engine and session factory for flickr-mail."""
-
-from pathlib import Path
-
-from sqlalchemy import create_engine, event
-from sqlalchemy.orm import Session, sessionmaker
-
-from flickr_mail.models import Base
-
-DB_PATH = Path(__file__).parent.parent / "flickr_mail.db"
-
-engine = create_engine(f"sqlite:///{DB_PATH}")
-SessionLocal = sessionmaker(bind=engine)
-
-
-@event.listens_for(engine, "connect")
-def set_sqlite_pragma(dbapi_connection, connection_record):
- """Enable WAL mode for concurrent read/write access."""
- cursor = dbapi_connection.cursor()
- cursor.execute("PRAGMA journal_mode=WAL")
- cursor.close()
-
-
-def init_db() -> None:
- """Create all tables."""
- Base.metadata.create_all(engine)
-
-
-def get_session() -> Session:
- """Create a new database session."""
- return SessionLocal()
diff --git a/flickr_mail/models.py b/flickr_mail/models.py
deleted file mode 100644
index 090b37a..0000000
--- a/flickr_mail/models.py
+++ /dev/null
@@ -1,93 +0,0 @@
-"""SQLAlchemy models for flickr-mail."""
-
-from sqlalchemy import ForeignKey, Index, Text
-from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
-
-
-class Base(DeclarativeBase):
- pass
-
-
-class Contribution(Base):
- __tablename__ = "contributions"
-
- id: Mapped[int] = mapped_column(primary_key=True)
- userid: Mapped[int | None]
- user: Mapped[str | None]
- pageid: Mapped[int | None]
- revid: Mapped[int | None] = mapped_column(unique=True)
- parentid: Mapped[int | None]
- ns: Mapped[int | None]
- title: Mapped[str | None]
- timestamp: Mapped[str | None]
- minor: Mapped[str | None]
- top: Mapped[str | None]
- comment: Mapped[str | None] = mapped_column(Text)
- size: Mapped[int | None]
- sizediff: Mapped[int | None]
- tags: Mapped[str | None] = mapped_column(Text) # JSON array stored as text
-
- __table_args__ = (
- Index("ix_contributions_timestamp", "timestamp"),
- Index("ix_contributions_pageid", "pageid"),
- )
-
-
-class SentMessage(Base):
- __tablename__ = "sent_messages"
-
- message_id: Mapped[str] = mapped_column(primary_key=True)
- subject: Mapped[str | None]
- url: Mapped[str | None]
- recipient: Mapped[str | None]
- date: Mapped[str | None]
- body: Mapped[str | None] = mapped_column(Text)
- body_html: Mapped[str | None] = mapped_column(Text)
- flickr_url: Mapped[str | None]
- normalized_flickr_url: Mapped[str | None]
- wikipedia_url: Mapped[str | None]
- creator_profile_url: Mapped[str | None]
-
- flickr_uploads: Mapped[list["FlickrUpload"]] = relationship(
- back_populates="sent_message"
- )
-
- __table_args__ = (
- Index("ix_sent_messages_recipient", "recipient"),
- Index("ix_sent_messages_normalized_flickr_url", "normalized_flickr_url"),
- )
-
-
-class FlickrUpload(Base):
- __tablename__ = "flickr_uploads"
-
- id: Mapped[int] = mapped_column(primary_key=True)
- pageid: Mapped[int | None]
- revid: Mapped[int | None]
- title: Mapped[str | None]
- timestamp: Mapped[str | None]
- flickr_url: Mapped[str | None]
- normalized_flickr_url: Mapped[str | None]
- creator: Mapped[str | None]
- wikipedia_url: Mapped[str | None]
- creator_profile_url: Mapped[str | None]
- sent_message_id: Mapped[str | None] = mapped_column(
- ForeignKey("sent_messages.message_id")
- )
-
- sent_message: Mapped[SentMessage | None] = relationship(
- back_populates="flickr_uploads"
- )
-
- __table_args__ = (
- Index("ix_flickr_uploads_normalized_flickr_url", "normalized_flickr_url"),
- Index("ix_flickr_uploads_timestamp", "timestamp"),
- )
-
-
-class ThumbnailCache(Base):
- __tablename__ = "thumbnail_cache"
-
- title: Mapped[str] = mapped_column(primary_key=True)
- thumb_url: Mapped[str | None]
- fetched_at: Mapped[int | None] # Unix timestamp
diff --git a/flickr_mail/url_utils.py b/flickr_mail/url_utils.py
deleted file mode 100644
index 53e0979..0000000
--- a/flickr_mail/url_utils.py
+++ /dev/null
@@ -1,52 +0,0 @@
-"""Shared URL utility functions for flickr-mail."""
-
-import re
-
-
-def normalize_flickr_url(url: str) -> str:
- """Normalize a Flickr photo URL for comparison."""
- # Remove protocol
- url = url.replace("https://", "").replace("http://", "")
- # Remove www.
- url = url.replace("www.", "")
- # Remove trailing slash
- url = url.rstrip("/")
- # Ensure it starts with flickr.com
- if not url.startswith("flickr.com"):
- return ""
- return url
-
-
-def extract_urls_from_message(body: str) -> tuple[str, str]:
- """Extract flickr URL and Wikipedia URL from message body."""
-
- flickr_url = ""
- wikipedia_url = ""
-
- # Find flickr photo URLs
- flickr_pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/[^/\s]+/\d+"
- flickr_matches = re.findall(flickr_pattern, body)
- if flickr_matches:
- flickr_url = flickr_matches[0]
- if not flickr_url.startswith("http"):
- flickr_url = "https://" + flickr_url
-
- # Find Wikipedia URLs
- wiki_pattern = r"(?:https?://)?(?:www\.)?en\.wikipedia\.org/wiki/[^\s<\])]+"
- wiki_matches = re.findall(wiki_pattern, body)
- if wiki_matches:
- wikipedia_url = wiki_matches[0]
- if not wikipedia_url.startswith("http"):
- wikipedia_url = "https://" + wikipedia_url
-
- return flickr_url, wikipedia_url
-
-
-def creator_profile_from_flickr_url(flickr_url: str) -> str:
- """Extract creator profile URL from a flickr photo URL."""
- parts = flickr_url.split("/")
- for i, part in enumerate(parts):
- if part == "photos" and i + 1 < len(parts):
- username = parts[i + 1]
- return f"https://www.flickr.com/photos/{username}"
- return ""
diff --git a/main.py b/main.py
index 25279ff..5abe5f6 100755
--- a/main.py
+++ b/main.py
@@ -9,17 +9,14 @@ import sys
import time
import traceback
import typing
+from pathlib import Path
from urllib.parse import quote, unquote
import flask
import requests
import werkzeug
-from sqlalchemy import func
from werkzeug.debug.tbtools import DebugTraceback
-from flickr_mail.database import get_session
-from flickr_mail.models import FlickrUpload, SentMessage, ThumbnailCache
-from flickr_mail.url_utils import extract_urls_from_message, normalize_flickr_url
import re
@@ -29,6 +26,18 @@ app.debug = False
enwiki = "en.wikipedia.org/wiki/"
+# Path to Commons contributions data and sent mail
+COMMONS_UPLOADS_FILE = (
+ Path(__file__).parent / "commons_contributions" / "flickr_uploads.json"
+)
+COMMONS_CACHE_FILE = (
+ Path(__file__).parent / "commons_contributions" / "thumbnail_cache.json"
+)
+SENT_MAIL_DIR = Path(__file__).parent / "sent_mail" / "messages"
+SENT_MAIL_INDEX_FILE = Path(__file__).parent / "sent_mail" / "messages_index.json"
+SENT_MAIL_INDEX_CACHE = (
+ Path(__file__).parent / "commons_contributions" / "sent_mail_index.json"
+)
COMMONS_CACHE_MAX_AGE = 86400 * 7 # Cache for 7 days
RECENT_UPLOADS_COUNT = 24
@@ -156,6 +165,132 @@ class CommonsUpload:
return "Wikidata item" if self.is_wikidata_item else "Wikipedia article"
+def normalize_flickr_url(url: str) -> str:
+ """Normalize a Flickr photo URL for comparison."""
+ # Remove protocol
+ url = url.replace("https://", "").replace("http://", "")
+ # Remove www.
+ url = url.replace("www.", "")
+ # Remove trailing slash
+ url = url.rstrip("/")
+ # Ensure it starts with flickr.com
+ if not url.startswith("flickr.com"):
+ return ""
+ return url
+
+
+def extract_urls_from_message(body: str) -> tuple[str, str]:
+ """Extract flickr URL and Wikipedia URL from message body."""
+
+ flickr_url = ""
+ wikipedia_url = ""
+
+ # Find flickr photo URLs
+ flickr_pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/[^/\s]+/\d+"
+ flickr_matches = re.findall(flickr_pattern, body)
+ if flickr_matches:
+ flickr_url = flickr_matches[0]
+ if not flickr_url.startswith("http"):
+ flickr_url = "https://" + flickr_url
+
+ # Find Wikipedia URLs
+ wiki_pattern = r"(?:https?://)?(?:www\.)?en\.wikipedia\.org/wiki/[^\s<\])]+"
+ wiki_matches = re.findall(wiki_pattern, body)
+ if wiki_matches:
+ wikipedia_url = wiki_matches[0]
+ if not wikipedia_url.startswith("http"):
+ wikipedia_url = "https://" + wikipedia_url
+
+ return flickr_url, wikipedia_url
+
+
+def build_sent_mail_index() -> dict[str, dict[str, str]]:
+ """Build an index of sent mail: normalized_flickr_url -> {wikipedia_url, recipient}."""
+ if not SENT_MAIL_DIR.exists():
+ return {}
+
+ # Check if we have a cached index
+ if SENT_MAIL_INDEX_CACHE.exists():
+ try:
+ with open(SENT_MAIL_INDEX_CACHE) as f:
+ cache = json.load(f)
+ # Check if cache is still valid (compare file count)
+ json_files = list(SENT_MAIL_DIR.glob("*.json"))
+ if cache.get("file_count") == len(json_files):
+ return cache.get("index", {})
+ except (json.JSONDecodeError, OSError):
+ pass
+
+ index: dict[str, dict[str, str]] = {}
+ json_files = list(SENT_MAIL_DIR.glob("*.json"))
+
+ for json_file in json_files:
+ try:
+ with open(json_file) as f:
+ message = json.load(f)
+ except (json.JSONDecodeError, OSError):
+ continue
+
+ # Skip replies - we want original requests
+ subject = message.get("subject", "")
+ if subject.startswith("Re:"):
+ continue
+
+ body = message.get("body", "")
+ flickr_url, wikipedia_url = extract_urls_from_message(body)
+
+ if not flickr_url:
+ continue
+
+ normalized = normalize_flickr_url(flickr_url)
+ if not normalized:
+ continue
+
+ # Extract creator profile URL from flickr URL
+ # flickr.com/photos/username/12345 -> flickr.com/photos/username
+ parts = flickr_url.split("/")
+ creator_profile = ""
+ for i, part in enumerate(parts):
+ if part == "photos" and i + 1 < len(parts):
+ username = parts[i + 1]
+ creator_profile = f"https://www.flickr.com/photos/{username}"
+ break
+
+ index[normalized] = {
+ "wikipedia_url": wikipedia_url,
+ "creator_profile_url": creator_profile,
+ "recipient": message.get("recipient", ""),
+ }
+
+ # Cache the index
+ try:
+ with open(SENT_MAIL_INDEX_CACHE, "w") as f:
+ json.dump({"file_count": len(json_files), "index": index}, f)
+ except OSError:
+ pass
+
+ return index
+
+
+def load_commons_thumbnail_cache() -> dict[str, typing.Any]:
+ """Load the thumbnail cache from disk."""
+ if not COMMONS_CACHE_FILE.exists():
+ return {"timestamp": 0, "thumbnails": {}}
+ try:
+ with open(COMMONS_CACHE_FILE) as f:
+ return typing.cast(dict[str, typing.Any], json.load(f))
+ except (json.JSONDecodeError, OSError):
+ return {"timestamp": 0, "thumbnails": {}}
+
+
+def save_commons_thumbnail_cache(cache: dict[str, typing.Any]) -> None:
+ """Save the thumbnail cache to disk."""
+ try:
+ with open(COMMONS_CACHE_FILE, "w") as f:
+ json.dump(cache, f)
+ except OSError:
+ pass # Ignore cache write errors
+
def fetch_commons_thumbnails(titles: list[str]) -> dict[str, str]:
"""Fetch thumbnail URLs from Commons API for the given file titles."""
@@ -205,72 +340,79 @@ def get_recent_commons_uploads() -> tuple[list[CommonsUpload], int]:
Returns a tuple of (uploads_list, total_count) where total_count is the total number
of uploads obtained via Flickr mail (not just the ones returned).
"""
- session = get_session()
+ if not COMMONS_UPLOADS_FILE.exists():
+ return [], 0
+
try:
- query = (
- session.query(FlickrUpload, SentMessage)
- .join(SentMessage)
- .order_by(FlickrUpload.timestamp.desc())
+ with open(COMMONS_UPLOADS_FILE) as f:
+ all_uploads = json.load(f)
+ except (json.JSONDecodeError, OSError):
+ return [], 0
+
+ # Build sent mail index
+ sent_mail_index = build_sent_mail_index()
+
+ # Filter uploads to only those with matching sent mail
+ # Count all matches, but only keep RECENT_UPLOADS_COUNT for display
+ uploads_with_mail: list[dict[str, typing.Any]] = []
+ total_matched = 0
+ for upload in all_uploads:
+ flickr_url = upload.get("flickr_url", "")
+ normalized = normalize_flickr_url(flickr_url)
+ if normalized and normalized in sent_mail_index:
+ total_matched += 1
+ if len(uploads_with_mail) < RECENT_UPLOADS_COUNT:
+ upload["_mail_info"] = sent_mail_index[normalized]
+ uploads_with_mail.append(upload)
+
+ if not uploads_with_mail:
+ return [], 0
+
+ # Load cache and check if it's still valid
+ cache = load_commons_thumbnail_cache()
+ cache_age = time.time() - cache.get("timestamp", 0)
+ cached_thumbs = cache.get("thumbnails", {})
+
+ # Find which titles need fetching
+ titles = [u["title"] for u in uploads_with_mail]
+ titles_to_fetch = [t for t in titles if t not in cached_thumbs]
+
+ # Fetch missing thumbnails or refresh if cache is old
+ if titles_to_fetch or cache_age > COMMONS_CACHE_MAX_AGE:
+ new_thumbs = fetch_commons_thumbnails(
+ titles if cache_age > COMMONS_CACHE_MAX_AGE else titles_to_fetch
)
- total_matched = query.count()
- if total_matched == 0:
- return [], 0
+ cached_thumbs.update(new_thumbs)
+ cache = {"timestamp": time.time(), "thumbnails": cached_thumbs}
+ save_commons_thumbnail_cache(cache)
- recent = query.limit(RECENT_UPLOADS_COUNT).all()
+ # Build the result list
+ result: list[CommonsUpload] = []
+ for upload in uploads_with_mail:
+ title = upload["title"]
+ thumb_url = cached_thumbs.get(title, "")
+ if not thumb_url:
+ continue
- # Get thumbnails from cache
- titles = [upload.title for upload, msg in recent]
- now = int(time.time())
- cached = {
- tc.title: tc
- for tc in session.query(ThumbnailCache)
- .filter(ThumbnailCache.title.in_(titles))
- .all()
- }
+ mail_info = upload.get("_mail_info", {})
- # Find titles needing fetch (missing or expired)
- titles_to_fetch = [
- t for t in titles
- if t not in cached or (now - (cached[t].fetched_at or 0)) > COMMONS_CACHE_MAX_AGE
- ]
+ # Convert title to Commons URL
+ commons_url = f"https://commons.wikimedia.org/wiki/{title.replace(' ', '_')}"
- if titles_to_fetch:
- new_thumbs = fetch_commons_thumbnails(titles_to_fetch)
- for title, thumb_url in new_thumbs.items():
- existing = cached.get(title)
- if existing:
- existing.thumb_url = thumb_url
- existing.fetched_at = now
- else:
- tc = ThumbnailCache(title=title, thumb_url=thumb_url, fetched_at=now)
- session.add(tc)
- cached[title] = tc
- session.commit()
-
- result: list[CommonsUpload] = []
- for upload, msg in recent:
- thumb_url = cached[upload.title].thumb_url if upload.title in cached else ""
- if not thumb_url:
- continue
-
- commons_url = f"https://commons.wikimedia.org/wiki/{upload.title.replace(' ', '_')}"
-
- result.append(
- CommonsUpload(
- title=upload.title.replace("File:", "").rsplit(".", 1)[0],
- thumb_url=thumb_url,
- commons_url=commons_url,
- flickr_url=upload.flickr_url or "",
- creator=upload.creator or "Unknown",
- timestamp=(upload.timestamp or "")[:10],
- wikipedia_url=upload.wikipedia_url or "",
- creator_profile_url=upload.creator_profile_url or "",
- )
+ result.append(
+ CommonsUpload(
+ title=title.replace("File:", "").rsplit(".", 1)[0],
+ thumb_url=thumb_url,
+ commons_url=commons_url,
+ flickr_url=upload.get("flickr_url", ""),
+ creator=upload.get("creator") or "Unknown",
+ timestamp=upload.get("timestamp", "")[:10],
+ wikipedia_url=mail_info.get("wikipedia_url", ""),
+ creator_profile_url=mail_info.get("creator_profile_url", ""),
)
+ )
- return result, total_matched
- finally:
- session.close()
+ return result, total_matched
def get_previous_messages(flickr_user: str, flickr_username: str) -> list[dict]:
@@ -279,33 +421,26 @@ def get_previous_messages(flickr_user: str, flickr_username: str) -> list[dict]:
Checks both the display name (flickr_user) and username (flickr_username)
against the recipient field in the messages index.
"""
- names = set()
- if flickr_user:
- names.add(flickr_user.lower())
- if flickr_username:
- names.add(flickr_username.lower())
- if not names:
+ if not SENT_MAIL_INDEX_FILE.exists():
return []
- session = get_session()
try:
- messages = (
- session.query(SentMessage)
- .filter(func.lower(SentMessage.recipient).in_(names))
- .all()
- )
- return [
- {
- "message_id": m.message_id,
- "subject": m.subject,
- "url": m.url,
- "recipient": m.recipient,
- "date": m.date,
- }
- for m in messages
- ]
- finally:
- session.close()
+ with open(SENT_MAIL_INDEX_FILE) as f:
+ messages = json.load(f)
+ except (json.JSONDecodeError, OSError):
+ return []
+
+ # Normalize for case-insensitive comparison
+ flickr_user_lower = flickr_user.lower() if flickr_user else ""
+ flickr_username_lower = flickr_username.lower() if flickr_username else ""
+
+ matches = []
+ for msg in messages:
+ recipient = msg.get("recipient", "").lower()
+ if recipient and (recipient == flickr_user_lower or recipient == flickr_username_lower):
+ matches.append(msg)
+
+ return matches
def parse_category_input(category_input: str) -> str | None:
diff --git a/migrate_json_to_db.py b/migrate_json_to_db.py
deleted file mode 100644
index a2d2982..0000000
--- a/migrate_json_to_db.py
+++ /dev/null
@@ -1,233 +0,0 @@
-#!/usr/bin/env python3
-"""One-time migration from JSON files to SQLite database."""
-
-import json
-import time
-from pathlib import Path
-
-from flickr_mail.database import init_db, get_session
-from flickr_mail.models import Contribution, FlickrUpload, SentMessage, ThumbnailCache
-from flickr_mail.url_utils import (
- creator_profile_from_flickr_url,
- extract_urls_from_message,
- normalize_flickr_url,
-)
-
-COMMONS_DIR = Path(__file__).parent / "commons_contributions"
-SENT_MAIL_DIR = Path(__file__).parent / "sent_mail" / "messages"
-SENT_MAIL_INDEX = Path(__file__).parent / "sent_mail" / "messages_index.json"
-CONTRIBUTIONS_FILE = COMMONS_DIR / "contributions.json"
-FLICKR_UPLOADS_FILE = COMMONS_DIR / "flickr_uploads.json"
-THUMBNAIL_CACHE_FILE = COMMONS_DIR / "thumbnail_cache.json"
-
-
-def migrate_contributions(session) -> int:
- """Migrate contributions.json to contributions table."""
- if not CONTRIBUTIONS_FILE.exists():
- print("No contributions.json found, skipping")
- return 0
-
- with open(CONTRIBUTIONS_FILE) as f:
- data = json.load(f)
-
- contributions = data.get("contributions", [])
- print(f"Migrating {len(contributions)} contributions...")
-
- for c in contributions:
- session.add(Contribution(
- userid=c.get("userid"),
- user=c.get("user"),
- pageid=c.get("pageid"),
- revid=c.get("revid"),
- parentid=c.get("parentid"),
- ns=c.get("ns"),
- title=c.get("title"),
- timestamp=c.get("timestamp"),
- minor=c.get("minor"),
- top=c.get("top"),
- comment=c.get("comment"),
- size=c.get("size"),
- sizediff=c.get("sizediff"),
- tags=json.dumps(c.get("tags", [])),
- ))
-
- session.flush()
- count = session.query(Contribution).count()
- print(f" -> {count} contributions migrated")
- return count
-
-
-def migrate_sent_messages(session) -> dict[str, str]:
- """Migrate sent messages to sent_messages table.
-
- Returns a dict of normalized_flickr_url -> message_id for FK linking.
- """
- if not SENT_MAIL_INDEX.exists():
- print("No messages_index.json found, skipping")
- return {}
-
- with open(SENT_MAIL_INDEX) as f:
- index = json.load(f)
-
- print(f"Migrating {len(index)} sent messages...")
-
- url_to_message_id: dict[str, str] = {}
- count = 0
-
- for msg_meta in index:
- msg_id = msg_meta.get("message_id", "")
- if not msg_id:
- continue
-
- # Load the full message from individual file
- msg_file = SENT_MAIL_DIR / f"{msg_id}.json"
- if msg_file.exists():
- with open(msg_file) as f:
- msg = json.load(f)
- else:
- msg = msg_meta
-
- body = msg.get("body", "")
- subject = msg.get("subject", "")
-
- # Extract URLs from body
- flickr_url, wikipedia_url = extract_urls_from_message(body)
- normalized = normalize_flickr_url(flickr_url) if flickr_url else ""
-
- # Extract creator profile URL
- creator_profile_url = creator_profile_from_flickr_url(flickr_url) if flickr_url else ""
-
- session.add(SentMessage(
- message_id=msg_id,
- subject=msg.get("subject", ""),
- url=msg.get("url", ""),
- recipient=msg.get("recipient", ""),
- date=msg.get("date", ""),
- body=body,
- body_html=msg.get("body_html", ""),
- flickr_url=flickr_url,
- normalized_flickr_url=normalized,
- wikipedia_url=wikipedia_url,
- creator_profile_url=creator_profile_url,
- ))
-
- # Build URL -> message_id map for FK linking (skip replies)
- if normalized and not subject.startswith("Re:"):
- url_to_message_id[normalized] = msg_id
-
- count += 1
-
- session.flush()
- actual = session.query(SentMessage).count()
- print(f" -> {actual} sent messages migrated")
- print(f" -> {len(url_to_message_id)} unique flickr URLs indexed for FK linking")
- return url_to_message_id
-
-
-def migrate_flickr_uploads(session, url_to_message_id: dict[str, str]) -> int:
- """Migrate flickr_uploads.json to flickr_uploads table with FK linking."""
- if not FLICKR_UPLOADS_FILE.exists():
- print("No flickr_uploads.json found, skipping")
- return 0
-
- with open(FLICKR_UPLOADS_FILE) as f:
- uploads = json.load(f)
-
- print(f"Migrating {len(uploads)} flickr uploads...")
-
- linked = 0
- for u in uploads:
- flickr_url = u.get("flickr_url", "")
- normalized = normalize_flickr_url(flickr_url)
-
- # Look up sent message FK
- sent_message_id = url_to_message_id.get(normalized) if normalized else None
- if sent_message_id:
- linked += 1
-
- # Get wikipedia_url and creator_profile_url from the linked message
- wikipedia_url = ""
- creator_profile_url = ""
- if sent_message_id:
- msg = session.get(SentMessage, sent_message_id)
- if msg:
- wikipedia_url = msg.wikipedia_url or ""
- creator_profile_url = msg.creator_profile_url or ""
-
- session.add(FlickrUpload(
- pageid=u.get("pageid"),
- revid=u.get("revid"),
- title=u.get("title"),
- timestamp=u.get("timestamp"),
- flickr_url=flickr_url,
- normalized_flickr_url=normalized,
- creator=u.get("creator"),
- wikipedia_url=wikipedia_url,
- creator_profile_url=creator_profile_url,
- sent_message_id=sent_message_id,
- ))
-
- session.flush()
- count = session.query(FlickrUpload).count()
- print(f" -> {count} flickr uploads migrated")
- print(f" -> {linked} linked to sent messages")
- return count
-
-
-def migrate_thumbnail_cache(session) -> int:
- """Migrate thumbnail_cache.json to thumbnail_cache table."""
- if not THUMBNAIL_CACHE_FILE.exists():
- print("No thumbnail_cache.json found, skipping")
- return 0
-
- with open(THUMBNAIL_CACHE_FILE) as f:
- cache = json.load(f)
-
- thumbnails = cache.get("thumbnails", {})
- cache_timestamp = int(cache.get("timestamp", 0))
-
- print(f"Migrating {len(thumbnails)} cached thumbnails...")
-
- for title, thumb_url in thumbnails.items():
- session.add(ThumbnailCache(
- title=title,
- thumb_url=thumb_url,
- fetched_at=cache_timestamp,
- ))
-
- session.flush()
- count = session.query(ThumbnailCache).count()
- print(f" -> {count} thumbnail cache entries migrated")
- return count
-
-
-def main() -> None:
- print("Initializing database...")
- init_db()
-
- session = get_session()
- try:
- # Check if already migrated
- existing = session.query(Contribution).count()
- if existing > 0:
- print(f"Database already contains {existing} contributions. Aborting.")
- print("Delete flickr_mail.db to re-run migration.")
- return
-
- migrate_contributions(session)
- url_to_message_id = migrate_sent_messages(session)
- migrate_flickr_uploads(session, url_to_message_id)
- migrate_thumbnail_cache(session)
-
- session.commit()
- print("\nMigration complete!")
-
- except Exception:
- session.rollback()
- raise
- finally:
- session.close()
-
-
-if __name__ == "__main__":
- main()
diff --git a/update_flickr_uploads.py b/update_flickr_uploads.py
index 06b85e0..d140b3c 100644
--- a/update_flickr_uploads.py
+++ b/update_flickr_uploads.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
"""
-Find UploadWizard contributions that are from Flickr and add them to the database.
+Find UploadWizard contributions that are from Flickr and add them to flickr_uploads.json.
For contributions with comment 'User created page with UploadWizard', queries the
Commons API to check if the image source is Flickr (by checking the Credit field).
@@ -9,13 +9,12 @@ Commons API to check if the image source is Flickr (by checking the Credit field
import json
import re
import time
+from pathlib import Path
import requests
-from flickr_mail.database import init_db, get_session
-from flickr_mail.models import Contribution, FlickrUpload, SentMessage
-from flickr_mail.url_utils import normalize_flickr_url
-
+CONTRIBUTIONS_FILE = Path("commons_contributions/contributions.json")
+FLICKR_UPLOADS_FILE = Path("commons_contributions/flickr_uploads.json")
COMMONS_API = "https://commons.wikimedia.org/w/api.php"
USER_AGENT = "FlickrMail/1.0 (https://edwardbetts.com/flickr_mail/; edward@4angle.com)"
@@ -76,101 +75,99 @@ def clean_artist_name(artist_html: str) -> str:
def main():
- init_db()
- session = get_session()
+ # Load contributions
+ print("Loading contributions...")
+ with open(CONTRIBUTIONS_FILE) as f:
+ data = json.load(f)
- try:
- # Get existing normalized flickr URLs to avoid duplicates
- existing_urls = {
- r[0] for r in session.query(FlickrUpload.normalized_flickr_url).all()
- if r[0]
- }
- print(f"Existing uploads: {session.query(FlickrUpload).count()}")
- print(f"Existing flickr URLs: {len(existing_urls)}")
+ contributions = data.get("contributions", [])
- # Build sent message index for FK linking
- sent_messages = (
- session.query(SentMessage)
- .filter(SentMessage.normalized_flickr_url != "")
- .filter(~SentMessage.subject.startswith("Re:"))
- .all()
- )
- url_to_message = {msg.normalized_flickr_url: msg for msg in sent_messages}
+ # Load existing flickr uploads
+ existing_flickr_urls = set()
+ existing_uploads = []
+ if FLICKR_UPLOADS_FILE.exists():
+ with open(FLICKR_UPLOADS_FILE) as f:
+ existing_uploads = json.load(f)
+ existing_flickr_urls = {u.get("flickr_url", "") for u in existing_uploads}
+ # Also normalize existing URLs for comparison
+ for u in existing_uploads:
+ url = u.get("flickr_url", "")
+ normalized = url.replace("https://", "").replace("http://", "").replace("www.", "").rstrip("/")
+ existing_flickr_urls.add(normalized)
- # Find UploadWizard contributions (page creations only)
- upload_wizard = (
- session.query(Contribution)
- .filter(Contribution.comment == "User created page with UploadWizard")
- .filter(Contribution.title.startswith("File:"))
- .all()
- )
+ print(f"Existing uploads: {len(existing_uploads)}")
+ print(f"Existing flickr URLs: {len(existing_flickr_urls)}")
- print(f"UploadWizard contributions to check: {len(upload_wizard)}")
+ # Find UploadWizard contributions (page creations only)
+ upload_wizard_contributions = []
+ for c in contributions:
+ comment = c.get("comment", "")
+ if comment == "User created page with UploadWizard":
+ # Only include if it's a File: page
+ title = c.get("title", "")
+ if title.startswith("File:"):
+ upload_wizard_contributions.append(c)
- # Process in batches of 50
- new_count = 0
- batch_size = 50
+ print(f"UploadWizard contributions to check: {len(upload_wizard_contributions)}")
- for i in range(0, len(upload_wizard), batch_size):
- batch = upload_wizard[i : i + batch_size]
- titles = [c.title for c in batch]
+ # Process in batches of 50
+ new_uploads = []
+ batch_size = 50
- print(
- f"Processing batch {i // batch_size + 1}/"
- f"{(len(upload_wizard) + batch_size - 1) // batch_size}..."
- )
+ for i in range(0, len(upload_wizard_contributions), batch_size):
+ batch = upload_wizard_contributions[i : i + batch_size]
+ titles = [c["title"] for c in batch]
- metadata = get_image_metadata(titles)
+ print(f"Processing batch {i // batch_size + 1}/{(len(upload_wizard_contributions) + batch_size - 1) // batch_size}...")
- for c in batch:
- meta = metadata.get(c.title, {})
- credit = meta.get("credit", "")
- artist = meta.get("artist", "")
+ metadata = get_image_metadata(titles)
- flickr_url = extract_flickr_url_from_credit(credit)
- if not flickr_url:
- continue
+ for c in batch:
+ title = c["title"]
+ meta = metadata.get(title, {})
+ credit = meta.get("credit", "")
+ artist = meta.get("artist", "")
- normalized = normalize_flickr_url(flickr_url)
- if normalized in existing_urls:
- continue
+ flickr_url = extract_flickr_url_from_credit(credit)
+ if not flickr_url:
+ continue
- creator = clean_artist_name(artist) if artist else None
+ # Check if we already have this URL
+ normalized = flickr_url.replace("https://", "").replace("http://", "").replace("www.", "").rstrip("/")
+ if normalized in existing_flickr_urls or flickr_url in existing_flickr_urls:
+ continue
- # Look up sent message for FK linking
- msg = url_to_message.get(normalized) if normalized else None
+ creator = clean_artist_name(artist) if artist else None
- session.add(FlickrUpload(
- pageid=c.pageid,
- revid=c.revid,
- title=c.title,
- timestamp=c.timestamp,
- flickr_url=flickr_url,
- normalized_flickr_url=normalized,
- creator=creator,
- wikipedia_url=msg.wikipedia_url if msg else "",
- creator_profile_url=msg.creator_profile_url if msg else "",
- sent_message_id=msg.message_id if msg else None,
- ))
- new_count += 1
- existing_urls.add(normalized)
- print(f" Found: {c.title[:50]} -> {flickr_url}")
+ new_upload = {
+ "pageid": c["pageid"],
+ "revid": c["revid"],
+ "title": title,
+ "timestamp": c["timestamp"],
+ "flickr_url": flickr_url,
+ "creator": creator,
+ }
- session.commit()
+ new_uploads.append(new_upload)
+ existing_flickr_urls.add(normalized)
+ print(f" Found: {title[:50]} -> {flickr_url}")
- # Rate limiting
- if i + batch_size < len(upload_wizard):
- time.sleep(0.5)
+ # Rate limiting
+ if i + batch_size < len(upload_wizard_contributions):
+ time.sleep(0.5)
- total = session.query(FlickrUpload).count()
- print(f"\nFound {new_count} new Flickr uploads")
- print(f"Total: {total} uploads in database")
+ print(f"\nFound {len(new_uploads)} new Flickr uploads")
- except Exception:
- session.rollback()
- raise
- finally:
- session.close()
+ if new_uploads:
+ # Merge and sort by timestamp (newest first)
+ all_uploads = existing_uploads + new_uploads
+ all_uploads.sort(key=lambda x: x.get("timestamp", ""), reverse=True)
+
+ # Save
+ with open(FLICKR_UPLOADS_FILE, "w") as f:
+ json.dump(all_uploads, f, indent=2)
+
+ print(f"Saved {len(all_uploads)} total uploads to {FLICKR_UPLOADS_FILE}")
if __name__ == "__main__":