Compare commits
No commits in common. "c5efd429ce8e27a7fc9eea95a9970b61d18c5bac" and "0062de8ede69af0dd59acbacc3a5ab969976187d" have entirely different histories.
c5efd429ce
...
0062de8ede
8 changed files with 19 additions and 972 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -1,4 +1,2 @@
|
|||
.mypy_cache
|
||||
__pycache__
|
||||
commons_contributions/thumbnail_cache.json
|
||||
commons_contributions/sent_mail_index.json
|
||||
|
|
|
|||
28
AGENTS.md
28
AGENTS.md
|
|
@ -66,34 +66,6 @@ static image servers:
|
|||
Converts a Flickr username/path alias to the NSID (internal user ID) needed
|
||||
for the Flickr mail URL. Scrapes the user's profile page for embedded params.
|
||||
|
||||
### Commons Uploads Display
|
||||
|
||||
Shows recent Wikimedia Commons uploads on the home page, filtered to only
|
||||
those obtained via Flickr mail requests.
|
||||
|
||||
**Data files** (in `commons_contributions/`):
|
||||
- `flickr_uploads.json`: List of Commons uploads from Flickr with metadata
|
||||
- `thumbnail_cache.json`: Cached Commons API thumbnail URLs (7-day TTL)
|
||||
- `sent_mail_index.json`: Index of sent mail messages (flickr_url → wikipedia_url)
|
||||
|
||||
**Key functions**:
|
||||
- `build_sent_mail_index()`: Parses sent mail JSON files, extracts Flickr and
|
||||
Wikipedia URLs from message bodies, caches the index
|
||||
- `get_recent_commons_uploads()`: Loads uploads, filters by sent mail match,
|
||||
fetches thumbnails from Commons API
|
||||
- `normalize_flickr_url()`: Normalizes URLs for matching (removes protocol, www, trailing slash)
|
||||
|
||||
**CommonsUpload dataclass**:
|
||||
- `title`, `thumb_url`, `commons_url`, `flickr_url`, `creator`, `timestamp`
|
||||
- `wikipedia_url`, `creator_profile_url`: Extracted from sent mail
|
||||
- `is_wikidata_item` property: Detects Q-number URLs
|
||||
- `wiki_link_url`, `wiki_link_label`: Handles Wikidata vs Wikipedia links
|
||||
|
||||
**Maintenance script** (`update_flickr_uploads.py`):
|
||||
Run to find Flickr uploads from UploadWizard contributions that don't have
|
||||
the Flickr URL in the edit comment. Queries Commons API for image metadata
|
||||
and checks the Credit field for Flickr URLs.
|
||||
|
||||
## Request Flow
|
||||
|
||||
1. User enters Wikipedia article title/URL → `start()` extracts article name
|
||||
|
|
|
|||
|
|
@ -28,9 +28,6 @@ photographers on Flickr whose photos can be used to enhance Wikipedia articles.
|
|||
- **One-click message composition**: Click any photo to compose a permission
|
||||
request message with the photo displayed alongside.
|
||||
- **Pagination**: Browse through thousands of search results with page navigation.
|
||||
- **Recent uploads showcase**: The home page displays recent Wikimedia Commons
|
||||
uploads that were obtained via Flickr mail requests, with links to the
|
||||
Wikipedia article and photographer's Flickr profile.
|
||||
- Generate messages to request permission to use photos on Wikipedia.
|
||||
- Handle exceptions gracefully and provide detailed error information.
|
||||
|
||||
|
|
|
|||
591
main.py
591
main.py
|
|
@ -6,10 +6,8 @@ import dataclasses
|
|||
import inspect
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
import typing
|
||||
from pathlib import Path
|
||||
from urllib.parse import quote, unquote
|
||||
|
||||
import flask
|
||||
|
|
@ -18,36 +16,11 @@ import werkzeug
|
|||
from werkzeug.debug.tbtools import DebugTraceback
|
||||
|
||||
|
||||
import re
|
||||
|
||||
|
||||
app = flask.Flask(__name__)
|
||||
app.debug = False
|
||||
|
||||
enwiki = "en.wikipedia.org/wiki/"
|
||||
|
||||
# Paths to the Commons contributions data and the sent-mail archive.
# All paths are resolved relative to the directory containing this file.

# JSON list of Commons uploads.
COMMONS_UPLOADS_FILE = (
    Path(__file__).parent / "commons_contributions" / "flickr_uploads.json"
)
# On-disk cache of Commons thumbnail URLs.
COMMONS_CACHE_FILE = (
    Path(__file__).parent / "commons_contributions" / "thumbnail_cache.json"
)
# Directory holding one JSON file per sent message.
SENT_MAIL_DIR = Path(__file__).parent / "sent_mail" / "messages"
# JSON index of sent messages.
SENT_MAIL_INDEX_FILE = Path(__file__).parent / "sent_mail" / "messages_index.json"
# Cached index derived from the files in SENT_MAIL_DIR.
SENT_MAIL_INDEX_CACHE = (
    Path(__file__).parent / "commons_contributions" / "sent_mail_index.json"
)
COMMONS_CACHE_MAX_AGE = 86400 * 7  # Cache for 7 days (value is in seconds)
# Maximum number of uploads kept for display.
RECENT_UPLOADS_COUNT = 24

# User agent for Wikimedia API requests
WIKIMEDIA_USER_AGENT = (
    "FlickrMail/1.0 (https://edwardbetts.com/flickr_mail/; edward@4angle.com)"
)

# Endpoint of the English Wikipedia MediaWiki API.
WIKIPEDIA_API = "https://en.wikipedia.org/w/api.php"
|
||||
|
||||
# Browser-like headers for Flickr requests
|
||||
BROWSER_HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
|
|
@ -67,26 +40,18 @@ BROWSER_HEADERS = {
|
|||
# Flickr license codes to human-readable names
|
||||
FLICKR_LICENSES = {
|
||||
0: "All Rights Reserved",
|
||||
1: "CC BY-NC-SA 2.0",
|
||||
2: "CC BY-NC 2.0",
|
||||
3: "CC BY-NC-ND 2.0",
|
||||
4: "CC BY 2.0",
|
||||
5: "CC BY-SA 2.0",
|
||||
6: "CC BY-ND 2.0",
|
||||
1: "CC BY-NC-SA",
|
||||
2: "CC BY-NC",
|
||||
3: "CC BY-NC-ND",
|
||||
4: "CC BY",
|
||||
5: "CC BY-SA",
|
||||
6: "CC BY-ND",
|
||||
7: "No known copyright",
|
||||
8: "US Government",
|
||||
9: "CC0",
|
||||
10: "Public Domain",
|
||||
# CC 4.0 licenses (codes confirmed via Flickr)
|
||||
16: "CC BY-NC-ND 4.0",
|
||||
}
|
||||
|
||||
# Non-free CC licenses (NC or ND restrictions)
|
||||
NONFREE_CC_LICENSES = {1, 2, 3, 6, 11, 12, 13, 16}
|
||||
|
||||
# Wikipedia-compatible free licenses
|
||||
FREE_LICENSES = {4, 5, 7, 8, 9, 10, 14, 15}
|
||||
|
||||
|
||||
PHOTOS_PER_PAGE = 25
|
||||
|
||||
|
|
@ -126,454 +91,6 @@ class SearchResult:
|
|||
total_pages: int
|
||||
|
||||
|
||||
@dataclasses.dataclass
class CommonsUpload:
    """One Wikimedia Commons upload, as shown in the recent-uploads list.

    ``wikipedia_url`` and ``creator_profile_url`` are optional and default to
    the empty string.
    """

    title: str
    thumb_url: str
    commons_url: str
    flickr_url: str
    creator: str
    timestamp: str
    wikipedia_url: str = ""
    creator_profile_url: str = ""

    @property
    def is_wikidata_item(self) -> bool:
        """Return True when ``wikipedia_url`` ends with ``/Q<digits>``."""
        # The truthiness guard keeps an empty URL away from the regex.
        return bool(self.wikipedia_url) and (
            re.search(r"/Q\d+$", self.wikipedia_url) is not None
        )

    @property
    def wiki_link_url(self) -> str:
        """Return the link target: a Wikidata URL for Q-items, else the stored URL.

        Returns the empty string when no ``wikipedia_url`` is set.
        """
        target = self.wikipedia_url
        if not target:
            return ""
        if not self.is_wikidata_item:
            return target
        # Pull the trailing Q-id out and rebuild it as a wikidata.org link.
        qid = re.search(r"(Q\d+)$", target)
        return f"https://www.wikidata.org/wiki/{qid.group(1)}" if qid else target

    @property
    def wiki_link_label(self) -> str:
        """Return the human-readable label matching ``wiki_link_url``."""
        if self.is_wikidata_item:
            return "Wikidata item"
        return "Wikipedia article"
|
||||
|
||||
|
||||
def normalize_flickr_url(url: str) -> str:
    """Normalize a Flickr URL so equivalent spellings compare equal.

    A leading ``http://`` or ``https://`` scheme, a leading ``www.`` and any
    trailing slashes are removed. Only the *prefix* of the string is touched,
    so path components such as a ``www.bob`` username are left intact.

    Returns the normalized ``flickr.com/...`` string, or ``""`` when the host
    is not exactly ``flickr.com`` (e.g. ``flickr.com.example.org``).
    """
    for scheme in ("https://", "http://"):
        if url.startswith(scheme):
            url = url[len(scheme):]
            break
    url = url.removeprefix("www.").rstrip("/")

    # Require a host boundary after "flickr.com" so look-alike hosts are rejected.
    if url == "flickr.com" or url.startswith("flickr.com/"):
        return url
    return ""
|
||||
|
||||
|
||||
def extract_urls_from_message(body: str) -> tuple[str, str]:
    """Extract the first Flickr photo URL and first Wikipedia URL from *body*.

    Returns ``(flickr_url, wikipedia_url)``. Either element is ``""`` when no
    such URL is present. URLs written without a scheme get ``https://``
    prepended.

    The Wikipedia match stops at whitespace, ``<``, ``]`` or an *unbalanced*
    ``)``. Balanced parentheses are kept, so disambiguated titles such as
    ``John_Smith_(academic)`` are preserved in full, while a URL wrapped in
    parentheses, ``(see .../wiki/Foo)``, still ends before the closing paren.
    """
    flickr_pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/[^/\s]+/\d+"
    wiki_pattern = (
        r"(?:https?://)?(?:www\.)?en\.wikipedia\.org/wiki/"
        # ordinary char | balanced "(...)" group | stray "("
        r"(?:[^\s<\])(]|\([^\s<\])(]*\)|\()+"
    )

    def first_url(pattern: str) -> str:
        """Return the first match of *pattern* in body, with a scheme."""
        found = re.search(pattern, body)
        if not found:
            return ""
        url = found.group(0)
        return url if url.startswith("http") else "https://" + url

    return first_url(flickr_pattern), first_url(wiki_pattern)
|
||||
|
||||
|
||||
def build_sent_mail_index() -> dict[str, dict[str, str]]:
    """Build an index of sent mail keyed by normalized Flickr photo URL.

    Each value is a dict with ``wikipedia_url``, ``creator_profile_url`` and
    ``recipient``. Returns ``{}`` when ``SENT_MAIL_DIR`` does not exist.

    The index is cached in ``SENT_MAIL_INDEX_CACHE`` together with the number
    of message files it was built from. The cache is reused while that count
    is unchanged. Unreadable or malformed files are skipped, and a failure to
    write the cache is ignored.
    """
    if not SENT_MAIL_DIR.exists():
        return {}

    # Listed once: used both to validate the cache and to rebuild the index.
    json_files = list(SENT_MAIL_DIR.glob("*.json"))

    try:
        with open(SENT_MAIL_INDEX_CACHE, encoding="utf-8") as f:
            cache = json.load(f)
    except (ValueError, OSError):
        # Missing, unreadable or corrupt cache: fall through and rebuild.
        cache = None
    if isinstance(cache, dict) and cache.get("file_count") == len(json_files):
        cached_index = cache.get("index", {})
        if isinstance(cached_index, dict):
            return cached_index

    index: dict[str, dict[str, str]] = {}

    for json_file in json_files:
        try:
            with open(json_file, encoding="utf-8") as f:
                message = json.load(f)
        except (ValueError, OSError):
            # ValueError covers both JSON and Unicode decoding errors.
            continue
        if not isinstance(message, dict):
            continue

        # Skip replies - we want original requests
        subject = message.get("subject") or ""
        if subject.startswith("Re:"):
            continue

        body = message.get("body") or ""
        flickr_url, wikipedia_url = extract_urls_from_message(body)
        if not flickr_url:
            continue

        normalized = normalize_flickr_url(flickr_url)
        if not normalized:
            continue

        # Derive the creator profile URL from the photo URL:
        # flickr.com/photos/username/12345 -> flickr.com/photos/username
        parts = flickr_url.split("/")
        creator_profile = ""
        if "photos" in parts[:-1]:
            username = parts[parts.index("photos") + 1]
            creator_profile = f"https://www.flickr.com/photos/{username}"

        index[normalized] = {
            "wikipedia_url": wikipedia_url,
            "creator_profile_url": creator_profile,
            "recipient": message.get("recipient", ""),
        }

    # Cache the index; a write failure only costs a rebuild next time.
    try:
        with open(SENT_MAIL_INDEX_CACHE, "w", encoding="utf-8") as f:
            json.dump({"file_count": len(json_files), "index": index}, f)
    except OSError:
        pass

    return index
|
||||
|
||||
|
||||
def load_commons_thumbnail_cache() -> dict[str, typing.Any]:
    """Read the thumbnail cache from ``COMMONS_CACHE_FILE``.

    Returns the decoded JSON content. When the file is missing, unreadable or
    not valid JSON, returns an empty cache: ``{"timestamp": 0, "thumbnails": {}}``.
    """
    if COMMONS_CACHE_FILE.exists():
        try:
            with open(COMMONS_CACHE_FILE) as cache_file:
                loaded = json.load(cache_file)
        except (json.JSONDecodeError, OSError):
            pass  # fall through to the empty cache below
        else:
            return typing.cast(dict[str, typing.Any], loaded)
    return {"timestamp": 0, "thumbnails": {}}
|
||||
|
||||
|
||||
def save_commons_thumbnail_cache(cache: dict[str, typing.Any]) -> None:
    """Write *cache* as JSON to ``COMMONS_CACHE_FILE``.

    Persisting the cache is best-effort: an ``OSError`` while opening or
    writing the file is swallowed.
    """
    try:
        with COMMONS_CACHE_FILE.open("w") as out:
            json.dump(cache, out)
    except OSError:
        # Ignore cache write errors
        return
|
||||
|
||||
|
||||
def fetch_commons_thumbnails(titles: list[str]) -> dict[str, str]:
    """Fetch 150px-wide thumbnail URLs from the Commons API for file titles.

    Returns a mapping of title -> thumbnail URL. Titles without a thumbnail
    are omitted. Titles are requested in batches of 50, the per-request limit
    noted for the Commons API. A batch that fails (network error, HTTP error
    or invalid JSON) is skipped, so a single-batch failure yields ``{}``.

    When the API reports that it normalized a requested title, the thumbnail
    is stored under both the normalized and the originally requested title,
    so callers can look results up by the title they passed in.
    """
    if not titles:
        return {}

    batch_size = 50  # Commons API allows up to 50 titles per request
    headers = {"User-Agent": WIKIMEDIA_USER_AGENT}
    thumbnails: dict[str, str] = {}

    for offset in range(0, len(titles), batch_size):
        params = {
            "action": "query",
            "titles": "|".join(titles[offset:offset + batch_size]),
            "prop": "imageinfo",
            "iiprop": "url",
            "iiurlwidth": 150,
            "format": "json",
        }

        try:
            response = requests.get(
                "https://commons.wikimedia.org/w/api.php",
                params=params,
                headers=headers,
                timeout=10,
            )
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError):
            # ValueError also covers JSON errors raised by older requests versions.
            continue

        query = data.get("query", {})

        found: dict[str, str] = {}
        for page in query.get("pages", {}).values():
            imageinfo = page.get("imageinfo", [])
            if not imageinfo:
                continue
            thumb_url = imageinfo[0].get("thumburl", "")
            if thumb_url:
                found[page.get("title", "")] = thumb_url
        thumbnails.update(found)

        # Map API-normalized titles back to the spelling the caller used.
        for entry in query.get("normalized", []):
            thumb_url = found.get(entry.get("to", ""), "")
            if thumb_url:
                thumbnails.setdefault(entry.get("from", ""), thumb_url)

    return thumbnails
|
||||
|
||||
|
||||
def get_recent_commons_uploads() -> tuple[list[CommonsUpload], int]:
    """Get recent Commons uploads with thumbnails, limited to Flickr-mail photos.

    Only uploads whose normalized Flickr URL appears in the sent-mail index
    are considered. At most ``RECENT_UPLOADS_COUNT`` of them (in file order)
    are returned, and any without a thumbnail URL are dropped.

    Returns ``(uploads, total_count)``, where ``total_count`` counts every
    matching upload, not only those returned. Returns ``([], 0)`` when the
    uploads file is missing or unreadable, or when nothing matches.
    """
    if not COMMONS_UPLOADS_FILE.exists():
        return [], 0

    try:
        with open(COMMONS_UPLOADS_FILE) as f:
            all_uploads = json.load(f)
    except (json.JSONDecodeError, OSError):
        return [], 0

    sent_mail_index = build_sent_mail_index()

    # Count every match, but keep only the first RECENT_UPLOADS_COUNT for display.
    matched: list[tuple[dict[str, typing.Any], dict[str, str]]] = []
    total_matched = 0
    for upload in all_uploads:
        normalized = normalize_flickr_url(upload.get("flickr_url", ""))
        if not normalized or normalized not in sent_mail_index:
            continue
        total_matched += 1
        if len(matched) < RECENT_UPLOADS_COUNT:
            matched.append((upload, sent_mail_index[normalized]))

    if not matched:
        return [], 0

    cache = load_commons_thumbnail_cache()
    cached_at = cache.get("timestamp", 0)
    is_stale = time.time() - cached_at > COMMONS_CACHE_MAX_AGE
    cached_thumbs = cache.get("thumbnails", {})

    titles = [upload["title"] for upload, _ in matched]
    titles_to_fetch = [t for t in titles if t not in cached_thumbs]

    # Refresh everything when the cache is stale, else fetch only what is missing.
    if titles_to_fetch or is_stale:
        new_thumbs = fetch_commons_thumbnails(titles if is_stale else titles_to_fetch)
        cached_thumbs.update(new_thumbs)
        # Only a full refresh resets the clock. Topping up missing titles must
        # not extend the lifetime of the older entries.
        cache = {
            "timestamp": time.time() if is_stale else cached_at,
            "thumbnails": cached_thumbs,
        }
        save_commons_thumbnail_cache(cache)

    result: list[CommonsUpload] = []
    for upload, mail_info in matched:
        title = upload["title"]
        thumb_url = cached_thumbs.get(title, "")
        if not thumb_url:
            continue

        # Percent-encode so titles containing "?", "#" or "%" still link correctly.
        commons_path = quote(title.replace(" ", "_"), safe=";@$!*(),/~:")
        commons_url = f"https://commons.wikimedia.org/wiki/{commons_path}"

        result.append(
            CommonsUpload(
                # Display name: drop the namespace prefix and the file extension.
                title=title.removeprefix("File:").rsplit(".", 1)[0],
                thumb_url=thumb_url,
                commons_url=commons_url,
                flickr_url=upload.get("flickr_url", ""),
                creator=upload.get("creator") or "Unknown",
                # Keep the first 10 characters of the timestamp.
                timestamp=(upload.get("timestamp") or "")[:10],
                wikipedia_url=mail_info.get("wikipedia_url", ""),
                creator_profile_url=mail_info.get("creator_profile_url", ""),
            )
        )

    return result, total_matched
|
||||
|
||||
|
||||
def get_previous_messages(flickr_user: str, flickr_username: str) -> list[dict]:
    """Return messages previously sent to a Flickr user.

    Entries of the messages index (``SENT_MAIL_INDEX_FILE``) are kept when
    their non-empty ``recipient`` equals, ignoring case, either the display
    name (*flickr_user*) or the username (*flickr_username*).

    Returns ``[]`` when the index file is missing, unreadable or invalid JSON.
    """
    if not SENT_MAIL_INDEX_FILE.exists():
        return []

    try:
        with open(SENT_MAIL_INDEX_FILE) as index_file:
            sent = json.load(index_file)
    except (json.JSONDecodeError, OSError):
        return []

    # Lower-cased names to compare against; a missing name becomes "".
    wanted = {
        flickr_user.lower() if flickr_user else "",
        flickr_username.lower() if flickr_username else "",
    }

    def is_for_user(entry: dict) -> bool:
        """Return True when the entry's recipient is one of the wanted names."""
        name = entry.get("recipient", "").lower()
        # An empty recipient never matches, even when a wanted name is "".
        return bool(name) and name in wanted

    return [entry for entry in sent if is_for_user(entry)]
|
||||
|
||||
|
||||
def parse_category_input(category_input: str) -> str | None:
|
||||
"""Parse category title from URL or direct input.
|
||||
|
||||
Returns the category title with 'Category:' prefix, or None if invalid.
|
||||
"""
|
||||
category_input = category_input.strip()
|
||||
|
||||
# Handle URL format: https://en.wikipedia.org/wiki/Category:Example
|
||||
if "wikipedia.org" in category_input:
|
||||
match = re.search(r"/wiki/(Category:[^#?]+)", category_input)
|
||||
if match:
|
||||
return unquote(match.group(1)).replace("_", " ")
|
||||
return None
|
||||
|
||||
# Handle direct input - add Category: prefix if missing
|
||||
if category_input.startswith("Category:"):
|
||||
return category_input.replace("_", " ")
|
||||
|
||||
# Assume it's just the category name
|
||||
return f"Category:{category_input.replace('_', ' ')}"
|
||||
|
||||
|
||||
@dataclasses.dataclass
class ArticleWithoutImage:
    """Represents a Wikipedia article that needs an image."""

    title: str
    pageid: int

    @property
    def wikipedia_url(self) -> str:
        """URL of the article on English Wikipedia.

        Spaces become underscores and the title is percent-encoded, so titles
        containing ``?``, ``#`` or ``%`` still produce a working link. Common
        title punctuation such as parentheses, commas and colons is left
        readable.
        """
        path = quote(self.title.replace(" ", "_"), safe=";@$!*(),/~:")
        return f"https://en.wikipedia.org/wiki/{path}"

    @property
    def search_url(self) -> str:
        """Root-relative URL that searches for this article in Flickr Mail."""
        return f"/?enwp={quote(self.title)}"
|
||||
|
||||
|
||||
# Substrings identifying common non-content images (UI icons, logos,
# maintenance-template artwork). Titles containing any of these are ignored
# when deciding whether an article already has an image.
NON_CONTENT_IMAGE_PATTERNS = [
    "OOjs UI icon",
    "Commons-logo",
    "Symbol ",
    "Edit-ltr",
    "Ambox ",
    "Question book",
    "Wiki letter",
    "Text document",
    "Folder ",
    "Crystal ",
    "Nuvola ",
    "Gnome-",
    "Disambig ",
    "DAB ",
]


def has_content_image(images: list[dict]) -> bool:
    """Return True if any image is a content image rather than an icon or logo.

    An entry counts as content when its ``title`` (``""`` if absent) contains
    none of the ``NON_CONTENT_IMAGE_PATTERNS`` substrings. An empty list
    yields False.
    """

    def looks_decorative(file_title: str) -> bool:
        """Return True when the title contains a known non-content marker."""
        return any(marker in file_title for marker in NON_CONTENT_IMAGE_PATTERNS)

    return any(not looks_decorative(entry.get("title", "")) for entry in images)
|
||||
|
||||
|
||||
def get_articles_without_images(
    category: str, limit: int = 100
) -> tuple[list[ArticleWithoutImage], str | None]:
    """Get articles in a category that don't have content images.

    Uses ``generator=categorymembers`` with ``prop=images`` so one batch of
    category members is checked per API round trip. When the API signals via
    ``imcontinue`` that the image lists for the current batch are incomplete,
    the query is continued and the lists merged. Otherwise pages whose images
    did not fit in the first response would be reported as having none. The
    number of follow-up requests is capped.

    Returns ``(articles, continue_token)``: the articles sorted by title, and
    the ``gcmcontinue`` token when more category members are available (else
    ``None``). Returns ``([], None)`` on any API error.
    """
    base_params = {
        "action": "query",
        "generator": "categorymembers",
        "gcmtitle": category,
        "gcmtype": "page",  # Only articles, not subcategories or files
        "gcmnamespace": "0",  # Main namespace only
        "gcmlimit": str(limit),
        "prop": "images",
        "imlimit": "max",  # Need enough to check all pages in batch
        "format": "json",
    }
    headers = {"User-Agent": WIKIMEDIA_USER_AGENT}
    max_requests = 10  # upper bound on image-list continuation round trips

    pages_by_id: dict[str, dict] = {}
    images_by_id: dict[str, list[dict]] = {}
    continuation: dict[str, str] = {}

    for _ in range(max_requests):
        try:
            response = requests.get(
                WIKIPEDIA_API,
                params={**base_params, **continuation},
                headers=headers,
                timeout=30,
            )
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            print(f"Wikipedia API error: {e}")
            return [], None

        # Merge this response into what has been collected for the batch.
        for page_key, page in data.get("query", {}).get("pages", {}).items():
            pages_by_id.setdefault(page_key, page)
            images_by_id.setdefault(page_key, []).extend(page.get("images", []))

        continuation = data.get("continue", {})
        if "imcontinue" not in continuation:
            break  # image lists for this batch are complete

    articles_without_images: list[ArticleWithoutImage] = []
    for page_key, page in pages_by_id.items():
        # Skip if page has content images (not just UI icons)
        if has_content_image(images_by_id.get(page_key, [])):
            continue

        title = page.get("title", "")
        pageid = page.get("pageid", 0)
        if title and pageid:
            articles_without_images.append(
                ArticleWithoutImage(title=title, pageid=pageid)
            )

    # Sort by title for consistent display
    articles_without_images.sort(key=lambda a: a.title)

    # Token for the next batch of category members, if any
    continue_token = continuation.get("gcmcontinue")

    return articles_without_images, continue_token
|
||||
|
||||
|
||||
def is_valid_flickr_image_url(url: str) -> bool:
|
||||
"""Check if URL is a valid Flickr static image URL."""
|
||||
valid_prefixes = (
|
||||
|
|
@ -604,9 +121,7 @@ def search_flickr(search_term: str, page: int = 1) -> SearchResult:
|
|||
|
||||
def parse_flickr_search_results(html: str, page: int = 1) -> SearchResult:
|
||||
"""Parse Flickr search results HTML and extract photo data."""
|
||||
empty_result = SearchResult(
|
||||
photos=[], total_photos=0, current_page=page, total_pages=0
|
||||
)
|
||||
empty_result = SearchResult(photos=[], total_photos=0, current_page=page, total_pages=0)
|
||||
|
||||
# Find the modelExport JSON embedded in the page
|
||||
start = html.find("modelExport:")
|
||||
|
|
@ -750,16 +265,10 @@ def start() -> str:
|
|||
"""Start form."""
|
||||
enwp = flask.request.args.get("enwp")
|
||||
if not enwp:
|
||||
recent_uploads, total_uploads = get_recent_commons_uploads()
|
||||
return flask.render_template(
|
||||
"combined.html", recent_uploads=recent_uploads, total_uploads=total_uploads
|
||||
)
|
||||
return flask.render_template("combined.html")
|
||||
enwp = enwp.strip()
|
||||
if not enwp:
|
||||
recent_uploads, total_uploads = get_recent_commons_uploads()
|
||||
return flask.render_template(
|
||||
"combined.html", recent_uploads=recent_uploads, total_uploads=total_uploads
|
||||
)
|
||||
return flask.render_template("combined.html")
|
||||
|
||||
input_is = "url" if enwiki in enwp else "title"
|
||||
|
||||
|
|
@ -778,13 +287,9 @@ def start() -> str:
|
|||
wiki_part2 = name.replace(" ", "_")
|
||||
wikipedia_url = wiki_part1 + wiki_part2
|
||||
|
||||
# Remove disambiguation suffix like "(academic)" for Flickr search
|
||||
if "_(" in name:
|
||||
name = name[: name.find("_(")]
|
||||
name = name.replace("_", " ")
|
||||
if " (" in name:
|
||||
name = name[: name.find(" (")]
|
||||
|
||||
# Get category param if coming from category search
|
||||
cat = flask.request.args.get("cat")
|
||||
|
||||
flickr_url = flask.request.args.get("flickr")
|
||||
if not flickr_url:
|
||||
|
|
@ -797,7 +302,6 @@ def start() -> str:
|
|||
name=name,
|
||||
enwp=enwp,
|
||||
search_result=search_result,
|
||||
cat=cat,
|
||||
)
|
||||
|
||||
if "/in/" in flickr_url:
|
||||
|
|
@ -819,40 +323,6 @@ def start() -> str:
|
|||
if img_url and not is_valid_flickr_image_url(img_url):
|
||||
img_url = None
|
||||
|
||||
# Get flickr_user name and build profile URL
|
||||
flickr_user = flask.request.args.get("flickr_user", "")
|
||||
flickr_user_url = f"https://www.flickr.com/photos/{flickr_username}/"
|
||||
|
||||
# Check for previous messages to this user
|
||||
previous_messages = get_previous_messages(flickr_user, flickr_username)
|
||||
|
||||
# Get license code if provided
|
||||
license_code = flask.request.args.get("license", type=int)
|
||||
license_name = (
|
||||
FLICKR_LICENSES.get(license_code, "") if license_code is not None else ""
|
||||
)
|
||||
|
||||
is_free_license = license_code in FREE_LICENSES
|
||||
is_nonfree_cc = license_code in NONFREE_CC_LICENSES
|
||||
|
||||
# For free licenses, show upload options instead of message
|
||||
if is_free_license:
|
||||
return flask.render_template(
|
||||
"combined.html",
|
||||
name=name,
|
||||
enwp=enwp,
|
||||
flickr_url=flickr_url,
|
||||
img_url=img_url,
|
||||
license_code=license_code,
|
||||
license_name=license_name,
|
||||
is_free_license=True,
|
||||
wikipedia_url=wikipedia_url,
|
||||
flickr_user=flickr_user,
|
||||
flickr_user_url=flickr_user_url,
|
||||
cat=cat,
|
||||
previous_messages=previous_messages,
|
||||
)
|
||||
|
||||
msg = flask.render_template(
|
||||
"message.jinja",
|
||||
flickr_url=flickr_url,
|
||||
|
|
@ -861,8 +331,6 @@ def start() -> str:
|
|||
name=name,
|
||||
wiki_part1=wiki_part1,
|
||||
wiki_part2=wiki_part2,
|
||||
is_nonfree_cc=is_nonfree_cc,
|
||||
license_name=license_name,
|
||||
)
|
||||
|
||||
subject = f"Request to use your photo of {name} on Wikipedia"
|
||||
|
|
@ -878,43 +346,6 @@ def start() -> str:
|
|||
lines=lines,
|
||||
nsid=nsid,
|
||||
img_url=img_url,
|
||||
license_code=license_code,
|
||||
license_name=license_name,
|
||||
flickr_user=flickr_user,
|
||||
flickr_user_url=flickr_user_url,
|
||||
cat=cat,
|
||||
previous_messages=previous_messages,
|
||||
)
|
||||
|
||||
|
||||
@app.route("/category")
def category_search() -> str:
    """Find articles in a Wikipedia category that need images.

    Reads the ``cat`` query parameter. Without it the empty search form is
    rendered. With an unparseable value the form is re-rendered with an error
    message. Otherwise the articles without images are looked up and rendered.
    """
    cat = flask.request.args.get("cat", "").strip()

    if not cat:
        return flask.render_template("category.html")

    category = parse_category_input(cat)
    if not category:
        return flask.render_template(
            "category.html",
            error="Invalid category format. Please enter a category name or URL.",
            cat=cat,
        )

    articles, continue_token = get_articles_without_images(category)

    # Display name: strip only the leading namespace prefix, so a category
    # whose own name contains "Category:" is left intact.
    category_name = category.removeprefix("Category:")

    return flask.render_template(
        "category.html",
        cat=cat,
        category=category,
        category_name=category_name,
        articles=articles,
        continue_token=continue_token,
    )
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,67 +0,0 @@
|
|||
{% extends "base.html" %}

{% block title %}Category Search - Flickr mail{% endblock %}

{% block style %}
<style>
  .article-link:visited { color: #6f42c1; }
</style>
{% endblock %}

{% block content %}
<div class="container">
  <div class="row">
    <h1>Find articles needing images</h1>
    <p class="text-muted">Enter a Wikipedia category to find articles without images</p>

    <form action="{{ url_for('category_search') }}" method="get">
      <div class="mb-3">
        <label for="cat" class="form-label">Wikipedia category name or URL:</label>
        <input type="text" class="form-control" id="cat" name="cat" value="{{ cat }}"
               placeholder="e.g., Living people or https://en.wikipedia.org/wiki/Category:Living_people" required>
      </div>
      <input type="submit" class="btn btn-primary" value="Search">
      <a href="{{ url_for('start') }}" class="btn btn-outline-secondary ms-2">Back to main</a>
    </form>

    {% if error %}
    <div class="alert alert-danger mt-3">{{ error }}</div>
    {% endif %}

    {% if category and articles is defined %}
    <div class="mt-4">
      {# urlencode keeps the link valid for names containing "?", "#" or "%" #}
      <h5>Articles without images in <a href="https://en.wikipedia.org/wiki/{{ category | replace(' ', '_') | urlencode }}" target="_blank" rel="noopener noreferrer">{{ category_name }}</a></h5>

      {% if articles %}
      <p class="text-muted small">Found {{ articles | length }} article(s) without images{% if continue_token %} (more available){% endif %}</p>

      <div class="list-group">
        {% for article in articles %}
        <div class="list-group-item d-flex justify-content-between align-items-center">
          <a href="{{ url_for('start', enwp=article.title, cat=cat) }}" class="text-decoration-none article-link">{{ article.title }}</a>
          <a href="{{ article.wikipedia_url }}" target="_blank" rel="noopener noreferrer" class="badge bg-secondary text-decoration-none">Wikipedia</a>
        </div>
        {% endfor %}
      </div>

      {% if continue_token %}
      <p class="text-muted small mt-3">Note: Only showing first batch of results. More articles may be available in this category.</p>
      {% endif %}

      {% else %}
      <div class="alert alert-success mt-3">
        All articles in this category have images!
      </div>
      {% endif %}
    </div>
    {% endif %}

    <div class="mt-4">
      <p class="text-muted small">
        <a href="{{ url_for('start') }}">Back to Flickr mail home</a>
      </p>
    </div>

  </div>
</div>
{% endblock %}
|
||||
|
|
@ -13,55 +13,10 @@
|
|||
</div>
|
||||
|
||||
<input type="submit" value="Submit">
|
||||
<a href="{{ url_for('category_search') }}" class="btn btn-outline-secondary ms-2">Find articles by category</a>
|
||||
</form>
|
||||
|
||||
{% if recent_uploads is defined and recent_uploads and not name %}
|
||||
<div class="mt-4">
|
||||
<h5>Recent uploads to Wikimedia Commons</h5>
|
||||
<p class="text-muted small">{{ total_uploads }} photos obtained via Flickr mail requests</p>
|
||||
<div class="row row-cols-1 row-cols-md-2 row-cols-lg-3 g-3">
|
||||
{% for upload in recent_uploads %}
|
||||
<div class="col">
|
||||
<div class="card h-100">
|
||||
<div class="row g-0">
|
||||
<div class="col-4">
|
||||
<a href="{{ upload.commons_url }}">
|
||||
<img src="{{ upload.thumb_url }}" alt="{{ upload.title }}" class="img-fluid rounded-start" style="aspect-ratio: 1; object-fit: cover; width: 100%;">
|
||||
</a>
|
||||
</div>
|
||||
<div class="col-8">
|
||||
<div class="card-body p-2">
|
||||
<p class="card-text small mb-1 text-truncate" title="{{ upload.title }}">
|
||||
<a href="{{ upload.commons_url }}" class="text-decoration-none">{{ upload.title }}</a>
|
||||
</p>
|
||||
<p class="card-text small mb-1">
|
||||
{% if upload.creator_profile_url %}
|
||||
<a href="{{ upload.creator_profile_url }}" class="text-muted text-decoration-none">{{ upload.creator }}</a>
|
||||
{% else %}
|
||||
<span class="text-muted">{{ upload.creator }}</span>
|
||||
{% endif %}
|
||||
</p>
|
||||
{% if upload.wikipedia_url %}
|
||||
<p class="card-text small mb-0">
|
||||
<a href="{{ upload.wiki_link_url }}" class="text-decoration-none"><small>{{ upload.wiki_link_label }}</small></a>
|
||||
</p>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{% if name and search_result is defined and search_result.photos %}
|
||||
|
||||
{% if cat %}
|
||||
<p><a href="{{ url_for('category_search', cat=cat) }}">← Back to category</a></p>
|
||||
{% endif %}
|
||||
<p>Wikipedia article: {{ name }}</p>
|
||||
<p>Select a photo to compose a message ({{ search_result.total_photos | default(0) }} results):</p>
|
||||
|
||||
|
|
@ -69,12 +24,12 @@
|
|||
{% for photo in search_result.photos %}
|
||||
<div class="col">
|
||||
<div class="card h-100">
|
||||
<a href="{{ url_for(request.endpoint, enwp=enwp, flickr=photo.flickr_url, img=photo.medium_url, license=photo.license, flickr_user=photo.realname or photo.username, cat=cat) }}">
|
||||
<a href="{{ url_for(request.endpoint, enwp=enwp, flickr=photo.flickr_url, img=photo.medium_url) }}">
|
||||
<img src="{{ photo.thumb_url }}" alt="{{ photo.title }}" class="card-img-top" style="aspect-ratio: 1; object-fit: cover;">
|
||||
</a>
|
||||
<div class="card-body p-2">
|
||||
<p class="card-text small mb-1 text-truncate" title="{{ photo.realname or photo.username }}">{{ photo.realname or photo.username }}</p>
|
||||
<span class="badge {{ 'bg-success' if photo.license in [4, 5, 7, 8, 9, 10, 14, 15] else 'bg-secondary' }}">{{ photo.license_name }}</span>
|
||||
<span class="badge {{ 'bg-success' if photo.license in [4, 5, 7, 8, 9, 10] else 'bg-secondary' }}">{{ photo.license_name }}</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
|
@ -86,7 +41,7 @@
|
|||
<ul class="pagination justify-content-center">
|
||||
{% if search_result.current_page > 1 %}
|
||||
<li class="page-item">
|
||||
<a class="page-link" href="{{ url_for(request.endpoint, enwp=enwp, page=search_result.current_page - 1, cat=cat) }}">Previous</a>
|
||||
<a class="page-link" href="{{ url_for(request.endpoint, enwp=enwp, page=search_result.current_page - 1) }}">Previous</a>
|
||||
</li>
|
||||
{% else %}
|
||||
<li class="page-item disabled">
|
||||
|
|
@ -99,7 +54,7 @@
|
|||
|
||||
{% if start_page > 1 %}
|
||||
<li class="page-item">
|
||||
<a class="page-link" href="{{ url_for(request.endpoint, enwp=enwp, page=1, cat=cat) }}">1</a>
|
||||
<a class="page-link" href="{{ url_for(request.endpoint, enwp=enwp, page=1) }}">1</a>
|
||||
</li>
|
||||
{% if start_page > 2 %}
|
||||
<li class="page-item disabled"><span class="page-link">...</span></li>
|
||||
|
|
@ -108,7 +63,7 @@
|
|||
|
||||
{% for p in range(start_page, end_page + 1) %}
|
||||
<li class="page-item {{ 'active' if p == search_result.current_page else '' }}">
|
||||
<a class="page-link" href="{{ url_for(request.endpoint, enwp=enwp, page=p, cat=cat) }}">{{ p }}</a>
|
||||
<a class="page-link" href="{{ url_for(request.endpoint, enwp=enwp, page=p) }}">{{ p }}</a>
|
||||
</li>
|
||||
{% endfor %}
|
||||
|
||||
|
|
@ -117,13 +72,13 @@
|
|||
<li class="page-item disabled"><span class="page-link">...</span></li>
|
||||
{% endif %}
|
||||
<li class="page-item">
|
||||
<a class="page-link" href="{{ url_for(request.endpoint, enwp=enwp, page=search_result.total_pages, cat=cat) }}">{{ search_result.total_pages }}</a>
|
||||
<a class="page-link" href="{{ url_for(request.endpoint, enwp=enwp, page=search_result.total_pages) }}">{{ search_result.total_pages }}</a>
|
||||
</li>
|
||||
{% endif %}
|
||||
|
||||
{% if search_result.current_page < search_result.total_pages %}
|
||||
<li class="page-item">
|
||||
<a class="page-link" href="{{ url_for(request.endpoint, enwp=enwp, page=search_result.current_page + 1, cat=cat) }}">Next</a>
|
||||
<a class="page-link" href="{{ url_for(request.endpoint, enwp=enwp, page=search_result.current_page + 1) }}">Next</a>
|
||||
</li>
|
||||
{% else %}
|
||||
<li class="page-item disabled">
|
||||
|
|
@ -140,11 +95,8 @@
|
|||
|
||||
{% elif name and not flickr_url %}
|
||||
|
||||
{% if cat %}
|
||||
<p><a href="{{ url_for('category_search', cat=cat) }}">← Back to category</a></p>
|
||||
{% endif %}
|
||||
<p>Wikipedia article: {{ name }}</p>
|
||||
<div class="alert alert-warning">No photos found. Try a different search term.</div>
|
||||
<p class="text-warning">No photos found. Try a different search term.</p>
|
||||
{# Parentheses are required: a filter binds tighter than "+", so without them only the closing quote would be URL-encoded. #}
<p><a href="https://flickr.com/search/?view_all=1&text={{ ('"' + name + '"') | urlencode }}" target="_blank">Search on Flickr directly</a></p>
|
||||
|
||||
{% endif %}
|
||||
|
|
@ -162,36 +114,7 @@
|
|||
{% else %}
|
||||
<div class="col-12">
|
||||
{% endif %}
|
||||
{% if flickr_user %}
|
||||
<p><strong>User:</strong> <a href="{{ flickr_user_url }}" target="_blank">{{ flickr_user }}</a></p>
|
||||
{% endif %}
|
||||
{% if previous_messages %}
|
||||
<div class="alert alert-info">
|
||||
<strong>Previously contacted:</strong> You have sent {{ previous_messages | length }} message(s) to this user.
|
||||
<ul class="mb-0 mt-2">
|
||||
{% for msg in previous_messages %}
|
||||
<li><a href="{{ msg.url }}" target="_blank">{{ msg.subject }}</a> <small class="text-muted">({{ msg.date }})</small></li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if is_free_license %}
|
||||
<div class="alert alert-success">
|
||||
<strong>Ready to upload!</strong> This photo is licensed under <span class="badge bg-success">{{ license_name }}</span> and can be used on Wikipedia.
|
||||
</div>
|
||||
<p>
|
||||
<a href="https://commons.wikimedia.org/wiki/Special:UploadWizard" class="btn btn-primary" target="_blank">
|
||||
Upload to Wikimedia Commons
|
||||
</a>
|
||||
</p>
|
||||
<p class="text-muted small">
|
||||
After uploading, you can add the image to the Wikipedia article for <a href="{{ wikipedia_url }}" target="_blank">{{ name }}</a>.
|
||||
</p>
|
||||
{% else %}
|
||||
<p><a href="https://www.flickr.com/mail/write/?to={{nsid}}" class="btn btn-primary">Send message on Flickr</a></p>
|
||||
{% if license_name %}
|
||||
<div class="mb-2"><strong>Current license:</strong> <span class="badge {{ 'bg-success' if license_code in [4, 5, 7, 8, 9, 10, 14, 15] else 'bg-warning text-dark' if license_code in [1, 2, 3, 6, 11, 12, 13, 16] else 'bg-secondary' }}">{{ license_name }}</span></div>
|
||||
{% endif %}
|
||||
<div class="mb-2"><strong>Subject:</strong> {{ subject }} <button class="btn btn-sm btn-outline-secondary" id="copy-subject">copy</button></div>
|
||||
<div>
|
||||
<h5>Message <button class="btn btn-sm btn-outline-secondary" id="copy-message">copy</button></h5>
|
||||
|
|
@ -199,10 +122,6 @@
|
|||
<p>{{ p }}</p>
|
||||
{% endfor %}
|
||||
</div>
|
||||
{% endif %}
|
||||
<p class="mt-3">
|
||||
<a href="{{ url_for('start', enwp=enwp, cat=cat) if cat else url_for('start', enwp=enwp) }}">← Back to search results</a>
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
|
|
|||
|
|
@ -1,33 +1,5 @@
|
|||
{# vim:ft=jinja
|
||||
#}
|
||||
{% if is_nonfree_cc %}
|
||||
Hi,
|
||||
|
||||
I'd like to use your photo to illustrate the article about {{ name }} on Wikipedia.
|
||||
|
||||
{{ flickr_url }}
|
||||
|
||||
{{ wiki_part1 }}{{ wiki_part2 | urlencode }}
|
||||
|
||||
I noticed your photo is licensed under {{ license_name }}. Thank you for sharing your work with a Creative Commons license! Unfortunately, Wikipedia can only use images that allow commercial use and derivative works, so the current license restrictions prevent us from using it.
|
||||
|
||||
{% if 'NC' in license_name and 'ND' in license_name %}
|
||||
The "NonCommercial" restriction means the image can't be used on Wikipedia because Wikipedia content may be reused commercially. The "NoDerivs" restriction also prevents use because Wikipedia needs to allow image cropping and other modifications.
|
||||
{% elif 'NC' in license_name %}
|
||||
The "NonCommercial" restriction means the image can't be used on Wikipedia because Wikipedia content may be reused commercially by anyone.
|
||||
{% elif 'ND' in license_name %}
|
||||
The "NoDerivs" restriction means the image can't be used on Wikipedia because Wikipedia needs to allow modifications like cropping or colour correction.
|
||||
{% endif %}
|
||||
|
||||
Would you consider changing the license to Creative Commons Attribution (CC BY) or Attribution-ShareAlike (CC BY-SA)? These licenses still require credit to be given to you, but allow the broader use that Wikipedia requires.
|
||||
|
||||
To adjust the license settings, you can click on the license name on the right-hand side of the photo's page, just underneath the date.
|
||||
|
||||
Thanks,
|
||||
|
||||
Edward
|
||||
edward@4angle.com
|
||||
{% else %}
|
||||
Hi,
|
||||
|
||||
I'd like to use your photo to illustrate the article about {{ name }} on Wikipedia.
|
||||
|
|
@ -46,4 +18,3 @@ Thanks,
|
|||
|
||||
Edward
|
||||
edward@4angle.com
|
||||
{% endif %}
|
||||
|
|
|
|||
|
|
@ -1,174 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Find UploadWizard contributions that are from Flickr and add them to flickr_uploads.json.
|
||||
|
||||
For contributions with comment 'User created page with UploadWizard', queries the
|
||||
Commons API to check if the image source is Flickr (by checking the Credit field).
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
# Input: JSON file whose "contributions" list is scanned by main().
CONTRIBUTIONS_FILE = Path("commons_contributions/contributions.json")
|
||||
# Input and output: JSON list of known Flickr-sourced uploads; main() reads it
# (if it exists) and rewrites it when new uploads are found.
FLICKR_UPLOADS_FILE = Path("commons_contributions/flickr_uploads.json")
|
||||
# Wikimedia Commons API endpoint queried by get_image_metadata().
COMMONS_API = "https://commons.wikimedia.org/w/api.php"
|
||||
# Sent as the User-Agent header on every Commons API request.
USER_AGENT = "FlickrMail/1.0 (https://edwardbetts.com/flickr_mail/; edward@4angle.com)"
|
||||
|
||||
|
||||
def extract_flickr_url_from_credit(credit: str) -> str | None:
|
||||
"""Extract Flickr URL from the Credit field HTML."""
|
||||
pattern = r'https?://(?:www\.)?flickr\.com/photos/[^/"\s<>]+/\d+'
|
||||
match = re.search(pattern, credit)
|
||||
return match.group(0) if match else None
|
||||
|
||||
|
||||
def get_image_metadata(titles: list[str]) -> dict[str, dict]:
    """Fetch Credit/Artist metadata from the Commons API for *titles*.

    Args:
        titles: Page titles (e.g. ``"File:Example.jpg"``).  Any number may be
            given; they are sent in chunks of at most 50, the per-request
            limit noted for the Commons API.

    Returns:
        A dict mapping title to ``{"credit": str, "artist": str}`` (both HTML
        strings, ``""`` when the field is absent).  Each page is keyed by the
        title the API reports and, when the API normalised a requested title,
        also by the spelling that was requested, so callers can look results
        up with the titles they passed in.  Pages without image info are
        omitted.  A chunk whose request fails is reported on stdout and
        skipped, so a single failed request yields ``{}``.
    """
    if not titles:
        return {}

    headers = {"User-Agent": USER_AGENT}
    max_titles_per_request = 50
    results: dict[str, dict] = {}

    for start in range(0, len(titles), max_titles_per_request):
        chunk = titles[start : start + max_titles_per_request]
        params = {
            "action": "query",
            "titles": "|".join(chunk),
            "prop": "imageinfo",
            "iiprop": "extmetadata",
            "format": "json",
        }

        try:
            response = requests.get(
                COMMONS_API, params=params, headers=headers, timeout=30
            )
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, json.JSONDecodeError) as e:
            print(f"API error: {e}")
            continue

        query = data.get("query", {})

        # The API reports title rewrites as {"from": requested, "to": canonical}.
        # Remember them so results can also be keyed by the requested spelling.
        requested_by_canonical: dict[str, list[str]] = {}
        for entry in query.get("normalized", []):
            requested_by_canonical.setdefault(entry.get("to", ""), []).append(
                entry.get("from", "")
            )

        for page in query.get("pages", {}).values():
            imageinfo = page.get("imageinfo", [])
            if not imageinfo:
                continue

            title = page.get("title", "")
            extmeta = imageinfo[0].get("extmetadata", {})
            meta = {
                "credit": extmeta.get("Credit", {}).get("value", ""),
                "artist": extmeta.get("Artist", {}).get("value", ""),
            }
            results[title] = meta
            for requested in requested_by_canonical.get(title, []):
                # Separate copy so mutating one entry never affects its alias.
                results[requested] = dict(meta)

    return results
|
||||
|
||||
|
||||
def clean_artist_name(artist_html: str) -> str:
    """Return the plain-text artist name contained in *artist_html*.

    HTML tags are removed, character references such as ``&amp;`` are decoded,
    and every run of whitespace (including non-breaking spaces produced by
    ``&nbsp;``) is collapsed to a single space with the ends trimmed.
    """
    # Imported here so the function does not depend on the module's import block.
    import html

    # Strip tags first: decoding first could turn "&lt;b&gt;" into a fake tag
    # that the regex would then delete.
    text = re.sub(r"<[^>]+>", "", artist_html)
    text = html.unescape(text)
    return " ".join(text.split())
|
||||
|
||||
|
||||
def _normalize_flickr_url(url: str) -> str:
    """Return *url* without scheme, "www." and trailing slash, for comparison."""
    return (
        url.replace("https://", "")
        .replace("http://", "")
        .replace("www.", "")
        .rstrip("/")
    )


def main():
    """Add Flickr-sourced UploadWizard uploads to FLICKR_UPLOADS_FILE.

    Reads CONTRIBUTIONS_FILE, keeps ``File:`` pages whose edit comment is
    exactly "User created page with UploadWizard", looks up their Commons
    metadata in batches of 50, and records every upload whose Credit field
    contains a Flickr photo URL that is not already known.  When anything new
    is found, the merged list is written back sorted newest first.
    """
    print("Loading contributions...")
    # JSON files are read and written as UTF-8 rather than the locale default.
    with open(CONTRIBUTIONS_FILE, encoding="utf-8") as f:
        data = json.load(f)

    contributions = data.get("contributions", [])

    # Known uploads: both the stored and the normalised URL are remembered so
    # either form matches later.
    existing_flickr_urls = set()
    existing_uploads = []
    if FLICKR_UPLOADS_FILE.exists():
        with open(FLICKR_UPLOADS_FILE, encoding="utf-8") as f:
            existing_uploads = json.load(f)
        for u in existing_uploads:
            url = u.get("flickr_url", "")
            existing_flickr_urls.add(url)
            existing_flickr_urls.add(_normalize_flickr_url(url))

    print(f"Existing uploads: {len(existing_uploads)}")
    print(f"Existing flickr URLs: {len(existing_flickr_urls)}")

    # UploadWizard page creations only, restricted to File: pages.
    upload_wizard_contributions = [
        c
        for c in contributions
        if c.get("comment", "") == "User created page with UploadWizard"
        and c.get("title", "").startswith("File:")
    ]

    print(f"UploadWizard contributions to check: {len(upload_wizard_contributions)}")

    new_uploads = []
    batch_size = 50
    total_batches = (len(upload_wizard_contributions) + batch_size - 1) // batch_size

    for i in range(0, len(upload_wizard_contributions), batch_size):
        batch = upload_wizard_contributions[i : i + batch_size]
        titles = [c["title"] for c in batch]

        print(f"Processing batch {i // batch_size + 1}/{total_batches}...")

        metadata = get_image_metadata(titles)

        for c in batch:
            title = c["title"]
            meta = metadata.get(title, {})
            credit = meta.get("credit", "")
            artist = meta.get("artist", "")

            flickr_url = extract_flickr_url_from_credit(credit)
            if not flickr_url:
                continue

            # Skip URLs that are already recorded (in either form).
            normalized = _normalize_flickr_url(flickr_url)
            if normalized in existing_flickr_urls or flickr_url in existing_flickr_urls:
                continue

            creator = clean_artist_name(artist) if artist else None

            new_uploads.append(
                {
                    "pageid": c["pageid"],
                    "revid": c["revid"],
                    "title": title,
                    "timestamp": c["timestamp"],
                    "flickr_url": flickr_url,
                    "creator": creator,
                }
            )
            # Prevents the same photo being added twice within this run.
            existing_flickr_urls.add(normalized)
            print(f"  Found: {title[:50]} -> {flickr_url}")

        # Rate limiting: pause between batches, but not after the last one.
        if i + batch_size < len(upload_wizard_contributions):
            time.sleep(0.5)

    print(f"\nFound {len(new_uploads)} new Flickr uploads")

    if new_uploads:
        # Merge and sort by timestamp (newest first).
        all_uploads = existing_uploads + new_uploads
        all_uploads.sort(key=lambda x: x.get("timestamp", ""), reverse=True)

        with open(FLICKR_UPLOADS_FILE, "w", encoding="utf-8") as f:
            json.dump(all_uploads, f, indent=2)

        print(f"Saved {len(all_uploads)} total uploads to {FLICKR_UPLOADS_FILE}")


if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue