Handle modern UploadWizard comments when indexing Flickr uploads

This commit is contained in:
Edward Betts 2026-02-07 13:41:27 +00:00
parent 4f67960fe1
commit 2819652afd

View file

@ -2,8 +2,12 @@
"""
Find UploadWizard contributions that are from Flickr and add them to the database.
For contributions with comment 'User created page with UploadWizard', queries the
Commons API to check if the image source is Flickr (by checking the Credit field).
Supports both UploadWizard comment styles:
- "User created page with UploadWizard" (older)
- "Uploaded a work by ... with UploadWizard" (newer, often includes Flickr URL)
If a Flickr URL is not present in the contribution comment, queries Commons API
to check if the image source is Flickr (by checking the Credit field).
"""
import json
@ -27,6 +31,13 @@ def extract_flickr_url_from_credit(credit: str) -> str | None:
return match.group(0) if match else None
def extract_flickr_url_from_comment(comment: str) -> str | None:
"""Extract Flickr URL directly from a contribution comment."""
pattern = r'https?://(?:www\.)?flickr\.com/photos/[^/\s]+/\d+'
match = re.search(pattern, comment or "")
return match.group(0) if match else None
def get_image_metadata(titles: list[str]) -> dict[str, dict]:
"""Fetch image metadata from Commons API for multiple titles."""
if not titles:
@ -97,10 +108,12 @@ def main():
)
url_to_message = {msg.normalized_flickr_url: msg for msg in sent_messages}
# Find UploadWizard contributions (page creations only)
# Find UploadWizard file uploads.
# Old format: "User created page with UploadWizard"
# New format: "Uploaded a work by ... with UploadWizard"
upload_wizard = (
session.query(Contribution)
.filter(Contribution.comment == "User created page with UploadWizard")
.filter(Contribution.comment.contains("UploadWizard"))
.filter(Contribution.title.startswith("File:"))
.all()
)
@ -127,6 +140,9 @@ def main():
credit = meta.get("credit", "")
artist = meta.get("artist", "")
# Prefer URL directly in comment; fall back to extmetadata Credit.
flickr_url = extract_flickr_url_from_comment(c.comment or "")
if not flickr_url:
flickr_url = extract_flickr_url_from_credit(credit)
if not flickr_url:
continue