Handle modern UploadWizard comments when indexing Flickr uploads
This commit is contained in:
parent
4f67960fe1
commit
2819652afd
1 changed file with 21 additions and 5 deletions
|
|
@ -2,8 +2,12 @@
|
||||||
"""
|
"""
|
||||||
Find UploadWizard contributions that are from Flickr and add them to the database.
|
Find UploadWizard contributions that are from Flickr and add them to the database.
|
||||||
|
|
||||||
For contributions with comment 'User created page with UploadWizard', queries the
|
Supports both UploadWizard comment styles:
|
||||||
Commons API to check if the image source is Flickr (by checking the Credit field).
|
- "User created page with UploadWizard" (older)
|
||||||
|
- "Uploaded a work by ... with UploadWizard" (newer, often includes Flickr URL)
|
||||||
|
|
||||||
|
If a Flickr URL is not present in the contribution comment, queries Commons API
|
||||||
|
to check if the image source is Flickr (by checking the Credit field).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
|
@ -27,6 +31,13 @@ def extract_flickr_url_from_credit(credit: str) -> str | None:
|
||||||
return match.group(0) if match else None
|
return match.group(0) if match else None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_flickr_url_from_comment(comment: str) -> str | None:
|
||||||
|
"""Extract Flickr URL directly from a contribution comment."""
|
||||||
|
pattern = r'https?://(?:www\.)?flickr\.com/photos/[^/\s]+/\d+'
|
||||||
|
match = re.search(pattern, comment or "")
|
||||||
|
return match.group(0) if match else None
|
||||||
|
|
||||||
|
|
||||||
def get_image_metadata(titles: list[str]) -> dict[str, dict]:
|
def get_image_metadata(titles: list[str]) -> dict[str, dict]:
|
||||||
"""Fetch image metadata from Commons API for multiple titles."""
|
"""Fetch image metadata from Commons API for multiple titles."""
|
||||||
if not titles:
|
if not titles:
|
||||||
|
|
@ -97,10 +108,12 @@ def main():
|
||||||
)
|
)
|
||||||
url_to_message = {msg.normalized_flickr_url: msg for msg in sent_messages}
|
url_to_message = {msg.normalized_flickr_url: msg for msg in sent_messages}
|
||||||
|
|
||||||
# Find UploadWizard contributions (page creations only)
|
# Find UploadWizard file uploads.
|
||||||
|
# Old format: "User created page with UploadWizard"
|
||||||
|
# New format: "Uploaded a work by ... with UploadWizard"
|
||||||
upload_wizard = (
|
upload_wizard = (
|
||||||
session.query(Contribution)
|
session.query(Contribution)
|
||||||
.filter(Contribution.comment == "User created page with UploadWizard")
|
.filter(Contribution.comment.contains("UploadWizard"))
|
||||||
.filter(Contribution.title.startswith("File:"))
|
.filter(Contribution.title.startswith("File:"))
|
||||||
.all()
|
.all()
|
||||||
)
|
)
|
||||||
|
|
@ -127,7 +140,10 @@ def main():
|
||||||
credit = meta.get("credit", "")
|
credit = meta.get("credit", "")
|
||||||
artist = meta.get("artist", "")
|
artist = meta.get("artist", "")
|
||||||
|
|
||||||
flickr_url = extract_flickr_url_from_credit(credit)
|
# Prefer URL directly in comment; fall back to extmetadata Credit.
|
||||||
|
flickr_url = extract_flickr_url_from_comment(c.comment or "")
|
||||||
|
if not flickr_url:
|
||||||
|
flickr_url = extract_flickr_url_from_credit(credit)
|
||||||
if not flickr_url:
|
if not flickr_url:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue