Extract flickr_mail package with Mapped models and shared utilities
Move from JSON file storage to SQLite database using SQLAlchemy with Mapped type hints. Deduplicate URL utility functions into shared flickr_mail package. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
ac1b01ea68
commit
9f0fb01878
11 changed files with 1129 additions and 300 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -2,3 +2,4 @@
|
||||||
__pycache__
|
__pycache__
|
||||||
commons_contributions/thumbnail_cache.json
|
commons_contributions/thumbnail_cache.json
|
||||||
commons_contributions/sent_mail_index.json
|
commons_contributions/sent_mail_index.json
|
||||||
|
flickr_mail.db
|
||||||
|
|
|
||||||
147
download_commons_contributions.py
Executable file
147
download_commons_contributions.py
Executable file
|
|
@ -0,0 +1,147 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Download Wikimedia Commons contributions for a user."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from flickr_mail.database import init_db, get_session
|
||||||
|
from flickr_mail.models import Contribution
|
||||||
|
|
||||||
|
|
||||||
|
API_URL = "https://commons.wikimedia.org/w/api.php"
|
||||||
|
USERNAME = "Edward"
|
||||||
|
|
||||||
|
# Identify ourselves properly to Wikimedia
|
||||||
|
USER_AGENT = "CommonsContributionsDownloader/0.1 (edward@4angle.com)"
|
||||||
|
|
||||||
|
SESSION = requests.Session()
|
||||||
|
SESSION.headers.update({"User-Agent": USER_AGENT})
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_contributions(
    continue_token: str | None = None,
) -> tuple[list[dict], str | None]:
    """Fetch one batch of contributions from the MediaWiki API.

    Args:
        continue_token: opaque ``uccontinue`` cursor from a previous call,
            or None to start from the newest contributions.

    Returns:
        A ``(contributions, next_token)`` pair; ``next_token`` is None
        when the API reports no further pages.
    """
    query = {
        "action": "query",
        "list": "usercontribs",
        "ucuser": USERNAME,
        "uclimit": "500",
        "ucprop": "ids|title|timestamp|comment|size|sizediff|flags|tags",
        "format": "json",
    }
    if continue_token:
        query["uccontinue"] = continue_token

    resp = SESSION.get(API_URL, params=query)
    resp.raise_for_status()
    payload = resp.json()

    batch = payload.get("query", {}).get("usercontribs", [])
    # Continuation token is only present while more results are available.
    next_token = payload.get("continue", {}).get("uccontinue")
    return batch, next_token
|
||||||
|
|
||||||
|
|
||||||
|
def upsert_contribution(session, c: dict) -> None:
    """Insert a contribution row keyed by revid, skipping duplicates.

    Despite the name, existing rows are never modified: if a row with the
    same revid is already present the function returns without touching it.
    """
    already_stored = (
        session.query(Contribution).filter_by(revid=c["revid"]).first()
    )
    if already_stored is not None:
        return  # Already have this revision

    # All scalar API fields map 1:1 onto model columns.
    scalar_fields = (
        "userid", "user", "pageid", "revid", "parentid", "ns", "title",
        "timestamp", "minor", "top", "comment", "size", "sizediff",
    )
    values = {name: c.get(name) for name in scalar_fields}
    # tags arrives as a list; the column stores it as a JSON string.
    values["tags"] = json.dumps(c.get("tags", []))
    session.add(Contribution(**values))
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Sync Wikimedia Commons contributions into the local database.

    Pages through the usercontribs API and inserts rows until either the
    API is exhausted or an entire batch already exists locally — at which
    point earlier runs are assumed to cover the remainder.
    """
    init_db()
    session = get_session()

    try:
        existing_count = session.query(Contribution).count()

        # Get the latest timestamp to know where to resume from
        latest = (
            session.query(Contribution)
            .order_by(Contribution.timestamp.desc())
            .first()
        )

        if existing_count > 0 and latest:
            print(f"Database has {existing_count} contributions")
            print(f"Latest: {latest.timestamp}")
            print("Fetching new contributions...")
        else:
            print(f"Downloading contributions for user: {USERNAME}")

        batch_num = 0
        new_count = 0
        continue_token = None  # opaque API continuation cursor

        while True:
            batch_num += 1
            print(f" Fetching batch {batch_num}...", end=" ", flush=True)

            contributions, continue_token = fetch_contributions(continue_token)

            if not contributions:
                print("no results")
                break

            batch_new = 0
            for c in contributions:
                # Stop if we've reached contributions we already have
                existing = session.query(Contribution).filter_by(revid=c["revid"]).first()
                if existing:
                    continue
                upsert_contribution(session, c)
                batch_new += 1

            new_count += batch_new
            print(f"got {len(contributions)}, {batch_new} new")

            # Commit per batch so progress survives a crash mid-run.
            session.commit()

            if batch_new == 0:
                # All contributions in this batch already exist, we're caught up
                print(" Caught up with existing data")
                break

            if not continue_token:
                break

            # Be polite to the API
            time.sleep(0.5)

        total = session.query(Contribution).count()
        print(f"\nDone! {new_count} new contributions, {total} total in database")

    except Exception:
        session.rollback()
        raise
    finally:
        session.close()


if __name__ == "__main__":
    main()
|
||||||
246
download_sent_mail.py
Executable file
246
download_sent_mail.py
Executable file
|
|
@ -0,0 +1,246 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Download sent FlickrMail messages for backup."""
|
||||||
|
|
||||||
|
import time
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from flickr_mail.database import init_db, get_session
|
||||||
|
from flickr_mail.models import SentMessage
|
||||||
|
from flickr_mail.url_utils import (
|
||||||
|
creator_profile_from_flickr_url,
|
||||||
|
extract_urls_from_message,
|
||||||
|
normalize_flickr_url,
|
||||||
|
)
|
||||||
|
|
||||||
|
BASE_URL = "https://www.flickr.com"
|
||||||
|
SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}"
|
||||||
|
MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}"
|
||||||
|
|
||||||
|
HEADERS = {
|
||||||
|
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0",
|
||||||
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||||
|
"Accept-Language": "en-GB,en;q=0.9",
|
||||||
|
"Accept-Encoding": "gzip, deflate, br, zstd",
|
||||||
|
"DNT": "1",
|
||||||
|
"Connection": "keep-alive",
|
||||||
|
"Upgrade-Insecure-Requests": "1",
|
||||||
|
"Sec-Fetch-Dest": "document",
|
||||||
|
"Sec-Fetch-Mode": "navigate",
|
||||||
|
"Sec-Fetch-Site": "same-origin",
|
||||||
|
"Sec-Fetch-User": "?1",
|
||||||
|
"Priority": "u=0, i",
|
||||||
|
}
|
||||||
|
|
||||||
|
COOKIES_STR = """ccc=%7B%22needsConsent%22%3Atrue%2C%22managed%22%3A0%2C%22changed%22%3A0%2C%22info%22%3A%7B%22cookieBlock%22%3A%7B%22level%22%3A2%2C%22blockRan%22%3A1%7D%7D%7D; _sp_ses.df80=*; _sp_id.df80=968931de-089d-4576-b729-6662c2c13a65.1770187027.1.1770187129..adf2374b-b85c-4899-afb7-63c2203d0c44..9422de57-9cdf-49c9-ac54-183eaa1ec457.1770187027101.24; TAsessionID=7f373c97-e9f8-46cb-bc1a-cb4f164ce46b|NEW; notice_behavior=expressed,eu; usprivacy=1---; acstring=3~550.1942.3126.3005.3077.1329.196.1725.1092; euconsent-v2=CQfGXgAQfGXgAAvACDENCQFsAP_gAEPgAAAALktB9G5cSSFBYCJVYbtEYAQDwFhg4oAhAgABEwAATBoAoIwGBGAoIAiAICACAAAAIARAIAEECAAAQAAAIIABAAAMAEAAIAACIAAACAABAgAACEAIAAggWAAAAEBEAFQAgAAAQBIACFAAAgABAUABAAAAAACAAQAAACAgQAAAAAAAAAAAkAhAAAAAAAAAABAMAAABIAAAAAAAAAAAAAAAAAAABAAAAICBAAAAQAAAAAAAAAAAAAAAAAAAAgqY0H0blxJIUFgIFVhu0QgBBPAWADigCEAAAEDAABMGgCgjAIUYCAgSIAgIAAAAAAgBEAgAQAIAABAAAAAgAEAAAwAQAAgAAAAAAAAAAECAAAAQAgACCBYAAAAQEQAVACBAABAEgAIUAAAAAEBQAEAAAAAAIABAAAAICBAAAAAAAAAAACQCEAAAAAAAAAAEAwBAAEgAAAAAAAAAAAAAAAAAAAEABAAgIEAAABAA.YAAAAAAAAAAA.ILktB9G5cSSFBYCJVYbtEYAQTwFhg4oAhAgABEwAATBoAoIwGFGAoIEiAICACAAAAIARAIAEECAAAQAAAIIABAAAMAEAAIAACIAAACAABAgAACEAIAAggWAAAAEBEAFQAgQAAQBIACFAAAgABAUABAAAAAACAAQAAACAgQAAAAAAAAAAAkAhAAAAAAAAAABAMAQABIAAAAAAAAAAAAAAAAAAABAAQAICBAAAAQAAAAAAAAAAAAAAAAAAAAgA; notice_preferences=2:; notice_gdpr_prefs=0,1,2:; cmapi_gtm_bl=; cmapi_cookie_privacy=permit 1,2,3; AMCV_48E815355BFE96970A495CD0%40AdobeOrg=281789898%7CMCMID%7C44859851125632937290373504988866174366%7CMCOPTOUT-1770194232s%7CNONE%7CvVersion%7C4.1.0; AMCVS_48E815355BFE96970A495CD0%40AdobeOrg=1; xb=646693; localization=en-us%3Buk%3Bgb; flrbp=1770187037-cfbf3914859af9ef68992c8389162e65e81c86c4; flrbgrp=1770187037-8e700fa7d73b4f2d43550f40513e7c6f507fd20f; flrbgdrp=1770187037-9af21cc74000b5f3f0943243608b4284d5f60ffd; flrbgmrp=1770187037-53f7bfff110731954be6bdfb2f587d59a8305670; flrbrst=1770187037-440e42fcee9b4e8e81ba8bc3eb3d0fc8b62e7083; 
flrtags=1770187037-7b50035cb956b9216a2f3372f498f7008d8e26a8; flrbrp=1770187037-c0195dc99caa020d4e32b39556131add862f26a0; flrb=34; session_id=2693fb01-87a0-42b1-a426-74642807b534; cookie_session=834645%3A29f2a9722d8bac88553ea1baf7ea11b4; cookie_accid=834645; cookie_epass=29f2a9722d8bac88553ea1baf7ea11b4; sa=1775371036%3A79962317%40N00%3A8fb60f4760b4840f37af3ebc90a8cb57; vp=2075%2C1177%2C1%2C0; flrbfd=1770187037-88a4e436729c9c5551794483fbd9c80e9dac2354; flrbpap=1770187037-18adaacf3a389df4a7bdc05cd471e492c54ef841; liqpw=2075; liqph=672"""
|
||||||
|
|
||||||
|
|
||||||
|
def parse_cookies(cookie_str: str) -> dict[str, str]:
    """Parse a Cookie header string into a name -> value dict.

    Splits on ``;`` and strips surrounding whitespace from each pair.
    Fixes a bug in the original ``split("; ")`` approach: COOKIES_STR is a
    multi-line triple-quoted string, so the pair following the line break
    ended up with a leading newline embedded in its cookie *name*.
    Items without ``=`` are ignored; values keep any embedded ``=``.
    """
    cookies: dict[str, str] = {}
    for item in cookie_str.split(";"):
        item = item.strip()
        if "=" in item:
            key, value = item.split("=", 1)
            cookies[key.strip()] = value
    return cookies
|
||||||
|
|
||||||
|
|
||||||
|
def create_session() -> requests.Session:
    """Build a requests session preloaded with browser headers and cookies.

    NOTE(review): COOKIES_STR embeds live Flickr session credentials in
    source — consider loading them from an untracked file or environment.
    """
    http = requests.Session()
    http.headers.update(HEADERS)
    http.cookies.update(parse_cookies(COOKIES_STR))
    return http
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_page(session: requests.Session, url: str) -> BeautifulSoup:
    """GET *url* via *session* and return the parsed HTML document.

    Raises requests.HTTPError for non-2xx responses.
    """
    resp = session.get(url)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")
|
||||||
|
|
||||||
|
|
||||||
|
def extract_messages_from_list_page(soup: BeautifulSoup) -> list[dict]:
    """Extract message metadata from one sent-mail listing page.

    Each ``<tr class="message_row">`` yields a dict containing
    ``message_id`` plus (when present) ``subject``, ``url``, ``recipient``
    and ``date``. Rows whose id attribute does not encode a message id
    are dropped.
    """
    results: list[dict] = []

    for row in soup.select("tr.message_row"):
        # Rows look like <tr class="message_row sent" id="message_row_XXXX">
        row_id = row.get("id", "")
        if not row_id.startswith("message_row_"):
            continue
        entry: dict = {"message_id": row_id.replace("message_row_", "")}

        # Subject + permalink live in the subject cell's anchor.
        subj_cell = row.select_one("td.subj")
        link = subj_cell.find("a") if subj_cell else None
        if link:
            entry["subject"] = link.get_text(strip=True)
            entry["url"] = BASE_URL + link["href"]

        # Recipient is in td.fromto
        fromto_cell = row.select_one("td.fromto")
        if fromto_cell:
            entry["recipient"] = fromto_cell.get_text(strip=True)

        # Date is in td.date
        date_cell = row.select_one("td.date")
        if date_cell:
            entry["date"] = date_cell.get_text(strip=True)

        results.append(entry)

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def extract_message_content(soup: BeautifulSoup) -> dict:
    """Extract full message content from a message page.

    Returns a dict with any of "recipient", "subject", "body" and
    "body_html" that could be parsed; returns an empty dict when the
    expected .ThinCase/table structure is missing.
    """
    content = {}

    # Find the ThinCase div containing the message
    thin_case = soup.select_one(".ThinCase")
    if not thin_case:
        return content

    # Find the table with message content
    table = thin_case.find("table")
    if not table:
        return content

    rows = table.find_all("tr", recursive=False)

    # Row 0: To: <recipient>
    # Row 1: Subject: <subject>
    # Row 2: <empty> <body>
    for row in rows:
        cells = row.find_all("td", recursive=False)
        if len(cells) >= 2:
            header_cell = cells[0]
            value_cell = cells[1]

            header = header_cell.get_text(strip=True).lower()

            if header == "to:":
                # Get recipient username
                username = value_cell.select_one(".username")
                if username:
                    content["recipient"] = username.get_text(strip=True)

            elif header == "subject:":
                # Get subject from h3
                h3 = value_cell.find("h3")
                if h3:
                    content["subject"] = h3.get_text(strip=True)

            elif header == "":
                # This is the message body row (empty header cell)
                # Get the content but exclude the delete form
                form = value_cell.find("form")
                if form:
                    form.decompose()  # mutates the soup in place

                content["body"] = value_cell.get_text(separator="\n", strip=True)
                content["body_html"] = str(value_cell)
                break  # Body found, stop processing

    return content
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Scrape new sent FlickrMail messages into the local database.

    Walks every sent-mail listing page to collect message ids not yet
    stored, then downloads each message page, extracts its content and
    embedded URLs, and commits one SentMessage row per message.
    """
    init_db()
    db_session = get_session()

    try:
        existing_ids = {
            r[0] for r in db_session.query(SentMessage.message_id).all()
        }
        print(f"Database has {len(existing_ids)} messages")

        http_session = create_session()

        # Scrape all pages to find new messages
        # NOTE(review): page count is hard-coded — update when the sent
        # mailbox grows past 29 pages, or derive it from the pagination UI.
        total_pages = 29
        new_messages: list[dict] = []

        print("Fetching message list from all pages...")
        for page in range(1, total_pages + 1):
            url = SENT_MAIL_URL.format(page=page)
            print(f" Fetching page {page}/{total_pages}...")

            try:
                soup = fetch_page(http_session, url)
                page_messages = extract_messages_from_list_page(soup)

                for msg in page_messages:
                    if msg["message_id"] not in existing_ids:
                        new_messages.append(msg)

                time.sleep(1)  # Be polite to the server

            except Exception as e:
                # Best-effort: one bad page should not abort the whole run.
                print(f" Error fetching page {page}: {e}")
                continue

        print(f"Found {len(new_messages)} new messages to download")

        # Download individual messages
        for i, msg in enumerate(new_messages, 1):
            msg_id = msg["message_id"]
            # Fall back to the canonical URL when the listing had no link.
            url = msg.get("url") or MESSAGE_URL.format(message_id=msg_id)

            print(f" [{i}/{len(new_messages)}] Downloading message {msg_id}...")

            try:
                soup = fetch_page(http_session, url)
                content = extract_message_content(soup)

                # Merge with metadata
                full_msg = {**msg, **content}

                body = full_msg.get("body", "")
                flickr_url, wikipedia_url = extract_urls_from_message(body)
                normalized = normalize_flickr_url(flickr_url) if flickr_url else ""
                creator_profile = creator_profile_from_flickr_url(flickr_url) if flickr_url else ""

                db_session.add(SentMessage(
                    message_id=msg_id,
                    subject=full_msg.get("subject", ""),
                    url=full_msg.get("url", ""),
                    recipient=full_msg.get("recipient", ""),
                    date=full_msg.get("date", ""),
                    body=body,
                    body_html=full_msg.get("body_html", ""),
                    flickr_url=flickr_url,
                    normalized_flickr_url=normalized,
                    wikipedia_url=wikipedia_url,
                    creator_profile_url=creator_profile,
                ))
                # Commit per message so progress survives individual errors.
                db_session.commit()

                time.sleep(1)  # Be polite

            except Exception as e:
                db_session.rollback()
                print(f" Error downloading message {msg_id}: {e}")
                continue

        total = db_session.query(SentMessage).count()
        print(f"Done! {total} messages in database")

    except Exception:
        db_session.rollback()
        raise
    finally:
        db_session.close()


if __name__ == "__main__":
    main()
|
||||||
158
extract_flickr_uploads.py
Normal file
158
extract_flickr_uploads.py
Normal file
|
|
@ -0,0 +1,158 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Extract Flickr uploads from Wikimedia Commons contributions.
|
||||||
|
|
||||||
|
Filters contributions where the comment contains a flickr.com URL and extracts:
|
||||||
|
- pageid, revid, title, timestamp
|
||||||
|
- flickr_url: the Flickr photo URL
|
||||||
|
- creator: the photographer/author name
|
||||||
|
|
||||||
|
Links uploads to sent messages via normalized Flickr URL matching.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from flickr_mail.database import init_db, get_session
|
||||||
|
from flickr_mail.models import Contribution, FlickrUpload, SentMessage
|
||||||
|
from flickr_mail.url_utils import normalize_flickr_url
|
||||||
|
|
||||||
|
|
||||||
|
def extract_flickr_url(comment: str) -> str | None:
|
||||||
|
"""Extract the Flickr photo URL from a comment."""
|
||||||
|
# Match URLs like https://www.flickr.com/photos/user/12345/ or http://www.flickr.com/photos/user/12345/
|
||||||
|
# Also handles [http://www.flickr.com/photos/user/12345/ title] wiki markup
|
||||||
|
patterns = [
|
||||||
|
# Plain URL (modern format)
|
||||||
|
r'(https?://(?:www\.)?flickr\.com/photos/[^/\s\]]+/\d+)/?',
|
||||||
|
# URL in wiki markup [url title]
|
||||||
|
r'\[(https?://(?:www\.)?flickr\.com/photos/[^/\s\]]+/\d+)/?[^\]]*\]',
|
||||||
|
]
|
||||||
|
|
||||||
|
for pattern in patterns:
|
||||||
|
match = re.search(pattern, comment)
|
||||||
|
if match:
|
||||||
|
return match.group(1)
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_creator(comment: str) -> str | None:
|
||||||
|
"""Extract the creator/author name from a comment."""
|
||||||
|
# Modern format: "Uploaded a work by {creator} from https://..."
|
||||||
|
modern_match = re.search(r'Uploaded a work by (.+?) from https?://', comment)
|
||||||
|
if modern_match:
|
||||||
|
return modern_match.group(1).strip()
|
||||||
|
|
||||||
|
# Old {{Information}} format: |Author=[http://www.flickr.com/people/... AuthorName] or |Author=[http://... AuthorName] from Location
|
||||||
|
# The author name comes after the URL, before ] or "from"
|
||||||
|
author_match = re.search(r'\|Author=\[https?://[^\s\]]+ ([^\]]+)\]', comment)
|
||||||
|
if author_match:
|
||||||
|
author = author_match.group(1).strip()
|
||||||
|
# Remove trailing location like "from Toronto, Canada"
|
||||||
|
author = re.sub(r'\s+from\s+.+$', '', author)
|
||||||
|
return author
|
||||||
|
|
||||||
|
# Handle truncated comments where Author field is cut off
|
||||||
|
# Pattern: |Author=[http://...flickr.com/people/... AuthorName (may be incomplete)
|
||||||
|
truncated_match = re.search(r'\|Author=\[https?://[^\s\]]+ ([^\]\|]+)$', comment)
|
||||||
|
if truncated_match:
|
||||||
|
author = truncated_match.group(1).strip()
|
||||||
|
if author:
|
||||||
|
return author
|
||||||
|
|
||||||
|
# Sometimes Author field is just plain text without URL
|
||||||
|
author_plain = re.search(r'\|Author=([^\|\}\[\]]+?)(?:\r?\n|\|)', comment)
|
||||||
|
if author_plain:
|
||||||
|
author = author_plain.group(1).strip()
|
||||||
|
# Skip if it looks like a wiki user link
|
||||||
|
if not author.startswith('[[User:') and author:
|
||||||
|
return author
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Process contributions and extract Flickr uploads.

    Scans stored Commons contributions whose edit summary mentions
    flickr.com, extracts the photo URL and creator name, and links each
    upload to a matching sent message via the normalized photo URL.
    """
    init_db()
    session = get_session()

    try:
        # Get existing upload revids to avoid duplicates
        existing_revids = {
            r[0] for r in session.query(FlickrUpload.revid).all()
        }

        # Build sent message index: normalized_flickr_url -> message
        # "Re:" subjects are excluded — presumably so reply threads don't
        # shadow the original outgoing message for a given photo URL.
        sent_messages = (
            session.query(SentMessage)
            .filter(SentMessage.normalized_flickr_url != "")
            .filter(~SentMessage.subject.startswith("Re:"))
            .all()
        )
        url_to_message = {msg.normalized_flickr_url: msg for msg in sent_messages}
        print(f"Sent message index: {len(url_to_message)} entries")

        # Query contributions with flickr.com in comment
        contributions = (
            session.query(Contribution)
            .filter(Contribution.comment.ilike("%flickr.com%"))
            .all()
        )

        print(f"Found {len(contributions)} contributions mentioning flickr.com")

        new_count = 0
        for contrib in contributions:
            if contrib.revid in existing_revids:
                continue

            flickr_url = extract_flickr_url(contrib.comment or "")
            if not flickr_url:
                continue

            creator = extract_creator(contrib.comment or "")
            normalized = normalize_flickr_url(flickr_url)

            # Look up sent message for FK linking
            msg = url_to_message.get(normalized) if normalized else None

            session.add(FlickrUpload(
                pageid=contrib.pageid,
                revid=contrib.revid,
                title=contrib.title,
                timestamp=contrib.timestamp,
                flickr_url=flickr_url,
                normalized_flickr_url=normalized,
                creator=creator,
                wikipedia_url=msg.wikipedia_url if msg else "",
                creator_profile_url=msg.creator_profile_url if msg else "",
                sent_message_id=msg.message_id if msg else None,
            ))
            new_count += 1

        session.commit()

        total = session.query(FlickrUpload).count()
        linked = session.query(FlickrUpload).filter(
            FlickrUpload.sent_message_id.isnot(None)
        ).count()

        print(f"Extracted {new_count} new Flickr uploads")
        print(f"Total: {total} uploads, {linked} linked to sent messages")

        # Show some stats
        with_creator = session.query(FlickrUpload).filter(
            FlickrUpload.creator.isnot(None)
        ).count()
        print(f" - {with_creator} with creator identified")
        print(f" - {total - with_creator} without creator")

    except Exception:
        session.rollback()
        raise
    finally:
        session.close()


if __name__ == '__main__':
    main()
|
||||||
0
flickr_mail/__init__.py
Normal file
0
flickr_mail/__init__.py
Normal file
31
flickr_mail/database.py
Normal file
31
flickr_mail/database.py
Normal file
|
|
@ -0,0 +1,31 @@
|
||||||
|
"""Database engine and session factory for flickr-mail."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from sqlalchemy import create_engine, event
|
||||||
|
from sqlalchemy.orm import Session, sessionmaker
|
||||||
|
|
||||||
|
from flickr_mail.models import Base
|
||||||
|
|
||||||
|
DB_PATH = Path(__file__).parent.parent / "flickr_mail.db"
|
||||||
|
|
||||||
|
engine = create_engine(f"sqlite:///{DB_PATH}")
|
||||||
|
SessionLocal = sessionmaker(bind=engine)
|
||||||
|
|
||||||
|
|
||||||
|
@event.listens_for(engine, "connect")
def set_sqlite_pragma(dbapi_connection, connection_record):
    """Enable WAL mode for concurrent read/write access.

    Runs on every new DBAPI connection; WAL lets readers proceed while a
    writer is active, which matters because several scripts in this repo
    open the same database file.
    """
    cursor = dbapi_connection.cursor()
    cursor.execute("PRAGMA journal_mode=WAL")
    cursor.close()
|
||||||
|
|
||||||
|
|
||||||
|
def init_db() -> None:
    """Create all tables declared on Base.metadata (no-op if they exist)."""
    Base.metadata.create_all(engine)
|
||||||
|
|
||||||
|
|
||||||
|
def get_session() -> Session:
    """Create a new database session bound to the module-level engine.

    The caller is responsible for commit/rollback and close.
    """
    return SessionLocal()
|
||||||
93
flickr_mail/models.py
Normal file
93
flickr_mail/models.py
Normal file
|
|
@ -0,0 +1,93 @@
|
||||||
|
"""SQLAlchemy models for flickr-mail."""
|
||||||
|
|
||||||
|
from sqlalchemy import ForeignKey, Index, Text
|
||||||
|
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
|
||||||
|
|
||||||
|
|
||||||
|
class Base(DeclarativeBase):
    """Declarative base shared by all flickr-mail ORM models."""
    pass
|
||||||
|
|
||||||
|
|
||||||
|
class Contribution(Base):
    """One Wikimedia Commons edit, as returned by list=usercontribs."""

    __tablename__ = "contributions"

    id: Mapped[int] = mapped_column(primary_key=True)  # surrogate key
    userid: Mapped[int | None]
    user: Mapped[str | None]
    pageid: Mapped[int | None]
    # Natural key used for dedup by the downloader/extractor scripts.
    revid: Mapped[int | None] = mapped_column(unique=True)
    parentid: Mapped[int | None]
    ns: Mapped[int | None]  # namespace number
    title: Mapped[str | None]
    timestamp: Mapped[str | None]  # stored as the API's string form
    minor: Mapped[str | None]
    top: Mapped[str | None]
    comment: Mapped[str | None] = mapped_column(Text)  # edit summary
    size: Mapped[int | None]
    sizediff: Mapped[int | None]
    tags: Mapped[str | None] = mapped_column(Text)  # JSON array stored as text

    __table_args__ = (
        Index("ix_contributions_timestamp", "timestamp"),
        Index("ix_contributions_pageid", "pageid"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
class SentMessage(Base):
    """A FlickrMail message scraped from the sent-mail pages."""

    __tablename__ = "sent_messages"

    # Flickr's own message id (taken from the message_row_<id> DOM id).
    message_id: Mapped[str] = mapped_column(primary_key=True)
    subject: Mapped[str | None]
    url: Mapped[str | None]  # message permalink on flickr.com
    recipient: Mapped[str | None]
    date: Mapped[str | None]  # display date string as scraped
    body: Mapped[str | None] = mapped_column(Text)  # plain-text body
    body_html: Mapped[str | None] = mapped_column(Text)  # raw HTML of the body cell
    flickr_url: Mapped[str | None]  # first photo URL found in the body
    normalized_flickr_url: Mapped[str | None]  # join key against flickr_uploads
    wikipedia_url: Mapped[str | None]
    creator_profile_url: Mapped[str | None]

    # Uploads whose normalized photo URL matched this message.
    flickr_uploads: Mapped[list["FlickrUpload"]] = relationship(
        back_populates="sent_message"
    )

    __table_args__ = (
        Index("ix_sent_messages_recipient", "recipient"),
        Index("ix_sent_messages_normalized_flickr_url", "normalized_flickr_url"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
class FlickrUpload(Base):
    """A Commons upload sourced from Flickr, derived from a Contribution."""

    __tablename__ = "flickr_uploads"

    id: Mapped[int] = mapped_column(primary_key=True)
    pageid: Mapped[int | None]
    revid: Mapped[int | None]  # upload revision; used for dedup by the extractor
    title: Mapped[str | None]  # Commons file title
    timestamp: Mapped[str | None]
    flickr_url: Mapped[str | None]  # source photo URL from the edit summary
    normalized_flickr_url: Mapped[str | None]  # join key against sent_messages
    creator: Mapped[str | None]  # photographer name parsed from the summary
    wikipedia_url: Mapped[str | None]  # copied from the linked sent message
    creator_profile_url: Mapped[str | None]
    # Set when a sent FlickrMail message references the same photo.
    sent_message_id: Mapped[str | None] = mapped_column(
        ForeignKey("sent_messages.message_id")
    )

    sent_message: Mapped[SentMessage | None] = relationship(
        back_populates="flickr_uploads"
    )

    __table_args__ = (
        Index("ix_flickr_uploads_normalized_flickr_url", "normalized_flickr_url"),
        Index("ix_flickr_uploads_timestamp", "timestamp"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
class ThumbnailCache(Base):
    """Cached thumbnail URL lookups, keyed by file title."""

    __tablename__ = "thumbnail_cache"

    title: Mapped[str] = mapped_column(primary_key=True)
    # NOTE(review): thumb_url may be None — presumably a failed lookup;
    # confirm against the consumer in main.py.
    thumb_url: Mapped[str | None]
    fetched_at: Mapped[int | None]  # Unix timestamp
|
||||||
52
flickr_mail/url_utils.py
Normal file
52
flickr_mail/url_utils.py
Normal file
|
|
@ -0,0 +1,52 @@
|
||||||
|
"""Shared URL utility functions for flickr-mail."""
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_flickr_url(url: str) -> str:
    """Normalize a Flickr photo URL to 'flickr.com/...' form for comparison.

    Strips the scheme and a leading "www.", plus any trailing slash.
    Returns "" for anything that is not a flickr.com URL.

    Bug fix: the previous str.replace() calls removed "www." (and scheme
    text) anywhere in the string, which could corrupt path segments such
    as a username containing "www."; removeprefix() only touches the start.
    """
    # Remove protocol (only one will ever match; the other is a no-op)
    for prefix in ("https://", "http://"):
        url = url.removeprefix(prefix)
    # Remove leading www.
    url = url.removeprefix("www.")
    # Remove trailing slash
    url = url.rstrip("/")
    # Ensure it starts with flickr.com
    return url if url.startswith("flickr.com") else ""
|
||||||
|
|
||||||
|
|
||||||
|
def extract_urls_from_message(body: str) -> tuple[str, str]:
    """Extract the first Flickr photo URL and first Wikipedia URL from *body*.

    Either element is "" when no match is found; scheme-less matches are
    prefixed with "https://".
    """

    def _first_match(pattern: str) -> str:
        # findall returns whole matches because the patterns use only
        # non-capturing groups.
        hits = re.findall(pattern, body)
        if not hits:
            return ""
        url = hits[0]
        return url if url.startswith("http") else "https://" + url

    # Find flickr photo URLs
    flickr_url = _first_match(
        r"(?:https?://)?(?:www\.)?flickr\.com/photos/[^/\s]+/\d+"
    )
    # Find Wikipedia URLs
    wikipedia_url = _first_match(
        r"(?:https?://)?(?:www\.)?en\.wikipedia\.org/wiki/[^\s<\])]+"
    )
    return flickr_url, wikipedia_url
|
||||||
|
|
||||||
|
|
||||||
|
def creator_profile_from_flickr_url(flickr_url: str) -> str:
    """Derive the photographer's profile URL from a Flickr photo URL.

    e.g. ".../photos/<user>/12345" -> "https://www.flickr.com/photos/<user>".
    Returns "" when no "photos/<user>" segment is present.
    """
    segments = flickr_url.split("/")
    # Drop the final segment from the scan: a trailing "photos" with no
    # username after it cannot yield a profile.
    for idx, segment in enumerate(segments[:-1]):
        if segment == "photos":
            return f"https://www.flickr.com/photos/{segments[idx + 1]}"
    return ""
|
||||||
307
main.py
307
main.py
|
|
@ -9,14 +9,17 @@ import sys
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
import typing
|
import typing
|
||||||
from pathlib import Path
|
|
||||||
from urllib.parse import quote, unquote
|
from urllib.parse import quote, unquote
|
||||||
|
|
||||||
import flask
|
import flask
|
||||||
import requests
|
import requests
|
||||||
import werkzeug
|
import werkzeug
|
||||||
|
from sqlalchemy import func
|
||||||
from werkzeug.debug.tbtools import DebugTraceback
|
from werkzeug.debug.tbtools import DebugTraceback
|
||||||
|
|
||||||
|
from flickr_mail.database import get_session
|
||||||
|
from flickr_mail.models import FlickrUpload, SentMessage, ThumbnailCache
|
||||||
|
from flickr_mail.url_utils import extract_urls_from_message, normalize_flickr_url
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
@ -26,18 +29,6 @@ app.debug = False
|
||||||
|
|
||||||
enwiki = "en.wikipedia.org/wiki/"
|
enwiki = "en.wikipedia.org/wiki/"
|
||||||
|
|
||||||
# Path to Commons contributions data and sent mail
|
|
||||||
COMMONS_UPLOADS_FILE = (
|
|
||||||
Path(__file__).parent / "commons_contributions" / "flickr_uploads.json"
|
|
||||||
)
|
|
||||||
COMMONS_CACHE_FILE = (
|
|
||||||
Path(__file__).parent / "commons_contributions" / "thumbnail_cache.json"
|
|
||||||
)
|
|
||||||
SENT_MAIL_DIR = Path(__file__).parent / "sent_mail" / "messages"
|
|
||||||
SENT_MAIL_INDEX_FILE = Path(__file__).parent / "sent_mail" / "messages_index.json"
|
|
||||||
SENT_MAIL_INDEX_CACHE = (
|
|
||||||
Path(__file__).parent / "commons_contributions" / "sent_mail_index.json"
|
|
||||||
)
|
|
||||||
COMMONS_CACHE_MAX_AGE = 86400 * 7 # Cache for 7 days
|
COMMONS_CACHE_MAX_AGE = 86400 * 7 # Cache for 7 days
|
||||||
RECENT_UPLOADS_COUNT = 24
|
RECENT_UPLOADS_COUNT = 24
|
||||||
|
|
||||||
|
|
@ -165,132 +156,6 @@ class CommonsUpload:
|
||||||
return "Wikidata item" if self.is_wikidata_item else "Wikipedia article"
|
return "Wikidata item" if self.is_wikidata_item else "Wikipedia article"
|
||||||
|
|
||||||
|
|
||||||
def normalize_flickr_url(url: str) -> str:
|
|
||||||
"""Normalize a Flickr photo URL for comparison."""
|
|
||||||
# Remove protocol
|
|
||||||
url = url.replace("https://", "").replace("http://", "")
|
|
||||||
# Remove www.
|
|
||||||
url = url.replace("www.", "")
|
|
||||||
# Remove trailing slash
|
|
||||||
url = url.rstrip("/")
|
|
||||||
# Ensure it starts with flickr.com
|
|
||||||
if not url.startswith("flickr.com"):
|
|
||||||
return ""
|
|
||||||
return url
|
|
||||||
|
|
||||||
|
|
||||||
def extract_urls_from_message(body: str) -> tuple[str, str]:
|
|
||||||
"""Extract flickr URL and Wikipedia URL from message body."""
|
|
||||||
|
|
||||||
flickr_url = ""
|
|
||||||
wikipedia_url = ""
|
|
||||||
|
|
||||||
# Find flickr photo URLs
|
|
||||||
flickr_pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/[^/\s]+/\d+"
|
|
||||||
flickr_matches = re.findall(flickr_pattern, body)
|
|
||||||
if flickr_matches:
|
|
||||||
flickr_url = flickr_matches[0]
|
|
||||||
if not flickr_url.startswith("http"):
|
|
||||||
flickr_url = "https://" + flickr_url
|
|
||||||
|
|
||||||
# Find Wikipedia URLs
|
|
||||||
wiki_pattern = r"(?:https?://)?(?:www\.)?en\.wikipedia\.org/wiki/[^\s<\])]+"
|
|
||||||
wiki_matches = re.findall(wiki_pattern, body)
|
|
||||||
if wiki_matches:
|
|
||||||
wikipedia_url = wiki_matches[0]
|
|
||||||
if not wikipedia_url.startswith("http"):
|
|
||||||
wikipedia_url = "https://" + wikipedia_url
|
|
||||||
|
|
||||||
return flickr_url, wikipedia_url
|
|
||||||
|
|
||||||
|
|
||||||
def build_sent_mail_index() -> dict[str, dict[str, str]]:
|
|
||||||
"""Build an index of sent mail: normalized_flickr_url -> {wikipedia_url, recipient}."""
|
|
||||||
if not SENT_MAIL_DIR.exists():
|
|
||||||
return {}
|
|
||||||
|
|
||||||
# Check if we have a cached index
|
|
||||||
if SENT_MAIL_INDEX_CACHE.exists():
|
|
||||||
try:
|
|
||||||
with open(SENT_MAIL_INDEX_CACHE) as f:
|
|
||||||
cache = json.load(f)
|
|
||||||
# Check if cache is still valid (compare file count)
|
|
||||||
json_files = list(SENT_MAIL_DIR.glob("*.json"))
|
|
||||||
if cache.get("file_count") == len(json_files):
|
|
||||||
return cache.get("index", {})
|
|
||||||
except (json.JSONDecodeError, OSError):
|
|
||||||
pass
|
|
||||||
|
|
||||||
index: dict[str, dict[str, str]] = {}
|
|
||||||
json_files = list(SENT_MAIL_DIR.glob("*.json"))
|
|
||||||
|
|
||||||
for json_file in json_files:
|
|
||||||
try:
|
|
||||||
with open(json_file) as f:
|
|
||||||
message = json.load(f)
|
|
||||||
except (json.JSONDecodeError, OSError):
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Skip replies - we want original requests
|
|
||||||
subject = message.get("subject", "")
|
|
||||||
if subject.startswith("Re:"):
|
|
||||||
continue
|
|
||||||
|
|
||||||
body = message.get("body", "")
|
|
||||||
flickr_url, wikipedia_url = extract_urls_from_message(body)
|
|
||||||
|
|
||||||
if not flickr_url:
|
|
||||||
continue
|
|
||||||
|
|
||||||
normalized = normalize_flickr_url(flickr_url)
|
|
||||||
if not normalized:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Extract creator profile URL from flickr URL
|
|
||||||
# flickr.com/photos/username/12345 -> flickr.com/photos/username
|
|
||||||
parts = flickr_url.split("/")
|
|
||||||
creator_profile = ""
|
|
||||||
for i, part in enumerate(parts):
|
|
||||||
if part == "photos" and i + 1 < len(parts):
|
|
||||||
username = parts[i + 1]
|
|
||||||
creator_profile = f"https://www.flickr.com/photos/{username}"
|
|
||||||
break
|
|
||||||
|
|
||||||
index[normalized] = {
|
|
||||||
"wikipedia_url": wikipedia_url,
|
|
||||||
"creator_profile_url": creator_profile,
|
|
||||||
"recipient": message.get("recipient", ""),
|
|
||||||
}
|
|
||||||
|
|
||||||
# Cache the index
|
|
||||||
try:
|
|
||||||
with open(SENT_MAIL_INDEX_CACHE, "w") as f:
|
|
||||||
json.dump({"file_count": len(json_files), "index": index}, f)
|
|
||||||
except OSError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
return index
|
|
||||||
|
|
||||||
|
|
||||||
def load_commons_thumbnail_cache() -> dict[str, typing.Any]:
|
|
||||||
"""Load the thumbnail cache from disk."""
|
|
||||||
if not COMMONS_CACHE_FILE.exists():
|
|
||||||
return {"timestamp": 0, "thumbnails": {}}
|
|
||||||
try:
|
|
||||||
with open(COMMONS_CACHE_FILE) as f:
|
|
||||||
return typing.cast(dict[str, typing.Any], json.load(f))
|
|
||||||
except (json.JSONDecodeError, OSError):
|
|
||||||
return {"timestamp": 0, "thumbnails": {}}
|
|
||||||
|
|
||||||
|
|
||||||
def save_commons_thumbnail_cache(cache: dict[str, typing.Any]) -> None:
|
|
||||||
"""Save the thumbnail cache to disk."""
|
|
||||||
try:
|
|
||||||
with open(COMMONS_CACHE_FILE, "w") as f:
|
|
||||||
json.dump(cache, f)
|
|
||||||
except OSError:
|
|
||||||
pass # Ignore cache write errors
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_commons_thumbnails(titles: list[str]) -> dict[str, str]:
|
def fetch_commons_thumbnails(titles: list[str]) -> dict[str, str]:
|
||||||
"""Fetch thumbnail URLs from Commons API for the given file titles."""
|
"""Fetch thumbnail URLs from Commons API for the given file titles."""
|
||||||
|
|
@ -340,79 +205,72 @@ def get_recent_commons_uploads() -> tuple[list[CommonsUpload], int]:
|
||||||
Returns a tuple of (uploads_list, total_count) where total_count is the total number
|
Returns a tuple of (uploads_list, total_count) where total_count is the total number
|
||||||
of uploads obtained via Flickr mail (not just the ones returned).
|
of uploads obtained via Flickr mail (not just the ones returned).
|
||||||
"""
|
"""
|
||||||
if not COMMONS_UPLOADS_FILE.exists():
|
session = get_session()
|
||||||
return [], 0
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with open(COMMONS_UPLOADS_FILE) as f:
|
query = (
|
||||||
all_uploads = json.load(f)
|
session.query(FlickrUpload, SentMessage)
|
||||||
except (json.JSONDecodeError, OSError):
|
.join(SentMessage)
|
||||||
return [], 0
|
.order_by(FlickrUpload.timestamp.desc())
|
||||||
|
|
||||||
# Build sent mail index
|
|
||||||
sent_mail_index = build_sent_mail_index()
|
|
||||||
|
|
||||||
# Filter uploads to only those with matching sent mail
|
|
||||||
# Count all matches, but only keep RECENT_UPLOADS_COUNT for display
|
|
||||||
uploads_with_mail: list[dict[str, typing.Any]] = []
|
|
||||||
total_matched = 0
|
|
||||||
for upload in all_uploads:
|
|
||||||
flickr_url = upload.get("flickr_url", "")
|
|
||||||
normalized = normalize_flickr_url(flickr_url)
|
|
||||||
if normalized and normalized in sent_mail_index:
|
|
||||||
total_matched += 1
|
|
||||||
if len(uploads_with_mail) < RECENT_UPLOADS_COUNT:
|
|
||||||
upload["_mail_info"] = sent_mail_index[normalized]
|
|
||||||
uploads_with_mail.append(upload)
|
|
||||||
|
|
||||||
if not uploads_with_mail:
|
|
||||||
return [], 0
|
|
||||||
|
|
||||||
# Load cache and check if it's still valid
|
|
||||||
cache = load_commons_thumbnail_cache()
|
|
||||||
cache_age = time.time() - cache.get("timestamp", 0)
|
|
||||||
cached_thumbs = cache.get("thumbnails", {})
|
|
||||||
|
|
||||||
# Find which titles need fetching
|
|
||||||
titles = [u["title"] for u in uploads_with_mail]
|
|
||||||
titles_to_fetch = [t for t in titles if t not in cached_thumbs]
|
|
||||||
|
|
||||||
# Fetch missing thumbnails or refresh if cache is old
|
|
||||||
if titles_to_fetch or cache_age > COMMONS_CACHE_MAX_AGE:
|
|
||||||
new_thumbs = fetch_commons_thumbnails(
|
|
||||||
titles if cache_age > COMMONS_CACHE_MAX_AGE else titles_to_fetch
|
|
||||||
)
|
)
|
||||||
cached_thumbs.update(new_thumbs)
|
total_matched = query.count()
|
||||||
cache = {"timestamp": time.time(), "thumbnails": cached_thumbs}
|
if total_matched == 0:
|
||||||
save_commons_thumbnail_cache(cache)
|
return [], 0
|
||||||
|
|
||||||
# Build the result list
|
recent = query.limit(RECENT_UPLOADS_COUNT).all()
|
||||||
result: list[CommonsUpload] = []
|
|
||||||
for upload in uploads_with_mail:
|
|
||||||
title = upload["title"]
|
|
||||||
thumb_url = cached_thumbs.get(title, "")
|
|
||||||
if not thumb_url:
|
|
||||||
continue
|
|
||||||
|
|
||||||
mail_info = upload.get("_mail_info", {})
|
# Get thumbnails from cache
|
||||||
|
titles = [upload.title for upload, msg in recent]
|
||||||
|
now = int(time.time())
|
||||||
|
cached = {
|
||||||
|
tc.title: tc
|
||||||
|
for tc in session.query(ThumbnailCache)
|
||||||
|
.filter(ThumbnailCache.title.in_(titles))
|
||||||
|
.all()
|
||||||
|
}
|
||||||
|
|
||||||
# Convert title to Commons URL
|
# Find titles needing fetch (missing or expired)
|
||||||
commons_url = f"https://commons.wikimedia.org/wiki/{title.replace(' ', '_')}"
|
titles_to_fetch = [
|
||||||
|
t for t in titles
|
||||||
|
if t not in cached or (now - (cached[t].fetched_at or 0)) > COMMONS_CACHE_MAX_AGE
|
||||||
|
]
|
||||||
|
|
||||||
result.append(
|
if titles_to_fetch:
|
||||||
CommonsUpload(
|
new_thumbs = fetch_commons_thumbnails(titles_to_fetch)
|
||||||
title=title.replace("File:", "").rsplit(".", 1)[0],
|
for title, thumb_url in new_thumbs.items():
|
||||||
thumb_url=thumb_url,
|
existing = cached.get(title)
|
||||||
commons_url=commons_url,
|
if existing:
|
||||||
flickr_url=upload.get("flickr_url", ""),
|
existing.thumb_url = thumb_url
|
||||||
creator=upload.get("creator") or "Unknown",
|
existing.fetched_at = now
|
||||||
timestamp=upload.get("timestamp", "")[:10],
|
else:
|
||||||
wikipedia_url=mail_info.get("wikipedia_url", ""),
|
tc = ThumbnailCache(title=title, thumb_url=thumb_url, fetched_at=now)
|
||||||
creator_profile_url=mail_info.get("creator_profile_url", ""),
|
session.add(tc)
|
||||||
|
cached[title] = tc
|
||||||
|
session.commit()
|
||||||
|
|
||||||
|
result: list[CommonsUpload] = []
|
||||||
|
for upload, msg in recent:
|
||||||
|
thumb_url = cached[upload.title].thumb_url if upload.title in cached else ""
|
||||||
|
if not thumb_url:
|
||||||
|
continue
|
||||||
|
|
||||||
|
commons_url = f"https://commons.wikimedia.org/wiki/{upload.title.replace(' ', '_')}"
|
||||||
|
|
||||||
|
result.append(
|
||||||
|
CommonsUpload(
|
||||||
|
title=upload.title.replace("File:", "").rsplit(".", 1)[0],
|
||||||
|
thumb_url=thumb_url,
|
||||||
|
commons_url=commons_url,
|
||||||
|
flickr_url=upload.flickr_url or "",
|
||||||
|
creator=upload.creator or "Unknown",
|
||||||
|
timestamp=(upload.timestamp or "")[:10],
|
||||||
|
wikipedia_url=upload.wikipedia_url or "",
|
||||||
|
creator_profile_url=upload.creator_profile_url or "",
|
||||||
|
)
|
||||||
)
|
)
|
||||||
)
|
|
||||||
|
|
||||||
return result, total_matched
|
return result, total_matched
|
||||||
|
finally:
|
||||||
|
session.close()
|
||||||
|
|
||||||
|
|
||||||
def get_previous_messages(flickr_user: str, flickr_username: str) -> list[dict]:
|
def get_previous_messages(flickr_user: str, flickr_username: str) -> list[dict]:
|
||||||
|
|
@ -421,26 +279,33 @@ def get_previous_messages(flickr_user: str, flickr_username: str) -> list[dict]:
|
||||||
Checks both the display name (flickr_user) and username (flickr_username)
|
Checks both the display name (flickr_user) and username (flickr_username)
|
||||||
against the recipient field in the messages index.
|
against the recipient field in the messages index.
|
||||||
"""
|
"""
|
||||||
if not SENT_MAIL_INDEX_FILE.exists():
|
names = set()
|
||||||
|
if flickr_user:
|
||||||
|
names.add(flickr_user.lower())
|
||||||
|
if flickr_username:
|
||||||
|
names.add(flickr_username.lower())
|
||||||
|
if not names:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
session = get_session()
|
||||||
try:
|
try:
|
||||||
with open(SENT_MAIL_INDEX_FILE) as f:
|
messages = (
|
||||||
messages = json.load(f)
|
session.query(SentMessage)
|
||||||
except (json.JSONDecodeError, OSError):
|
.filter(func.lower(SentMessage.recipient).in_(names))
|
||||||
return []
|
.all()
|
||||||
|
)
|
||||||
# Normalize for case-insensitive comparison
|
return [
|
||||||
flickr_user_lower = flickr_user.lower() if flickr_user else ""
|
{
|
||||||
flickr_username_lower = flickr_username.lower() if flickr_username else ""
|
"message_id": m.message_id,
|
||||||
|
"subject": m.subject,
|
||||||
matches = []
|
"url": m.url,
|
||||||
for msg in messages:
|
"recipient": m.recipient,
|
||||||
recipient = msg.get("recipient", "").lower()
|
"date": m.date,
|
||||||
if recipient and (recipient == flickr_user_lower or recipient == flickr_username_lower):
|
}
|
||||||
matches.append(msg)
|
for m in messages
|
||||||
|
]
|
||||||
return matches
|
finally:
|
||||||
|
session.close()
|
||||||
|
|
||||||
|
|
||||||
def parse_category_input(category_input: str) -> str | None:
|
def parse_category_input(category_input: str) -> str | None:
|
||||||
|
|
|
||||||
233
migrate_json_to_db.py
Normal file
233
migrate_json_to_db.py
Normal file
|
|
@ -0,0 +1,233 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""One-time migration from JSON files to SQLite database."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from flickr_mail.database import init_db, get_session
|
||||||
|
from flickr_mail.models import Contribution, FlickrUpload, SentMessage, ThumbnailCache
|
||||||
|
from flickr_mail.url_utils import (
|
||||||
|
creator_profile_from_flickr_url,
|
||||||
|
extract_urls_from_message,
|
||||||
|
normalize_flickr_url,
|
||||||
|
)
|
||||||
|
|
||||||
|
COMMONS_DIR = Path(__file__).parent / "commons_contributions"
|
||||||
|
SENT_MAIL_DIR = Path(__file__).parent / "sent_mail" / "messages"
|
||||||
|
SENT_MAIL_INDEX = Path(__file__).parent / "sent_mail" / "messages_index.json"
|
||||||
|
CONTRIBUTIONS_FILE = COMMONS_DIR / "contributions.json"
|
||||||
|
FLICKR_UPLOADS_FILE = COMMONS_DIR / "flickr_uploads.json"
|
||||||
|
THUMBNAIL_CACHE_FILE = COMMONS_DIR / "thumbnail_cache.json"
|
||||||
|
|
||||||
|
|
||||||
|
def migrate_contributions(session) -> int:
|
||||||
|
"""Migrate contributions.json to contributions table."""
|
||||||
|
if not CONTRIBUTIONS_FILE.exists():
|
||||||
|
print("No contributions.json found, skipping")
|
||||||
|
return 0
|
||||||
|
|
||||||
|
with open(CONTRIBUTIONS_FILE) as f:
|
||||||
|
data = json.load(f)
|
||||||
|
|
||||||
|
contributions = data.get("contributions", [])
|
||||||
|
print(f"Migrating {len(contributions)} contributions...")
|
||||||
|
|
||||||
|
for c in contributions:
|
||||||
|
session.add(Contribution(
|
||||||
|
userid=c.get("userid"),
|
||||||
|
user=c.get("user"),
|
||||||
|
pageid=c.get("pageid"),
|
||||||
|
revid=c.get("revid"),
|
||||||
|
parentid=c.get("parentid"),
|
||||||
|
ns=c.get("ns"),
|
||||||
|
title=c.get("title"),
|
||||||
|
timestamp=c.get("timestamp"),
|
||||||
|
minor=c.get("minor"),
|
||||||
|
top=c.get("top"),
|
||||||
|
comment=c.get("comment"),
|
||||||
|
size=c.get("size"),
|
||||||
|
sizediff=c.get("sizediff"),
|
||||||
|
tags=json.dumps(c.get("tags", [])),
|
||||||
|
))
|
||||||
|
|
||||||
|
session.flush()
|
||||||
|
count = session.query(Contribution).count()
|
||||||
|
print(f" -> {count} contributions migrated")
|
||||||
|
return count
|
||||||
|
|
||||||
|
|
||||||
|
def migrate_sent_messages(session) -> dict[str, str]:
|
||||||
|
"""Migrate sent messages to sent_messages table.
|
||||||
|
|
||||||
|
Returns a dict of normalized_flickr_url -> message_id for FK linking.
|
||||||
|
"""
|
||||||
|
if not SENT_MAIL_INDEX.exists():
|
||||||
|
print("No messages_index.json found, skipping")
|
||||||
|
return {}
|
||||||
|
|
||||||
|
with open(SENT_MAIL_INDEX) as f:
|
||||||
|
index = json.load(f)
|
||||||
|
|
||||||
|
print(f"Migrating {len(index)} sent messages...")
|
||||||
|
|
||||||
|
url_to_message_id: dict[str, str] = {}
|
||||||
|
count = 0
|
||||||
|
|
||||||
|
for msg_meta in index:
|
||||||
|
msg_id = msg_meta.get("message_id", "")
|
||||||
|
if not msg_id:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Load the full message from individual file
|
||||||
|
msg_file = SENT_MAIL_DIR / f"{msg_id}.json"
|
||||||
|
if msg_file.exists():
|
||||||
|
with open(msg_file) as f:
|
||||||
|
msg = json.load(f)
|
||||||
|
else:
|
||||||
|
msg = msg_meta
|
||||||
|
|
||||||
|
body = msg.get("body", "")
|
||||||
|
subject = msg.get("subject", "")
|
||||||
|
|
||||||
|
# Extract URLs from body
|
||||||
|
flickr_url, wikipedia_url = extract_urls_from_message(body)
|
||||||
|
normalized = normalize_flickr_url(flickr_url) if flickr_url else ""
|
||||||
|
|
||||||
|
# Extract creator profile URL
|
||||||
|
creator_profile_url = creator_profile_from_flickr_url(flickr_url) if flickr_url else ""
|
||||||
|
|
||||||
|
session.add(SentMessage(
|
||||||
|
message_id=msg_id,
|
||||||
|
subject=msg.get("subject", ""),
|
||||||
|
url=msg.get("url", ""),
|
||||||
|
recipient=msg.get("recipient", ""),
|
||||||
|
date=msg.get("date", ""),
|
||||||
|
body=body,
|
||||||
|
body_html=msg.get("body_html", ""),
|
||||||
|
flickr_url=flickr_url,
|
||||||
|
normalized_flickr_url=normalized,
|
||||||
|
wikipedia_url=wikipedia_url,
|
||||||
|
creator_profile_url=creator_profile_url,
|
||||||
|
))
|
||||||
|
|
||||||
|
# Build URL -> message_id map for FK linking (skip replies)
|
||||||
|
if normalized and not subject.startswith("Re:"):
|
||||||
|
url_to_message_id[normalized] = msg_id
|
||||||
|
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
session.flush()
|
||||||
|
actual = session.query(SentMessage).count()
|
||||||
|
print(f" -> {actual} sent messages migrated")
|
||||||
|
print(f" -> {len(url_to_message_id)} unique flickr URLs indexed for FK linking")
|
||||||
|
return url_to_message_id
|
||||||
|
|
||||||
|
|
||||||
|
def migrate_flickr_uploads(session, url_to_message_id: dict[str, str]) -> int:
|
||||||
|
"""Migrate flickr_uploads.json to flickr_uploads table with FK linking."""
|
||||||
|
if not FLICKR_UPLOADS_FILE.exists():
|
||||||
|
print("No flickr_uploads.json found, skipping")
|
||||||
|
return 0
|
||||||
|
|
||||||
|
with open(FLICKR_UPLOADS_FILE) as f:
|
||||||
|
uploads = json.load(f)
|
||||||
|
|
||||||
|
print(f"Migrating {len(uploads)} flickr uploads...")
|
||||||
|
|
||||||
|
linked = 0
|
||||||
|
for u in uploads:
|
||||||
|
flickr_url = u.get("flickr_url", "")
|
||||||
|
normalized = normalize_flickr_url(flickr_url)
|
||||||
|
|
||||||
|
# Look up sent message FK
|
||||||
|
sent_message_id = url_to_message_id.get(normalized) if normalized else None
|
||||||
|
if sent_message_id:
|
||||||
|
linked += 1
|
||||||
|
|
||||||
|
# Get wikipedia_url and creator_profile_url from the linked message
|
||||||
|
wikipedia_url = ""
|
||||||
|
creator_profile_url = ""
|
||||||
|
if sent_message_id:
|
||||||
|
msg = session.get(SentMessage, sent_message_id)
|
||||||
|
if msg:
|
||||||
|
wikipedia_url = msg.wikipedia_url or ""
|
||||||
|
creator_profile_url = msg.creator_profile_url or ""
|
||||||
|
|
||||||
|
session.add(FlickrUpload(
|
||||||
|
pageid=u.get("pageid"),
|
||||||
|
revid=u.get("revid"),
|
||||||
|
title=u.get("title"),
|
||||||
|
timestamp=u.get("timestamp"),
|
||||||
|
flickr_url=flickr_url,
|
||||||
|
normalized_flickr_url=normalized,
|
||||||
|
creator=u.get("creator"),
|
||||||
|
wikipedia_url=wikipedia_url,
|
||||||
|
creator_profile_url=creator_profile_url,
|
||||||
|
sent_message_id=sent_message_id,
|
||||||
|
))
|
||||||
|
|
||||||
|
session.flush()
|
||||||
|
count = session.query(FlickrUpload).count()
|
||||||
|
print(f" -> {count} flickr uploads migrated")
|
||||||
|
print(f" -> {linked} linked to sent messages")
|
||||||
|
return count
|
||||||
|
|
||||||
|
|
||||||
|
def migrate_thumbnail_cache(session) -> int:
|
||||||
|
"""Migrate thumbnail_cache.json to thumbnail_cache table."""
|
||||||
|
if not THUMBNAIL_CACHE_FILE.exists():
|
||||||
|
print("No thumbnail_cache.json found, skipping")
|
||||||
|
return 0
|
||||||
|
|
||||||
|
with open(THUMBNAIL_CACHE_FILE) as f:
|
||||||
|
cache = json.load(f)
|
||||||
|
|
||||||
|
thumbnails = cache.get("thumbnails", {})
|
||||||
|
cache_timestamp = int(cache.get("timestamp", 0))
|
||||||
|
|
||||||
|
print(f"Migrating {len(thumbnails)} cached thumbnails...")
|
||||||
|
|
||||||
|
for title, thumb_url in thumbnails.items():
|
||||||
|
session.add(ThumbnailCache(
|
||||||
|
title=title,
|
||||||
|
thumb_url=thumb_url,
|
||||||
|
fetched_at=cache_timestamp,
|
||||||
|
))
|
||||||
|
|
||||||
|
session.flush()
|
||||||
|
count = session.query(ThumbnailCache).count()
|
||||||
|
print(f" -> {count} thumbnail cache entries migrated")
|
||||||
|
return count
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
|
||||||
|
print("Initializing database...")
|
||||||
|
init_db()
|
||||||
|
|
||||||
|
session = get_session()
|
||||||
|
try:
|
||||||
|
# Check if already migrated
|
||||||
|
existing = session.query(Contribution).count()
|
||||||
|
if existing > 0:
|
||||||
|
print(f"Database already contains {existing} contributions. Aborting.")
|
||||||
|
print("Delete flickr_mail.db to re-run migration.")
|
||||||
|
return
|
||||||
|
|
||||||
|
migrate_contributions(session)
|
||||||
|
url_to_message_id = migrate_sent_messages(session)
|
||||||
|
migrate_flickr_uploads(session, url_to_message_id)
|
||||||
|
migrate_thumbnail_cache(session)
|
||||||
|
|
||||||
|
session.commit()
|
||||||
|
print("\nMigration complete!")
|
||||||
|
|
||||||
|
except Exception:
|
||||||
|
session.rollback()
|
||||||
|
raise
|
||||||
|
finally:
|
||||||
|
session.close()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
Find UploadWizard contributions that are from Flickr and add them to flickr_uploads.json.
|
Find UploadWizard contributions that are from Flickr and add them to the database.
|
||||||
|
|
||||||
For contributions with comment 'User created page with UploadWizard', queries the
|
For contributions with comment 'User created page with UploadWizard', queries the
|
||||||
Commons API to check if the image source is Flickr (by checking the Credit field).
|
Commons API to check if the image source is Flickr (by checking the Credit field).
|
||||||
|
|
@ -9,12 +9,13 @@ Commons API to check if the image source is Flickr (by checking the Credit field
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
CONTRIBUTIONS_FILE = Path("commons_contributions/contributions.json")
|
from flickr_mail.database import init_db, get_session
|
||||||
FLICKR_UPLOADS_FILE = Path("commons_contributions/flickr_uploads.json")
|
from flickr_mail.models import Contribution, FlickrUpload, SentMessage
|
||||||
|
from flickr_mail.url_utils import normalize_flickr_url
|
||||||
|
|
||||||
COMMONS_API = "https://commons.wikimedia.org/w/api.php"
|
COMMONS_API = "https://commons.wikimedia.org/w/api.php"
|
||||||
USER_AGENT = "FlickrMail/1.0 (https://edwardbetts.com/flickr_mail/; edward@4angle.com)"
|
USER_AGENT = "FlickrMail/1.0 (https://edwardbetts.com/flickr_mail/; edward@4angle.com)"
|
||||||
|
|
||||||
|
|
@ -75,99 +76,101 @@ def clean_artist_name(artist_html: str) -> str:
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# Load contributions
|
init_db()
|
||||||
print("Loading contributions...")
|
session = get_session()
|
||||||
with open(CONTRIBUTIONS_FILE) as f:
|
|
||||||
data = json.load(f)
|
|
||||||
|
|
||||||
contributions = data.get("contributions", [])
|
try:
|
||||||
|
# Get existing normalized flickr URLs to avoid duplicates
|
||||||
|
existing_urls = {
|
||||||
|
r[0] for r in session.query(FlickrUpload.normalized_flickr_url).all()
|
||||||
|
if r[0]
|
||||||
|
}
|
||||||
|
print(f"Existing uploads: {session.query(FlickrUpload).count()}")
|
||||||
|
print(f"Existing flickr URLs: {len(existing_urls)}")
|
||||||
|
|
||||||
# Load existing flickr uploads
|
# Build sent message index for FK linking
|
||||||
existing_flickr_urls = set()
|
sent_messages = (
|
||||||
existing_uploads = []
|
session.query(SentMessage)
|
||||||
if FLICKR_UPLOADS_FILE.exists():
|
.filter(SentMessage.normalized_flickr_url != "")
|
||||||
with open(FLICKR_UPLOADS_FILE) as f:
|
.filter(~SentMessage.subject.startswith("Re:"))
|
||||||
existing_uploads = json.load(f)
|
.all()
|
||||||
existing_flickr_urls = {u.get("flickr_url", "") for u in existing_uploads}
|
)
|
||||||
# Also normalize existing URLs for comparison
|
url_to_message = {msg.normalized_flickr_url: msg for msg in sent_messages}
|
||||||
for u in existing_uploads:
|
|
||||||
url = u.get("flickr_url", "")
|
|
||||||
normalized = url.replace("https://", "").replace("http://", "").replace("www.", "").rstrip("/")
|
|
||||||
existing_flickr_urls.add(normalized)
|
|
||||||
|
|
||||||
print(f"Existing uploads: {len(existing_uploads)}")
|
# Find UploadWizard contributions (page creations only)
|
||||||
print(f"Existing flickr URLs: {len(existing_flickr_urls)}")
|
upload_wizard = (
|
||||||
|
session.query(Contribution)
|
||||||
|
.filter(Contribution.comment == "User created page with UploadWizard")
|
||||||
|
.filter(Contribution.title.startswith("File:"))
|
||||||
|
.all()
|
||||||
|
)
|
||||||
|
|
||||||
# Find UploadWizard contributions (page creations only)
|
print(f"UploadWizard contributions to check: {len(upload_wizard)}")
|
||||||
upload_wizard_contributions = []
|
|
||||||
for c in contributions:
|
|
||||||
comment = c.get("comment", "")
|
|
||||||
if comment == "User created page with UploadWizard":
|
|
||||||
# Only include if it's a File: page
|
|
||||||
title = c.get("title", "")
|
|
||||||
if title.startswith("File:"):
|
|
||||||
upload_wizard_contributions.append(c)
|
|
||||||
|
|
||||||
print(f"UploadWizard contributions to check: {len(upload_wizard_contributions)}")
|
# Process in batches of 50
|
||||||
|
new_count = 0
|
||||||
|
batch_size = 50
|
||||||
|
|
||||||
# Process in batches of 50
|
for i in range(0, len(upload_wizard), batch_size):
|
||||||
new_uploads = []
|
batch = upload_wizard[i : i + batch_size]
|
||||||
batch_size = 50
|
titles = [c.title for c in batch]
|
||||||
|
|
||||||
for i in range(0, len(upload_wizard_contributions), batch_size):
|
print(
|
||||||
batch = upload_wizard_contributions[i : i + batch_size]
|
f"Processing batch {i // batch_size + 1}/"
|
||||||
titles = [c["title"] for c in batch]
|
f"{(len(upload_wizard) + batch_size - 1) // batch_size}..."
|
||||||
|
)
|
||||||
|
|
||||||
print(f"Processing batch {i // batch_size + 1}/{(len(upload_wizard_contributions) + batch_size - 1) // batch_size}...")
|
metadata = get_image_metadata(titles)
|
||||||
|
|
||||||
metadata = get_image_metadata(titles)
|
for c in batch:
|
||||||
|
meta = metadata.get(c.title, {})
|
||||||
|
credit = meta.get("credit", "")
|
||||||
|
artist = meta.get("artist", "")
|
||||||
|
|
||||||
for c in batch:
|
flickr_url = extract_flickr_url_from_credit(credit)
|
||||||
title = c["title"]
|
if not flickr_url:
|
||||||
meta = metadata.get(title, {})
|
continue
|
||||||
credit = meta.get("credit", "")
|
|
||||||
artist = meta.get("artist", "")
|
|
||||||
|
|
||||||
flickr_url = extract_flickr_url_from_credit(credit)
|
normalized = normalize_flickr_url(flickr_url)
|
||||||
if not flickr_url:
|
if normalized in existing_urls:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Check if we already have this URL
|
creator = clean_artist_name(artist) if artist else None
|
||||||
normalized = flickr_url.replace("https://", "").replace("http://", "").replace("www.", "").rstrip("/")
|
|
||||||
if normalized in existing_flickr_urls or flickr_url in existing_flickr_urls:
|
|
||||||
continue
|
|
||||||
|
|
||||||
creator = clean_artist_name(artist) if artist else None
|
# Look up sent message for FK linking
|
||||||
|
msg = url_to_message.get(normalized) if normalized else None
|
||||||
|
|
||||||
new_upload = {
|
session.add(FlickrUpload(
|
||||||
"pageid": c["pageid"],
|
pageid=c.pageid,
|
||||||
"revid": c["revid"],
|
revid=c.revid,
|
||||||
"title": title,
|
title=c.title,
|
||||||
"timestamp": c["timestamp"],
|
timestamp=c.timestamp,
|
||||||
"flickr_url": flickr_url,
|
flickr_url=flickr_url,
|
||||||
"creator": creator,
|
normalized_flickr_url=normalized,
|
||||||
}
|
creator=creator,
|
||||||
|
wikipedia_url=msg.wikipedia_url if msg else "",
|
||||||
|
creator_profile_url=msg.creator_profile_url if msg else "",
|
||||||
|
sent_message_id=msg.message_id if msg else None,
|
||||||
|
))
|
||||||
|
new_count += 1
|
||||||
|
existing_urls.add(normalized)
|
||||||
|
print(f" Found: {c.title[:50]} -> {flickr_url}")
|
||||||
|
|
||||||
new_uploads.append(new_upload)
|
session.commit()
|
||||||
existing_flickr_urls.add(normalized)
|
|
||||||
print(f" Found: {title[:50]} -> {flickr_url}")
|
|
||||||
|
|
||||||
# Rate limiting
|
# Rate limiting
|
||||||
if i + batch_size < len(upload_wizard_contributions):
|
if i + batch_size < len(upload_wizard):
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
|
|
||||||
print(f"\nFound {len(new_uploads)} new Flickr uploads")
|
total = session.query(FlickrUpload).count()
|
||||||
|
print(f"\nFound {new_count} new Flickr uploads")
|
||||||
|
print(f"Total: {total} uploads in database")
|
||||||
|
|
||||||
if new_uploads:
|
except Exception:
|
||||||
# Merge and sort by timestamp (newest first)
|
session.rollback()
|
||||||
all_uploads = existing_uploads + new_uploads
|
raise
|
||||||
all_uploads.sort(key=lambda x: x.get("timestamp", ""), reverse=True)
|
finally:
|
||||||
|
session.close()
|
||||||
# Save
|
|
||||||
with open(FLICKR_UPLOADS_FILE, "w") as f:
|
|
||||||
json.dump(all_uploads, f, indent=2)
|
|
||||||
|
|
||||||
print(f"Saved {len(all_uploads)} total uploads to {FLICKR_UPLOADS_FILE}")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue