Extract flickr_mail package with Mapped models and shared utilities
Move from JSON file storage to SQLite database using SQLAlchemy with Mapped type hints. Deduplicate URL utility functions into shared flickr_mail package. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
ac1b01ea68
commit
9f0fb01878
11 changed files with 1129 additions and 300 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -2,3 +2,4 @@
|
|||
__pycache__
|
||||
commons_contributions/thumbnail_cache.json
|
||||
commons_contributions/sent_mail_index.json
|
||||
flickr_mail.db
|
||||
|
|
|
|||
147
download_commons_contributions.py
Executable file
147
download_commons_contributions.py
Executable file
|
|
@ -0,0 +1,147 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Download Wikimedia Commons contributions for a user."""
|
||||
|
||||
import json
|
||||
import time
|
||||
|
||||
import requests
|
||||
|
||||
from flickr_mail.database import init_db, get_session
|
||||
from flickr_mail.models import Contribution
|
||||
|
||||
|
||||
API_URL = "https://commons.wikimedia.org/w/api.php"
|
||||
USERNAME = "Edward"
|
||||
|
||||
# Identify ourselves properly to Wikimedia
|
||||
USER_AGENT = "CommonsContributionsDownloader/0.1 (edward@4angle.com)"
|
||||
|
||||
SESSION = requests.Session()
|
||||
SESSION.headers.update({"User-Agent": USER_AGENT})
|
||||
|
||||
|
||||
def fetch_contributions(
    continue_token: str | None = None,
) -> tuple[list[dict], str | None]:
    """Fetch one batch of contributions from the Commons API.

    Args:
        continue_token: Continuation token returned by a previous call,
            or None to start from the most recent contributions.

    Returns:
        A tuple ``(contributions, next_continue_token)``. The token is
        None when the API has no more results.

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status.
    """
    params = {
        "action": "query",
        "list": "usercontribs",
        "ucuser": USERNAME,
        "uclimit": "500",
        "ucprop": "ids|title|timestamp|comment|size|sizediff|flags|tags",
        "format": "json",
    }

    if continue_token:
        params["uccontinue"] = continue_token

    # requests has no default timeout; without one a stalled connection
    # would hang the download forever.
    response = SESSION.get(API_URL, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()

    contributions = data.get("query", {}).get("usercontribs", [])

    # The "continue" block is only present when more results are available.
    new_continue = data.get("continue", {}).get("uccontinue")

    return contributions, new_continue
|
||||
|
||||
|
||||
def upsert_contribution(session, c: dict) -> None:
    """Store a contribution keyed by revid, skipping ones already present.

    Revisions are immutable on the wiki side, so an existing row is
    never modified -- the function simply returns.
    """
    already_stored = (
        session.query(Contribution).filter_by(revid=c["revid"]).first()
    )
    if already_stored:
        return  # Revision already in the database

    # All scalar API fields map 1:1 onto model columns.
    field_names = (
        "userid", "user", "pageid", "revid", "parentid", "ns",
        "title", "timestamp", "minor", "top", "comment", "size", "sizediff",
    )
    values = {name: c.get(name) for name in field_names}
    # Tags arrive as a list; the column stores them as a JSON string.
    values["tags"] = json.dumps(c.get("tags", []))

    session.add(Contribution(**values))
|
||||
|
||||
|
||||
def main() -> None:
    """Download all (new) Commons contributions for USERNAME into SQLite.

    Runs incrementally: the newest contributions come first from the API,
    so fetching stops as soon as an entire batch is already present
    locally -- reruns only pull revisions added since the last run.
    """
    init_db()
    session = get_session()

    try:
        existing_count = session.query(Contribution).count()

        # Get the latest timestamp to know where to resume from
        latest = (
            session.query(Contribution)
            .order_by(Contribution.timestamp.desc())
            .first()
        )

        if existing_count > 0 and latest:
            print(f"Database has {existing_count} contributions")
            print(f"Latest: {latest.timestamp}")
            print("Fetching new contributions...")
        else:
            print(f"Downloading contributions for user: {USERNAME}")

        batch_num = 0
        new_count = 0
        continue_token = None

        while True:
            batch_num += 1
            print(f" Fetching batch {batch_num}...", end=" ", flush=True)

            contributions, continue_token = fetch_contributions(continue_token)

            if not contributions:
                print("no results")
                break

            batch_new = 0
            for c in contributions:
                # Stop if we've reached contributions we already have.
                # NOTE(review): upsert_contribution repeats this revid
                # lookup, so each new row costs two queries -- harmless
                # but redundant.
                existing = session.query(Contribution).filter_by(revid=c["revid"]).first()
                if existing:
                    continue
                upsert_contribution(session, c)
                batch_new += 1

            new_count += batch_new
            print(f"got {len(contributions)}, {batch_new} new")

            # Commit per batch so progress survives an interruption.
            session.commit()

            if batch_new == 0:
                # All contributions in this batch already exist, we're caught up
                print(" Caught up with existing data")
                break

            if not continue_token:
                break

            # Be polite to the API
            time.sleep(0.5)

        total = session.query(Contribution).count()
        print(f"\nDone! {new_count} new contributions, {total} total in database")

    except Exception:
        # Roll back any uncommitted batch before propagating.
        session.rollback()
        raise
    finally:
        session.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
246
download_sent_mail.py
Executable file
246
download_sent_mail.py
Executable file
|
|
@ -0,0 +1,246 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Download sent FlickrMail messages for backup."""
|
||||
|
||||
import time
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from flickr_mail.database import init_db, get_session
|
||||
from flickr_mail.models import SentMessage
|
||||
from flickr_mail.url_utils import (
|
||||
creator_profile_from_flickr_url,
|
||||
extract_urls_from_message,
|
||||
normalize_flickr_url,
|
||||
)
|
||||
|
||||
BASE_URL = "https://www.flickr.com"
|
||||
SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}"
|
||||
MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}"
|
||||
|
||||
HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0",
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
"Accept-Language": "en-GB,en;q=0.9",
|
||||
"Accept-Encoding": "gzip, deflate, br, zstd",
|
||||
"DNT": "1",
|
||||
"Connection": "keep-alive",
|
||||
"Upgrade-Insecure-Requests": "1",
|
||||
"Sec-Fetch-Dest": "document",
|
||||
"Sec-Fetch-Mode": "navigate",
|
||||
"Sec-Fetch-Site": "same-origin",
|
||||
"Sec-Fetch-User": "?1",
|
||||
"Priority": "u=0, i",
|
||||
}
|
||||
|
||||
COOKIES_STR = """ccc=%7B%22needsConsent%22%3Atrue%2C%22managed%22%3A0%2C%22changed%22%3A0%2C%22info%22%3A%7B%22cookieBlock%22%3A%7B%22level%22%3A2%2C%22blockRan%22%3A1%7D%7D%7D; _sp_ses.df80=*; _sp_id.df80=968931de-089d-4576-b729-6662c2c13a65.1770187027.1.1770187129..adf2374b-b85c-4899-afb7-63c2203d0c44..9422de57-9cdf-49c9-ac54-183eaa1ec457.1770187027101.24; TAsessionID=7f373c97-e9f8-46cb-bc1a-cb4f164ce46b|NEW; notice_behavior=expressed,eu; usprivacy=1---; acstring=3~550.1942.3126.3005.3077.1329.196.1725.1092; euconsent-v2=CQfGXgAQfGXgAAvACDENCQFsAP_gAEPgAAAALktB9G5cSSFBYCJVYbtEYAQDwFhg4oAhAgABEwAATBoAoIwGBGAoIAiAICACAAAAIARAIAEECAAAQAAAIIABAAAMAEAAIAACIAAACAABAgAACEAIAAggWAAAAEBEAFQAgAAAQBIACFAAAgABAUABAAAAAACAAQAAACAgQAAAAAAAAAAAkAhAAAAAAAAAABAMAAABIAAAAAAAAAAAAAAAAAAABAAAAICBAAAAQAAAAAAAAAAAAAAAAAAAAgqY0H0blxJIUFgIFVhu0QgBBPAWADigCEAAAEDAABMGgCgjAIUYCAgSIAgIAAAAAAgBEAgAQAIAABAAAAAgAEAAAwAQAAgAAAAAAAAAAECAAAAQAgACCBYAAAAQEQAVACBAABAEgAIUAAAAAEBQAEAAAAAAIABAAAAICBAAAAAAAAAAACQCEAAAAAAAAAAEAwBAAEgAAAAAAAAAAAAAAAAAAAEABAAgIEAAABAA.YAAAAAAAAAAA.ILktB9G5cSSFBYCJVYbtEYAQTwFhg4oAhAgABEwAATBoAoIwGFGAoIEiAICACAAAAIARAIAEECAAAQAAAIIABAAAMAEAAIAACIAAACAABAgAACEAIAAggWAAAAEBEAFQAgQAAQBIACFAAAgABAUABAAAAAACAAQAAACAgQAAAAAAAAAAAkAhAAAAAAAAAABAMAQABIAAAAAAAAAAAAAAAAAAABAAQAICBAAAAQAAAAAAAAAAAAAAAAAAAAgA; notice_preferences=2:; notice_gdpr_prefs=0,1,2:; cmapi_gtm_bl=; cmapi_cookie_privacy=permit 1,2,3; AMCV_48E815355BFE96970A495CD0%40AdobeOrg=281789898%7CMCMID%7C44859851125632937290373504988866174366%7CMCOPTOUT-1770194232s%7CNONE%7CvVersion%7C4.1.0; AMCVS_48E815355BFE96970A495CD0%40AdobeOrg=1; xb=646693; localization=en-us%3Buk%3Bgb; flrbp=1770187037-cfbf3914859af9ef68992c8389162e65e81c86c4; flrbgrp=1770187037-8e700fa7d73b4f2d43550f40513e7c6f507fd20f; flrbgdrp=1770187037-9af21cc74000b5f3f0943243608b4284d5f60ffd; flrbgmrp=1770187037-53f7bfff110731954be6bdfb2f587d59a8305670; flrbrst=1770187037-440e42fcee9b4e8e81ba8bc3eb3d0fc8b62e7083; 
flrtags=1770187037-7b50035cb956b9216a2f3372f498f7008d8e26a8; flrbrp=1770187037-c0195dc99caa020d4e32b39556131add862f26a0; flrb=34; session_id=2693fb01-87a0-42b1-a426-74642807b534; cookie_session=834645%3A29f2a9722d8bac88553ea1baf7ea11b4; cookie_accid=834645; cookie_epass=29f2a9722d8bac88553ea1baf7ea11b4; sa=1775371036%3A79962317%40N00%3A8fb60f4760b4840f37af3ebc90a8cb57; vp=2075%2C1177%2C1%2C0; flrbfd=1770187037-88a4e436729c9c5551794483fbd9c80e9dac2354; flrbpap=1770187037-18adaacf3a389df4a7bdc05cd471e492c54ef841; liqpw=2075; liqph=672"""
|
||||
|
||||
|
||||
def parse_cookies(cookie_str: str) -> dict[str, str]:
    """Parse a Cookie header string into a name -> value dictionary.

    Splits on ";" and strips surrounding whitespace from each entry,
    so cookie strings wrapped across multiple lines (like COOKIES_STR,
    whose triple-quoted literal embeds a newline) parse correctly --
    splitting on "; " alone would leave "\\n" glued to a cookie name.
    Entries without an "=" are ignored.
    """
    cookies: dict[str, str] = {}
    for item in cookie_str.split(";"):
        item = item.strip()
        if "=" in item:
            key, value = item.split("=", 1)
            cookies[key] = value
    return cookies
|
||||
|
||||
|
||||
def create_session() -> requests.Session:
    """Build a requests session authenticated as the logged-in user.

    Applies the browser-like HEADERS and the captured login cookies
    from COOKIES_STR so requests look like the original browser session.
    """
    http = requests.Session()
    http.headers.update(HEADERS)
    http.cookies.update(parse_cookies(COOKIES_STR))
    return http
|
||||
|
||||
|
||||
def fetch_page(session: requests.Session, url: str) -> BeautifulSoup:
    """Fetch *url* with *session* and return the parsed HTML.

    Raises:
        requests.HTTPError: If the server responds with a non-2xx status.
    """
    # requests has no default timeout; a stalled connection would
    # otherwise hang the scraper indefinitely.
    response = session.get(url, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")
|
||||
|
||||
|
||||
def extract_messages_from_list_page(soup: BeautifulSoup) -> list[dict]:
    """Extract message metadata from a sent-mail list page.

    Returns a list of dicts with key "message_id" and, when present in
    the markup, "subject", "url", "recipient", and "date". Rows whose
    id attribute does not yield a message id are skipped.
    """
    messages = []

    # Find all message rows: <tr class="message_row sent" id="message_row_XXXX">
    mail_rows = soup.select("tr.message_row")

    for row in mail_rows:
        msg = {}

        # Get message ID from the row id attribute
        row_id = row.get("id", "")
        if row_id.startswith("message_row_"):
            msg["message_id"] = row_id.replace("message_row_", "")

        # Find message link in the subject cell
        subj_cell = row.select_one("td.subj")
        if subj_cell:
            link = subj_cell.find("a")
            if link:
                msg["subject"] = link.get_text(strip=True)
                # Guard: link["href"] would raise KeyError on an anchor
                # without an href attribute.
                href = link.get("href")
                if href:
                    msg["url"] = BASE_URL + href

        # Recipient is in td.fromto
        fromto_cell = row.select_one("td.fromto")
        if fromto_cell:
            msg["recipient"] = fromto_cell.get_text(strip=True)

        # Date is in td.date
        date_cell = row.select_one("td.date")
        if date_cell:
            msg["date"] = date_cell.get_text(strip=True)

        if "message_id" in msg:
            messages.append(msg)

    return messages
|
||||
|
||||
|
||||
def extract_message_content(soup: BeautifulSoup) -> dict:
    """Extract full message content from a message page.

    Returns a dict that may contain "recipient", "subject", "body"
    (plain text) and "body_html" (raw markup of the body cell).
    Returns an empty dict when the expected .ThinCase/table structure
    is not found -- callers must treat every key as optional.
    """
    content = {}

    # Find the ThinCase div containing the message
    thin_case = soup.select_one(".ThinCase")
    if not thin_case:
        return content

    # Find the table with message content
    table = thin_case.find("table")
    if not table:
        return content

    # Only direct child rows -- nested tables inside the body must not
    # be treated as header rows.
    rows = table.find_all("tr", recursive=False)

    # Row 0: To: <recipient>
    # Row 1: Subject: <subject>
    # Row 2: <empty> <body>
    for row in rows:
        cells = row.find_all("td", recursive=False)
        if len(cells) >= 2:
            header_cell = cells[0]
            value_cell = cells[1]

            # Rows are identified by their header-cell label, not position.
            header = header_cell.get_text(strip=True).lower()

            if header == "to:":
                # Get recipient username
                username = value_cell.select_one(".username")
                if username:
                    content["recipient"] = username.get_text(strip=True)

            elif header == "subject:":
                # Get subject from h3
                h3 = value_cell.find("h3")
                if h3:
                    content["subject"] = h3.get_text(strip=True)

            elif header == "":
                # This is the message body row (empty header cell)
                # Get the content but exclude the delete form
                # (decompose() mutates the parsed tree in place, so
                # body_html below no longer contains the form either).
                form = value_cell.find("form")
                if form:
                    form.decompose()

                content["body"] = value_cell.get_text(separator="\n", strip=True)
                content["body_html"] = str(value_cell)
                break  # Body found, stop processing

    return content
|
||||
|
||||
|
||||
def main() -> None:
    """Scrape sent FlickrMail messages into the database.

    Two phases: (1) walk every sent-mail list page and collect metadata
    for messages not yet stored; (2) download each new message page,
    extract its content and derived URLs, and insert it. Each message is
    committed individually so a mid-run failure loses at most one.
    """
    init_db()
    db_session = get_session()

    try:
        # IDs already stored, so already-downloaded messages are skipped.
        existing_ids = {
            r[0] for r in db_session.query(SentMessage.message_id).all()
        }
        print(f"Database has {len(existing_ids)} messages")

        http_session = create_session()

        # Scrape all pages to find new messages.
        # NOTE(review): page count is hard-coded -- TODO confirm it still
        # matches the account's sent-mail page count, or derive it from
        # the pagination markup.
        total_pages = 29
        new_messages: list[dict] = []

        print("Fetching message list from all pages...")
        for page in range(1, total_pages + 1):
            url = SENT_MAIL_URL.format(page=page)
            print(f" Fetching page {page}/{total_pages}...")

            try:
                soup = fetch_page(http_session, url)
                page_messages = extract_messages_from_list_page(soup)

                for msg in page_messages:
                    if msg["message_id"] not in existing_ids:
                        new_messages.append(msg)

                time.sleep(1)  # Be polite to the server

            except Exception as e:
                # Best-effort: one bad page must not abort the whole run.
                print(f" Error fetching page {page}: {e}")
                continue

        print(f"Found {len(new_messages)} new messages to download")

        # Download individual messages
        for i, msg in enumerate(new_messages, 1):
            msg_id = msg["message_id"]
            # Fall back to the canonical URL when the list page gave none.
            url = msg.get("url") or MESSAGE_URL.format(message_id=msg_id)

            print(f" [{i}/{len(new_messages)}] Downloading message {msg_id}...")

            try:
                soup = fetch_page(http_session, url)
                content = extract_message_content(soup)

                # Merge with metadata (message-page fields win over
                # list-page fields on key collisions).
                full_msg = {**msg, **content}

                body = full_msg.get("body", "")
                flickr_url, wikipedia_url = extract_urls_from_message(body)
                normalized = normalize_flickr_url(flickr_url) if flickr_url else ""
                creator_profile = creator_profile_from_flickr_url(flickr_url) if flickr_url else ""

                db_session.add(SentMessage(
                    message_id=msg_id,
                    subject=full_msg.get("subject", ""),
                    url=full_msg.get("url", ""),
                    recipient=full_msg.get("recipient", ""),
                    date=full_msg.get("date", ""),
                    body=body,
                    body_html=full_msg.get("body_html", ""),
                    flickr_url=flickr_url,
                    normalized_flickr_url=normalized,
                    wikipedia_url=wikipedia_url,
                    creator_profile_url=creator_profile,
                ))
                # Commit per message so progress survives an interruption.
                db_session.commit()

                time.sleep(1)  # Be polite

            except Exception as e:
                db_session.rollback()
                print(f" Error downloading message {msg_id}: {e}")
                continue

        total = db_session.query(SentMessage).count()
        print(f"Done! {total} messages in database")

    except Exception:
        db_session.rollback()
        raise
    finally:
        db_session.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
158
extract_flickr_uploads.py
Normal file
158
extract_flickr_uploads.py
Normal file
|
|
@ -0,0 +1,158 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract Flickr uploads from Wikimedia Commons contributions.
|
||||
|
||||
Filters contributions where the comment contains a flickr.com URL and extracts:
|
||||
- pageid, revid, title, timestamp
|
||||
- flickr_url: the Flickr photo URL
|
||||
- creator: the photographer/author name
|
||||
|
||||
Links uploads to sent messages via normalized Flickr URL matching.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from flickr_mail.database import init_db, get_session
|
||||
from flickr_mail.models import Contribution, FlickrUpload, SentMessage
|
||||
from flickr_mail.url_utils import normalize_flickr_url
|
||||
|
||||
|
||||
def extract_flickr_url(comment: str) -> str | None:
|
||||
"""Extract the Flickr photo URL from a comment."""
|
||||
# Match URLs like https://www.flickr.com/photos/user/12345/ or http://www.flickr.com/photos/user/12345/
|
||||
# Also handles [http://www.flickr.com/photos/user/12345/ title] wiki markup
|
||||
patterns = [
|
||||
# Plain URL (modern format)
|
||||
r'(https?://(?:www\.)?flickr\.com/photos/[^/\s\]]+/\d+)/?',
|
||||
# URL in wiki markup [url title]
|
||||
r'\[(https?://(?:www\.)?flickr\.com/photos/[^/\s\]]+/\d+)/?[^\]]*\]',
|
||||
]
|
||||
|
||||
for pattern in patterns:
|
||||
match = re.search(pattern, comment)
|
||||
if match:
|
||||
return match.group(1)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def extract_creator(comment: str) -> str | None:
|
||||
"""Extract the creator/author name from a comment."""
|
||||
# Modern format: "Uploaded a work by {creator} from https://..."
|
||||
modern_match = re.search(r'Uploaded a work by (.+?) from https?://', comment)
|
||||
if modern_match:
|
||||
return modern_match.group(1).strip()
|
||||
|
||||
# Old {{Information}} format: |Author=[http://www.flickr.com/people/... AuthorName] or |Author=[http://... AuthorName] from Location
|
||||
# The author name comes after the URL, before ] or "from"
|
||||
author_match = re.search(r'\|Author=\[https?://[^\s\]]+ ([^\]]+)\]', comment)
|
||||
if author_match:
|
||||
author = author_match.group(1).strip()
|
||||
# Remove trailing location like "from Toronto, Canada"
|
||||
author = re.sub(r'\s+from\s+.+$', '', author)
|
||||
return author
|
||||
|
||||
# Handle truncated comments where Author field is cut off
|
||||
# Pattern: |Author=[http://...flickr.com/people/... AuthorName (may be incomplete)
|
||||
truncated_match = re.search(r'\|Author=\[https?://[^\s\]]+ ([^\]\|]+)$', comment)
|
||||
if truncated_match:
|
||||
author = truncated_match.group(1).strip()
|
||||
if author:
|
||||
return author
|
||||
|
||||
# Sometimes Author field is just plain text without URL
|
||||
author_plain = re.search(r'\|Author=([^\|\}\[\]]+?)(?:\r?\n|\|)', comment)
|
||||
if author_plain:
|
||||
author = author_plain.group(1).strip()
|
||||
# Skip if it looks like a wiki user link
|
||||
if not author.startswith('[[User:') and author:
|
||||
return author
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def main() -> None:
    """Process contributions and extract Flickr uploads.

    Scans contributions whose edit comment mentions flickr.com, parses
    out the photo URL and creator, and links each upload to the matching
    sent message (by normalized Flickr URL) when one exists. Idempotent:
    revids already present in flickr_uploads are skipped.
    """
    init_db()
    session = get_session()

    try:
        # Get existing upload revids to avoid duplicates
        existing_revids = {
            r[0] for r in session.query(FlickrUpload.revid).all()
        }

        # Build sent message index: normalized_flickr_url -> message.
        # Replies ("Re:") are excluded so the original request wins.
        sent_messages = (
            session.query(SentMessage)
            .filter(SentMessage.normalized_flickr_url != "")
            .filter(~SentMessage.subject.startswith("Re:"))
            .all()
        )
        # NOTE(review): if several messages share a URL, the last one
        # returned by the query wins -- verify that is acceptable.
        url_to_message = {msg.normalized_flickr_url: msg for msg in sent_messages}
        print(f"Sent message index: {len(url_to_message)} entries")

        # Query contributions with flickr.com in comment
        contributions = (
            session.query(Contribution)
            .filter(Contribution.comment.ilike("%flickr.com%"))
            .all()
        )

        print(f"Found {len(contributions)} contributions mentioning flickr.com")

        new_count = 0
        for contrib in contributions:
            if contrib.revid in existing_revids:
                continue

            flickr_url = extract_flickr_url(contrib.comment or "")
            if not flickr_url:
                # Comment mentions flickr.com but has no photo URL
                continue

            creator = extract_creator(contrib.comment or "")
            normalized = normalize_flickr_url(flickr_url)

            # Look up sent message for FK linking
            msg = url_to_message.get(normalized) if normalized else None

            session.add(FlickrUpload(
                pageid=contrib.pageid,
                revid=contrib.revid,
                title=contrib.title,
                timestamp=contrib.timestamp,
                flickr_url=flickr_url,
                normalized_flickr_url=normalized,
                creator=creator,
                wikipedia_url=msg.wikipedia_url if msg else "",
                creator_profile_url=msg.creator_profile_url if msg else "",
                sent_message_id=msg.message_id if msg else None,
            ))
            new_count += 1

        # Single commit: the extraction is all-or-nothing.
        session.commit()

        total = session.query(FlickrUpload).count()
        linked = session.query(FlickrUpload).filter(
            FlickrUpload.sent_message_id.isnot(None)
        ).count()

        print(f"Extracted {new_count} new Flickr uploads")
        print(f"Total: {total} uploads, {linked} linked to sent messages")

        # Show some stats
        with_creator = session.query(FlickrUpload).filter(
            FlickrUpload.creator.isnot(None)
        ).count()
        print(f" - {with_creator} with creator identified")
        print(f" - {total - with_creator} without creator")

    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
0
flickr_mail/__init__.py
Normal file
0
flickr_mail/__init__.py
Normal file
31
flickr_mail/database.py
Normal file
31
flickr_mail/database.py
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
"""Database engine and session factory for flickr-mail."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from sqlalchemy import create_engine, event
|
||||
from sqlalchemy.orm import Session, sessionmaker
|
||||
|
||||
from flickr_mail.models import Base
|
||||
|
||||
DB_PATH = Path(__file__).parent.parent / "flickr_mail.db"
|
||||
|
||||
engine = create_engine(f"sqlite:///{DB_PATH}")
|
||||
SessionLocal = sessionmaker(bind=engine)
|
||||
|
||||
|
||||
@event.listens_for(engine, "connect")
def set_sqlite_pragma(dbapi_connection, connection_record):
    """Enable WAL mode for concurrent read/write access.

    Runs on every new DBAPI connection. WAL lets readers proceed while
    a writer holds the database; journal_mode persists per database
    file, so re-issuing the pragma on each connect is harmless.
    """
    cursor = dbapi_connection.cursor()
    cursor.execute("PRAGMA journal_mode=WAL")
    cursor.close()
|
||||
|
||||
|
||||
def init_db() -> None:
    """Create all tables declared on Base (no-op for existing tables)."""
    Base.metadata.create_all(engine)
|
||||
|
||||
|
||||
def get_session() -> Session:
    """Create a new database session.

    The caller owns the session: commit/rollback and close() are the
    caller's responsibility.
    """
    return SessionLocal()
|
||||
93
flickr_mail/models.py
Normal file
93
flickr_mail/models.py
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
"""SQLAlchemy models for flickr-mail."""
|
||||
|
||||
from sqlalchemy import ForeignKey, Index, Text
|
||||
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
|
||||
|
||||
|
||||
class Base(DeclarativeBase):
    """Declarative base shared by all flickr-mail ORM models."""
    pass
|
||||
|
||||
|
||||
class Contribution(Base):
    """One Wikimedia Commons edit, mirroring the usercontribs API fields."""

    __tablename__ = "contributions"

    # Surrogate key; revid is the natural unique identifier.
    id: Mapped[int] = mapped_column(primary_key=True)
    userid: Mapped[int | None]
    user: Mapped[str | None]
    pageid: Mapped[int | None]
    # MediaWiki revision id -- unique per edit, used for de-duplication.
    revid: Mapped[int | None] = mapped_column(unique=True)
    parentid: Mapped[int | None]
    ns: Mapped[int | None]
    title: Mapped[str | None]
    timestamp: Mapped[str | None]  # stored as the API's string form
    minor: Mapped[str | None]
    top: Mapped[str | None]
    comment: Mapped[str | None] = mapped_column(Text)
    size: Mapped[int | None]
    sizediff: Mapped[int | None]
    tags: Mapped[str | None] = mapped_column(Text)  # JSON array stored as text

    __table_args__ = (
        Index("ix_contributions_timestamp", "timestamp"),
        Index("ix_contributions_pageid", "pageid"),
    )
|
||||
|
||||
|
||||
class SentMessage(Base):
    """A FlickrMail message scraped from the sent-mail pages.

    message_id is Flickr's own identifier, hence a string primary key.
    """

    __tablename__ = "sent_messages"

    message_id: Mapped[str] = mapped_column(primary_key=True)
    subject: Mapped[str | None]
    url: Mapped[str | None]
    recipient: Mapped[str | None]
    date: Mapped[str | None]  # display string as scraped, not parsed
    body: Mapped[str | None] = mapped_column(Text)
    body_html: Mapped[str | None] = mapped_column(Text)
    # URLs extracted from the message body at download time
    flickr_url: Mapped[str | None]
    normalized_flickr_url: Mapped[str | None]
    wikipedia_url: Mapped[str | None]
    creator_profile_url: Mapped[str | None]

    # Uploads whose normalized Flickr URL matched this message
    flickr_uploads: Mapped[list["FlickrUpload"]] = relationship(
        back_populates="sent_message"
    )

    __table_args__ = (
        Index("ix_sent_messages_recipient", "recipient"),
        Index("ix_sent_messages_normalized_flickr_url", "normalized_flickr_url"),
    )
|
||||
|
||||
|
||||
class FlickrUpload(Base):
    """A Commons upload sourced from Flickr, extracted from a contribution.

    Optionally linked (via normalized Flickr URL) to the SentMessage that
    requested permission for the photo.
    """

    __tablename__ = "flickr_uploads"

    id: Mapped[int] = mapped_column(primary_key=True)
    pageid: Mapped[int | None]
    revid: Mapped[int | None]
    title: Mapped[str | None]
    timestamp: Mapped[str | None]
    flickr_url: Mapped[str | None]
    normalized_flickr_url: Mapped[str | None]
    creator: Mapped[str | None]  # photographer name parsed from the edit comment
    wikipedia_url: Mapped[str | None]
    creator_profile_url: Mapped[str | None]
    # NULL when no sent message matched this upload's Flickr URL
    sent_message_id: Mapped[str | None] = mapped_column(
        ForeignKey("sent_messages.message_id")
    )

    sent_message: Mapped[SentMessage | None] = relationship(
        back_populates="flickr_uploads"
    )

    __table_args__ = (
        Index("ix_flickr_uploads_normalized_flickr_url", "normalized_flickr_url"),
        Index("ix_flickr_uploads_timestamp", "timestamp"),
    )
|
||||
|
||||
|
||||
class ThumbnailCache(Base):
    """Cached Commons thumbnail URL for a file title."""

    __tablename__ = "thumbnail_cache"

    title: Mapped[str] = mapped_column(primary_key=True)
    thumb_url: Mapped[str | None]
    fetched_at: Mapped[int | None]  # Unix timestamp
|
||||
52
flickr_mail/url_utils.py
Normal file
52
flickr_mail/url_utils.py
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
"""Shared URL utility functions for flickr-mail."""
|
||||
|
||||
import re
|
||||
|
||||
|
||||
def normalize_flickr_url(url: str) -> str:
    """Normalize a Flickr photo URL for comparison.

    Strips the scheme, a leading "www.", and any trailing slash, e.g.
    "https://www.flickr.com/photos/bob/123/" -> "flickr.com/photos/bob/123".
    Returns "" for URLs not on flickr.com.
    """
    # removeprefix only touches the start of the string; str.replace
    # would also strip "www." (or a scheme) appearing later in the
    # path and corrupt usernames like "www.bob".
    url = url.removeprefix("https://").removeprefix("http://")
    url = url.removeprefix("www.")
    # Remove trailing slash
    url = url.rstrip("/")
    # Ensure it starts with flickr.com
    if not url.startswith("flickr.com"):
        return ""
    return url
|
||||
|
||||
|
||||
def extract_urls_from_message(body: str) -> tuple[str, str]:
    """Extract the first Flickr photo URL and Wikipedia URL from *body*.

    Returns a (flickr_url, wikipedia_url) pair; either element is ""
    when no matching URL is present. Matches missing a scheme are
    prefixed with "https://".
    """

    def first_match(pattern: str) -> str:
        hits = re.findall(pattern, body)
        if not hits:
            return ""
        url = hits[0]
        return url if url.startswith("http") else "https://" + url

    # Flickr photo pages: flickr.com/photos/<user>/<photo-id>
    flickr_url = first_match(
        r"(?:https?://)?(?:www\.)?flickr\.com/photos/[^/\s]+/\d+"
    )

    # English Wikipedia article links
    wikipedia_url = first_match(
        r"(?:https?://)?(?:www\.)?en\.wikipedia\.org/wiki/[^\s<\])]+"
    )

    return flickr_url, wikipedia_url
|
||||
|
||||
|
||||
def creator_profile_from_flickr_url(flickr_url: str) -> str:
    """Derive the photographer's profile URL from a photo URL.

    e.g. ".../photos/bob/123" -> "https://www.flickr.com/photos/bob".
    Returns "" when no username segment can be found.
    """
    segments = flickr_url.split("/")
    try:
        idx = segments.index("photos")
    except ValueError:
        return ""
    if idx + 1 >= len(segments):
        return ""
    return f"https://www.flickr.com/photos/{segments[idx + 1]}"
|
||||
307
main.py
307
main.py
|
|
@ -9,14 +9,17 @@ import sys
|
|||
import time
|
||||
import traceback
|
||||
import typing
|
||||
from pathlib import Path
|
||||
from urllib.parse import quote, unquote
|
||||
|
||||
import flask
|
||||
import requests
|
||||
import werkzeug
|
||||
from sqlalchemy import func
|
||||
from werkzeug.debug.tbtools import DebugTraceback
|
||||
|
||||
from flickr_mail.database import get_session
|
||||
from flickr_mail.models import FlickrUpload, SentMessage, ThumbnailCache
|
||||
from flickr_mail.url_utils import extract_urls_from_message, normalize_flickr_url
|
||||
|
||||
import re
|
||||
|
||||
|
|
@ -26,18 +29,6 @@ app.debug = False
|
|||
|
||||
enwiki = "en.wikipedia.org/wiki/"
|
||||
|
||||
# Path to Commons contributions data and sent mail
|
||||
COMMONS_UPLOADS_FILE = (
|
||||
Path(__file__).parent / "commons_contributions" / "flickr_uploads.json"
|
||||
)
|
||||
COMMONS_CACHE_FILE = (
|
||||
Path(__file__).parent / "commons_contributions" / "thumbnail_cache.json"
|
||||
)
|
||||
SENT_MAIL_DIR = Path(__file__).parent / "sent_mail" / "messages"
|
||||
SENT_MAIL_INDEX_FILE = Path(__file__).parent / "sent_mail" / "messages_index.json"
|
||||
SENT_MAIL_INDEX_CACHE = (
|
||||
Path(__file__).parent / "commons_contributions" / "sent_mail_index.json"
|
||||
)
|
||||
COMMONS_CACHE_MAX_AGE = 86400 * 7 # Cache for 7 days
|
||||
RECENT_UPLOADS_COUNT = 24
|
||||
|
||||
|
|
@ -165,132 +156,6 @@ class CommonsUpload:
|
|||
return "Wikidata item" if self.is_wikidata_item else "Wikipedia article"
|
||||
|
||||
|
||||
def normalize_flickr_url(url: str) -> str:
|
||||
"""Normalize a Flickr photo URL for comparison."""
|
||||
# Remove protocol
|
||||
url = url.replace("https://", "").replace("http://", "")
|
||||
# Remove www.
|
||||
url = url.replace("www.", "")
|
||||
# Remove trailing slash
|
||||
url = url.rstrip("/")
|
||||
# Ensure it starts with flickr.com
|
||||
if not url.startswith("flickr.com"):
|
||||
return ""
|
||||
return url
|
||||
|
||||
|
||||
def extract_urls_from_message(body: str) -> tuple[str, str]:
|
||||
"""Extract flickr URL and Wikipedia URL from message body."""
|
||||
|
||||
flickr_url = ""
|
||||
wikipedia_url = ""
|
||||
|
||||
# Find flickr photo URLs
|
||||
flickr_pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/[^/\s]+/\d+"
|
||||
flickr_matches = re.findall(flickr_pattern, body)
|
||||
if flickr_matches:
|
||||
flickr_url = flickr_matches[0]
|
||||
if not flickr_url.startswith("http"):
|
||||
flickr_url = "https://" + flickr_url
|
||||
|
||||
# Find Wikipedia URLs
|
||||
wiki_pattern = r"(?:https?://)?(?:www\.)?en\.wikipedia\.org/wiki/[^\s<\])]+"
|
||||
wiki_matches = re.findall(wiki_pattern, body)
|
||||
if wiki_matches:
|
||||
wikipedia_url = wiki_matches[0]
|
||||
if not wikipedia_url.startswith("http"):
|
||||
wikipedia_url = "https://" + wikipedia_url
|
||||
|
||||
return flickr_url, wikipedia_url
|
||||
|
||||
|
||||
def build_sent_mail_index() -> dict[str, dict[str, str]]:
|
||||
"""Build an index of sent mail: normalized_flickr_url -> {wikipedia_url, recipient}."""
|
||||
if not SENT_MAIL_DIR.exists():
|
||||
return {}
|
||||
|
||||
# Check if we have a cached index
|
||||
if SENT_MAIL_INDEX_CACHE.exists():
|
||||
try:
|
||||
with open(SENT_MAIL_INDEX_CACHE) as f:
|
||||
cache = json.load(f)
|
||||
# Check if cache is still valid (compare file count)
|
||||
json_files = list(SENT_MAIL_DIR.glob("*.json"))
|
||||
if cache.get("file_count") == len(json_files):
|
||||
return cache.get("index", {})
|
||||
except (json.JSONDecodeError, OSError):
|
||||
pass
|
||||
|
||||
index: dict[str, dict[str, str]] = {}
|
||||
json_files = list(SENT_MAIL_DIR.glob("*.json"))
|
||||
|
||||
for json_file in json_files:
|
||||
try:
|
||||
with open(json_file) as f:
|
||||
message = json.load(f)
|
||||
except (json.JSONDecodeError, OSError):
|
||||
continue
|
||||
|
||||
# Skip replies - we want original requests
|
||||
subject = message.get("subject", "")
|
||||
if subject.startswith("Re:"):
|
||||
continue
|
||||
|
||||
body = message.get("body", "")
|
||||
flickr_url, wikipedia_url = extract_urls_from_message(body)
|
||||
|
||||
if not flickr_url:
|
||||
continue
|
||||
|
||||
normalized = normalize_flickr_url(flickr_url)
|
||||
if not normalized:
|
||||
continue
|
||||
|
||||
# Extract creator profile URL from flickr URL
|
||||
# flickr.com/photos/username/12345 -> flickr.com/photos/username
|
||||
parts = flickr_url.split("/")
|
||||
creator_profile = ""
|
||||
for i, part in enumerate(parts):
|
||||
if part == "photos" and i + 1 < len(parts):
|
||||
username = parts[i + 1]
|
||||
creator_profile = f"https://www.flickr.com/photos/{username}"
|
||||
break
|
||||
|
||||
index[normalized] = {
|
||||
"wikipedia_url": wikipedia_url,
|
||||
"creator_profile_url": creator_profile,
|
||||
"recipient": message.get("recipient", ""),
|
||||
}
|
||||
|
||||
# Cache the index
|
||||
try:
|
||||
with open(SENT_MAIL_INDEX_CACHE, "w") as f:
|
||||
json.dump({"file_count": len(json_files), "index": index}, f)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
return index
|
||||
|
||||
|
||||
def load_commons_thumbnail_cache() -> dict[str, typing.Any]:
    """Read the Commons thumbnail cache from disk.

    Returns an empty cache structure ({"timestamp": 0, "thumbnails": {}})
    when the cache file is missing, unreadable, or not valid JSON.
    """
    empty: dict[str, typing.Any] = {"timestamp": 0, "thumbnails": {}}
    if not COMMONS_CACHE_FILE.exists():
        return empty
    try:
        with open(COMMONS_CACHE_FILE) as fh:
            data = json.load(fh)
    except (json.JSONDecodeError, OSError):
        # Treat a corrupt or unreadable cache the same as a missing one.
        return empty
    return typing.cast(dict[str, typing.Any], data)
|
||||
|
||||
|
||||
def save_commons_thumbnail_cache(cache: dict[str, typing.Any]) -> None:
    """Write the thumbnail cache to disk.

    Persistence is best-effort: any OSError (permissions, disk full) is
    swallowed, because a stale or missing cache only costs a re-fetch.
    """
    try:
        with open(COMMONS_CACHE_FILE, "w") as out:
            json.dump(cache, out)
    except OSError:
        pass  # never let a cache write failure propagate to callers
|
||||
|
||||
|
||||
def fetch_commons_thumbnails(titles: list[str]) -> dict[str, str]:
|
||||
"""Fetch thumbnail URLs from Commons API for the given file titles."""
|
||||
|
|
@ -340,79 +205,72 @@ def get_recent_commons_uploads() -> tuple[list[CommonsUpload], int]:
|
|||
Returns a tuple of (uploads_list, total_count) where total_count is the total number
|
||||
of uploads obtained via Flickr mail (not just the ones returned).
|
||||
"""
|
||||
if not COMMONS_UPLOADS_FILE.exists():
|
||||
return [], 0
|
||||
|
||||
session = get_session()
|
||||
try:
|
||||
with open(COMMONS_UPLOADS_FILE) as f:
|
||||
all_uploads = json.load(f)
|
||||
except (json.JSONDecodeError, OSError):
|
||||
return [], 0
|
||||
|
||||
# Build sent mail index
|
||||
sent_mail_index = build_sent_mail_index()
|
||||
|
||||
# Filter uploads to only those with matching sent mail
|
||||
# Count all matches, but only keep RECENT_UPLOADS_COUNT for display
|
||||
uploads_with_mail: list[dict[str, typing.Any]] = []
|
||||
total_matched = 0
|
||||
for upload in all_uploads:
|
||||
flickr_url = upload.get("flickr_url", "")
|
||||
normalized = normalize_flickr_url(flickr_url)
|
||||
if normalized and normalized in sent_mail_index:
|
||||
total_matched += 1
|
||||
if len(uploads_with_mail) < RECENT_UPLOADS_COUNT:
|
||||
upload["_mail_info"] = sent_mail_index[normalized]
|
||||
uploads_with_mail.append(upload)
|
||||
|
||||
if not uploads_with_mail:
|
||||
return [], 0
|
||||
|
||||
# Load cache and check if it's still valid
|
||||
cache = load_commons_thumbnail_cache()
|
||||
cache_age = time.time() - cache.get("timestamp", 0)
|
||||
cached_thumbs = cache.get("thumbnails", {})
|
||||
|
||||
# Find which titles need fetching
|
||||
titles = [u["title"] for u in uploads_with_mail]
|
||||
titles_to_fetch = [t for t in titles if t not in cached_thumbs]
|
||||
|
||||
# Fetch missing thumbnails or refresh if cache is old
|
||||
if titles_to_fetch or cache_age > COMMONS_CACHE_MAX_AGE:
|
||||
new_thumbs = fetch_commons_thumbnails(
|
||||
titles if cache_age > COMMONS_CACHE_MAX_AGE else titles_to_fetch
|
||||
query = (
|
||||
session.query(FlickrUpload, SentMessage)
|
||||
.join(SentMessage)
|
||||
.order_by(FlickrUpload.timestamp.desc())
|
||||
)
|
||||
cached_thumbs.update(new_thumbs)
|
||||
cache = {"timestamp": time.time(), "thumbnails": cached_thumbs}
|
||||
save_commons_thumbnail_cache(cache)
|
||||
total_matched = query.count()
|
||||
if total_matched == 0:
|
||||
return [], 0
|
||||
|
||||
# Build the result list
|
||||
result: list[CommonsUpload] = []
|
||||
for upload in uploads_with_mail:
|
||||
title = upload["title"]
|
||||
thumb_url = cached_thumbs.get(title, "")
|
||||
if not thumb_url:
|
||||
continue
|
||||
recent = query.limit(RECENT_UPLOADS_COUNT).all()
|
||||
|
||||
mail_info = upload.get("_mail_info", {})
|
||||
# Get thumbnails from cache
|
||||
titles = [upload.title for upload, msg in recent]
|
||||
now = int(time.time())
|
||||
cached = {
|
||||
tc.title: tc
|
||||
for tc in session.query(ThumbnailCache)
|
||||
.filter(ThumbnailCache.title.in_(titles))
|
||||
.all()
|
||||
}
|
||||
|
||||
# Convert title to Commons URL
|
||||
commons_url = f"https://commons.wikimedia.org/wiki/{title.replace(' ', '_')}"
|
||||
# Find titles needing fetch (missing or expired)
|
||||
titles_to_fetch = [
|
||||
t for t in titles
|
||||
if t not in cached or (now - (cached[t].fetched_at or 0)) > COMMONS_CACHE_MAX_AGE
|
||||
]
|
||||
|
||||
result.append(
|
||||
CommonsUpload(
|
||||
title=title.replace("File:", "").rsplit(".", 1)[0],
|
||||
thumb_url=thumb_url,
|
||||
commons_url=commons_url,
|
||||
flickr_url=upload.get("flickr_url", ""),
|
||||
creator=upload.get("creator") or "Unknown",
|
||||
timestamp=upload.get("timestamp", "")[:10],
|
||||
wikipedia_url=mail_info.get("wikipedia_url", ""),
|
||||
creator_profile_url=mail_info.get("creator_profile_url", ""),
|
||||
if titles_to_fetch:
|
||||
new_thumbs = fetch_commons_thumbnails(titles_to_fetch)
|
||||
for title, thumb_url in new_thumbs.items():
|
||||
existing = cached.get(title)
|
||||
if existing:
|
||||
existing.thumb_url = thumb_url
|
||||
existing.fetched_at = now
|
||||
else:
|
||||
tc = ThumbnailCache(title=title, thumb_url=thumb_url, fetched_at=now)
|
||||
session.add(tc)
|
||||
cached[title] = tc
|
||||
session.commit()
|
||||
|
||||
result: list[CommonsUpload] = []
|
||||
for upload, msg in recent:
|
||||
thumb_url = cached[upload.title].thumb_url if upload.title in cached else ""
|
||||
if not thumb_url:
|
||||
continue
|
||||
|
||||
commons_url = f"https://commons.wikimedia.org/wiki/{upload.title.replace(' ', '_')}"
|
||||
|
||||
result.append(
|
||||
CommonsUpload(
|
||||
title=upload.title.replace("File:", "").rsplit(".", 1)[0],
|
||||
thumb_url=thumb_url,
|
||||
commons_url=commons_url,
|
||||
flickr_url=upload.flickr_url or "",
|
||||
creator=upload.creator or "Unknown",
|
||||
timestamp=(upload.timestamp or "")[:10],
|
||||
wikipedia_url=upload.wikipedia_url or "",
|
||||
creator_profile_url=upload.creator_profile_url or "",
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
return result, total_matched
|
||||
return result, total_matched
|
||||
finally:
|
||||
session.close()
|
||||
|
||||
|
||||
def get_previous_messages(flickr_user: str, flickr_username: str) -> list[dict]:
    """Return sent messages previously addressed to this Flickr user.

    Checks both the display name (flickr_user) and username (flickr_username)
    against the recipient field, case-insensitively, via the sent_messages
    table.
    """
    # Collect the lower-cased candidate names to match against recipients.
    names = set()
    if flickr_user:
        names.add(flickr_user.lower())
    if flickr_username:
        names.add(flickr_username.lower())
    if not names:
        return []

    session = get_session()
    try:
        matched = (
            session.query(SentMessage)
            .filter(func.lower(SentMessage.recipient).in_(names))
            .all()
        )
        return [
            {
                "message_id": m.message_id,
                "subject": m.subject,
                "url": m.url,
                "recipient": m.recipient,
                "date": m.date,
            }
            for m in matched
        ]
    finally:
        session.close()
||||
|
||||
|
||||
def parse_category_input(category_input: str) -> str | None:
|
||||
|
|
|
|||
233
migrate_json_to_db.py
Normal file
233
migrate_json_to_db.py
Normal file
|
|
@ -0,0 +1,233 @@
|
|||
#!/usr/bin/env python3
|
||||
"""One-time migration from JSON files to SQLite database."""
|
||||
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from flickr_mail.database import init_db, get_session
|
||||
from flickr_mail.models import Contribution, FlickrUpload, SentMessage, ThumbnailCache
|
||||
from flickr_mail.url_utils import (
|
||||
creator_profile_from_flickr_url,
|
||||
extract_urls_from_message,
|
||||
normalize_flickr_url,
|
||||
)
|
||||
|
||||
COMMONS_DIR = Path(__file__).parent / "commons_contributions"
|
||||
SENT_MAIL_DIR = Path(__file__).parent / "sent_mail" / "messages"
|
||||
SENT_MAIL_INDEX = Path(__file__).parent / "sent_mail" / "messages_index.json"
|
||||
CONTRIBUTIONS_FILE = COMMONS_DIR / "contributions.json"
|
||||
FLICKR_UPLOADS_FILE = COMMONS_DIR / "flickr_uploads.json"
|
||||
THUMBNAIL_CACHE_FILE = COMMONS_DIR / "thumbnail_cache.json"
|
||||
|
||||
|
||||
def migrate_contributions(session) -> int:
    """Migrate contributions.json to contributions table.

    Returns the number of rows in the table after the import
    (0 when the source JSON file does not exist).
    """
    if not CONTRIBUTIONS_FILE.exists():
        print("No contributions.json found, skipping")
        return 0

    with open(CONTRIBUTIONS_FILE) as fh:
        payload = json.load(fh)

    rows = payload.get("contributions", [])
    print(f"Migrating {len(rows)} contributions...")

    for row in rows:
        contribution = Contribution(
            userid=row.get("userid"),
            user=row.get("user"),
            pageid=row.get("pageid"),
            revid=row.get("revid"),
            parentid=row.get("parentid"),
            ns=row.get("ns"),
            title=row.get("title"),
            timestamp=row.get("timestamp"),
            minor=row.get("minor"),
            top=row.get("top"),
            comment=row.get("comment"),
            size=row.get("size"),
            sizediff=row.get("sizediff"),
            # tags is a list in the JSON; store it JSON-encoded in a text column
            tags=json.dumps(row.get("tags", [])),
        )
        session.add(contribution)

    session.flush()
    count = session.query(Contribution).count()
    print(f" -> {count} contributions migrated")
    return count
|
||||
|
||||
|
||||
def migrate_sent_messages(session) -> dict[str, str]:
    """Migrate sent messages to sent_messages table.

    Reads the message index, loads each full message file when available
    (falling back to the index metadata otherwise), extracts the Flickr and
    Wikipedia URLs from the body, and inserts one SentMessage row per message.

    Returns a dict of normalized_flickr_url -> message_id for FK linking.
    Replies (subject starting with "Re:") are excluded from that map so
    uploads link to the original request rather than a follow-up.
    """
    if not SENT_MAIL_INDEX.exists():
        print("No messages_index.json found, skipping")
        return {}

    with open(SENT_MAIL_INDEX) as f:
        index = json.load(f)

    print(f"Migrating {len(index)} sent messages...")

    url_to_message_id: dict[str, str] = {}

    for msg_meta in index:
        msg_id = msg_meta.get("message_id", "")
        if not msg_id:
            continue

        # Load the full message from individual file
        msg_file = SENT_MAIL_DIR / f"{msg_id}.json"
        if msg_file.exists():
            with open(msg_file) as f:
                msg = json.load(f)
        else:
            msg = msg_meta

        body = msg.get("body", "")
        subject = msg.get("subject", "")

        # Extract URLs from body
        flickr_url, wikipedia_url = extract_urls_from_message(body)
        normalized = normalize_flickr_url(flickr_url) if flickr_url else ""

        # Extract creator profile URL
        creator_profile_url = creator_profile_from_flickr_url(flickr_url) if flickr_url else ""

        session.add(SentMessage(
            message_id=msg_id,
            # reuse the subject read above instead of a second dict lookup
            subject=subject,
            url=msg.get("url", ""),
            recipient=msg.get("recipient", ""),
            date=msg.get("date", ""),
            body=body,
            body_html=msg.get("body_html", ""),
            flickr_url=flickr_url,
            normalized_flickr_url=normalized,
            wikipedia_url=wikipedia_url,
            creator_profile_url=creator_profile_url,
        ))

        # Build URL -> message_id map for FK linking (skip replies)
        if normalized and not subject.startswith("Re:"):
            url_to_message_id[normalized] = msg_id

    session.flush()
    actual = session.query(SentMessage).count()
    print(f" -> {actual} sent messages migrated")
    print(f" -> {len(url_to_message_id)} unique flickr URLs indexed for FK linking")
    return url_to_message_id
|
||||
|
||||
|
||||
def migrate_flickr_uploads(session, url_to_message_id: dict[str, str]) -> int:
    """Migrate flickr_uploads.json to flickr_uploads table with FK linking.

    Each upload is linked (via sent_message_id) to the sent message whose
    normalized Flickr URL matches, and the wikipedia/creator-profile URLs
    are copied from that message. Returns the number of rows migrated.
    """
    if not FLICKR_UPLOADS_FILE.exists():
        print("No flickr_uploads.json found, skipping")
        return 0

    with open(FLICKR_UPLOADS_FILE) as f:
        uploads = json.load(f)

    print(f"Migrating {len(uploads)} flickr uploads...")

    linked = 0
    for u in uploads:
        flickr_url = u.get("flickr_url", "")
        normalized = normalize_flickr_url(flickr_url)

        # Look up sent message FK; fetch the message once and derive the
        # copied URL fields from it, instead of a second conditional block.
        sent_message_id = url_to_message_id.get(normalized) if normalized else None
        msg = session.get(SentMessage, sent_message_id) if sent_message_id else None
        if sent_message_id:
            linked += 1

        session.add(FlickrUpload(
            pageid=u.get("pageid"),
            revid=u.get("revid"),
            title=u.get("title"),
            timestamp=u.get("timestamp"),
            flickr_url=flickr_url,
            normalized_flickr_url=normalized,
            creator=u.get("creator"),
            wikipedia_url=(msg.wikipedia_url or "") if msg else "",
            creator_profile_url=(msg.creator_profile_url or "") if msg else "",
            sent_message_id=sent_message_id,
        ))

    session.flush()
    count = session.query(FlickrUpload).count()
    print(f" -> {count} flickr uploads migrated")
    print(f" -> {linked} linked to sent messages")
    return count
|
||||
|
||||
|
||||
def migrate_thumbnail_cache(session) -> int:
    """Migrate thumbnail_cache.json to thumbnail_cache table."""
    if not THUMBNAIL_CACHE_FILE.exists():
        print("No thumbnail_cache.json found, skipping")
        return 0

    with open(THUMBNAIL_CACHE_FILE) as fh:
        cache = json.load(fh)

    thumbnails = cache.get("thumbnails", {})
    # The JSON cache had one global fetch timestamp; apply it to every row.
    fetched = int(cache.get("timestamp", 0))

    print(f"Migrating {len(thumbnails)} cached thumbnails...")

    session.add_all([
        ThumbnailCache(title=title, thumb_url=url, fetched_at=fetched)
        for title, url in thumbnails.items()
    ])

    session.flush()
    count = session.query(ThumbnailCache).count()
    print(f" -> {count} thumbnail cache entries migrated")
    return count
|
||||
|
||||
|
||||
def main() -> None:
    """Run the one-time JSON -> SQLite migration as a single transaction."""
    print("Initializing database...")
    init_db()

    session = get_session()
    try:
        # Check if already migrated
        existing = session.query(Contribution).count()
        if existing > 0:
            print(f"Database already contains {existing} contributions. Aborting.")
            print("Delete flickr_mail.db to re-run migration.")
            return

        migrate_contributions(session)
        mail_url_index = migrate_sent_messages(session)
        migrate_flickr_uploads(session, mail_url_index)
        migrate_thumbnail_cache(session)

        session.commit()
        print("\nMigration complete!")
    except Exception:
        # Roll back everything on failure so a partial import never
        # masquerades as a completed migration.
        session.rollback()
        raise
    finally:
        session.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Find UploadWizard contributions that are from Flickr and add them to flickr_uploads.json.
|
||||
Find UploadWizard contributions that are from Flickr and add them to the database.
|
||||
|
||||
For contributions with comment 'User created page with UploadWizard', queries the
|
||||
Commons API to check if the image source is Flickr (by checking the Credit field).
|
||||
|
|
@ -9,12 +9,13 @@ Commons API to check if the image source is Flickr (by checking the Credit field
|
|||
import json
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
CONTRIBUTIONS_FILE = Path("commons_contributions/contributions.json")
|
||||
FLICKR_UPLOADS_FILE = Path("commons_contributions/flickr_uploads.json")
|
||||
from flickr_mail.database import init_db, get_session
|
||||
from flickr_mail.models import Contribution, FlickrUpload, SentMessage
|
||||
from flickr_mail.url_utils import normalize_flickr_url
|
||||
|
||||
COMMONS_API = "https://commons.wikimedia.org/w/api.php"
|
||||
USER_AGENT = "FlickrMail/1.0 (https://edwardbetts.com/flickr_mail/; edward@4angle.com)"
|
||||
|
||||
|
|
@ -75,99 +76,101 @@ def clean_artist_name(artist_html: str) -> str:
|
|||
|
||||
|
||||
def main():
    """Scan UploadWizard contributions for Flickr-sourced images.

    Queries the Commons API in batches for each "User created page with
    UploadWizard" File: contribution, extracts the Flickr source URL from
    the Credit field, and inserts new FlickrUpload rows (linked to sent
    messages where a matching normalized Flickr URL exists).
    """
    init_db()
    session = get_session()

    try:
        # Get existing normalized flickr URLs to avoid duplicates
        existing_urls = {
            r[0] for r in session.query(FlickrUpload.normalized_flickr_url).all()
            if r[0]
        }
        print(f"Existing uploads: {session.query(FlickrUpload).count()}")
        print(f"Existing flickr URLs: {len(existing_urls)}")

        # Build sent message index for FK linking
        sent_messages = (
            session.query(SentMessage)
            .filter(SentMessage.normalized_flickr_url != "")
            .filter(~SentMessage.subject.startswith("Re:"))
            .all()
        )
        url_to_message = {m.normalized_flickr_url: m for m in sent_messages}

        # Find UploadWizard contributions (page creations only)
        upload_wizard = (
            session.query(Contribution)
            .filter(Contribution.comment == "User created page with UploadWizard")
            .filter(Contribution.title.startswith("File:"))
            .all()
        )

        print(f"UploadWizard contributions to check: {len(upload_wizard)}")

        # Process in batches of 50
        new_count = 0
        batch_size = 50

        for start in range(0, len(upload_wizard), batch_size):
            batch = upload_wizard[start : start + batch_size]
            titles = [c.title for c in batch]
            print(
                f"Processing batch {start // batch_size + 1}/"
                f"{(len(upload_wizard) + batch_size - 1) // batch_size}..."
            )
            metadata = get_image_metadata(titles)

            for c in batch:
                meta = metadata.get(c.title, {})
                credit = meta.get("credit", "")
                artist = meta.get("artist", "")

                flickr_url = extract_flickr_url_from_credit(credit)
                if not flickr_url:
                    continue

                normalized = normalize_flickr_url(flickr_url)
                if normalized in existing_urls:
                    continue

                creator = clean_artist_name(artist) if artist else None

                # Look up sent message for FK linking
                msg = url_to_message.get(normalized) if normalized else None

                session.add(FlickrUpload(
                    pageid=c.pageid,
                    revid=c.revid,
                    title=c.title,
                    timestamp=c.timestamp,
                    flickr_url=flickr_url,
                    normalized_flickr_url=normalized,
                    creator=creator,
                    wikipedia_url=msg.wikipedia_url if msg else "",
                    creator_profile_url=msg.creator_profile_url if msg else "",
                    sent_message_id=msg.message_id if msg else None,
                ))
                new_count += 1
                existing_urls.add(normalized)
                print(f"  Found: {c.title[:50]} -> {flickr_url}")
            session.commit()

            # Rate limiting
            if start + batch_size < len(upload_wizard):
                time.sleep(0.5)

        total = session.query(FlickrUpload).count()
        print(f"\nFound {new_count} new Flickr uploads")
        print(f"Total: {total} uploads in database")
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue