Extract flickr_mail package with Mapped models and shared utilities
Move from JSON file storage to SQLite database using SQLAlchemy with Mapped type hints. Deduplicate URL utility functions into shared flickr_mail package. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
ac1b01ea68
commit
9f0fb01878
11 changed files with 1129 additions and 300 deletions
147
download_commons_contributions.py
Executable file
147
download_commons_contributions.py
Executable file
|
|
@ -0,0 +1,147 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Download Wikimedia Commons contributions for a user."""
|
||||
|
||||
import json
|
||||
import time
|
||||
|
||||
import requests
|
||||
|
||||
from flickr_mail.database import init_db, get_session
|
||||
from flickr_mail.models import Contribution
|
||||
|
||||
|
||||
# MediaWiki API endpoint for Wikimedia Commons.
API_URL = "https://commons.wikimedia.org/w/api.php"
# Commons username whose contributions are downloaded.
USERNAME = "Edward"

# Identify ourselves properly to Wikimedia
# (tool name/version plus a contact address, per the Wikimedia User-Agent policy).
USER_AGENT = "CommonsContributionsDownloader/0.1 (edward@4angle.com)"

# One shared session so every request reuses the connection pool and
# carries the User-Agent header set below.
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": USER_AGENT})
|
||||
|
||||
|
||||
def fetch_contributions(
    continue_token: str | None = None,
) -> tuple[list[dict], str | None]:
    """Fetch one batch of contributions from the MediaWiki API.

    Args:
        continue_token: Opaque ``uccontinue`` token returned by a previous
            call, or None to start from the newest contributions.

    Returns:
        A tuple ``(contributions, next_continue_token)`` where the token is
        None when no further pages are available.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    params = {
        "action": "query",
        "list": "usercontribs",
        "ucuser": USERNAME,
        "uclimit": "500",
        "ucprop": "ids|title|timestamp|comment|size|sizediff|flags|tags",
        "format": "json",
    }

    if continue_token:
        params["uccontinue"] = continue_token

    # An explicit timeout keeps the script from hanging indefinitely on a
    # stalled connection -- requests has no default timeout.
    response = SESSION.get(API_URL, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()

    contributions = data.get("query", {}).get("usercontribs", [])

    # The API includes "continue"/"uccontinue" only when more results exist.
    new_continue = data.get("continue", {}).get("uccontinue")

    return contributions, new_continue
|
||||
|
||||
|
||||
def upsert_contribution(session, c: dict) -> None:
    """Insert a contribution row unless its revision is already stored.

    Despite the name, existing rows are never modified: a revision is
    immutable once recorded, so a matching ``revid`` is simply skipped.

    Args:
        session: An open SQLAlchemy session; the caller is responsible for
            committing.
        c: One ``usercontribs`` entry from the MediaWiki API. Must contain
            a ``revid`` key.
    """
    existing = session.query(Contribution).filter_by(revid=c["revid"]).first()
    if existing:
        return  # Already have this revision

    session.add(Contribution(
        userid=c.get("userid"),
        user=c.get("user"),
        pageid=c.get("pageid"),
        # revid is the dedup key queried above, so access it as a required
        # key (consistent with the filter) rather than via .get(), which
        # could otherwise insert an undedupable NULL revid.
        revid=c["revid"],
        parentid=c.get("parentid"),
        ns=c.get("ns"),
        title=c.get("title"),
        timestamp=c.get("timestamp"),
        minor=c.get("minor"),
        top=c.get("top"),
        comment=c.get("comment"),
        size=c.get("size"),
        sizediff=c.get("sizediff"),
        # Tags arrive as a list; serialise to JSON text for storage.
        tags=json.dumps(c.get("tags", [])),
    ))
|
||||
|
||||
|
||||
def main() -> None:
    """Main entry point.

    Initialises the database, then pages through the user's contributions
    newest-first, stopping as soon as a whole batch is already stored
    (incremental resume) or the API reports no further pages.
    """
    init_db()
    session = get_session()

    try:
        existing_count = session.query(Contribution).count()

        # Get the latest timestamp to know where to resume from
        latest = (
            session.query(Contribution)
            .order_by(Contribution.timestamp.desc())
            .first()
        )

        if existing_count > 0 and latest:
            print(f"Database has {existing_count} contributions")
            print(f"Latest: {latest.timestamp}")
            print("Fetching new contributions...")
        else:
            print(f"Downloading contributions for user: {USERNAME}")

        batch_num = 0       # batches fetched so far (for progress output)
        new_count = 0       # rows inserted across all batches
        continue_token = None   # uccontinue token; None means first page

        while True:
            batch_num += 1
            print(f" Fetching batch {batch_num}...", end=" ", flush=True)

            contributions, continue_token = fetch_contributions(continue_token)

            if not contributions:
                print("no results")
                break

            batch_new = 0
            for c in contributions:
                # Skip revisions we already have. upsert_contribution()
                # performs the same check, but checking here lets us count
                # only genuinely new rows in batch_new, which drives the
                # caught-up detection below.
                # NOTE(review): this means each item is queried twice
                # (here and inside upsert_contribution) -- harmless but
                # redundant.
                existing = session.query(Contribution).filter_by(revid=c["revid"]).first()
                if existing:
                    continue
                upsert_contribution(session, c)
                batch_new += 1

            new_count += batch_new
            print(f"got {len(contributions)}, {batch_new} new")

            # Commit per batch so an interrupted run keeps its progress.
            session.commit()

            if batch_new == 0:
                # All contributions in this batch already exist, we're caught up
                # (safe because the API returns contributions newest-first).
                print(" Caught up with existing data")
                break

            if not continue_token:
                break

            # Be polite to the API
            time.sleep(0.5)

        total = session.query(Contribution).count()
        print(f"\nDone! {new_count} new contributions, {total} total in database")

    except Exception:
        # Roll back any uncommitted batch before propagating the error.
        session.rollback()
        raise
    finally:
        session.close()
|
||||
|
||||
|
||||
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue