Compare commits
No commits in common. "9f0fb018782800bee18b8cd0894e453cd869b8e6" and "c5efd429ce8e27a7fc9eea95a9970b61d18c5bac" have entirely different histories.
9f0fb01878
...
c5efd429ce
13 changed files with 321 additions and 1201 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -2,4 +2,3 @@
|
||||||
__pycache__
|
__pycache__
|
||||||
commons_contributions/thumbnail_cache.json
|
commons_contributions/thumbnail_cache.json
|
||||||
commons_contributions/sent_mail_index.json
|
commons_contributions/sent_mail_index.json
|
||||||
flickr_mail.db
|
|
||||||
|
|
|
||||||
65
AGENTS.md
65
AGENTS.md
|
|
@ -14,9 +14,7 @@ licensing.
|
||||||
- **templates/**: Jinja2 templates using Bootstrap 5 for styling
|
- **templates/**: Jinja2 templates using Bootstrap 5 for styling
|
||||||
- `base.html`: Base template with Bootstrap CSS/JS
|
- `base.html`: Base template with Bootstrap CSS/JS
|
||||||
- `combined.html`: Main UI template for search, results, and message composition
|
- `combined.html`: Main UI template for search, results, and message composition
|
||||||
- `message.jinja`: Template for the permission request message body (with
|
- `message.jinja`: Template for the permission request message body
|
||||||
alternate text for non-free CC licenses)
|
|
||||||
- `category.html`: Category search page with visited link styling
|
|
||||||
- `show_error.html`: Error display template
|
- `show_error.html`: Error display template
|
||||||
|
|
||||||
## Key Components
|
## Key Components
|
||||||
|
|
@ -50,22 +48,10 @@ Represents a photo with:
|
||||||
|
|
||||||
### License Codes
|
### License Codes
|
||||||
|
|
||||||
Flickr uses numeric codes for licenses. Codes 1-6 are CC 2.0, codes 11-16 are
|
Wikipedia-compatible licenses (can be used): 4 (CC BY), 5 (CC BY-SA), 7 (No
|
||||||
CC 4.0 equivalents.
|
known copyright), 8 (US Government), 9 (CC0), 10 (Public Domain).
|
||||||
|
|
||||||
Wikipedia-compatible (`FREE_LICENSES`): 4 (CC BY 2.0), 5 (CC BY-SA 2.0),
|
Not compatible: 0 (All Rights Reserved), 1-3 (NC variants), 6 (ND).
|
||||||
7 (No known copyright), 8 (US Government), 9 (CC0), 10 (Public Domain),
|
|
||||||
14 (CC BY 4.0), 15 (CC BY-SA 4.0).
|
|
||||||
|
|
||||||
Non-free CC (`NONFREE_CC_LICENSES`): 1 (CC BY-NC-SA 2.0), 2 (CC BY-NC 2.0),
|
|
||||||
3 (CC BY-NC-ND 2.0), 6 (CC BY-ND 2.0), 11-13 (4.0 NC variants),
|
|
||||||
16 (CC BY-ND 4.0).
|
|
||||||
|
|
||||||
Not compatible: 0 (All Rights Reserved).
|
|
||||||
|
|
||||||
For free licenses, the message page shows an UploadWizard link instead of a
|
|
||||||
message. For non-free CC licenses, a tailored message explains which
|
|
||||||
restrictions (NC/ND) prevent Wikipedia use.
|
|
||||||
|
|
||||||
### URL Validation (`is_valid_flickr_image_url`)
|
### URL Validation (`is_valid_flickr_image_url`)
|
||||||
|
|
||||||
|
|
@ -108,40 +94,15 @@ Run to find Flickr uploads from UploadWizard contributions that don't have
|
||||||
the Flickr URL in the edit comment. Queries Commons API for image metadata
|
the Flickr URL in the edit comment. Queries Commons API for image metadata
|
||||||
and checks the Credit field for Flickr URLs.
|
and checks the Credit field for Flickr URLs.
|
||||||
|
|
||||||
### Category Search (`/category` route)
|
|
||||||
|
|
||||||
Finds Wikipedia articles in a category that don't have images.
|
|
||||||
|
|
||||||
**Key functions**:
|
|
||||||
- `parse_category_input()`: Accepts category name, `Category:` prefix, or full
|
|
||||||
Wikipedia URL
|
|
||||||
- `get_articles_without_images()`: Uses MediaWiki API with
|
|
||||||
`generator=categorymembers` and `prop=images` for efficient batch queries
|
|
||||||
- `has_content_image()`: Filters out non-content images (UI icons, logos) using
|
|
||||||
`NON_CONTENT_IMAGE_PATTERNS`
|
|
||||||
|
|
||||||
The `cat` URL parameter is preserved through search results and message pages
|
|
||||||
to allow back-navigation to the category.
|
|
||||||
|
|
||||||
### Previous Message Detection (`get_previous_messages`)
|
|
||||||
|
|
||||||
Checks `sent_mail/messages_index.json` for previous messages to a Flickr user.
|
|
||||||
Matches by both display name and username (case-insensitive). Results shown as
|
|
||||||
an info alert on the message page.
|
|
||||||
|
|
||||||
## Request Flow
|
## Request Flow
|
||||||
|
|
||||||
1. User enters Wikipedia article title/URL → `start()` extracts article name.
|
1. User enters Wikipedia article title/URL → `start()` extracts article name
|
||||||
Alternatively, user searches by category via `/category` route.
|
2. `search_flickr()` fetches and parses Flickr search results
|
||||||
2. `search_flickr()` fetches and parses Flickr search results.
|
3. Results displayed as clickable photo grid with license badges
|
||||||
Disambiguation suffixes like "(academic)" are removed for the search.
|
4. User clicks photo → page reloads with `flickr` and `img` params
|
||||||
3. Results displayed as clickable photo grid with license badges.
|
5. `flickr_usrename_to_nsid()` looks up the photographer's NSID
|
||||||
4. User clicks photo → page reloads with `flickr`, `img`, `license`, and
|
6. Message template rendered with photo details
|
||||||
`flickr_user` params.
|
7. User copies message and clicks link to Flickr's mail compose page
|
||||||
5. If license is Wikipedia-compatible: show UploadWizard link.
|
|
||||||
6. Otherwise: `flickr_usrename_to_nsid()` looks up the user's NSID, previous
|
|
||||||
messages are checked, and the appropriate message template is rendered.
|
|
||||||
7. User copies message and clicks link to Flickr's mail compose page.
|
|
||||||
|
|
||||||
## Testing Changes
|
## Testing Changes
|
||||||
|
|
||||||
|
|
@ -162,8 +123,6 @@ print(result.photos[0].title, result.photos[0].license_name)
|
||||||
## Potential Improvements
|
## Potential Improvements
|
||||||
|
|
||||||
- Cache search results to reduce Flickr requests
|
- Cache search results to reduce Flickr requests
|
||||||
- Add filtering by license type in search results
|
- Add filtering by license type
|
||||||
- Handle Flickr rate limiting/blocks more gracefully
|
- Handle Flickr rate limiting/blocks more gracefully
|
||||||
- Add tests for the parsing logic
|
- Add tests for the parsing logic
|
||||||
- Add pagination for category search (continue token is already returned)
|
|
||||||
- Confirm CC 4.0 license codes 11-15 (only 16 confirmed so far)
|
|
||||||
|
|
|
||||||
28
README.md
28
README.md
|
|
@ -22,23 +22,16 @@ photographers on Flickr whose photos can be used to enhance Wikipedia articles.
|
||||||
- **Integrated Flickr search**: Enter a Wikipedia article title and see Flickr
|
- **Integrated Flickr search**: Enter a Wikipedia article title and see Flickr
|
||||||
photos directly in the interface - no need to visit Flickr's search page.
|
photos directly in the interface - no need to visit Flickr's search page.
|
||||||
- **Photo grid with metadata**: Search results display as a grid of thumbnails
|
- **Photo grid with metadata**: Search results display as a grid of thumbnails
|
||||||
showing the user's name and license for each photo.
|
showing the photographer's name and license for each photo.
|
||||||
- **License handling**: Photos with Wikipedia-compatible licenses (CC BY,
|
- **License highlighting**: Photos with Wikipedia-compatible licenses (CC BY,
|
||||||
CC BY-SA, CC0, Public Domain) are highlighted with a green badge and link
|
CC BY-SA, CC0, Public Domain) are highlighted with a green badge.
|
||||||
directly to the Commons UploadWizard. Non-free CC licenses (NC/ND) show a
|
|
||||||
tailored message explaining Wikipedia's requirements. Supports both CC 2.0
|
|
||||||
and CC 4.0 license codes.
|
|
||||||
- **One-click message composition**: Click any photo to compose a permission
|
- **One-click message composition**: Click any photo to compose a permission
|
||||||
request message with the photo displayed alongside, showing the user's Flickr
|
request message with the photo displayed alongside.
|
||||||
profile and current license.
|
|
||||||
- **Previous message detection**: The message page checks sent mail history and
|
|
||||||
warns if you have previously contacted the user.
|
|
||||||
- **Category search**: Find Wikipedia articles without images in a given
|
|
||||||
category, with links to search Flickr for each article.
|
|
||||||
- **Pagination**: Browse through thousands of search results with page navigation.
|
- **Pagination**: Browse through thousands of search results with page navigation.
|
||||||
- **Recent uploads showcase**: The home page displays recent Wikimedia Commons
|
- **Recent uploads showcase**: The home page displays recent Wikimedia Commons
|
||||||
uploads that were obtained via Flickr mail requests, with links to the
|
uploads that were obtained via Flickr mail requests, with links to the
|
||||||
Wikipedia article and user's Flickr profile.
|
Wikipedia article and photographer's Flickr profile.
|
||||||
|
- Generate messages to request permission to use photos on Wikipedia.
|
||||||
- Handle exceptions gracefully and provide detailed error information.
|
- Handle exceptions gracefully and provide detailed error information.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
@ -47,14 +40,11 @@ To use the tool, follow these steps:
|
||||||
|
|
||||||
1. Start the tool by running the script.
|
1. Start the tool by running the script.
|
||||||
2. Access the tool through a web browser.
|
2. Access the tool through a web browser.
|
||||||
3. Enter a Wikipedia article title or URL, or use "Find articles by category"
|
3. Enter the Wikipedia article title or URL.
|
||||||
to discover articles that need images.
|
|
||||||
4. Browse the Flickr search results displayed in the interface.
|
4. Browse the Flickr search results displayed in the interface.
|
||||||
5. Click on a photo to select it. If the license is Wikipedia-compatible, you'll
|
5. Click on a photo to select it and compose a permission request message.
|
||||||
be linked to the Commons UploadWizard. Otherwise, a message is composed to
|
|
||||||
request a license change.
|
|
||||||
6. Copy the subject and message, then click "Send message on Flickr" to contact
|
6. Copy the subject and message, then click "Send message on Flickr" to contact
|
||||||
the user.
|
the photographer.
|
||||||
|
|
||||||
## Error Handling
|
## Error Handling
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,147 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
"""Download Wikimedia Commons contributions for a user."""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import time
|
|
||||||
|
|
||||||
import requests
|
|
||||||
|
|
||||||
from flickr_mail.database import init_db, get_session
|
|
||||||
from flickr_mail.models import Contribution
|
|
||||||
|
|
||||||
|
|
||||||
API_URL = "https://commons.wikimedia.org/w/api.php"
|
|
||||||
USERNAME = "Edward"
|
|
||||||
|
|
||||||
# Identify ourselves properly to Wikimedia
|
|
||||||
USER_AGENT = "CommonsContributionsDownloader/0.1 (edward@4angle.com)"
|
|
||||||
|
|
||||||
SESSION = requests.Session()
|
|
||||||
SESSION.headers.update({"User-Agent": USER_AGENT})
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_contributions(
    continue_token: str | None = None,
) -> tuple[list[dict], str | None]:
    """Fetch one batch of contributions from the Commons API.

    Args:
        continue_token: ``uccontinue`` value returned by a previous call,
            or ``None`` to start from the most recent contributions.

    Returns:
        A ``(contributions, continue_token)`` pair; the token is ``None``
        when no further batches are available.

    Raises:
        requests.HTTPError: if the API responds with an error status.
    """
    params = {
        "action": "query",
        "list": "usercontribs",
        "ucuser": USERNAME,
        "uclimit": "500",
        "ucprop": "ids|title|timestamp|comment|size|sizediff|flags|tags",
        "format": "json",
    }

    if continue_token:
        params["uccontinue"] = continue_token

    # Timeout so a stalled connection cannot hang the whole sync run.
    response = SESSION.get(API_URL, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()

    contributions = data.get("query", {}).get("usercontribs", [])

    # The API includes a "continue" object only when more results exist.
    new_continue = data.get("continue", {}).get("uccontinue")

    return contributions, new_continue
|
|
||||||
|
|
||||||
|
|
||||||
def upsert_contribution(session, c: dict) -> None:
    """Add a contribution row unless its revid is already stored."""
    duplicate = session.query(Contribution).filter_by(revid=c["revid"]).first()
    if duplicate is not None:
        return

    row = Contribution(
        userid=c.get("userid"),
        user=c.get("user"),
        pageid=c.get("pageid"),
        revid=c.get("revid"),
        parentid=c.get("parentid"),
        ns=c.get("ns"),
        title=c.get("title"),
        timestamp=c.get("timestamp"),
        minor=c.get("minor"),
        top=c.get("top"),
        comment=c.get("comment"),
        size=c.get("size"),
        sizediff=c.get("sizediff"),
        # Tags arrive as a list; serialize for the TEXT column.
        tags=json.dumps(c.get("tags", [])),
    )
    session.add(row)
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
    """Sync Commons contributions for USERNAME into the local database."""
    init_db()
    session = get_session()

    try:
        existing_count = session.query(Contribution).count()

        # The newest stored contribution tells us whether this is a resume.
        latest = (
            session.query(Contribution)
            .order_by(Contribution.timestamp.desc())
            .first()
        )

        if existing_count and latest:
            print(f"Database has {existing_count} contributions")
            print(f"Latest: {latest.timestamp}")
            print("Fetching new contributions...")
        else:
            print(f"Downloading contributions for user: {USERNAME}")

        batch_num = 0
        new_count = 0
        continue_token = None

        while True:
            batch_num += 1
            print(f" Fetching batch {batch_num}...", end=" ", flush=True)

            contributions, continue_token = fetch_contributions(continue_token)
            if not contributions:
                print("no results")
                break

            batch_new = 0
            for c in contributions:
                # Skip revisions that are already stored locally.
                if session.query(Contribution).filter_by(revid=c["revid"]).first():
                    continue
                upsert_contribution(session, c)
                batch_new += 1

            new_count += batch_new
            print(f"got {len(contributions)}, {batch_new} new")

            session.commit()

            if batch_new == 0:
                # Everything in this batch was already known: caught up.
                print(" Caught up with existing data")
                break

            if not continue_token:
                break

            # Be polite to the API between batches.
            time.sleep(0.5)

        total = session.query(Contribution).count()
        print(f"\nDone! {new_count} new contributions, {total} total in database")

    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
|
|
@ -1,246 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
"""Download sent FlickrMail messages for backup."""
|
|
||||||
|
|
||||||
import time
|
|
||||||
|
|
||||||
import requests
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
|
|
||||||
from flickr_mail.database import init_db, get_session
|
|
||||||
from flickr_mail.models import SentMessage
|
|
||||||
from flickr_mail.url_utils import (
|
|
||||||
creator_profile_from_flickr_url,
|
|
||||||
extract_urls_from_message,
|
|
||||||
normalize_flickr_url,
|
|
||||||
)
|
|
||||||
|
|
||||||
BASE_URL = "https://www.flickr.com"
|
|
||||||
SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}"
|
|
||||||
MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}"
|
|
||||||
|
|
||||||
HEADERS = {
|
|
||||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0",
|
|
||||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
||||||
"Accept-Language": "en-GB,en;q=0.9",
|
|
||||||
"Accept-Encoding": "gzip, deflate, br, zstd",
|
|
||||||
"DNT": "1",
|
|
||||||
"Connection": "keep-alive",
|
|
||||||
"Upgrade-Insecure-Requests": "1",
|
|
||||||
"Sec-Fetch-Dest": "document",
|
|
||||||
"Sec-Fetch-Mode": "navigate",
|
|
||||||
"Sec-Fetch-Site": "same-origin",
|
|
||||||
"Sec-Fetch-User": "?1",
|
|
||||||
"Priority": "u=0, i",
|
|
||||||
}
|
|
||||||
|
|
||||||
COOKIES_STR = """ccc=%7B%22needsConsent%22%3Atrue%2C%22managed%22%3A0%2C%22changed%22%3A0%2C%22info%22%3A%7B%22cookieBlock%22%3A%7B%22level%22%3A2%2C%22blockRan%22%3A1%7D%7D%7D; _sp_ses.df80=*; _sp_id.df80=968931de-089d-4576-b729-6662c2c13a65.1770187027.1.1770187129..adf2374b-b85c-4899-afb7-63c2203d0c44..9422de57-9cdf-49c9-ac54-183eaa1ec457.1770187027101.24; TAsessionID=7f373c97-e9f8-46cb-bc1a-cb4f164ce46b|NEW; notice_behavior=expressed,eu; usprivacy=1---; acstring=3~550.1942.3126.3005.3077.1329.196.1725.1092; euconsent-v2=CQfGXgAQfGXgAAvACDENCQFsAP_gAEPgAAAALktB9G5cSSFBYCJVYbtEYAQDwFhg4oAhAgABEwAATBoAoIwGBGAoIAiAICACAAAAIARAIAEECAAAQAAAIIABAAAMAEAAIAACIAAACAABAgAACEAIAAggWAAAAEBEAFQAgAAAQBIACFAAAgABAUABAAAAAACAAQAAACAgQAAAAAAAAAAAkAhAAAAAAAAAABAMAAABIAAAAAAAAAAAAAAAAAAABAAAAICBAAAAQAAAAAAAAAAAAAAAAAAAAgqY0H0blxJIUFgIFVhu0QgBBPAWADigCEAAAEDAABMGgCgjAIUYCAgSIAgIAAAAAAgBEAgAQAIAABAAAAAgAEAAAwAQAAgAAAAAAAAAAECAAAAQAgACCBYAAAAQEQAVACBAABAEgAIUAAAAAEBQAEAAAAAAIABAAAAICBAAAAAAAAAAACQCEAAAAAAAAAAEAwBAAEgAAAAAAAAAAAAAAAAAAAEABAAgIEAAABAA.YAAAAAAAAAAA.ILktB9G5cSSFBYCJVYbtEYAQTwFhg4oAhAgABEwAATBoAoIwGFGAoIEiAICACAAAAIARAIAEECAAAQAAAIIABAAAMAEAAIAACIAAACAABAgAACEAIAAggWAAAAEBEAFQAgQAAQBIACFAAAgABAUABAAAAAACAAQAAACAgQAAAAAAAAAAAkAhAAAAAAAAAABAMAQABIAAAAAAAAAAAAAAAAAAABAAQAICBAAAAQAAAAAAAAAAAAAAAAAAAAgA; notice_preferences=2:; notice_gdpr_prefs=0,1,2:; cmapi_gtm_bl=; cmapi_cookie_privacy=permit 1,2,3; AMCV_48E815355BFE96970A495CD0%40AdobeOrg=281789898%7CMCMID%7C44859851125632937290373504988866174366%7CMCOPTOUT-1770194232s%7CNONE%7CvVersion%7C4.1.0; AMCVS_48E815355BFE96970A495CD0%40AdobeOrg=1; xb=646693; localization=en-us%3Buk%3Bgb; flrbp=1770187037-cfbf3914859af9ef68992c8389162e65e81c86c4; flrbgrp=1770187037-8e700fa7d73b4f2d43550f40513e7c6f507fd20f; flrbgdrp=1770187037-9af21cc74000b5f3f0943243608b4284d5f60ffd; flrbgmrp=1770187037-53f7bfff110731954be6bdfb2f587d59a8305670; flrbrst=1770187037-440e42fcee9b4e8e81ba8bc3eb3d0fc8b62e7083; 
flrtags=1770187037-7b50035cb956b9216a2f3372f498f7008d8e26a8; flrbrp=1770187037-c0195dc99caa020d4e32b39556131add862f26a0; flrb=34; session_id=2693fb01-87a0-42b1-a426-74642807b534; cookie_session=834645%3A29f2a9722d8bac88553ea1baf7ea11b4; cookie_accid=834645; cookie_epass=29f2a9722d8bac88553ea1baf7ea11b4; sa=1775371036%3A79962317%40N00%3A8fb60f4760b4840f37af3ebc90a8cb57; vp=2075%2C1177%2C1%2C0; flrbfd=1770187037-88a4e436729c9c5551794483fbd9c80e9dac2354; flrbpap=1770187037-18adaacf3a389df4a7bdc05cd471e492c54ef841; liqpw=2075; liqph=672"""
|
|
||||||
|
|
||||||
|
|
||||||
def parse_cookies(cookie_str: str) -> dict[str, str]:
    """Turn a browser-style ``k=v; k2=v2`` cookie string into a dict.

    Items without an ``=`` are ignored; only the first ``=`` in each item
    splits key from value, so values may themselves contain ``=``.
    """
    pairs = (
        item.split("=", 1)
        for item in cookie_str.split("; ")
        if "=" in item
    )
    return {key: value for key, value in pairs}
|
|
||||||
|
|
||||||
|
|
||||||
def create_session() -> requests.Session:
    """Build a requests session preloaded with browser headers and cookies.

    NOTE(review): COOKIES_STR embeds what look like live session
    credentials; they expire and should not live in version control —
    consider loading them from an untracked file or environment variable.
    """
    session = requests.Session()
    session.headers.update(HEADERS)
    session.cookies.update(parse_cookies(COOKIES_STR))
    return session
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_page(session: requests.Session, url: str) -> BeautifulSoup:
    """GET *url* with *session* and return the parsed HTML document.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    # Timeout so a stalled connection cannot hang the whole scrape.
    response = session.get(url, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")
|
|
||||||
|
|
||||||
|
|
||||||
def extract_messages_from_list_page(soup: BeautifulSoup) -> list[dict]:
    """Extract message metadata from a sent mail list page."""
    messages = []

    # Each sent message is a <tr class="message_row sent" id="message_row_XXXX">.
    for row in soup.select("tr.message_row"):
        msg: dict = {}

        # Message ID comes from the row's id attribute.
        row_id = row.get("id", "")
        if row_id.startswith("message_row_"):
            msg["message_id"] = row_id.replace("message_row_", "")

        # Subject cell holds the link to the individual message page.
        subject_cell = row.select_one("td.subj")
        link = subject_cell.find("a") if subject_cell else None
        if link:
            msg["subject"] = link.get_text(strip=True)
            msg["url"] = BASE_URL + link["href"]

        recipient_cell = row.select_one("td.fromto")
        if recipient_cell:
            msg["recipient"] = recipient_cell.get_text(strip=True)

        date_cell = row.select_one("td.date")
        if date_cell:
            msg["date"] = date_cell.get_text(strip=True)

        # Rows without a parseable message id are not real messages.
        if "message_id" in msg:
            messages.append(msg)

    return messages
|
|
||||||
|
|
||||||
|
|
||||||
def extract_message_content(soup: BeautifulSoup) -> dict:
    """Extract recipient, subject, and body from a single message page."""
    content: dict = {}

    # The message lives in a table inside the ThinCase div.
    thin_case = soup.select_one(".ThinCase")
    if not thin_case:
        return content
    table = thin_case.find("table")
    if not table:
        return content

    # Layout: a "To:" row, a "Subject:" row, then a row whose header
    # cell is empty and whose value cell holds the body.
    for row in table.find_all("tr", recursive=False):
        cells = row.find_all("td", recursive=False)
        if len(cells) < 2:
            continue

        header = cells[0].get_text(strip=True).lower()
        value_cell = cells[1]

        if header == "to:":
            username = value_cell.select_one(".username")
            if username:
                content["recipient"] = username.get_text(strip=True)

        elif header == "subject:":
            h3 = value_cell.find("h3")
            if h3:
                content["subject"] = h3.get_text(strip=True)

        elif header == "":
            # Body row: drop the delete <form> before extracting text.
            form = value_cell.find("form")
            if form:
                form.decompose()

            content["body"] = value_cell.get_text(separator="\n", strip=True)
            content["body_html"] = str(value_cell)
            break  # Body found; nothing further to parse.

    return content
|
|
||||||
|
|
||||||
|
|
||||||
def main(total_pages: int = 29) -> None:
    """Download new sent FlickrMail messages into the database.

    Args:
        total_pages: Number of sent-mail list pages to scan. The default
            matches the mailbox size at the time of writing; pass a larger
            value as the mailbox grows.
    """
    init_db()
    db_session = get_session()

    try:
        # Message IDs already stored, so reruns only fetch what is new.
        existing_ids = {
            r[0] for r in db_session.query(SentMessage.message_id).all()
        }
        print(f"Database has {len(existing_ids)} messages")

        http_session = create_session()

        # Pass 1: scan every list page and collect unseen message metadata.
        new_messages: list[dict] = []
        print("Fetching message list from all pages...")
        for page in range(1, total_pages + 1):
            url = SENT_MAIL_URL.format(page=page)
            print(f" Fetching page {page}/{total_pages}...")

            try:
                soup = fetch_page(http_session, url)
                for msg in extract_messages_from_list_page(soup):
                    if msg["message_id"] not in existing_ids:
                        new_messages.append(msg)
                time.sleep(1)  # Be polite to the server
            except Exception as e:
                # One broken page should not abort the whole scan.
                print(f" Error fetching page {page}: {e}")
                continue

        print(f"Found {len(new_messages)} new messages to download")

        # Pass 2: fetch each new message body and persist it.
        for i, msg in enumerate(new_messages, 1):
            msg_id = msg["message_id"]
            url = msg.get("url") or MESSAGE_URL.format(message_id=msg_id)

            print(f" [{i}/{len(new_messages)}] Downloading message {msg_id}...")

            try:
                soup = fetch_page(http_session, url)
                content = extract_message_content(soup)

                # Page content wins over list-page metadata on key clashes.
                full_msg = {**msg, **content}

                body = full_msg.get("body", "")
                flickr_url, wikipedia_url = extract_urls_from_message(body)
                normalized = normalize_flickr_url(flickr_url) if flickr_url else ""
                creator_profile = (
                    creator_profile_from_flickr_url(flickr_url) if flickr_url else ""
                )

                db_session.add(SentMessage(
                    message_id=msg_id,
                    subject=full_msg.get("subject", ""),
                    url=full_msg.get("url", ""),
                    recipient=full_msg.get("recipient", ""),
                    date=full_msg.get("date", ""),
                    body=body,
                    body_html=full_msg.get("body_html", ""),
                    flickr_url=flickr_url,
                    normalized_flickr_url=normalized,
                    wikipedia_url=wikipedia_url,
                    creator_profile_url=creator_profile,
                ))
                # Commit per message so progress survives a crash.
                db_session.commit()

                time.sleep(1)  # Be polite

            except Exception as e:
                db_session.rollback()
                print(f" Error downloading message {msg_id}: {e}")
                continue

        total = db_session.query(SentMessage).count()
        print(f"Done! {total} messages in database")

    except Exception:
        db_session.rollback()
        raise
    finally:
        db_session.close()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
|
|
@ -1,158 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Extract Flickr uploads from Wikimedia Commons contributions.
|
|
||||||
|
|
||||||
Filters contributions where the comment contains a flickr.com URL and extracts:
|
|
||||||
- pageid, revid, title, timestamp
|
|
||||||
- flickr_url: the Flickr photo URL
|
|
||||||
- creator: the photographer/author name
|
|
||||||
|
|
||||||
Links uploads to sent messages via normalized Flickr URL matching.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
from flickr_mail.database import init_db, get_session
|
|
||||||
from flickr_mail.models import Contribution, FlickrUpload, SentMessage
|
|
||||||
from flickr_mail.url_utils import normalize_flickr_url
|
|
||||||
|
|
||||||
|
|
||||||
def extract_flickr_url(comment: str) -> str | None:
|
|
||||||
"""Extract the Flickr photo URL from a comment."""
|
|
||||||
# Match URLs like https://www.flickr.com/photos/user/12345/ or http://www.flickr.com/photos/user/12345/
|
|
||||||
# Also handles [http://www.flickr.com/photos/user/12345/ title] wiki markup
|
|
||||||
patterns = [
|
|
||||||
# Plain URL (modern format)
|
|
||||||
r'(https?://(?:www\.)?flickr\.com/photos/[^/\s\]]+/\d+)/?',
|
|
||||||
# URL in wiki markup [url title]
|
|
||||||
r'\[(https?://(?:www\.)?flickr\.com/photos/[^/\s\]]+/\d+)/?[^\]]*\]',
|
|
||||||
]
|
|
||||||
|
|
||||||
for pattern in patterns:
|
|
||||||
match = re.search(pattern, comment)
|
|
||||||
if match:
|
|
||||||
return match.group(1)
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def extract_creator(comment: str) -> str | None:
|
|
||||||
"""Extract the creator/author name from a comment."""
|
|
||||||
# Modern format: "Uploaded a work by {creator} from https://..."
|
|
||||||
modern_match = re.search(r'Uploaded a work by (.+?) from https?://', comment)
|
|
||||||
if modern_match:
|
|
||||||
return modern_match.group(1).strip()
|
|
||||||
|
|
||||||
# Old {{Information}} format: |Author=[http://www.flickr.com/people/... AuthorName] or |Author=[http://... AuthorName] from Location
|
|
||||||
# The author name comes after the URL, before ] or "from"
|
|
||||||
author_match = re.search(r'\|Author=\[https?://[^\s\]]+ ([^\]]+)\]', comment)
|
|
||||||
if author_match:
|
|
||||||
author = author_match.group(1).strip()
|
|
||||||
# Remove trailing location like "from Toronto, Canada"
|
|
||||||
author = re.sub(r'\s+from\s+.+$', '', author)
|
|
||||||
return author
|
|
||||||
|
|
||||||
# Handle truncated comments where Author field is cut off
|
|
||||||
# Pattern: |Author=[http://...flickr.com/people/... AuthorName (may be incomplete)
|
|
||||||
truncated_match = re.search(r'\|Author=\[https?://[^\s\]]+ ([^\]\|]+)$', comment)
|
|
||||||
if truncated_match:
|
|
||||||
author = truncated_match.group(1).strip()
|
|
||||||
if author:
|
|
||||||
return author
|
|
||||||
|
|
||||||
# Sometimes Author field is just plain text without URL
|
|
||||||
author_plain = re.search(r'\|Author=([^\|\}\[\]]+?)(?:\r?\n|\|)', comment)
|
|
||||||
if author_plain:
|
|
||||||
author = author_plain.group(1).strip()
|
|
||||||
# Skip if it looks like a wiki user link
|
|
||||||
if not author.startswith('[[User:') and author:
|
|
||||||
return author
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
    """Process contributions and extract Flickr uploads."""
    init_db()
    session = get_session()

    try:
        # Revids already extracted, so reruns do not create duplicates.
        existing_revids = {
            r[0] for r in session.query(FlickrUpload.revid).all()
        }

        # Index original (non-reply) sent messages by normalized Flickr URL
        # so each upload can be linked to the message that requested it.
        sent_messages = (
            session.query(SentMessage)
            .filter(SentMessage.normalized_flickr_url != "")
            .filter(~SentMessage.subject.startswith("Re:"))
            .all()
        )
        url_to_message = {m.normalized_flickr_url: m for m in sent_messages}
        print(f"Sent message index: {len(url_to_message)} entries")

        candidates = (
            session.query(Contribution)
            .filter(Contribution.comment.ilike("%flickr.com%"))
            .all()
        )
        print(f"Found {len(candidates)} contributions mentioning flickr.com")

        new_count = 0
        for contrib in candidates:
            if contrib.revid in existing_revids:
                continue

            comment = contrib.comment or ""
            flickr_url = extract_flickr_url(comment)
            if not flickr_url:
                continue

            creator = extract_creator(comment)
            normalized = normalize_flickr_url(flickr_url)

            # Matching sent message, if any, supplies the FK and extra URLs.
            msg = url_to_message.get(normalized) if normalized else None

            session.add(FlickrUpload(
                pageid=contrib.pageid,
                revid=contrib.revid,
                title=contrib.title,
                timestamp=contrib.timestamp,
                flickr_url=flickr_url,
                normalized_flickr_url=normalized,
                creator=creator,
                wikipedia_url=msg.wikipedia_url if msg else "",
                creator_profile_url=msg.creator_profile_url if msg else "",
                sent_message_id=msg.message_id if msg else None,
            ))
            new_count += 1

        session.commit()

        total = session.query(FlickrUpload).count()
        linked = session.query(FlickrUpload).filter(
            FlickrUpload.sent_message_id.isnot(None)
        ).count()

        print(f"Extracted {new_count} new Flickr uploads")
        print(f"Total: {total} uploads, {linked} linked to sent messages")

        # Summary of creator extraction coverage.
        with_creator = session.query(FlickrUpload).filter(
            FlickrUpload.creator.isnot(None)
        ).count()
        print(f" - {with_creator} with creator identified")
        print(f" - {total - with_creator} without creator")

    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
||||||
|
|
@ -1,31 +0,0 @@
|
||||||
"""Database engine and session factory for flickr-mail."""
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from sqlalchemy import create_engine, event
|
|
||||||
from sqlalchemy.orm import Session, sessionmaker
|
|
||||||
|
|
||||||
from flickr_mail.models import Base
|
|
||||||
|
|
||||||
DB_PATH = Path(__file__).parent.parent / "flickr_mail.db"
|
|
||||||
|
|
||||||
engine = create_engine(f"sqlite:///{DB_PATH}")
|
|
||||||
SessionLocal = sessionmaker(bind=engine)
|
|
||||||
|
|
||||||
|
|
||||||
@event.listens_for(engine, "connect")
|
|
||||||
def set_sqlite_pragma(dbapi_connection, connection_record):
|
|
||||||
"""Enable WAL mode for concurrent read/write access."""
|
|
||||||
cursor = dbapi_connection.cursor()
|
|
||||||
cursor.execute("PRAGMA journal_mode=WAL")
|
|
||||||
cursor.close()
|
|
||||||
|
|
||||||
|
|
||||||
def init_db() -> None:
|
|
||||||
"""Create all tables."""
|
|
||||||
Base.metadata.create_all(engine)
|
|
||||||
|
|
||||||
|
|
||||||
def get_session() -> Session:
|
|
||||||
"""Create a new database session."""
|
|
||||||
return SessionLocal()
|
|
||||||
|
|
@ -1,93 +0,0 @@
|
||||||
"""SQLAlchemy models for flickr-mail."""
|
|
||||||
|
|
||||||
from sqlalchemy import ForeignKey, Index, Text
|
|
||||||
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
|
|
||||||
|
|
||||||
|
|
||||||
class Base(DeclarativeBase):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class Contribution(Base):
|
|
||||||
__tablename__ = "contributions"
|
|
||||||
|
|
||||||
id: Mapped[int] = mapped_column(primary_key=True)
|
|
||||||
userid: Mapped[int | None]
|
|
||||||
user: Mapped[str | None]
|
|
||||||
pageid: Mapped[int | None]
|
|
||||||
revid: Mapped[int | None] = mapped_column(unique=True)
|
|
||||||
parentid: Mapped[int | None]
|
|
||||||
ns: Mapped[int | None]
|
|
||||||
title: Mapped[str | None]
|
|
||||||
timestamp: Mapped[str | None]
|
|
||||||
minor: Mapped[str | None]
|
|
||||||
top: Mapped[str | None]
|
|
||||||
comment: Mapped[str | None] = mapped_column(Text)
|
|
||||||
size: Mapped[int | None]
|
|
||||||
sizediff: Mapped[int | None]
|
|
||||||
tags: Mapped[str | None] = mapped_column(Text) # JSON array stored as text
|
|
||||||
|
|
||||||
__table_args__ = (
|
|
||||||
Index("ix_contributions_timestamp", "timestamp"),
|
|
||||||
Index("ix_contributions_pageid", "pageid"),
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class SentMessage(Base):
|
|
||||||
__tablename__ = "sent_messages"
|
|
||||||
|
|
||||||
message_id: Mapped[str] = mapped_column(primary_key=True)
|
|
||||||
subject: Mapped[str | None]
|
|
||||||
url: Mapped[str | None]
|
|
||||||
recipient: Mapped[str | None]
|
|
||||||
date: Mapped[str | None]
|
|
||||||
body: Mapped[str | None] = mapped_column(Text)
|
|
||||||
body_html: Mapped[str | None] = mapped_column(Text)
|
|
||||||
flickr_url: Mapped[str | None]
|
|
||||||
normalized_flickr_url: Mapped[str | None]
|
|
||||||
wikipedia_url: Mapped[str | None]
|
|
||||||
creator_profile_url: Mapped[str | None]
|
|
||||||
|
|
||||||
flickr_uploads: Mapped[list["FlickrUpload"]] = relationship(
|
|
||||||
back_populates="sent_message"
|
|
||||||
)
|
|
||||||
|
|
||||||
__table_args__ = (
|
|
||||||
Index("ix_sent_messages_recipient", "recipient"),
|
|
||||||
Index("ix_sent_messages_normalized_flickr_url", "normalized_flickr_url"),
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class FlickrUpload(Base):
|
|
||||||
__tablename__ = "flickr_uploads"
|
|
||||||
|
|
||||||
id: Mapped[int] = mapped_column(primary_key=True)
|
|
||||||
pageid: Mapped[int | None]
|
|
||||||
revid: Mapped[int | None]
|
|
||||||
title: Mapped[str | None]
|
|
||||||
timestamp: Mapped[str | None]
|
|
||||||
flickr_url: Mapped[str | None]
|
|
||||||
normalized_flickr_url: Mapped[str | None]
|
|
||||||
creator: Mapped[str | None]
|
|
||||||
wikipedia_url: Mapped[str | None]
|
|
||||||
creator_profile_url: Mapped[str | None]
|
|
||||||
sent_message_id: Mapped[str | None] = mapped_column(
|
|
||||||
ForeignKey("sent_messages.message_id")
|
|
||||||
)
|
|
||||||
|
|
||||||
sent_message: Mapped[SentMessage | None] = relationship(
|
|
||||||
back_populates="flickr_uploads"
|
|
||||||
)
|
|
||||||
|
|
||||||
__table_args__ = (
|
|
||||||
Index("ix_flickr_uploads_normalized_flickr_url", "normalized_flickr_url"),
|
|
||||||
Index("ix_flickr_uploads_timestamp", "timestamp"),
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class ThumbnailCache(Base):
|
|
||||||
__tablename__ = "thumbnail_cache"
|
|
||||||
|
|
||||||
title: Mapped[str] = mapped_column(primary_key=True)
|
|
||||||
thumb_url: Mapped[str | None]
|
|
||||||
fetched_at: Mapped[int | None] # Unix timestamp
|
|
||||||
|
|
@ -1,52 +0,0 @@
|
||||||
"""Shared URL utility functions for flickr-mail."""
|
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_flickr_url(url: str) -> str:
|
|
||||||
"""Normalize a Flickr photo URL for comparison."""
|
|
||||||
# Remove protocol
|
|
||||||
url = url.replace("https://", "").replace("http://", "")
|
|
||||||
# Remove www.
|
|
||||||
url = url.replace("www.", "")
|
|
||||||
# Remove trailing slash
|
|
||||||
url = url.rstrip("/")
|
|
||||||
# Ensure it starts with flickr.com
|
|
||||||
if not url.startswith("flickr.com"):
|
|
||||||
return ""
|
|
||||||
return url
|
|
||||||
|
|
||||||
|
|
||||||
def extract_urls_from_message(body: str) -> tuple[str, str]:
|
|
||||||
"""Extract flickr URL and Wikipedia URL from message body."""
|
|
||||||
|
|
||||||
flickr_url = ""
|
|
||||||
wikipedia_url = ""
|
|
||||||
|
|
||||||
# Find flickr photo URLs
|
|
||||||
flickr_pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/[^/\s]+/\d+"
|
|
||||||
flickr_matches = re.findall(flickr_pattern, body)
|
|
||||||
if flickr_matches:
|
|
||||||
flickr_url = flickr_matches[0]
|
|
||||||
if not flickr_url.startswith("http"):
|
|
||||||
flickr_url = "https://" + flickr_url
|
|
||||||
|
|
||||||
# Find Wikipedia URLs
|
|
||||||
wiki_pattern = r"(?:https?://)?(?:www\.)?en\.wikipedia\.org/wiki/[^\s<\])]+"
|
|
||||||
wiki_matches = re.findall(wiki_pattern, body)
|
|
||||||
if wiki_matches:
|
|
||||||
wikipedia_url = wiki_matches[0]
|
|
||||||
if not wikipedia_url.startswith("http"):
|
|
||||||
wikipedia_url = "https://" + wikipedia_url
|
|
||||||
|
|
||||||
return flickr_url, wikipedia_url
|
|
||||||
|
|
||||||
|
|
||||||
def creator_profile_from_flickr_url(flickr_url: str) -> str:
|
|
||||||
"""Extract creator profile URL from a flickr photo URL."""
|
|
||||||
parts = flickr_url.split("/")
|
|
||||||
for i, part in enumerate(parts):
|
|
||||||
if part == "photos" and i + 1 < len(parts):
|
|
||||||
username = parts[i + 1]
|
|
||||||
return f"https://www.flickr.com/photos/{username}"
|
|
||||||
return ""
|
|
||||||
307
main.py
307
main.py
|
|
@ -9,17 +9,14 @@ import sys
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
import typing
|
import typing
|
||||||
|
from pathlib import Path
|
||||||
from urllib.parse import quote, unquote
|
from urllib.parse import quote, unquote
|
||||||
|
|
||||||
import flask
|
import flask
|
||||||
import requests
|
import requests
|
||||||
import werkzeug
|
import werkzeug
|
||||||
from sqlalchemy import func
|
|
||||||
from werkzeug.debug.tbtools import DebugTraceback
|
from werkzeug.debug.tbtools import DebugTraceback
|
||||||
|
|
||||||
from flickr_mail.database import get_session
|
|
||||||
from flickr_mail.models import FlickrUpload, SentMessage, ThumbnailCache
|
|
||||||
from flickr_mail.url_utils import extract_urls_from_message, normalize_flickr_url
|
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
@ -29,6 +26,18 @@ app.debug = False
|
||||||
|
|
||||||
enwiki = "en.wikipedia.org/wiki/"
|
enwiki = "en.wikipedia.org/wiki/"
|
||||||
|
|
||||||
|
# Path to Commons contributions data and sent mail
|
||||||
|
COMMONS_UPLOADS_FILE = (
|
||||||
|
Path(__file__).parent / "commons_contributions" / "flickr_uploads.json"
|
||||||
|
)
|
||||||
|
COMMONS_CACHE_FILE = (
|
||||||
|
Path(__file__).parent / "commons_contributions" / "thumbnail_cache.json"
|
||||||
|
)
|
||||||
|
SENT_MAIL_DIR = Path(__file__).parent / "sent_mail" / "messages"
|
||||||
|
SENT_MAIL_INDEX_FILE = Path(__file__).parent / "sent_mail" / "messages_index.json"
|
||||||
|
SENT_MAIL_INDEX_CACHE = (
|
||||||
|
Path(__file__).parent / "commons_contributions" / "sent_mail_index.json"
|
||||||
|
)
|
||||||
COMMONS_CACHE_MAX_AGE = 86400 * 7 # Cache for 7 days
|
COMMONS_CACHE_MAX_AGE = 86400 * 7 # Cache for 7 days
|
||||||
RECENT_UPLOADS_COUNT = 24
|
RECENT_UPLOADS_COUNT = 24
|
||||||
|
|
||||||
|
|
@ -156,6 +165,132 @@ class CommonsUpload:
|
||||||
return "Wikidata item" if self.is_wikidata_item else "Wikipedia article"
|
return "Wikidata item" if self.is_wikidata_item else "Wikipedia article"
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_flickr_url(url: str) -> str:
|
||||||
|
"""Normalize a Flickr photo URL for comparison."""
|
||||||
|
# Remove protocol
|
||||||
|
url = url.replace("https://", "").replace("http://", "")
|
||||||
|
# Remove www.
|
||||||
|
url = url.replace("www.", "")
|
||||||
|
# Remove trailing slash
|
||||||
|
url = url.rstrip("/")
|
||||||
|
# Ensure it starts with flickr.com
|
||||||
|
if not url.startswith("flickr.com"):
|
||||||
|
return ""
|
||||||
|
return url
|
||||||
|
|
||||||
|
|
||||||
|
def extract_urls_from_message(body: str) -> tuple[str, str]:
|
||||||
|
"""Extract flickr URL and Wikipedia URL from message body."""
|
||||||
|
|
||||||
|
flickr_url = ""
|
||||||
|
wikipedia_url = ""
|
||||||
|
|
||||||
|
# Find flickr photo URLs
|
||||||
|
flickr_pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/[^/\s]+/\d+"
|
||||||
|
flickr_matches = re.findall(flickr_pattern, body)
|
||||||
|
if flickr_matches:
|
||||||
|
flickr_url = flickr_matches[0]
|
||||||
|
if not flickr_url.startswith("http"):
|
||||||
|
flickr_url = "https://" + flickr_url
|
||||||
|
|
||||||
|
# Find Wikipedia URLs
|
||||||
|
wiki_pattern = r"(?:https?://)?(?:www\.)?en\.wikipedia\.org/wiki/[^\s<\])]+"
|
||||||
|
wiki_matches = re.findall(wiki_pattern, body)
|
||||||
|
if wiki_matches:
|
||||||
|
wikipedia_url = wiki_matches[0]
|
||||||
|
if not wikipedia_url.startswith("http"):
|
||||||
|
wikipedia_url = "https://" + wikipedia_url
|
||||||
|
|
||||||
|
return flickr_url, wikipedia_url
|
||||||
|
|
||||||
|
|
||||||
|
def build_sent_mail_index() -> dict[str, dict[str, str]]:
|
||||||
|
"""Build an index of sent mail: normalized_flickr_url -> {wikipedia_url, recipient}."""
|
||||||
|
if not SENT_MAIL_DIR.exists():
|
||||||
|
return {}
|
||||||
|
|
||||||
|
# Check if we have a cached index
|
||||||
|
if SENT_MAIL_INDEX_CACHE.exists():
|
||||||
|
try:
|
||||||
|
with open(SENT_MAIL_INDEX_CACHE) as f:
|
||||||
|
cache = json.load(f)
|
||||||
|
# Check if cache is still valid (compare file count)
|
||||||
|
json_files = list(SENT_MAIL_DIR.glob("*.json"))
|
||||||
|
if cache.get("file_count") == len(json_files):
|
||||||
|
return cache.get("index", {})
|
||||||
|
except (json.JSONDecodeError, OSError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
index: dict[str, dict[str, str]] = {}
|
||||||
|
json_files = list(SENT_MAIL_DIR.glob("*.json"))
|
||||||
|
|
||||||
|
for json_file in json_files:
|
||||||
|
try:
|
||||||
|
with open(json_file) as f:
|
||||||
|
message = json.load(f)
|
||||||
|
except (json.JSONDecodeError, OSError):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Skip replies - we want original requests
|
||||||
|
subject = message.get("subject", "")
|
||||||
|
if subject.startswith("Re:"):
|
||||||
|
continue
|
||||||
|
|
||||||
|
body = message.get("body", "")
|
||||||
|
flickr_url, wikipedia_url = extract_urls_from_message(body)
|
||||||
|
|
||||||
|
if not flickr_url:
|
||||||
|
continue
|
||||||
|
|
||||||
|
normalized = normalize_flickr_url(flickr_url)
|
||||||
|
if not normalized:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Extract creator profile URL from flickr URL
|
||||||
|
# flickr.com/photos/username/12345 -> flickr.com/photos/username
|
||||||
|
parts = flickr_url.split("/")
|
||||||
|
creator_profile = ""
|
||||||
|
for i, part in enumerate(parts):
|
||||||
|
if part == "photos" and i + 1 < len(parts):
|
||||||
|
username = parts[i + 1]
|
||||||
|
creator_profile = f"https://www.flickr.com/photos/{username}"
|
||||||
|
break
|
||||||
|
|
||||||
|
index[normalized] = {
|
||||||
|
"wikipedia_url": wikipedia_url,
|
||||||
|
"creator_profile_url": creator_profile,
|
||||||
|
"recipient": message.get("recipient", ""),
|
||||||
|
}
|
||||||
|
|
||||||
|
# Cache the index
|
||||||
|
try:
|
||||||
|
with open(SENT_MAIL_INDEX_CACHE, "w") as f:
|
||||||
|
json.dump({"file_count": len(json_files), "index": index}, f)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return index
|
||||||
|
|
||||||
|
|
||||||
|
def load_commons_thumbnail_cache() -> dict[str, typing.Any]:
|
||||||
|
"""Load the thumbnail cache from disk."""
|
||||||
|
if not COMMONS_CACHE_FILE.exists():
|
||||||
|
return {"timestamp": 0, "thumbnails": {}}
|
||||||
|
try:
|
||||||
|
with open(COMMONS_CACHE_FILE) as f:
|
||||||
|
return typing.cast(dict[str, typing.Any], json.load(f))
|
||||||
|
except (json.JSONDecodeError, OSError):
|
||||||
|
return {"timestamp": 0, "thumbnails": {}}
|
||||||
|
|
||||||
|
|
||||||
|
def save_commons_thumbnail_cache(cache: dict[str, typing.Any]) -> None:
|
||||||
|
"""Save the thumbnail cache to disk."""
|
||||||
|
try:
|
||||||
|
with open(COMMONS_CACHE_FILE, "w") as f:
|
||||||
|
json.dump(cache, f)
|
||||||
|
except OSError:
|
||||||
|
pass # Ignore cache write errors
|
||||||
|
|
||||||
|
|
||||||
def fetch_commons_thumbnails(titles: list[str]) -> dict[str, str]:
|
def fetch_commons_thumbnails(titles: list[str]) -> dict[str, str]:
|
||||||
"""Fetch thumbnail URLs from Commons API for the given file titles."""
|
"""Fetch thumbnail URLs from Commons API for the given file titles."""
|
||||||
|
|
@ -205,72 +340,79 @@ def get_recent_commons_uploads() -> tuple[list[CommonsUpload], int]:
|
||||||
Returns a tuple of (uploads_list, total_count) where total_count is the total number
|
Returns a tuple of (uploads_list, total_count) where total_count is the total number
|
||||||
of uploads obtained via Flickr mail (not just the ones returned).
|
of uploads obtained via Flickr mail (not just the ones returned).
|
||||||
"""
|
"""
|
||||||
session = get_session()
|
if not COMMONS_UPLOADS_FILE.exists():
|
||||||
|
return [], 0
|
||||||
|
|
||||||
try:
|
try:
|
||||||
query = (
|
with open(COMMONS_UPLOADS_FILE) as f:
|
||||||
session.query(FlickrUpload, SentMessage)
|
all_uploads = json.load(f)
|
||||||
.join(SentMessage)
|
except (json.JSONDecodeError, OSError):
|
||||||
.order_by(FlickrUpload.timestamp.desc())
|
return [], 0
|
||||||
|
|
||||||
|
# Build sent mail index
|
||||||
|
sent_mail_index = build_sent_mail_index()
|
||||||
|
|
||||||
|
# Filter uploads to only those with matching sent mail
|
||||||
|
# Count all matches, but only keep RECENT_UPLOADS_COUNT for display
|
||||||
|
uploads_with_mail: list[dict[str, typing.Any]] = []
|
||||||
|
total_matched = 0
|
||||||
|
for upload in all_uploads:
|
||||||
|
flickr_url = upload.get("flickr_url", "")
|
||||||
|
normalized = normalize_flickr_url(flickr_url)
|
||||||
|
if normalized and normalized in sent_mail_index:
|
||||||
|
total_matched += 1
|
||||||
|
if len(uploads_with_mail) < RECENT_UPLOADS_COUNT:
|
||||||
|
upload["_mail_info"] = sent_mail_index[normalized]
|
||||||
|
uploads_with_mail.append(upload)
|
||||||
|
|
||||||
|
if not uploads_with_mail:
|
||||||
|
return [], 0
|
||||||
|
|
||||||
|
# Load cache and check if it's still valid
|
||||||
|
cache = load_commons_thumbnail_cache()
|
||||||
|
cache_age = time.time() - cache.get("timestamp", 0)
|
||||||
|
cached_thumbs = cache.get("thumbnails", {})
|
||||||
|
|
||||||
|
# Find which titles need fetching
|
||||||
|
titles = [u["title"] for u in uploads_with_mail]
|
||||||
|
titles_to_fetch = [t for t in titles if t not in cached_thumbs]
|
||||||
|
|
||||||
|
# Fetch missing thumbnails or refresh if cache is old
|
||||||
|
if titles_to_fetch or cache_age > COMMONS_CACHE_MAX_AGE:
|
||||||
|
new_thumbs = fetch_commons_thumbnails(
|
||||||
|
titles if cache_age > COMMONS_CACHE_MAX_AGE else titles_to_fetch
|
||||||
)
|
)
|
||||||
total_matched = query.count()
|
cached_thumbs.update(new_thumbs)
|
||||||
if total_matched == 0:
|
cache = {"timestamp": time.time(), "thumbnails": cached_thumbs}
|
||||||
return [], 0
|
save_commons_thumbnail_cache(cache)
|
||||||
|
|
||||||
recent = query.limit(RECENT_UPLOADS_COUNT).all()
|
# Build the result list
|
||||||
|
result: list[CommonsUpload] = []
|
||||||
|
for upload in uploads_with_mail:
|
||||||
|
title = upload["title"]
|
||||||
|
thumb_url = cached_thumbs.get(title, "")
|
||||||
|
if not thumb_url:
|
||||||
|
continue
|
||||||
|
|
||||||
# Get thumbnails from cache
|
mail_info = upload.get("_mail_info", {})
|
||||||
titles = [upload.title for upload, msg in recent]
|
|
||||||
now = int(time.time())
|
|
||||||
cached = {
|
|
||||||
tc.title: tc
|
|
||||||
for tc in session.query(ThumbnailCache)
|
|
||||||
.filter(ThumbnailCache.title.in_(titles))
|
|
||||||
.all()
|
|
||||||
}
|
|
||||||
|
|
||||||
# Find titles needing fetch (missing or expired)
|
# Convert title to Commons URL
|
||||||
titles_to_fetch = [
|
commons_url = f"https://commons.wikimedia.org/wiki/{title.replace(' ', '_')}"
|
||||||
t for t in titles
|
|
||||||
if t not in cached or (now - (cached[t].fetched_at or 0)) > COMMONS_CACHE_MAX_AGE
|
|
||||||
]
|
|
||||||
|
|
||||||
if titles_to_fetch:
|
result.append(
|
||||||
new_thumbs = fetch_commons_thumbnails(titles_to_fetch)
|
CommonsUpload(
|
||||||
for title, thumb_url in new_thumbs.items():
|
title=title.replace("File:", "").rsplit(".", 1)[0],
|
||||||
existing = cached.get(title)
|
thumb_url=thumb_url,
|
||||||
if existing:
|
commons_url=commons_url,
|
||||||
existing.thumb_url = thumb_url
|
flickr_url=upload.get("flickr_url", ""),
|
||||||
existing.fetched_at = now
|
creator=upload.get("creator") or "Unknown",
|
||||||
else:
|
timestamp=upload.get("timestamp", "")[:10],
|
||||||
tc = ThumbnailCache(title=title, thumb_url=thumb_url, fetched_at=now)
|
wikipedia_url=mail_info.get("wikipedia_url", ""),
|
||||||
session.add(tc)
|
creator_profile_url=mail_info.get("creator_profile_url", ""),
|
||||||
cached[title] = tc
|
|
||||||
session.commit()
|
|
||||||
|
|
||||||
result: list[CommonsUpload] = []
|
|
||||||
for upload, msg in recent:
|
|
||||||
thumb_url = cached[upload.title].thumb_url if upload.title in cached else ""
|
|
||||||
if not thumb_url:
|
|
||||||
continue
|
|
||||||
|
|
||||||
commons_url = f"https://commons.wikimedia.org/wiki/{upload.title.replace(' ', '_')}"
|
|
||||||
|
|
||||||
result.append(
|
|
||||||
CommonsUpload(
|
|
||||||
title=upload.title.replace("File:", "").rsplit(".", 1)[0],
|
|
||||||
thumb_url=thumb_url,
|
|
||||||
commons_url=commons_url,
|
|
||||||
flickr_url=upload.flickr_url or "",
|
|
||||||
creator=upload.creator or "Unknown",
|
|
||||||
timestamp=(upload.timestamp or "")[:10],
|
|
||||||
wikipedia_url=upload.wikipedia_url or "",
|
|
||||||
creator_profile_url=upload.creator_profile_url or "",
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
|
||||||
return result, total_matched
|
return result, total_matched
|
||||||
finally:
|
|
||||||
session.close()
|
|
||||||
|
|
||||||
|
|
||||||
def get_previous_messages(flickr_user: str, flickr_username: str) -> list[dict]:
|
def get_previous_messages(flickr_user: str, flickr_username: str) -> list[dict]:
|
||||||
|
|
@ -279,33 +421,26 @@ def get_previous_messages(flickr_user: str, flickr_username: str) -> list[dict]:
|
||||||
Checks both the display name (flickr_user) and username (flickr_username)
|
Checks both the display name (flickr_user) and username (flickr_username)
|
||||||
against the recipient field in the messages index.
|
against the recipient field in the messages index.
|
||||||
"""
|
"""
|
||||||
names = set()
|
if not SENT_MAIL_INDEX_FILE.exists():
|
||||||
if flickr_user:
|
|
||||||
names.add(flickr_user.lower())
|
|
||||||
if flickr_username:
|
|
||||||
names.add(flickr_username.lower())
|
|
||||||
if not names:
|
|
||||||
return []
|
return []
|
||||||
|
|
||||||
session = get_session()
|
|
||||||
try:
|
try:
|
||||||
messages = (
|
with open(SENT_MAIL_INDEX_FILE) as f:
|
||||||
session.query(SentMessage)
|
messages = json.load(f)
|
||||||
.filter(func.lower(SentMessage.recipient).in_(names))
|
except (json.JSONDecodeError, OSError):
|
||||||
.all()
|
return []
|
||||||
)
|
|
||||||
return [
|
# Normalize for case-insensitive comparison
|
||||||
{
|
flickr_user_lower = flickr_user.lower() if flickr_user else ""
|
||||||
"message_id": m.message_id,
|
flickr_username_lower = flickr_username.lower() if flickr_username else ""
|
||||||
"subject": m.subject,
|
|
||||||
"url": m.url,
|
matches = []
|
||||||
"recipient": m.recipient,
|
for msg in messages:
|
||||||
"date": m.date,
|
recipient = msg.get("recipient", "").lower()
|
||||||
}
|
if recipient and (recipient == flickr_user_lower or recipient == flickr_username_lower):
|
||||||
for m in messages
|
matches.append(msg)
|
||||||
]
|
|
||||||
finally:
|
return matches
|
||||||
session.close()
|
|
||||||
|
|
||||||
|
|
||||||
def parse_category_input(category_input: str) -> str | None:
|
def parse_category_input(category_input: str) -> str | None:
|
||||||
|
|
|
||||||
|
|
@ -1,233 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
"""One-time migration from JSON files to SQLite database."""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import time
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from flickr_mail.database import init_db, get_session
|
|
||||||
from flickr_mail.models import Contribution, FlickrUpload, SentMessage, ThumbnailCache
|
|
||||||
from flickr_mail.url_utils import (
|
|
||||||
creator_profile_from_flickr_url,
|
|
||||||
extract_urls_from_message,
|
|
||||||
normalize_flickr_url,
|
|
||||||
)
|
|
||||||
|
|
||||||
COMMONS_DIR = Path(__file__).parent / "commons_contributions"
|
|
||||||
SENT_MAIL_DIR = Path(__file__).parent / "sent_mail" / "messages"
|
|
||||||
SENT_MAIL_INDEX = Path(__file__).parent / "sent_mail" / "messages_index.json"
|
|
||||||
CONTRIBUTIONS_FILE = COMMONS_DIR / "contributions.json"
|
|
||||||
FLICKR_UPLOADS_FILE = COMMONS_DIR / "flickr_uploads.json"
|
|
||||||
THUMBNAIL_CACHE_FILE = COMMONS_DIR / "thumbnail_cache.json"
|
|
||||||
|
|
||||||
|
|
||||||
def migrate_contributions(session) -> int:
|
|
||||||
"""Migrate contributions.json to contributions table."""
|
|
||||||
if not CONTRIBUTIONS_FILE.exists():
|
|
||||||
print("No contributions.json found, skipping")
|
|
||||||
return 0
|
|
||||||
|
|
||||||
with open(CONTRIBUTIONS_FILE) as f:
|
|
||||||
data = json.load(f)
|
|
||||||
|
|
||||||
contributions = data.get("contributions", [])
|
|
||||||
print(f"Migrating {len(contributions)} contributions...")
|
|
||||||
|
|
||||||
for c in contributions:
|
|
||||||
session.add(Contribution(
|
|
||||||
userid=c.get("userid"),
|
|
||||||
user=c.get("user"),
|
|
||||||
pageid=c.get("pageid"),
|
|
||||||
revid=c.get("revid"),
|
|
||||||
parentid=c.get("parentid"),
|
|
||||||
ns=c.get("ns"),
|
|
||||||
title=c.get("title"),
|
|
||||||
timestamp=c.get("timestamp"),
|
|
||||||
minor=c.get("minor"),
|
|
||||||
top=c.get("top"),
|
|
||||||
comment=c.get("comment"),
|
|
||||||
size=c.get("size"),
|
|
||||||
sizediff=c.get("sizediff"),
|
|
||||||
tags=json.dumps(c.get("tags", [])),
|
|
||||||
))
|
|
||||||
|
|
||||||
session.flush()
|
|
||||||
count = session.query(Contribution).count()
|
|
||||||
print(f" -> {count} contributions migrated")
|
|
||||||
return count
|
|
||||||
|
|
||||||
|
|
||||||
def migrate_sent_messages(session) -> dict[str, str]:
|
|
||||||
"""Migrate sent messages to sent_messages table.
|
|
||||||
|
|
||||||
Returns a dict of normalized_flickr_url -> message_id for FK linking.
|
|
||||||
"""
|
|
||||||
if not SENT_MAIL_INDEX.exists():
|
|
||||||
print("No messages_index.json found, skipping")
|
|
||||||
return {}
|
|
||||||
|
|
||||||
with open(SENT_MAIL_INDEX) as f:
|
|
||||||
index = json.load(f)
|
|
||||||
|
|
||||||
print(f"Migrating {len(index)} sent messages...")
|
|
||||||
|
|
||||||
url_to_message_id: dict[str, str] = {}
|
|
||||||
count = 0
|
|
||||||
|
|
||||||
for msg_meta in index:
|
|
||||||
msg_id = msg_meta.get("message_id", "")
|
|
||||||
if not msg_id:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Load the full message from individual file
|
|
||||||
msg_file = SENT_MAIL_DIR / f"{msg_id}.json"
|
|
||||||
if msg_file.exists():
|
|
||||||
with open(msg_file) as f:
|
|
||||||
msg = json.load(f)
|
|
||||||
else:
|
|
||||||
msg = msg_meta
|
|
||||||
|
|
||||||
body = msg.get("body", "")
|
|
||||||
subject = msg.get("subject", "")
|
|
||||||
|
|
||||||
# Extract URLs from body
|
|
||||||
flickr_url, wikipedia_url = extract_urls_from_message(body)
|
|
||||||
normalized = normalize_flickr_url(flickr_url) if flickr_url else ""
|
|
||||||
|
|
||||||
# Extract creator profile URL
|
|
||||||
creator_profile_url = creator_profile_from_flickr_url(flickr_url) if flickr_url else ""
|
|
||||||
|
|
||||||
session.add(SentMessage(
|
|
||||||
message_id=msg_id,
|
|
||||||
subject=msg.get("subject", ""),
|
|
||||||
url=msg.get("url", ""),
|
|
||||||
recipient=msg.get("recipient", ""),
|
|
||||||
date=msg.get("date", ""),
|
|
||||||
body=body,
|
|
||||||
body_html=msg.get("body_html", ""),
|
|
||||||
flickr_url=flickr_url,
|
|
||||||
normalized_flickr_url=normalized,
|
|
||||||
wikipedia_url=wikipedia_url,
|
|
||||||
creator_profile_url=creator_profile_url,
|
|
||||||
))
|
|
||||||
|
|
||||||
# Build URL -> message_id map for FK linking (skip replies)
|
|
||||||
if normalized and not subject.startswith("Re:"):
|
|
||||||
url_to_message_id[normalized] = msg_id
|
|
||||||
|
|
||||||
count += 1
|
|
||||||
|
|
||||||
session.flush()
|
|
||||||
actual = session.query(SentMessage).count()
|
|
||||||
print(f" -> {actual} sent messages migrated")
|
|
||||||
print(f" -> {len(url_to_message_id)} unique flickr URLs indexed for FK linking")
|
|
||||||
return url_to_message_id
|
|
||||||
|
|
||||||
|
|
||||||
def migrate_flickr_uploads(session, url_to_message_id: dict[str, str]) -> int:
|
|
||||||
"""Migrate flickr_uploads.json to flickr_uploads table with FK linking."""
|
|
||||||
if not FLICKR_UPLOADS_FILE.exists():
|
|
||||||
print("No flickr_uploads.json found, skipping")
|
|
||||||
return 0
|
|
||||||
|
|
||||||
with open(FLICKR_UPLOADS_FILE) as f:
|
|
||||||
uploads = json.load(f)
|
|
||||||
|
|
||||||
print(f"Migrating {len(uploads)} flickr uploads...")
|
|
||||||
|
|
||||||
linked = 0
|
|
||||||
for u in uploads:
|
|
||||||
flickr_url = u.get("flickr_url", "")
|
|
||||||
normalized = normalize_flickr_url(flickr_url)
|
|
||||||
|
|
||||||
# Look up sent message FK
|
|
||||||
sent_message_id = url_to_message_id.get(normalized) if normalized else None
|
|
||||||
if sent_message_id:
|
|
||||||
linked += 1
|
|
||||||
|
|
||||||
# Get wikipedia_url and creator_profile_url from the linked message
|
|
||||||
wikipedia_url = ""
|
|
||||||
creator_profile_url = ""
|
|
||||||
if sent_message_id:
|
|
||||||
msg = session.get(SentMessage, sent_message_id)
|
|
||||||
if msg:
|
|
||||||
wikipedia_url = msg.wikipedia_url or ""
|
|
||||||
creator_profile_url = msg.creator_profile_url or ""
|
|
||||||
|
|
||||||
session.add(FlickrUpload(
|
|
||||||
pageid=u.get("pageid"),
|
|
||||||
revid=u.get("revid"),
|
|
||||||
title=u.get("title"),
|
|
||||||
timestamp=u.get("timestamp"),
|
|
||||||
flickr_url=flickr_url,
|
|
||||||
normalized_flickr_url=normalized,
|
|
||||||
creator=u.get("creator"),
|
|
||||||
wikipedia_url=wikipedia_url,
|
|
||||||
creator_profile_url=creator_profile_url,
|
|
||||||
sent_message_id=sent_message_id,
|
|
||||||
))
|
|
||||||
|
|
||||||
session.flush()
|
|
||||||
count = session.query(FlickrUpload).count()
|
|
||||||
print(f" -> {count} flickr uploads migrated")
|
|
||||||
print(f" -> {linked} linked to sent messages")
|
|
||||||
return count
|
|
||||||
|
|
||||||
|
|
||||||
def migrate_thumbnail_cache(session) -> int:
|
|
||||||
"""Migrate thumbnail_cache.json to thumbnail_cache table."""
|
|
||||||
if not THUMBNAIL_CACHE_FILE.exists():
|
|
||||||
print("No thumbnail_cache.json found, skipping")
|
|
||||||
return 0
|
|
||||||
|
|
||||||
with open(THUMBNAIL_CACHE_FILE) as f:
|
|
||||||
cache = json.load(f)
|
|
||||||
|
|
||||||
thumbnails = cache.get("thumbnails", {})
|
|
||||||
cache_timestamp = int(cache.get("timestamp", 0))
|
|
||||||
|
|
||||||
print(f"Migrating {len(thumbnails)} cached thumbnails...")
|
|
||||||
|
|
||||||
for title, thumb_url in thumbnails.items():
|
|
||||||
session.add(ThumbnailCache(
|
|
||||||
title=title,
|
|
||||||
thumb_url=thumb_url,
|
|
||||||
fetched_at=cache_timestamp,
|
|
||||||
))
|
|
||||||
|
|
||||||
session.flush()
|
|
||||||
count = session.query(ThumbnailCache).count()
|
|
||||||
print(f" -> {count} thumbnail cache entries migrated")
|
|
||||||
return count
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
|
||||||
print("Initializing database...")
|
|
||||||
init_db()
|
|
||||||
|
|
||||||
session = get_session()
|
|
||||||
try:
|
|
||||||
# Check if already migrated
|
|
||||||
existing = session.query(Contribution).count()
|
|
||||||
if existing > 0:
|
|
||||||
print(f"Database already contains {existing} contributions. Aborting.")
|
|
||||||
print("Delete flickr_mail.db to re-run migration.")
|
|
||||||
return
|
|
||||||
|
|
||||||
migrate_contributions(session)
|
|
||||||
url_to_message_id = migrate_sent_messages(session)
|
|
||||||
migrate_flickr_uploads(session, url_to_message_id)
|
|
||||||
migrate_thumbnail_cache(session)
|
|
||||||
|
|
||||||
session.commit()
|
|
||||||
print("\nMigration complete!")
|
|
||||||
|
|
||||||
except Exception:
|
|
||||||
session.rollback()
|
|
||||||
raise
|
|
||||||
finally:
|
|
||||||
session.close()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
Find UploadWizard contributions that are from Flickr and add them to the database.
|
Find UploadWizard contributions that are from Flickr and add them to flickr_uploads.json.
|
||||||
|
|
||||||
For contributions with comment 'User created page with UploadWizard', queries the
|
For contributions with comment 'User created page with UploadWizard', queries the
|
||||||
Commons API to check if the image source is Flickr (by checking the Credit field).
|
Commons API to check if the image source is Flickr (by checking the Credit field).
|
||||||
|
|
@ -9,13 +9,12 @@ Commons API to check if the image source is Flickr (by checking the Credit field
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from flickr_mail.database import init_db, get_session
|
CONTRIBUTIONS_FILE = Path("commons_contributions/contributions.json")
|
||||||
from flickr_mail.models import Contribution, FlickrUpload, SentMessage
|
FLICKR_UPLOADS_FILE = Path("commons_contributions/flickr_uploads.json")
|
||||||
from flickr_mail.url_utils import normalize_flickr_url
|
|
||||||
|
|
||||||
COMMONS_API = "https://commons.wikimedia.org/w/api.php"
|
COMMONS_API = "https://commons.wikimedia.org/w/api.php"
|
||||||
USER_AGENT = "FlickrMail/1.0 (https://edwardbetts.com/flickr_mail/; edward@4angle.com)"
|
USER_AGENT = "FlickrMail/1.0 (https://edwardbetts.com/flickr_mail/; edward@4angle.com)"
|
||||||
|
|
||||||
|
|
@ -76,101 +75,99 @@ def clean_artist_name(artist_html: str) -> str:
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
init_db()
|
# Load contributions
|
||||||
session = get_session()
|
print("Loading contributions...")
|
||||||
|
with open(CONTRIBUTIONS_FILE) as f:
|
||||||
|
data = json.load(f)
|
||||||
|
|
||||||
try:
|
contributions = data.get("contributions", [])
|
||||||
# Get existing normalized flickr URLs to avoid duplicates
|
|
||||||
existing_urls = {
|
|
||||||
r[0] for r in session.query(FlickrUpload.normalized_flickr_url).all()
|
|
||||||
if r[0]
|
|
||||||
}
|
|
||||||
print(f"Existing uploads: {session.query(FlickrUpload).count()}")
|
|
||||||
print(f"Existing flickr URLs: {len(existing_urls)}")
|
|
||||||
|
|
||||||
# Build sent message index for FK linking
|
# Load existing flickr uploads
|
||||||
sent_messages = (
|
existing_flickr_urls = set()
|
||||||
session.query(SentMessage)
|
existing_uploads = []
|
||||||
.filter(SentMessage.normalized_flickr_url != "")
|
if FLICKR_UPLOADS_FILE.exists():
|
||||||
.filter(~SentMessage.subject.startswith("Re:"))
|
with open(FLICKR_UPLOADS_FILE) as f:
|
||||||
.all()
|
existing_uploads = json.load(f)
|
||||||
)
|
existing_flickr_urls = {u.get("flickr_url", "") for u in existing_uploads}
|
||||||
url_to_message = {msg.normalized_flickr_url: msg for msg in sent_messages}
|
# Also normalize existing URLs for comparison
|
||||||
|
for u in existing_uploads:
|
||||||
|
url = u.get("flickr_url", "")
|
||||||
|
normalized = url.replace("https://", "").replace("http://", "").replace("www.", "").rstrip("/")
|
||||||
|
existing_flickr_urls.add(normalized)
|
||||||
|
|
||||||
# Find UploadWizard contributions (page creations only)
|
print(f"Existing uploads: {len(existing_uploads)}")
|
||||||
upload_wizard = (
|
print(f"Existing flickr URLs: {len(existing_flickr_urls)}")
|
||||||
session.query(Contribution)
|
|
||||||
.filter(Contribution.comment == "User created page with UploadWizard")
|
|
||||||
.filter(Contribution.title.startswith("File:"))
|
|
||||||
.all()
|
|
||||||
)
|
|
||||||
|
|
||||||
print(f"UploadWizard contributions to check: {len(upload_wizard)}")
|
# Find UploadWizard contributions (page creations only)
|
||||||
|
upload_wizard_contributions = []
|
||||||
|
for c in contributions:
|
||||||
|
comment = c.get("comment", "")
|
||||||
|
if comment == "User created page with UploadWizard":
|
||||||
|
# Only include if it's a File: page
|
||||||
|
title = c.get("title", "")
|
||||||
|
if title.startswith("File:"):
|
||||||
|
upload_wizard_contributions.append(c)
|
||||||
|
|
||||||
# Process in batches of 50
|
print(f"UploadWizard contributions to check: {len(upload_wizard_contributions)}")
|
||||||
new_count = 0
|
|
||||||
batch_size = 50
|
|
||||||
|
|
||||||
for i in range(0, len(upload_wizard), batch_size):
|
# Process in batches of 50
|
||||||
batch = upload_wizard[i : i + batch_size]
|
new_uploads = []
|
||||||
titles = [c.title for c in batch]
|
batch_size = 50
|
||||||
|
|
||||||
print(
|
for i in range(0, len(upload_wizard_contributions), batch_size):
|
||||||
f"Processing batch {i // batch_size + 1}/"
|
batch = upload_wizard_contributions[i : i + batch_size]
|
||||||
f"{(len(upload_wizard) + batch_size - 1) // batch_size}..."
|
titles = [c["title"] for c in batch]
|
||||||
)
|
|
||||||
|
|
||||||
metadata = get_image_metadata(titles)
|
print(f"Processing batch {i // batch_size + 1}/{(len(upload_wizard_contributions) + batch_size - 1) // batch_size}...")
|
||||||
|
|
||||||
for c in batch:
|
metadata = get_image_metadata(titles)
|
||||||
meta = metadata.get(c.title, {})
|
|
||||||
credit = meta.get("credit", "")
|
|
||||||
artist = meta.get("artist", "")
|
|
||||||
|
|
||||||
flickr_url = extract_flickr_url_from_credit(credit)
|
for c in batch:
|
||||||
if not flickr_url:
|
title = c["title"]
|
||||||
continue
|
meta = metadata.get(title, {})
|
||||||
|
credit = meta.get("credit", "")
|
||||||
|
artist = meta.get("artist", "")
|
||||||
|
|
||||||
normalized = normalize_flickr_url(flickr_url)
|
flickr_url = extract_flickr_url_from_credit(credit)
|
||||||
if normalized in existing_urls:
|
if not flickr_url:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
creator = clean_artist_name(artist) if artist else None
|
# Check if we already have this URL
|
||||||
|
normalized = flickr_url.replace("https://", "").replace("http://", "").replace("www.", "").rstrip("/")
|
||||||
|
if normalized in existing_flickr_urls or flickr_url in existing_flickr_urls:
|
||||||
|
continue
|
||||||
|
|
||||||
# Look up sent message for FK linking
|
creator = clean_artist_name(artist) if artist else None
|
||||||
msg = url_to_message.get(normalized) if normalized else None
|
|
||||||
|
|
||||||
session.add(FlickrUpload(
|
new_upload = {
|
||||||
pageid=c.pageid,
|
"pageid": c["pageid"],
|
||||||
revid=c.revid,
|
"revid": c["revid"],
|
||||||
title=c.title,
|
"title": title,
|
||||||
timestamp=c.timestamp,
|
"timestamp": c["timestamp"],
|
||||||
flickr_url=flickr_url,
|
"flickr_url": flickr_url,
|
||||||
normalized_flickr_url=normalized,
|
"creator": creator,
|
||||||
creator=creator,
|
}
|
||||||
wikipedia_url=msg.wikipedia_url if msg else "",
|
|
||||||
creator_profile_url=msg.creator_profile_url if msg else "",
|
|
||||||
sent_message_id=msg.message_id if msg else None,
|
|
||||||
))
|
|
||||||
new_count += 1
|
|
||||||
existing_urls.add(normalized)
|
|
||||||
print(f" Found: {c.title[:50]} -> {flickr_url}")
|
|
||||||
|
|
||||||
session.commit()
|
new_uploads.append(new_upload)
|
||||||
|
existing_flickr_urls.add(normalized)
|
||||||
|
print(f" Found: {title[:50]} -> {flickr_url}")
|
||||||
|
|
||||||
# Rate limiting
|
# Rate limiting
|
||||||
if i + batch_size < len(upload_wizard):
|
if i + batch_size < len(upload_wizard_contributions):
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
|
|
||||||
total = session.query(FlickrUpload).count()
|
print(f"\nFound {len(new_uploads)} new Flickr uploads")
|
||||||
print(f"\nFound {new_count} new Flickr uploads")
|
|
||||||
print(f"Total: {total} uploads in database")
|
|
||||||
|
|
||||||
except Exception:
|
if new_uploads:
|
||||||
session.rollback()
|
# Merge and sort by timestamp (newest first)
|
||||||
raise
|
all_uploads = existing_uploads + new_uploads
|
||||||
finally:
|
all_uploads.sort(key=lambda x: x.get("timestamp", ""), reverse=True)
|
||||||
session.close()
|
|
||||||
|
# Save
|
||||||
|
with open(FLICKR_UPLOADS_FILE, "w") as f:
|
||||||
|
json.dump(all_uploads, f, indent=2)
|
||||||
|
|
||||||
|
print(f"Saved {len(all_uploads)} total uploads to {FLICKR_UPLOADS_FILE}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue