diff --git a/AGENTS.md b/AGENTS.md index 47b3193..ef5ddb7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -92,6 +92,8 @@ those obtained via Flickr mail requests. - `flickr_uploads`: derived table built by `update_flickr_uploads.py` by matching Commons uploads to Flickr URLs - `thumbnail_cache`: cached Commons API thumbnail URLs (7-day TTL) +- `interaction_log`: written by the web app to record searches and message + generation events (see below) **Key functions**: - `get_recent_commons_uploads()`: Loads uploads, filters by sent mail match, @@ -115,6 +117,24 @@ Builds/updates `flickr_uploads` from `contributions` and links to - Extracts Flickr URL from contribution comment when present - Falls back to Commons `extmetadata.Credit` lookup when comment has no URL +### Interaction Logging (`log_interaction`) + +The `log_interaction()` helper writes a row to `interaction_log` on each +meaningful user action: + +- `"search_article"` – user submits a Wikipedia article search (page 1 only, + to avoid logging every pagination hit) +- `"search_category"` – user submits a Wikipedia category search +- `"generate_message"` – a non-free CC message is generated after clicking a photo + +Each row captures: Unix `timestamp`, `interaction_type`, `ip_address` +(prefers `X-Forwarded-For` for proxy setups), `user_agent`, `query` (article +title or category name), and optionally `flickr_url` / `wikipedia_url`. + +The table is created by `init_db()` (called via `python3 -c "from +flickr_mail.database import init_db; init_db()"` or any of the maintenance +scripts). The web app never calls `init_db()` itself. + ### Category Search (`/category` route) Finds Wikipedia articles in a category that don't have images. diff --git a/README.md b/README.md index b8d984d..3848a3e 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,18 @@ cp download_sent_mail.example.json download_sent_mail.local.json Then edit `download_sent_mail.local.json` and set `cookies_str` to your full Flickr `Cookie` header value. +## Interaction Logging + +The app logs searches and message generation to the `interaction_log` table: + +- `search_article`: when a user searches for a Wikipedia article title (page 1 only) +- `search_category`: when a user searches a Wikipedia category +- `generate_message`: when a non-free CC message is generated for a photo + +Each row records the timestamp, interaction type, client IP (from +`X-Forwarded-For` if present), User-Agent, query, and (for message events) +the Flickr and Wikipedia URLs. + ## Notes - `download_commons_contributions.py` uses an overlap window of known-only diff --git a/flickr_mail/models.py b/flickr_mail/models.py index 090b37a..0b2bedc 100644 --- a/flickr_mail/models.py +++ b/flickr_mail/models.py @@ -12,20 +12,20 @@ class Contribution(Base): __tablename__ = "contributions" id: Mapped[int] = mapped_column(primary_key=True) - userid: Mapped[int | None] - user: Mapped[str | None] - pageid: Mapped[int | None] - revid: Mapped[int | None] = mapped_column(unique=True) - parentid: Mapped[int | None] - ns: Mapped[int | None] - title: Mapped[str | None] - timestamp: Mapped[str | None] + userid: Mapped[int] + user: Mapped[str] + pageid: Mapped[int] + revid: Mapped[int] = mapped_column(unique=True) + parentid: Mapped[int] + ns: Mapped[int] + title: Mapped[str] + timestamp: Mapped[str] minor: Mapped[str | None] top: Mapped[str | None] - comment: Mapped[str | None] = mapped_column(Text) - size: Mapped[int | None] - sizediff: Mapped[int | None] - tags: Mapped[str | None] = mapped_column(Text) # JSON array stored as text + comment: Mapped[str] = mapped_column(Text) + size: Mapped[int] + sizediff: Mapped[int] + tags: Mapped[str] = mapped_column(Text) # JSON array stored as text __table_args__ = ( Index("ix_contributions_timestamp", "timestamp"), @@ -37,16 +37,16 @@ class SentMessage(Base): __tablename__ = "sent_messages" message_id: Mapped[str] = mapped_column(primary_key=True) - subject: Mapped[str | None] - url: Mapped[str | None] - recipient: Mapped[str | None] - date: Mapped[str | None] - body: Mapped[str | None] = mapped_column(Text) - body_html: Mapped[str | None] = mapped_column(Text) - flickr_url: Mapped[str | None] - normalized_flickr_url: Mapped[str | None] - wikipedia_url: Mapped[str | None] - creator_profile_url: Mapped[str | None] + subject: Mapped[str] + url: Mapped[str] + recipient: Mapped[str] + date: Mapped[str] + body: Mapped[str] = mapped_column(Text) + body_html: Mapped[str] = mapped_column(Text) + flickr_url: Mapped[str] + normalized_flickr_url: Mapped[str] + wikipedia_url: Mapped[str] + creator_profile_url: Mapped[str] flickr_uploads: Mapped[list["FlickrUpload"]] = relationship( back_populates="sent_message" @@ -62,15 +62,15 @@ class FlickrUpload(Base): __tablename__ = "flickr_uploads" id: Mapped[int] = mapped_column(primary_key=True) - pageid: Mapped[int | None] - revid: Mapped[int | None] - title: Mapped[str | None] - timestamp: Mapped[str | None] - flickr_url: Mapped[str | None] - normalized_flickr_url: Mapped[str | None] + pageid: Mapped[int] + revid: Mapped[int] + title: Mapped[str] + timestamp: Mapped[str] + flickr_url: Mapped[str] + normalized_flickr_url: Mapped[str] creator: Mapped[str | None] - wikipedia_url: Mapped[str | None] - creator_profile_url: Mapped[str | None] + wikipedia_url: Mapped[str] + creator_profile_url: Mapped[str] sent_message_id: Mapped[str | None] = mapped_column( ForeignKey("sent_messages.message_id") ) @@ -89,5 +89,23 @@ class ThumbnailCache(Base): __tablename__ = "thumbnail_cache" title: Mapped[str] = mapped_column(primary_key=True) - thumb_url: Mapped[str | None] - fetched_at: Mapped[int | None] # Unix timestamp + thumb_url: Mapped[str] + fetched_at: Mapped[int] # Unix timestamp + + +class InteractionLog(Base): + __tablename__ = "interaction_log" + + id: Mapped[int] = mapped_column(primary_key=True) + timestamp: Mapped[int] # Unix timestamp + interaction_type: Mapped[str] # "search_article", "search_category", "generate_message" + ip_address: Mapped[str | None] + user_agent: Mapped[str | None] = mapped_column(Text) + query: Mapped[str | None] # search term or category name + flickr_url: Mapped[str | None] + wikipedia_url: Mapped[str | None] + + __table_args__ = ( + Index("ix_interaction_log_timestamp", "timestamp"), + Index("ix_interaction_log_type", "interaction_type"), + ) diff --git a/main.py b/main.py index 25279ff..3e7c336 100755 --- a/main.py +++ b/main.py @@ -18,7 +18,7 @@ from sqlalchemy import func from werkzeug.debug.tbtools import DebugTraceback from flickr_mail.database import get_session -from flickr_mail.models import FlickrUpload, SentMessage, ThumbnailCache +from flickr_mail.models import FlickrUpload, InteractionLog, SentMessage, ThumbnailCache from flickr_mail.url_utils import extract_urls_from_message, normalize_flickr_url import re @@ -378,15 +378,12 @@ def has_content_image(images: list[dict]) -> bool: return False -def get_articles_without_images( - category: str, limit: int = 100 -) -> tuple[list[ArticleWithoutImage], str | None]: +def get_articles_without_images(category: str) -> list[ArticleWithoutImage]: """Get articles in a category that don't have images. Uses generator=categorymembers with prop=images to efficiently check - multiple articles in a single API request. - - Returns a tuple of (articles_list, continue_token). + multiple articles in a single API request, following continuation until + all category members have been processed. """ params = { "action": "query", @@ -394,49 +391,54 @@ def get_articles_without_images( "gcmtitle": category, "gcmtype": "page", # Only articles, not subcategories or files "gcmnamespace": "0", # Main namespace only - "gcmlimit": str(limit), + "gcmlimit": "max", "prop": "images", "imlimit": "max", # Need enough to check all pages in batch "format": "json", } headers = {"User-Agent": WIKIMEDIA_USER_AGENT} - - try: - response = requests.get( - WIKIPEDIA_API, params=params, headers=headers, timeout=30 - ) - response.raise_for_status() - data = response.json() - except (requests.RequestException, json.JSONDecodeError) as e: - print(f"Wikipedia API error: {e}") - return [], None - articles_without_images: list[ArticleWithoutImage] = [] + continue_token: str | None = None - pages = data.get("query", {}).get("pages", {}) - for page in pages.values(): - images = page.get("images", []) + while True: + request_params = params.copy() + if continue_token: + request_params["gcmcontinue"] = continue_token - # Skip if page has content images (not just UI icons) - if has_content_image(images): - continue - - title = page.get("title", "") - pageid = page.get("pageid", 0) - - if title and pageid: - articles_without_images.append( - ArticleWithoutImage(title=title, pageid=pageid) + try: + response = requests.get( + WIKIPEDIA_API, params=request_params, headers=headers, timeout=30 ) + response.raise_for_status() + data = response.json() + except (requests.RequestException, json.JSONDecodeError) as e: + print(f"Wikipedia API error: {e}") + break + + pages = data.get("query", {}).get("pages", {}) + for page in pages.values(): + images = page.get("images", []) + + # Skip if page has content images (not just UI icons) + if has_content_image(images): + continue + + title = page.get("title", "") + pageid = page.get("pageid", 0) + + if title and pageid: + articles_without_images.append( + ArticleWithoutImage(title=title, pageid=pageid) + ) + + continue_token = data.get("continue", {}).get("gcmcontinue") + if not continue_token: + break # Sort by title for consistent display articles_without_images.sort(key=lambda a: a.title) - - # Get continue token if there are more results - continue_token = data.get("continue", {}).get("gcmcontinue") - - return articles_without_images, continue_token + return articles_without_images def is_valid_flickr_image_url(url: str) -> bool: @@ -583,6 +585,33 @@ def parse_flickr_search_results(html: str, page: int = 1) -> SearchResult: ) +def log_interaction( + interaction_type: str, + query: str | None = None, + flickr_url: str | None = None, + wikipedia_url: str | None = None, +) -> None: + """Log a user interaction to the database.""" + forwarded_for = flask.request.headers.get("X-Forwarded-For") + ip_address = forwarded_for.split(",")[0].strip() if forwarded_for else flask.request.remote_addr + user_agent = flask.request.headers.get("User-Agent") + session = get_session() + try: + entry = InteractionLog( + timestamp=int(time.time()), + interaction_type=interaction_type, + ip_address=ip_address, + user_agent=user_agent, + query=query, + flickr_url=flickr_url, + wikipedia_url=wikipedia_url, + ) + session.add(entry) + session.commit() + finally: + session.close() + + @app.errorhandler(werkzeug.exceptions.InternalServerError) def exception_handler(e: werkzeug.exceptions.InternalServerError) -> tuple[str, int]: """Handle exception.""" @@ -656,6 +685,8 @@ def start() -> str: # Search Flickr for photos page = flask.request.args.get("page", 1, type=int) page = max(1, page) # Ensure page is at least 1 + if page == 1: + log_interaction("search_article", query=name, wikipedia_url=wikipedia_url) search_result = search_flickr(name, page) return flask.render_template( "combined.html", @@ -718,6 +749,13 @@ def start() -> str: previous_messages=previous_messages, ) + log_interaction( + "generate_message", + query=name, + flickr_url=flickr_url, + wikipedia_url=wikipedia_url, + ) + msg = flask.render_template( "message.jinja", flickr_url=flickr_url, @@ -768,7 +806,8 @@ def category_search() -> str: cat=cat, ) - articles, continue_token = get_articles_without_images(category) + log_interaction("search_category", query=category) + articles = get_articles_without_images(category) # Get the display name (without Category: prefix) category_name = category.replace("Category:", "") @@ -779,7 +818,6 @@ def category_search() -> str: category=category, category_name=category_name, articles=articles, - continue_token=continue_token, ) diff --git a/templates/category.html b/templates/category.html index da52980..31319b6 100644 --- a/templates/category.html +++ b/templates/category.html @@ -33,7 +33,7 @@
Articles without images in {{ category_name }}
{% if articles %} -

Found {{ articles | length }} article(s) without images{% if continue_token %} (more available){% endif %}

+

Found {{ articles | length }} article(s) without images

{% for article in articles %} @@ -44,10 +44,6 @@ {% endfor %}
- {% if continue_token %} -

Note: Only showing first batch of results. More articles may be available in this category.

- {% endif %} - {% else %}
All articles in this category have images!