Add interaction logging and tighten model NOT NULL constraints
Log searches (article/category) and message-generation events to a new interaction_log table, capturing IP address and User-Agent. Also apply NOT NULL constraints to Contribution, SentMessage, FlickrUpload, and ThumbnailCache fields that are always populated, and remove stale continue_token references from category.html. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
252a854e76
commit
08f5128e8d
5 changed files with 160 additions and 76 deletions
20
AGENTS.md
20
AGENTS.md
|
|
@ -92,6 +92,8 @@ those obtained via Flickr mail requests.
|
|||
- `flickr_uploads`: derived table built by `update_flickr_uploads.py` by
|
||||
matching Commons uploads to Flickr URLs
|
||||
- `thumbnail_cache`: cached Commons API thumbnail URLs (7-day TTL)
|
||||
- `interaction_log`: written by the web app to record searches and message
|
||||
generation events (see below)
|
||||
|
||||
**Key functions**:
|
||||
- `get_recent_commons_uploads()`: Loads uploads, filters by sent mail match,
|
||||
|
|
@ -115,6 +117,24 @@ Builds/updates `flickr_uploads` from `contributions` and links to
|
|||
- Extracts Flickr URL from contribution comment when present
|
||||
- Falls back to Commons `extmetadata.Credit` lookup when comment has no URL
|
||||
|
||||
### Interaction Logging (`log_interaction`)
|
||||
|
||||
The `log_interaction()` helper writes a row to `interaction_log` on each
|
||||
meaningful user action:
|
||||
|
||||
- `"search_article"` – user submits a Wikipedia article search (page 1 only,
|
||||
to avoid logging every pagination hit)
|
||||
- `"search_category"` – user submits a Wikipedia category search
|
||||
- `"generate_message"` – a non-free CC message is generated after clicking a photo
|
||||
|
||||
Each row captures: Unix `timestamp`, `interaction_type`, `ip_address`
|
||||
(prefers `X-Forwarded-For` for proxy setups), `user_agent`, `query` (article
|
||||
title or category name), and optionally `flickr_url` / `wikipedia_url`.
|
||||
|
||||
The table is created by `init_db()` (called via `python3 -c "from
|
||||
flickr_mail.database import init_db; init_db()"` or any of the maintenance
|
||||
scripts). The web app never calls `init_db()` itself.
|
||||
|
||||
### Category Search (`/category` route)
|
||||
|
||||
Finds Wikipedia articles in a category that don't have images.
|
||||
|
|
|
|||
12
README.md
12
README.md
|
|
@ -79,6 +79,18 @@ cp download_sent_mail.example.json download_sent_mail.local.json
|
|||
Then edit `download_sent_mail.local.json` and set `cookies_str` to your full
|
||||
Flickr `Cookie` header value.
|
||||
|
||||
## Interaction Logging
|
||||
|
||||
The app logs searches and message generation to the `interaction_log` table:
|
||||
|
||||
- `search_article`: when a user searches for a Wikipedia article title (page 1 only)
|
||||
- `search_category`: when a user searches a Wikipedia category
|
||||
- `generate_message`: when a non-free CC message is generated for a photo
|
||||
|
||||
Each row records the timestamp, interaction type, client IP (from
|
||||
`X-Forwarded-For` if present), User-Agent, query, the Wikipedia URL (for
|
||||
article searches and message events), and the Flickr URL (for message events).
|
||||
|
||||
## Notes
|
||||
|
||||
- `download_commons_contributions.py` uses an overlap window of known-only
|
||||
|
|
|
|||
|
|
@ -12,20 +12,20 @@ class Contribution(Base):
|
|||
__tablename__ = "contributions"
|
||||
|
||||
id: Mapped[int] = mapped_column(primary_key=True)
|
||||
userid: Mapped[int | None]
|
||||
user: Mapped[str | None]
|
||||
pageid: Mapped[int | None]
|
||||
revid: Mapped[int | None] = mapped_column(unique=True)
|
||||
parentid: Mapped[int | None]
|
||||
ns: Mapped[int | None]
|
||||
title: Mapped[str | None]
|
||||
timestamp: Mapped[str | None]
|
||||
userid: Mapped[int]
|
||||
user: Mapped[str]
|
||||
pageid: Mapped[int]
|
||||
revid: Mapped[int] = mapped_column(unique=True)
|
||||
parentid: Mapped[int]
|
||||
ns: Mapped[int]
|
||||
title: Mapped[str]
|
||||
timestamp: Mapped[str]
|
||||
minor: Mapped[str | None]
|
||||
top: Mapped[str | None]
|
||||
comment: Mapped[str | None] = mapped_column(Text)
|
||||
size: Mapped[int | None]
|
||||
sizediff: Mapped[int | None]
|
||||
tags: Mapped[str | None] = mapped_column(Text) # JSON array stored as text
|
||||
comment: Mapped[str] = mapped_column(Text)
|
||||
size: Mapped[int]
|
||||
sizediff: Mapped[int]
|
||||
tags: Mapped[str] = mapped_column(Text) # JSON array stored as text
|
||||
|
||||
__table_args__ = (
|
||||
Index("ix_contributions_timestamp", "timestamp"),
|
||||
|
|
@ -37,16 +37,16 @@ class SentMessage(Base):
|
|||
__tablename__ = "sent_messages"
|
||||
|
||||
message_id: Mapped[str] = mapped_column(primary_key=True)
|
||||
subject: Mapped[str | None]
|
||||
url: Mapped[str | None]
|
||||
recipient: Mapped[str | None]
|
||||
date: Mapped[str | None]
|
||||
body: Mapped[str | None] = mapped_column(Text)
|
||||
body_html: Mapped[str | None] = mapped_column(Text)
|
||||
flickr_url: Mapped[str | None]
|
||||
normalized_flickr_url: Mapped[str | None]
|
||||
wikipedia_url: Mapped[str | None]
|
||||
creator_profile_url: Mapped[str | None]
|
||||
subject: Mapped[str]
|
||||
url: Mapped[str]
|
||||
recipient: Mapped[str]
|
||||
date: Mapped[str]
|
||||
body: Mapped[str] = mapped_column(Text)
|
||||
body_html: Mapped[str] = mapped_column(Text)
|
||||
flickr_url: Mapped[str]
|
||||
normalized_flickr_url: Mapped[str]
|
||||
wikipedia_url: Mapped[str]
|
||||
creator_profile_url: Mapped[str]
|
||||
|
||||
flickr_uploads: Mapped[list["FlickrUpload"]] = relationship(
|
||||
back_populates="sent_message"
|
||||
|
|
@ -62,15 +62,15 @@ class FlickrUpload(Base):
|
|||
__tablename__ = "flickr_uploads"
|
||||
|
||||
id: Mapped[int] = mapped_column(primary_key=True)
|
||||
pageid: Mapped[int | None]
|
||||
revid: Mapped[int | None]
|
||||
title: Mapped[str | None]
|
||||
timestamp: Mapped[str | None]
|
||||
flickr_url: Mapped[str | None]
|
||||
normalized_flickr_url: Mapped[str | None]
|
||||
pageid: Mapped[int]
|
||||
revid: Mapped[int]
|
||||
title: Mapped[str]
|
||||
timestamp: Mapped[str]
|
||||
flickr_url: Mapped[str]
|
||||
normalized_flickr_url: Mapped[str]
|
||||
creator: Mapped[str | None]
|
||||
wikipedia_url: Mapped[str | None]
|
||||
creator_profile_url: Mapped[str | None]
|
||||
wikipedia_url: Mapped[str]
|
||||
creator_profile_url: Mapped[str]
|
||||
sent_message_id: Mapped[str | None] = mapped_column(
|
||||
ForeignKey("sent_messages.message_id")
|
||||
)
|
||||
|
|
@ -89,5 +89,23 @@ class ThumbnailCache(Base):
|
|||
__tablename__ = "thumbnail_cache"
|
||||
|
||||
title: Mapped[str] = mapped_column(primary_key=True)
|
||||
thumb_url: Mapped[str | None]
|
||||
fetched_at: Mapped[int | None] # Unix timestamp
|
||||
thumb_url: Mapped[str]
|
||||
fetched_at: Mapped[int] # Unix timestamp
|
||||
|
||||
|
||||
class InteractionLog(Base):
    """One row per logged user interaction (a search or a message generation)."""

    __tablename__ = "interaction_log"
    __table_args__ = (
        Index("ix_interaction_log_timestamp", "timestamp"),
        Index("ix_interaction_log_type", "interaction_type"),
    )

    id: Mapped[int] = mapped_column(primary_key=True)

    # Unix timestamp
    timestamp: Mapped[int]
    # "search_article", "search_category", "generate_message"
    interaction_type: Mapped[str]

    # Request metadata; each of these columns is nullable.
    ip_address: Mapped[str | None]
    user_agent: Mapped[str | None] = mapped_column(Text)

    # search term or category name
    query: Mapped[str | None]
    flickr_url: Mapped[str | None]
    wikipedia_url: Mapped[str | None]
|
||||
|
|
|
|||
76
main.py
76
main.py
|
|
@ -18,7 +18,7 @@ from sqlalchemy import func
|
|||
from werkzeug.debug.tbtools import DebugTraceback
|
||||
|
||||
from flickr_mail.database import get_session
|
||||
from flickr_mail.models import FlickrUpload, SentMessage, ThumbnailCache
|
||||
from flickr_mail.models import FlickrUpload, InteractionLog, SentMessage, ThumbnailCache
|
||||
from flickr_mail.url_utils import extract_urls_from_message, normalize_flickr_url
|
||||
|
||||
import re
|
||||
|
|
@ -378,15 +378,12 @@ def has_content_image(images: list[dict]) -> bool:
|
|||
return False
|
||||
|
||||
|
||||
def get_articles_without_images(
|
||||
category: str, limit: int = 100
|
||||
) -> tuple[list[ArticleWithoutImage], str | None]:
|
||||
def get_articles_without_images(category: str) -> list[ArticleWithoutImage]:
|
||||
"""Get articles in a category that don't have images.
|
||||
|
||||
Uses generator=categorymembers with prop=images to efficiently check
|
||||
multiple articles in a single API request.
|
||||
|
||||
Returns a tuple of (articles_list, continue_token).
|
||||
multiple articles in a single API request, following continuation until
|
||||
all category members have been processed.
|
||||
"""
|
||||
params = {
|
||||
"action": "query",
|
||||
|
|
@ -394,25 +391,30 @@ def get_articles_without_images(
|
|||
"gcmtitle": category,
|
||||
"gcmtype": "page", # Only articles, not subcategories or files
|
||||
"gcmnamespace": "0", # Main namespace only
|
||||
"gcmlimit": str(limit),
|
||||
"gcmlimit": "max",
|
||||
"prop": "images",
|
||||
"imlimit": "max", # Need enough to check all pages in batch
|
||||
"format": "json",
|
||||
}
|
||||
|
||||
headers = {"User-Agent": WIKIMEDIA_USER_AGENT}
|
||||
articles_without_images: list[ArticleWithoutImage] = []
|
||||
continue_token: str | None = None
|
||||
|
||||
while True:
|
||||
request_params = params.copy()
|
||||
if continue_token:
|
||||
request_params["gcmcontinue"] = continue_token
|
||||
|
||||
try:
|
||||
response = requests.get(
|
||||
WIKIPEDIA_API, params=params, headers=headers, timeout=30
|
||||
WIKIPEDIA_API, params=request_params, headers=headers, timeout=30
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
except (requests.RequestException, json.JSONDecodeError) as e:
|
||||
print(f"Wikipedia API error: {e}")
|
||||
return [], None
|
||||
|
||||
articles_without_images: list[ArticleWithoutImage] = []
|
||||
break
|
||||
|
||||
pages = data.get("query", {}).get("pages", {})
|
||||
for page in pages.values():
|
||||
|
|
@ -430,13 +432,13 @@ def get_articles_without_images(
|
|||
ArticleWithoutImage(title=title, pageid=pageid)
|
||||
)
|
||||
|
||||
continue_token = data.get("continue", {}).get("gcmcontinue")
|
||||
if not continue_token:
|
||||
break
|
||||
|
||||
# Sort by title for consistent display
|
||||
articles_without_images.sort(key=lambda a: a.title)
|
||||
|
||||
# Get continue token if there are more results
|
||||
continue_token = data.get("continue", {}).get("gcmcontinue")
|
||||
|
||||
return articles_without_images, continue_token
|
||||
return articles_without_images
|
||||
|
||||
|
||||
def is_valid_flickr_image_url(url: str) -> bool:
|
||||
|
|
@ -583,6 +585,33 @@ def parse_flickr_search_results(html: str, page: int = 1) -> SearchResult:
|
|||
)
|
||||
|
||||
|
||||
def log_interaction(
    interaction_type: str,
    query: str | None = None,
    flickr_url: str | None = None,
    wikipedia_url: str | None = None,
) -> None:
    """Record a user interaction in the ``interaction_log`` table.

    Logging is best-effort: a database error while writing the row is
    rolled back and reported through the Flask application logger instead
    of propagating, so an analytics failure does not break the page the
    user asked for.

    Must be called inside a Flask request context, because the client IP
    and User-Agent are read from ``flask.request``.

    Args:
        interaction_type: Kind of event, e.g. ``"search_article"``,
            ``"search_category"`` or ``"generate_message"``.
        query: Search term or category name, if any.
        flickr_url: Flickr photo URL associated with the event, if any.
        wikipedia_url: Wikipedia article URL associated with the event, if any.
    """
    # Imported here so this function does not rely on a new module-level import.
    from sqlalchemy.exc import SQLAlchemyError

    request = flask.request

    # X-Forwarded-For may carry a comma-separated list of addresses; the
    # first entry is used. If the header is absent, or its first entry is
    # blank, fall back to the socket peer address.
    # NOTE(review): the header is client-supplied, so it is only trustworthy
    # when a reverse proxy sets or overwrites it - confirm for the deployment.
    forwarded_for = request.headers.get("X-Forwarded-For", "")
    ip_address = forwarded_for.split(",")[0].strip() or request.remote_addr

    session = get_session()
    try:
        session.add(
            InteractionLog(
                timestamp=int(time.time()),
                interaction_type=interaction_type,
                ip_address=ip_address,
                user_agent=request.headers.get("User-Agent"),
                query=query,
                flickr_url=flickr_url,
                wikipedia_url=wikipedia_url,
            )
        )
        session.commit()
    except SQLAlchemyError:
        # Discard the failed transaction and keep serving the request.
        session.rollback()
        flask.current_app.logger.exception(
            "Failed to log %s interaction", interaction_type
        )
    finally:
        session.close()
|
||||
|
||||
|
||||
@app.errorhandler(werkzeug.exceptions.InternalServerError)
|
||||
def exception_handler(e: werkzeug.exceptions.InternalServerError) -> tuple[str, int]:
|
||||
"""Handle exception."""
|
||||
|
|
@ -656,6 +685,8 @@ def start() -> str:
|
|||
# Search Flickr for photos
|
||||
page = flask.request.args.get("page", 1, type=int)
|
||||
page = max(1, page) # Ensure page is at least 1
|
||||
if page == 1:
|
||||
log_interaction("search_article", query=name, wikipedia_url=wikipedia_url)
|
||||
search_result = search_flickr(name, page)
|
||||
return flask.render_template(
|
||||
"combined.html",
|
||||
|
|
@ -718,6 +749,13 @@ def start() -> str:
|
|||
previous_messages=previous_messages,
|
||||
)
|
||||
|
||||
log_interaction(
|
||||
"generate_message",
|
||||
query=name,
|
||||
flickr_url=flickr_url,
|
||||
wikipedia_url=wikipedia_url,
|
||||
)
|
||||
|
||||
msg = flask.render_template(
|
||||
"message.jinja",
|
||||
flickr_url=flickr_url,
|
||||
|
|
@ -768,7 +806,8 @@ def category_search() -> str:
|
|||
cat=cat,
|
||||
)
|
||||
|
||||
articles, continue_token = get_articles_without_images(category)
|
||||
log_interaction("search_category", query=category)
|
||||
articles = get_articles_without_images(category)
|
||||
|
||||
# Get the display name (without Category: prefix)
|
||||
category_name = category.replace("Category:", "")
|
||||
|
|
@ -779,7 +818,6 @@ def category_search() -> str:
|
|||
category=category,
|
||||
category_name=category_name,
|
||||
articles=articles,
|
||||
continue_token=continue_token,
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -33,7 +33,7 @@
|
|||
<h5>Articles without images in <a href="https://en.wikipedia.org/wiki/{{ category | replace(' ', '_') }}" target="_blank">{{ category_name }}</a></h5>
|
||||
|
||||
{% if articles %}
|
||||
<p class="text-muted small">Found {{ articles | length }} article(s) without images{% if continue_token %} (more available){% endif %}</p>
|
||||
<p class="text-muted small">Found {{ articles | length }} article(s) without images</p>
|
||||
|
||||
<div class="list-group">
|
||||
{% for article in articles %}
|
||||
|
|
@ -44,10 +44,6 @@
|
|||
{% endfor %}
|
||||
</div>
|
||||
|
||||
{% if continue_token %}
|
||||
<p class="text-muted small mt-3">Note: Only showing first batch of results. More articles may be available in this category.</p>
|
||||
{% endif %}
|
||||
|
||||
{% else %}
|
||||
<div class="alert alert-success mt-3">
|
||||
All articles in this category have images!
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue