Add pagination to category search for large categories
Large categories such as "Living people" (900k+ articles) were impractical to search because the code tried to download every category member before displaying any results. The search now stops once about 200 articles without images have been collected and provides a "Next page" link that resumes from that point.

This commit also fixes the handling of the MediaWiki API continuation protocol: the full continue dict (not just gcmcontinue) is now passed back on each follow-up request so that imcontinue responses are handled properly, and gcmlimit is reduced from "max" to 50 so that each batch's images fit in one API response.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
ab012f9cf3
commit
57b2e474df
2 changed files with 54 additions and 15 deletions
63
main.py
63
main.py
|
|
@ -348,6 +348,14 @@ class ArticleWithoutImage:
|
||||||
return f"/?enwp={quote(self.title)}"
|
return f"/?enwp={quote(self.title)}"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclasses.dataclass
|
||||||
|
class CategoryResult:
|
||||||
|
"""Result of a paginated category search."""
|
||||||
|
|
||||||
|
articles: list[ArticleWithoutImage]
|
||||||
|
gcmcontinue: str | None
|
||||||
|
|
||||||
|
|
||||||
# Common non-content images to ignore when checking if an article has images
|
# Common non-content images to ignore when checking if an article has images
|
||||||
NON_CONTENT_IMAGE_PATTERNS = [
|
NON_CONTENT_IMAGE_PATTERNS = [
|
||||||
"OOjs UI icon",
|
"OOjs UI icon",
|
||||||
|
|
@ -378,12 +386,16 @@ def has_content_image(images: list[dict]) -> bool:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def get_articles_without_images(category: str) -> list[ArticleWithoutImage]:
|
def get_articles_without_images(
|
||||||
|
category: str,
|
||||||
|
limit: int = 200,
|
||||||
|
gcmcontinue: str | None = None,
|
||||||
|
) -> CategoryResult:
|
||||||
"""Get articles in a category that don't have images.
|
"""Get articles in a category that don't have images.
|
||||||
|
|
||||||
Uses generator=categorymembers with prop=images to efficiently check
|
Uses generator=categorymembers with prop=images to efficiently check
|
||||||
multiple articles in a single API request, following continuation until
|
multiple articles in a single API request, following continuation until
|
||||||
all category members have been processed.
|
the limit is reached or all category members have been processed.
|
||||||
"""
|
"""
|
||||||
params = {
|
params = {
|
||||||
"action": "query",
|
"action": "query",
|
||||||
|
|
@ -391,20 +403,25 @@ def get_articles_without_images(category: str) -> list[ArticleWithoutImage]:
|
||||||
"gcmtitle": category,
|
"gcmtitle": category,
|
||||||
"gcmtype": "page", # Only articles, not subcategories or files
|
"gcmtype": "page", # Only articles, not subcategories or files
|
||||||
"gcmnamespace": "0", # Main namespace only
|
"gcmnamespace": "0", # Main namespace only
|
||||||
"gcmlimit": "max",
|
"gcmlimit": "50", # Small batches so images fit in one response
|
||||||
"prop": "images",
|
"prop": "images",
|
||||||
"imlimit": "max", # Need enough to check all pages in batch
|
"imlimit": "max",
|
||||||
"format": "json",
|
"format": "json",
|
||||||
}
|
}
|
||||||
|
|
||||||
headers = {"User-Agent": WIKIMEDIA_USER_AGENT}
|
headers = {"User-Agent": WIKIMEDIA_USER_AGENT}
|
||||||
articles_without_images: list[ArticleWithoutImage] = []
|
articles_without_images: list[ArticleWithoutImage] = []
|
||||||
continue_token: str | None = None
|
seen_pageids: set[int] = set()
|
||||||
|
next_gcmcontinue: str | None = None
|
||||||
|
|
||||||
|
# Build initial continue params from the external pagination token
|
||||||
|
continue_params: dict[str, str] = {}
|
||||||
|
if gcmcontinue:
|
||||||
|
continue_params = {"gcmcontinue": gcmcontinue, "continue": "gcmcontinue||"}
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
request_params = params.copy()
|
request_params = params.copy()
|
||||||
if continue_token:
|
request_params.update(continue_params)
|
||||||
request_params["gcmcontinue"] = continue_token
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
|
|
@ -418,6 +435,11 @@ def get_articles_without_images(category: str) -> list[ArticleWithoutImage]:
|
||||||
|
|
||||||
pages = data.get("query", {}).get("pages", {})
|
pages = data.get("query", {}).get("pages", {})
|
||||||
for page in pages.values():
|
for page in pages.values():
|
||||||
|
pageid = page.get("pageid", 0)
|
||||||
|
if not pageid or pageid in seen_pageids:
|
||||||
|
continue
|
||||||
|
seen_pageids.add(pageid)
|
||||||
|
|
||||||
images = page.get("images", [])
|
images = page.get("images", [])
|
||||||
|
|
||||||
# Skip if page has content images (not just UI icons)
|
# Skip if page has content images (not just UI icons)
|
||||||
|
|
@ -425,20 +447,29 @@ def get_articles_without_images(category: str) -> list[ArticleWithoutImage]:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
title = page.get("title", "")
|
title = page.get("title", "")
|
||||||
pageid = page.get("pageid", 0)
|
if title:
|
||||||
|
|
||||||
if title and pageid:
|
|
||||||
articles_without_images.append(
|
articles_without_images.append(
|
||||||
ArticleWithoutImage(title=title, pageid=pageid)
|
ArticleWithoutImage(title=title, pageid=pageid)
|
||||||
)
|
)
|
||||||
|
|
||||||
continue_token = data.get("continue", {}).get("gcmcontinue")
|
api_continue = data.get("continue")
|
||||||
if not continue_token:
|
if not api_continue:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# Only stop at generator boundaries where we have a resumable token
|
||||||
|
gcmc = api_continue.get("gcmcontinue")
|
||||||
|
if gcmc and len(articles_without_images) >= limit:
|
||||||
|
next_gcmcontinue = gcmc
|
||||||
|
break
|
||||||
|
|
||||||
|
continue_params = api_continue
|
||||||
|
|
||||||
# Sort by title for consistent display
|
# Sort by title for consistent display
|
||||||
articles_without_images.sort(key=lambda a: a.title)
|
articles_without_images.sort(key=lambda a: a.title)
|
||||||
return articles_without_images
|
return CategoryResult(
|
||||||
|
articles=articles_without_images,
|
||||||
|
gcmcontinue=next_gcmcontinue,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def is_valid_flickr_image_url(url: str) -> bool:
|
def is_valid_flickr_image_url(url: str) -> bool:
|
||||||
|
|
@ -807,7 +838,8 @@ def category_search() -> str:
|
||||||
)
|
)
|
||||||
|
|
||||||
log_interaction("search_category", query=category)
|
log_interaction("search_category", query=category)
|
||||||
articles = get_articles_without_images(category)
|
gcmcontinue = flask.request.args.get("gcmcontinue") or None
|
||||||
|
result = get_articles_without_images(category, gcmcontinue=gcmcontinue)
|
||||||
|
|
||||||
# Get the display name (without Category: prefix)
|
# Get the display name (without Category: prefix)
|
||||||
category_name = category.replace("Category:", "")
|
category_name = category.replace("Category:", "")
|
||||||
|
|
@ -817,7 +849,8 @@ def category_search() -> str:
|
||||||
cat=cat,
|
cat=cat,
|
||||||
category=category,
|
category=category,
|
||||||
category_name=category_name,
|
category_name=category_name,
|
||||||
articles=articles,
|
articles=result.articles,
|
||||||
|
gcmcontinue=result.gcmcontinue,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -44,6 +44,12 @@
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
{% if gcmcontinue %}
|
||||||
|
<div class="mt-3">
|
||||||
|
<a href="{{ url_for('category_search', cat=cat, gcmcontinue=gcmcontinue) }}" class="btn btn-outline-primary">Next page »</a>
|
||||||
|
</div>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
{% else %}
|
{% else %}
|
||||||
<div class="alert alert-success mt-3">
|
<div class="alert alert-success mt-3">
|
||||||
All articles in this category have images!
|
All articles in this category have images!
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue