Show Articles With Multiple Dablinks report on index page.

This commit is contained in:
Edward Betts 2022-08-17 08:52:45 +01:00
parent 6f4a5ecc56
commit d499c896b4
2 changed files with 47 additions and 11 deletions

View file

@ -2,8 +2,11 @@
{% block content %} {% block content %}
<ul> <ul>
{% for enwiki in articles %} {% for enwiki, count in articles %}
<li><a href="{{ url_for("article_page", enwiki=enwiki) }}">{{ enwiki }}</li> <li>
<a href="{{ url_for("article_page", enwiki=enwiki) }}">{{ enwiki }}</a>
({{ count }} links)
</li>
{% endfor %} {% endfor %}
</ul> </ul>
{% endblock %} {% endblock %}

View file

@ -3,8 +3,7 @@
import inspect import inspect
import json import json
import re import re
from dab_mechanic import wikidata_oauth from typing import Any, Iterator, Optional, TypedDict
from typing import Any, Iterator, TypedDict
import flask import flask
import lxml.html import lxml.html
@ -14,6 +13,8 @@ from requests_oauthlib import OAuth1Session
from werkzeug.debug.tbtools import get_current_traceback from werkzeug.debug.tbtools import get_current_traceback
from werkzeug.wrappers import Response from werkzeug.wrappers import Response
from dab_mechanic import wikidata_oauth
app = flask.Flask(__name__) app = flask.Flask(__name__)
app.config.from_object("config.default") app.config.from_object("config.default")
app.debug = True app.debug = True
@ -22,6 +23,7 @@ wiki_hostname = "en.wikipedia.org"
wiki_api_php = f"https://{wiki_hostname}/w/api.php" wiki_api_php = f"https://{wiki_hostname}/w/api.php"
wiki_index_php = f"https://{wiki_hostname}/w/index.php" wiki_index_php = f"https://{wiki_hostname}/w/index.php"
@app.before_request @app.before_request
def global_user(): def global_user():
"""Make username available everywhere.""" """Make username available everywhere."""
@ -59,15 +61,35 @@ def get_content(title: str) -> str:
return rev return rev
def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, int]]:
    """Parse the Articles With Multiple Dablinks report.

    The first ``<table>`` found under *root* is scanned row by row. For each
    row, the text of the first child of the first cell is taken as the article
    title, and the text of the first child of the second cell must look like
    ``"12 links"``.

    Args:
        root: Parsed HTML tree of the report page.

    Returns:
        A list of ``(title, link_count)`` tuples in table order. The list is
        empty when the page contains no table.

    Raises:
        ValueError: If a row's count text is missing, does not end with
            ``" links"``, or its numeric part is not an integer.
    """
    suffix = " links"

    table = root.find(".//table")
    if table is None:
        # find() returns None when nothing matches; iterating that would fail
        # with an unhelpful TypeError.
        return []

    articles: list[tuple[str, int]] = []
    # iter("tr") also reaches rows wrapped in <tbody>, not just direct children.
    for tr in table.iter("tr"):
        title = tr[0][0].text
        count_text = tr[1][0].text
        # Raise explicitly instead of using assert, which is stripped under -O.
        if not count_text or not count_text.endswith(suffix):
            raise ValueError(f"unexpected link count text: {count_text!r}")
        articles.append((title, int(count_text.removesuffix(suffix))))
    return articles
@app.route("/") @app.route("/")
def index(): def index():
articles = [line[:-1] for line in open("article_list")]
r = requests.get("https://dplbot.toolforge.org/articles_with_dab_links.php")
root = lxml.html.fromstring(r.content)
articles = parse_articles_with_dab_links(root)
# articles = [line[:-1] for line in open("article_list")]
return flask.render_template("index.html", articles=articles) return flask.render_template("index.html", articles=articles)
def get_article_html(enwiki: str) -> str: def call_parse_api(enwiki: str) -> dict[str, Any]:
"""Parse article wikitext and return HTML.""" """Call mediawiki parse API for given article."""
url = "https://en.wikipedia.org/w/api.php" url = "https://en.wikipedia.org/w/api.php"
params: dict[str, str | int] = { params: dict[str, str | int] = {
@ -76,11 +98,19 @@ def get_article_html(enwiki: str) -> str:
"formatversion": 2, "formatversion": 2,
"disableeditsection": 1, "disableeditsection": 1,
"page": enwiki, "page": enwiki,
"prop": "text|links|headhtml",
"disabletoc": 1,
} }
r = requests.get(url, params=params) r = requests.get(url, params=params)
html: str = r.json()["parse"]["text"] parse: dict[str, Any] = r.json()["parse"]
return html return parse
def get_article_html(enwiki: str) -> str:
    """Return the "text" entry of the parse API result for the given article."""
    parsed = call_parse_api(enwiki)
    html: str = parsed["text"]
    return html
disambig_templates = [ disambig_templates = [
@ -267,6 +297,7 @@ class Article:
self.dab_list: list[DabItem] = [] self.dab_list: list[DabItem] = []
self.dab_lookup: dict[int, str] = {} self.dab_lookup: dict[int, str] = {}
self.dab_order: list[str] = [] self.dab_order: list[str] = []
self.parse: Optional[dict[str, Any]] = None
def save_endpoint(self) -> str: def save_endpoint(self) -> str:
"""Endpoint for saving changes.""" """Endpoint for saving changes."""
@ -275,8 +306,8 @@ class Article:
def load(self) -> None: def load(self) -> None:
"""Load parsed article HTML.""" """Load parsed article HTML."""
html = get_article_html(self.enwiki) self.parse = call_parse_api(self.enwiki)
self.root = lxml.html.fromstring(html) self.root = lxml.html.fromstring(self.parse.pop("text"))
def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]: def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]:
"""Disambiguation links that need fixing.""" """Disambiguation links that need fixing."""
@ -328,6 +359,8 @@ def article_page(enwiki: str) -> Response:
article.load() article.load()
article.process_links() article.process_links()
assert article.parse
return flask.render_template("article.html", article=article) return flask.render_template("article.html", article=article)