Compare commits
10 commits
d499c896b4
...
4d175c8733
Author | SHA1 | Date | |
---|---|---|---|
Edward Betts | 4d175c8733 | ||
Edward Betts | e85cefbc2f | ||
Edward Betts | b1f402e1f9 | ||
Edward Betts | 5f8900a47a | ||
Edward Betts | 4e1ad4efbc | ||
Edward Betts | c2b3d22e45 | ||
Edward Betts | 8268822181 | ||
Edward Betts | 78de5cc139 | ||
Edward Betts | 8432632aae | ||
Edward Betts | 5ff4749512 |
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
__pycache__
|
50
article_list
50
article_list
|
@ -1,50 +0,0 @@
|
||||||
Rail transport in Indonesia
|
|
||||||
SchleFaZ
|
|
||||||
Chicago Bulls
|
|
||||||
Orwell Prize
|
|
||||||
List of fatal victims of the September 11 attacks
|
|
||||||
Arabic exonyms
|
|
||||||
Canadian Alpine Ski Championships
|
|
||||||
Method Man filmography
|
|
||||||
Popular Union
|
|
||||||
The Cantos
|
|
||||||
Unisex name
|
|
||||||
United States Alpine Ski Championships
|
|
||||||
AS Kaloum Star
|
|
||||||
Akademi Fantasia (season 1)
|
|
||||||
Athletics at the 2022 Bolivarian Games
|
|
||||||
I Love the 2000s
|
|
||||||
Kununokuni
|
|
||||||
List of Wisin & Yandel collaborations
|
|
||||||
List of comics based on films
|
|
||||||
List of programs broadcast by Asianet
|
|
||||||
Urban Hymns
|
|
||||||
1979 Sydney City FC season
|
|
||||||
2007 in Spanish television
|
|
||||||
2022 World Athletics U20 Championships – Men's 4 × 100 metres relay
|
|
||||||
A2 autostrada (Poland)
|
|
||||||
Black to the Future (TV series)
|
|
||||||
Chandel (Rajput clan)
|
|
||||||
County of Isenburg
|
|
||||||
Dinka people
|
|
||||||
Dwayne McDuffie Award for Diversity in Comics
|
|
||||||
FTSE Italia Mid Cap
|
|
||||||
Globoplay
|
|
||||||
Index of Armenia-related articles
|
|
||||||
List of Equinox episodes
|
|
||||||
List of Indian monarchs
|
|
||||||
List of Italian exonyms in Dalmatia
|
|
||||||
List of Ultimate Marvel characters
|
|
||||||
List of cities with historical German exonyms
|
|
||||||
List of jötnar in Norse mythology
|
|
||||||
List of language families
|
|
||||||
List of people with surname Davis
|
|
||||||
List of political parties in Venezuela
|
|
||||||
List of programmes broadcast by HTV
|
|
||||||
Paul (given name)
|
|
||||||
Principality of Lippe
|
|
||||||
Propaganda in Russia
|
|
||||||
Qazi Ghulam Mustafa
|
|
||||||
Redfern Now
|
|
||||||
Roy Orbison/The Beatles Tour
|
|
||||||
Royal Birmingham Conservatoire
|
|
45
dab_mechanic/mediawiki_api.py
Normal file
45
dab_mechanic/mediawiki_api.py
Normal file
|
@ -0,0 +1,45 @@
|
||||||
|
"""Interface with the mediawiki API."""
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
from . import wikidata_oauth
|
||||||
|
|
||||||
|
wiki_hostname = "en.wikipedia.org"
|
||||||
|
wiki_api_php = f"https://{wiki_hostname}/w/api.php"
|
||||||
|
user_agent = "dab-mechanic/0.1"
|
||||||
|
|
||||||
|
|
||||||
|
def parse_page(enwiki: str) -> dict[str, Any]:
    """Run the mediawiki ``parse`` action for an article.

    Asks for the ``text``, ``links`` and ``headhtml`` properties of the page
    named ``enwiki`` (with the ``disableeditsection`` and ``disabletoc``
    flags set) and returns the ``parse`` member of the decoded reply.
    """
    reply = call(
        {
            "action": "parse",
            "format": "json",
            "formatversion": 2,
            "disableeditsection": 1,
            "page": enwiki,
            "prop": "text|links|headhtml",
            "disabletoc": 1,
        }
    )
    parsed: dict[str, Any] = reply["parse"]
    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
def call(params: dict[str, str | int]) -> dict[str, Any]:
    """Send an OAuth-signed POST request to the mediawiki API.

    The request itself is made by ``wikidata_oauth.api_post_request``; this
    function checks the HTTP status and decodes the reply body as JSON.

    Args:
        params: API parameters, sent as the body of the POST request.

    Returns:
        The decoded JSON reply.

    Raises:
        requests.HTTPError: if the server answered with a 4xx/5xx status.
    """
    response = wikidata_oauth.api_post_request(params)
    # Fail on HTTP errors here instead of with an unrelated JSON decoding
    # error or a KeyError further down in the callers.
    response.raise_for_status()
    decoded: dict[str, Any] = response.json()
    return decoded
|
||||||
|
|
||||||
|
|
||||||
|
def get_content(title: str) -> str:
    """Return the ``content`` of the first revision reported for ``title``.

    Issues a ``query`` action asking for ``revisions|info`` and reads the
    content from the first page / first revision of the reply.
    """
    query: dict[str, str | int] = dict(
        action="query",
        format="json",
        formatversion=2,
        prop="revisions|info",
        rvprop="content|timestamp",
        titles=title,
    )
    pages = call(query)["query"]["pages"]
    first_revision = pages[0]["revisions"][0]
    content: str = first_revision["content"]
    return content
|
|
@ -19,7 +19,6 @@ def get_edit_proxy() -> dict[str, str]:
|
||||||
def api_post_request(params: dict[str, str | int]):
|
def api_post_request(params: dict[str, str | int]):
|
||||||
"""HTTP Post using Oauth."""
|
"""HTTP Post using Oauth."""
|
||||||
app = current_app
|
app = current_app
|
||||||
url = "https://www.wikidata.org/w/api.php"
|
|
||||||
client_key = app.config["CLIENT_KEY"]
|
client_key = app.config["CLIENT_KEY"]
|
||||||
client_secret = app.config["CLIENT_SECRET"]
|
client_secret = app.config["CLIENT_SECRET"]
|
||||||
oauth = OAuth1Session(
|
oauth = OAuth1Session(
|
||||||
|
@ -29,12 +28,12 @@ def api_post_request(params: dict[str, str | int]):
|
||||||
resource_owner_secret=session["owner_secret"],
|
resource_owner_secret=session["owner_secret"],
|
||||||
)
|
)
|
||||||
proxies = get_edit_proxy()
|
proxies = get_edit_proxy()
|
||||||
return oauth.post(url, data=params, timeout=4, proxies=proxies)
|
return oauth.post(api_url, data=params, timeout=10, proxies=proxies)
|
||||||
|
|
||||||
|
|
||||||
def raw_request(params):
|
def raw_request(params):
|
||||||
app = current_app
|
app = current_app
|
||||||
url = "https://www.wikidata.org/w/api.php?" + urlencode(params)
|
url = api_url + "?" + urlencode(params)
|
||||||
client_key = app.config["CLIENT_KEY"]
|
client_key = app.config["CLIENT_KEY"]
|
||||||
client_secret = app.config["CLIENT_SECRET"]
|
client_secret = app.config["CLIENT_SECRET"]
|
||||||
oauth = OAuth1Session(
|
oauth = OAuth1Session(
|
||||||
|
@ -44,7 +43,7 @@ def raw_request(params):
|
||||||
resource_owner_secret=session["owner_secret"],
|
resource_owner_secret=session["owner_secret"],
|
||||||
)
|
)
|
||||||
proxies = get_edit_proxy()
|
proxies = get_edit_proxy()
|
||||||
return oauth.get(url, timeout=4, proxies=proxies)
|
return oauth.get(url, timeout=10, proxies=proxies)
|
||||||
|
|
||||||
|
|
||||||
def api_request(params):
|
def api_request(params):
|
||||||
|
|
206
dab_mechanic/wikipedia.py
Normal file
206
dab_mechanic/wikipedia.py
Normal file
|
@ -0,0 +1,206 @@
|
||||||
|
from collections import defaultdict
|
||||||
|
from typing import Any, Iterator, Optional, TypedDict
|
||||||
|
|
||||||
|
import flask
|
||||||
|
import lxml.html
|
||||||
|
|
||||||
|
from . import mediawiki_api
|
||||||
|
from pprint import pprint
|
||||||
|
from time import sleep
|
||||||
|
|
||||||
|
# Template titles that link_params() joins with "|" and sends as the
# "tltemplates" filter, so only these templates are reported for each linked
# page. needs_disambig() then treats any page that reports templates as a
# disambiguation page.
disambig_templates = [
    "Template:Disambiguation",
    "Template:Airport disambiguation",
    "Template:Biology disambiguation",
    "Template:Call sign disambiguation",
    "Template:Caselaw disambiguation",
    "Template:Chinese title disambiguation",
    "Template:Disambiguation cleanup",
    "Template:Genus disambiguation",
    "Template:Hospital disambiguation",
    "Template:Human name disambiguation",
    "Template:Human name disambiguation cleanup",
    "Template:Letter-number combination disambiguation",
    "Template:Mathematical disambiguation",
    "Template:Military unit disambiguation",
    "Template:Music disambiguation",
    "Template:Number disambiguation",
    "Template:Opus number disambiguation",
    "Template:Phonetics disambiguation",
    "Template:Place name disambiguation",
    "Template:Portal disambiguation",
    "Template:Road disambiguation",
    "Template:School disambiguation",
    "Template:Species Latin name abbreviation disambiguation",
    "Template:Species Latin name disambiguation",
    "Template:Station disambiguation",
    "Template:Synagogue disambiguation",
    "Template:Taxonomic authority disambiguation",
    "Template:Taxonomy disambiguation",
    "Template:Template disambiguation",
    "Template:WoO number disambiguation",
]
|
||||||
|
|
||||||
|
|
||||||
|
def link_params(enwiki: str) -> dict[str, str | int]:
    """Build the API query that lists an article's links and their templates.

    The query uses the ``links`` generator on ``enwiki`` and asks for the
    ``templates`` property, filtered to the titles in ``disambig_templates``.
    """
    template_filter = "|".join(disambig_templates)
    return {
        "action": "query",
        "format": "json",
        "formatversion": 2,
        "titles": enwiki,
        "generator": "links",
        "gpllimit": "max",
        "gplnamespace": 0,
        "tllimit": "max",
        "redirects": 1,
        "tlnamespace": 10,
        "tltemplates": template_filter,
        "prop": "templates",
    }
|
||||||
|
|
||||||
|
|
||||||
|
def needs_disambig(link: dict[str, Any]) -> bool:
    """Tell whether a linked page should be offered for disambiguation.

    A page qualifies when its title does not end in " (disambiguation)" and
    it carries a non-empty ``templates`` entry.
    """
    if link["title"].endswith(" (disambiguation)"):
        return False
    return bool(link.get("templates"))
|
||||||
|
|
||||||
|
|
||||||
|
def get_article_links(enwiki: str) -> list[str]:
    """Get the disambiguation links that appear in this article.

    Pages through the query built by ``link_params`` and collects the title
    of every linked page accepted by ``needs_disambig``. Redirect titles
    that resolve to a collected page are included as well.

    Args:
        enwiki: Title of the article whose links are inspected.

    Returns:
        The collected titles, in no particular order.

    Raises:
        RuntimeError: if the API reply contains an ``error`` member.
    """
    params: dict[str, str | int] = link_params(enwiki)
    links: set[str] = set()
    # Maps a redirect target title to the titles that redirect to it.
    redirects: defaultdict[str, set[str]] = defaultdict(set)
    continuation: dict[str, str | int] = {}

    while True:
        # Start from the base query each time and apply only the most recent
        # continuation values, so stale tokens are never sent again.
        data = mediawiki_api.call({**params, **continuation})
        if "error" in data:
            raise RuntimeError(f"mediawiki API error: {data['error']!r}")

        # "query", "pages" and "redirects" can each be missing from a batch
        # (presumably when there is nothing to report); treat that as empty
        # rather than failing with a KeyError.
        query = data.get("query", {})
        for r in query.get("redirects", []):
            redirects[r["to"]].add(r["from"])

        links.update(
            page["title"] for page in query.get("pages", []) if needs_disambig(page)
        )

        if "continue" not in data:
            break

        # Carry every continuation value forward, not just "gplcontinue":
        # the templates property can paginate with its own token.
        continuation = data["continue"]
        sleep(0.1)

    # Iterate over a snapshot because the set grows inside the loop.
    for link in list(links):
        links.update(redirects.get(link, ()))

    return list(links)
|
||||||
|
|
||||||
|
|
||||||
|
def get_article_html(enwiki: str) -> str:
    """Return the ``text`` member of the parse API reply for ``enwiki``."""
    parsed = mediawiki_api.parse_page(enwiki)
    article_html: str = parsed["text"]
    return article_html
|
||||||
|
|
||||||
|
|
||||||
|
class DabItem(TypedDict):
    """Represent a disambiguation page linked from the current article."""

    # Index assigned to the link by Article.process_links().
    num: int
    # Title of the disambiguation page.
    title: str
    # HTML of the page as returned by get_dab_html().
    html: str
|
||||||
|
|
||||||
|
|
||||||
|
def delete_toc(root: lxml.html.HtmlElement) -> None:
    """Remove every ``div`` whose class attribute is exactly ``toc``."""
    toc_divs = root.findall(".//div[@class='toc']")
    for div in toc_divs:
        parent = div.getparent()
        parent.remove(div)
|
||||||
|
|
||||||
|
|
||||||
|
def get_dab_html(dab_num: int, title: str) -> str:
    """Parse a dab page and rewrite its links.

    Links that do not start with "#" are pointed at "#" and given an
    ``onclick`` handler calling ``select_dab(this, dab_num)``. Same-page
    links and the elements they target get ``dab_num`` prefixed to the id.

    Args:
        dab_num: Number of this dab page within the article being edited.
        title: Title of the disambiguation page to fetch.

    Returns:
        The rewritten HTML, serialised as a string.
    """
    root = lxml.html.fromstring(get_article_html(title))
    delete_toc(root)

    element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")}

    for a in root.findall(".//a[@href]"):
        href: str | None = a.get("href")
        if not href:
            continue
        if not href.startswith("#"):
            a.set("href", "#")
            a.set("onclick", f"return select_dab(this, {dab_num})")
            continue

        fragment = href[1:]
        destination_element = element_id_map.get(fragment)
        if destination_element is None:
            # No element carries this id (for example a bare "#"): leave the
            # link untouched instead of failing the whole page on a KeyError.
            continue

        destination_element.set("id", f"{dab_num}{fragment}")
        a.set("href", f"#{dab_num}{fragment}")

    html: str = lxml.html.tostring(root, encoding=str)
    return html
|
||||||
|
|
||||||
|
|
||||||
|
class Article:
    """Current article we're working on.

    Typical use is ``load()`` followed by ``process_links()`` and then
    ``get_html()``; ``self.root`` only exists once ``load()`` has run.
    """

    def __init__(self, enwiki: str) -> None:
        """Make a new Article object and fetch its disambiguation links."""
        # Stored with spaces; save_endpoint() converts back to underscores.
        self.enwiki = enwiki.replace("_", " ")

        self.links = get_article_links(enwiki)

        self.dab_list: list[DabItem] = []
        self.dab_lookup: dict[int, str] = {}
        self.dab_order: list[str] = []
        self.parse: Optional[dict[str, Any]] = None

    def save_endpoint(self) -> str:
        """Endpoint for saving changes."""
        href: str = flask.url_for("save", enwiki=self.enwiki.replace(" ", "_"))
        return href

    def load(self) -> None:
        """Load parsed article HTML.

        The "text" member is removed from the stored parse result and kept
        as a parsed tree in ``self.root``.
        """
        self.parse = mediawiki_api.parse_page(self.enwiki)
        self.root = lxml.html.fromstring(self.parse.pop("text"))

    def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]:
        """Disambiguation links that need fixing.

        Every matching anchor gets its class set to "disambig", but only the
        first anchor for each title is yielded.
        """
        # Build the lookup set once instead of scanning the list per anchor.
        dab_titles = set(self.links)
        seen: set[str] = set()
        for a in self.root.findall(".//a[@href]"):
            title = a.get("title")
            if title is None or title not in dab_titles:
                continue
            a.set("class", "disambig")

            if title in seen:
                continue
            seen.add(title)

            yield a, title

    def process_links(self) -> None:
        """Process links in parsed wikitext.

        Gives each yielded anchor the id ``dab-<num>`` and records the dab
        page in ``dab_list``, ``dab_order`` and ``dab_lookup``.
        """
        for dab_num, (a, title) in enumerate(self.iter_links()):
            a.set("id", f"dab-{dab_num}")

            dab: DabItem = {
                "num": dab_num,
                "title": title,
                "html": get_dab_html(dab_num, title),
            }
            self.dab_list.append(dab)
            self.dab_order.append(title)
            self.dab_lookup[dab_num] = title

    def get_html(self) -> str:
        """Return the processed article HTML."""
        html: str = lxml.html.tostring(self.root, encoding=str)
        return html
|
78
static/css/exception.css
Normal file
78
static/css/exception.css
Normal file
|
@ -0,0 +1,78 @@
|
||||||
|
div.debugger { text-align: left; padding: 12px; margin: auto;
|
||||||
|
background-color: white; }
|
||||||
|
div.detail { cursor: pointer; }
|
||||||
|
div.detail p { margin: 0 0 8px 13px; font-size: 14px; white-space: pre-wrap;
|
||||||
|
font-family: monospace; }
|
||||||
|
div.explanation { margin: 20px 13px; font-size: 15px; color: #555; }
|
||||||
|
div.footer { font-size: 13px; text-align: right; margin: 30px 0;
|
||||||
|
color: #86989B; }
|
||||||
|
|
||||||
|
h2 { font-size: 16px; margin: 1.3em 0 0.0 0; padding: 9px;
|
||||||
|
background-color: #11557C; color: white; }
|
||||||
|
h2 em, h3 em { font-style: normal; color: #A5D6D9; font-weight: normal; }
|
||||||
|
|
||||||
|
div.traceback, div.plain { border: 1px solid #ddd; margin: 0 0 1em 0; padding: 10px; }
|
||||||
|
div.plain p { margin: 0; }
|
||||||
|
div.plain textarea,
|
||||||
|
div.plain pre { margin: 10px 0 0 0; padding: 4px;
|
||||||
|
background-color: #E8EFF0; border: 1px solid #D3E7E9; }
|
||||||
|
div.plain textarea { width: 99%; height: 300px; }
|
||||||
|
div.traceback h3 { font-size: 1em; margin: 0 0 0.8em 0; }
|
||||||
|
div.traceback ul { list-style: none; margin: 0; padding: 0 0 0 1em; }
|
||||||
|
div.traceback h4 { font-size: 13px; font-weight: normal; margin: 0.7em 0 0.1em 0; }
|
||||||
|
div.traceback pre { margin: 0; padding: 5px 0 3px 15px;
|
||||||
|
background-color: #E8EFF0; border: 1px solid #D3E7E9; }
|
||||||
|
div.traceback .library .current { background: white; color: #555; }
|
||||||
|
div.traceback .expanded .current { background: #E8EFF0; color: black; }
|
||||||
|
div.traceback pre:hover { background-color: #DDECEE; color: black; cursor: pointer; }
|
||||||
|
div.traceback div.source.expanded pre + pre { border-top: none; }
|
||||||
|
|
||||||
|
div.traceback span.ws { display: none; }
|
||||||
|
div.traceback pre.before, div.traceback pre.after { display: none; background: white; }
|
||||||
|
div.traceback div.source.expanded pre.before,
|
||||||
|
div.traceback div.source.expanded pre.after {
|
||||||
|
display: block;
|
||||||
|
}
|
||||||
|
|
||||||
|
div.traceback div.source.expanded span.ws {
|
||||||
|
display: inline;
|
||||||
|
}
|
||||||
|
|
||||||
|
div.traceback blockquote { margin: 1em 0 0 0; padding: 0; white-space: pre-line; }
|
||||||
|
div.traceback img { float: right; padding: 2px; margin: -3px 2px 0 0; display: none; }
|
||||||
|
div.traceback img:hover { background-color: #ddd; cursor: pointer;
|
||||||
|
border-color: #BFDDE0; }
|
||||||
|
div.traceback pre:hover img { display: block; }
|
||||||
|
div.traceback cite.filename { font-style: normal; color: #3B666B; }
|
||||||
|
|
||||||
|
pre.console { border: 1px solid #ccc; background: white!important;
|
||||||
|
color: black; padding: 5px!important;
|
||||||
|
margin: 3px 0 0 0!important; cursor: default!important;
|
||||||
|
max-height: 400px; overflow: auto; }
|
||||||
|
pre.console form { color: #555; }
|
||||||
|
pre.console input { background-color: transparent; color: #555;
|
||||||
|
width: 90%; font-family: 'Consolas', 'Deja Vu Sans Mono',
|
||||||
|
'Bitstream Vera Sans Mono', monospace; font-size: 14px;
|
||||||
|
border: none!important; }
|
||||||
|
|
||||||
|
span.string { color: #30799B; }
|
||||||
|
span.number { color: #9C1A1C; }
|
||||||
|
span.help { color: #3A7734; }
|
||||||
|
span.object { color: #485F6E; }
|
||||||
|
span.extended { opacity: 0.5; }
|
||||||
|
span.extended:hover { opacity: 1; }
|
||||||
|
a.toggle { text-decoration: none; background-repeat: no-repeat;
|
||||||
|
background-position: center center;
|
||||||
|
background-image: url(?__debugger__=yes&cmd=resource&f=more.png); }
|
||||||
|
a.toggle:hover { background-color: #444; }
|
||||||
|
a.open { background-image: url(?__debugger__=yes&cmd=resource&f=less.png); }
|
||||||
|
|
||||||
|
div.traceback pre, div.console pre {
|
||||||
|
white-space: pre-wrap; /* css-3 should we be so lucky... */
|
||||||
|
white-space: -moz-pre-wrap; /* Mozilla, since 1999 */
|
||||||
|
white-space: -pre-wrap; /* Opera 4-6 ?? */
|
||||||
|
white-space: -o-pre-wrap; /* Opera 7 ?? */
|
||||||
|
word-wrap: break-word; /* Internet Explorer 5.5+ */
|
||||||
|
_white-space: pre; /* IE only hack to re-specify in
|
||||||
|
addition to word-wrap */
|
||||||
|
}
|
|
@ -1,12 +1,14 @@
|
||||||
{% extends "base.html" %}
|
{% extends "base.html" %}
|
||||||
|
|
||||||
{% block content %}
|
{% block content %}
|
||||||
<ul>
|
<div class="m-3">
|
||||||
|
<ol>
|
||||||
{% for enwiki, count in articles %}
|
{% for enwiki, count in articles %}
|
||||||
<li>
|
<li>
|
||||||
<a href="{{ url_for("article_page", enwiki=enwiki) }}">{{ enwiki }}
|
<a href="{{ url_for("article_page", enwiki=enwiki) }}">{{ enwiki }}
|
||||||
({{ count }} links)
|
({{ count }} links)
|
||||||
</li>
|
</li>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</ul>
|
</ol>
|
||||||
|
</div>
|
||||||
{% endblock %}
|
{% endblock %}
|
||||||
|
|
252
web_view.py
252
web_view.py
|
@ -3,7 +3,7 @@
|
||||||
import inspect
|
import inspect
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
from typing import Any, Iterator, Optional, TypedDict
|
from typing import Optional
|
||||||
|
|
||||||
import flask
|
import flask
|
||||||
import lxml.html
|
import lxml.html
|
||||||
|
@ -13,16 +13,17 @@ from requests_oauthlib import OAuth1Session
|
||||||
from werkzeug.debug.tbtools import get_current_traceback
|
from werkzeug.debug.tbtools import get_current_traceback
|
||||||
from werkzeug.wrappers import Response
|
from werkzeug.wrappers import Response
|
||||||
|
|
||||||
from dab_mechanic import wikidata_oauth
|
from dab_mechanic import mediawiki_api, wikidata_oauth, wikipedia
|
||||||
|
|
||||||
app = flask.Flask(__name__)
|
app = flask.Flask(__name__)
|
||||||
app.config.from_object("config.default")
|
app.config.from_object("config.default")
|
||||||
app.debug = True
|
|
||||||
|
|
||||||
wiki_hostname = "en.wikipedia.org"
|
wiki_hostname = "en.wikipedia.org"
|
||||||
wiki_api_php = f"https://{wiki_hostname}/w/api.php"
|
wiki_api_php = f"https://{wiki_hostname}/w/api.php"
|
||||||
wiki_index_php = f"https://{wiki_hostname}/w/index.php"
|
wiki_index_php = f"https://{wiki_hostname}/w/index.php"
|
||||||
|
|
||||||
|
awdl_url = "https://dplbot.toolforge.org/articles_with_dab_links.php"
|
||||||
|
|
||||||
|
|
||||||
@app.before_request
|
@app.before_request
|
||||||
def global_user():
|
def global_user():
|
||||||
|
@ -46,21 +47,6 @@ def exception_handler(e):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_content(title: str) -> str:
|
|
||||||
"""Get article text."""
|
|
||||||
params: dict[str, str | int] = {
|
|
||||||
"action": "query",
|
|
||||||
"format": "json",
|
|
||||||
"formatversion": 2,
|
|
||||||
"prop": "revisions|info",
|
|
||||||
"rvprop": "content|timestamp",
|
|
||||||
"titles": title,
|
|
||||||
}
|
|
||||||
data = requests.get(wiki_api_php, params=params).json()
|
|
||||||
rev: str = data["query"]["pages"][0]["revisions"][0]["content"]
|
|
||||||
return rev
|
|
||||||
|
|
||||||
|
|
||||||
def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, int]]:
|
def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, int]]:
|
||||||
"""Parse Articles With Multiple Dablinks."""
|
"""Parse Articles With Multiple Dablinks."""
|
||||||
articles = []
|
articles = []
|
||||||
|
@ -78,8 +64,7 @@ def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, in
|
||||||
|
|
||||||
@app.route("/")
|
@app.route("/")
|
||||||
def index():
|
def index():
|
||||||
|
r = requests.get(awdl_url, params={"limit": 100})
|
||||||
r = requests.get("https://dplbot.toolforge.org/articles_with_dab_links.php")
|
|
||||||
root = lxml.html.fromstring(r.content)
|
root = lxml.html.fromstring(r.content)
|
||||||
articles = parse_articles_with_dab_links(root)
|
articles = parse_articles_with_dab_links(root)
|
||||||
|
|
||||||
|
@ -88,145 +73,6 @@ def index():
|
||||||
return flask.render_template("index.html", articles=articles)
|
return flask.render_template("index.html", articles=articles)
|
||||||
|
|
||||||
|
|
||||||
def call_parse_api(enwiki: str) -> dict[str, Any]:
|
|
||||||
"""Call mediawiki parse API for given article."""
|
|
||||||
url = "https://en.wikipedia.org/w/api.php"
|
|
||||||
|
|
||||||
params: dict[str, str | int] = {
|
|
||||||
"action": "parse",
|
|
||||||
"format": "json",
|
|
||||||
"formatversion": 2,
|
|
||||||
"disableeditsection": 1,
|
|
||||||
"page": enwiki,
|
|
||||||
"prop": "text|links|headhtml",
|
|
||||||
"disabletoc": 1,
|
|
||||||
}
|
|
||||||
|
|
||||||
r = requests.get(url, params=params)
|
|
||||||
parse: dict[str, Any] = r.json()["parse"]
|
|
||||||
return parse
|
|
||||||
|
|
||||||
|
|
||||||
def get_article_html(enwiki: str) -> str:
|
|
||||||
"""Parse article wikitext and return HTML."""
|
|
||||||
text: str = call_parse_api(enwiki)["text"]
|
|
||||||
return text
|
|
||||||
|
|
||||||
|
|
||||||
disambig_templates = [
|
|
||||||
"Template:Disambiguation",
|
|
||||||
"Template:Airport disambiguation",
|
|
||||||
"Template:Biology disambiguation",
|
|
||||||
"Template:Call sign disambiguation",
|
|
||||||
"Template:Caselaw disambiguation",
|
|
||||||
"Template:Chinese title disambiguation",
|
|
||||||
"Template:Disambiguation cleanup",
|
|
||||||
"Template:Genus disambiguation",
|
|
||||||
"Template:Hospital disambiguation",
|
|
||||||
"Template:Human name disambiguation",
|
|
||||||
"Template:Human name disambiguation cleanup",
|
|
||||||
"Template:Letter-number combination disambiguation",
|
|
||||||
"Template:Mathematical disambiguation",
|
|
||||||
"Template:Military unit disambiguation",
|
|
||||||
"Template:Music disambiguation",
|
|
||||||
"Template:Number disambiguation",
|
|
||||||
"Template:Opus number disambiguation",
|
|
||||||
"Template:Phonetics disambiguation",
|
|
||||||
"Template:Place name disambiguation",
|
|
||||||
"Template:Portal disambiguation",
|
|
||||||
"Template:Road disambiguation",
|
|
||||||
"Template:School disambiguation",
|
|
||||||
"Template:Species Latin name abbreviation disambiguation",
|
|
||||||
"Template:Species Latin name disambiguation",
|
|
||||||
"Template:Station disambiguation",
|
|
||||||
"Template:Synagogue disambiguation",
|
|
||||||
"Template:Taxonomic authority disambiguation",
|
|
||||||
"Template:Taxonomy disambiguation",
|
|
||||||
"Template:Template disambiguation",
|
|
||||||
"Template:WoO number disambiguation",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def link_params(enwiki: str) -> dict[str, str | int]:
|
|
||||||
"""Parameters for finding article links from the API."""
|
|
||||||
params: dict[str, str | int] = {
|
|
||||||
"action": "query",
|
|
||||||
"format": "json",
|
|
||||||
"formatversion": 2,
|
|
||||||
"titles": enwiki,
|
|
||||||
"generator": "links",
|
|
||||||
"gpllimit": "max",
|
|
||||||
"gplnamespace": 0,
|
|
||||||
"tllimit": "max",
|
|
||||||
"tlnamespace": 10,
|
|
||||||
"tltemplates": "|".join(disambig_templates),
|
|
||||||
"prop": "templates",
|
|
||||||
}
|
|
||||||
return params
|
|
||||||
|
|
||||||
|
|
||||||
def needs_disambig(link: dict[str, Any]) -> bool:
|
|
||||||
"""Is this a disambiguation link."""
|
|
||||||
return bool(
|
|
||||||
not link["title"].endswith(" (disambiguation)") and link.get("templates")
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def get_article_links(enwiki: str) -> list[str]:
|
|
||||||
"""Get links that appear in this article."""
|
|
||||||
url = "https://en.wikipedia.org/w/api.php"
|
|
||||||
|
|
||||||
params: dict[str, str | int] = link_params(enwiki)
|
|
||||||
links: set[str] = set()
|
|
||||||
|
|
||||||
while True:
|
|
||||||
data = requests.get(url, params=params).json()
|
|
||||||
links.update(
|
|
||||||
page["title"] for page in data["query"]["pages"] if needs_disambig(page)
|
|
||||||
)
|
|
||||||
|
|
||||||
if "continue" not in data:
|
|
||||||
break
|
|
||||||
|
|
||||||
params["gplcontinue"] = data["continue"]["gplcontinue"]
|
|
||||||
|
|
||||||
return list(links)
|
|
||||||
|
|
||||||
# return {link["title"] for link in r.json()["query"]["pages"][0]["links"]}
|
|
||||||
|
|
||||||
|
|
||||||
def delete_toc(root: lxml.html.HtmlElement) -> None:
|
|
||||||
"""Delete table of contents from article HTML."""
|
|
||||||
for toc in root.findall(".//div[@class='toc']"):
|
|
||||||
toc.getparent().remove(toc)
|
|
||||||
|
|
||||||
|
|
||||||
def get_dab_html(dab_num: int, title: str) -> str:
|
|
||||||
"""Parse dab page and rewrite links."""
|
|
||||||
dab_html = get_article_html(title)
|
|
||||||
root = lxml.html.fromstring(dab_html)
|
|
||||||
delete_toc(root)
|
|
||||||
|
|
||||||
element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")}
|
|
||||||
|
|
||||||
for a in root.findall(".//a[@href]"):
|
|
||||||
href: str | None = a.get("href")
|
|
||||||
if not href:
|
|
||||||
continue
|
|
||||||
if not href.startswith("#"):
|
|
||||||
a.set("href", "#")
|
|
||||||
a.set("onclick", f"return select_dab(this, {dab_num})")
|
|
||||||
continue
|
|
||||||
|
|
||||||
destination_element = element_id_map[href[1:]]
|
|
||||||
assert destination_element is not None
|
|
||||||
destination_element.set("id", f"{dab_num}{href[1:]}")
|
|
||||||
a.set("href", f"#{dab_num}{href[1:]}")
|
|
||||||
|
|
||||||
html: str = lxml.html.tostring(root, encoding=str)
|
|
||||||
return html
|
|
||||||
|
|
||||||
|
|
||||||
def make_disamb_link(edit: tuple[str, str]) -> str:
|
def make_disamb_link(edit: tuple[str, str]) -> str:
|
||||||
"""Given an edit return the appropriate link."""
|
"""Given an edit return the appropriate link."""
|
||||||
return f"[[{edit[1]}|{edit[0]}]]"
|
return f"[[{edit[1]}|{edit[0]}]]"
|
||||||
|
@ -266,7 +112,7 @@ def save(enwiki: str) -> Response | str:
|
||||||
|
|
||||||
edit_summary = f"Disambiguate {titles} using [[User:Edward/Dab mechanic]]"
|
edit_summary = f"Disambiguate {titles} using [[User:Edward/Dab mechanic]]"
|
||||||
|
|
||||||
article_text = apply_edits(get_content(enwiki), edits)
|
article_text = apply_edits(mediawiki_api.get_content(enwiki), edits)
|
||||||
|
|
||||||
return flask.render_template(
|
return flask.render_template(
|
||||||
"save.html",
|
"save.html",
|
||||||
|
@ -277,85 +123,25 @@ def save(enwiki: str) -> Response | str:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class DabItem(TypedDict):
|
def redirect_if_needed(enwiki: str) -> Optional[Response]:
|
||||||
"""Represent a disabiguation page."""
|
"""Check if there are spaces in the article name and redirect."""
|
||||||
|
return (
|
||||||
num: int
|
flask.redirect(
|
||||||
title: str
|
flask.url_for(flask.request.endpoint, enwiki=enwiki.replace(" ", "_"))
|
||||||
html: str
|
)
|
||||||
|
if " " in enwiki
|
||||||
|
else None
|
||||||
class Article:
|
)
|
||||||
"""Current article we're working on."""
|
|
||||||
|
|
||||||
def __init__(self, enwiki: str) -> None:
|
|
||||||
"""Make a new Article object."""
|
|
||||||
self.enwiki = enwiki
|
|
||||||
|
|
||||||
self.links = get_article_links(enwiki)
|
|
||||||
|
|
||||||
self.dab_list: list[DabItem] = []
|
|
||||||
self.dab_lookup: dict[int, str] = {}
|
|
||||||
self.dab_order: list[str] = []
|
|
||||||
self.parse: Optional[dict[str, Any]] = None
|
|
||||||
|
|
||||||
def save_endpoint(self) -> str:
|
|
||||||
"""Endpoint for saving changes."""
|
|
||||||
href: str = flask.url_for("save", enwiki=self.enwiki.replace(" ", "_"))
|
|
||||||
return href
|
|
||||||
|
|
||||||
def load(self) -> None:
|
|
||||||
"""Load parsed article HTML."""
|
|
||||||
self.parse = call_parse_api(self.enwiki)
|
|
||||||
self.root = lxml.html.fromstring(self.parse.pop("text"))
|
|
||||||
|
|
||||||
def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]:
|
|
||||||
"""Disambiguation links that need fixing."""
|
|
||||||
seen = set()
|
|
||||||
for a in self.root.findall(".//a[@href]"):
|
|
||||||
title = a.get("title")
|
|
||||||
if title is None or title not in self.links:
|
|
||||||
continue
|
|
||||||
a.set("class", "disambig")
|
|
||||||
|
|
||||||
if title in seen:
|
|
||||||
continue
|
|
||||||
seen.add(title)
|
|
||||||
|
|
||||||
yield a, title
|
|
||||||
|
|
||||||
def process_links(self) -> None:
|
|
||||||
"""Process links in parsed wikitext."""
|
|
||||||
for dab_num, (a, title) in enumerate(self.iter_links()):
|
|
||||||
a.set("id", f"dab-{dab_num}")
|
|
||||||
|
|
||||||
dab: DabItem = {
|
|
||||||
"num": dab_num,
|
|
||||||
"title": title,
|
|
||||||
"html": get_dab_html(dab_num, title),
|
|
||||||
}
|
|
||||||
self.dab_list.append(dab)
|
|
||||||
self.dab_order.append(title)
|
|
||||||
self.dab_lookup[dab_num] = title
|
|
||||||
|
|
||||||
def get_html(self) -> str:
|
|
||||||
"""Return the processed article HTML."""
|
|
||||||
html: str = lxml.html.tostring(self.root, encoding=str)
|
|
||||||
return html
|
|
||||||
|
|
||||||
|
|
||||||
@app.route("/enwiki/<path:enwiki>")
|
@app.route("/enwiki/<path:enwiki>")
|
||||||
def article_page(enwiki: str) -> Response:
|
def article_page(enwiki: str) -> Response:
|
||||||
"""Article Page."""
|
"""Article Page."""
|
||||||
enwiki_orig = enwiki
|
redirect = redirect_if_needed(enwiki)
|
||||||
enwiki = enwiki.replace("_", " ")
|
if redirect:
|
||||||
enwiki_underscore = enwiki.replace(" ", "_")
|
return redirect
|
||||||
if " " in enwiki_orig:
|
|
||||||
return flask.redirect(
|
|
||||||
flask.url_for(flask.request.endpoint, enwiki=enwiki_underscore)
|
|
||||||
)
|
|
||||||
|
|
||||||
article = Article(enwiki)
|
article = wikipedia.Article(enwiki)
|
||||||
article.load()
|
article.load()
|
||||||
article.process_links()
|
article.process_links()
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue