2022-08-13 13:16:49 +01:00
|
|
|
#!/usr/bin/python3
|
|
|
|
|
2022-08-15 17:56:21 +01:00
|
|
|
import inspect
|
2022-08-13 16:25:07 +01:00
|
|
|
import json
|
2022-08-14 17:44:07 +01:00
|
|
|
import re
|
2022-08-17 08:52:45 +01:00
|
|
|
from typing import Any, Iterator, Optional, TypedDict
|
2022-08-13 13:16:49 +01:00
|
|
|
|
|
|
|
import flask
|
|
|
|
import lxml.html
|
|
|
|
import requests
|
2022-08-15 17:56:21 +01:00
|
|
|
import werkzeug.exceptions
|
2022-08-15 11:43:22 +01:00
|
|
|
from requests_oauthlib import OAuth1Session
|
2022-08-15 17:56:21 +01:00
|
|
|
from werkzeug.debug.tbtools import get_current_traceback
|
2022-08-13 13:16:49 +01:00
|
|
|
from werkzeug.wrappers import Response
|
|
|
|
|
2022-08-17 08:52:45 +01:00
|
|
|
from dab_mechanic import wikidata_oauth
|
|
|
|
|
2022-08-13 13:16:49 +01:00
|
|
|
# The Flask application object; the views and handlers below register on it.
app = flask.Flask(__name__)
|
2022-08-15 13:11:29 +01:00
|
|
|
# Load settings from the object named by the import string "config.default".
# The OAuth views below read CLIENT_KEY and CLIENT_SECRET from app.config.
app.config.from_object("config.default")
|
2022-08-14 17:44:07 +01:00
|
|
|
# NOTE(review): debug mode is switched on unconditionally, and the __main__
# block at the bottom of the file binds to 0.0.0.0. Presumably this is only
# meant for local development -- confirm it is never deployed like this.
app.debug = True
|
2022-08-13 13:16:49 +01:00
|
|
|
|
2022-08-16 12:43:03 +01:00
|
|
|
# Hostname of the wiki this tool talks to, and the URLs of its api.php and
# index.php scripts derived from it.
wiki_hostname = "en.wikipedia.org"
wiki_api_php = "https://" + wiki_hostname + "/w/api.php"
wiki_index_php = "https://" + wiki_hostname + "/w/index.php"
|
|
|
|
|
2022-08-17 08:52:45 +01:00
|
|
|
|
2022-08-16 12:43:03 +01:00
|
|
|
@app.before_request
def global_user():
    """Store the current username on ``flask.g`` before every request.

    The value is whatever ``wikidata_oauth.get_username()`` returns.
    """
    username = wikidata_oauth.get_username()
    flask.g.user = username
|
2022-08-13 13:16:49 +01:00
|
|
|
|
2022-08-14 17:44:07 +01:00
|
|
|
|
2022-08-15 17:56:21 +01:00
|
|
|
@app.errorhandler(werkzeug.exceptions.InternalServerError)
def exception_handler(e):
    """Render ``show_error.html`` for an internal server error.

    The template receives the current traceback, the frame chosen as the
    "last" frame and the argument description of that frame's code object.

    :param e: the error being handled (not used directly).
    :returns: a ``(body, 500)`` tuple.
    """
    tb = get_current_traceback()
    frames = tb.frames

    # Prefer the innermost frame that is not library code.  If every frame
    # is library code, fall back to the innermost frame instead of letting
    # ``next()`` raise StopIteration inside the error handler itself.
    last_frame = next(
        (frame for frame in reversed(frames) if not frame.is_library),
        frames[-1] if frames else None,
    )
    last_frame_args = (
        inspect.getargs(last_frame.code) if last_frame is not None else None
    )

    return (
        flask.render_template(
            "show_error.html",
            tb=tb,
            last_frame=last_frame,
            last_frame_args=last_frame_args,
        ),
        500,
    )
|
|
|
|
|
|
|
|
|
2022-08-14 17:44:07 +01:00
|
|
|
def get_content(title: str) -> str:
    """Return the revision content the query API reports for *title*."""
    response = requests.get(
        wiki_api_php,
        params={
            "action": "query",
            "format": "json",
            "formatversion": 2,
            "prop": "revisions|info",
            "rvprop": "content|timestamp",
            "titles": title,
        },
    )
    # A single title is requested, so only the first page entry is read.
    page = response.json()["query"]["pages"][0]
    content: str = page["revisions"][0]["content"]
    return content
|
2022-08-13 13:16:49 +01:00
|
|
|
|
|
|
|
|
2022-08-17 08:52:45 +01:00
|
|
|
def parse_articles_with_dab_links(
    root: "lxml.html.HtmlElement",
) -> list[tuple[str, int]]:
    """Parse Articles With Multiple Dablinks.

    Reads the first ``<table>`` below *root*.  For each row, the text of the
    first child of the first cell is the article title and the text of the
    first child of the second cell is the link count, written ``"<n> links"``.

    :param root: parsed HTML of the report page.
    :returns: ``(title, count)`` pairs in table order; an empty list when the
        page contains no table.
    :raises ValueError: if a count cell does not end with ``" links"``.
    """
    suffix = " links"

    table = root.find(".//table")
    if table is None:
        # No table on the page: nothing to report, not a TypeError.
        return []

    articles: list[tuple[str, int]] = []
    for tr in table:
        title = tr[0][0].text
        count_text = tr[1][0].text
        # Raise explicitly: an ``assert`` is stripped under ``python -O``,
        # which would let malformed text be sliced and mis-parsed.
        if not count_text or not count_text.endswith(suffix):
            raise ValueError(f"unexpected link count text: {count_text!r}")
        articles.append((title, int(count_text.removesuffix(suffix))))

    return articles
|
|
|
|
|
|
|
|
|
2022-08-13 13:16:49 +01:00
|
|
|
@app.route("/")
def index():
    """Front page: list the articles from the dplbot dab-links report."""
    report_url = "https://dplbot.toolforge.org/articles_with_dab_links.php"
    response = requests.get(report_url)
    articles = parse_articles_with_dab_links(lxml.html.fromstring(response.content))
    return flask.render_template("index.html", articles=articles)
|
|
|
|
|
|
|
|
|
2022-08-17 08:52:45 +01:00
|
|
|
def call_parse_api(enwiki: str) -> dict[str, Any]:
    """Call mediawiki parse API for given article.

    :param enwiki: title of the page to parse.
    :returns: the ``"parse"`` object of the JSON response (requested props:
        ``text``, ``links`` and ``headhtml``).
    :raises KeyError: if the response has no ``"parse"`` key.
    """
    params: dict[str, str | int] = {
        "action": "parse",
        "format": "json",
        "formatversion": 2,
        "disableeditsection": 1,
        "page": enwiki,
        "prop": "text|links|headhtml",
        "disabletoc": 1,
    }

    # Use the module-level endpoint (as get_content does) so the target wiki
    # is configured in one place rather than hard-coded here.
    r = requests.get(wiki_api_php, params=params)
    parse: dict[str, Any] = r.json()["parse"]
    return parse
|
|
|
|
|
|
|
|
|
|
|
|
def get_article_html(enwiki: str) -> str:
    """Return the ``"text"`` entry of the parse API result for *enwiki*."""
    parsed = call_parse_api(enwiki)
    html: str = parsed["text"]
    return html
|
2022-08-13 13:16:49 +01:00
|
|
|
|
|
|
|
|
|
|
|
# Titles of the disambiguation templates; link_params() joins this list
# into the ``tltemplates`` filter of the links query.
disambig_templates = [
    f"Template:{name}"
    for name in (
        "Disambiguation",
        "Airport disambiguation",
        "Biology disambiguation",
        "Call sign disambiguation",
        "Caselaw disambiguation",
        "Chinese title disambiguation",
        "Disambiguation cleanup",
        "Genus disambiguation",
        "Hospital disambiguation",
        "Human name disambiguation",
        "Human name disambiguation cleanup",
        "Letter-number combination disambiguation",
        "Mathematical disambiguation",
        "Military unit disambiguation",
        "Music disambiguation",
        "Number disambiguation",
        "Opus number disambiguation",
        "Phonetics disambiguation",
        "Place name disambiguation",
        "Portal disambiguation",
        "Road disambiguation",
        "School disambiguation",
        "Species Latin name abbreviation disambiguation",
        "Species Latin name disambiguation",
        "Station disambiguation",
        "Synagogue disambiguation",
        "Taxonomic authority disambiguation",
        "Taxonomy disambiguation",
        "Template disambiguation",
        "WoO number disambiguation",
    )
]
|
|
|
|
|
|
|
|
|
2022-08-13 16:25:07 +01:00
|
|
|
def link_params(enwiki: str) -> dict[str, str | int]:
    """Build the query-API parameters used to list the links of *enwiki*.

    The query uses the ``links`` generator and asks for the ``templates``
    prop of each linked page, filtered to ``disambig_templates``.
    """
    template_filter = "|".join(disambig_templates)
    return {
        "action": "query",
        "format": "json",
        "formatversion": 2,
        "titles": enwiki,
        "generator": "links",
        "gpllimit": "max",
        "gplnamespace": 0,
        "tllimit": "max",
        "tlnamespace": 10,
        "tltemplates": template_filter,
        "prop": "templates",
    }
|
|
|
|
|
|
|
|
|
|
|
|
def needs_disambig(link: dict[str, Any]) -> bool:
    """Tell whether *link* is a disambiguation link that should be fixed.

    Titles ending in " (disambiguation)" are never reported; any other page
    is reported when its ``templates`` entry is present and non-empty.
    """
    if link["title"].endswith(" (disambiguation)"):
        return False
    return bool(link.get("templates"))
|
2022-08-13 13:16:49 +01:00
|
|
|
|
2022-08-13 16:25:07 +01:00
|
|
|
|
|
|
|
def get_article_links(enwiki: str) -> list[str]:
    """Get links that appear in this article.

    Only pages accepted by ``needs_disambig`` are returned.  The API is
    queried repeatedly until the response carries no ``continue`` object.

    :param enwiki: title of the article whose links are wanted.
    :returns: titles of the matching linked pages, without duplicates and in
        no particular order.
    """
    base_params: dict[str, str | int] = link_params(enwiki)
    request_params = dict(base_params)
    links: set[str] = set()

    while True:
        # Shared endpoint constant, as used by get_content.
        data = requests.get(wiki_api_php, params=request_params).json()

        # The "query" key is read defensively: a response without it (e.g.
        # a batch with no pages) must not raise KeyError.
        pages = data.get("query", {}).get("pages", [])
        links.update(page["title"] for page in pages if needs_disambig(page))

        if "continue" not in data:
            break

        # Send back every continuation value the API returned, on top of the
        # original parameters.  Reading only "gplcontinue" fails with
        # KeyError when the API continues the templates prop ("tlcontinue").
        request_params = {**base_params, **data["continue"]}

    return list(links)
|
|
|
|
|
|
|
|
|
2022-08-13 16:25:07 +01:00
|
|
|
def delete_toc(root: lxml.html.HtmlElement) -> None:
    """Remove every ``<div class="toc">`` element from the tree, in place."""
    toc_divs = root.findall(".//div[@class='toc']")
    for toc_div in toc_divs:
        parent = toc_div.getparent()
        parent.remove(toc_div)
|
|
|
|
|
|
|
|
|
|
|
|
def get_dab_html(dab_num: int, title: str) -> str:
    """Parse dab page and rewrite links.

    * Links that are not in-page fragments get ``href="#"`` and an
      ``onclick`` handler calling ``select_dab(this, dab_num)``.
    * Fragment links, and the elements they point at, have *dab_num*
      prefixed to the id.

    :param dab_num: number of this disambiguation page within the article.
    :param title: title of the disambiguation page.
    :returns: the rewritten HTML serialised as a string.
    """
    root = lxml.html.fromstring(get_article_html(title))
    delete_toc(root)

    element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")}

    for a in root.findall(".//a[@href]"):
        href: str | None = a.get("href")
        if not href:
            continue

        if not href.startswith("#"):
            a.set("href", "#")
            a.set("onclick", f"return select_dab(this, {dab_num})")
            continue

        anchor = href[1:]
        destination_element = element_id_map.get(anchor)
        if destination_element is None:
            # Nothing in the page carries this id (e.g. a bare "#" link):
            # leave the link untouched rather than failing with KeyError.
            continue

        destination_element.set("id", f"{dab_num}{anchor}")
        a.set("href", f"#{dab_num}{anchor}")

    html: str = lxml.html.tostring(root, encoding=str)
    return html
|
|
|
|
|
|
|
|
|
2022-08-14 17:44:07 +01:00
|
|
|
def make_disamb_link(edit: tuple[str, str]) -> str:
    """Build the piped wikilink for *edit*.

    The second item of the pair is the link target and the first item is
    the text shown for the link.
    """
    label = edit[0]
    target = edit[1]
    return f"[[{target}|{label}]]"
|
|
|
|
|
|
|
|
|
|
|
|
def apply_edits(article_text: str, edits: list[tuple[str, str]]) -> str:
    """Apply edits to article text.

    Each edit is a ``(link_from, link_to)`` pair.  Every unpiped wikilink
    ``[[link_from]]`` in the text becomes ``[[link_to|link_from]]``.  When
    matching, spaces and underscores in *link_from* are interchangeable.

    :param article_text: wikitext to edit.
    :param edits: the replacements to make, applied in order.
    :returns: the edited wikitext.
    """

    def escape(s: str) -> str:
        """Regex-escape *s*, letting space and underscore match each other."""
        return re.escape(s).replace("_", "[ _]").replace(r"\ ", "[ _]")

    for link_from, link_to in edits:
        replacement = f"[[{link_to}|{link_from}]]"
        article_text = re.sub(
            rf"\[\[{escape(link_from)}\]\]",
            # A callable inserts the replacement literally.  Passed as a
            # string it is treated as a template, so a backslash in a title
            # would be read as an escape or group reference.
            lambda _match, text=replacement: text,
            article_text,
        )

    return article_text
|
|
|
|
|
|
|
|
|
2022-08-13 16:25:07 +01:00
|
|
|
@app.route("/save/<path:enwiki>", methods=["POST"])
def save(enwiki: str) -> Response | str:
    """Render the save page for the edits submitted for an article.

    The ``edits`` form field holds a JSON list of ``[link_from, link_to]``
    pairs.  The page is given the edit summary, the article title, the edits
    and the article text with the edits applied.

    :param enwiki: article title from the URL; underscores become spaces.
    :returns: the rendered ``save.html`` template.  The request is aborted
        with 400 if the list of edits is empty.
    """
    edits = [
        (link_from, link_to)
        for link_from, link_to in json.loads(flask.request.form["edits"])
    ]
    if not edits:
        # With no edits there is nothing to summarise; indexing edits[-1]
        # would otherwise fail with IndexError and a 500 response.
        flask.abort(400, "no edits supplied")

    enwiki = enwiki.replace("_", " ")

    # Summary reads "A, B and C": all but the last link joined by commas.
    *leading, final = (make_disamb_link(edit) for edit in edits)
    titles = ", ".join(leading)
    if leading:
        titles += " and "
    titles += final

    edit_summary = f"Disambiguate {titles} using [[User:Edward/Dab mechanic]]"

    article_text = apply_edits(get_content(enwiki), edits)

    return flask.render_template(
        "save.html",
        edit_summary=edit_summary,
        title=enwiki,
        edits=edits,
        text=article_text,
    )
|
|
|
|
|
|
|
|
|
|
|
|
class DabItem(TypedDict):
    """One disambiguation page linked from the article being worked on."""

    # Number given to the link by Article.process_links.
    num: int
    # Title of the disambiguation page.
    title: str
    # HTML of the page as returned by get_dab_html.
    html: str
|
|
|
|
|
|
|
|
|
|
|
|
class Article:
    """Current article we're working on."""

    def __init__(self, enwiki: str) -> None:
        """Make a new Article object.

        Fetches the article's disambiguation links straight away; the parsed
        HTML is only available after ``load()`` has been called.
        """
        self.enwiki = enwiki

        self.links = get_article_links(enwiki)

        self.dab_list: list[DabItem] = []
        self.dab_lookup: dict[int, str] = {}
        self.dab_order: list[str] = []
        self.parse: Optional[dict[str, Any]] = None

    def save_endpoint(self) -> str:
        """Endpoint for saving changes."""
        href: str = flask.url_for("save", enwiki=self.enwiki.replace(" ", "_"))
        return href

    def load(self) -> None:
        """Load parsed article HTML.

        Stores the parse API result in ``self.parse`` and moves its ``text``
        entry into ``self.root`` as a parsed HTML tree.
        """
        self.parse = call_parse_api(self.enwiki)
        self.root = lxml.html.fromstring(self.parse.pop("text"))

    # The annotation is quoted so it is not evaluated when the class is
    # defined; HtmlElement is the element class (Element is a factory).
    def iter_links(self) -> "Iterator[tuple[lxml.html.HtmlElement, str]]":
        """Disambiguation links that need fixing.

        Every anchor whose ``title`` is one of ``self.links`` gets the class
        ``disambig``; only the first anchor for each title is yielded.
        """
        # Build the lookup once: ``self.links`` is a list, so testing
        # membership against it for every anchor is a linear scan each time.
        wanted = set(self.links)
        seen: set[str] = set()
        for a in self.root.findall(".//a[@href]"):
            title = a.get("title")
            if title is None or title not in wanted:
                continue
            a.set("class", "disambig")

            if title in seen:
                continue
            seen.add(title)

            yield a, title

    def process_links(self) -> None:
        """Process links in parsed wikitext.

        Numbers each link from ``iter_links``, sets its id to ``dab-<num>``
        and records the page in ``dab_list``, ``dab_order`` and
        ``dab_lookup``.
        """
        for dab_num, (a, title) in enumerate(self.iter_links()):
            a.set("id", f"dab-{dab_num}")

            dab: DabItem = {
                "num": dab_num,
                "title": title,
                "html": get_dab_html(dab_num, title),
            }
            self.dab_list.append(dab)
            self.dab_order.append(title)
            self.dab_lookup[dab_num] = title

    def get_html(self) -> str:
        """Return the processed article HTML."""
        html: str = lxml.html.tostring(self.root, encoding=str)
        return html
|
2022-08-13 16:25:07 +01:00
|
|
|
|
|
|
|
|
2022-08-13 13:16:49 +01:00
|
|
|
@app.route("/enwiki/<path:enwiki>")
def article_page(enwiki: str) -> Response | str:
    """Article Page.

    A URL containing spaces is redirected to the same endpoint with
    underscores.  Otherwise the article is loaded, its disambiguation links
    are processed and ``article.html`` is rendered.

    :param enwiki: article title taken from the URL.
    :returns: a redirect response, or the rendered page as a string.
    """
    if " " in enwiki:
        return flask.redirect(
            flask.url_for(flask.request.endpoint, enwiki=enwiki.replace(" ", "_"))
        )

    article = Article(enwiki.replace("_", " "))
    article.load()
    article.process_links()

    # Invariant check kept from the original: load() must have stored the
    # parse result before the template is rendered.
    assert article.parse

    return flask.render_template("article.html", article=article)
|
2022-08-13 13:16:49 +01:00
|
|
|
|
|
|
|
|
2022-08-15 11:43:22 +01:00
|
|
|
def _is_safe_redirect_target(target: str) -> bool:
    """Return True if *target* resolves to a URL on this application's host."""
    from urllib.parse import urljoin, urlsplit

    if "\\" in target:
        # Rejected as a precaution: a backslash may be treated like "/" by
        # a browser, turning a path into a different host.
        return False
    host_url = flask.request.host_url
    resolved = urlsplit(urljoin(host_url, target))
    return (
        resolved.scheme in ("http", "https")
        and resolved.netloc == urlsplit(host_url).netloc
    )


@app.route("/oauth/start")
def start_oauth():
    """Begin the OAuth handshake and redirect to the authorization page.

    An optional ``next`` query parameter is stored in the session as
    ``after_login``.  The value comes from the request and is later used as
    a redirect target, so it is only stored when it points back at this
    application.  The request token and secret are saved in the session as
    ``owner_key`` and ``owner_secret``.
    """
    next_page = flask.request.args.get("next")
    if next_page and _is_safe_redirect_target(next_page):
        flask.session["after_login"] = next_page

    client_key = app.config["CLIENT_KEY"]
    client_secret = app.config["CLIENT_SECRET"]
    request_token_url = wiki_index_php + "?title=Special%3aOAuth%2finitiate"

    oauth = OAuth1Session(client_key, client_secret=client_secret, callback_uri="oob")
    fetch_response = oauth.fetch_request_token(request_token_url)

    flask.session["owner_key"] = fetch_response.get("oauth_token")
    flask.session["owner_secret"] = fetch_response.get("oauth_token_secret")

    base_authorization_url = f"https://{wiki_hostname}/wiki/Special:OAuth/authorize"
    authorization_url = oauth.authorization_url(
        base_authorization_url, oauth_consumer_key=client_key
    )
    return flask.redirect(authorization_url)
|
|
|
|
|
|
|
|
|
|
|
|
@app.route("/oauth/callback", methods=["GET"])
def oauth_callback():
    """Finish the OAuth handshake and send the user onwards.

    Reads the verifier from the callback URL, exchanges the request token
    held in the session for an access token, stores the new token and secret
    in the session, then redirects to the ``after_login`` page if one was
    stored, otherwise to the index page.
    """
    client_key = app.config["CLIENT_KEY"]
    client_secret = app.config["CLIENT_SECRET"]

    oauth = OAuth1Session(
        client_key,
        client_secret=client_secret,
        resource_owner_key=flask.session["owner_key"],
        resource_owner_secret=flask.session["owner_secret"],
    )

    oauth_response = oauth.parse_authorization_response(flask.request.url)
    verifier = oauth_response.get("oauth_verifier")
    access_token_url = wiki_index_php + "?title=Special%3aOAuth%2ftoken"
    oauth = OAuth1Session(
        client_key,
        client_secret=client_secret,
        resource_owner_key=flask.session["owner_key"],
        resource_owner_secret=flask.session["owner_secret"],
        verifier=verifier,
    )

    oauth_tokens = oauth.fetch_access_token(access_token_url)
    flask.session["owner_key"] = oauth_tokens.get("oauth_token")
    flask.session["owner_secret"] = oauth_tokens.get("oauth_token_secret")

    next_page = flask.session.get("after_login")
    # Always answer with a redirect.  The previous conditional expression
    # returned the bare index URL string as the response body when no
    # after_login page was stored.
    return flask.redirect(next_page or flask.url_for("index"))
|
|
|
|
|
|
|
|
|
|
|
|
@app.route("/oauth/disconnect")
def oauth_disconnect():
    """Drop the OAuth-related session entries and go back to the index."""
    session_keys = ("owner_key", "owner_secret", "username", "after_login")
    for key in session_keys:
        if key not in flask.session:
            continue
        del flask.session[key]
    return flask.redirect(flask.url_for("index"))
|
|
|
|
|
|
|
|
|
2022-08-13 13:16:49 +01:00
|
|
|
if __name__ == "__main__":
    # Start Flask's built-in server when the file is run as a script.
    # NOTE(review): host="0.0.0.0" together with app.debug = True above
    # presumably exposes the debugger beyond localhost -- confirm intended.
    app.run(host="0.0.0.0")
|