"""Interface with the mediawiki API."""

from typing import Any

from . import wikidata_oauth

# NOTE(review): wiki_api_php and user_agent are not referenced by the functions
# in this module; requests are sent to whatever URL wikidata_oauth is
# configured with. Confirm that URL points at the intended wiki.
wiki_hostname = "en.wikipedia.org"
wiki_api_php = f"https://{wiki_hostname}/w/api.php"
user_agent = "dab-mechanic/0.1"


def parse_page(enwiki: str) -> dict[str, Any]:
    """Call the mediawiki parse API for the given article.

    Args:
        enwiki: Title of the article to parse.

    Returns:
        The ``parse`` member of the API response. The request asks for the
        ``text``, ``links`` and ``headhtml`` properties with section edit
        links and the table of contents disabled.

    Raises:
        KeyError: If the decoded response has no ``parse`` member.
    """
    params: dict[str, str | int] = {
        "action": "parse",
        "format": "json",
        "formatversion": 2,
        "disableeditsection": 1,
        "page": enwiki,
        "prop": "text|links|headhtml",
        "disabletoc": 1,
    }

    parse: dict[str, Any] = call(params)["parse"]
    return parse


def call(params: dict[str, str | int]) -> dict[str, Any]:
    """Send a request to the mediawiki API and return the decoded JSON reply.

    The request is made with ``wikidata_oauth.api_post_request``, so it is an
    OAuth-signed HTTP POST rather than a plain GET.

    Args:
        params: API parameters, sent as the body of the request.

    Returns:
        The JSON document returned by the API.
    """
    # api_post_request returns the HTTP response object, not the decoded
    # payload, so decode it here before handing it to callers.
    response = wikidata_oauth.api_post_request(params)
    data: dict[str, Any] = response.json()
    return data


def get_content(title: str) -> str:
    """Get the text of the current revision of an article.

    Args:
        title: Title of the article.

    Returns:
        The ``content`` of the first revision of the first page in the reply.

    Raises:
        KeyError: If the reply lacks ``query``, ``pages``, ``revisions`` or
            ``content`` at the expected place.
        IndexError: If the page or revision list in the reply is empty.
    """
    params: dict[str, str | int] = {
        "action": "query",
        "format": "json",
        "formatversion": 2,
        "prop": "revisions|info",
        "rvprop": "content|timestamp",
        "titles": title,
    }
    data = call(params)
    rev: str = data["query"]["pages"][0]["revisions"][0]["content"]
    return rev
"""Find the disambiguation links in a Wikipedia article and prepare them for fixing."""

from collections import defaultdict
from pprint import pprint  # noqa: F401 -- existing file-level import, left in place
from time import sleep
from typing import Any, Iterator, Optional, TypedDict

import flask
import lxml.html

from . import mediawiki_api

# Templates whose presence on a page marks it as a disambiguation page.
disambig_templates = [
    "Template:Disambiguation",
    "Template:Airport disambiguation",
    "Template:Biology disambiguation",
    "Template:Call sign disambiguation",
    "Template:Caselaw disambiguation",
    "Template:Chinese title disambiguation",
    "Template:Disambiguation cleanup",
    "Template:Genus disambiguation",
    "Template:Hospital disambiguation",
    "Template:Human name disambiguation",
    "Template:Human name disambiguation cleanup",
    "Template:Letter-number combination disambiguation",
    "Template:Mathematical disambiguation",
    "Template:Military unit disambiguation",
    "Template:Music disambiguation",
    "Template:Number disambiguation",
    "Template:Opus number disambiguation",
    "Template:Phonetics disambiguation",
    "Template:Place name disambiguation",
    "Template:Portal disambiguation",
    "Template:Road disambiguation",
    "Template:School disambiguation",
    "Template:Species Latin name abbreviation disambiguation",
    "Template:Species Latin name disambiguation",
    "Template:Station disambiguation",
    "Template:Synagogue disambiguation",
    "Template:Taxonomic authority disambiguation",
    "Template:Taxonomy disambiguation",
    "Template:Template disambiguation",
    "Template:WoO number disambiguation",
]


def link_params(enwiki: str) -> dict[str, str | int]:
    """Build the API parameters for finding the links of an article.

    The query uses the ``links`` generator on ``enwiki`` (namespace 0 only,
    redirects resolved) and asks, for every linked page, which of
    ``disambig_templates`` it transcludes.

    Args:
        enwiki: Title of the article whose links are wanted.

    Returns:
        A fresh parameter dict for ``mediawiki_api.call``.
    """
    params: dict[str, str | int] = {
        "action": "query",
        "format": "json",
        "formatversion": 2,
        "titles": enwiki,
        "generator": "links",
        "gpllimit": "max",
        "gplnamespace": 0,
        "tllimit": "max",
        "redirects": 1,
        "tlnamespace": 10,
        "tltemplates": "|".join(disambig_templates),
        "prop": "templates",
    }
    return params


def needs_disambig(link: dict[str, Any]) -> bool:
    """Tell whether a linked page is a disambiguation page that needs fixing.

    Args:
        link: One page object from the reply to a ``link_params`` query.

    Returns:
        True when the page lists at least one template and its title does not
        end in `` (disambiguation)``. Because the query only reports
        templates from ``disambig_templates``, any listed template marks the
        page as a disambiguation page.
    """
    # Titles ending in " (disambiguation)" are skipped -- presumably because a
    # link to such a title is deliberate; confirm with the tool's policy.
    return bool(
        not link["title"].endswith(" (disambiguation)") and link.get("templates")
    )


def get_article_links(enwiki: str) -> list[str]:
    """Get the disambiguation links that appear in this article.

    Args:
        enwiki: Title of the article to inspect.

    Returns:
        Titles of the linked pages accepted by ``needs_disambig``, plus every
        redirect title the API reported as pointing at one of them. The order
        is unspecified.

    Raises:
        RuntimeError: If the API reply contains an ``error`` member.
    """
    base_params = link_params(enwiki)
    # Continuation values from the previous reply. They are merged over the
    # base parameters for each request, so every key the API hands back
    # (not only "gplcontinue") is honoured and stale keys never linger.
    continue_params: dict[str, str | int] = {}
    links: set[str] = set()
    redirects: defaultdict[str, set[str]] = defaultdict(set)

    while True:
        data = mediawiki_api.call({**base_params, **continue_params})
        if "error" in data:
            raise RuntimeError(
                f"mediawiki API error for {enwiki!r}: {data['error']}"
            )

        # A reply may lack "query", "pages" or "redirects" (for example a
        # batch without any redirect); treat a missing member as empty.
        query = data.get("query", {})
        for redirect in query.get("redirects", []):
            redirects[redirect["to"]].add(redirect["from"])

        links.update(
            page["title"] for page in query.get("pages", []) if needs_disambig(page)
        )

        if "continue" not in data:
            break

        continue_params = data["continue"]
        sleep(0.1)  # brief pause between consecutive API requests

    # Also accept the redirect titles of each disambiguation page, so that
    # Article.iter_links can match anchors by either title.
    for link in list(links):
        links.update(redirects.get(link, ()))

    return list(links)


def get_article_html(enwiki: str) -> str:
    """Parse article wikitext and return HTML.

    Args:
        enwiki: Title of the article.

    Returns:
        The ``text`` member of the parse API reply.
    """
    text: str = mediawiki_api.parse_page(enwiki)["text"]
    return text


class DabItem(TypedDict):
    """Represent a disambiguation page."""

    # Position of the link among the article's disambiguation links.
    num: int
    # Title of the disambiguation page.
    title: str
    # Rendered HTML of the page, as produced by get_dab_html.
    html: str


def delete_toc(root: lxml.html.HtmlElement) -> None:
    """Delete table of contents from article HTML.

    Args:
        root: Parsed document; every ``div`` whose class attribute is exactly
            ``toc`` is removed from it in place.
    """
    for toc in root.findall(".//div[@class='toc']"):
        toc.getparent().remove(toc)


def get_dab_html(dab_num: int, title: str) -> str:
    """Parse a disambiguation page and rewrite its links.

    Links that leave the page are replaced by ``href="#"`` with an
    ``onclick`` handler calling ``select_dab(this, dab_num)``. In-page
    ``#fragment`` links and the ids they point at are prefixed with
    ``dab_num``.

    Args:
        dab_num: Number of this disambiguation link within the article.
        title: Title of the disambiguation page.

    Returns:
        The rewritten page serialised as an HTML string.
    """
    dab_html = get_article_html(title)
    root = lxml.html.fromstring(dab_html)
    delete_toc(root)

    element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")}

    for a in root.findall(".//a[@href]"):
        href: str | None = a.get("href")
        if not href:
            continue
        if not href.startswith("#"):
            a.set("href", "#")
            a.set("onclick", f"return select_dab(this, {dab_num})")
            continue

        fragment = href[1:]
        prefixed = f"{dab_num}{fragment}"
        # The target may be missing (a bare "#", or an id that is not part of
        # the fragment); still rewrite the link, but only touch a target that
        # actually exists instead of failing the whole page.
        destination_element = element_id_map.get(fragment)
        if destination_element is not None:
            destination_element.set("id", prefixed)
        a.set("href", f"#{prefixed}")

    html: str = lxml.html.tostring(root, encoding=str)
    return html


class Article:
    """Current article we're working on.

    Typical use is ``load()`` followed by ``process_links()``; ``self.root``
    only exists after ``load()`` has run.
    """

    def __init__(self, enwiki: str) -> None:
        """Make a new Article object.

        Args:
            enwiki: Article title; underscores are treated as spaces.
        """
        self.enwiki = enwiki.replace("_", " ")

        # Query with the normalised title, the same one load() parses.
        self.links = get_article_links(self.enwiki)

        self.dab_list: list[DabItem] = []
        self.dab_lookup: dict[int, str] = {}
        self.dab_order: list[str] = []
        self.parse: Optional[dict[str, Any]] = None

    def save_endpoint(self) -> str:
        """Return the URL of the ``save`` endpoint for this article."""
        href: str = flask.url_for("save", enwiki=self.enwiki.replace(" ", "_"))
        return href

    def load(self) -> None:
        """Load the parsed article and build ``self.root`` from its HTML.

        The ``text`` member is removed from ``self.parse`` once it has been
        turned into the element tree.
        """
        self.parse = mediawiki_api.parse_page(self.enwiki)
        self.root = lxml.html.fromstring(self.parse.pop("text"))

    def iter_links(self) -> Iterator[tuple[lxml.html.HtmlElement, str]]:
        """Iterate over the disambiguation links that need fixing.

        Every anchor whose ``title`` attribute is in ``self.links`` gets its
        class set to ``disambig``; only the first anchor for each title is
        yielded.

        Yields:
            ``(anchor element, title)`` pairs in document order.
        """
        wanted = set(self.links)  # constant-time membership tests
        seen: set[str] = set()
        for a in self.root.findall(".//a[@href]"):
            title = a.get("title")
            if title is None or title not in wanted:
                continue
            a.set("class", "disambig")

            if title in seen:
                continue
            seen.add(title)

            yield a, title

    def process_links(self) -> None:
        """Number each disambiguation link and fetch its page.

        Each yielded anchor gets the id ``dab-<n>``, and the matching entry
        is appended to ``dab_list``, ``dab_order`` and ``dab_lookup``.
        """
        for dab_num, (a, title) in enumerate(self.iter_links()):
            a.set("id", f"dab-{dab_num}")

            dab: DabItem = {
                "num": dab_num,
                "title": title,
                "html": get_dab_html(dab_num, title),
            }
            self.dab_list.append(dab)
            self.dab_order.append(title)
            self.dab_lookup[dab_num] = title

    def get_html(self) -> str:
        """Return the processed article HTML."""
        html: str = lxml.html.tostring(self.root, encoding=str)
        return html
display: none; background: white; } +div.traceback div.source.expanded pre.before, +div.traceback div.source.expanded pre.after { + display: block; +} + +div.traceback div.source.expanded span.ws { + display: inline; +} + +div.traceback blockquote { margin: 1em 0 0 0; padding: 0; white-space: pre-line; } +div.traceback img { float: right; padding: 2px; margin: -3px 2px 0 0; display: none; } +div.traceback img:hover { background-color: #ddd; cursor: pointer; + border-color: #BFDDE0; } +div.traceback pre:hover img { display: block; } +div.traceback cite.filename { font-style: normal; color: #3B666B; } + +pre.console { border: 1px solid #ccc; background: white!important; + color: black; padding: 5px!important; + margin: 3px 0 0 0!important; cursor: default!important; + max-height: 400px; overflow: auto; } +pre.console form { color: #555; } +pre.console input { background-color: transparent; color: #555; + width: 90%; font-family: 'Consolas', 'Deja Vu Sans Mono', + 'Bitstream Vera Sans Mono', monospace; font-size: 14px; + border: none!important; } + +span.string { color: #30799B; } +span.number { color: #9C1A1C; } +span.help { color: #3A7734; } +span.object { color: #485F6E; } +span.extended { opacity: 0.5; } +span.extended:hover { opacity: 1; } +a.toggle { text-decoration: none; background-repeat: no-repeat; + background-position: center center; + background-image: url(?__debugger__=yes&cmd=resource&f=more.png); } +a.toggle:hover { background-color: #444; } +a.open { background-image: url(?__debugger__=yes&cmd=resource&f=less.png); } + +div.traceback pre, div.console pre { + white-space: pre-wrap; /* css-3 should we be so lucky... */ + white-space: -moz-pre-wrap; /* Mozilla, since 1999 */ + white-space: -pre-wrap; /* Opera 4-6 ?? */ + white-space: -o-pre-wrap; /* Opera 7 ?? 
*/ + word-wrap: break-word; /* Internet Explorer 5.5+ */ + _white-space: pre; /* IE only hack to re-specify in + addition to word-wrap */ +} diff --git a/templates/index.html b/templates/index.html index 21e1b5f..1758d8e 100644 --- a/templates/index.html +++ b/templates/index.html @@ -1,12 +1,14 @@ {% extends "base.html" %} {% block content %} - + + {% endblock %} diff --git a/web_view.py b/web_view.py index 61ba0d1..2730f23 100755 --- a/web_view.py +++ b/web_view.py @@ -3,7 +3,7 @@ import inspect import json import re -from typing import Any, Iterator, Optional, TypedDict +from typing import Optional import flask import lxml.html @@ -13,16 +13,17 @@ from requests_oauthlib import OAuth1Session from werkzeug.debug.tbtools import get_current_traceback from werkzeug.wrappers import Response -from dab_mechanic import wikidata_oauth +from dab_mechanic import mediawiki_api, wikidata_oauth, wikipedia app = flask.Flask(__name__) app.config.from_object("config.default") -app.debug = True wiki_hostname = "en.wikipedia.org" wiki_api_php = f"https://{wiki_hostname}/w/api.php" wiki_index_php = f"https://{wiki_hostname}/w/index.php" +awdl_url = "https://dplbot.toolforge.org/articles_with_dab_links.php" + @app.before_request def global_user(): @@ -46,21 +47,6 @@ def exception_handler(e): ) -def get_content(title: str) -> str: - """Get article text.""" - params: dict[str, str | int] = { - "action": "query", - "format": "json", - "formatversion": 2, - "prop": "revisions|info", - "rvprop": "content|timestamp", - "titles": title, - } - data = requests.get(wiki_api_php, params=params).json() - rev: str = data["query"]["pages"][0]["revisions"][0]["content"] - return rev - - def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, int]]: """Parse Articles With Multiple Dablinks.""" articles = [] @@ -78,8 +64,7 @@ def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, in @app.route("/") def index(): - - r = 
requests.get("https://dplbot.toolforge.org/articles_with_dab_links.php") + r = requests.get(awdl_url, params={"limit": 100}) root = lxml.html.fromstring(r.content) articles = parse_articles_with_dab_links(root) @@ -88,145 +73,6 @@ def index(): return flask.render_template("index.html", articles=articles) -def call_parse_api(enwiki: str) -> dict[str, Any]: - """Call mediawiki parse API for given article.""" - url = "https://en.wikipedia.org/w/api.php" - - params: dict[str, str | int] = { - "action": "parse", - "format": "json", - "formatversion": 2, - "disableeditsection": 1, - "page": enwiki, - "prop": "text|links|headhtml", - "disabletoc": 1, - } - - r = requests.get(url, params=params) - parse: dict[str, Any] = r.json()["parse"] - return parse - - -def get_article_html(enwiki: str) -> str: - """Parse article wikitext and return HTML.""" - text: str = call_parse_api(enwiki)["text"] - return text - - -disambig_templates = [ - "Template:Disambiguation", - "Template:Airport disambiguation", - "Template:Biology disambiguation", - "Template:Call sign disambiguation", - "Template:Caselaw disambiguation", - "Template:Chinese title disambiguation", - "Template:Disambiguation cleanup", - "Template:Genus disambiguation", - "Template:Hospital disambiguation", - "Template:Human name disambiguation", - "Template:Human name disambiguation cleanup", - "Template:Letter-number combination disambiguation", - "Template:Mathematical disambiguation", - "Template:Military unit disambiguation", - "Template:Music disambiguation", - "Template:Number disambiguation", - "Template:Opus number disambiguation", - "Template:Phonetics disambiguation", - "Template:Place name disambiguation", - "Template:Portal disambiguation", - "Template:Road disambiguation", - "Template:School disambiguation", - "Template:Species Latin name abbreviation disambiguation", - "Template:Species Latin name disambiguation", - "Template:Station disambiguation", - "Template:Synagogue disambiguation", - 
"Template:Taxonomic authority disambiguation", - "Template:Taxonomy disambiguation", - "Template:Template disambiguation", - "Template:WoO number disambiguation", -] - - -def link_params(enwiki: str) -> dict[str, str | int]: - """Parameters for finding article links from the API.""" - params: dict[str, str | int] = { - "action": "query", - "format": "json", - "formatversion": 2, - "titles": enwiki, - "generator": "links", - "gpllimit": "max", - "gplnamespace": 0, - "tllimit": "max", - "tlnamespace": 10, - "tltemplates": "|".join(disambig_templates), - "prop": "templates", - } - return params - - -def needs_disambig(link: dict[str, Any]) -> bool: - """Is this a disambiguation link.""" - return bool( - not link["title"].endswith(" (disambiguation)") and link.get("templates") - ) - - -def get_article_links(enwiki: str) -> list[str]: - """Get links that appear in this article.""" - url = "https://en.wikipedia.org/w/api.php" - - params: dict[str, str | int] = link_params(enwiki) - links: set[str] = set() - - while True: - data = requests.get(url, params=params).json() - links.update( - page["title"] for page in data["query"]["pages"] if needs_disambig(page) - ) - - if "continue" not in data: - break - - params["gplcontinue"] = data["continue"]["gplcontinue"] - - return list(links) - - # return {link["title"] for link in r.json()["query"]["pages"][0]["links"]} - - -def delete_toc(root: lxml.html.HtmlElement) -> None: - """Delete table of contents from article HTML.""" - for toc in root.findall(".//div[@class='toc']"): - toc.getparent().remove(toc) - - -def get_dab_html(dab_num: int, title: str) -> str: - """Parse dab page and rewrite links.""" - dab_html = get_article_html(title) - root = lxml.html.fromstring(dab_html) - delete_toc(root) - - element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")} - - for a in root.findall(".//a[@href]"): - href: str | None = a.get("href") - if not href: - continue - if not href.startswith("#"): - a.set("href", "#") - 
a.set("onclick", f"return select_dab(this, {dab_num})") - continue - - destination_element = element_id_map[href[1:]] - assert destination_element is not None - destination_element.set("id", f"{dab_num}{href[1:]}") - a.set("href", f"#{dab_num}{href[1:]}") - - html: str = lxml.html.tostring(root, encoding=str) - return html - - def make_disamb_link(edit: tuple[str, str]) -> str: """Given an edit return the appropriate link.""" return f"[[{edit[1]}|{edit[0]}]]" @@ -266,7 +112,7 @@ def save(enwiki: str) -> Response | str: edit_summary = f"Disambiguate {titles} using [[User:Edward/Dab mechanic]]" - article_text = apply_edits(get_content(enwiki), edits) + article_text = apply_edits(mediawiki_api.get_content(enwiki), edits) return flask.render_template( "save.html", @@ -277,85 +123,25 @@ def save(enwiki: str) -> Response | str: ) -class DabItem(TypedDict): - """Represent a disabiguation page.""" - - num: int - title: str - html: str - - -class Article: - """Current article we're working on.""" - - def __init__(self, enwiki: str) -> None: - """Make a new Article object.""" - self.enwiki = enwiki - - self.links = get_article_links(enwiki) - - self.dab_list: list[DabItem] = [] - self.dab_lookup: dict[int, str] = {} - self.dab_order: list[str] = [] - self.parse: Optional[dict[str, Any]] = None - - def save_endpoint(self) -> str: - """Endpoint for saving changes.""" - href: str = flask.url_for("save", enwiki=self.enwiki.replace(" ", "_")) - return href - - def load(self) -> None: - """Load parsed article HTML.""" - self.parse = call_parse_api(self.enwiki) - self.root = lxml.html.fromstring(self.parse.pop("text")) - - def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]: - """Disambiguation links that need fixing.""" - seen = set() - for a in self.root.findall(".//a[@href]"): - title = a.get("title") - if title is None or title not in self.links: - continue - a.set("class", "disambig") - - if title in seen: - continue - seen.add(title) - - yield a, title - - def 
def redirect_if_needed(enwiki: str) -> Optional[Response]:
    """Redirect to the underscore form of the URL when the title has spaces.

    Returns a redirect to the current request's endpoint with every space in
    ``enwiki`` replaced by an underscore, or None when ``enwiki`` contains no
    space and no redirect is needed.
    """
    if " " not in enwiki:
        return None

    underscored = enwiki.replace(" ", "_")
    target = flask.url_for(flask.request.endpoint, enwiki=underscored)
    return flask.redirect(target)