diff --git a/.gitignore b/.gitignore deleted file mode 100644 index bee8a64..0000000 --- a/.gitignore +++ /dev/null @@ -1 +0,0 @@ -__pycache__ diff --git a/article_list b/article_list new file mode 100644 index 0000000..7f4bcaf --- /dev/null +++ b/article_list @@ -0,0 +1,50 @@ +Rail transport in Indonesia +SchleFaZ +Chicago Bulls +Orwell Prize +List of fatal victims of the September 11 attacks +Arabic exonyms +Canadian Alpine Ski Championships +Method Man filmography +Popular Union +The Cantos +Unisex name +United States Alpine Ski Championships +AS Kaloum Star +Akademi Fantasia (season 1) +Athletics at the 2022 Bolivarian Games +I Love the 2000s +Kununokuni +List of Wisin & Yandel collaborations +List of comics based on films +List of programs broadcast by Asianet +Urban Hymns +1979 Sydney City FC season +2007 in Spanish television +2022 World Athletics U20 Championships – Men's 4 × 100 metres relay +A2 autostrada (Poland) +Black to the Future (TV series) +Chandel (Rajput clan) +County of Isenburg +Dinka people +Dwayne McDuffie Award for Diversity in Comics +FTSE Italia Mid Cap +Globoplay +Index of Armenia-related articles +List of Equinox episodes +List of Indian monarchs +List of Italian exonyms in Dalmatia +List of Ultimate Marvel characters +List of cities with historical German exonyms +List of jötnar in Norse mythology +List of language families +List of people with surname Davis +List of political parties in Venezuela +List of programmes broadcast by HTV +Paul (given name) +Principality of Lippe +Propaganda in Russia +Qazi Ghulam Mustafa +Redfern Now +Roy Orbison/The Beatles Tour +Royal Birmingham Conservatoire diff --git a/dab_mechanic/mediawiki_api.py b/dab_mechanic/mediawiki_api.py deleted file mode 100644 index 26d7a20..0000000 --- a/dab_mechanic/mediawiki_api.py +++ /dev/null @@ -1,45 +0,0 @@ -"""Interface with the mediawiki API.""" - -from typing import Any -from . import wikidata_oauth - -wiki_hostname = "en.wikipedia.org" -wiki_api_php = f"https://{wiki_hostname}/w/api.php" -user_agent = "dab-mechanic/0.1" - - -def parse_page(enwiki: str) -> dict[str, Any]: - """Call mediawiki parse API for given article.""" - params: dict[str, str | int] = { - "action": "parse", - "format": "json", - "formatversion": 2, - "disableeditsection": 1, - "page": enwiki, - "prop": "text|links|headhtml", - "disabletoc": 1, - } - - parse: dict[str, Any] = call(params)["parse"] - return parse - - -def call(params: dict[str, str | int]) -> dict[str, Any]: - """Make GET request to mediawiki API.""" - data: dict[str, Any] = wikidata_oauth.api_post_request(params) - return data.json() - - -def get_content(title: str) -> str: - """Get article text.""" - params: dict[str, str | int] = { - "action": "query", - "format": "json", - "formatversion": 2, - "prop": "revisions|info", - "rvprop": "content|timestamp", - "titles": title, - } - data = call(params) - rev: str = data["query"]["pages"][0]["revisions"][0]["content"] - return rev diff --git a/dab_mechanic/wikidata_oauth.py b/dab_mechanic/wikidata_oauth.py index 5af0976..dca0707 100644 --- a/dab_mechanic/wikidata_oauth.py +++ b/dab_mechanic/wikidata_oauth.py @@ -19,6 +19,7 @@ def get_edit_proxy() -> dict[str, str]: def api_post_request(params: dict[str, str | int]): """HTTP Post using Oauth.""" app = current_app + url = "https://www.wikidata.org/w/api.php" client_key = app.config["CLIENT_KEY"] client_secret = app.config["CLIENT_SECRET"] oauth = OAuth1Session( @@ -28,12 +29,12 @@ def api_post_request(params: dict[str, str | int]): resource_owner_secret=session["owner_secret"], ) proxies = get_edit_proxy() - return oauth.post(api_url, data=params, timeout=10, proxies=proxies) + return oauth.post(url, data=params, timeout=4, proxies=proxies) def raw_request(params): app = current_app - url = api_url + "?" + urlencode(params) + url = "https://www.wikidata.org/w/api.php?" + urlencode(params) client_key = app.config["CLIENT_KEY"] client_secret = app.config["CLIENT_SECRET"] oauth = OAuth1Session( @@ -43,7 +44,7 @@ def raw_request(params): resource_owner_secret=session["owner_secret"], ) proxies = get_edit_proxy() - return oauth.get(url, timeout=10, proxies=proxies) + return oauth.get(url, timeout=4, proxies=proxies) def api_request(params): diff --git a/dab_mechanic/wikipedia.py b/dab_mechanic/wikipedia.py deleted file mode 100644 index 57c03c4..0000000 --- a/dab_mechanic/wikipedia.py +++ /dev/null @@ -1,206 +0,0 @@ -from collections import defaultdict -from typing import Any, Iterator, Optional, TypedDict - -import flask -import lxml.html - -from . import mediawiki_api -from pprint import pprint -from time import sleep - -disambig_templates = [ - "Template:Disambiguation", - "Template:Airport disambiguation", - "Template:Biology disambiguation", - "Template:Call sign disambiguation", - "Template:Caselaw disambiguation", - "Template:Chinese title disambiguation", - "Template:Disambiguation cleanup", - "Template:Genus disambiguation", - "Template:Hospital disambiguation", - "Template:Human name disambiguation", - "Template:Human name disambiguation cleanup", - "Template:Letter-number combination disambiguation", - "Template:Mathematical disambiguation", - "Template:Military unit disambiguation", - "Template:Music disambiguation", - "Template:Number disambiguation", - "Template:Opus number disambiguation", - "Template:Phonetics disambiguation", - "Template:Place name disambiguation", - "Template:Portal disambiguation", - "Template:Road disambiguation", - "Template:School disambiguation", - "Template:Species Latin name abbreviation disambiguation", - "Template:Species Latin name disambiguation", - "Template:Station disambiguation", - "Template:Synagogue disambiguation", - "Template:Taxonomic authority disambiguation", - "Template:Taxonomy disambiguation", - "Template:Template disambiguation", - "Template:WoO number disambiguation", -] - - -def link_params(enwiki: str) -> dict[str, str | int]: - """Parameters for finding article links from the API.""" - params: dict[str, str | int] = { - "action": "query", - "format": "json", - "formatversion": 2, - "titles": enwiki, - "generator": "links", - "gpllimit": "max", - "gplnamespace": 0, - "tllimit": "max", - "redirects": 1, - "tlnamespace": 10, - "tltemplates": "|".join(disambig_templates), - "prop": "templates", - } - return params - - -def needs_disambig(link: dict[str, Any]) -> bool: - """Is this a disambiguation link.""" - return bool( - not link["title"].endswith(" (disambiguation)") and link.get("templates") - ) - - -def get_article_links(enwiki: str) -> list[str]: - """Get links that appear in this article.""" - - params: dict[str, str | int] = link_params(enwiki) - links: set[str] = set() - - redirects = defaultdict(set) - - while True: - data = mediawiki_api.call(params) - if "query" not in data: - pprint(data) - pages = data["query"].pop("pages") - for r in data["query"].pop("redirects"): - redirects[r["to"]].add(r["from"]) - - links.update(page["title"] for page in pages if needs_disambig(page)) - - if "continue" not in data: - break - - params["gplcontinue"] = data["continue"]["gplcontinue"] - sleep(0.1) - - for link in set(links): - if link in redirects: - links.update(redirects[link]) - - return list(links) - - # return {link["title"] for link in r.json()["query"]["pages"][0]["links"]} - - -def get_article_html(enwiki: str) -> str: - """Parse article wikitext and return HTML.""" - text: str = mediawiki_api.parse_page(enwiki)["text"] - return text - - -class DabItem(TypedDict): - """Represent a disabiguation page.""" - - num: int - title: str - html: str - - -def delete_toc(root: lxml.html.HtmlElement) -> None: - """Delete table of contents from article HTML.""" - for toc in root.findall(".//div[@class='toc']"): - toc.getparent().remove(toc) - - -def get_dab_html(dab_num: int, title: str) -> str: - """Parse dab page and rewrite links.""" - dab_html = get_article_html(title) - root = lxml.html.fromstring(dab_html) - delete_toc(root) - - element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")} - - for a in root.findall(".//a[@href]"): - href: str | None = a.get("href") - if not href: - continue - if not href.startswith("#"): - a.set("href", "#") - a.set("onclick", f"return select_dab(this, {dab_num})") - continue - - destination_element = element_id_map[href[1:]] - assert destination_element is not None - destination_element.set("id", f"{dab_num}{href[1:]}") - a.set("href", f"#{dab_num}{href[1:]}") - - html: str = lxml.html.tostring(root, encoding=str) - return html - - -class Article: - """Current article we're working on.""" - - def __init__(self, enwiki: str) -> None: - """Make a new Article object.""" - self.enwiki = enwiki.replace("_", " ") - - self.links = get_article_links(enwiki) - - self.dab_list: list[DabItem] = [] - self.dab_lookup: dict[int, str] = {} - self.dab_order: list[str] = [] - self.parse: Optional[dict[str, Any]] = None - - def save_endpoint(self) -> str: - """Endpoint for saving changes.""" - href: str = flask.url_for("save", enwiki=self.enwiki.replace(" ", "_")) - return href - - def load(self) -> None: - """Load parsed article HTML.""" - self.parse = mediawiki_api.parse_page(self.enwiki) - self.root = lxml.html.fromstring(self.parse.pop("text")) - - def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]: - """Disambiguation links that need fixing.""" - seen = set() - for a in self.root.findall(".//a[@href]"): - title = a.get("title") - if title is None or title not in self.links: - continue - a.set("class", "disambig") - - if title in seen: - continue - seen.add(title) - - yield a, title - - def process_links(self) -> None: - """Process links in parsed wikitext.""" - for dab_num, (a, title) in enumerate(self.iter_links()): - a.set("id", f"dab-{dab_num}") - - dab: DabItem = { - "num": dab_num, - "title": title, - "html": get_dab_html(dab_num, title), - } - self.dab_list.append(dab) - self.dab_order.append(title) - self.dab_lookup[dab_num] = title - - def get_html(self) -> str: - """Return the processed article HTML.""" - html: str = lxml.html.tostring(self.root, encoding=str) - return html diff --git a/static/css/exception.css b/static/css/exception.css deleted file mode 100644 index 1f141c5..0000000 --- a/static/css/exception.css +++ /dev/null @@ -1,78 +0,0 @@ -div.debugger { text-align: left; padding: 12px; margin: auto; - background-color: white; } -div.detail { cursor: pointer; } -div.detail p { margin: 0 0 8px 13px; font-size: 14px; white-space: pre-wrap; - font-family: monospace; } -div.explanation { margin: 20px 13px; font-size: 15px; color: #555; } -div.footer { font-size: 13px; text-align: right; margin: 30px 0; - color: #86989B; } - -h2 { font-size: 16px; margin: 1.3em 0 0.0 0; padding: 9px; - background-color: #11557C; color: white; } -h2 em, h3 em { font-style: normal; color: #A5D6D9; font-weight: normal; } - -div.traceback, div.plain { border: 1px solid #ddd; margin: 0 0 1em 0; padding: 10px; } -div.plain p { margin: 0; } -div.plain textarea, -div.plain pre { margin: 10px 0 0 0; padding: 4px; - background-color: #E8EFF0; border: 1px solid #D3E7E9; } -div.plain textarea { width: 99%; height: 300px; } -div.traceback h3 { font-size: 1em; margin: 0 0 0.8em 0; } -div.traceback ul { list-style: none; margin: 0; padding: 0 0 0 1em; } -div.traceback h4 { font-size: 13px; font-weight: normal; margin: 0.7em 0 0.1em 0; } -div.traceback pre { margin: 0; padding: 5px 0 3px 15px; - background-color: #E8EFF0; border: 1px solid #D3E7E9; } -div.traceback .library .current { background: white; color: #555; } -div.traceback .expanded .current { background: #E8EFF0; color: black; } -div.traceback pre:hover { background-color: #DDECEE; color: black; cursor: pointer; } -div.traceback div.source.expanded pre + pre { border-top: none; } - -div.traceback span.ws { display: none; } -div.traceback pre.before, div.traceback pre.after { display: none; background: white; } -div.traceback div.source.expanded pre.before, -div.traceback div.source.expanded pre.after { - display: block; -} - -div.traceback div.source.expanded span.ws { - display: inline; -} - -div.traceback blockquote { margin: 1em 0 0 0; padding: 0; white-space: pre-line; } -div.traceback img { float: right; padding: 2px; margin: -3px 2px 0 0; display: none; } -div.traceback img:hover { background-color: #ddd; cursor: pointer; - border-color: #BFDDE0; } -div.traceback pre:hover img { display: block; } -div.traceback cite.filename { font-style: normal; color: #3B666B; } - -pre.console { border: 1px solid #ccc; background: white!important; - color: black; padding: 5px!important; - margin: 3px 0 0 0!important; cursor: default!important; - max-height: 400px; overflow: auto; } -pre.console form { color: #555; } -pre.console input { background-color: transparent; color: #555; - width: 90%; font-family: 'Consolas', 'Deja Vu Sans Mono', - 'Bitstream Vera Sans Mono', monospace; font-size: 14px; - border: none!important; } - -span.string { color: #30799B; } -span.number { color: #9C1A1C; } -span.help { color: #3A7734; } -span.object { color: #485F6E; } -span.extended { opacity: 0.5; } -span.extended:hover { opacity: 1; } -a.toggle { text-decoration: none; background-repeat: no-repeat; - background-position: center center; - background-image: url(?__debugger__=yes&cmd=resource&f=more.png); } -a.toggle:hover { background-color: #444; } -a.open { background-image: url(?__debugger__=yes&cmd=resource&f=less.png); } - -div.traceback pre, div.console pre { - white-space: pre-wrap; /* css-3 should we be so lucky... */ - white-space: -moz-pre-wrap; /* Mozilla, since 1999 */ - white-space: -pre-wrap; /* Opera 4-6 ?? */ - white-space: -o-pre-wrap; /* Opera 7 ?? */ - word-wrap: break-word; /* Internet Explorer 5.5+ */ - _white-space: pre; /* IE only hack to re-specify in - addition to word-wrap */ -} diff --git a/templates/index.html b/templates/index.html index 1758d8e..21e1b5f 100644 --- a/templates/index.html +++ b/templates/index.html @@ -1,14 +1,12 @@ {% extends "base.html" %} {% block content %} -
-
    +
-
+ {% endblock %} diff --git a/web_view.py b/web_view.py index 2730f23..61ba0d1 100755 --- a/web_view.py +++ b/web_view.py @@ -3,7 +3,7 @@ import inspect import json import re -from typing import Optional +from typing import Any, Iterator, Optional, TypedDict import flask import lxml.html @@ -13,17 +13,16 @@ from requests_oauthlib import OAuth1Session from werkzeug.debug.tbtools import get_current_traceback from werkzeug.wrappers import Response -from dab_mechanic import mediawiki_api, wikidata_oauth, wikipedia +from dab_mechanic import wikidata_oauth app = flask.Flask(__name__) app.config.from_object("config.default") +app.debug = True wiki_hostname = "en.wikipedia.org" wiki_api_php = f"https://{wiki_hostname}/w/api.php" wiki_index_php = f"https://{wiki_hostname}/w/index.php" -awdl_url = "https://dplbot.toolforge.org/articles_with_dab_links.php" - @app.before_request def global_user(): @@ -47,6 +46,21 @@ def exception_handler(e): ) +def get_content(title: str) -> str: + """Get article text.""" + params: dict[str, str | int] = { + "action": "query", + "format": "json", + "formatversion": 2, + "prop": "revisions|info", + "rvprop": "content|timestamp", + "titles": title, + } + data = requests.get(wiki_api_php, params=params).json() + rev: str = data["query"]["pages"][0]["revisions"][0]["content"] + return rev + + def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, int]]: """Parse Articles With Multiple Dablinks.""" articles = [] @@ -64,7 +78,8 @@ def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, in @app.route("/") def index(): - r = requests.get(awdl_url, params={"limit": 100}) + + r = requests.get("https://dplbot.toolforge.org/articles_with_dab_links.php") root = lxml.html.fromstring(r.content) articles = parse_articles_with_dab_links(root) @@ -73,6 +88,145 @@ def index(): return flask.render_template("index.html", articles=articles) +def call_parse_api(enwiki: str) -> dict[str, Any]: + """Call mediawiki parse API for given article.""" + url = "https://en.wikipedia.org/w/api.php" + + params: dict[str, str | int] = { + "action": "parse", + "format": "json", + "formatversion": 2, + "disableeditsection": 1, + "page": enwiki, + "prop": "text|links|headhtml", + "disabletoc": 1, + } + + r = requests.get(url, params=params) + parse: dict[str, Any] = r.json()["parse"] + return parse + + +def get_article_html(enwiki: str) -> str: + """Parse article wikitext and return HTML.""" + text: str = call_parse_api(enwiki)["text"] + return text + + +disambig_templates = [ + "Template:Disambiguation", + "Template:Airport disambiguation", + "Template:Biology disambiguation", + "Template:Call sign disambiguation", + "Template:Caselaw disambiguation", + "Template:Chinese title disambiguation", + "Template:Disambiguation cleanup", + "Template:Genus disambiguation", + "Template:Hospital disambiguation", + "Template:Human name disambiguation", + "Template:Human name disambiguation cleanup", + "Template:Letter-number combination disambiguation", + "Template:Mathematical disambiguation", + "Template:Military unit disambiguation", + "Template:Music disambiguation", + "Template:Number disambiguation", + "Template:Opus number disambiguation", + "Template:Phonetics disambiguation", + "Template:Place name disambiguation", + "Template:Portal disambiguation", + "Template:Road disambiguation", + "Template:School disambiguation", + "Template:Species Latin name abbreviation disambiguation", + "Template:Species Latin name disambiguation", + "Template:Station disambiguation", + "Template:Synagogue disambiguation", + "Template:Taxonomic authority disambiguation", + "Template:Taxonomy disambiguation", + "Template:Template disambiguation", + "Template:WoO number disambiguation", +] + + +def link_params(enwiki: str) -> dict[str, str | int]: + """Parameters for finding article links from the API.""" + params: dict[str, str | int] = { + "action": "query", + "format": "json", + "formatversion": 2, + "titles": enwiki, + "generator": "links", + "gpllimit": "max", + "gplnamespace": 0, + "tllimit": "max", + "tlnamespace": 10, + "tltemplates": "|".join(disambig_templates), + "prop": "templates", + } + return params + + +def needs_disambig(link: dict[str, Any]) -> bool: + """Is this a disambiguation link.""" + return bool( + not link["title"].endswith(" (disambiguation)") and link.get("templates") + ) + + +def get_article_links(enwiki: str) -> list[str]: + """Get links that appear in this article.""" + url = "https://en.wikipedia.org/w/api.php" + + params: dict[str, str | int] = link_params(enwiki) + links: set[str] = set() + + while True: + data = requests.get(url, params=params).json() + links.update( + page["title"] for page in data["query"]["pages"] if needs_disambig(page) + ) + + if "continue" not in data: + break + + params["gplcontinue"] = data["continue"]["gplcontinue"] + + return list(links) + + # return {link["title"] for link in r.json()["query"]["pages"][0]["links"]} + + +def delete_toc(root: lxml.html.HtmlElement) -> None: + """Delete table of contents from article HTML.""" + for toc in root.findall(".//div[@class='toc']"): + toc.getparent().remove(toc) + + +def get_dab_html(dab_num: int, title: str) -> str: + """Parse dab page and rewrite links.""" + dab_html = get_article_html(title) + root = lxml.html.fromstring(dab_html) + delete_toc(root) + + element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")} + + for a in root.findall(".//a[@href]"): + href: str | None = a.get("href") + if not href: + continue + if not href.startswith("#"): + a.set("href", "#") + a.set("onclick", f"return select_dab(this, {dab_num})") + continue + + destination_element = element_id_map[href[1:]] + assert destination_element is not None + destination_element.set("id", f"{dab_num}{href[1:]}") + a.set("href", f"#{dab_num}{href[1:]}") + + html: str = lxml.html.tostring(root, encoding=str) + return html + + def make_disamb_link(edit: tuple[str, str]) -> str: """Given an edit return the appropriate link.""" return f"[[{edit[1]}|{edit[0]}]]" @@ -112,7 +266,7 @@ def save(enwiki: str) -> Response | str: edit_summary = f"Disambiguate {titles} using [[User:Edward/Dab mechanic]]" - article_text = apply_edits(mediawiki_api.get_content(enwiki), edits) + article_text = apply_edits(get_content(enwiki), edits) return flask.render_template( "save.html", @@ -123,25 +277,85 @@ def save(enwiki: str) -> Response | str: ) -def redirect_if_needed(enwiki: str) -> Optional[Response]: - """Check if there are spaces in the article name and redirect.""" - return ( - flask.redirect( - flask.url_for(flask.request.endpoint, enwiki=enwiki.replace(" ", "_")) - ) - if " " in enwiki - else None - ) +class DabItem(TypedDict): + """Represent a disabiguation page.""" + + num: int + title: str + html: str + + +class Article: + """Current article we're working on.""" + + def __init__(self, enwiki: str) -> None: + """Make a new Article object.""" + self.enwiki = enwiki + + self.links = get_article_links(enwiki) + + self.dab_list: list[DabItem] = [] + self.dab_lookup: dict[int, str] = {} + self.dab_order: list[str] = [] + self.parse: Optional[dict[str, Any]] = None + + def save_endpoint(self) -> str: + """Endpoint for saving changes.""" + href: str = flask.url_for("save", enwiki=self.enwiki.replace(" ", "_")) + return href + + def load(self) -> None: + """Load parsed article HTML.""" + self.parse = call_parse_api(self.enwiki) + self.root = lxml.html.fromstring(self.parse.pop("text")) + + def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]: + """Disambiguation links that need fixing.""" + seen = set() + for a in self.root.findall(".//a[@href]"): + title = a.get("title") + if title is None or title not in self.links: + continue + a.set("class", "disambig") + + if title in seen: + continue + seen.add(title) + + yield a, title + + def process_links(self) -> None: + """Process links in parsed wikitext.""" + for dab_num, (a, title) in enumerate(self.iter_links()): + a.set("id", f"dab-{dab_num}") + + dab: DabItem = { + "num": dab_num, + "title": title, + "html": get_dab_html(dab_num, title), + } + self.dab_list.append(dab) + self.dab_order.append(title) + self.dab_lookup[dab_num] = title + + def get_html(self) -> str: + """Return the processed article HTML.""" + html: str = lxml.html.tostring(self.root, encoding=str) + return html @app.route("/enwiki/") def article_page(enwiki: str) -> Response: """Article Page.""" - redirect = redirect_if_needed(enwiki) - if redirect: - return redirect + enwiki_orig = enwiki + enwiki = enwiki.replace("_", " ") + enwiki_underscore = enwiki.replace(" ", "_") + if " " in enwiki_orig: + return flask.redirect( + flask.url_for(flask.request.endpoint, enwiki=enwiki_underscore) + ) - article = wikipedia.Article(enwiki) + article = Article(enwiki) article.load() article.process_links()