From 5ff4749512a257e5a6ba5c787a3fd132d6fe9213 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 17 Aug 2022 13:34:17 +0100 Subject: [PATCH 01/10] Formatting for article list --- templates/index.html | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/templates/index.html b/templates/index.html index 21e1b5f..1758d8e 100644 --- a/templates/index.html +++ b/templates/index.html @@ -1,12 +1,14 @@ {% extends "base.html" %} {% block content %} - + + {% endblock %} From 8432632aae9a6b9f52e5d10fced24d94f4c4725c Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 17 Aug 2022 13:34:55 +0100 Subject: [PATCH 02/10] Show more articles on index page --- web_view.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/web_view.py b/web_view.py index 61ba0d1..03f1ccb 100755 --- a/web_view.py +++ b/web_view.py @@ -23,6 +23,8 @@ wiki_hostname = "en.wikipedia.org" wiki_api_php = f"https://{wiki_hostname}/w/api.php" wiki_index_php = f"https://{wiki_hostname}/w/index.php" +awdl_url = "https://dplbot.toolforge.org/articles_with_dab_links.php" + @app.before_request def global_user(): @@ -78,8 +80,7 @@ def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, in @app.route("/") def index(): - - r = requests.get("https://dplbot.toolforge.org/articles_with_dab_links.php") + r = requests.get(awdl_url, params={"limit": 100}) root = lxml.html.fromstring(r.content) articles = parse_articles_with_dab_links(root) From 78de5cc139b1f927c7b59c7e3bdcd502d8843927 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 17 Aug 2022 13:35:26 +0100 Subject: [PATCH 03/10] Split code into another file --- dab_mechanic/wikipedia.py | 220 ++++++++++++++++++++++++++++++++++++++ web_view.py | 211 +----------------------------------- 2 files changed, 222 insertions(+), 209 deletions(-) create mode 100644 dab_mechanic/wikipedia.py diff --git a/dab_mechanic/wikipedia.py b/dab_mechanic/wikipedia.py new file mode 100644 index 0000000..3b59278 --- /dev/null +++ b/dab_mechanic/wikipedia.py @@ -0,0 +1,220 @@ +from collections import defaultdict +from typing import Any, Iterator, Optional, TypedDict + +import flask +import lxml.html +import requests + +disambig_templates = [ + "Template:Disambiguation", + "Template:Airport disambiguation", + "Template:Biology disambiguation", + "Template:Call sign disambiguation", + "Template:Caselaw disambiguation", + "Template:Chinese title disambiguation", + "Template:Disambiguation cleanup", + "Template:Genus disambiguation", + "Template:Hospital disambiguation", + "Template:Human name disambiguation", + "Template:Human name disambiguation cleanup", + "Template:Letter-number combination disambiguation", + "Template:Mathematical disambiguation", + "Template:Military unit disambiguation", + "Template:Music disambiguation", + "Template:Number disambiguation", + "Template:Opus number disambiguation", + "Template:Phonetics disambiguation", + "Template:Place name disambiguation", + "Template:Portal disambiguation", + "Template:Road disambiguation", + "Template:School disambiguation", + "Template:Species Latin name abbreviation disambiguation", + "Template:Species Latin name disambiguation", + "Template:Station disambiguation", + "Template:Synagogue disambiguation", + "Template:Taxonomic authority disambiguation", + "Template:Taxonomy disambiguation", + "Template:Template disambiguation", + "Template:WoO number disambiguation", +] + + +def link_params(enwiki: str) -> dict[str, str | int]: + """Parameters for finding article links from the API.""" + 
params: dict[str, str | int] = { + "action": "query", + "format": "json", + "formatversion": 2, + "titles": enwiki, + "generator": "links", + "gpllimit": "max", + "gplnamespace": 0, + "tllimit": "max", + "redirects": 1, + "tlnamespace": 10, + "tltemplates": "|".join(disambig_templates), + "prop": "templates", + } + return params + + +def needs_disambig(link: dict[str, Any]) -> bool: + """Is this a disambiguation link.""" + return bool( + not link["title"].endswith(" (disambiguation)") and link.get("templates") + ) + + +def get_article_links(enwiki: str) -> list[str]: + """Get links that appear in this article.""" + url = "https://en.wikipedia.org/w/api.php" + + params: dict[str, str | int] = link_params(enwiki) + links: set[str] = set() + + redirects = defaultdict(set) + + while True: + data = requests.get(url, params=params).json() + pages = data["query"].pop("pages") + for r in data["query"].pop("redirects"): + redirects[r["to"]].add(r["from"]) + + links.update(page["title"] for page in pages if needs_disambig(page)) + + if "continue" not in data: + break + + params["gplcontinue"] = data["continue"]["gplcontinue"] + + for link in set(links): + if link in redirects: + links.update(redirects[link]) + + return list(links) + + # return {link["title"] for link in r.json()["query"]["pages"][0]["links"]} + + +def call_parse_api(enwiki: str) -> dict[str, Any]: + """Call mediawiki parse API for given article.""" + url = "https://en.wikipedia.org/w/api.php" + + params: dict[str, str | int] = { + "action": "parse", + "format": "json", + "formatversion": 2, + "disableeditsection": 1, + "page": enwiki, + "prop": "text|links|headhtml", + "disabletoc": 1, + } + + r = requests.get(url, params=params) + parse: dict[str, Any] = r.json()["parse"] + return parse + + +def get_article_html(enwiki: str) -> str: + """Parse article wikitext and return HTML.""" + text: str = call_parse_api(enwiki)["text"] + return text + + +class DabItem(TypedDict): + """Represent a disabiguation page.""" + + num: int + title: str + html: str + + +def delete_toc(root: lxml.html.HtmlElement) -> None: + """Delete table of contents from article HTML.""" + for toc in root.findall(".//div[@class='toc']"): + toc.getparent().remove(toc) + + +def get_dab_html(dab_num: int, title: str) -> str: + """Parse dab page and rewrite links.""" + dab_html = get_article_html(title) + root = lxml.html.fromstring(dab_html) + delete_toc(root) + + element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")} + + for a in root.findall(".//a[@href]"): + href: str | None = a.get("href") + if not href: + continue + if not href.startswith("#"): + a.set("href", "#") + a.set("onclick", f"return select_dab(this, {dab_num})") + continue + + destination_element = element_id_map[href[1:]] + assert destination_element is not None + destination_element.set("id", f"{dab_num}{href[1:]}") + a.set("href", f"#{dab_num}{href[1:]}") + + html: str = lxml.html.tostring(root, encoding=str) + return html + + +class Article: + """Current article we're working on.""" + + def __init__(self, enwiki: str) -> None: + """Make a new Article object.""" + self.enwiki = enwiki + + self.links = get_article_links(enwiki) + + self.dab_list: list[DabItem] = [] + self.dab_lookup: dict[int, str] = {} + self.dab_order: list[str] = [] + self.parse: Optional[dict[str, Any]] = None + + def save_endpoint(self) -> str: + """Endpoint for saving changes.""" + href: str = flask.url_for("save", enwiki=self.enwiki.replace(" ", "_")) + return href + + def load(self) -> None: + """Load parsed 
article HTML.""" + self.parse = call_parse_api(self.enwiki) + self.root = lxml.html.fromstring(self.parse.pop("text")) + + def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]: + """Disambiguation links that need fixing.""" + seen = set() + for a in self.root.findall(".//a[@href]"): + title = a.get("title") + if title is None or title not in self.links: + continue + a.set("class", "disambig") + + if title in seen: + continue + seen.add(title) + + yield a, title + + def process_links(self) -> None: + """Process links in parsed wikitext.""" + for dab_num, (a, title) in enumerate(self.iter_links()): + a.set("id", f"dab-{dab_num}") + + dab: DabItem = { + "num": dab_num, + "title": title, + "html": get_dab_html(dab_num, title), + } + self.dab_list.append(dab) + self.dab_order.append(title) + self.dab_lookup[dab_num] = title + + def get_html(self) -> str: + """Return the processed article HTML.""" + html: str = lxml.html.tostring(self.root, encoding=str) + return html diff --git a/web_view.py b/web_view.py index 03f1ccb..e6c33d4 100755 --- a/web_view.py +++ b/web_view.py @@ -3,7 +3,6 @@ import inspect import json import re -from typing import Any, Iterator, Optional, TypedDict import flask import lxml.html @@ -13,7 +12,7 @@ from requests_oauthlib import OAuth1Session from werkzeug.debug.tbtools import get_current_traceback from werkzeug.wrappers import Response -from dab_mechanic import wikidata_oauth +from dab_mechanic import wikidata_oauth, wikipedia app = flask.Flask(__name__) app.config.from_object("config.default") @@ -89,145 +88,6 @@ def index(): return flask.render_template("index.html", articles=articles) -def call_parse_api(enwiki: str) -> dict[str, Any]: - """Call mediawiki parse API for given article.""" - url = "https://en.wikipedia.org/w/api.php" - - params: dict[str, str | int] = { - "action": "parse", - "format": "json", - "formatversion": 2, - "disableeditsection": 1, - "page": enwiki, - "prop": "text|links|headhtml", - "disabletoc": 1, - } - - r = requests.get(url, params=params) - parse: dict[str, Any] = r.json()["parse"] - return parse - - -def get_article_html(enwiki: str) -> str: - """Parse article wikitext and return HTML.""" - text: str = call_parse_api(enwiki)["text"] - return text - - -disambig_templates = [ - "Template:Disambiguation", - "Template:Airport disambiguation", - "Template:Biology disambiguation", - "Template:Call sign disambiguation", - "Template:Caselaw disambiguation", - "Template:Chinese title disambiguation", - "Template:Disambiguation cleanup", - "Template:Genus disambiguation", - "Template:Hospital disambiguation", - "Template:Human name disambiguation", - "Template:Human name disambiguation cleanup", - "Template:Letter-number combination disambiguation", - "Template:Mathematical disambiguation", - "Template:Military unit disambiguation", - "Template:Music disambiguation", - "Template:Number disambiguation", - "Template:Opus number disambiguation", - "Template:Phonetics disambiguation", - "Template:Place name disambiguation", - "Template:Portal disambiguation", - "Template:Road disambiguation", - "Template:School disambiguation", - "Template:Species Latin name abbreviation disambiguation", - "Template:Species Latin name disambiguation", - "Template:Station disambiguation", - "Template:Synagogue disambiguation", - "Template:Taxonomic authority disambiguation", - "Template:Taxonomy disambiguation", - "Template:Template disambiguation", - "Template:WoO number disambiguation", -] - - -def link_params(enwiki: str) -> dict[str, str | int]: - 
"""Parameters for finding article links from the API.""" - params: dict[str, str | int] = { - "action": "query", - "format": "json", - "formatversion": 2, - "titles": enwiki, - "generator": "links", - "gpllimit": "max", - "gplnamespace": 0, - "tllimit": "max", - "tlnamespace": 10, - "tltemplates": "|".join(disambig_templates), - "prop": "templates", - } - return params - - -def needs_disambig(link: dict[str, Any]) -> bool: - """Is this a disambiguation link.""" - return bool( - not link["title"].endswith(" (disambiguation)") and link.get("templates") - ) - - -def get_article_links(enwiki: str) -> list[str]: - """Get links that appear in this article.""" - url = "https://en.wikipedia.org/w/api.php" - - params: dict[str, str | int] = link_params(enwiki) - links: set[str] = set() - - while True: - data = requests.get(url, params=params).json() - links.update( - page["title"] for page in data["query"]["pages"] if needs_disambig(page) - ) - - if "continue" not in data: - break - - params["gplcontinue"] = data["continue"]["gplcontinue"] - - return list(links) - - # return {link["title"] for link in r.json()["query"]["pages"][0]["links"]} - - -def delete_toc(root: lxml.html.HtmlElement) -> None: - """Delete table of contents from article HTML.""" - for toc in root.findall(".//div[@class='toc']"): - toc.getparent().remove(toc) - - -def get_dab_html(dab_num: int, title: str) -> str: - """Parse dab page and rewrite links.""" - dab_html = get_article_html(title) - root = lxml.html.fromstring(dab_html) - delete_toc(root) - - element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")} - - for a in root.findall(".//a[@href]"): - href: str | None = a.get("href") - if not href: - continue - if not href.startswith("#"): - a.set("href", "#") - a.set("onclick", f"return select_dab(this, {dab_num})") - continue - - destination_element = element_id_map[href[1:]] - assert destination_element is not None - destination_element.set("id", f"{dab_num}{href[1:]}") - a.set("href", f"#{dab_num}{href[1:]}") - - html: str = lxml.html.tostring(root, encoding=str) - return html - - def make_disamb_link(edit: tuple[str, str]) -> str: """Given an edit return the appropriate link.""" return f"[[{edit[1]}|{edit[0]}]]" @@ -278,73 +138,6 @@ def save(enwiki: str) -> Response | str: ) -class DabItem(TypedDict): - """Represent a disabiguation page.""" - - num: int - title: str - html: str - - -class Article: - """Current article we're working on.""" - - def __init__(self, enwiki: str) -> None: - """Make a new Article object.""" - self.enwiki = enwiki - - self.links = get_article_links(enwiki) - - self.dab_list: list[DabItem] = [] - self.dab_lookup: dict[int, str] = {} - self.dab_order: list[str] = [] - self.parse: Optional[dict[str, Any]] = None - - def save_endpoint(self) -> str: - """Endpoint for saving changes.""" - href: str = flask.url_for("save", enwiki=self.enwiki.replace(" ", "_")) - return href - - def load(self) -> None: - """Load parsed article HTML.""" - self.parse = call_parse_api(self.enwiki) - self.root = lxml.html.fromstring(self.parse.pop("text")) - - def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]: - """Disambiguation links that need fixing.""" - seen = set() - for a in self.root.findall(".//a[@href]"): - title = a.get("title") - if title is None or title not in self.links: - continue - a.set("class", "disambig") - - if title in seen: - continue - seen.add(title) - - yield a, title - - def process_links(self) -> None: - """Process links in parsed wikitext.""" - for dab_num, (a, title) in 
enumerate(self.iter_links()): - a.set("id", f"dab-{dab_num}") - - dab: DabItem = { - "num": dab_num, - "title": title, - "html": get_dab_html(dab_num, title), - } - self.dab_list.append(dab) - self.dab_order.append(title) - self.dab_lookup[dab_num] = title - - def get_html(self) -> str: - """Return the processed article HTML.""" - html: str = lxml.html.tostring(self.root, encoding=str) - return html - - @app.route("/enwiki/") def article_page(enwiki: str) -> Response: """Article Page.""" @@ -356,7 +149,7 @@ def article_page(enwiki: str) -> Response: flask.url_for(flask.request.endpoint, enwiki=enwiki_underscore) ) - article = Article(enwiki) + article = wikipedia.Article(enwiki) article.load() article.process_links() From 82688221816f53f45f49688cad9d146d6cc45742 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 17 Aug 2022 13:46:00 +0100 Subject: [PATCH 04/10] Remove unused article list --- article_list | 50 -------------------------------------------------- 1 file changed, 50 deletions(-) delete mode 100644 article_list diff --git a/article_list b/article_list deleted file mode 100644 index 7f4bcaf..0000000 --- a/article_list +++ /dev/null @@ -1,50 +0,0 @@ -Rail transport in Indonesia -SchleFaZ -Chicago Bulls -Orwell Prize -List of fatal victims of the September 11 attacks -Arabic exonyms -Canadian Alpine Ski Championships -Method Man filmography -Popular Union -The Cantos -Unisex name -United States Alpine Ski Championships -AS Kaloum Star -Akademi Fantasia (season 1) -Athletics at the 2022 Bolivarian Games -I Love the 2000s -Kununokuni -List of Wisin & Yandel collaborations -List of comics based on films -List of programs broadcast by Asianet -Urban Hymns -1979 Sydney City FC season -2007 in Spanish television -2022 World Athletics U20 Championships – Men's 4 × 100 metres relay -A2 autostrada (Poland) -Black to the Future (TV series) -Chandel (Rajput clan) -County of Isenburg -Dinka people -Dwayne McDuffie Award for Diversity in Comics -FTSE Italia Mid Cap -Globoplay -Index of Armenia-related articles -List of Equinox episodes -List of Indian monarchs -List of Italian exonyms in Dalmatia -List of Ultimate Marvel characters -List of cities with historical German exonyms -List of jötnar in Norse mythology -List of language families -List of people with surname Davis -List of political parties in Venezuela -List of programmes broadcast by HTV -Paul (given name) -Principality of Lippe -Propaganda in Russia -Qazi Ghulam Mustafa -Redfern Now -Roy Orbison/The Beatles Tour -Royal Birmingham Conservatoire From c2b3d22e451daf22e4ba774ea0358826a9ee779b Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 17 Aug 2022 13:54:16 +0100 Subject: [PATCH 05/10] Add CSS for error page --- static/css/exception.css | 78 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 static/css/exception.css diff --git a/static/css/exception.css b/static/css/exception.css new file mode 100644 index 0000000..1f141c5 --- /dev/null +++ b/static/css/exception.css @@ -0,0 +1,78 @@ +div.debugger { text-align: left; padding: 12px; margin: auto; + background-color: white; } +div.detail { cursor: pointer; } +div.detail p { margin: 0 0 8px 13px; font-size: 14px; white-space: pre-wrap; + font-family: monospace; } +div.explanation { margin: 20px 13px; font-size: 15px; color: #555; } +div.footer { font-size: 13px; text-align: right; margin: 30px 0; + color: #86989B; } + +h2 { font-size: 16px; margin: 1.3em 0 0.0 0; padding: 9px; + background-color: #11557C; color: white; } +h2 em, h3 em 
{ font-style: normal; color: #A5D6D9; font-weight: normal; } + +div.traceback, div.plain { border: 1px solid #ddd; margin: 0 0 1em 0; padding: 10px; } +div.plain p { margin: 0; } +div.plain textarea, +div.plain pre { margin: 10px 0 0 0; padding: 4px; + background-color: #E8EFF0; border: 1px solid #D3E7E9; } +div.plain textarea { width: 99%; height: 300px; } +div.traceback h3 { font-size: 1em; margin: 0 0 0.8em 0; } +div.traceback ul { list-style: none; margin: 0; padding: 0 0 0 1em; } +div.traceback h4 { font-size: 13px; font-weight: normal; margin: 0.7em 0 0.1em 0; } +div.traceback pre { margin: 0; padding: 5px 0 3px 15px; + background-color: #E8EFF0; border: 1px solid #D3E7E9; } +div.traceback .library .current { background: white; color: #555; } +div.traceback .expanded .current { background: #E8EFF0; color: black; } +div.traceback pre:hover { background-color: #DDECEE; color: black; cursor: pointer; } +div.traceback div.source.expanded pre + pre { border-top: none; } + +div.traceback span.ws { display: none; } +div.traceback pre.before, div.traceback pre.after { display: none; background: white; } +div.traceback div.source.expanded pre.before, +div.traceback div.source.expanded pre.after { + display: block; +} + +div.traceback div.source.expanded span.ws { + display: inline; +} + +div.traceback blockquote { margin: 1em 0 0 0; padding: 0; white-space: pre-line; } +div.traceback img { float: right; padding: 2px; margin: -3px 2px 0 0; display: none; } +div.traceback img:hover { background-color: #ddd; cursor: pointer; + border-color: #BFDDE0; } +div.traceback pre:hover img { display: block; } +div.traceback cite.filename { font-style: normal; color: #3B666B; } + +pre.console { border: 1px solid #ccc; background: white!important; + color: black; padding: 5px!important; + margin: 3px 0 0 0!important; cursor: default!important; + max-height: 400px; overflow: auto; } +pre.console form { color: #555; } +pre.console input { background-color: transparent; color: #555; + width: 90%; font-family: 'Consolas', 'Deja Vu Sans Mono', + 'Bitstream Vera Sans Mono', monospace; font-size: 14px; + border: none!important; } + +span.string { color: #30799B; } +span.number { color: #9C1A1C; } +span.help { color: #3A7734; } +span.object { color: #485F6E; } +span.extended { opacity: 0.5; } +span.extended:hover { opacity: 1; } +a.toggle { text-decoration: none; background-repeat: no-repeat; + background-position: center center; + background-image: url(?__debugger__=yes&cmd=resource&f=more.png); } +a.toggle:hover { background-color: #444; } +a.open { background-image: url(?__debugger__=yes&cmd=resource&f=less.png); } + +div.traceback pre, div.console pre { + white-space: pre-wrap; /* css-3 should we be so lucky... */ + white-space: -moz-pre-wrap; /* Mozilla, since 1999 */ + white-space: -pre-wrap; /* Opera 4-6 ?? */ + white-space: -o-pre-wrap; /* Opera 7 ?? 
*/ + word-wrap: break-word; /* Internet Explorer 5.5+ */ + _white-space: pre; /* IE only hack to re-specify in + addition to word-wrap */ +} From 4e1ad4efbc523ca73ab8c433811aabd546218bc7 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 17 Aug 2022 13:58:17 +0100 Subject: [PATCH 06/10] add gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bee8a64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__ From 5f8900a47abf4bf28e122d3b943bee16b62cc3c2 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 17 Aug 2022 14:38:30 +0100 Subject: [PATCH 07/10] Add User-Agent to mediawiki API calls --- dab_mechanic/mediawiki_api.py | 48 +++++++++++++++++++++++++++++++++++ dab_mechanic/wikipedia.py | 29 ++++----------------- web_view.py | 19 ++------------ 3 files changed, 55 insertions(+), 41 deletions(-) create mode 100644 dab_mechanic/mediawiki_api.py diff --git a/dab_mechanic/mediawiki_api.py b/dab_mechanic/mediawiki_api.py new file mode 100644 index 0000000..0196207 --- /dev/null +++ b/dab_mechanic/mediawiki_api.py @@ -0,0 +1,48 @@ +"""Interface with the mediawiki API.""" + +from typing import Any + +import requests + +wiki_hostname = "en.wikipedia.org" +wiki_api_php = f"https://{wiki_hostname}/w/api.php" +user_agent = "dab-mechanic/0.1" + + +def parse_page(enwiki: str) -> dict[str, Any]: + """Call mediawiki parse API for given article.""" + params: dict[str, str | int] = { + "action": "parse", + "format": "json", + "formatversion": 2, + "disableeditsection": 1, + "page": enwiki, + "prop": "text|links|headhtml", + "disabletoc": 1, + } + + parse: dict[str, Any] = get(params)["parse"] + return parse + + +def get(params: dict[str, str | int]) -> dict[str, Any]: + """Make GET request to mediawiki API.""" + data: dict[str, Any] = requests.get( + wiki_api_php, headers={"User-Agent": user_agent}, params=params + ).json() + return data + + +def get_content(title: str) -> str: + """Get article text.""" + params: dict[str, str | int] = { + "action": "query", + "format": "json", + "formatversion": 2, + "prop": "revisions|info", + "rvprop": "content|timestamp", + "titles": title, + } + data = get(params) + rev: str = data["query"]["pages"][0]["revisions"][0]["content"] + return rev diff --git a/dab_mechanic/wikipedia.py b/dab_mechanic/wikipedia.py index 3b59278..844508f 100644 --- a/dab_mechanic/wikipedia.py +++ b/dab_mechanic/wikipedia.py @@ -3,7 +3,8 @@ from typing import Any, Iterator, Optional, TypedDict import flask import lxml.html -import requests + +from . 
import mediawiki_api disambig_templates = [ "Template:Disambiguation", @@ -67,7 +68,6 @@ def needs_disambig(link: dict[str, Any]) -> bool: def get_article_links(enwiki: str) -> list[str]: """Get links that appear in this article.""" - url = "https://en.wikipedia.org/w/api.php" params: dict[str, str | int] = link_params(enwiki) links: set[str] = set() @@ -75,7 +75,7 @@ def get_article_links(enwiki: str) -> list[str]: redirects = defaultdict(set) while True: - data = requests.get(url, params=params).json() + data = mediawiki_api.get(params) pages = data["query"].pop("pages") for r in data["query"].pop("redirects"): redirects[r["to"]].add(r["from"]) @@ -96,28 +96,9 @@ def get_article_links(enwiki: str) -> list[str]: # return {link["title"] for link in r.json()["query"]["pages"][0]["links"]} -def call_parse_api(enwiki: str) -> dict[str, Any]: - """Call mediawiki parse API for given article.""" - url = "https://en.wikipedia.org/w/api.php" - - params: dict[str, str | int] = { - "action": "parse", - "format": "json", - "formatversion": 2, - "disableeditsection": 1, - "page": enwiki, - "prop": "text|links|headhtml", - "disabletoc": 1, - } - - r = requests.get(url, params=params) - parse: dict[str, Any] = r.json()["parse"] - return parse - - def get_article_html(enwiki: str) -> str: """Parse article wikitext and return HTML.""" - text: str = call_parse_api(enwiki)["text"] + text: str = mediawiki_api.parse_page(enwiki)["text"] return text @@ -182,7 +163,7 @@ class Article: def load(self) -> None: """Load parsed article HTML.""" - self.parse = call_parse_api(self.enwiki) + self.parse = mediawiki_api.parse_page(self.enwiki) self.root = lxml.html.fromstring(self.parse.pop("text")) def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]: diff --git a/web_view.py b/web_view.py index e6c33d4..cac9233 100755 --- a/web_view.py +++ b/web_view.py @@ -12,7 +12,7 @@ from requests_oauthlib import OAuth1Session from werkzeug.debug.tbtools import get_current_traceback from werkzeug.wrappers import Response -from dab_mechanic import wikidata_oauth, wikipedia +from dab_mechanic import mediawiki_api, wikidata_oauth, wikipedia app = flask.Flask(__name__) app.config.from_object("config.default") @@ -47,21 +47,6 @@ def exception_handler(e): ) -def get_content(title: str) -> str: - """Get article text.""" - params: dict[str, str | int] = { - "action": "query", - "format": "json", - "formatversion": 2, - "prop": "revisions|info", - "rvprop": "content|timestamp", - "titles": title, - } - data = requests.get(wiki_api_php, params=params).json() - rev: str = data["query"]["pages"][0]["revisions"][0]["content"] - return rev - - def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, int]]: """Parse Articles With Multiple Dablinks.""" articles = [] @@ -127,7 +112,7 @@ def save(enwiki: str) -> Response | str: edit_summary = f"Disambiguate {titles} using [[User:Edward/Dab mechanic]]" - article_text = apply_edits(get_content(enwiki), edits) + article_text = apply_edits(mediawiki_api.get_content(enwiki), edits) return flask.render_template( "save.html", From b1f402e1f9cb273d6603fa229441c3c84779ba3a Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 17 Aug 2022 14:48:20 +0100 Subject: [PATCH 08/10] refactor --- dab_mechanic/wikipedia.py | 2 +- web_view.py | 22 +++++++++++++++------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/dab_mechanic/wikipedia.py b/dab_mechanic/wikipedia.py index 844508f..cf9510f 100644 --- a/dab_mechanic/wikipedia.py +++ b/dab_mechanic/wikipedia.py @@ -147,7 
+147,7 @@ class Article: def __init__(self, enwiki: str) -> None: """Make a new Article object.""" - self.enwiki = enwiki + self.enwiki = enwiki.replace("_", " ") self.links = get_article_links(enwiki) diff --git a/web_view.py b/web_view.py index cac9233..c50fa7d 100755 --- a/web_view.py +++ b/web_view.py @@ -3,6 +3,7 @@ import inspect import json import re +from typing import Optional import flask import lxml.html @@ -123,16 +124,23 @@ def save(enwiki: str) -> Response | str: ) +def redirect_if_needed(enwiki: str) -> Optional[Response]: + """Check if there are spaces in the article name and redirect.""" + return ( + flask.redirect( + flask.url_for(flask.request.endpoint, enwiki=enwiki.replace(" ", "_")) + ) + if " " in enwiki + else None + ) + + @app.route("/enwiki/") def article_page(enwiki: str) -> Response: """Article Page.""" - enwiki_orig = enwiki - enwiki = enwiki.replace("_", " ") - enwiki_underscore = enwiki.replace(" ", "_") - if " " in enwiki_orig: - return flask.redirect( - flask.url_for(flask.request.endpoint, enwiki=enwiki_underscore) - ) + redirect = redirect_if_needed(enwiki) + if redirect: + return redirect article = wikipedia.Article(enwiki) article.load() From e85cefbc2f302afdfd68567eb404a6e56ad4759d Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 17 Aug 2022 20:04:43 +0100 Subject: [PATCH 09/10] Make mediawiki API calls via OAuth The API had a timeout problem. Maybe this fixes it. --- dab_mechanic/mediawiki_api.py | 15 ++++++--------- dab_mechanic/wikidata_oauth.py | 7 +++---- dab_mechanic/wikipedia.py | 7 ++++++- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/dab_mechanic/mediawiki_api.py b/dab_mechanic/mediawiki_api.py index 0196207..26d7a20 100644 --- a/dab_mechanic/mediawiki_api.py +++ b/dab_mechanic/mediawiki_api.py @@ -1,8 +1,7 @@ """Interface with the mediawiki API.""" from typing import Any - -import requests +from . 
import wikidata_oauth

 wiki_hostname = "en.wikipedia.org"
 wiki_api_php = f"https://{wiki_hostname}/w/api.php"
 user_agent = "dab-mechanic/0.1"
@@ -21,16 +20,14 @@ def parse_page(enwiki: str) -> dict[str, Any]:
         "disabletoc": 1,
     }

-    parse: dict[str, Any] = get(params)["parse"]
+    parse: dict[str, Any] = call(params)["parse"]
     return parse


-def get(params: dict[str, str | int]) -> dict[str, Any]:
+def call(params: dict[str, str | int]) -> dict[str, Any]:
     """Make GET request to mediawiki API."""
-    data: dict[str, Any] = requests.get(
-        wiki_api_php, headers={"User-Agent": user_agent}, params=params
-    ).json()
-    return data
+    data: dict[str, Any] = wikidata_oauth.api_post_request(params)
+    return data.json()


 def get_content(title: str) -> str:
     """Get article text."""
@@ -43,6 +40,6 @@ def get_content(title: str) -> str:
         "rvprop": "content|timestamp",
         "titles": title,
     }
-    data = get(params)
+    data = call(params)
     rev: str = data["query"]["pages"][0]["revisions"][0]["content"]
     return rev
diff --git a/dab_mechanic/wikidata_oauth.py b/dab_mechanic/wikidata_oauth.py
index dca0707..5af0976 100644
--- a/dab_mechanic/wikidata_oauth.py
+++ b/dab_mechanic/wikidata_oauth.py
@@ -19,7 +19,6 @@ def get_edit_proxy() -> dict[str, str]:
 def api_post_request(params: dict[str, str | int]):
     """HTTP Post using Oauth."""
     app = current_app
-    url = "https://www.wikidata.org/w/api.php"
     client_key = app.config["CLIENT_KEY"]
     client_secret = app.config["CLIENT_SECRET"]
     oauth = OAuth1Session(
@@ -29,12 +28,12 @@ def api_post_request(params: dict[str, str | int]):
         resource_owner_secret=session["owner_secret"],
     )
     proxies = get_edit_proxy()
-    return oauth.post(url, data=params, timeout=4, proxies=proxies)
+    return oauth.post(api_url, data=params, timeout=10, proxies=proxies)


 def raw_request(params):
     app = current_app
-    url = "https://www.wikidata.org/w/api.php?" + urlencode(params)
+    url = api_url + "?" + urlencode(params)
     client_key = app.config["CLIENT_KEY"]
     client_secret = app.config["CLIENT_SECRET"]
     oauth = OAuth1Session(
@@ -44,7 +43,7 @@ def raw_request(params):
         resource_owner_secret=session["owner_secret"],
     )
     proxies = get_edit_proxy()
-    return oauth.get(url, timeout=4, proxies=proxies)
+    return oauth.get(url, timeout=10, proxies=proxies)


 def api_request(params):
diff --git a/dab_mechanic/wikipedia.py b/dab_mechanic/wikipedia.py
index cf9510f..57c03c4 100644
--- a/dab_mechanic/wikipedia.py
+++ b/dab_mechanic/wikipedia.py
@@ -5,6 +5,8 @@ import flask
 import lxml.html

 from . import mediawiki_api
+from pprint import pprint
+from time import sleep

 disambig_templates = [
     "Template:Disambiguation",
@@ -75,7 +77,9 @@ def get_article_links(enwiki: str) -> list[str]:
     redirects = defaultdict(set)

     while True:
-        data = mediawiki_api.get(params)
+        data = mediawiki_api.call(params)
+        if "query" not in data:
+            pprint(data)
         pages = data["query"].pop("pages")
         for r in data["query"].pop("redirects"):
             redirects[r["to"]].add(r["from"])
@@ -86,6 +90,7 @@ def get_article_links(enwiki: str) -> list[str]:
             break

         params["gplcontinue"] = data["continue"]["gplcontinue"]
+        sleep(0.1)

     for link in set(links):
         if link in redirects:
             links.update(redirects[link])

From 4d175c8733b043a51f35d09cf11a4d92de34b498 Mon Sep 17 00:00:00 2001
From: Edward Betts
Date: Wed, 17 Aug 2022 20:06:36 +0100
Subject: [PATCH 10/10] Turn off app.debug so error pages work.
--- web_view.py | 1 - 1 file changed, 1 deletion(-) diff --git a/web_view.py b/web_view.py index c50fa7d..2730f23 100755 --- a/web_view.py +++ b/web_view.py @@ -17,7 +17,6 @@ from dab_mechanic import mediawiki_api, wikidata_oauth, wikipedia app = flask.Flask(__name__) app.config.from_object("config.default") -app.debug = True wiki_hostname = "en.wikipedia.org" wiki_api_php = f"https://{wiki_hostname}/w/api.php"
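
A quick end-to-end check for the series: the sketch below exercises the new dab_mechanic.mediawiki_api module directly. It is illustrative only, and assumes the module as it stands after patch 07, when requests are still plain HTTP with a User-Agent header; after patch 09 the helper is renamed from get() to call() and routed through wikidata_oauth, so it then needs a Flask request context with OAuth tokens in the session. The article title is an arbitrary pick from the list removed in patch 04.

# Illustrative smoke test; assumes the module as of patch 07 (plain HTTP,
# before the OAuth switch in patch 09).
from dab_mechanic import mediawiki_api

# parse_page() wraps action=parse and returns the "parse" object,
# including the rendered "text" plus "links" and "headhtml".
parse = mediawiki_api.parse_page("Unisex name")
print(parse["text"][:80])

# get_content() wraps action=query&prop=revisions|info and returns
# the wikitext of the article's latest revision.
wikitext = mediawiki_api.get_content("Unisex name")
print(wikitext[:80])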