diff --git a/dab_mechanic/mediawiki_api.py b/dab_mechanic/mediawiki_api.py new file mode 100644 index 0000000..0196207 --- /dev/null +++ b/dab_mechanic/mediawiki_api.py @@ -0,0 +1,48 @@ +"""Interface with the mediawiki API.""" + +from typing import Any + +import requests + +wiki_hostname = "en.wikipedia.org" +wiki_api_php = f"https://{wiki_hostname}/w/api.php" +user_agent = "dab-mechanic/0.1" + + +def parse_page(enwiki: str) -> dict[str, Any]: + """Call mediawiki parse API for given article.""" + params: dict[str, str | int] = { + "action": "parse", + "format": "json", + "formatversion": 2, + "disableeditsection": 1, + "page": enwiki, + "prop": "text|links|headhtml", + "disabletoc": 1, + } + + parse: dict[str, Any] = get(params)["parse"] + return parse + + +def get(params: dict[str, str | int]) -> dict[str, Any]: + """Make GET request to mediawiki API.""" + data: dict[str, Any] = requests.get( + wiki_api_php, headers={"User-Agent": user_agent}, params=params + ).json() + return data + + +def get_content(title: str) -> str: + """Get article text.""" + params: dict[str, str | int] = { + "action": "query", + "format": "json", + "formatversion": 2, + "prop": "revisions|info", + "rvprop": "content|timestamp", + "titles": title, + } + data = get(params) + rev: str = data["query"]["pages"][0]["revisions"][0]["content"] + return rev diff --git a/dab_mechanic/wikipedia.py b/dab_mechanic/wikipedia.py index 3b59278..844508f 100644 --- a/dab_mechanic/wikipedia.py +++ b/dab_mechanic/wikipedia.py @@ -3,7 +3,8 @@ from typing import Any, Iterator, Optional, TypedDict import flask import lxml.html -import requests + +from . import mediawiki_api disambig_templates = [ "Template:Disambiguation", @@ -67,7 +68,6 @@ def needs_disambig(link: dict[str, Any]) -> bool: def get_article_links(enwiki: str) -> list[str]: """Get links that appear in this article.""" - url = "https://en.wikipedia.org/w/api.php" params: dict[str, str | int] = link_params(enwiki) links: set[str] = set() @@ -75,7 +75,7 @@ def get_article_links(enwiki: str) -> list[str]: redirects = defaultdict(set) while True: - data = requests.get(url, params=params).json() + data = mediawiki_api.get(params) pages = data["query"].pop("pages") for r in data["query"].pop("redirects"): redirects[r["to"]].add(r["from"]) @@ -96,28 +96,9 @@ def get_article_links(enwiki: str) -> list[str]: # return {link["title"] for link in r.json()["query"]["pages"][0]["links"]} -def call_parse_api(enwiki: str) -> dict[str, Any]: - """Call mediawiki parse API for given article.""" - url = "https://en.wikipedia.org/w/api.php" - - params: dict[str, str | int] = { - "action": "parse", - "format": "json", - "formatversion": 2, - "disableeditsection": 1, - "page": enwiki, - "prop": "text|links|headhtml", - "disabletoc": 1, - } - - r = requests.get(url, params=params) - parse: dict[str, Any] = r.json()["parse"] - return parse - - def get_article_html(enwiki: str) -> str: """Parse article wikitext and return HTML.""" - text: str = call_parse_api(enwiki)["text"] + text: str = mediawiki_api.parse_page(enwiki)["text"] return text @@ -182,7 +163,7 @@ class Article: def load(self) -> None: """Load parsed article HTML.""" - self.parse = call_parse_api(self.enwiki) + self.parse = mediawiki_api.parse_page(self.enwiki) self.root = lxml.html.fromstring(self.parse.pop("text")) def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]: diff --git a/web_view.py b/web_view.py index e6c33d4..cac9233 100755 --- a/web_view.py +++ b/web_view.py @@ -12,7 +12,7 @@ from requests_oauthlib import OAuth1Session from werkzeug.debug.tbtools import get_current_traceback from werkzeug.wrappers import Response -from dab_mechanic import wikidata_oauth, wikipedia +from dab_mechanic import mediawiki_api, wikidata_oauth, wikipedia app = flask.Flask(__name__) app.config.from_object("config.default") @@ -47,21 +47,6 @@ def exception_handler(e): ) -def get_content(title: str) -> str: - """Get article text.""" - params: dict[str, str | int] = { - "action": "query", - "format": "json", - "formatversion": 2, - "prop": "revisions|info", - "rvprop": "content|timestamp", - "titles": title, - } - data = requests.get(wiki_api_php, params=params).json() - rev: str = data["query"]["pages"][0]["revisions"][0]["content"] - return rev - - def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, int]]: """Parse Articles With Multiple Dablinks.""" articles = [] @@ -127,7 +112,7 @@ def save(enwiki: str) -> Response | str: edit_summary = f"Disambiguate {titles} using [[User:Edward/Dab mechanic]]" - article_text = apply_edits(get_content(enwiki), edits) + article_text = apply_edits(mediawiki_api.get_content(enwiki), edits) return flask.render_template( "save.html",