Add User-Agent to mediawiki API calls

This commit is contained in:
Edward Betts 2022-08-17 14:38:30 +01:00
parent 4e1ad4efbc
commit 5f8900a47a
3 changed files with 55 additions and 41 deletions

View file

@ -0,0 +1,48 @@
"""Interface with the mediawiki API."""
from typing import Any
import requests
# Hostname of the wiki whose API this module talks to.
wiki_hostname = "en.wikipedia.org"
# Full URL of the api.php endpoint on that host; get() sends its requests here.
wiki_api_php = f"https://{wiki_hostname}/w/api.php"
# Value sent in the User-Agent header of every request made by get().
user_agent = "dab-mechanic/0.1"
def parse_page(enwiki: str) -> dict[str, Any]:
    """Run the mediawiki ``action=parse`` API for one article.

    Args:
        enwiki: Title of the article, passed to the API as ``page``.

    Returns:
        The ``"parse"`` member of the JSON response. The request asks for the
        ``text``, ``links`` and ``headhtml`` properties, with section-edit
        links and the table of contents disabled.

    Raises:
        KeyError: The response has no ``"parse"`` member.
    """
    query: dict[str, str | int] = {
        "action": "parse",
        "format": "json",
        "formatversion": 2,
        "disableeditsection": 1,
        "page": enwiki,
        "prop": "text|links|headhtml",
        "disabletoc": 1,
    }
    reply = get(query)
    parsed: dict[str, Any] = reply["parse"]
    return parsed
def get(params: dict[str, str | int], *, timeout: float = 30.0) -> dict[str, Any]:
    """Make GET request to mediawiki API.

    Args:
        params: Query-string parameters to send to ``wiki_api_php``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` would wait indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.Timeout: No response arrived within ``timeout``.
        requests.exceptions.HTTPError: The server answered with a 4xx or 5xx
            status code.
    """
    response = requests.get(
        wiki_api_php,
        headers={"User-Agent": user_agent},
        params=params,
        timeout=timeout,
    )
    # Surface HTTP failures as HTTPError instead of letting a non-JSON error
    # page fail later inside .json() with a less helpful message.
    response.raise_for_status()
    data: dict[str, Any] = response.json()
    return data
def get_content(title: str) -> str:
    """Fetch the text of an article through the mediawiki query API.

    Args:
        title: Article title, passed to the API as ``titles``.

    Returns:
        The ``content`` of the first revision listed for the first page in
        the response.

    Raises:
        KeyError: An expected member (such as ``"revisions"``) is absent from
            the response.
        IndexError: The response lists no pages or no revisions.
    """
    query: dict[str, str | int] = {
        "action": "query",
        "format": "json",
        "formatversion": 2,
        "prop": "revisions|info",
        "rvprop": "content|timestamp",
        "titles": title,
    }
    reply = get(query)
    # Only one title is requested, so only the first page entry is read.
    first_page = reply["query"]["pages"][0]
    first_revision = first_page["revisions"][0]
    text: str = first_revision["content"]
    return text

View file

@ -3,7 +3,8 @@ from typing import Any, Iterator, Optional, TypedDict
import flask import flask
import lxml.html import lxml.html
import requests
from . import mediawiki_api
disambig_templates = [ disambig_templates = [
"Template:Disambiguation", "Template:Disambiguation",
@ -67,7 +68,6 @@ def needs_disambig(link: dict[str, Any]) -> bool:
def get_article_links(enwiki: str) -> list[str]: def get_article_links(enwiki: str) -> list[str]:
"""Get links that appear in this article.""" """Get links that appear in this article."""
url = "https://en.wikipedia.org/w/api.php"
params: dict[str, str | int] = link_params(enwiki) params: dict[str, str | int] = link_params(enwiki)
links: set[str] = set() links: set[str] = set()
@ -75,7 +75,7 @@ def get_article_links(enwiki: str) -> list[str]:
redirects = defaultdict(set) redirects = defaultdict(set)
while True: while True:
data = requests.get(url, params=params).json() data = mediawiki_api.get(params)
pages = data["query"].pop("pages") pages = data["query"].pop("pages")
for r in data["query"].pop("redirects"): for r in data["query"].pop("redirects"):
redirects[r["to"]].add(r["from"]) redirects[r["to"]].add(r["from"])
@ -96,28 +96,9 @@ def get_article_links(enwiki: str) -> list[str]:
# return {link["title"] for link in r.json()["query"]["pages"][0]["links"]} # return {link["title"] for link in r.json()["query"]["pages"][0]["links"]}
def call_parse_api(enwiki: str) -> dict[str, Any]:
"""Call mediawiki parse API for given article."""
url = "https://en.wikipedia.org/w/api.php"
params: dict[str, str | int] = {
"action": "parse",
"format": "json",
"formatversion": 2,
"disableeditsection": 1,
"page": enwiki,
"prop": "text|links|headhtml",
"disabletoc": 1,
}
r = requests.get(url, params=params)
parse: dict[str, Any] = r.json()["parse"]
return parse
def get_article_html(enwiki: str) -> str: def get_article_html(enwiki: str) -> str:
"""Parse article wikitext and return HTML.""" """Parse article wikitext and return HTML."""
text: str = call_parse_api(enwiki)["text"] text: str = mediawiki_api.parse_page(enwiki)["text"]
return text return text
@ -182,7 +163,7 @@ class Article:
def load(self) -> None: def load(self) -> None:
"""Load parsed article HTML.""" """Load parsed article HTML."""
self.parse = call_parse_api(self.enwiki) self.parse = mediawiki_api.parse_page(self.enwiki)
self.root = lxml.html.fromstring(self.parse.pop("text")) self.root = lxml.html.fromstring(self.parse.pop("text"))
def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]: def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]:

View file

@ -12,7 +12,7 @@ from requests_oauthlib import OAuth1Session
from werkzeug.debug.tbtools import get_current_traceback from werkzeug.debug.tbtools import get_current_traceback
from werkzeug.wrappers import Response from werkzeug.wrappers import Response
from dab_mechanic import wikidata_oauth, wikipedia from dab_mechanic import mediawiki_api, wikidata_oauth, wikipedia
app = flask.Flask(__name__) app = flask.Flask(__name__)
app.config.from_object("config.default") app.config.from_object("config.default")
@ -47,21 +47,6 @@ def exception_handler(e):
) )
def get_content(title: str) -> str:
"""Get article text."""
params: dict[str, str | int] = {
"action": "query",
"format": "json",
"formatversion": 2,
"prop": "revisions|info",
"rvprop": "content|timestamp",
"titles": title,
}
data = requests.get(wiki_api_php, params=params).json()
rev: str = data["query"]["pages"][0]["revisions"][0]["content"]
return rev
def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, int]]: def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, int]]:
"""Parse Articles With Multiple Dablinks.""" """Parse Articles With Multiple Dablinks."""
articles = [] articles = []
@ -127,7 +112,7 @@ def save(enwiki: str) -> Response | str:
edit_summary = f"Disambiguate {titles} using [[User:Edward/Dab mechanic]]" edit_summary = f"Disambiguate {titles} using [[User:Edward/Dab mechanic]]"
article_text = apply_edits(get_content(enwiki), edits) article_text = apply_edits(mediawiki_api.get_content(enwiki), edits)
return flask.render_template( return flask.render_template(
"save.html", "save.html",