Add User-Agent to mediawiki API calls
This commit is contained in:
parent
4e1ad4efbc
commit
5f8900a47a
48
dab_mechanic/mediawiki_api.py
Normal file
48
dab_mechanic/mediawiki_api.py
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
"""Interface with the mediawiki API."""
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Hostname of the wiki whose API this module talks to.
wiki_hostname = "en.wikipedia.org"
|
||||||
|
# URL of the api.php endpoint on that host; get() sends its requests here.
wiki_api_php = f"https://{wiki_hostname}/w/api.php"
|
||||||
|
# Value get() sends in the User-Agent header of each request.
user_agent = "dab-mechanic/0.1"
|
||||||
|
|
||||||
|
|
||||||
|
def parse_page(enwiki: str) -> dict[str, Any]:
    """Run the mediawiki ``parse`` action on an article.

    Requests the ``text``, ``links`` and ``headhtml`` properties for the page
    named by *enwiki*, with section edit links and the table of contents
    disabled, and returns the ``"parse"`` member of the JSON reply.

    Raises ``KeyError`` if the reply has no ``"parse"`` member.
    """
    query: dict[str, str | int] = dict(
        action="parse",
        format="json",
        formatversion=2,
        disableeditsection=1,
        page=enwiki,
        prop="text|links|headhtml",
        disabletoc=1,
    )

    reply = get(query)
    parsed: dict[str, Any] = reply["parse"]
    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
def get(params: dict[str, str | int], *, timeout: float = 30.0) -> dict[str, Any]:
    """Make GET request to mediawiki API.

    Sends *params* as the query string to ``wiki_api_php`` with this tool's
    ``User-Agent`` header and returns the decoded JSON body.

    Args:
        params: Query-string parameters for the API call.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Raises:
        requests.exceptions.Timeout: The server did not respond in time.
        requests.exceptions.HTTPError: The server replied with a 4xx/5xx status.
    """
    response = requests.get(
        wiki_api_php,
        headers={"User-Agent": user_agent},
        params=params,
        timeout=timeout,
    )
    # Fail with the HTTP status rather than a confusing JSON decode error
    # when the server returns an error page instead of an API reply.
    response.raise_for_status()
    data: dict[str, Any] = response.json()
    return data
|
||||||
|
|
||||||
|
|
||||||
|
def get_content(title: str) -> str:
    """Fetch the text of an article through the mediawiki ``query`` action.

    Asks for revision content and timestamp for *title* and returns the
    ``"content"`` field of the first revision of the first page in the reply.

    Raises ``KeyError`` or ``IndexError`` if the reply lacks that structure.
    """
    query: dict[str, str | int] = dict(
        action="query",
        format="json",
        formatversion=2,
        prop="revisions|info",
        rvprop="content|timestamp",
        titles=title,
    )
    pages = get(query)["query"]["pages"]
    first_revision = pages[0]["revisions"][0]
    text: str = first_revision["content"]
    return text
|
|
@ -3,7 +3,8 @@ from typing import Any, Iterator, Optional, TypedDict
|
||||||
|
|
||||||
import flask
|
import flask
|
||||||
import lxml.html
|
import lxml.html
|
||||||
import requests
|
|
||||||
|
from . import mediawiki_api
|
||||||
|
|
||||||
disambig_templates = [
|
disambig_templates = [
|
||||||
"Template:Disambiguation",
|
"Template:Disambiguation",
|
||||||
|
@ -67,7 +68,6 @@ def needs_disambig(link: dict[str, Any]) -> bool:
|
||||||
|
|
||||||
def get_article_links(enwiki: str) -> list[str]:
|
def get_article_links(enwiki: str) -> list[str]:
|
||||||
"""Get links that appear in this article."""
|
"""Get links that appear in this article."""
|
||||||
url = "https://en.wikipedia.org/w/api.php"
|
|
||||||
|
|
||||||
params: dict[str, str | int] = link_params(enwiki)
|
params: dict[str, str | int] = link_params(enwiki)
|
||||||
links: set[str] = set()
|
links: set[str] = set()
|
||||||
|
@ -75,7 +75,7 @@ def get_article_links(enwiki: str) -> list[str]:
|
||||||
redirects = defaultdict(set)
|
redirects = defaultdict(set)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
data = requests.get(url, params=params).json()
|
data = mediawiki_api.get(params)
|
||||||
pages = data["query"].pop("pages")
|
pages = data["query"].pop("pages")
|
||||||
for r in data["query"].pop("redirects"):
|
for r in data["query"].pop("redirects"):
|
||||||
redirects[r["to"]].add(r["from"])
|
redirects[r["to"]].add(r["from"])
|
||||||
|
@ -96,28 +96,9 @@ def get_article_links(enwiki: str) -> list[str]:
|
||||||
# return {link["title"] for link in r.json()["query"]["pages"][0]["links"]}
|
# return {link["title"] for link in r.json()["query"]["pages"][0]["links"]}
|
||||||
|
|
||||||
|
|
||||||
def call_parse_api(enwiki: str) -> dict[str, Any]:
|
|
||||||
"""Call mediawiki parse API for given article."""
|
|
||||||
url = "https://en.wikipedia.org/w/api.php"
|
|
||||||
|
|
||||||
params: dict[str, str | int] = {
|
|
||||||
"action": "parse",
|
|
||||||
"format": "json",
|
|
||||||
"formatversion": 2,
|
|
||||||
"disableeditsection": 1,
|
|
||||||
"page": enwiki,
|
|
||||||
"prop": "text|links|headhtml",
|
|
||||||
"disabletoc": 1,
|
|
||||||
}
|
|
||||||
|
|
||||||
r = requests.get(url, params=params)
|
|
||||||
parse: dict[str, Any] = r.json()["parse"]
|
|
||||||
return parse
|
|
||||||
|
|
||||||
|
|
||||||
def get_article_html(enwiki: str) -> str:
|
def get_article_html(enwiki: str) -> str:
|
||||||
"""Parse article wikitext and return HTML."""
|
"""Parse article wikitext and return HTML."""
|
||||||
text: str = call_parse_api(enwiki)["text"]
|
text: str = mediawiki_api.parse_page(enwiki)["text"]
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
@ -182,7 +163,7 @@ class Article:
|
||||||
|
|
||||||
def load(self) -> None:
|
def load(self) -> None:
|
||||||
"""Load parsed article HTML."""
|
"""Load parsed article HTML."""
|
||||||
self.parse = call_parse_api(self.enwiki)
|
self.parse = mediawiki_api.parse_page(self.enwiki)
|
||||||
self.root = lxml.html.fromstring(self.parse.pop("text"))
|
self.root = lxml.html.fromstring(self.parse.pop("text"))
|
||||||
|
|
||||||
def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]:
|
def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]:
|
||||||
|
|
19
web_view.py
19
web_view.py
|
@ -12,7 +12,7 @@ from requests_oauthlib import OAuth1Session
|
||||||
from werkzeug.debug.tbtools import get_current_traceback
|
from werkzeug.debug.tbtools import get_current_traceback
|
||||||
from werkzeug.wrappers import Response
|
from werkzeug.wrappers import Response
|
||||||
|
|
||||||
from dab_mechanic import wikidata_oauth, wikipedia
|
from dab_mechanic import mediawiki_api, wikidata_oauth, wikipedia
|
||||||
|
|
||||||
app = flask.Flask(__name__)
|
app = flask.Flask(__name__)
|
||||||
app.config.from_object("config.default")
|
app.config.from_object("config.default")
|
||||||
|
@ -47,21 +47,6 @@ def exception_handler(e):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_content(title: str) -> str:
|
|
||||||
"""Get article text."""
|
|
||||||
params: dict[str, str | int] = {
|
|
||||||
"action": "query",
|
|
||||||
"format": "json",
|
|
||||||
"formatversion": 2,
|
|
||||||
"prop": "revisions|info",
|
|
||||||
"rvprop": "content|timestamp",
|
|
||||||
"titles": title,
|
|
||||||
}
|
|
||||||
data = requests.get(wiki_api_php, params=params).json()
|
|
||||||
rev: str = data["query"]["pages"][0]["revisions"][0]["content"]
|
|
||||||
return rev
|
|
||||||
|
|
||||||
|
|
||||||
def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, int]]:
|
def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, int]]:
|
||||||
"""Parse Articles With Multiple Dablinks."""
|
"""Parse Articles With Multiple Dablinks."""
|
||||||
articles = []
|
articles = []
|
||||||
|
@ -127,7 +112,7 @@ def save(enwiki: str) -> Response | str:
|
||||||
|
|
||||||
edit_summary = f"Disambiguate {titles} using [[User:Edward/Dab mechanic]]"
|
edit_summary = f"Disambiguate {titles} using [[User:Edward/Dab mechanic]]"
|
||||||
|
|
||||||
article_text = apply_edits(get_content(enwiki), edits)
|
article_text = apply_edits(mediawiki_api.get_content(enwiki), edits)
|
||||||
|
|
||||||
return flask.render_template(
|
return flask.render_template(
|
||||||
"save.html",
|
"save.html",
|
||||||
|
|
Loading…
Reference in a new issue