Make MediaWiki API calls via OAuth

The API had a timeout problem. Routing calls through the OAuth session and raising the request timeout from 4 to 10 seconds may fix it.
This commit is contained in:
Edward Betts 2022-08-17 20:04:43 +01:00
parent b1f402e1f9
commit e85cefbc2f
3 changed files with 15 additions and 14 deletions

View file

@ -1,8 +1,7 @@
"""Interface with the mediawiki API.""" """Interface with the mediawiki API."""
from typing import Any from typing import Any
from . import wikidata_oauth
import requests
wiki_hostname = "en.wikipedia.org" wiki_hostname = "en.wikipedia.org"
wiki_api_php = f"https://{wiki_hostname}/w/api.php" wiki_api_php = f"https://{wiki_hostname}/w/api.php"
@ -21,16 +20,14 @@ def parse_page(enwiki: str) -> dict[str, Any]:
"disabletoc": 1, "disabletoc": 1,
} }
parse: dict[str, Any] = get(params)["parse"] parse: dict[str, Any] = call(params)["parse"]
return parse return parse
def get(params: dict[str, str | int]) -> dict[str, Any]: def call(params: dict[str, str | int]) -> dict[str, Any]:
"""Make GET request to mediawiki API.""" """Make GET request to mediawiki API."""
data: dict[str, Any] = requests.get( data: dict[str, Any] = wikidata_oauth.api_post_request(params)
wiki_api_php, headers={"User-Agent": user_agent}, params=params return data.json()
).json()
return data
def get_content(title: str) -> str: def get_content(title: str) -> str:
@ -43,6 +40,6 @@ def get_content(title: str) -> str:
"rvprop": "content|timestamp", "rvprop": "content|timestamp",
"titles": title, "titles": title,
} }
data = get(params) data = call(params)
rev: str = data["query"]["pages"][0]["revisions"][0]["content"] rev: str = data["query"]["pages"][0]["revisions"][0]["content"]
return rev return rev

View file

@ -19,7 +19,6 @@ def get_edit_proxy() -> dict[str, str]:
def api_post_request(params: dict[str, str | int]): def api_post_request(params: dict[str, str | int]):
"""HTTP Post using Oauth.""" """HTTP Post using Oauth."""
app = current_app app = current_app
url = "https://www.wikidata.org/w/api.php"
client_key = app.config["CLIENT_KEY"] client_key = app.config["CLIENT_KEY"]
client_secret = app.config["CLIENT_SECRET"] client_secret = app.config["CLIENT_SECRET"]
oauth = OAuth1Session( oauth = OAuth1Session(
@ -29,12 +28,12 @@ def api_post_request(params: dict[str, str | int]):
resource_owner_secret=session["owner_secret"], resource_owner_secret=session["owner_secret"],
) )
proxies = get_edit_proxy() proxies = get_edit_proxy()
return oauth.post(url, data=params, timeout=4, proxies=proxies) return oauth.post(api_url, data=params, timeout=10, proxies=proxies)
def raw_request(params): def raw_request(params):
app = current_app app = current_app
url = "https://www.wikidata.org/w/api.php?" + urlencode(params) url = api_url + "?" + urlencode(params)
client_key = app.config["CLIENT_KEY"] client_key = app.config["CLIENT_KEY"]
client_secret = app.config["CLIENT_SECRET"] client_secret = app.config["CLIENT_SECRET"]
oauth = OAuth1Session( oauth = OAuth1Session(
@ -44,7 +43,7 @@ def raw_request(params):
resource_owner_secret=session["owner_secret"], resource_owner_secret=session["owner_secret"],
) )
proxies = get_edit_proxy() proxies = get_edit_proxy()
return oauth.get(url, timeout=4, proxies=proxies) return oauth.get(url, timeout=10, proxies=proxies)
def api_request(params): def api_request(params):

View file

@ -5,6 +5,8 @@ import flask
import lxml.html import lxml.html
from . import mediawiki_api from . import mediawiki_api
from pprint import pprint
from time import sleep
disambig_templates = [ disambig_templates = [
"Template:Disambiguation", "Template:Disambiguation",
@ -75,7 +77,9 @@ def get_article_links(enwiki: str) -> list[str]:
redirects = defaultdict(set) redirects = defaultdict(set)
while True: while True:
data = mediawiki_api.get(params) data = mediawiki_api.call(params)
if "query" not in data:
pprint(data)
pages = data["query"].pop("pages") pages = data["query"].pop("pages")
for r in data["query"].pop("redirects"): for r in data["query"].pop("redirects"):
redirects[r["to"]].add(r["from"]) redirects[r["to"]].add(r["from"])
@ -86,6 +90,7 @@ def get_article_links(enwiki: str) -> list[str]:
break break
params["gplcontinue"] = data["continue"]["gplcontinue"] params["gplcontinue"] = data["continue"]["gplcontinue"]
sleep(0.1)
for link in set(links): for link in set(links):
if link in redirects: if link in redirects: