"""Access MediaWiki API."""

import hashlib
import json
import os
import typing

import requests

from . import utils
from .type import CallParams, Entity

wikidata_url = "https://www.wikidata.org/w/api.php"
page_size = 50

# Seconds to wait on the Wikidata API before giving up.
_REQUEST_TIMEOUT = 5
# mediawiki_query can request full revision content (see get_history), so it
# is given more time than the small Wikidata calls above.
_QUERY_TIMEOUT = 30

hosts = {
    "commons": "commons.wikimedia.org",
    "enwiki": "en.wikipedia.org",
    "wikidata": "www.wikidata.org",
}


def _with_json_defaults(params: CallParams) -> CallParams:
    """Return *params* layered over the JSON output defaults.

    Keys supplied by the caller override the defaults.
    """
    call_params: CallParams = {
        "format": "json",
        "formatversion": 2,
        **params,
    }
    return call_params


def _read_json(filename: str) -> typing.Any:
    """Load and return the JSON document stored in *filename*."""
    with open(filename, encoding="utf-8") as f:
        return json.load(f)


def _write_json(filename: str, data: typing.Any) -> None:
    """Write *data* to *filename* as JSON indented by two spaces."""
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)


def api_call(params: CallParams, api_url: str = wikidata_url) -> requests.Response:
    """Mediawiki API call.

    Sends a GET request to *api_url* with *params* merged over
    ``format=json`` and ``formatversion=2`` and returns the raw response.
    """
    return requests.get(
        api_url, params=_with_json_defaults(params), timeout=_REQUEST_TIMEOUT
    )


def api_post(params: CallParams, api_url: str = wikidata_url) -> requests.Response:
    """Mediawiki API call using POST.

    Same parameter handling as :func:`api_call`, but the parameters are sent
    as the form body of a POST request.
    """
    return requests.post(
        api_url, data=_with_json_defaults(params), timeout=_REQUEST_TIMEOUT
    )


def get_list(list_name: str, **params: str | int) -> list[dict[str, typing.Any]]:
    """Run an ``action=query`` list request and return the list contents.

    Only a single request is made; the ``query[list_name]`` value of that
    response is returned as-is.
    """
    r = api_call({"action": "query", "list": list_name, **params})
    list_contents: list[dict[str, typing.Any]] = r.json()["query"][list_name]
    return list_contents


def get_entity(qid: str, redirects: bool = False) -> Entity | None:
    """Get entity from wikibase.

    Returns ``None`` when the reply has no ``entities`` key, when that mapping
    is empty, or when the entity is flagged as ``missing``.
    """
    json_data = api_call(
        {
            "action": "wbgetentities",
            "ids": qid,
            "redirects": "yes" if redirects else "no",
        }
    ).json()

    entities = json_data.get("entities")
    if not entities:
        # No "entities" key (e.g. an error reply) or nothing inside it.
        return None

    entity = next(iter(entities.values()))
    if "missing" in entity:
        return None
    return typing.cast(Entity, entity)


def wbgetentities(ids: typing.Iterable[str], **params: str | int) -> dict[str, Entity]:
    """Get entities from wikibase.

    Returns an empty dict without contacting the API when *ids* is empty.
    Extra keyword arguments are passed through as API parameters.
    """
    # Materialise first: an empty generator is still truthy, so testing the
    # iterable itself would not detect "no ids".
    id_list = list(ids)
    if not id_list:
        return {}

    call_params: CallParams = {
        "action": "wbgetentities",
        "ids": "|".join(id_list),
        **params,
    }
    ret: dict[str, Entity] = api_call(call_params).json()["entities"]
    return ret


def get_entities(ids: typing.Iterable[str], **params: str | int) -> list[Entity]:
    """Get entities as a list, fetching *ids* in batches.

    Batching is delegated to ``utils.chunk(ids, page_size)`` (presumably
    batches of at most ``page_size`` ids -- confirm against ``utils``).
    """
    entity_list: list[Entity] = []
    for cur in utils.chunk(ids, page_size):
        entity_list.extend(wbgetentities(cur, **params).values())
    return entity_list


def get_entities_dict(
    ids: typing.Iterable[str], **params: str | int
) -> dict[str, Entity]:
    """Get entities as a dict keyed as in the API reply, fetching in batches."""
    entities: dict[str, Entity] = {}
    for cur in utils.chunk(ids, page_size):
        entities.update(wbgetentities(cur, **params))
    return entities


def get_entity_with_cache(qid: str, refresh: bool = False) -> Entity | None:
    """Get an entity, using ``cache/{qid}.json`` as an on-disk cache.

    The cached file is used when it exists and *refresh* is false. Otherwise
    the entity is fetched (following redirects) and the result, including a
    ``None`` result, is written to the cache file.
    """
    filename = f"cache/{qid}.json"
    entity: Entity | None
    if not refresh and os.path.exists(filename):
        entity = _read_json(filename)
    else:
        entity = get_entity(qid, redirects=True)
        _write_json(filename, entity)
    return entity


def get_entities_with_cache(ids: list[str], **params: typing.Any) -> list[Entity]:
    """Get a list of entities, cached on disk per distinct list of ids.

    The cache file name is derived from an MD5 of the space-joined *ids*
    only; *params* are not part of the cache key.
    """
    md5 = hashlib.md5(" ".join(ids).encode("utf-8")).hexdigest()
    filename = f"cache/entities_{md5}.json"

    entity_list: list[Entity]
    if os.path.exists(filename):
        entity_list = _read_json(filename)
    else:
        entity_list = get_entities(ids, **params)
        _write_json(filename, entity_list)
    return entity_list


def get_entities_dict_with_cache(
    all_ids: list[str], **params: typing.Any
) -> dict[str, Entity]:
    """Get a dict of entities, caching each batch of ids on disk separately.

    Each batch produced by ``utils.chunk(all_ids, page_size)`` has its own
    cache file, named from an MD5 of the space-joined ids in that batch.
    """
    entities: dict[str, Entity] = {}
    for ids in utils.chunk(all_ids, page_size):
        md5 = hashlib.md5(" ".join(ids).encode("utf-8")).hexdigest()
        filename = f"cache/entities_dict_{md5}.json"

        if os.path.exists(filename):
            entities.update(_read_json(filename))
            continue

        cur = wbgetentities(ids, **params)
        _write_json(filename, cur)
        entities.update(cur)
    return entities


Page = dict[str, typing.Any]


def _mediawiki_query_batch(
    titles: list[str], params: CallParams, site: str
) -> list[Page]:
    """Run one ``action=query`` request for at most ``page_size`` titles."""
    call_params: CallParams = {
        "format": "json",
        "formatversion": 2,
        "action": "query",
        "continue": "",
        "titles": "|".join(titles),
        **params,
    }

    query_url = f"https://{hosts[site]}/w/api.php"
    r = requests.get(query_url, params=call_params, timeout=_QUERY_TIMEOUT)

    expect = "application/json; charset=utf-8"
    success = True
    if r.status_code != 200:
        print(f"status code: {r.status_code}")
        success = False
    content_type = r.headers.get("content-type")
    if content_type != expect:
        print(f"content-type: {content_type}")
        success = False
    if not success:
        # Raised explicitly (rather than via ``assert``) so the check still
        # runs under ``python -O``; the exception type is kept for callers.
        raise AssertionError(f"unexpected response from {query_url}")

    json_reply = r.json()
    if "query" not in json_reply:
        # Dump the request and reply to help diagnose the KeyError below.
        print(r.url)
        print(r.text)
    pages: list[Page] = json_reply["query"]["pages"]
    return pages


def mediawiki_query(titles: list[str], params: CallParams, site: str) -> list[Page]:
    """Mediawiki query.

    Queries *site* (a key of ``hosts``) for *titles* and returns the pages
    from the reply. Returns an empty list when *titles* is empty.

    Raises ``AssertionError`` when a response has a non-200 status code or an
    unexpected content type.
    """
    if not titles:
        return []

    # avoid error: Too many values supplied for parameter "titles". The limit is 50.
    # Titles are sent in batches of page_size and the resulting pages joined.
    pages: list[Page] = []
    for start in range(0, len(titles), page_size):
        batch = titles[start : start + page_size]
        pages.extend(_mediawiki_query_batch(batch, params, site))
    return pages


def get_content_and_categories(title: str, site: str) -> tuple[str, list[str]]:
    """Get article contents and categories.

    Returns the content of the first revision in the reply and the page's
    ``categories`` value (an empty list when the key is absent).
    """
    params: CallParams = {
        "prop": "revisions|categories",
        "clshow": "!hidden",
        "cllimit": "max",
        "rvprop": "content",
    }

    pages = mediawiki_query([title], params, site)
    assert len(pages) == 1
    page = pages[0]
    return (page["revisions"][0]["content"], page.get("categories", []))


def host_from_site(site: str) -> str:
    """Host from site.

    Raises ``KeyError`` when *site* is not a key of ``hosts``.
    """
    return hosts[site]


def get_history(title: str, site: str) -> list[Page]:
    """Get history of a page.

    Requests the revisions of *title* (timestamp, user, comment, ids and
    main-slot content) and returns the pages from the query reply.
    """
    params: CallParams = {
        "prop": "revisions",
        "rvlimit": "max",
        "rvprop": "timestamp|user|comment|ids|content",
        "rvslots": "main",
    }
    return mediawiki_query([title], params, site)