"""Access the Wikidata API.""" import hashlib import json import os import subprocess import time import typing import requests import flask commons_url = "https://www.wikidata.org/w/api.php" wikidata_api = "https://www.wikidata.org/w/api.php" user_agent = "conference-archive/0.1 (contact: edward@4angle.com)" CallParams = dict[str, str | int] SearchHit = dict[str, typing.Any] s = requests.Session() s.headers.update({"User-Agent": user_agent}) def md5sum(s: str) -> str: """Generate hex md5sum.""" return hashlib.md5(s.encode("utf-8")).hexdigest() def get_cache_filename(q: str) -> str: """Cache filename for query.""" q_md5 = md5sum(q) data_dir = flask.current_app.config["DATA_DIR"] return os.path.join(data_dir, "cache", q_md5 + ".json") def get_item_filename(qid: str) -> str: data_dir = flask.current_app.config["DATA_DIR"] return os.path.join(data_dir, "item", qid + ".json") def search(q: str) -> list[SearchHit]: """Search Wikidata with caching.""" cache_filename = get_cache_filename(q) if os.path.exists(cache_filename): data = json.load(open(cache_filename)) return typing.cast(list[SearchHit], data["query"]["search"]) params: dict[str, str | int] = { "action": "query", "list": "search", "format": "json", "formatversion": 2, "srsearch": q, "srlimit": "10", } r = requests.get(wikidata_api, params=params) open(cache_filename, "w").write(r.text) data = r.json() return typing.cast(list[SearchHit], data["query"]["search"]) def api_image_detail_call(filename: str) -> requests.Response: """Call the Commons API.""" call_params: CallParams = { "format": "json", "formatversion": 2, "action": "query", "prop": "imageinfo", "iiprop": "url", "titles": f"File:{filename}", } return s.get(commons_url, params=call_params, timeout=5) def get_item(qid: str) -> typing.Any: """Get an item from Wikidata.""" item_filename = get_item_filename(qid) if os.path.exists(item_filename): item = json.load(open(item_filename)) else: params: dict[str, str | int] = { "action": "wbgetentities", "ids": qid, "format": "json", "formatversion": 2, } r = s.get(wikidata_api, params=params) item = r.json()["entities"][qid] with open(item_filename, "w") as f: json.dump(item, f, indent=2) return item def download_photo(filename: str) -> None: save_to = os.path.join("static", "wikidata_photo", filename) r = api_image_detail_call(filename) try: pages = r.json()["query"]["pages"] except requests.exceptions.JSONDecodeError: print(r.text) raise photo = pages[0]["imageinfo"][0] photo_url = photo["url"] while True: r = s.get(photo_url) if not r.content.startswith(b""): break time.sleep(1) with open(save_to, "wb") as out: out.write(r.content) print(len(r.content), filename) def get_photo(filename: str) -> None: """Download filename and resize.""" save_to = os.path.join("static", "wikidata_photo", filename) thumb = os.path.join("static", "wikidata_photo", "thumb", filename) if not os.path.exists(save_to): download_photo(filename) if not os.path.exists(thumb): subprocess.run(["convert", "-resize", "1024x", save_to, thumb]) if filename.endswith("jpg") or filename.endswith("jpeg"): subprocess.run(["jpegoptim", "-S1048576", thumb]) wikidata_properties = [ ("website", "P856", "official website", None), ("twitter", "P2002", "Twitter username", "https://twitter.com/$1"), ("github", "P2037", "GitHub username", "https://github.com/$1"), ( "linkedin", "P6634", "LinkedIn personal profile ID", "https://www.linkedin.com/in/$1/", ), ("mastodon_address", "P4033", "Mastodon address", None), ("dblp", "P2456", "DBLP author ID", "https://dblp.org/pid/$1"), ("blog_url", "P1581", "official blog URL", None), ( "hacker_news", "P7171", "Hacker News username", "https://news.ycombinator.com/user?id=$1", ), ("reddit", "P4265", "Reddit username", "https://www.reddit.com/user/$1"), ]