# conference-archive/confarchive/wikidata.py
# Retrieved: 2023-09-25 18:38:02 +01:00
# 150 lines, 4.3 KiB, Python
"""Access the Wikidata API."""
import hashlib
import json
import os
import subprocess
import time
import typing
import requests
import flask
# Endpoint used for imageinfo (image detail) calls.
# NOTE(review): this is the Wikidata API, not commons.wikimedia.org —
# MediaWiki imageinfo can resolve Commons-hosted files through the shared
# file repository, but confirm this endpoint is intentional.
commons_url = "https://www.wikidata.org/w/api.php"
# MediaWiki action API endpoint for Wikidata itself.
wikidata_api = "https://www.wikidata.org/w/api.php"
# Identifying User-Agent with a contact address, per Wikimedia API etiquette.
user_agent = "conference-archive/0.1 (contact: edward@4angle.com)"
# Type aliases: API call parameters and a single search result hit.
CallParams = dict[str, str | int]
SearchHit = dict[str, typing.Any]
# Shared HTTP session so every request carries the User-Agent header.
s = requests.Session()
s.headers.update({"User-Agent": user_agent})
def md5sum(s: str) -> str:
    """Return the hex md5 digest of *s* (UTF-8 encoded).

    Note: the parameter name shadows the module-level requests session,
    but only within this function's scope.
    """
    digest = hashlib.md5()
    digest.update(s.encode("utf-8"))
    return digest.hexdigest()
def get_cache_filename(q: str) -> str:
    """Return the on-disk cache path for search query *q*.

    The file is keyed on the md5 of the query and lives under
    DATA_DIR/cache/ (DATA_DIR comes from the Flask app config).
    """
    base = flask.current_app.config["DATA_DIR"]
    return os.path.join(base, "cache", f"{md5sum(q)}.json")
def get_item_filename(qid: str) -> str:
    """Return the on-disk cache path for Wikidata item *qid*.

    Items live under DATA_DIR/item/ (DATA_DIR comes from the Flask app
    config).
    """
    base = flask.current_app.config["DATA_DIR"]
    return os.path.join(base, "item", f"{qid}.json")
def search(q: str) -> list[SearchHit]:
    """Search Wikidata for *q*, caching the raw API reply on disk.

    Returns the list of hits from the reply's "query"/"search" section.
    A cached reply (keyed on the md5 of the query) is replayed verbatim;
    otherwise the live reply is written to the cache before parsing.

    Fixes over the previous version: uses the shared session `s` (the bare
    `requests.get` call bypassed our User-Agent header), adds a request
    timeout matching the other API helpers, and closes both file handles
    (they were previously leaked via `json.load(open(...))` and
    `open(...).write(...)`).
    """
    cache_filename = get_cache_filename(q)
    if os.path.exists(cache_filename):
        # Cache hit: replay the stored API response.
        with open(cache_filename) as f:
            data = json.load(f)
        return typing.cast(list[SearchHit], data["query"]["search"])
    params: dict[str, str | int] = {
        "action": "query",
        "list": "search",
        "format": "json",
        "formatversion": 2,
        "srsearch": q,
        "srlimit": "10",
    }
    r = s.get(wikidata_api, params=params, timeout=5)
    # Store the raw reply text so a later run replays exactly what we got.
    with open(cache_filename, "w") as f:
        f.write(r.text)
    data = r.json()
    return typing.cast(list[SearchHit], data["query"]["search"])
def api_image_detail_call(filename: str) -> requests.Response:
    """Query the imageinfo API for the URL of ``File:<filename>``.

    Bug fix: the "titles" value was a placeholder-free f-string that
    ignored the *filename* argument entirely; it now interpolates it, so
    the API is asked about the requested file.
    """
    call_params: CallParams = {
        "format": "json",
        "formatversion": 2,
        "action": "query",
        "prop": "imageinfo",
        "iiprop": "url",
        "titles": f"File:{filename}",
    }
    return s.get(commons_url, params=call_params, timeout=5)
def get_item(qid: str) -> typing.Any:
    """Return the Wikidata entity for *qid*, caching it on disk.

    A cached item under DATA_DIR/item/ is loaded directly; otherwise the
    entity is fetched with wbgetentities, written to the cache, and
    returned.

    Fixes over the previous version: the read handle is closed (it was
    leaked via `json.load(open(...))`) and the API call gets a timeout,
    matching the other helpers in this module.
    """
    item_filename = get_item_filename(qid)
    if os.path.exists(item_filename):
        with open(item_filename) as f:
            item = json.load(f)
    else:
        params: dict[str, str | int] = {
            "action": "wbgetentities",
            "ids": qid,
            "format": "json",
            "formatversion": 2,
        }
        r = s.get(wikidata_api, params=params, timeout=5)
        item = r.json()["entities"][qid]
        with open(item_filename, "w") as f:
            json.dump(item, f, indent=2)
    return item
def download_photo(filename: str) -> None:
    """Download the full-size image *filename* into static/wikidata_photo.

    Looks up the image URL via the imageinfo API, then re-requests it once
    a second for as long as the server replies with an HTML page instead of
    image bytes, and finally writes the bytes to disk.
    """
    dest = os.path.join("static", "wikidata_photo", filename)
    detail = api_image_detail_call(filename)
    try:
        page_list = detail.json()["query"]["pages"]
    except requests.exceptions.JSONDecodeError:
        # Dump the raw body so the malformed reply can be inspected.
        print(detail.text)
        raise
    image_url = page_list[0]["imageinfo"][0]["url"]
    response = s.get(image_url)
    while response.content.startswith(b"<!DOCTYPE html>"):
        # Got an HTML error/placeholder page — wait and retry.
        time.sleep(1)
        response = s.get(image_url)
    with open(dest, "wb") as out:
        out.write(response.content)
    print(len(response.content), filename)
def get_photo(filename: str) -> None:
    """Ensure *filename* is downloaded and has a 1024px-wide thumbnail.

    Downloads the full-size image if missing, then builds the thumbnail
    with ImageMagick's `convert` and, for JPEGs, shrinks it with
    `jpegoptim` capped at 1 MiB.
    """
    photo_dir = os.path.join("static", "wikidata_photo")
    full_size = os.path.join(photo_dir, filename)
    thumb = os.path.join(photo_dir, "thumb", filename)
    if not os.path.exists(full_size):
        download_photo(filename)
    if os.path.exists(thumb):
        return  # thumbnail already built
    subprocess.run(["convert", "-resize", "1024x", full_size, thumb])
    if filename.endswith(("jpg", "jpeg")):
        subprocess.run(["jpegoptim", "-S1048576", thumb])
# Wikidata properties surfaced for a person/organisation.  Each entry is a
# 4-tuple: (internal key, Wikidata property ID, human-readable label,
# URL template where "$1" is replaced by the property value, or None when
# the value itself is already a URL/address).
wikidata_properties = [
    ("website", "P856", "official website", None),
    ("twitter", "P2002", "Twitter username", "https://twitter.com/$1"),
    ("github", "P2037", "GitHub username", "https://github.com/$1"),
    (
        "linkedin",
        "P6634",
        "LinkedIn personal profile ID",
        "https://www.linkedin.com/in/$1/",
    ),
    ("mastodon_address", "P4033", "Mastodon address", None),
    ("dblp", "P2456", "DBLP author ID", "https://dblp.org/pid/$1"),
    ("blog_url", "P1581", "official blog URL", None),
    (
        "hacker_news",
        "P7171",
        "Hacker News username",
        "https://news.ycombinator.com/user?id=$1",
    ),
    ("reddit", "P4265", "Reddit username", "https://www.reddit.com/user/$1"),
]