# conference-archive/confarchive/wikidata.py

import hashlib
import json
import os
import subprocess
import time
import typing

import requests

# Endpoint used for Wikidata searches and entity lookups.
wikidata_api = "https://www.wikidata.org/w/api.php"
# Endpoint used by api_image_detail_call(); it holds the same URL as
# wikidata_api.
commons_url = "https://www.wikidata.org/w/api.php"

# Shared HTTP session; every request made through it carries this User-Agent.
s = requests.Session()
s.headers["User-Agent"] = "conference-archive/0.1 (contact: edward@4angle.com)"


def md5sum(s: str) -> str:
    """Return the hexadecimal MD5 digest of *s* encoded as UTF-8."""
    digest = hashlib.md5()
    digest.update(s.encode("utf-8"))
    return digest.hexdigest()


def search(q: str) -> list[dict[str, typing.Any]]:
    """Run a full-text search against the Wikidata API and return the hits.

    Responses are cached on disk as ``cache/<md5 of q>.json``; a cached query
    is answered from that file without contacting the API.

    Args:
        q: Search string, sent to the API as ``srsearch``.

    Returns:
        The ``query.search`` list of the API response (``srlimit`` is 10).

    Raises:
        requests.HTTPError: The API answered with an HTTP error status.
        requests.exceptions.JSONDecodeError: The response body was not JSON.
        KeyError: The response JSON has no ``query.search`` entry.
    """
    q_md5 = md5sum(q)
    cache_filename = os.path.join("cache", q_md5 + ".json")

    if os.path.exists(cache_filename):
        with open(cache_filename, encoding="utf-8") as f:
            data = json.load(f)
        return typing.cast(list[dict[str, typing.Any]], data["query"]["search"])

    params: dict[str, str | int] = {
        "action": "query",
        "list": "search",
        "format": "json",
        "formatversion": 2,
        "srsearch": q,
        "srlimit": "10",
    }
    # Go through the shared session so the User-Agent header is sent, and use
    # a timeout so a stalled connection cannot hang the caller forever.
    r = s.get(wikidata_api, params=params, timeout=30)
    r.raise_for_status()

    # Validate the payload *before* caching it: an error page or an API error
    # object written to the cache would break this query on every later call.
    data = r.json()
    results = typing.cast(list[dict[str, typing.Any]], data["query"]["search"])

    os.makedirs(os.path.dirname(cache_filename), exist_ok=True)
    with open(cache_filename, "w", encoding="utf-8") as f:
        f.write(r.text)

    # Pause after a live request to stay polite to the API.
    time.sleep(1)

    return results


def api_image_detail_call(filename: str) -> requests.Response:
    """Ask the API at ``commons_url`` for the image info (URL) of a file.

    The page title queried is ``File:<filename>``; the raw response is
    returned without inspecting its status or body.
    """
    title = f"File:{filename}"
    query = dict(
        format="json",
        formatversion=2,
        action="query",
        prop="imageinfo",
        iiprop="url",
        titles=title,
    )
    return s.get(commons_url, params=query, timeout=5)


def get_item(qid: str) -> typing.Any:
    """Return the Wikidata entity for *qid*, using an on-disk cache.

    The entity is read from ``items/<qid>.json`` when that file exists.
    Otherwise it is fetched with the ``wbgetentities`` API action, written to
    that file and returned.

    Args:
        qid: Entity ID used both as the ``ids`` API parameter and as the
            cache file name.

    Returns:
        The decoded JSON object stored under ``entities[qid]`` in the API
        response (or the cached copy of it).

    Raises:
        requests.HTTPError: The API answered with an HTTP error status.
        requests.exceptions.JSONDecodeError: The response body was not JSON.
        KeyError: The response has no ``entities[qid]`` entry.
    """
    cache_filename = os.path.join("items", qid + ".json")
    if os.path.exists(cache_filename):
        with open(cache_filename, encoding="utf-8") as f:
            return json.load(f)

    params: dict[str, str | int] = {
        "action": "wbgetentities",
        "ids": qid,
        "format": "json",
        "formatversion": 2,
    }
    # Timeout so a stalled connection cannot block the caller indefinitely.
    r = s.get(wikidata_api, params=params, timeout=30)
    r.raise_for_status()
    item = r.json()["entities"][qid]

    os.makedirs(os.path.dirname(cache_filename), exist_ok=True)
    with open(cache_filename, "w", encoding="utf-8") as f:
        json.dump(item, f, indent=2)

    # Short pause after a live request to avoid hammering the API.
    time.sleep(0.1)
    return item


def download_photo(filename: str) -> None:
    """Download *filename* and store it under ``static/wikidata_photo/``.

    The download URL is taken from the first ``imageinfo`` entry of the first
    page returned by :func:`api_image_detail_call`. A response whose body
    starts with ``<!DOCTYPE html>`` is treated as "not the image yet" and the
    download is retried after one second, up to a fixed number of attempts.

    Args:
        filename: File name used for the API lookup and as the local name.

    Raises:
        requests.exceptions.JSONDecodeError: The image-info response was not
            JSON (its text is printed before re-raising).
        KeyError: The image-info response lacks the expected fields, for
            instance when the page has no ``imageinfo``.
        RuntimeError: Every download attempt returned an HTML page.
    """
    save_to = os.path.join("static", "wikidata_photo", filename)
    r = api_image_detail_call(filename)
    try:
        pages = r.json()["query"]["pages"]
    except requests.exceptions.JSONDecodeError:
        print(r.text)
        raise
    photo = pages[0]["imageinfo"][0]
    photo_url = photo["url"]

    # Bounded retry: the previous unbounded loop never terminated when the URL
    # kept answering with an HTML page (e.g. a permanent error page).
    max_attempts = 60
    for _ in range(max_attempts):
        r = s.get(photo_url, timeout=30)
        if not r.content.startswith(b"<!DOCTYPE html>"):
            break
        time.sleep(1)
    else:
        raise RuntimeError(
            f"{photo_url} still returned an HTML page after {max_attempts} attempts"
        )

    with open(save_to, "wb") as out:
        out.write(r.content)
    print(len(r.content), filename)


def get_photo(filename: str) -> None:
    """Make sure the photo and its thumbnail exist on disk.

    The original is expected at ``static/wikidata_photo/<filename>`` and is
    downloaded with :func:`download_photo` when missing. The thumbnail lives
    at ``static/wikidata_photo/thumb/<filename>`` and is generated by the
    external ``convert`` command when missing; JPEG thumbnails are then
    passed through ``jpegoptim``.

    Args:
        filename: Name of the photo file.
    """
    save_to = os.path.join("static", "wikidata_photo", filename)
    thumb = os.path.join("static", "wikidata_photo", "thumb", filename)

    if not os.path.exists(save_to):
        download_photo(filename)
    if os.path.exists(thumb):
        return

    # Exit codes are deliberately not checked: thumbnailing is best-effort.
    subprocess.run(["convert", "-resize", "1024x", save_to, thumb])

    # Compare case-insensitively so names such as "Photo.JPG" are optimised
    # as well.
    if filename.lower().endswith(("jpg", "jpeg")):
        # NOTE(review): jpegoptim documents -S as a target size in kilobytes,
        # so 1048576 looks far larger than any thumbnail - confirm the intent.
        subprocess.run(["jpegoptim", "-S1048576", thumb])


# Table of Wikidata properties, one 4-tuple per entry:
#   (short key, Wikidata property ID, property label, URL template or None)
# NOTE(review): "$1" in a template looks like the placeholder for the property
# value, and None presumably means the value is already a complete URL or
# address - confirm against the code that consumes this table.
wikidata_properties = [
    ("website", "P856", "official website", None),
    ("twitter", "P2002", "Twitter username", "https://twitter.com/$1"),
    ("github", "P2037", "GitHub username", "https://github.com/$1"),
    (
        "linkedin",
        "P6634",
        "LinkedIn personal profile ID",
        "https://www.linkedin.com/in/$1/",
    ),
    ("mastodon_address", "P4033", "Mastodon address", None),
    ("dblp", "P2456", "DBLP author ID", "https://dblp.org/pid/$1"),
    ("blog_url", "P1581", "official blog URL", None),
    (
        "hacker_news",
        "P7171",
        "Hacker News username",
        "https://news.ycombinator.com/user?id=$1",
    ),
    ("reddit", "P4265", "Reddit username", "https://www.reddit.com/user/$1"),
]