depicts/depicts/saam.py

60 lines
1.6 KiB
Python

"""Smithsonian American Art Museum (Q1192305) - fine arts museum in Washington, D.C."""
import json
import os
import typing
import lxml.html
import requests
from .type import CatalogDict, EmptyDict
def get_html(saam_id: str | int) -> str:
"""Get HTML from web catalog."""
filename = f"cache/saam_{saam_id}.html"
url = "http://americanart.si.edu/collections/search/artwork/"
if os.path.exists(filename):
html = open(filename).read()
else:
r = requests.get(url, params={"id": saam_id})
html = r.text
open(filename, "w").write(html)
return html
def parse_html(html: str) -> dict[str, typing.Any] | None:
    """Pull the JSON-LD payload and the ontology keywords out of a page.

    Returns:
        ``{"ld": {}, "keywords": []}`` when the page has no
        ``application/ld+json`` script element; ``None`` when that script is
        present but no ``<ul class="ontology-list">`` is found; otherwise a
        dict holding the decoded JSON-LD under ``"ld"`` and the ``.text`` of
        each child of the list under ``"keywords"``.
    """
    doc = lxml.html.fromstring(html)

    raw_ld = doc.findtext('.//script[@type="application/ld+json"]')
    if raw_ld is None:
        return {"ld": {}, "keywords": []}
    # Decoded before the keyword list is looked up, so malformed JSON raises
    # regardless of whether the list exists.
    linked_data = json.loads(raw_ld)

    ontology = doc.find('.//ul[@class="ontology-list"]')
    if ontology is None:
        return None
    assert ontology.tag == "ul"

    result: dict[str, typing.Any] = {"ld": linked_data}
    terms = []
    for item in ontology:
        terms.append(item.text)
    result["keywords"] = terms
    return result
def get_catalog(saam_id: int | str) -> CatalogDict | EmptyDict:
    """Get catalog web page and extract keywords.

    Args:
        saam_id: Catalog identifier passed on to ``get_html``.

    Returns:
        An empty dict when ``parse_html`` yields nothing, or when the page
        provides neither a description nor any keywords. Otherwise a dict
        with ``"institution"``, ``"keywords"`` and, if the JSON-LD data has
        one, ``"description"``.
    """
    empty: EmptyDict = {}

    data = parse_html(get_html(saam_id))
    if not data:
        return empty

    ret: CatalogDict = {
        "institution": "Smithsonian American Art Museum",
        "keywords": [],
    }
    if data["keywords"]:
        ret["keywords"] = data["keywords"]
    if "description" in data["ld"]:
        ret["description"] = data["ld"]["description"]

    # "keywords" is always a key of ``ret`` (initialised above), so a key
    # membership test can never be false; check the list's content instead
    # so that a page with no usable data really yields the empty dict.
    if "description" in ret or ret["keywords"]:
        return ret
    return empty