60 lines
1.6 KiB
Python
60 lines
1.6 KiB
Python
"""Smithsonian American Art Museum (Q1192305) - fine arts museum in Washington, D.C."""
|
|
|
|
import json
|
|
import os
|
|
import typing
|
|
|
|
import lxml.html
|
|
import requests
|
|
|
|
from .type import CatalogDict, EmptyDict
|
|
|
|
|
|
def get_html(saam_id: str | int) -> str:
|
|
"""Get HTML from web catalog."""
|
|
filename = f"cache/saam_{saam_id}.html"
|
|
url = "http://americanart.si.edu/collections/search/artwork/"
|
|
|
|
if os.path.exists(filename):
|
|
html = open(filename).read()
|
|
else:
|
|
r = requests.get(url, params={"id": saam_id})
|
|
html = r.text
|
|
open(filename, "w").write(html)
|
|
|
|
return html
|
|
|
|
|
|
def parse_html(html: str) -> dict[str, typing.Any] | None:
    """Parse HTML and extract the JSON-LD data and keywords.

    Returns:
        A dict with the keys ``"ld"`` (decoded JSON-LD) and ``"keywords"``
        (the ``.text`` of each child of the ``ontology-list`` element).
        If the page has no ``application/ld+json`` script, both values are
        empty. If the script is present but the ``ontology-list`` ``<ul>`` is
        not, ``None`` is returned.
    """
    tree = lxml.html.fromstring(html)

    raw_ld = tree.findtext('.//script[@type="application/ld+json"]')
    if raw_ld is None:
        return {"ld": {}, "keywords": []}
    metadata = json.loads(raw_ld)

    ontology = tree.find('.//ul[@class="ontology-list"]')
    if ontology is None:
        return None
    assert ontology.tag == "ul"

    # One entry per child element, in document order.
    terms = []
    for item in ontology:
        terms.append(item.text)

    return {"ld": metadata, "keywords": terms}
|
|
|
|
|
|
def get_catalog(saam_id: int | str) -> CatalogDict | EmptyDict:
    """Get catalog web page and extract keywords.

    Args:
        saam_id: Catalog identifier passed through to ``get_html``.

    Returns:
        A dict with ``"institution"``, ``"keywords"`` and, when the page's
        JSON-LD data has one, ``"description"``. An empty dict is returned
        when the page could not be parsed or yielded neither a description
        nor any keywords.
    """
    empty: EmptyDict = {}

    data = parse_html(get_html(saam_id))
    if not data:
        return empty

    ret: CatalogDict = {
        "institution": "Smithsonian American Art Museum",
        "keywords": [],
    }
    if data["keywords"]:
        ret["keywords"] = data["keywords"]
    if "description" in data["ld"]:
        ret["description"] = data["ld"]["description"]

    # "keywords" is always a key of ret, so a membership test on it is always
    # true; check that the list is non-empty so the empty fallback can apply.
    return ret if "description" in ret or ret["keywords"] else empty
|