depicts/depicts/dia.py

56 lines
1.3 KiB
Python

"""Detroit Institute of Arts (Q1201549) - art museum in Detroit, Michigan."""
import os
import re
import lxml.html
import requests
from .type import CatalogDict
# Matches DIA collection-object URLs; group 1 is everything after
# ".../object/". The dots in the host name are escaped so they match a
# literal "." rather than any character.
re_url = re.compile(r"https?://www\.dia\.org/art/collection/object/(.+)$")
def get_html(url: str) -> str | None:
    """Return the HTML of a DIA catalog page, using an on-disk cache.

    The URL is matched against ``re_url``; the captured object path, with
    ``/`` replaced by ``_``, names the cache file ``cache/dia_<id>.html``.
    A cached copy is returned when present; otherwise the page is fetched
    with ``requests`` and stored for later calls.

    Returns ``None`` when *url* is not a DIA collection-object URL, or when
    the server answers with an HTTP error status (error pages are never
    cached, so a later call retries the fetch).

    Network failures and timeouts propagate as the exceptions raised by
    ``requests.get``.
    """
    m = re_url.search(url)
    if not m:
        return None
    catalog_id = m.group(1).replace("/", "_")
    filename = f"cache/dia_{catalog_id}.html"

    if os.path.exists(filename):
        # Explicit encoding: the platform default may not be UTF-8.
        with open(filename, encoding="utf-8") as f:
            return f.read()

    # A timeout keeps an unresponsive server from hanging the caller forever.
    r = requests.get(url, timeout=30)
    if not r.ok:
        # Do not poison the cache with a 4xx/5xx error page.
        return None
    html = r.text

    # Create the cache directory on first use instead of failing on write.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(html)
    return html
def parse_html(html: str) -> CatalogDict:
    """Parse a DIA catalog page and extract its keyword links.

    Every ``<a>`` element whose ``href`` starts with
    ``/art/collection?keys=`` contributes its text to the ``keywords`` list,
    in document order. The link's own leading text is used when present;
    otherwise the text of its descendants is used. Links with no text at all
    are skipped.

    Returns a dict with the fixed ``institution`` name and the collected
    ``keywords``.
    """
    root = lxml.html.fromstring(html)
    keywords: list[str] = []
    for a in root.findall(".//a[@href]"):
        # The XPath predicate guarantees an href; the default only keeps the
        # value typed as str without needing an assert.
        if not a.get("href", "").startswith("/art/collection?keys="):
            continue
        # Validate explicitly rather than with assert: asserts are stripped
        # under -O, and a link whose text sits in a child element has
        # a.text == None.
        text = a.text or a.text_content()
        if text:
            keywords.append(text)
    return {
        "institution": "Detroit Institute of Arts",
        "keywords": keywords,
    }
def get_catalog(url: str) -> CatalogDict | None:
    """Fetch the catalog page for *url* and return its extracted keywords.

    Returns ``None`` when ``get_html`` yields nothing usable (``None`` or an
    empty string); otherwise returns the result of ``parse_html``.
    """
    page = get_html(url)
    if not page:
        return None
    return parse_html(page)