56 lines
1.3 KiB
Python
56 lines
1.3 KiB
Python
"""Detroit Institute of Arts (Q1201549) - art museum in Detroit, Michigan."""
|
|
|
|
import os
|
|
import re
|
|
|
|
import lxml.html
|
|
import requests
|
|
|
|
from .type import CatalogDict
|
|
|
|
# Matches a DIA collection-object URL; group 1 is everything after
# ".../object/" and is used as the catalog identifier.  The dots in the
# host name are escaped so they only match a literal ".".
re_url = re.compile(r"https?://www\.dia\.org/art/collection/object/(.+)$")
|
|
|
|
|
|
def get_html(url: str) -> str | None:
    """Get HTML from web catalog, using an on-disk cache.

    The identifier captured from *url* by ``re_url`` (with ``/`` replaced
    by ``_``) names a cache file ``cache/dia_<id>.html``.  If that file
    exists its contents are returned; otherwise the page is downloaded,
    written to the cache file and returned.

    Returns ``None`` when *url* does not match ``re_url``.

    Raises ``requests.RequestException`` when the download fails, times
    out, or the server answers with an HTTP error status.
    """
    m = re_url.search(url)
    if not m:
        return None
    # Replace "/" so the identifier forms a single file name component.
    catalog_id = m.group(1).replace("/", "_")

    filename = f"cache/dia_{catalog_id}.html"

    if os.path.exists(filename):
        with open(filename, encoding="utf-8") as f:
            return f.read()

    # A timeout keeps a stalled server from blocking the caller forever.
    r = requests.get(url, timeout=30)
    # Fail before writing so an error page is never stored in the cache.
    r.raise_for_status()
    html = r.text

    # Create the cache directory on first use instead of failing on open().
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(html)

    return html
|
|
|
|
|
|
def parse_html(html: str) -> CatalogDict:
    """Collect keyword link texts from a catalog page.

    Every ``<a>`` element whose ``href`` starts with
    ``/art/collection?keys=`` contributes its text as one keyword, in
    document order.  The result also carries the fixed institution name.
    """
    keyword_prefix = "/art/collection?keys="
    found = []
    document = lxml.html.fromstring(html)

    for link in document.findall(".//a[@href]"):
        target = link.get("href")
        assert target is not None
        if target.startswith(keyword_prefix):
            # A keyword link is expected to have non-empty text.
            assert link.text
            found.append(link.text)

    return {
        "institution": "Detroit Institute of Arts",
        "keywords": found,
    }
|
|
|
|
|
|
def get_catalog(url: str) -> CatalogDict | None:
    """Fetch the catalog page for *url* and return its parsed keywords.

    Returns ``None`` when ``get_html`` yields no HTML (``None`` or an
    empty string).
    """
    page = get_html(url)
    if not page:
        return None
    return parse_html(page)
|