# depicts/depicts/dia.py

"""Detroit Institute of Arts (Q1201549)  - art museum in Detroit, Michigan."""

import os
import re

import lxml.html
import requests

from .type import CatalogDict

# Matches DIA collection object-page URLs; group 1 is everything after
# ".../object/". The dots in the host name are escaped so that they match
# only a literal "." rather than any character.
re_url = re.compile(r"https?://www\.dia\.org/art/collection/object/(.+)$")


def get_html(url: str) -> str | None:
    """Get HTML from web catalog, using an on-disk cache.

    Args:
        url: Address of a DIA collection object page.

    Returns:
        The page HTML, or None if *url* does not match ``re_url``.

    Raises:
        requests.RequestException: If the page has to be downloaded and the
            request fails or times out.
    """
    m = re_url.search(url)
    if not m:
        return None
    # Flatten any "/" in the object path so the ID is a single file name.
    catalog_id = m.group(1).replace("/", "_")

    filename = f"cache/dia_{catalog_id}.html"

    # Try the cache first; opening directly avoids an exists()/open() race.
    try:
        with open(filename, encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        pass

    # A timeout keeps a stalled server from hanging the caller forever.
    r = requests.get(url, timeout=30)
    html = r.text

    # Only cache successful responses so an error page is not served from
    # the cache on every later call.
    if r.ok:
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, "w", encoding="utf-8") as f:
            f.write(html)

    return html


def parse_html(html: str) -> CatalogDict:
    """Parse HTML and extract keywords.

    Keywords are the labels of links whose ``href`` starts with
    ``/art/collection?keys=``.

    Args:
        html: HTML of a catalog page.

    Returns:
        A dict with the institution name and the list of keywords found.
    """
    root = lxml.html.fromstring(html)
    keywords = []

    for a in root.findall(".//a[@href]"):
        href = a.get("href")
        if href is None or not href.startswith("/art/collection?keys="):
            continue
        # a.text is empty when the label is wrapped in a child element;
        # fall back to the text of the whole anchor instead of failing.
        text = a.text or a.text_content().strip()
        if text:
            keywords.append(text)

    return {
        "institution": "Detroit Institute of Arts",
        "keywords": keywords,
    }


def get_catalog(url: str) -> CatalogDict | None:
    """Fetch the catalog page for *url* and return its extracted keywords.

    Returns None when no HTML is available for the URL (``get_html`` gave
    back None or an empty string).
    """
    page = get_html(url)
    if not page:
        return None
    return parse_html(page)