depicts/depicts/npg.py

53 lines
1.2 KiB
Python

"""National Portrait Gallery (Q238587) - art museum in London, England."""
import os
import re
import lxml.html
import requests
from .type import CatalogDict
# Matches an NPG collections-search URL; group 1 is everything after
# "collections/search/". The dots in the host name are escaped so that they
# only match a literal "." rather than any character.
re_url = re.compile(r"www\.npg\.org\.uk/collections/search/(.+)$")
def get_html(url: str) -> str:
    """Get HTML from web catalog, using an on-disk cache.

    The part of *url* matched by group 1 of ``re_url`` (with ``/`` replaced
    by ``_``) names the cache file ``cache/npg_<id>.html``. If that file
    exists its contents are returned; otherwise the page is downloaded with
    ``requests``, written to the cache file and returned.

    Raises:
        ValueError: if *url* is not matched by ``re_url``.
        requests.RequestException: if the download fails, times out or the
            server answers with an HTTP error status.
    """
    m = re_url.search(url)
    if m is None:
        # Explicit check instead of ``assert``: assertions are stripped under
        # ``python -O``, which would leave ``m`` unbound.
        raise ValueError(f"not an NPG collections search URL: {url!r}")

    catalog_id = m.group(1).replace("/", "_")
    filename = f"cache/npg_{catalog_id}.html"

    if os.path.exists(filename):
        with open(filename, encoding="utf-8") as f:
            return f.read()

    # A timeout keeps a stalled connection from hanging the caller forever.
    r = requests.get(url, timeout=30)
    # Do not cache (and later parse) an HTTP error page as if it were a
    # catalog entry.
    r.raise_for_status()
    html = r.text

    # Create the cache directory on first use instead of failing the write.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(html)
    return html
def parse_html(html: str) -> CatalogDict:
    """Parse HTML and extract keywords.

    A keyword is the text of any ``<a>`` element whose ``href`` contains
    ``subj=``. Keywords equal to "oil" or "painting" (compared
    case-insensitively) are dropped.

    Returns a dict with the fixed ``institution`` name and the list of
    ``keywords`` in document order.
    """
    root = lxml.html.fromstring(html)
    skip = {"oil", "painting"}
    keywords = [
        a.text
        for a in root.findall(".//a[@href]")
        if "subj=" in a.get("href", "")
        # ``a.text`` is None when the anchor has no leading text (e.g. it
        # only wraps an image); such links carry no keyword and would
        # otherwise crash the ``.lower()`` call.
        and a.text is not None
        and a.text.lower() not in skip
    ]
    return {
        "institution": "National Portrait Gallery",
        "keywords": keywords,
    }
def get_catalog(url: str) -> CatalogDict:
    """Fetch the catalog page for *url* and return the data parsed from it."""
    page = get_html(url)
    return parse_html(page)