53 lines
1.2 KiB
Python
53 lines
1.2 KiB
Python
"""National Portrait Gallery (Q238587) - art museum in London, England."""
|
|
|
|
import os
|
|
import re
|
|
|
|
import lxml.html
|
|
import requests
|
|
|
|
from .type import CatalogDict
|
|
|
|
# Matches National Portrait Gallery collection-search URLs; group 1 is the
# path that follows "collections/search/".  The dots in the host name are
# escaped so that only the literal "www.npg.org.uk" matches (an unescaped "."
# would match any character).
re_url = re.compile(r"www\.npg\.org\.uk/collections/search/(.+)$")
|
|
|
|
|
|
def get_html(url: str) -> str:
    """Get HTML from web catalog, using an on-disk cache.

    The part of *url* captured by ``re_url`` (with "/" replaced by "_") is
    used to build the cache file name ``cache/npg_<id>.html``.  If that file
    exists its contents are returned; otherwise the page is downloaded,
    written to the cache and returned.

    Raises:
        ValueError: if *url* does not match ``re_url``.
        requests.HTTPError: if the server answers with an error status.
    """
    # Explicit check instead of ``assert``: assertions are stripped under
    # ``python -O``, which would leave ``m`` unbound.
    m = re_url.search(url)
    if m is None:
        raise ValueError(f"not a National Portrait Gallery catalog URL: {url!r}")
    catalog_id = m.group(1).replace("/", "_")

    filename = f"cache/npg_{catalog_id}.html"

    if os.path.exists(filename):
        with open(filename, encoding="utf-8") as f:
            return f.read()

    r = requests.get(url, timeout=30)
    # Fail before writing so an error page is never stored in the cache.
    r.raise_for_status()
    html = r.text

    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(html)

    return html
|
|
|
|
|
|
def parse_html(html: str) -> CatalogDict:
    """Parse HTML and extract keywords.

    Keywords are the text of every ``<a>`` element whose ``href`` contains
    ``"subj="``.  Anchors without leading text (for example links that wrap
    only an image) are skipped, as are the keywords "oil" and "painting"
    (compared case-insensitively).

    Returns a dict with the fixed institution name and the keyword list in
    document order.
    """
    root = lxml.html.fromstring(html)

    skip = {"oil", "painting"}
    keywords = []
    for a in root.findall(".//a[@href]"):
        if "subj=" not in a.get("href", ""):
            continue
        text = a.text
        # ``a.text`` is None when the anchor starts with a child element;
        # calling ``.lower()`` on it would raise AttributeError.
        if text is None or text.lower() in skip:
            continue
        keywords.append(text)

    return {
        "institution": "National Portrait Gallery",
        "keywords": keywords,
    }
|
|
|
|
|
|
def get_catalog(url: str) -> CatalogDict:
    """Fetch the catalog page for *url* and return the data parsed from it."""
    page = get_html(url)
    return parse_html(page)
|