depicts/depicts/museodelprado.py

43 lines
955 B
Python

import requests
import lxml.html
import os
import re
re_url = re.compile(r'www.museodelprado.es/(.+)$')
def get_html(url):
catalog_id = re_url.search(url).group(1).replace('/', '_')
filename = f'cache/museodelprado_{catalog_id}.html'
if os.path.exists(filename):
html = open(filename).read()
else:
r = requests.get(url)
html = r.text
open(filename, 'w').write(html)
return html
def parse_html(html):
root = lxml.html.fromstring(html)
keywords = []
for h2 in root.findall('.//h2'):
if h2.text.strip() != 'Displayed objects':
continue
div = h2.getparent()
for keyword_span in div.findall('.//span[@property]'):
keywords.append(keyword_span.text)
if not keywords:
return {}
return {
'institution': 'Museo del Prado',
'keywords': keywords,
}
def get_catalog(url):
return parse_html(get_html(url))