43 lines
970 B
Python
43 lines
970 B
Python
import requests
|
|
import lxml.html
|
|
import os
|
|
import re
|
|
|
|
re_url = re.compile(r'www.museodelprado.es/(.+)$')
|
|
|
|
def get_html(url):
|
|
catalog_id = re_url.search(url).group(1).replace('/', '_')
|
|
|
|
filename = f'cache/museodelprado_{catalog_id}.html'
|
|
|
|
if os.path.exists(filename):
|
|
html = open(filename).read()
|
|
else:
|
|
r = requests.get(url)
|
|
html = r.text
|
|
open(filename, 'w').write(html)
|
|
|
|
return html
|
|
|
|
def parse_html(html):
|
|
root = lxml.html.fromstring(html)
|
|
|
|
keywords = []
|
|
for h2 in root.findall('.//h2'):
|
|
if not h2.text or h2.text.strip() != 'Displayed objects':
|
|
continue
|
|
div = h2.getparent()
|
|
for keyword_span in div.findall('.//span[@property]'):
|
|
keywords.append(keyword_span.text)
|
|
|
|
if not keywords:
|
|
return {}
|
|
|
|
return {
|
|
'institution': 'Museo del Prado',
|
|
'keywords': keywords,
|
|
}
|
|
|
|
def get_catalog(url):
|
|
return parse_html(get_html(url))
|