Fix bug in Smithsonian American Art Museum parsing.

This commit is contained in:
Edward Betts 2019-10-10 07:39:30 +01:00
parent ac570b05d2
commit d030f745de

View file

@ -21,6 +21,8 @@ def parse_html(html):
ld = json.loads(root.findtext('.//script[@type="application/ld+json"]'))
ul = root.find('.//ul[@class="ontology-list"]')
if ul is None:
return {'ld': ld, 'keywords': []}
assert ul.tag == 'ul'
keywords = [li.text for li in ul]
return {'ld': ld, 'keywords': keywords}
@ -29,10 +31,10 @@ def get_catalog(saam_id):
data = parse_html(get_html(saam_id))
ret = {
'institution': 'Smithsonian American Art Museum',
'keywords': data['keywords'],
}
if data['keywords']:
ret['keywords'] = data['keywords']
if 'description' in data['ld']:
ret['description'] = data['ld']['description']
return ret
return ret if 'description' in ret or 'keywords' in ret else {}