Fix Smithsonian American Art Museum parsing when the keyword list is missing.

This commit is contained in:
Edward Betts 2019-10-10 07:39:30 +01:00
parent ac570b05d2
commit d030f745de

View file

@@ -21,6 +21,8 @@ def parse_html(html):
ld = json.loads(root.findtext('.//script[@type="application/ld+json"]')) ld = json.loads(root.findtext('.//script[@type="application/ld+json"]'))
ul = root.find('.//ul[@class="ontology-list"]') ul = root.find('.//ul[@class="ontology-list"]')
if ul is None:
return {'ld': ld, 'keywords': []}
assert ul.tag == 'ul' assert ul.tag == 'ul'
keywords = [li.text for li in ul] keywords = [li.text for li in ul]
return {'ld': ld, 'keywords': keywords} return {'ld': ld, 'keywords': keywords}
@@ -29,10 +31,10 @@ def get_catalog(saam_id):
data = parse_html(get_html(saam_id)) data = parse_html(get_html(saam_id))
ret = { ret = {
'institution': 'Smithsonian American Art Museum', 'institution': 'Smithsonian American Art Museum',
'keywords': data['keywords'],
} }
if data['keywords']:
ret['keywords'] = data['keywords']
if 'description' in data['ld']: if 'description' in data['ld']:
ret['description'] = data['ld']['description'] ret['description'] = data['ld']['description']
return ret
return ret if 'description' in ret or 'keywords' in ret else {}