From 3191ae51b005a386221805c2ec3fa0f75878fa98 Mon Sep 17 00:00:00 2001
From: Edward Betts
Date: Tue, 30 Jun 2020 09:05:22 +0100
Subject: [PATCH] Bug fix saam parser

---
Reviewer notes (not part of the commit message; they sit between the
three-dash line and the diff):

* parse_html(): the ld+json script text is now read into ld_json first.
  When it is None the function returns {'ld': {}, 'keywords': []} instead
  of handing None to json.loads().
* parse_html(): when no ul.ontology-list element is found the function now
  returns None (bare return) instead of {'ld': ld, 'keywords': []}.
* get_catalog(): returns {} when parse_html() gives back a falsy value.
* NOTE(review): the two early exits in parse_html() now differ in type
  (a non-empty dict vs None), and only the None case triggers the new
  guard in get_catalog(). Confirm this asymmetry is intended.

 depicts/saam.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/depicts/saam.py b/depicts/saam.py
index 1c14a03..1d58b7c 100644
--- a/depicts/saam.py
+++ b/depicts/saam.py
@@ -18,17 +18,22 @@ def get_html(saam_id):
 
 def parse_html(html):
     root = lxml.html.fromstring(html)
-    ld = json.loads(root.findtext('.//script[@type="application/ld+json"]'))
+    ld_json = root.findtext('.//script[@type="application/ld+json"]')
+    if ld_json is None:
+        return {'ld': {}, 'keywords': []}
+    ld = json.loads(ld_json)
 
     ul = root.find('.//ul[@class="ontology-list"]')
     if ul is None:
-        return {'ld': ld, 'keywords': []}
+        return
     assert ul.tag == 'ul'
     keywords = [li.text for li in ul]
     return {'ld': ld, 'keywords': keywords}
 
 def get_catalog(saam_id):
     data = parse_html(get_html(saam_id))
+    if not data:
+        return {}
     ret = {
         'institution': 'Smithsonian American Art Museum',
     }