#!/usr/bin/python3
# Provenance: depicts/saam.py as added by commit
# d123c23270b5cee9d4b24184c1e313da2abb611e (Edward Betts, 2019-09-25),
# "Add Smithsonian American Art Museum lookup code".
"""Smithsonian American Art Museum (SAAM) artwork lookup.

Fetches an artwork page from the SAAM collection search, keeps the raw HTML
in an on-disk cache, and extracts the embedded JSON-LD document together
with the entries of the page's ``ontology-list`` element.
"""

import json
import os
from pprint import pprint  # noqa: F401 -- not referenced in this module; kept from the original

import lxml.html
import requests

CACHE_DIR = 'cache'
SEARCH_URL = 'http://americanart.si.edu/collections/search/artwork/'
REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely when no timeout is given


def get_html(saam_id):
    """Return the HTML of the SAAM artwork page for *saam_id*.

    The page is read from ``cache/saam_<saam_id>.html`` when that file
    exists; otherwise it is downloaded and written to the cache first.

    Args:
        saam_id: Artwork identifier. It is interpolated into the cache
            filename and sent as the ``id`` query parameter.

    Returns:
        The page HTML as a string.

    Raises:
        requests.RequestException: The download failed, timed out, or the
            server answered with an HTTP error status.
    """
    filename = f'{CACHE_DIR}/saam_{saam_id}.html'

    # EAFP: try the cache directly instead of exists() followed by open().
    try:
        with open(filename, encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        pass  # cache miss, fall through to the download

    r = requests.get(SEARCH_URL, params={'id': saam_id},
                     timeout=REQUEST_TIMEOUT)
    # Fail before caching: an error page written to disk would be served
    # from the cache on every later call.
    r.raise_for_status()
    html = r.text

    os.makedirs(CACHE_DIR, exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)

    return html


def parse_html(html):
    """Extract the JSON-LD data and keyword list from an artwork page.

    Args:
        html: HTML source of the page.

    Returns:
        A dict with two keys: ``'ld'`` holds the decoded content of the
        first ``<script type="application/ld+json">`` element, and
        ``'keywords'`` holds the ``.text`` of each child of the first
        ``<ul class="ontology-list">`` element.

    Raises:
        ValueError: Either element is missing, or the JSON-LD text is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    root = lxml.html.fromstring(html)

    ld_text = root.findtext('.//script[@type="application/ld+json"]')
    if ld_text is None:
        raise ValueError('no <script type="application/ld+json"> element found')
    ld = json.loads(ld_text)

    ul = root.find('.//ul[@class="ontology-list"]')
    # Explicit check instead of `assert`: asserts are stripped under -O,
    # and a missing list would otherwise fail with an AttributeError on None.
    if ul is None:
        raise ValueError('no <ul class="ontology-list"> element found')
    keywords = [li.text for li in ul]

    return {'ld': ld, 'keywords': keywords}


def get_catalog(saam_id):
    """Fetch (or load from cache) and parse the page for *saam_id*.

    Returns the dict produced by :func:`parse_html`.
    """
    return parse_html(get_html(saam_id))