From 921824833f5216e269ca278873a4711d54666f28 Mon Sep 17 00:00:00 2001
From: Edward Betts
Date: Fri, 27 Sep 2019 11:01:33 +0100
Subject: [PATCH] catalog page parsers

---
 depicts/barnesfoundation.py | 32 ++++++++++++++++++++++++
 depicts/dia.py              | 50 +++++++++++++++++++++++++++++++++++++
 depicts/museodelprado.py    | 42 +++++++++++++++++++++++++++++++
 depicts/npg.py              | 37 +++++++++++++++++++++++++++
 depicts/rijksmuseum.py      | 34 +++++++++++++++++++++++++
 depicts/saam.py             | 12 ++++++---
 6 files changed, 203 insertions(+), 4 deletions(-)
 create mode 100644 depicts/barnesfoundation.py
 create mode 100644 depicts/dia.py
 create mode 100644 depicts/museodelprado.py
 create mode 100644 depicts/npg.py
 create mode 100644 depicts/rijksmuseum.py

diff --git a/depicts/barnesfoundation.py b/depicts/barnesfoundation.py
new file mode 100644
index 0000000..76987f9
--- /dev/null
+++ b/depicts/barnesfoundation.py
@@ -0,0 +1,32 @@
+import requests
+import os
+import json
+
+def get_json(catalog_id):
+    filename = f'cache/barnesfoundation_{catalog_id}.html'
+
+    url = 'https://collection.barnesfoundation.org/api/search'
+
+    body = {"query": {"bool": {"filter": {"exists": {"field": "imageSecret"}},
+                               "must": {"match": {"_id": int(catalog_id)}}}}}
+
+    if os.path.exists(filename):
+        return json.load(open(filename))
+    else:
+        r = requests.get(url, params={'body': json.dumps(body)})
+        print(r.url)
+        open(filename, 'w').write(r.text)
+        return r.json()
+
+def parse_catalog(data):
+    hit = data['hits']['hits'][0]['_source']
+
+    return {
+        'institution': 'Barnes Foundation',
+        'description': hit['shortDescription'],
+        'keywords': [tag['tag'] for tag in hit['tags']],
+    }
+
+def get_catalog(catalog_id):
+    data = get_json(catalog_id)
+    return parse_catalog(data)
diff --git a/depicts/dia.py b/depicts/dia.py
new file mode 100644
index 0000000..9d627e6
--- /dev/null
+++ b/depicts/dia.py
@@ -0,0 +1,50 @@
+import requests
+import lxml.html
+import os
+import re
+
+re_url = re.compile(r'https?://www.dia.org/art/collection/object/(.+)$')
+
+def get_html(url):
+    catalog_id = re_url.search(url).group(1).replace('/', '_')
+
+    filename = f'cache/dia_{catalog_id}.html'
+
+    if os.path.exists(filename):
+        html = open(filename).read()
+    else:
+        r = requests.get(url)
+        html = r.text
+        open(filename, 'w').write(html)
+
+    return html
+
+def parse_html(html):
+    root = lxml.html.fromstring(html)
+    keywords = []
+
+    for a in root.findall('.//a[@href]'):
+        href = a.get('href')
+        if not href.startswith('/art/collection?keys='):
+            continue
+        keywords.append(a.text)
+
+    if False:
+        sidebar = root.find('.//aside[@id="sidebar"]')
+        h2_list = sidebar.findall('.//h2')
+        h2_keyword = next((h2 for h2 in h2_list if h2.text == 'Keywords'), None)
+        if not h2_keyword:
+            return {}
+        keyword_div = h2_keyword.getparent()
+        for a in keyword_div:
+            if a.tag != 'a':
+                continue
+            keywords.append(a.text)
+
+    return {
+        'institution': 'Detroit Institute of Arts',
+        'keywords': keywords,
+    }
+
+def get_catalog(url):
+    return parse_html(get_html(url))
diff --git a/depicts/museodelprado.py b/depicts/museodelprado.py
new file mode 100644
index 0000000..7795579
--- /dev/null
+++ b/depicts/museodelprado.py
@@ -0,0 +1,42 @@
+import requests
+import lxml.html
+import os
+import re
+
+re_url = re.compile(r'www.museodelprado.es/en/.*/([^/]+)$')
+
+def get_html(url):
+    catalog_id = re_url.search(url).group(1).replace('/', '_')
+
+    filename = f'cache/museodelprado_{catalog_id}.html'
+
+    if os.path.exists(filename):
+        html = open(filename).read()
+    else:
+        r = requests.get(url)
+        html = r.text
+        open(filename, 'w').write(html)
+
+    return html
+
+def parse_html(html):
+    root = lxml.html.fromstring(html)
+
+    keywords = []
+    for h2 in root.findall('.//h2'):
+        if h2.text.strip() != 'Displayed objects':
+            continue
+        div = h2.getparent()
+        for keyword_span in div.findall('.//span[@property]'):
+            keywords.append(keyword_span.text)
+
+    if not keywords:
+        return {}
+
+    return {
+        'institution': 'Museo del Prado',
+        'keywords': keywords,
+    }
+
+def get_catalog(url):
+    return parse_html(get_html(url))
diff --git a/depicts/npg.py b/depicts/npg.py
new file mode 100644
index 0000000..d88cdd5
--- /dev/null
+++ b/depicts/npg.py
@@ -0,0 +1,37 @@
+import requests
+import lxml.html
+import os
+import re
+
+re_url = re.compile(r'www.npg.org.uk/collections/search/(.+)$')
+
+def get_html(url):
+    catalog_id = re_url.search(url).group(1).replace('/', '_')
+
+    filename = f'cache/npg_{catalog_id}.html'
+
+    if os.path.exists(filename):
+        html = open(filename).read()
+    else:
+        r = requests.get(url)
+        html = r.text
+        open(filename, 'w').write(html)
+
+    return html
+
+def parse_html(html):
+    root = lxml.html.fromstring(html)
+
+    keywords = [a.text for a in root.findall('.//a[@href]')
+                if 'subj=' in a.get('href')]
+
+    skip = {'oil', 'painting'}
+    keywords = [k for k in keywords if k.lower() not in skip]
+
+    return {
+        'institution': 'National Portrait Gallery',
+        'keywords': keywords,
+    }
+
+def get_catalog(url):
+    return parse_html(get_html(url))
diff --git a/depicts/rijksmuseum.py b/depicts/rijksmuseum.py
new file mode 100644
index 0000000..08ab331
--- /dev/null
+++ b/depicts/rijksmuseum.py
@@ -0,0 +1,34 @@
+import requests
+import lxml.html
+import os
+import re
+
+re_url = re.compile(r'^https://www.rijksmuseum.nl/nl/collectie/([^/]+)$')
+
+def get_html(catalog_id):
+    filename = f'cache/rijksmuseum_{catalog_id}.html'
+    en_url = 'https://www.rijksmuseum.nl/en/collection/' + catalog_id
+
+    if os.path.exists(filename):
+        html = open(filename).read()
+    else:
+        r = requests.get(en_url)
+        html = r.text
+        open(filename, 'w').write(html)
+
+    return html
+
+def parse_html(html):
+    root = lxml.html.fromstring(html)
+    keywords = [a.text for a in root.findall('.//a[@href]')
+                if 'f.classification.iconClassDescription.sort' in a.get('href')]
+
+    return {
+        'institution': 'Rijksmuseum',
+        'keywords': keywords,
+    }
+
+def get_catalog(url):
+    catalog_id = re_url.search(url).group(1)
+
+    return parse_html(get_html(catalog_id))
diff --git a/depicts/saam.py b/depicts/saam.py
index 6e64dfa..673539c 100755
--- a/depicts/saam.py
+++ b/depicts/saam.py
@@ -1,10 +1,7 @@
-#!/usr/bin/python3
-
 import requests
 import lxml.html
 import json
 import os
-from pprint import pprint
 
 def get_html(saam_id):
     filename = f'cache/saam_{saam_id}.html'
@@ -29,4 +26,11 @@ def parse_html(html):
     return {'ld': ld, 'keywords': keywords}
 
 def get_catalog(saam_id):
-    return parse_html(get_html(saam_id))
+    data = parse_html(get_html(saam_id))
+    return {
+        'institution': 'Smithsonian American Art Museum',
+        'keywords': data['keywords'],
+        'description': data['ld']['description']
+    }
+
+
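
Usage sketch (illustrative only, not part of the patch): each new module exposes a
get_catalog() helper that returns a dict with at least 'institution' and 'keywords';
barnesfoundation.py and the updated saam.py also return 'description'. The modules
cache fetched pages, so a cache/ directory is assumed to exist in the working
directory, and the catalog URL and ID below are made-up examples.

    from depicts import barnesfoundation, dia

    # Detroit Institute of Arts parser takes the catalog page URL (example URL).
    record = dia.get_catalog('https://www.dia.org/art/collection/object/12345')
    print(record['institution'], record['keywords'])

    # Barnes Foundation parser takes a numeric catalog ID (example ID).
    record = barnesfoundation.get_catalog('5001')
    print(record['description'], record['keywords'])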