catalog page parsers

This commit is contained in:
Edward Betts 2019-09-27 11:01:33 +01:00
parent 23b053c5a6
commit 921824833f
6 changed files with 203 additions and 4 deletions

View file

@ -0,0 +1,32 @@
import requests
import os
import json
def get_json(catalog_id):
    """Return the Barnes Foundation search-API response for one catalog item.

    The response is cached on disk under ``cache/``. When the cache file
    exists it is decoded and returned; otherwise the search API is queried,
    the raw response text is written to the cache file and the decoded JSON
    is returned.

    Args:
        catalog_id: Item identifier; it must be accepted by ``int()``.

    Returns:
        The decoded JSON document, from the cache file or the live response.

    Raises:
        ValueError, TypeError: If ``catalog_id`` cannot be converted by
            ``int()``.
        requests.HTTPError: If the API answers with an error status. Nothing
            is written to the cache in that case, so a transient failure
            cannot poison later calls.
    """
    # The cache file holds JSON despite its ``.html`` suffix; the name is
    # kept unchanged so existing cache entries remain usable.
    filename = f'cache/barnesfoundation_{catalog_id}.html'
    url = 'https://collection.barnesfoundation.org/api/search'
    # Built before the cache lookup so that a non-numeric id is rejected
    # up front, exactly as before.
    body = {"query": {"bool": {"filter": {"exists": {"field": "imageSecret"}},
                               "must": {"match": {"_id": int(catalog_id)}}}}}

    if os.path.exists(filename):
        # ``with`` closes the handle deterministically.
        with open(filename, encoding='utf-8') as cache_file:
            return json.load(cache_file)

    r = requests.get(url, params={'body': json.dumps(body)})
    print(r.url)
    # Fail before caching so an error page is never stored.
    r.raise_for_status()
    with open(filename, 'w', encoding='utf-8') as cache_file:
        cache_file.write(r.text)
    return r.json()
def parse_catalog(data):
    """Build the catalog summary from a Barnes Foundation search response.

    Only the first entry of ``data['hits']['hits']`` is used; its
    ``_source`` mapping supplies the description and the tag list.

    Args:
        data: Decoded search response.

    Returns:
        A dict with ``institution``, ``description`` and ``keywords`` keys,
        where ``keywords`` holds the ``tag`` value of every tag entry.
    """
    first_hit = data['hits']['hits'][0]
    source = first_hit['_source']

    description = source['shortDescription']
    keywords = []
    for entry in source['tags']:
        keywords.append(entry['tag'])

    return {
        'institution': 'Barnes Foundation',
        'description': description,
        'keywords': keywords,
    }
def get_catalog(catalog_id):
    """Fetch the search response for ``catalog_id`` and summarise it.

    Chains ``get_json`` and ``parse_catalog``; see those functions for the
    accepted input and the shape of the returned dict.
    """
    return parse_catalog(get_json(catalog_id))

50
depicts/dia.py Normal file
View file

@ -0,0 +1,50 @@
import requests
import lxml.html
import os
import re
# Matches a DIA collection object URL; group 1 is everything after
# ``/object/``. Dots in the host name are escaped so they only match a
# literal ``.`` rather than any character.
re_url = re.compile(r'https?://www\.dia\.org/art/collection/object/(.+)$')
def get_html(url):
    """Return the HTML of a DIA object page, using an on-disk cache.

    The cache file name is derived from the part of ``url`` captured by the
    module-level ``re_url`` pattern. A cached copy is returned when present;
    otherwise the page is downloaded, stored and returned.

    Args:
        url: Object page URL; it must match ``re_url``.

    Returns:
        The page HTML as text.

    Raises:
        AttributeError: If ``url`` does not match ``re_url`` (the search
            result is ``None``).
        requests.HTTPError: If the server answers with an error status.
            Nothing is cached in that case, so an error page cannot be
            stored and served forever.
    """
    # Slashes in the captured path are flattened so the id forms a single
    # file name inside ``cache/``.
    catalog_id = re_url.search(url).group(1).replace('/', '_')
    filename = f'cache/dia_{catalog_id}.html'

    if os.path.exists(filename):
        # ``with`` closes the handle deterministically.
        with open(filename, encoding='utf-8') as cache_file:
            return cache_file.read()

    r = requests.get(url)
    r.raise_for_status()
    html = r.text
    with open(filename, 'w', encoding='utf-8') as cache_file:
        cache_file.write(html)
    return html
def parse_html(html):
    """Extract keyword links from a Detroit Institute of Arts object page.

    Every anchor whose ``href`` starts with ``/art/collection?keys=`` is
    treated as a keyword link and its ``text`` is collected, in document
    order.

    Args:
        html: Page markup as a string.

    Returns:
        A dict with ``institution`` and ``keywords`` keys. A keyword entry
        is whatever ``a.text`` yields for the anchor, so it can be ``None``
        when the anchor has no leading text.
    """
    root = lxml.html.fromstring(html)

    # The former sidebar-based extraction was guarded by ``if False:`` and
    # could never run; it has been removed as dead code.
    keywords = [
        a.text
        for a in root.findall('.//a[@href]')
        if a.get('href').startswith('/art/collection?keys=')
    ]

    return {
        'institution': 'Detroit Institute of Arts',
        'keywords': keywords,
    }
def get_catalog(url):
    """Download (or load from cache) the page at ``url`` and parse it.

    Chains ``get_html`` and ``parse_html``; the result is whatever
    ``parse_html`` returns for the page.
    """
    html = get_html(url)
    return parse_html(html)

42
depicts/museodelprado.py Normal file
View file

@ -0,0 +1,42 @@
import requests
import lxml.html
import os
import re
# Matches an English-language Museo del Prado URL; group 1 is the final
# path segment. Dots in the host name are escaped so they only match a
# literal ``.`` rather than any character.
re_url = re.compile(r'www\.museodelprado\.es/en/.*/([^/]+)$')
def get_html(url):
    """Return the HTML of a Museo del Prado page, using an on-disk cache.

    The cache file name is derived from the part of ``url`` captured by the
    module-level ``re_url`` pattern. A cached copy is returned when present;
    otherwise the page is downloaded, stored and returned.

    Args:
        url: Page URL; it must match ``re_url``.

    Returns:
        The page HTML as text.

    Raises:
        AttributeError: If ``url`` does not match ``re_url`` (the search
            result is ``None``).
        requests.HTTPError: If the server answers with an error status.
            Nothing is cached in that case, so an error page cannot be
            stored and served forever.
    """
    catalog_id = re_url.search(url).group(1).replace('/', '_')
    filename = f'cache/museodelprado_{catalog_id}.html'

    if os.path.exists(filename):
        # ``with`` closes the handle deterministically.
        with open(filename, encoding='utf-8') as cache_file:
            return cache_file.read()

    r = requests.get(url)
    r.raise_for_status()
    html = r.text
    with open(filename, 'w', encoding='utf-8') as cache_file:
        cache_file.write(html)
    return html
def parse_html(html):
    """Extract the "Displayed objects" keywords from a Museo del Prado page.

    For every ``h2`` whose stripped text is ``Displayed objects``, the text
    of each ``span`` carrying a ``property`` attribute under the heading's
    parent element is collected.

    Args:
        html: Page markup as a string.

    Returns:
        A dict with ``institution`` and ``keywords`` keys, or an empty dict
        when no keyword was found.
    """
    root = lxml.html.fromstring(html)
    keywords = []

    for h2 in root.findall('.//h2'):
        # ``h2.text`` is None when the heading has no leading text (for
        # example when it starts with a child element); treat that as an
        # empty title instead of crashing on ``None.strip()``.
        if (h2.text or '').strip() != 'Displayed objects':
            continue
        div = h2.getparent()
        keywords.extend(
            keyword_span.text
            for keyword_span in div.findall('.//span[@property]')
        )

    if not keywords:
        return {}

    return {
        'institution': 'Museo del Prado',
        'keywords': keywords,
    }
def get_catalog(url):
    """Download (or load from cache) the page at ``url`` and parse it.

    Chains ``get_html`` and ``parse_html``; the result is whatever
    ``parse_html`` returns for the page.
    """
    html = get_html(url)
    return parse_html(html)

37
depicts/npg.py Normal file
View file

@ -0,0 +1,37 @@
import requests
import lxml.html
import os
import re
# Matches a National Portrait Gallery collection-search URL; group 1 is
# everything after ``/collections/search/``. Dots in the host name are
# escaped so they only match a literal ``.`` rather than any character.
re_url = re.compile(r'www\.npg\.org\.uk/collections/search/(.+)$')
def get_html(url):
    """Return the HTML of an NPG collection page, using an on-disk cache.

    The cache file name is derived from the part of ``url`` captured by the
    module-level ``re_url`` pattern. A cached copy is returned when present;
    otherwise the page is downloaded, stored and returned.

    Args:
        url: Page URL; it must match ``re_url``.

    Returns:
        The page HTML as text.

    Raises:
        AttributeError: If ``url`` does not match ``re_url`` (the search
            result is ``None``).
        requests.HTTPError: If the server answers with an error status.
            Nothing is cached in that case, so an error page cannot be
            stored and served forever.
    """
    # Slashes in the captured path are flattened so the id forms a single
    # file name inside ``cache/``.
    catalog_id = re_url.search(url).group(1).replace('/', '_')
    filename = f'cache/npg_{catalog_id}.html'

    if os.path.exists(filename):
        # ``with`` closes the handle deterministically.
        with open(filename, encoding='utf-8') as cache_file:
            return cache_file.read()

    r = requests.get(url)
    r.raise_for_status()
    html = r.text
    with open(filename, 'w', encoding='utf-8') as cache_file:
        cache_file.write(html)
    return html
def parse_html(html):
    """Extract subject keywords from a National Portrait Gallery page.

    The text of every anchor whose ``href`` contains ``subj=`` is collected,
    except for the generic terms ``oil`` and ``painting`` (compared
    case-insensitively).

    Args:
        html: Page markup as a string.

    Returns:
        A dict with ``institution`` and ``keywords`` keys.
    """
    root = lxml.html.fromstring(html)
    skip = {'oil', 'painting'}

    keywords = []
    for a in root.findall('.//a[@href]'):
        if 'subj=' not in a.get('href'):
            continue
        text = a.text
        # ``a.text`` is None when the anchor has no leading text (for
        # example when it only wraps a child element). Such links carry no
        # keyword and previously crashed the ``.lower()`` call below.
        if text is None or text.lower() in skip:
            continue
        keywords.append(text)

    return {
        'institution': 'National Portrait Gallery',
        'keywords': keywords,
    }
def get_catalog(url):
    """Download (or load from cache) the page at ``url`` and parse it.

    Chains ``get_html`` and ``parse_html``; the result is whatever
    ``parse_html`` returns for the page.
    """
    html = get_html(url)
    return parse_html(html)

34
depicts/rijksmuseum.py Normal file
View file

@ -0,0 +1,34 @@
import requests
import lxml.html
import os
import re
# Matches a Dutch-language Rijksmuseum collection URL in full; group 1 is
# the single path segment after ``/collectie/``. Dots in the host name are
# escaped so they only match a literal ``.`` rather than any character.
re_url = re.compile(r'^https://www\.rijksmuseum\.nl/nl/collectie/([^/]+)$')
def get_html(catalog_id):
    """Return the English Rijksmuseum page for an object, using a disk cache.

    A cached copy under ``cache/`` is returned when present; otherwise the
    English collection page for ``catalog_id`` is downloaded, stored and
    returned.

    Args:
        catalog_id: Object identifier string, appended to the English
            collection URL and used in the cache file name.

    Returns:
        The page HTML as text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
            Nothing is cached in that case, so an error page cannot be
            stored and served forever.
    """
    filename = f'cache/rijksmuseum_{catalog_id}.html'
    en_url = 'https://www.rijksmuseum.nl/en/collection/' + catalog_id

    if os.path.exists(filename):
        # ``with`` closes the handle deterministically.
        with open(filename, encoding='utf-8') as cache_file:
            return cache_file.read()

    r = requests.get(en_url)
    r.raise_for_status()
    html = r.text
    with open(filename, 'w', encoding='utf-8') as cache_file:
        cache_file.write(html)
    return html
def parse_html(html):
    """Collect Iconclass keyword links from a Rijksmuseum object page.

    An anchor counts as a keyword link when its ``href`` contains
    ``f.classification.iconClassDescription.sort``; the ``text`` of each
    such anchor is gathered in document order.

    Args:
        html: Page markup as a string.

    Returns:
        A dict with ``institution`` and ``keywords`` keys.
    """
    document = lxml.html.fromstring(html)

    keywords = []
    for link in document.findall('.//a[@href]'):
        target = link.get('href')
        if 'f.classification.iconClassDescription.sort' in target:
            keywords.append(link.text)

    result = {
        'institution': 'Rijksmuseum',
        'keywords': keywords,
    }
    return result
def get_catalog(url):
    """Parse the Rijksmuseum page identified by a Dutch collection URL.

    The object id is taken from group 1 of the module-level ``re_url``
    match on ``url``; the page is then obtained through ``get_html`` and
    handed to ``parse_html``.
    """
    match = re_url.search(url)
    html = get_html(match.group(1))
    return parse_html(html)

View file

@ -1,10 +1,7 @@
#!/usr/bin/python3
import requests import requests
import lxml.html import lxml.html
import json import json
import os import os
from pprint import pprint
def get_html(saam_id): def get_html(saam_id):
filename = f'cache/saam_{saam_id}.html' filename = f'cache/saam_{saam_id}.html'
@ -29,4 +26,11 @@ def parse_html(html):
return {'ld': ld, 'keywords': keywords} return {'ld': ld, 'keywords': keywords}
def get_catalog(saam_id): def get_catalog(saam_id):
return parse_html(get_html(saam_id)) data = parse_html(get_html(saam_id))
return {
'institution': 'Smithsonian American Art Museum',
'keywords': data['keywords'],
'description': data['ld']['description']
}