catalog page parsers
This commit is contained in:
parent
23b053c5a6
commit
921824833f
32
depicts/barnesfoundation.py
Normal file
32
depicts/barnesfoundation.py
Normal file
|
@ -0,0 +1,32 @@
|
|||
import requests
|
||||
import os
|
||||
import json
|
||||
|
||||
def get_json(catalog_id):
    """Return the Barnes Foundation search API response for one catalog id.

    The response is cached on disk under ``cache/``. When the cache file
    already exists it is loaded and no HTTP request is made.

    Args:
        catalog_id: Catalog identifier; must be convertible with ``int()``.

    Returns:
        The decoded JSON response.

    Raises:
        ValueError: If ``catalog_id`` is a string that is not an integer.
    """
    # The cache file name ends in ``.html`` even though it stores JSON; the
    # name is kept unchanged so existing cache files are still found.
    filename = f'cache/barnesfoundation_{catalog_id}.html'

    url = 'https://collection.barnesfoundation.org/api/search'

    # Search query: only records that have an ``imageSecret`` field and
    # whose ``_id`` matches the requested catalog id.
    body = {"query": {"bool": {"filter": {"exists": {"field": "imageSecret"}},
                               "must": {"match": {"_id": int(catalog_id)}}}}}

    if os.path.exists(filename):
        # Context manager closes the handle; explicit UTF-8 keeps the cache
        # independent of the locale's default encoding.
        with open(filename, encoding='utf-8') as cache_file:
            return json.load(cache_file)

    r = requests.get(url, params={'body': json.dumps(body)})
    print(r.url)
    # Decode before caching so a reply that is not JSON is never written to
    # the cache, where it would break every later call.
    data = r.json()
    with open(filename, 'w', encoding='utf-8') as cache_file:
        cache_file.write(r.text)
    return data
|
||||
|
||||
def parse_catalog(data):
    """Build the catalog summary from a Barnes Foundation search response.

    Args:
        data: Decoded search response; the first entry of
            ``data['hits']['hits']`` is used.

    Returns:
        A dict with ``institution``, ``description`` and ``keywords``, or an
        empty dict when the response contains no hits.

    Raises:
        KeyError: If the first hit has no ``shortDescription``.
    """
    hits = data['hits']['hits']
    if not hits:
        # A search that matched nothing yields an empty hit list; report
        # "nothing found" instead of failing with IndexError.
        return {}

    hit = hits[0]['_source']

    return {
        'institution': 'Barnes Foundation',
        'description': hit['shortDescription'],
        # ``tags`` may be absent or null; treat both as "no keywords".
        'keywords': [tag['tag'] for tag in hit.get('tags') or []],
    }
|
||||
|
||||
def get_catalog(catalog_id):
    """Fetch (or load from cache) and parse the record for ``catalog_id``."""
    return parse_catalog(get_json(catalog_id))
|
50
depicts/dia.py
Normal file
50
depicts/dia.py
Normal file
|
@ -0,0 +1,50 @@
|
|||
import requests
|
||||
import lxml.html
|
||||
import os
|
||||
import re
|
||||
|
||||
# Matches a DIA object page URL; group 1 is everything after ``/object/``.
# Dots in the host name are escaped so they match only a literal '.'.
re_url = re.compile(r'https?://www\.dia\.org/art/collection/object/(.+)$')
|
||||
|
||||
def get_html(url):
    """Return the HTML of a DIA object page, using an on-disk cache.

    The cache file name is derived from the part of ``url`` captured by
    ``re_url``, with ``/`` replaced by ``_``.

    Args:
        url: Object page URL; it must match ``re_url``.

    Returns:
        The page HTML as a string, from the cache if present, otherwise
        downloaded and then written to the cache.

    Raises:
        AttributeError: If ``url`` does not match ``re_url``.
    """
    catalog_id = re_url.search(url).group(1).replace('/', '_')

    filename = f'cache/dia_{catalog_id}.html'

    if os.path.exists(filename):
        # Context manager closes the handle; explicit UTF-8 keeps the cache
        # independent of the locale's default encoding.
        with open(filename, encoding='utf-8') as cache_file:
            return cache_file.read()

    r = requests.get(url)
    html = r.text
    with open(filename, 'w', encoding='utf-8') as cache_file:
        cache_file.write(html)

    return html
|
||||
|
||||
def parse_html(html):
    """Extract keywords from the HTML of a DIA object page.

    A keyword is the text of any link whose ``href`` starts with
    ``/art/collection?keys=``.

    Args:
        html: Page HTML as a string.

    Returns:
        A dict with ``institution`` and ``keywords`` (a list, possibly
        empty, in document order).
    """
    root = lxml.html.fromstring(html)

    # The earlier sidebar-based extraction was permanently disabled behind
    # ``if False:`` and has been removed; only the link-prefix rule applies.
    keywords = [
        a.text
        for a in root.findall('.//a[@href]')
        if a.get('href').startswith('/art/collection?keys=')
    ]

    return {
        'institution': 'Detroit Institute of Arts',
        'keywords': keywords,
    }
|
||||
|
||||
def get_catalog(url):
    """Fetch (or load from cache) the page at ``url`` and parse it."""
    html = get_html(url)
    return parse_html(html)
|
42
depicts/museodelprado.py
Normal file
42
depicts/museodelprado.py
Normal file
|
@ -0,0 +1,42 @@
|
|||
import requests
|
||||
import lxml.html
|
||||
import os
|
||||
import re
|
||||
|
||||
# Matches an English-language Museo del Prado URL; group 1 is the last path
# segment. Dots in the host name are escaped so they match only a literal '.'.
re_url = re.compile(r'www\.museodelprado\.es/en/.*/([^/]+)$')
|
||||
|
||||
def get_html(url):
    """Return the HTML of a Museo del Prado page, using an on-disk cache.

    The cache file name is derived from the part of ``url`` captured by
    ``re_url``.

    Args:
        url: Page URL; it must match ``re_url``.

    Returns:
        The page HTML as a string, from the cache if present, otherwise
        downloaded and then written to the cache.

    Raises:
        AttributeError: If ``url`` does not match ``re_url``.
    """
    catalog_id = re_url.search(url).group(1).replace('/', '_')

    filename = f'cache/museodelprado_{catalog_id}.html'

    if os.path.exists(filename):
        # Context manager closes the handle; explicit UTF-8 keeps the cache
        # independent of the locale's default encoding.
        with open(filename, encoding='utf-8') as cache_file:
            return cache_file.read()

    r = requests.get(url)
    html = r.text
    with open(filename, 'w', encoding='utf-8') as cache_file:
        cache_file.write(html)

    return html
|
||||
|
||||
def parse_html(html):
    """Extract the "Displayed objects" keywords from a Prado page.

    For every ``h2`` whose text is ``Displayed objects``, the text of each
    ``span`` with a ``property`` attribute under the heading's parent is
    collected.

    Args:
        html: Page HTML as a string.

    Returns:
        A dict with ``institution`` and ``keywords``, or an empty dict when
        no keywords were found.
    """
    root = lxml.html.fromstring(html)

    keywords = []
    for h2 in root.findall('.//h2'):
        # ``h2.text`` is None when the heading has no text before its first
        # child element; treat that as an empty heading instead of crashing.
        if (h2.text or '').strip() != 'Displayed objects':
            continue
        section = h2.getparent()
        keywords.extend(
            span.text for span in section.findall('.//span[@property]')
        )

    if not keywords:
        return {}

    return {
        'institution': 'Museo del Prado',
        'keywords': keywords,
    }
|
||||
|
||||
def get_catalog(url):
    """Fetch (or load from cache) the page at ``url`` and parse it."""
    html = get_html(url)
    return parse_html(html)
|
37
depicts/npg.py
Normal file
37
depicts/npg.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
import requests
|
||||
import lxml.html
|
||||
import os
|
||||
import re
|
||||
|
||||
# Matches a National Portrait Gallery collection-search URL; group 1 is
# everything after ``/collections/search/``. Dots in the host name are
# escaped so they match only a literal '.'.
re_url = re.compile(r'www\.npg\.org\.uk/collections/search/(.+)$')
|
||||
|
||||
def get_html(url):
    """Return the HTML of an NPG collection page, using an on-disk cache.

    The cache file name is derived from the part of ``url`` captured by
    ``re_url``, with ``/`` replaced by ``_``.

    Args:
        url: Page URL; it must match ``re_url``.

    Returns:
        The page HTML as a string, from the cache if present, otherwise
        downloaded and then written to the cache.

    Raises:
        AttributeError: If ``url`` does not match ``re_url``.
    """
    catalog_id = re_url.search(url).group(1).replace('/', '_')

    filename = f'cache/npg_{catalog_id}.html'

    if os.path.exists(filename):
        # Context manager closes the handle; explicit UTF-8 keeps the cache
        # independent of the locale's default encoding.
        with open(filename, encoding='utf-8') as cache_file:
            return cache_file.read()

    r = requests.get(url)
    html = r.text
    with open(filename, 'w', encoding='utf-8') as cache_file:
        cache_file.write(html)

    return html
|
||||
|
||||
def parse_html(html):
    """Extract subject keywords from a National Portrait Gallery page.

    A keyword is the text of any link whose ``href`` contains ``subj=``.
    Keywords equal to ``oil`` or ``painting`` (case-insensitively) are
    dropped.

    Args:
        html: Page HTML as a string.

    Returns:
        A dict with ``institution`` and ``keywords`` (a list, possibly
        empty, in document order).
    """
    root = lxml.html.fromstring(html)

    skip = {'oil', 'painting'}

    keywords = []
    for a in root.findall('.//a[@href]'):
        if 'subj=' not in a.get('href'):
            continue
        text = a.text
        # ``a.text`` is None when the link has no text before its first
        # child element (e.g. it wraps an image); such links carry no
        # keyword and previously crashed the ``.lower()`` call.
        if text is None or text.lower() in skip:
            continue
        keywords.append(text)

    return {
        'institution': 'National Portrait Gallery',
        'keywords': keywords,
    }
|
||||
|
||||
def get_catalog(url):
    """Fetch (or load from cache) the page at ``url`` and parse it."""
    html = get_html(url)
    return parse_html(html)
|
34
depicts/rijksmuseum.py
Normal file
34
depicts/rijksmuseum.py
Normal file
|
@ -0,0 +1,34 @@
|
|||
import requests
|
||||
import lxml.html
|
||||
import os
|
||||
import re
|
||||
|
||||
# Matches a Dutch-language Rijksmuseum collection URL; group 1 is the object
# id. Dots in the host name are escaped so they match only a literal '.'.
re_url = re.compile(r'^https://www\.rijksmuseum\.nl/nl/collectie/([^/]+)$')
|
||||
|
||||
def get_html(catalog_id):
    """Return the English Rijksmuseum page HTML for ``catalog_id``.

    The page is cached on disk under ``cache/``. When the cache file already
    exists it is read and no HTTP request is made.

    Args:
        catalog_id: Object id string, appended to the English collection URL.

    Returns:
        The page HTML as a string.
    """
    filename = f'cache/rijksmuseum_{catalog_id}.html'
    en_url = 'https://www.rijksmuseum.nl/en/collection/' + catalog_id

    if os.path.exists(filename):
        # Context manager closes the handle; explicit UTF-8 keeps the cache
        # independent of the locale's default encoding.
        with open(filename, encoding='utf-8') as cache_file:
            return cache_file.read()

    r = requests.get(en_url)
    html = r.text
    with open(filename, 'w', encoding='utf-8') as cache_file:
        cache_file.write(html)

    return html
|
||||
|
||||
def parse_html(html):
    """Collect Iconclass-description link texts from a Rijksmuseum page.

    Returns a dict with ``institution`` and ``keywords``; a keyword is the
    text of any link whose ``href`` contains the Iconclass sort parameter.
    """
    marker = 'f.classification.iconClassDescription.sort'

    keywords = []
    for link in lxml.html.fromstring(html).findall('.//a[@href]'):
        if marker in link.get('href'):
            keywords.append(link.text)

    result = {'institution': 'Rijksmuseum'}
    result['keywords'] = keywords
    return result
|
||||
|
||||
def get_catalog(url):
    """Parse the catalog page for the Rijksmuseum collection ``url``.

    The object id is taken from group 1 of ``re_url``; a URL that does not
    match raises AttributeError.
    """
    match = re_url.search(url)
    html = get_html(match.group(1))
    return parse_html(html)
|
|
@ -1,10 +1,7 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
import requests
|
||||
import lxml.html
|
||||
import json
|
||||
import os
|
||||
from pprint import pprint
|
||||
|
||||
def get_html(saam_id):
|
||||
filename = f'cache/saam_{saam_id}.html'
|
||||
|
@ -29,4 +26,11 @@ def parse_html(html):
|
|||
return {'ld': ld, 'keywords': keywords}
|
||||
|
||||
def get_catalog(saam_id):
    """Return the catalog summary for a Smithsonian American Art Museum id.

    Args:
        saam_id: Identifier passed to ``get_html``.

    Returns:
        A dict with ``institution``, ``keywords`` and ``description``.

    Raises:
        KeyError: If the parsed ``ld`` data has no ``description``.
    """
    # The stale early ``return parse_html(get_html(saam_id))`` is removed;
    # it made the summary dict below unreachable.
    data = parse_html(get_html(saam_id))
    return {
        'institution': 'Smithsonian American Art Museum',
        'keywords': data['keywords'],
        'description': data['ld']['description'],
    }
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue