catalog page parsers
This commit is contained in:
		
							parent
							
								
									23b053c5a6
								
							
						
					
					
						commit
						921824833f
					
				
							
								
								
									
										32
									
								
								depicts/barnesfoundation.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										32
									
								
								depicts/barnesfoundation.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,32 @@
 | 
			
		|||
import requests
 | 
			
		||||
import os
 | 
			
		||||
import json
 | 
			
		||||
 | 
			
		||||
def get_json(catalog_id):
    """Return the Barnes Foundation search-API JSON for one catalog id.

    Results are cached on disk; when a cached copy exists it is parsed
    and returned without hitting the network.

    :param catalog_id: numeric catalog identifier (str or int)
    :raises ValueError: if catalog_id cannot be converted to int
    """
    # NOTE(review): cache file keeps a '.html' suffix even though it
    # stores JSON — renaming it would orphan existing cache entries.
    filename = f'cache/barnesfoundation_{catalog_id}.html'

    url = 'https://collection.barnesfoundation.org/api/search'

    # Elasticsearch-style query: match the document id, but only keep
    # records that actually have an image.
    body = {"query": {"bool": {"filter": {"exists": {"field": "imageSecret"}},
                               "must": {"match": {"_id": int(catalog_id)}}}}}

    if os.path.exists(filename):
        # context manager so the handle is closed promptly (the original
        # leaked an open file object)
        with open(filename) as f:
            return json.load(f)

    r = requests.get(url, params={'body': json.dumps(body)})
    print(r.url)
    with open(filename, 'w') as f:
        f.write(r.text)
    return r.json()
 | 
			
		||||
 | 
			
		||||
def parse_catalog(data):
    """Extract the fields we care about from a Barnes search response.

    ``data`` is the decoded JSON from ``get_json``; the first search hit
    is assumed to be the wanted artwork.
    """
    source = data['hits']['hits'][0]['_source']
    keywords = [item['tag'] for item in source['tags']]

    return {
        'institution': 'Barnes Foundation',
        'description': source['shortDescription'],
        'keywords': keywords,
    }
 | 
			
		||||
 | 
			
		||||
def get_catalog(catalog_id):
    """Fetch and parse the catalog record for one Barnes catalog id."""
    return parse_catalog(get_json(catalog_id))
 | 
			
		||||
							
								
								
									
										50
									
								
								depicts/dia.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										50
									
								
								depicts/dia.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,50 @@
 | 
			
		|||
import requests
 | 
			
		||||
import lxml.html
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
# Match a DIA object URL and capture the trailing object identifier.
# Dots are escaped so e.g. 'wwwXdiaYorg' no longer matches by accident.
re_url = re.compile(r'https?://www\.dia\.org/art/collection/object/(.+)$')
 | 
			
		||||
 | 
			
		||||
def get_html(url):
    """Return the DIA object page HTML for *url*, using a disk cache.

    The URL path after .../object/ becomes the cache key, with '/'
    replaced so it is a safe single filename component.
    """
    catalog_id = re_url.search(url).group(1).replace('/', '_')

    filename = f'cache/dia_{catalog_id}.html'

    if os.path.exists(filename):
        # cached copy; context manager closes the handle promptly
        with open(filename) as f:
            return f.read()

    r = requests.get(url)
    html = r.text
    with open(filename, 'w') as f:
        f.write(html)
    return html
 | 
			
		||||
 | 
			
		||||
def parse_html(html):
    """Extract keywords from a DIA object page.

    Keywords are the link texts of anchors pointing at the collection
    keyword search ('/art/collection?keys=...').
    """
    root = lxml.html.fromstring(html)

    # collect the text of every keyword-search link
    keywords = [a.text for a in root.findall('.//a[@href]')
                if a.get('href').startswith('/art/collection?keys=')]

    # An older sidebar-based scraper (aside#sidebar -> h2 'Keywords')
    # was dead code behind `if False:` and has been removed.

    return {
        'institution': 'Detroit Institute of Arts',
        'keywords': keywords,
    }
 | 
			
		||||
 | 
			
		||||
def get_catalog(url):
    """Fetch a DIA object page and return its parsed catalog record."""
    html = get_html(url)
    return parse_html(html)
 | 
			
		||||
							
								
								
									
										42
									
								
								depicts/museodelprado.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										42
									
								
								depicts/museodelprado.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,42 @@
 | 
			
		|||
import requests
 | 
			
		||||
import lxml.html
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
# Capture the final path component of an English-language Prado URL.
# Dots are escaped so only the literal hostname matches.
re_url = re.compile(r'www\.museodelprado\.es/en/.*/([^/]+)$')
 | 
			
		||||
 | 
			
		||||
def get_html(url):
    """Return the Museo del Prado page HTML for *url*, using a disk cache.

    The last URL path component becomes the cache key, with '/' replaced
    so it is a safe single filename component.
    """
    catalog_id = re_url.search(url).group(1).replace('/', '_')

    filename = f'cache/museodelprado_{catalog_id}.html'

    if os.path.exists(filename):
        # cached copy; context manager closes the handle promptly
        with open(filename) as f:
            return f.read()

    r = requests.get(url)
    html = r.text
    with open(filename, 'w') as f:
        f.write(html)
    return html
 | 
			
		||||
 | 
			
		||||
def parse_html(html):
    """Extract 'Displayed objects' keywords from a Prado artwork page.

    Returns an empty dict when the page has no keywords so callers can
    tell 'no data' apart from a record with an empty keyword list.
    """
    root = lxml.html.fromstring(html)

    keywords = []
    for h2 in root.findall('.//h2'):
        # h2.text is None when the heading text lives inside a child
        # element; guard before stripping to avoid AttributeError
        if (h2.text or '').strip() != 'Displayed objects':
            continue
        section = h2.getparent()
        for keyword_span in section.findall('.//span[@property]'):
            keywords.append(keyword_span.text)

    if not keywords:
        return {}

    return {
        'institution': 'Museo del Prado',
        'keywords': keywords,
    }
 | 
			
		||||
 | 
			
		||||
def get_catalog(url):
    """Fetch a Prado artwork page and return its parsed catalog record."""
    html = get_html(url)
    return parse_html(html)
 | 
			
		||||
							
								
								
									
										37
									
								
								depicts/npg.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								depicts/npg.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,37 @@
 | 
			
		|||
import requests
 | 
			
		||||
import lxml.html
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
# Capture everything after the NPG collection search path.
# Dots are escaped so only the literal hostname matches.
re_url = re.compile(r'www\.npg\.org\.uk/collections/search/(.+)$')
 | 
			
		||||
 | 
			
		||||
def get_html(url):
    """Return the NPG portrait page HTML for *url*, using a disk cache.

    The URL path after .../search/ becomes the cache key, with '/'
    replaced so it is a safe single filename component.
    """
    catalog_id = re_url.search(url).group(1).replace('/', '_')

    filename = f'cache/npg_{catalog_id}.html'

    if os.path.exists(filename):
        # cached copy; context manager closes the handle promptly
        with open(filename) as f:
            return f.read()

    r = requests.get(url)
    html = r.text
    with open(filename, 'w') as f:
        f.write(html)
    return html
 | 
			
		||||
 | 
			
		||||
def parse_html(html):
    """Extract subject keywords from an NPG portrait page.

    Keywords come from links whose href contains 'subj='; generic
    medium terms ('oil', 'painting') are filtered out.
    """
    root = lxml.html.fromstring(html)

    keywords = [a.text for a in root.findall('.//a[@href]')
                if 'subj=' in a.get('href')]

    # Drop medium descriptions that aren't subjects. Also skip anchors
    # with no text: a.text is None there, and None.lower() would raise.
    skip = {'oil', 'painting'}
    keywords = [k for k in keywords if k and k.lower() not in skip]

    return {
        'institution': 'National Portrait Gallery',
        'keywords': keywords,
    }
 | 
			
		||||
 | 
			
		||||
def get_catalog(url):
    """Fetch an NPG portrait page and return its parsed catalog record."""
    html = get_html(url)
    return parse_html(html)
 | 
			
		||||
							
								
								
									
										34
									
								
								depicts/rijksmuseum.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										34
									
								
								depicts/rijksmuseum.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,34 @@
 | 
			
		|||
import requests
 | 
			
		||||
import lxml.html
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
# Capture the catalog id from a Dutch-language Rijksmuseum collection URL.
# Dots are escaped so only the literal hostname matches.
re_url = re.compile(r'^https://www\.rijksmuseum\.nl/nl/collectie/([^/]+)$')
 | 
			
		||||
 | 
			
		||||
def get_html(catalog_id):
    """Return the English collection page HTML for a Rijksmuseum id.

    The id comes from a Dutch ('/nl/collectie/') URL, but the English
    page is what gets fetched and cached.
    """
    filename = f'cache/rijksmuseum_{catalog_id}.html'
    en_url = 'https://www.rijksmuseum.nl/en/collection/' + catalog_id

    if os.path.exists(filename):
        # cached copy; context manager closes the handle promptly
        with open(filename) as f:
            return f.read()

    r = requests.get(en_url)
    html = r.text
    with open(filename, 'w') as f:
        f.write(html)
    return html
 | 
			
		||||
 | 
			
		||||
def parse_html(html):
    """Extract Iconclass keywords from a Rijksmuseum collection page.

    Keywords are the texts of anchors linking to the Iconclass
    classification facet filter.
    """
    root = lxml.html.fromstring(html)

    keywords = []
    for anchor in root.findall('.//a[@href]'):
        if 'f.classification.iconClassDescription.sort' in anchor.get('href'):
            keywords.append(anchor.text)

    return {
        'institution': 'Rijksmuseum',
        'keywords': keywords,
    }
 | 
			
		||||
 | 
			
		||||
def get_catalog(url):
    """Resolve a Rijksmuseum collection URL to its parsed catalog record."""
    match = re_url.search(url)
    return parse_html(get_html(match.group(1)))
 | 
			
		||||
| 
						 | 
				
			
			@ -1,10 +1,7 @@
 | 
			
		|||
#!/usr/bin/python3
 | 
			
		||||
 | 
			
		||||
import requests
 | 
			
		||||
import lxml.html
 | 
			
		||||
import json
 | 
			
		||||
import os
 | 
			
		||||
from pprint import pprint
 | 
			
		||||
 | 
			
		||||
def get_html(saam_id):
 | 
			
		||||
    filename = f'cache/saam_{saam_id}.html'
 | 
			
		||||
| 
						 | 
				
			
			@ -29,4 +26,11 @@ def parse_html(html):
 | 
			
		|||
    return {'ld': ld, 'keywords': keywords}
 | 
			
		||||
 | 
			
		||||
def get_catalog(saam_id):
    """Fetch and parse a SAAM object page into a catalog record."""
    parsed = parse_html(get_html(saam_id))
    return {
        'institution': 'Smithsonian American Art Museum',
        'keywords': parsed['keywords'],
        'description': parsed['ld']['description']
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue