Add a 2-second timeout to catalog page requests and ignore read timeouts in item_page

This commit is contained in:
Edward Betts 2019-09-29 21:46:39 +01:00
parent 42a8353ecc
commit 6eb79ccfd5

67
app.py
View file

@ -388,7 +388,7 @@ def get_catalog_page(property_id, value):
if os.path.exists(filename): if os.path.exists(filename):
html = open(filename).read() html = open(filename).read()
else: else:
r = requests.get(url, headers={'User-Agent': user_agent}) r = requests.get(url, headers={'User-Agent': user_agent}, timeout=2)
html = r.text html = r.text
open(filename, 'w').write(html) open(filename, 'w').write(html)
@ -429,38 +429,41 @@ def item_page(item_id):
catalog_url = first_datavalue(entity, 'P973') catalog_url = first_datavalue(entity, 'P973')
catalog = None catalog = None
if 'P4704' in entity['claims']: try:
saam_id = first_datavalue(entity, 'P4704') if 'P4704' in entity['claims']:
catalog = saam.get_catalog(saam_id) saam_id = first_datavalue(entity, 'P4704')
elif 'P4709' in entity['claims']: catalog = saam.get_catalog(saam_id)
catalog_id = first_datavalue(entity, 'P4709') elif 'P4709' in entity['claims']:
catalog = barnesfoundation.get_catalog(catalog_id) catalog_id = first_datavalue(entity, 'P4709')
elif catalog_url and 'www.dia.org' in catalog_url: catalog = barnesfoundation.get_catalog(catalog_id)
catalog = dia.get_catalog(catalog_url) elif catalog_url and 'www.dia.org' in catalog_url:
elif catalog_url and 'www.rijksmuseum.nl' in catalog_url: catalog = dia.get_catalog(catalog_url)
catalog = rijksmuseum.get_catalog(catalog_url) elif catalog_url and 'www.rijksmuseum.nl' in catalog_url:
elif catalog_url and 'www.npg.org.uk' in catalog_url: catalog = rijksmuseum.get_catalog(catalog_url)
catalog = npg.get_catalog(catalog_url) elif catalog_url and 'www.npg.org.uk' in catalog_url:
elif catalog_url and 'www.museodelprado.es' in catalog_url: catalog = npg.get_catalog(catalog_url)
catalog = museodelprado.get_catalog(catalog_url) elif catalog_url and 'www.museodelprado.es' in catalog_url:
catalog = museodelprado.get_catalog(catalog_url)
if not catalog and catalog_ids: if not catalog and catalog_ids:
for property_id in sorted(catalog_ids): for property_id in sorted(catalog_ids):
if property_id == 'P350': if property_id == 'P350':
continue # RKDimages ID continue # RKDimages ID
value = first_datavalue(entity, property_id) value = first_datavalue(entity, property_id)
detail = wd_catalog.lookup(property_id, value) detail = wd_catalog.lookup(property_id, value)
try: try:
html = get_catalog_page(property_id, value) html = get_catalog_page(property_id, value)
except requests.exceptions.SSLError: except requests.exceptions.SSLError:
continue # ignore this error continue # ignore this error
description = get_description_from_page(html) description = get_description_from_page(html)
if not description: if not description:
continue continue
catalog = { catalog = {
'institution': detail['label'], 'institution': detail['label'],
'description': description, 'description': description,
} }
except requests.exceptions.ReadTimeout:
pass
return render_template('item.html', return render_template('item.html',
qid=qid, qid=qid,