Get description from catalog page.

This commit is contained in:
Edward Betts 2019-09-30 12:49:28 +01:00
parent 514c55cd6e
commit faf70a8811
2 changed files with 70 additions and 1 deletions

55
app.py
View file

@ -3,7 +3,7 @@
from flask import Flask, render_template, url_for, redirect, request, g, jsonify, session from flask import Flask, render_template, url_for, redirect, request, g, jsonify, session
from depicts import (utils, wdqs, commons, mediawiki, painting, saam, database, from depicts import (utils, wdqs, commons, mediawiki, painting, saam, database,
dia, rijksmuseum, npg, museodelprado, barnesfoundation, dia, rijksmuseum, npg, museodelprado, barnesfoundation,
wd_catalog) wd_catalog, relaxed_ssl)
from depicts.pager import Pagination, init_pager from depicts.pager import Pagination, init_pager
from depicts.model import (DepictsItem, DepictsItemAltLabel, Edit, PaintingItem, from depicts.model import (DepictsItem, DepictsItemAltLabel, Edit, PaintingItem,
Language) Language)
@ -14,6 +14,7 @@ from werkzeug.exceptions import InternalServerError
from werkzeug.debug.tbtools import get_current_traceback from werkzeug.debug.tbtools import get_current_traceback
from sqlalchemy import func, distinct from sqlalchemy import func, distinct
from collections import defaultdict from collections import defaultdict
import hashlib
import requests.exceptions import requests.exceptions
import requests import requests
import lxml.html import lxml.html
@ -400,12 +401,47 @@ def get_catalog_page(property_id, value):
return html return html
def get_catalog_url(url):
    """Return the HTML of a catalog page, using an on-disk cache.

    The cache file lives in the relative ``cache/`` directory and is named
    after the MD5 hex digest of the UTF-8 encoded URL, with an ``.html``
    suffix. On a cache miss the page is fetched with ``relaxed_ssl.get``
    (2 second timeout, ``user_agent`` as the User-Agent header) and the
    response text is written to the cache before being returned.

    :param url: address of the catalog page.
    :return: the page HTML as a ``str``.
    """
    # MD5 only serves to turn the URL into a filesystem-safe cache key.
    md5_filename = hashlib.md5(url.encode('utf-8')).hexdigest() + '.html'
    filename = 'cache/' + md5_filename

    if os.path.exists(filename):
        # Context manager closes the handle deterministically; the explicit
        # encoding keeps reads independent of the process locale.
        with open(filename, encoding='utf-8') as f:
            return f.read()

    r = relaxed_ssl.get(url,
                        headers={'User-Agent': user_agent},
                        timeout=2)
    html = r.text

    # Create the cache directory on first use instead of failing the request.
    os.makedirs('cache', exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)
    return html
def get_description_from_page(html):
    """Extract a description from the HTML of a catalog page.

    Lookup order:

    1. If the page has a ``<div itemprop="description">``, return that
       element's ``.text`` (which may be ``None`` when the div has no
       leading text).
    2. Otherwise read the ``content`` of ``<meta name="twitter:description">``.
       If any element's stripped text starts with that value but is longer,
       return the longer text; it is presumably the untruncated description.
    3. Otherwise return the stripped twitter description itself.

    :param html: page markup as a string.
    :return: the description, or ``None`` when none can be found.
    """
    root = lxml.html.fromstring(html)
    div = root.find('.//div[@itemprop="description"]')
    if div is not None:
        return div.text

    meta_twitter_description = root.find('.//meta[@name="twitter:description"]')
    if meta_twitter_description is None:
        # Pages without the meta tag used to raise AttributeError here.
        return None

    twitter_description = (meta_twitter_description.get('content') or '').strip()
    if not twitter_description:
        return None

    # iter() replaces the deprecated getiterator(); same document order.
    for element in root.iter():
        text = (element.text or '').strip()
        if text != twitter_description and text.startswith(twitter_description):
            return text

    return twitter_description
@app.route("/item/Q<int:item_id>") @app.route("/item/Q<int:item_id>")
def item_page(item_id): def item_page(item_id):
qid = f'Q{item_id}' qid = f'Q{item_id}'
@ -421,6 +457,15 @@ def item_page(item_id):
label = label_and_language['label'] label = label_and_language['label']
other = get_other(item.entity) other = get_other(item.entity)
if 'P276' in entity['claims']:
location = first_datavalue(entity, 'P276')['id']
institution = other[location]
elif 'P195' in entity['claims']:
collection = first_datavalue(entity, 'P195')['id']
institution = other[collection]
else:
institution = '???'
painting_item = PaintingItem.query.get(item_id) painting_item = PaintingItem.query.get(item_id)
if painting_item is None: if painting_item is None:
painting_item = PaintingItem(item_id=item_id, label=label, entity=entity) painting_item = PaintingItem(item_id=item_id, label=label, entity=entity)
@ -452,6 +497,14 @@ def item_page(item_id):
elif catalog_url and 'www.museodelprado.es' in catalog_url: elif catalog_url and 'www.museodelprado.es' in catalog_url:
catalog = museodelprado.get_catalog(catalog_url) catalog = museodelprado.get_catalog(catalog_url)
if not catalog and catalog_url:
html = get_catalog_url(catalog_url)
description = get_description_from_page(html)
catalog = {
'institution': institution,
'description': description,
}
if not catalog and catalog_ids: if not catalog and catalog_ids:
for property_id in sorted(catalog_ids): for property_id in sorted(catalog_ids):
if property_id == 'P350': if property_id == 'P350':

16
depicts/relaxed_ssl.py Normal file
View file

@ -0,0 +1,16 @@
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.ssl_ import create_urllib3_context
CIPHERS = 'DEFAULT@SECLEVEL=1'
class HTTPSAdapter(HTTPAdapter):
    """Transport adapter whose pool manager uses an SSL context built from
    the module-level ``CIPHERS`` string."""

    def init_poolmanager(self, *args, **kwargs):
        """Build the pool manager with a custom ``ssl_context``.

        All arguments are forwarded to ``HTTPAdapter.init_poolmanager``;
        any ``ssl_context`` supplied by the caller is overwritten.
        """
        kwargs.update(ssl_context=create_urllib3_context(ciphers=CIPHERS))
        return super().init_poolmanager(*args, **kwargs)
def get(*args, **kwargs):
    """Send a GET request with ``HTTPSAdapter`` handling ``https://`` URLs.

    A fresh session is created for each call. All positional and keyword
    arguments are passed straight to ``requests.Session.get``.

    :return: the response object returned by ``Session.get``.
    """
    # Closing the session on exit releases its connection pools, the same
    # pattern requests' own top-level helpers use for one-shot requests.
    with requests.Session() as s:
        s.mount('https://', HTTPSAdapter())
        return s.get(*args, **kwargs)