Get description from catalog page.

This commit is contained in:
Edward Betts 2019-09-30 12:49:28 +01:00
parent 514c55cd6e
commit faf70a8811
2 changed files with 70 additions and 1 deletions

55
app.py
View file

@ -3,7 +3,7 @@
from flask import Flask, render_template, url_for, redirect, request, g, jsonify, session
from depicts import (utils, wdqs, commons, mediawiki, painting, saam, database,
dia, rijksmuseum, npg, museodelprado, barnesfoundation,
wd_catalog)
wd_catalog, relaxed_ssl)
from depicts.pager import Pagination, init_pager
from depicts.model import (DepictsItem, DepictsItemAltLabel, Edit, PaintingItem,
Language)
@ -14,6 +14,7 @@ from werkzeug.exceptions import InternalServerError
from werkzeug.debug.tbtools import get_current_traceback
from sqlalchemy import func, distinct
from collections import defaultdict
import hashlib
import requests.exceptions
import requests
import lxml.html
@ -400,12 +401,47 @@ def get_catalog_page(property_id, value):
return html
def get_catalog_url(url):
md5_filename = hashlib.md5(url.encode('utf-8')).hexdigest() + '.html'
filename = 'cache/' + md5_filename
if os.path.exists(filename):
html = open(filename).read()
else:
r = relaxed_ssl.get(url,
headers={'User-Agent': user_agent},
timeout=2)
html = r.text
open(filename, 'w').write(html)
return html
def get_description_from_page(html):
root = lxml.html.fromstring(html)
div = root.find('.//div[@itemprop="description"]')
if div is not None:
return div.text
meta_twitter_description = root.find('.//meta[@name="twitter:description"]')
twitter_description = meta_twitter_description.get('content')
if not twitter_description:
return
twitter_description = twitter_description.strip()
if not twitter_description:
return
for element in root.getiterator():
if not element.text:
continue
text = element.text.strip()
if not text:
continue
if text != twitter_description and text.startswith(twitter_description):
return text
return twitter_description
@app.route("/item/Q<int:item_id>")
def item_page(item_id):
qid = f'Q{item_id}'
@ -421,6 +457,15 @@ def item_page(item_id):
label = label_and_language['label']
other = get_other(item.entity)
if 'P276' in entity['claims']:
location = first_datavalue(entity, 'P276')['id']
institution = other[location]
elif 'P195' in entity['claims']:
collection = first_datavalue(entity, 'P195')['id']
institution = other[collection]
else:
institution = '???'
painting_item = PaintingItem.query.get(item_id)
if painting_item is None:
painting_item = PaintingItem(item_id=item_id, label=label, entity=entity)
@ -452,6 +497,14 @@ def item_page(item_id):
elif catalog_url and 'www.museodelprado.es' in catalog_url:
catalog = museodelprado.get_catalog(catalog_url)
if not catalog and catalog_url:
html = get_catalog_url(catalog_url)
description = get_description_from_page(html)
catalog = {
'institution': institution,
'description': description,
}
if not catalog and catalog_ids:
for property_id in sorted(catalog_ids):
if property_id == 'P350':

16
depicts/relaxed_ssl.py Normal file
View file

@ -0,0 +1,16 @@
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.ssl_ import create_urllib3_context
CIPHERS = 'DEFAULT@SECLEVEL=1'
class HTTPSAdapter(HTTPAdapter):
def init_poolmanager(self, *args, **kwargs):
context = create_urllib3_context(ciphers=CIPHERS)
kwargs['ssl_context'] = context
return super().init_poolmanager(*args, **kwargs)
def get(*args, **kwargs):
s = requests.Session()
s.mount('https://', HTTPSAdapter())
return s.get(*args, **kwargs)