Get description from catalog page.
This commit is contained in:
parent
514c55cd6e
commit
faf70a8811
55
app.py
55
app.py
|
@ -3,7 +3,7 @@
|
|||
from flask import Flask, render_template, url_for, redirect, request, g, jsonify, session
|
||||
from depicts import (utils, wdqs, commons, mediawiki, painting, saam, database,
|
||||
dia, rijksmuseum, npg, museodelprado, barnesfoundation,
|
||||
wd_catalog)
|
||||
wd_catalog, relaxed_ssl)
|
||||
from depicts.pager import Pagination, init_pager
|
||||
from depicts.model import (DepictsItem, DepictsItemAltLabel, Edit, PaintingItem,
|
||||
Language)
|
||||
|
@ -14,6 +14,7 @@ from werkzeug.exceptions import InternalServerError
|
|||
from werkzeug.debug.tbtools import get_current_traceback
|
||||
from sqlalchemy import func, distinct
|
||||
from collections import defaultdict
|
||||
import hashlib
|
||||
import requests.exceptions
|
||||
import requests
|
||||
import lxml.html
|
||||
|
@ -400,12 +401,47 @@ def get_catalog_page(property_id, value):
|
|||
|
||||
return html
|
||||
|
||||
def get_catalog_url(url):
    """Return the HTML of a catalog page, using an on-disk cache.

    The cache file lives under the relative ``cache/`` directory and is named
    after the MD5 hex digest of the URL (used only as a filename key, not for
    security).  On a cache miss the page is fetched with ``relaxed_ssl.get``
    using the ``user_agent`` defined elsewhere in this module and a 2 second
    timeout, then stored for later calls.

    :param url: address of the catalog page.
    :returns: the page HTML as a string.
    """
    md5_filename = hashlib.md5(url.encode('utf-8')).hexdigest() + '.html'
    filename = 'cache/' + md5_filename

    if os.path.exists(filename):
        # Context manager so the handle is closed promptly instead of being
        # left for the garbage collector.
        with open(filename, encoding='utf-8') as cached:
            return cached.read()

    r = relaxed_ssl.get(url,
                        headers={'User-Agent': user_agent},
                        timeout=2)
    html = r.text

    # NOTE(review): the response status is not checked, so error pages are
    # cached as well - confirm whether that is intended.
    # Make sure the cache directory exists so a successful fetch is not lost
    # to a FileNotFoundError when writing.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as out:
        out.write(html)

    return html
|
||||
|
||||
def get_description_from_page(html):
    """Extract a description of the artwork from a catalog page.

    Lookup order:

    1. The text of the first ``<div itemprop="description">``.
    2. Otherwise the ``content`` of ``<meta name="twitter:description">``.
       That value may be a shortened form of the real description, so the
       document is scanned for an element whose text starts with it and is
       longer; the first such text is returned in preference.

    :param html: page source as a string.
    :returns: the description string, or ``None`` when the page is empty or
        carries no usable description.
    """
    # lxml raises on an empty document; treat that as "no description".
    if not html or not html.strip():
        return None

    root = lxml.html.fromstring(html)
    div = root.find('.//div[@itemprop="description"]')
    if div is not None:
        return div.text

    meta_twitter_description = root.find('.//meta[@name="twitter:description"]')
    # Pages without the meta tag previously crashed with AttributeError.
    if meta_twitter_description is None:
        return None

    twitter_description = (meta_twitter_description.get('content') or '').strip()
    if not twitter_description:
        return None

    # iter() is the supported spelling of the deprecated getiterator().
    for element in root.iter():
        text = (element.text or '').strip()
        # Empty text can never start with the non-empty description, so no
        # separate emptiness check is needed.
        if text != twitter_description and text.startswith(twitter_description):
            return text

    return twitter_description
|
||||
|
||||
@app.route("/item/Q<int:item_id>")
|
||||
def item_page(item_id):
|
||||
qid = f'Q{item_id}'
|
||||
|
@ -421,6 +457,15 @@ def item_page(item_id):
|
|||
label = label_and_language['label']
|
||||
other = get_other(item.entity)
|
||||
|
||||
if 'P276' in entity['claims']:
|
||||
location = first_datavalue(entity, 'P276')['id']
|
||||
institution = other[location]
|
||||
elif 'P195' in entity['claims']:
|
||||
collection = first_datavalue(entity, 'P195')['id']
|
||||
institution = other[collection]
|
||||
else:
|
||||
institution = '???'
|
||||
|
||||
painting_item = PaintingItem.query.get(item_id)
|
||||
if painting_item is None:
|
||||
painting_item = PaintingItem(item_id=item_id, label=label, entity=entity)
|
||||
|
@ -452,6 +497,14 @@ def item_page(item_id):
|
|||
elif catalog_url and 'www.museodelprado.es' in catalog_url:
|
||||
catalog = museodelprado.get_catalog(catalog_url)
|
||||
|
||||
if not catalog and catalog_url:
|
||||
html = get_catalog_url(catalog_url)
|
||||
description = get_description_from_page(html)
|
||||
catalog = {
|
||||
'institution': institution,
|
||||
'description': description,
|
||||
}
|
||||
|
||||
if not catalog and catalog_ids:
|
||||
for property_id in sorted(catalog_ids):
|
||||
if property_id == 'P350':
|
||||
|
|
16
depicts/relaxed_ssl.py
Normal file
16
depicts/relaxed_ssl.py
Normal file
|
@ -0,0 +1,16 @@
|
|||
import requests
|
||||
from requests.adapters import HTTPAdapter
|
||||
from requests.packages.urllib3.util.ssl_ import create_urllib3_context
|
||||
|
||||
# OpenSSL cipher string: the default cipher list evaluated at security level 1.
# NOTE(review): presumably chosen so that sites with weaker TLS settings can
# still be fetched - confirm.
CIPHERS = 'DEFAULT@SECLEVEL=1'
|
||||
|
||||
class HTTPSAdapter(HTTPAdapter):
    """Transport adapter whose connection pool uses the ``CIPHERS`` string."""

    def init_poolmanager(self, *args, **kwargs):
        """Create the pool manager with an SSL context built from ``CIPHERS``.

        Any ``ssl_context`` supplied by the caller is overwritten.
        """
        kwargs['ssl_context'] = create_urllib3_context(ciphers=CIPHERS)
        return super().init_poolmanager(*args, **kwargs)
|
||||
|
||||
def get(*args, **kwargs):
    """Send a GET request through a session that uses ``HTTPSAdapter``.

    Accepts the same positional and keyword arguments as
    ``requests.Session.get`` and returns its response object.

    The session is used as a context manager so its connection pool is
    released once the request completes, rather than leaking one pool per
    call.
    """
    with requests.Session() as s:
        s.mount('https://', HTTPSAdapter())
        return s.get(*args, **kwargs)
|
Loading…
Reference in a new issue