Refactor: move more catalog code to module
This commit is contained in:
parent
8cbd0f246f
commit
6a6b5ad373
39
app.py
39
app.py
|
@ -3,8 +3,7 @@
|
||||||
from flask import Flask, render_template, url_for, redirect, request, g, jsonify, session
|
from flask import Flask, render_template, url_for, redirect, request, g, jsonify, session
|
||||||
from depicts import (utils, wdqs, commons, mediawiki, painting, saam, database,
|
from depicts import (utils, wdqs, commons, mediawiki, painting, saam, database,
|
||||||
dia, rijksmuseum, npg, museodelprado, barnesfoundation,
|
dia, rijksmuseum, npg, museodelprado, barnesfoundation,
|
||||||
wd_catalog, relaxed_ssl, human, wikibase, wikidata_oauth,
|
wd_catalog, human, wikibase, wikidata_oauth, parse_catalog)
|
||||||
parse_catalog)
|
|
||||||
from depicts.pager import Pagination, init_pager
|
from depicts.pager import Pagination, init_pager
|
||||||
from depicts.model import (DepictsItem, DepictsItemAltLabel, Edit, PaintingItem,
|
from depicts.model import (DepictsItem, DepictsItemAltLabel, Edit, PaintingItem,
|
||||||
Language)
|
Language)
|
||||||
|
@ -14,7 +13,6 @@ from werkzeug.exceptions import InternalServerError
|
||||||
from werkzeug.debug.tbtools import get_current_traceback
|
from werkzeug.debug.tbtools import get_current_traceback
|
||||||
from sqlalchemy import func, distinct
|
from sqlalchemy import func, distinct
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
import hashlib
|
|
||||||
import requests.exceptions
|
import requests.exceptions
|
||||||
import requests
|
import requests
|
||||||
import json
|
import json
|
||||||
|
@ -308,37 +306,6 @@ def image_with_cache(qid, image_filename, width):
|
||||||
|
|
||||||
return detail[image_filename]
|
return detail[image_filename]
|
||||||
|
|
||||||
def get_catalog_page(property_id, value):
|
|
||||||
detail = wd_catalog.lookup(property_id, value)
|
|
||||||
url = detail['url']
|
|
||||||
catalog_id = value.replace('/', '_')
|
|
||||||
|
|
||||||
filename = f'cache/{property_id}_{catalog_id}.html'
|
|
||||||
|
|
||||||
if os.path.exists(filename):
|
|
||||||
html = open(filename, 'rb').read()
|
|
||||||
else:
|
|
||||||
r = requests.get(url, headers={'User-Agent': user_agent}, timeout=2)
|
|
||||||
html = r.content
|
|
||||||
open(filename, 'wb').write(html)
|
|
||||||
|
|
||||||
return html
|
|
||||||
|
|
||||||
def get_catalog_url(url):
|
|
||||||
md5_filename = hashlib.md5(url.encode('utf-8')).hexdigest() + '.html'
|
|
||||||
filename = 'cache/' + md5_filename
|
|
||||||
|
|
||||||
if os.path.exists(filename):
|
|
||||||
html = open(filename, 'rb').read()
|
|
||||||
else:
|
|
||||||
r = relaxed_ssl.get(url,
|
|
||||||
headers={'User-Agent': user_agent},
|
|
||||||
timeout=2)
|
|
||||||
html = r.content
|
|
||||||
open(filename, 'wb').write(html)
|
|
||||||
|
|
||||||
return html
|
|
||||||
|
|
||||||
def existing_depicts_from_entity(entity):
|
def existing_depicts_from_entity(entity):
|
||||||
if 'P180' not in entity['claims']:
|
if 'P180' not in entity['claims']:
|
||||||
return []
|
return []
|
||||||
|
@ -430,7 +397,7 @@ def item_page(item_id):
|
||||||
catalog = museodelprado.get_catalog(catalog_url)
|
catalog = museodelprado.get_catalog(catalog_url)
|
||||||
|
|
||||||
if not catalog and catalog_url:
|
if not catalog and catalog_url:
|
||||||
html = get_catalog_url(catalog_url)
|
html = parse_catalog.get_catalog_url(catalog_url)
|
||||||
description = parse_catalog.get_description_from_page(html)
|
description = parse_catalog.get_description_from_page(html)
|
||||||
if description:
|
if description:
|
||||||
catalog = {
|
catalog = {
|
||||||
|
@ -445,7 +412,7 @@ def item_page(item_id):
|
||||||
value = wikibase.first_datavalue(entity, property_id)
|
value = wikibase.first_datavalue(entity, property_id)
|
||||||
detail = wd_catalog.lookup(property_id, value)
|
detail = wd_catalog.lookup(property_id, value)
|
||||||
try:
|
try:
|
||||||
html = get_catalog_page(property_id, value)
|
html = parse_catalog.get_catalog_page(property_id, value)
|
||||||
except (requests.exceptions.ConnectionError, requests.exceptions.SSLError):
|
except (requests.exceptions.ConnectionError, requests.exceptions.SSLError):
|
||||||
continue # ignore this error
|
continue # ignore this error
|
||||||
description = parse_catalog.get_description_from_page(html)
|
description = parse_catalog.get_description_from_page(html)
|
||||||
|
|
|
@ -1,4 +1,11 @@
|
||||||
|
from depicts import wd_catalog, relaxed_ssl
|
||||||
|
|
||||||
import lxml.html
|
import lxml.html
|
||||||
|
import os.path
|
||||||
|
import requests
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
user_agent = 'Mozilla/5.0 (X11; Linux i586; rv:32.0) Gecko/20160101 Firefox/32.0'
|
||||||
|
|
||||||
def get_description_from_page(html):
|
def get_description_from_page(html):
|
||||||
root = lxml.html.fromstring(html)
|
root = lxml.html.fromstring(html)
|
||||||
|
@ -27,3 +34,34 @@ def get_description_from_page(html):
|
||||||
return text
|
return text
|
||||||
|
|
||||||
return twitter_description
|
return twitter_description
|
||||||
|
|
||||||
|
def get_catalog_page(property_id, value):
    """Return the raw HTML of a catalog page, fetching it at most once.

    The page URL comes from ``wd_catalog.lookup(property_id, value)``. The
    response body is stored under ``cache/`` and later calls with the same
    arguments are served from that file.

    Args:
        property_id: Property identifier; used in the lookup and as the
            first part of the cache filename.
        value: Catalog identifier for that property (a string); used in the
            lookup and, with '/' replaced, in the cache filename.

    Returns:
        The page content as bytes.

    Raises:
        Exceptions raised by ``requests.get`` (connection errors, timeouts,
        ...) are not caught here and propagate to the caller.
    """
    detail = wd_catalog.lookup(property_id, value)
    url = detail['url']
    # A '/' in the identifier would be treated as a directory separator
    # in the cache path, so flatten it.
    catalog_id = value.replace('/', '_')

    filename = f'cache/{property_id}_{catalog_id}.html'

    if os.path.exists(filename):
        # Context manager so the file handle is closed deterministically.
        with open(filename, 'rb') as cache_file:
            return cache_file.read()

    r = requests.get(url, headers={'User-Agent': user_agent}, timeout=2)
    html = r.content
    # NOTE: the body is cached whatever the HTTP status code is; no status
    # check is made before writing.
    with open(filename, 'wb') as cache_file:
        cache_file.write(html)

    return html
|
||||||
|
|
||||||
|
def get_catalog_url(url):
    """Return the raw HTML at ``url``, fetching it at most once.

    The cache file is named after the MD5 hex digest of the UTF-8 encoded
    URL and lives under ``cache/``. When that file exists its content is
    returned and no request is made.

    Args:
        url: Address of the catalog page (a string).

    Returns:
        The page content as bytes.

    Raises:
        Exceptions raised by ``relaxed_ssl.get`` are not caught here and
        propagate to the caller.
    """
    # MD5 is used only to derive a filesystem-safe name from the URL.
    md5_filename = hashlib.md5(url.encode('utf-8')).hexdigest() + '.html'
    filename = 'cache/' + md5_filename

    if os.path.exists(filename):
        # Context manager so the file handle is closed deterministically.
        with open(filename, 'rb') as cache_file:
            return cache_file.read()

    # NOTE(review): relaxed_ssl.get is presumably a requests-style GET with
    # looser TLS verification - confirm in depicts.relaxed_ssl.
    r = relaxed_ssl.get(url,
                        headers={'User-Agent': user_agent},
                        timeout=2)
    html = r.content
    # NOTE: the body is cached whatever the HTTP status code is; no status
    # check is made before writing.
    with open(filename, 'wb') as cache_file:
        cache_file.write(html)

    return html
|
||||||
|
|
Loading…
Reference in a new issue