Refactor: move catalog parse code to module

This commit is contained in:
Edward Betts 2019-10-09 16:11:43 +01:00
parent a5e5fa20df
commit 8cbd0f246f
2 changed files with 33 additions and 32 deletions

36
app.py
View file

@ -3,7 +3,8 @@
from flask import Flask, render_template, url_for, redirect, request, g, jsonify, session
from depicts import (utils, wdqs, commons, mediawiki, painting, saam, database,
dia, rijksmuseum, npg, museodelprado, barnesfoundation,
wd_catalog, relaxed_ssl, human, wikibase, wikidata_oauth)
wd_catalog, relaxed_ssl, human, wikibase, wikidata_oauth,
parse_catalog)
from depicts.pager import Pagination, init_pager
from depicts.model import (DepictsItem, DepictsItemAltLabel, Edit, PaintingItem,
Language)
@ -16,7 +17,6 @@ from collections import defaultdict
import hashlib
import requests.exceptions
import requests
import lxml.html
import json
import os
import locale
@ -339,34 +339,6 @@ def get_catalog_url(url):
return html
def get_description_from_page(html):
root = lxml.html.fromstring(html)
div = root.find('.//div[@itemprop="description"]')
if div is not None:
return div.text
meta_twitter_description = root.find('.//meta[@name="twitter:description"]')
if meta_twitter_description is None:
return
twitter_description = meta_twitter_description.get('content')
if not twitter_description:
return
twitter_description = twitter_description.strip()
if not twitter_description:
return
for element in root.getiterator():
if not element.text:
continue
text = element.text.strip()
if not text:
continue
if text != twitter_description and text.startswith(twitter_description):
return text
return twitter_description
def existing_depicts_from_entity(entity):
if 'P180' not in entity['claims']:
return []
@ -459,7 +431,7 @@ def item_page(item_id):
if not catalog and catalog_url:
html = get_catalog_url(catalog_url)
description = get_description_from_page(html)
description = parse_catalog.get_description_from_page(html)
if description:
catalog = {
'institution': institution,
@ -476,7 +448,7 @@ def item_page(item_id):
html = get_catalog_page(property_id, value)
except (requests.exceptions.ConnectionError, requests.exceptions.SSLError):
continue # ignore this error
description = get_description_from_page(html)
description = parse_catalog.get_description_from_page(html)
if not description:
continue
catalog = {

29
depicts/parse_catalog.py Normal file
View file

@ -0,0 +1,29 @@
import lxml.html
def get_description_from_page(html):
root = lxml.html.fromstring(html)
div = root.find('.//div[@itemprop="description"]')
if div is not None:
return div.text
meta_twitter_description = root.find('.//meta[@name="twitter:description"]')
if meta_twitter_description is None:
return
twitter_description = meta_twitter_description.get('content')
if not twitter_description:
return
twitter_description = twitter_description.strip()
if not twitter_description:
return
for element in root.getiterator():
if not element.text:
continue
text = element.text.strip()
if not text:
continue
if text != twitter_description and text.startswith(twitter_description):
return text
return twitter_description