43 lines
960 B
Python
43 lines
960 B
Python
import requests
|
|
from .model import Item
|
|
import os.path
|
|
import re
|
|
|
|
# Project root: the directory two levels above this file
# (os.path.split(p)[0] is the same value os.path.dirname(p) returns).
project_dir = os.path.split(os.path.split(__file__)[0])[0]

# Path of the 'cache' directory directly under the project root.
cache_location = os.path.join(project_dir, 'cache')

# Matches a run of one or more '/' or ':' characters.
re_colon_slash = re.compile('[/:]+')
|
|
|
|
def url_filename(url):
    """Return *url* with every run of '/' and ':' replaced by a single '_'."""
    flattened = re_colon_slash.sub('_', url)
    return flattened
|
|
|
|
def get_text(url):
    """Fetch the text for *url* and describe it as a dict.

    The returned dict has four keys: 'url' (the argument as given), 'text'
    (whatever get_url returns for it), 'heading' (the part of the URL after
    its last '/', or the whole URL when it contains no '/') and 'length'
    (len() of the text).
    """
    # The text is assumed to be UTF-8.
    body = get_url(url)

    # Everything after the final '/' serves as the heading.
    _, _, last_segment = url.rpartition('/')

    return dict(url=url, text=body, heading=last_segment, length=len(body))
|
|
|
|
def get_url(url):
    """Return the text for *url*, preferring a stored Item over the network.

    ``Item.from_external(url)`` is consulted first; when it returns a truthy
    item, that item's ``text`` attribute is returned and no HTTP request is
    made. Otherwise the URL is fetched with ``requests.get`` and the response
    body is decoded as UTF-8, with undecodable bytes replaced rather than
    raising.
    """
    item = Item.from_external(url)
    if item:
        return item.text

    # NOTE(review): no timeout and no HTTP status check are applied here, so
    # a stalled server blocks the caller and error pages come back as text.
    # Confirm what callers expect before tightening this.
    content = requests.get(url).content

    # errors='replace' keeps invalid byte sequences from raising
    # UnicodeDecodeError; 'utf-8' is the default codec, spelled out for clarity.
    return content.decode('utf-8', errors='replace')
|