# sourcing/sourcing/url.py

import requests
from .model import Item
import os.path
import re

# Directory two levels up from this file: the parent of the directory
# that contains this module.
project_dir = os.path.dirname(os.path.split(__file__)[0])

# Path of the 'cache' directory directly under project_dir.
cache_location = os.path.join(project_dir, 'cache')

# Matches one or more consecutive '/' or ':' characters.
re_colon_slash = re.compile(r'[/:]+')

def url_filename(url):
    """Return *url* with each run of '/' and ':' characters replaced by '_'.

    For example, 'http://example.com/a' becomes 'http_example.com_a'.
    """
    flattened = re_colon_slash.sub('_', url)
    return flattened

def get_text(url):
    """Fetch the text behind *url* and bundle it with basic metadata.

    Returns a dict with these keys:
        'url':     the URL exactly as passed in,
        'text':    whatever get_url(url) returned,
        'heading': the part of the URL after its last '/' (the whole URL
                   when it contains no '/'),
        'length':  len() of the text.
    """
    # The text is taken as-is from get_url; it is assumed to be UTF-8.
    body = get_url(url)

    # Everything after the final slash serves as the heading.
    _, _, last_segment = url.rpartition('/')

    return dict(
        url=url,
        text=body,
        heading=last_segment,
        length=len(body),
    )

def get_url(url, timeout=None):
    """Return the text for *url*, preferring a stored Item over the network.

    ``Item.from_external(url)`` is consulted first. If it yields a truthy
    item, that item's ``text`` is returned and no HTTP request is made.
    Otherwise the URL is fetched with ``requests`` and the response body is
    decoded as UTF-8; bytes that cannot be decoded are replaced with U+FFFD
    instead of raising ``UnicodeDecodeError``.

    Args:
        url: The URL to look up or download.
        timeout: Optional timeout in seconds, passed straight through to
            ``requests.get``. ``None`` (the default) applies no timeout.

    Returns:
        The item's text, or the decoded response body.

    Raises:
        requests.RequestException: if the HTTP request itself fails.
    """
    item = Item.from_external(url)
    if item:
        return item.text

    # NOTE(review): the HTTP status is not checked, so the body of an error
    # response is returned like any other. The result is not cached on disk.
    response = requests.get(url, timeout=timeout)
    return response.content.decode('utf-8', errors='replace')
No results found.