"""Parse EDL text and assemble the referenced source spans into HTML."""

from collections import defaultdict
from html import escape
import re

from .url import get_url, get_text
from .parse import (get_span, parse_span, parse_link, parse_sourcedoc_facet,
                    xanadoc_span_html, span_html, get_urls)
from .utils import protect_start_spaces

re_comment = re.compile(r'#.*')
re_xanalink = re.compile('xanalink: +([^ ]+) *$')
max_sourcedoc_size = 600000

# NOTE(review): this module was reformatted from a whitespace-collapsed copy
# in which the markup inside the literals below appears to have been stripped
# (they were empty, and the line-break replacement was split across a raw
# newline). The visible characters are preserved as-is; restore the intended
# markup from version control before relying on the rendered output.
_LINK_OPEN = ''         # passed through str.format with the link number
_LINK_CLOSE = ''        # closes _LINK_OPEN
_SPAN_CLOSE = ''        # closes whatever span_html() opened
_LINE_BREAK = '\n\n'    # replacement for each '\n' in rendered text


def fulfil_edl(edl):
    """Yield ``get_span(...)`` for every span pointer in *edl*, in order.

    Each distinct source URL is fetched once with ``get_text`` and reused for
    later spans that point at the same URL.
    """
    text = {}
    for url, start, length in parse_edl(edl)['spans']:
        if url not in text:
            text[url] = get_text(url)
        yield get_span(text, url, start, length)


def parse_edl(edl_text):
    """Parse EDL text into span pointers and links.

    Everything from ``#`` to the end of a line is dropped as a comment and
    blank lines are skipped. A line that ``parse_span`` accepts is recorded as
    a span pointer; otherwise a ``xanalink: <url>`` line is recorded as a link
    and its content is fetched immediately with ``get_url``. Lines matching
    neither form are ignored.

    Returns a dict with keys ``'spans'`` (list of span pointers) and
    ``'links'`` (list of dicts with keys ``'url'`` and ``'text'``).
    """
    edl = {
        'spans': [],
        'links': [],
    }
    for line in edl_text.splitlines():
        line = re_comment.sub('', line).strip()
        if not line:
            continue
        span_pointer = parse_span(line)
        if span_pointer:
            edl['spans'].append(span_pointer)
            continue
        m = re_xanalink.match(line)
        if m:
            link_url = m.group(1)
            edl['links'].append({
                'url': link_url,
                'text': get_url(link_url),
            })
    return edl


def _render_doc_span(span_text, links, start, length):
    """Return escaped ``span_text[start:start + length]`` with links wrapped.

    *links* is the sorted list of ``(link_start, link_len, link_num,
    span_type)`` tuples for the same source document. Each link is clipped to
    the span and to the text already emitted, so the result never contains
    characters outside the span and never repeats characters when links
    overlap. Links with no characters left after clipping are skipped.
    """
    end = start + length
    pieces = []
    pos = start
    for link_start, link_len, link_num, _span_type in links:
        if link_start >= end:
            break  # links are sorted by start; none of the rest can overlap
        clip_start = max(link_start, pos)
        clip_end = min(link_start + link_len, end)
        if clip_start >= clip_end:
            continue
        pieces.append(escape(span_text[pos:clip_start]))
        pieces.append(_LINK_OPEN.format(link_num))
        pieces.append(escape(span_text[clip_start:clip_end]))
        pieces.append(_LINK_CLOSE)
        pos = clip_end
    pieces.append(escape(span_text[pos:end]))
    return ''.join(pieces)


def _render_source_text(text, source_spans):
    """Return escaped *text* with each of the sorted *source_spans* marked up.

    NOTE(review): ranges are not clipped against each other here, so when two
    spans overlap (e.g. a link over transcluded text) the shared characters
    are emitted once per span. Left unchanged because a correct fix needs
    nested markup, which depends on what ``span_html`` produces.
    """
    pieces = []
    pos = 0
    for start, length, num, span_type in source_spans:
        end = start + length
        pieces.append(escape(text[pos:start]))
        pieces.append(span_html(span_type, num))
        pieces.append(escape(text[start:end]))
        pieces.append(_SPAN_CLOSE)
        pos = end
    pieces.append(escape(text[pos:]))
    return ''.join(pieces).replace('\n', _LINE_BREAK)


def fulfil_edl_with_sources(edl_text):
    """Render an EDL as a document plus its marked-up source documents.

    Returns a dict with keys:

    * ``'source'`` - the dicts returned by ``get_text`` for every source
      document used by a span or by a two-facet link. ``'text'`` is replaced
      by the rendered markup, or removed altogether when the document is
      longer than ``max_sourcedoc_size`` or is the target of a
      ``HideTransclusions`` link.
    * ``'doc'`` - the concatenated output of ``xanadoc_span_html`` for each
      span, with newlines replaced.
    * ``'span_count'`` - number of spans in the EDL.
    * ``'link_count'`` - number of links that have exactly two facets.
    """
    edl = parse_edl(edl_text)
    spans = edl['spans']

    hide_transclusions = set()
    two_facet_links = []
    # NOTE(review): the increment is placed inside the two-facet branch, so
    # link numbers run 0..link_count-1; confirm against the original layout.
    link_num = 0
    for link in edl['links']:
        link_detail = parse_link(link['text'])
        if link_detail['type'] == 'HideTransclusions':
            hide_transclusions.add(
                parse_sourcedoc_facet(link_detail['facets'][0]))
        elif len(link_detail['facets']) == 2:
            facets = [parse_span(span[0]) for span in link_detail['facets']]
            two_facet_links.append((link_num, facets))
            link_num += 1

    source = [get_text(url) for url in get_urls(spans)]
    source_text = {s['url']: s['text'] for s in source}

    # Group link facets by the source document they point into, fetching any
    # document that no span has already pulled in.
    source_doc_links = defaultdict(list)
    for link_num, facets in two_facet_links:
        for url, start, length in facets:
            source_doc_links[url].append((start, length, link_num, 'link'))
            if url in source_text:
                continue
            s = get_text(url)
            source.append(s)
            source_text[s['url']] = s['text']

    for links in source_doc_links.values():
        links.sort()

    spans = list(enumerate(spans))

    doc_spans = []
    for num, (url, start, length) in spans:
        highlight = url not in hide_transclusions
        new_text = _render_doc_span(source_text[url], source_doc_links[url],
                                    start, length)
        doc_spans.append(xanadoc_span_html(num, new_text, url, start, length,
                                           highlight=highlight))
    doc = ''.join(doc_spans)

    for s in source:
        # The raw text is always removed; it is only put back (rendered) for
        # documents that are small enough and not hidden.
        text = protect_start_spaces(s.pop('text'))
        if s['length'] > max_sourcedoc_size:
            continue
        if s['url'] in hide_transclusions:
            continue

        source_spans = [(start, length, num, 'transclusion')
                        for num, (url, start, length) in spans
                        if url == s['url']]
        source_spans += source_doc_links[s['url']]
        source_spans.sort()

        s['text'] = _render_source_text(text, source_spans)

    return {
        'source': source,
        'doc': doc.replace('\n', _LINE_BREAK),
        'span_count': len(spans),
        'link_count': len(two_facet_links),
    }