#!/usr/bin/python3
"""Convert the GLAM Wiki 2023 programme wikitext into a schedule XML document.

Usage: parse.py WIKITEXT_FILE

The wikitext is read from WIKITEXT_FILE and the XML is printed on stdout.
"""
# Reconstructed from the patch "Initial commit" (f8454fa), which created
# parse.py as a new file.

import hashlib
import re
import sys
import typing
import urllib.parse
from datetime import date, time, timedelta

import lxml.etree

# Next value for the ``id`` attribute of an <event>; advanced by build_event().
event_id = 1
# Not referenced anywhere in this file; kept so the module namespace is unchanged.
presenter_id = 1

# Length in minutes of one table row; a session's rowspan counts these slots.
SLOT_MINUTES = 15


def md5sum(s: str) -> str:
    """Generate hex md5sum."""
    return hashlib.md5(s.encode("utf-8")).hexdigest()


# Day heading line that precedes each programme table.
re_day_heading = re.compile(
    r'<h2 style="border-bottom: 0px;"><b>'
    + r'<span style="color: #2873b3;"> .*(.*?) *</span></b></h2>'
)

# First cell of a table row: the HH:MM start time of that row.
re_time = re.compile(r"^\| ?(\d{2}):(\d{2})")
# Sample session page URL; not referenced anywhere in this file.
url = "https://meta.wikimedia.org/wiki/GLAM_Wiki_2023/Program/The_CC_Certificate_for_GLAM:_learn_about_it_by_becoming_part_of_a_human_sculpture_collection"

# Session cell: group 1 is the rowspan, group 2 the cell content.
re_session = re.compile(r'\| (?:colspan="\d+" )?rowspan="(\d+)" \|(.*)')
re_colspan = re.compile(r'\| colspan="(\d+)')
# Cell with a single-digit rowspan and no content at all.
re_empty_rowspan = re.compile(r'\| rowspan="\d" \|\n')

re_speaker = re.compile(r"<small>(.*)</small>")


class Session(typing.TypedDict):
    """One programme session as read from a table cell."""

    name: str  # raw cell content, normally a ``[[page|label]]`` wikilink
    start: time  # start time of the table row the cell appears in
    duration: int  # minutes: rowspan * SLOT_MINUTES
    room: str  # one of ``rooms``, chosen by column position
    speakers: list[str]  # names taken from the <small> lines after the cell


# day index -> room name -> sessions in the order they appear in the wikitext
GroupedSlots = dict[int, dict[str, list[Session]]]


def _bad_line(line: str, expected: str) -> ValueError:
    """Build the error raised when *line* is not the *expected* kind of line."""
    return ValueError(f"expected {expected}, got {line!r}")


def content_to_url(s: str) -> str:
    """Return the meta.wikimedia.org URL for a ``[[GLAM ...|label]]`` wikilink.

    The page title is the text between ``[[`` and the first ``|``; spaces
    become underscores and the result is percent-quoted.

    Raises:
        ValueError: if *s* does not start with ``[[GLAM``.
    """
    if not s.startswith("[[GLAM"):
        raise ValueError(f"not a GLAM wikilink: {s!r}")
    page_title = urllib.parse.quote(s.partition("|")[0][2:].replace(" ", "_"))
    return "https://meta.wikimedia.org/wiki/" + page_title


# Children of the <conference> element, in output order.
meta = [
    ("title", "GLAM Wiki 2023"),
    ("subtitle", "Galleries, Libraries, Archives, Museums, etc."),
    ("venue", "University of the Republic of Uruguay"),
    ("city", "Montevideo, Uruguay"),
    ("start", "2023-11-16"),
    ("end", "2023-11-18"),
    ("days", 3),
    ("day_change", "08:00"),
    ("timeslot_duration", "00:15"),
    ("time_zone_name", "America/Montevideo"),
]

# Room names in table-column order (column 1 is the first entry).
rooms = [
    "Auditorium",
    "Posgrado 1 (110)",
    "Posgrado 2 (111)",
    "401",
    "410",
    "411",
]


def minutes_to_duration(mins: int) -> str:
    """Convert minutes to duration string."""
    return f"{mins // 60:02d}:{mins % 60:02d}"


def build_event(item: Session, room: lxml.etree._Element) -> None:
    """Append an <event> element for *item* to the *room* element.

    Sessions whose name is not a wikilink (does not start with ``[``) or that
    link into the ``Event:`` namespace are skipped without output. Each
    emitted event consumes one value of the module-level ``event_id``.

    Raises:
        ValueError: from content_to_url() if the link is not a ``[[GLAM`` link.
    """
    global event_id

    link = item["name"]
    if not link.startswith("[") or link.startswith("[[Event:"):
        return

    # Label part of ``[[page|label]]``: text after the first "|" minus "]]".
    title = link.partition("|")[2][:-2]
    # No track or abstract element is emitted.
    event_data = [
        ("start", str(item["start"])),
        ("duration", minutes_to_duration(item["duration"])),
        ("room", item["room"]),
        ("title", title),
        ("subtitle", ""),
        ("language", ""),
        ("description", ""),
        ("url", content_to_url(link)),
    ]

    event = lxml.etree.SubElement(room, "event", id=str(event_id))
    event_id += 1

    for key, value in event_data:
        lxml.etree.SubElement(event, key).text = value

    persons = lxml.etree.SubElement(event, "persons")
    for speaker in item["speakers"]:
        # The person id is the md5 of the name, so equal names share an id.
        lxml.etree.SubElement(persons, "person", id=md5sum(speaker)).text = speaker


def read_wikitext(filename: str) -> GroupedSlots:
    """Parse the programme wikitext into sessions grouped by day and room.

    The file is read line by line through a small state machine:

    * ``h2``        -- waiting for the next day heading
    * ``wikitable`` -- the line after a heading must open a wikitable
    * ``time``      -- first cell of a new row (the row's start time)
    * ``heading``   -- inside a row that carries no sessions; skip to next row
    * ``session``   -- reading session cells and their speaker lines

    Raises:
        ValueError: if a line does not have the form its state requires.
    """
    grouped: GroupedSlots = {
        index: {room: [] for room in rooms} for index in range(1, 5)
    }

    expect = "h2"  # state machine
    start_time: time | None = None
    day_index = 0
    col = 0  # 1-based column of the next cell in the current row
    current: Session | None = None

    with open(filename, encoding="utf-8") as wikitext:
        for line in wikitext:
            if expect == "h2" and line.startswith("<h2 "):
                if not re_day_heading.match(line):
                    raise _bad_line(line, "a day heading")
                day_index += 1
                # Tolerate more day headings than the pre-built buckets.
                grouped.setdefault(day_index, {room: [] for room in rooms})
                expect = "wikitable"
                continue

            if expect == "wikitable":
                if not line.startswith('{| class="wikitable"'):
                    raise _bad_line(line, "the start of a wikitable")
                expect = "session"
                continue

            if expect in ("session", "heading") and line == "|-\n":
                expect = "time"
                continue

            if expect == "time":
                # The order of these checks is significant.
                if line == "|\n" or line.startswith('| rowspan="3" |'):
                    expect = "heading"
                    continue
                if "Detailed program" in line:
                    expect = "session"
                    continue
                if line[0] == "!":
                    expect = "heading"
                    continue
                m = re_time.match(line)
                if not m:
                    raise _bad_line(line, "a row start time")
                start_time = time(int(m.group(1)), int(m.group(2)))
                col = 1
                expect = "session"
                continue

            # Everything below only applies while reading session cells.
            # The original `A and B or C` test let '| rowspan="' lines through
            # in every state; they are now only parsed in the session state.
            if expect != "session":
                continue

            if line == "|\n":
                # Empty cell: the column stays free in this row.
                col += 1
                continue

            if "small" in line and line.startswith(("''", "<small")):
                if current is None:
                    raise _bad_line(line, "a session before a speaker line")
                m = re_speaker.search(line)
                if not m:
                    raise _bad_line(line, "a <small> speaker element")
                speaker_name = m.group(1).strip("'")
                if speaker_name.startswith("("):
                    # A parenthesised line is appended to the previous speaker.
                    if not current["speakers"]:
                        raise _bad_line(line, "a speaker before a parenthesised note")
                    current["speakers"][-1] += " " + speaker_name
                else:
                    current["speakers"].append(speaker_name)
                continue

            if line.startswith(("| colspan", '| rowspan="')):
                room = rooms[col - 1]
                if line.startswith('| rowspan="'):
                    colspan = 1
                else:
                    m = re_colspan.match(line)
                    if not m:
                        raise _bad_line(line, "a colspan attribute")
                    colspan = int(m.group(1))
                # Advance the column even for cells that are skipped below.
                col += colspan

                if (
                    re_empty_rowspan.match(line)
                    or line == '| colspan="1" |\n'
                    or 'style="background-color:#' in line
                ):
                    continue

                m = re_session.match(line)
                if not m:
                    raise _bad_line(line, "a session cell")
                content = m.group(2)
                if not content:
                    continue
                if start_time is None:
                    raise _bad_line(line, "a row start time before a session cell")

                current = {
                    "name": content,
                    "start": start_time,
                    "duration": SLOT_MINUTES * int(m.group(1)),
                    "room": room,
                    "speakers": [],
                }
                grouped[day_index][room].append(current)
                continue

            if line == "|}\n":
                # End of this day's table; wait for the next day heading.
                expect = "h2"

    return grouped


def generate_schedule(grouped: GroupedSlots) -> lxml.etree._Element:
    """Generate the schedule XML.

    Emits a <conference> element from ``meta`` followed by one <day> per
    conference day. Day 1 is dated ``meta["start"]`` and each later day is
    one calendar day after the previous one. Every day holds one <room>
    element per entry in ``rooms``, filled by build_event().
    """
    root = lxml.etree.Element("schedule")
    conf = lxml.etree.SubElement(root, "conference")

    for key, value in meta:
        lxml.etree.SubElement(conf, key).text = str(value)

    conf_meta = dict(meta)
    first_day = date.fromisoformat(str(conf_meta["start"]))
    day_count = int(conf_meta["days"])

    for index in range(1, day_count + 1):
        day = lxml.etree.SubElement(root, "day")
        day.set("index", str(index))
        day.set("date", (first_day + timedelta(days=index - 1)).isoformat())

        slots_by_room = grouped.get(index, {})
        for room_name in rooms:
            room = lxml.etree.SubElement(day, "room", name=room_name)

            for slot in slots_by_room.get(room_name, []):
                build_event(slot, room)

    return root


def main(filename: str) -> None:
    """Read the wikitext in *filename* and print the schedule XML on stdout."""
    root = generate_schedule(read_wikitext(filename))

    as_xml = lxml.etree.tostring(
        root, xml_declaration=True, encoding="utf-8", pretty_print=True
    )

    print(as_xml.decode("utf-8"), end="")


if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} WIKITEXT_FILE")
    main(sys.argv[1])