commit f8454fa295ca51671d4e7c5974e19ea40ae233b2 Author: Edward Betts Date: Sun Nov 12 16:15:12 2023 +0100 Initial commit diff --git a/parse.py b/parse.py new file mode 100755 index 0000000..85c2700 --- /dev/null +++ b/parse.py @@ -0,0 +1,274 @@ +#!/usr/bin/python3 + +import hashlib +import re +import sys +import typing +import urllib.parse +from datetime import time + +import lxml.etree + +event_id = 1 +presenter_id = 1 + + +def md5sum(s: str) -> str: + """Generate hex md5sum.""" + return hashlib.md5(s.encode("utf-8")).hexdigest() + + +re_day_heading = re.compile( + r'

' + + r' .*(.*?) *

' +) + +re_time = re.compile(r"^\| ?(\d{2}):(\d{2})") +url = "https://meta.wikimedia.org/wiki/GLAM_Wiki_2023/Program/The_CC_Certificate_for_GLAM:_learn_about_it_by_becoming_part_of_a_human_sculpture_collection" + +# re_session = re.compile(r'\| colspan="\d+" rowspan="\d+" \|\[\[(.*)\|(.*)\]\]') +re_session = re.compile(r'\| (?:colspan="\d+" )?rowspan="(\d+)" \|(.*)') +re_colspan = re.compile(r'\| colspan="(\d+)') + +re_speaker = re.compile(r"(.*)") + + +class Session(typing.TypedDict): + """Session.""" + + name: str + start: time + duration: int + room: str + speakers: list[str] + + +GroupedSlots = dict[int, dict[str, list[Session]]] + + +def content_to_url(s: str) -> str: + try: + assert s.startswith("[[GLAM") + except AssertionError: + print(s) + raise + page_title = urllib.parse.quote(s.partition("|")[0][2:].replace(" ", "_")) + return "https://meta.wikimedia.org/wiki/" + page_title + + +meta = [ + ("title", "GLAM Wiki 2023"), + ("subtitle", "Galleries, Libraries, Archives, Museums, etc."), + ("venue", "University of the Republic of Uruguay"), + ("city", "Montevideo, Uruguay"), + ("start", "2023-11-16"), + ("end", "2023-11-18"), + ("days", 3), + ("day_change", "08:00"), + ("timeslot_duration", "00:15"), + ("time_zone_name", "America/Montevideo"), +] + +rooms = [ + "Auditorium", + "Posgrado 1 (110)", + "Posgrado 2 (111)", + "401", + "410", + "411", +] + + +def minutes_to_duration(mins: int) -> str: + """Convert minutes to duration string.""" + return f"{mins // 60:02d}:{mins % 60:02d}" + + +def build_event(item: Session, room: lxml.etree._Element) -> None: + global event_id + """Build an event element.""" + if item["name"][0] != "[" or item["name"].startswith("[[Event:"): + return None + name: str = item["name"].partition("|")[2][:-2] + event_data = [ + ("start", str(item["start"])), + ("duration", minutes_to_duration(item["duration"])), + ("room", item["room"]), + ("title", name), + ("subtitle", ""), + # ("track", item["track"] or ""), + ("language", ""), + # ("abstract", item["abstract"]), + ("description", ""), + ("url", content_to_url(item["name"])), + ] + + event = lxml.etree.SubElement(room, "event", id=str(event_id)) + event_id += 1 + + for key, value in event_data: + lxml.etree.SubElement(event, key).text = value + + persons = lxml.etree.SubElement(event, "persons") + for s in item["speakers"]: + lxml.etree.SubElement(persons, "person", id=md5sum(s)).text = s + + +def read_wikitext(filename: str) -> GroupedSlots: + group_by_day_and_room: GroupedSlots = { + index: {room: [] for room in rooms} for index in range(1, 5) + } + + expect = "h2" # state machine + start_time = None + day_index = 0 + col: int = 0 + current: Session | None = None + room: str | None = None + for line in open(filename): + if expect == "h2" and line.startswith("

lxml.etree._Element: + """Generate the schedule XML.""" + root = lxml.etree.Element("schedule") + conf = lxml.etree.SubElement(root, "conference") + + for key, value in meta: + lxml.etree.SubElement(conf, key).text = str(value) + + days = [(index, f"2023-11-{16 + index}") for index in range(1, 4)] + for index, d in days: + day = lxml.etree.SubElement(root, "day") + day.set("index", str(index)) + day.set("date", d) + + for room_name in rooms: + room = lxml.etree.SubElement(day, "room", name=room_name) + + for slot in grouped[index][room_name]: + build_event(slot, room) + + return root + + +def main(filename: str) -> None: + """Parse JSON and convert to XML.""" + root = generate_schedule(read_wikitext(filename)) + + as_xml = lxml.etree.tostring( + root, xml_declaration=True, encoding="utf-8", pretty_print=True + ) + + print(as_xml.decode("utf-8"), end="") + + +if __name__ == "__main__": + main(sys.argv[1])