From f8454fa295ca51671d4e7c5974e19ea40ae233b2 Mon Sep 17 00:00:00 2001
From: Edward Betts <edward@4angle.com>
Date: Sun, 12 Nov 2023 16:15:12 +0100
Subject: [PATCH] Initial commit

---
 parse.py | 274 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 274 insertions(+)
 create mode 100755 parse.py

diff --git a/parse.py b/parse.py
new file mode 100755
index 0000000..85c2700
--- /dev/null
+++ b/parse.py
@@ -0,0 +1,274 @@
+#!/usr/bin/python3
+
+import hashlib
+import re
+import sys
+import typing
+import urllib.parse
+from datetime import time
+
+import lxml.etree
+
+# Next id to assign to an <event> element; build_event() increments it
+# after every event it emits.
+event_id = 1
+# Not referenced anywhere else in the visible part of this file.
+presenter_id = 1
+
+
+def md5sum(s: str) -> str:
+    """Return the hexadecimal MD5 digest of ``s`` encoded as UTF-8."""
+    digest = hashlib.md5()
+    digest.update(s.encode("utf-8"))
+    return digest.hexdigest()
+
+
+# Matches the styled <h2> line that opens each day's section.  read_wikitext()
+# only checks that the line matches; the capture group is never read.
+# NOTE(review): group 1 appears to always be empty because the greedy ".*"
+# in front of it consumes the heading text -- confirm before relying on it.
+re_day_heading = re.compile(
+    r'<h2 style="border-bottom: 0px;"><b>'
+    + r'<span style="color: #2873b3;"> .*(.*?) *</span></b></h2>'
+)
+
+# Table cell at the start of a row holding a time: "|", optional space, HH:MM.
+# Group 1 is the hour, group 2 the minutes.
+re_time = re.compile(r"^\| ?(\d{2}):(\d{2})")
+# Not referenced elsewhere in the visible part of this file; presumably a
+# sample session page kept for reference.
+url = "https://meta.wikimedia.org/wiki/GLAM_Wiki_2023/Program/The_CC_Certificate_for_GLAM:_learn_about_it_by_becoming_part_of_a_human_sculpture_collection"
+
+# Earlier, stricter version of the session pattern, kept commented out.
+# re_session = re.compile(r'\| colspan="\d+" rowspan="\d+" \|\[\[(.*)\|(.*)\]\]')
+# Session cell: optional colspan, mandatory rowspan (group 1), then the cell
+# content up to the end of the line (group 2).
+re_session = re.compile(r'\| (?:colspan="\d+" )?rowspan="(\d+)" \|(.*)')
+# Leading colspan attribute of a cell; group 1 is the number of columns.
+re_colspan = re.compile(r'\| colspan="(\d+)')
+
+# Text wrapped in <small>...</small>; read_wikitext() treats it as a speaker.
+re_speaker = re.compile(r"<small>(.*)</small>")
+
+
+class Session(typing.TypedDict):
+    """One scheduled session as recorded by read_wikitext()."""
+
+    # Raw cell content captured by re_session (wikilink markup included).
+    name: str
+    # Time parsed from the most recent time cell seen before this session.
+    start: time
+    # Length in minutes; read_wikitext() stores 15 * the cell's rowspan.
+    duration: int
+    # Entry of ``rooms`` selected by the column the cell was found in.
+    room: str
+    # Speaker names collected from the <small> lines that follow the cell.
+    speakers: list[str]
+
+
+# Day index -> room name -> sessions recorded for that room on that day.
+GroupedSlots = dict[int, dict[str, list[Session]]]
+
+
+def content_to_url(s: str) -> str:
+    """Turn ``[[Page title|label]]`` wikilink markup into a Meta-Wiki URL.
+
+    The markup must start with ``[[GLAM``; if it does not, the offending
+    text is printed and the AssertionError propagates to the caller.
+    """
+    try:
+        assert s.startswith("[[GLAM")
+    except AssertionError:
+        print(s)
+        raise
+    # Everything before the first "|" is the link target; drop the "[[".
+    target, _, _ = s.partition("|")
+    title = target[2:].replace(" ", "_")
+    return "https://meta.wikimedia.org/wiki/" + urllib.parse.quote(title)
+
+
+# Conference-level fields.  generate_schedule() writes each (key, value) pair
+# as a child element of <conference>, converting the value with str().
+meta = [
+    ("title", "GLAM Wiki 2023"),
+    ("subtitle", "Galleries, Libraries, Archives, Museums, etc."),
+    ("venue", "University of the Republic of Uruguay"),
+    ("city", "Montevideo, Uruguay"),
+    ("start", "2023-11-16"),
+    ("end", "2023-11-18"),
+    ("days", 3),
+    ("day_change", "08:00"),
+    ("timeslot_duration", "00:15"),
+    ("time_zone_name", "America/Montevideo"),
+]
+
+# Room names in table-column order: read_wikitext() maps the cell in column
+# N (1-based) to rooms[N - 1], and generate_schedule() emits one <room>
+# element per entry for every day.
+rooms = [
+    "Auditorium",
+    "Posgrado 1 (110)",
+    "Posgrado 2 (111)",
+    "401",
+    "410",
+    "411",
+]
+
+
+def minutes_to_duration(mins: int) -> str:
+    """Format a number of minutes as a zero-padded ``HH:MM`` string."""
+    hours, minutes = divmod(mins, 60)
+    return f"{hours:02d}:{minutes:02d}"
+
+
+def build_event(item: Session, room: lxml.etree._Element) -> None:
+    """Append an ``<event>`` element describing ``item`` to ``room``.
+
+    Sessions whose name is not wikilink markup (it does not start with
+    ``[``) or that link into the ``Event:`` namespace are skipped and
+    nothing is added.  Every emitted event takes the current value of the
+    module-level ``event_id`` counter as its ``id`` and then increments it.
+
+    Raises:
+        AssertionError: propagated from content_to_url() when the link
+            target does not start with ``[[GLAM``.
+    """
+    global event_id
+
+    raw_name = item["name"]
+    # startswith() instead of indexing so an empty name is skipped rather
+    # than raising IndexError.
+    if not raw_name.startswith("[") or raw_name.startswith("[[Event:"):
+        return
+
+    # "[[Page|Label]]" -> "Label": text after the first pipe, minus "]]".
+    name: str = raw_name.partition("|")[2][:-2]
+    event_data = [
+        ("start", str(item["start"])),
+        ("duration", minutes_to_duration(item["duration"])),
+        ("room", item["room"]),
+        ("title", name),
+        ("subtitle", ""),
+        ("language", ""),
+        ("description", ""),
+        ("url", content_to_url(raw_name)),
+    ]
+
+    event = lxml.etree.SubElement(room, "event", id=str(event_id))
+    event_id += 1
+
+    for key, value in event_data:
+        lxml.etree.SubElement(event, key).text = value
+
+    # A person's id is the MD5 of the speaker string, so the same string
+    # always yields the same id.
+    persons = lxml.etree.SubElement(event, "persons")
+    for speaker in item["speakers"]:
+        lxml.etree.SubElement(persons, "person", id=md5sum(speaker)).text = speaker
+
+
+def read_wikitext(filename: str) -> GroupedSlots:
+    """Parse the programme wikitext in ``filename`` into grouped sessions.
+
+    The file is read line by line through a small state machine held in
+    ``expect``:
+
+    * ``"h2"``        -- waiting for the ``<h2 ...>`` heading that starts a day;
+    * ``"wikitable"`` -- the next line must open a ``{| class="wikitable"``;
+    * ``"time"``      -- first cell after a ``|-`` row separator: a time, or
+      one of the cells that mark a non-session row;
+    * ``"heading"``   -- inside a non-session row, waiting for the next ``|-``;
+    * ``"session"``   -- reading session cells and their speaker lines.
+
+    Returns:
+        A mapping day index -> room name -> list of sessions, pre-filled
+        with empty lists for day indexes 1-4 and every entry of ``rooms``.
+
+    Raises:
+        AssertionError: when a line does not have the expected shape (the
+            offending line is printed first in most cases).
+    """
+    group_by_day_and_room: GroupedSlots = {
+        index: {room: [] for room in rooms} for index in range(1, 5)
+    }
+
+    expect = "h2"  # state machine
+    start_time = None
+    day_index = 0
+    col: int = 0
+    current: Session | None = None
+    room: str | None = None
+    # Open through a context manager so the handle is closed even when an
+    # assertion fires, and decode explicitly instead of relying on the locale.
+    # NOTE(review): assumes the saved wikitext is UTF-8 -- confirm.
+    with open(filename, encoding="utf-8") as wikitext:
+        for line in wikitext:
+            if expect == "h2" and line.startswith("<h2 "):
+                m = re_day_heading.match(line)
+                assert m
+                day_index += 1
+                expect = "wikitable"
+                continue
+            if expect == "wikitable":
+                assert line.startswith('{| class="wikitable"')
+                expect = "session"
+                continue
+            # A row separator always leads to the row's first (time) cell.
+            if expect in ("session", "heading") and line == "|-\n":
+                expect = "time"
+                continue
+            if expect == "time":
+                # Empty first cell or a 3-row spanning cell: not a session row.
+                if line == "|\n" or line.startswith('| rowspan="3" |'):
+                    expect = "heading"
+                    continue
+                if "Detailed program" in line:
+                    expect = "session"
+                    continue
+                # Header cells ("! ...") also belong to a non-session row.
+                if line.startswith("!"):
+                    expect = "heading"
+                    continue
+                m = re_time.match(line)
+                if not m:
+                    print(repr(line))
+                assert m
+                start_time = time(int(m.group(1)), int(m.group(2)))
+                # The time cell is column 0; session cells start at column 1.
+                col = 1
+                expect = "session"
+                continue
+            # An empty cell still occupies one column.
+            if expect == "session" and line == "|\n":
+                col += 1
+
+            if (
+                expect == "session"
+                and "small" in line
+                and (line.startswith("''") or line.startswith("<small"))
+            ):
+                assert current and isinstance(current["speakers"], list)
+                m = re_speaker.search(line)
+                if not m:
+                    print(line)
+                assert m
+                speaker_name = m.group(1).strip("'")
+                if speaker_name.startswith("("):
+                    # A parenthesised line is appended to the previous speaker.
+                    assert current["speakers"]
+                    current["speakers"][-1] += " " + speaker_name
+                else:
+                    current["speakers"].append(speaker_name)
+                continue
+
+            # NOTE(review): the original condition was written as
+            # ``A and B or C``, which Python groups as below, so a
+            # ``| rowspan="`` line is handled in every state that reaches this
+            # point, not only "session".  The grouping is kept as-is to
+            # preserve the parsed output -- confirm whether that is intended.
+            if (expect == "session" and line.startswith("| colspan")) or (
+                line.startswith('| rowspan="')
+            ):
+                room = rooms[col - 1]
+                if line.startswith('| rowspan="'):
+                    colspan = 1
+                else:
+                    m = re_colspan.match(line)
+                    assert m
+                    colspan = int(m.group(1))
+                col += colspan
+
+                # Cells without content, or styled filler cells, only advance
+                # the column counter.
+                if re.match(r'\| rowspan="\d" \|\n', line):
+                    continue
+
+                if line == '| colspan="1" |\n':
+                    continue
+                if 'style="background-color:#' in line:
+                    continue
+                m = re_session.match(line)
+                if not m:
+                    print(repr(line))
+                assert m
+                content = m.group(2)
+                if not content:
+                    continue
+                # Each table row is treated as a 15-minute slot.
+                duration = 15 * int(m.group(1))
+
+                assert start_time
+                current = {
+                    "name": content,
+                    "start": start_time,
+                    "duration": duration,
+                    "room": room,
+                    "speakers": [],
+                }
+                group_by_day_and_room[day_index][room].append(current)
+
+            # End of the day's table: wait for the next day heading.
+            if expect == "session" and line == "|}\n":
+                expect = "h2"
+                continue
+
+    return group_by_day_and_room
+
+
+def generate_schedule(grouped: GroupedSlots) -> lxml.etree._Element:
+    """Build the ``<schedule>`` XML tree for the parsed sessions.
+
+    The tree holds one ``<conference>`` element filled from ``meta``, then
+    one ``<day>`` per conference day.  Each day contains a ``<room>`` for
+    every entry of ``rooms``, populated through build_event().
+
+    Day dates are derived from the ``start`` and ``days`` entries of
+    ``meta`` so that day 1 falls on the conference start date.  (The
+    previous hard-coded ``16 + index`` produced 17-19 November, one day
+    later than the 16-18 November range declared in ``meta``.)
+
+    Args:
+        grouped: day index -> room name -> sessions, as returned by
+            read_wikitext().
+
+    Returns:
+        The root ``<schedule>`` element.
+    """
+    # Local import: the module only imports ``time`` from datetime.
+    from datetime import date, timedelta
+
+    root = lxml.etree.Element("schedule")
+    conf = lxml.etree.SubElement(root, "conference")
+
+    for key, value in meta:
+        lxml.etree.SubElement(conf, key).text = str(value)
+
+    conf_meta = dict(meta)
+    first_day = date.fromisoformat(str(conf_meta["start"]))
+    day_count = int(conf_meta["days"])
+
+    for index in range(1, day_count + 1):
+        day_date = first_day + timedelta(days=index - 1)
+        day = lxml.etree.SubElement(root, "day")
+        day.set("index", str(index))
+        day.set("date", day_date.isoformat())
+
+        for room_name in rooms:
+            room = lxml.etree.SubElement(day, "room", name=room_name)
+
+            for slot in grouped[index][room_name]:
+                build_event(slot, room)
+
+    return root
+
+
+def main(filename: str) -> None:
+    """Convert the wikitext programme in ``filename`` to schedule XML.
+
+    The XML document (with declaration, pretty-printed, UTF-8) is written
+    to standard output.
+    """
+    grouped = read_wikitext(filename)
+    schedule = generate_schedule(grouped)
+    xml_bytes = lxml.etree.tostring(
+        schedule, xml_declaration=True, encoding="utf-8", pretty_print=True
+    )
+    sys.stdout.write(xml_bytes.decode("utf-8"))
+
+
+if __name__ == "__main__":
+    # The first command-line argument is the path of the wikitext file.
+    main(sys.argv[1])