glam-wiki-schedule/parse.py
2023-11-12 16:15:12 +01:00

275 lines
7.7 KiB
Python
Executable file

#!/usr/bin/python3
import hashlib
import re
import sys
import typing
import urllib.parse
from datetime import time
import lxml.etree
# Next value for the id attribute of an <event> element; build_event() reads
# it and increments it once per emitted event.
event_id = 1
# NOTE(review): not referenced anywhere in the visible code - presumably a
# leftover from a numeric person-id scheme; confirm before removing.
presenter_id = 1
def md5sum(s: str) -> str:
    """Return the hexadecimal MD5 digest of *s* encoded as UTF-8."""
    raw = s.encode("utf-8")
    digest = hashlib.md5(raw)
    return digest.hexdigest()
# Day heading line that precedes each day's table. Only whether the line
# matches is checked by the parser; the captured group is never read.
re_day_heading = re.compile(
    r'<h2 style="border-bottom: 0px;"><b>'
    r'<span style="color: #2873b3;"> .*(.*?) *</span></b></h2>'
)

# First cell of a table row: "| HH:MM" (the space after the pipe is optional).
re_time = re.compile(r"^\| ?(\d{2}):(\d{2})")

# Example of a session page URL on meta.wikimedia.org.
# NOTE(review): not referenced by the visible code.
url = "https://meta.wikimedia.org/wiki/GLAM_Wiki_2023/Program/The_CC_Certificate_for_GLAM:_learn_about_it_by_becoming_part_of_a_human_sculpture_collection"

# Earlier, stricter variant of the session pattern, kept for reference:
# re_session = re.compile(r'\| colspan="\d+" rowspan="\d+" \|\[\[(.*)\|(.*)\]\]')

# Session cell: optional colspan, mandatory rowspan (group 1), then the cell
# content (group 2).
re_session = re.compile(r'\| (?:colspan="\d+" )?rowspan="(\d+)" \|(.*)')

# Number of columns spanned by a cell that starts with a colspan attribute.
re_colspan = re.compile(r'\| colspan="(\d+)')

# Speaker text wrapped in <small>...</small>.
re_speaker = re.compile(r"<small>(.*)</small>")
class Session(typing.TypedDict):
    """One programme entry parsed from the schedule wikitext."""

    # Raw cell content that follows the rowspan/colspan attributes.
    name: str
    # Start time taken from the time cell of the table row.
    start: time
    # Length in minutes (15 minutes per spanned table row).
    duration: int
    # Name of the room the session takes place in.
    room: str
    # Speaker names collected from the <small> lines after the session cell.
    speakers: list[str]
# Sessions grouped first by day index (an int) and then by room name.
GroupedSlots = dict[int, dict[str, list[Session]]]
def content_to_url(s: str) -> str:
    """Return the meta.wikimedia.org URL for a ``[[GLAM...`` wikilink.

    The link target (the text before an optional ``|label``) has its spaces
    replaced by underscores and is percent-encoded before being appended to
    the wiki base URL.

    Args:
        s: Wikilink text such as ``[[GLAM Wiki 2023/Program/Talk|Talk]]``.

    Returns:
        The absolute URL of the linked page.

    Raises:
        AssertionError: *s* does not start with ``[[GLAM``.
    """
    if not s.startswith("[[GLAM"):
        # Raised explicitly rather than through ``assert`` so the check still
        # runs under ``python -O``; the offending text travels in the message.
        raise AssertionError(f"not a GLAM wikilink: {s!r}")

    target = s.partition("|")[0][2:]
    # An unpiped link ("[[Target]]") still carries its closing brackets at
    # this point; without this they would end up in the URL as %5D%5D.
    target = target.removesuffix("]]")

    page_title = urllib.parse.quote(target.replace(" ", "_"))
    return "https://meta.wikimedia.org/wiki/" + page_title
# Conference-level metadata as (element name, value) pairs, in output order.
# Values may be non-strings (``days`` is an int); they are converted with
# str() when written to the XML.
meta = [
    ("title", "GLAM Wiki 2023"),
    ("subtitle", "Galleries, Libraries, Archives, Museums, etc."),
    ("venue", "University of the Republic of Uruguay"),
    ("city", "Montevideo, Uruguay"),
    ("start", "2023-11-16"),
    ("end", "2023-11-18"),
    ("days", 3),
    ("day_change", "08:00"),
    ("timeslot_duration", "00:15"),
    ("time_zone_name", "America/Montevideo"),
]
# Room names in table-column order: the parser maps column N of the schedule
# table to rooms[N - 1].
rooms = [
    "Auditorium",
    "Posgrado 1 (110)",
    "Posgrado 2 (111)",
    "401",
    "410",
    "411",
]
def minutes_to_duration(mins: int) -> str:
    """Format a number of minutes as a zero-padded ``HH:MM`` string."""
    hours, minutes = divmod(mins, 60)
    return f"{hours:02d}:{minutes:02d}"
def build_event(item: Session, room: lxml.etree._Element) -> None:
    """Append an ``<event>`` element for *item* to the *room* element.

    Only sessions whose name is a wikilink are emitted; names that do not
    start with ``[`` (including an empty name) and ``[[Event:`` links are
    skipped. Each emitted event takes the current global ``event_id`` as its
    id, after which the counter is incremented.

    Args:
        item: Parsed session to serialise.
        room: ``<room>`` element that receives the new ``<event>`` child.

    Raises:
        AssertionError: propagated from ``content_to_url`` when the name is
            a link that does not start with ``[[GLAM``.
    """
    global event_id

    link = item["name"]
    # startswith() also covers the empty string, where indexing [0] would
    # raise IndexError.
    if not link.startswith("[") or link.startswith("[[Event:"):
        return None

    # Piped link "[[Target|Label]]": the label (minus the closing "]]") is
    # the title.
    name: str = link.partition("|")[2][:-2]
    if "|" not in link:
        # Unpiped link "[[Target]]": fall back to the target text so the
        # event does not end up with an empty title.
        name = link.removeprefix("[[").removesuffix("]]")

    # Session carries no track or abstract, so those elements are not emitted.
    event_data = [
        ("start", str(item["start"])),
        ("duration", minutes_to_duration(item["duration"])),
        ("room", item["room"]),
        ("title", name),
        ("subtitle", ""),
        ("language", ""),
        ("description", ""),
        ("url", content_to_url(link)),
    ]

    # The id is consumed only once the event data was built successfully.
    event = lxml.etree.SubElement(room, "event", id=str(event_id))
    event_id += 1
    for key, value in event_data:
        lxml.etree.SubElement(event, key).text = value

    persons = lxml.etree.SubElement(event, "persons")
    for speaker in item["speakers"]:
        # Person ids are the MD5 hex digest of the speaker text.
        lxml.etree.SubElement(persons, "person", id=md5sum(speaker)).text = speaker
def read_wikitext(filename: str) -> GroupedSlots:
    """Parse the schedule wikitext in *filename* into sessions per day and room.

    The file is scanned line by line with a small state machine whose state
    (``expect``) is one of:

    * ``"h2"`` - waiting for a day heading (``<h2 ...>``); each heading
      starts the next day index.
    * ``"wikitable"`` - the very next line must open the day's wikitable.
    * ``"time"`` - first cell of a table row: either a ``HH:MM`` start time
      or a header/empty cell.
    * ``"heading"`` - waiting for the next row separator (``|-``).
    * ``"session"`` - reading session cells and their ``<small>`` speaker
      lines until the row (``|-``) or the table (``|}``) ends.

    Args:
        filename: Path of the wikitext file; it is read as UTF-8.

    Returns:
        Mapping ``day index -> room name -> sessions``. Day indexes 1-4 and
        every name in ``rooms`` are pre-populated with empty lists.

    Raises:
        AssertionError: a line does not have the shape the parser expects
            (for some checks the offending line is printed first).
    """
    group_by_day_and_room: GroupedSlots = {
        index: {room_name: [] for room_name in rooms} for index in range(1, 5)
    }

    expect = "h2"  # state machine
    start_time = None
    day_index = 0
    col: int = 0  # 1-based table column of the next session cell
    current: Session | None = None

    # Context manager so the handle is closed; explicit UTF-8 so decoding
    # does not depend on the locale.
    with open(filename, encoding="utf-8") as source:
        for line in source:
            if expect == "h2" and line.startswith("<h2 "):
                m = re_day_heading.match(line)
                assert m
                day_index += 1
                expect = "wikitable"
                continue

            if expect == "wikitable":
                assert line.startswith('{| class="wikitable"')
                expect = "session"
                continue

            # A row separator always leads to the time cell of the next row.
            if expect in ("session", "heading") and line == "|-\n":
                expect = "time"
                continue

            if expect == "time":
                if line == "|\n" or line.startswith('| rowspan="3" |'):
                    expect = "heading"
                    continue
                if "Detailed program" in line:
                    expect = "session"
                    continue
                if line[0] == "!":
                    expect = "heading"
                    continue
                m = re_time.match(line)
                if not m:
                    print(repr(line))
                assert m
                start_time = time(int(m.group(1)), int(m.group(2)))
                col = 1
                expect = "session"
                continue

            # An empty cell just advances the column; no ``continue`` here,
            # the line matches none of the checks below.
            if expect == "session" and line == "|\n":
                col += 1

            if (
                expect == "session"
                and "small" in line
                and (line.startswith("''") or line.startswith("<small"))
            ):
                # Speaker line belonging to the most recent session.
                assert current and isinstance(current["speakers"], list)
                m = re_speaker.search(line)
                if not m:
                    print(line)
                assert m
                speaker_name = m.group(1).strip("'")
                if speaker_name.startswith("("):
                    # A parenthesised line is appended to the previous speaker.
                    assert current["speakers"]
                    current["speakers"][-1] += " " + speaker_name
                else:
                    current["speakers"].append(speaker_name)
                continue

            # NOTE(review): ``and`` binds tighter than ``or``, so a line that
            # starts with '| rowspan="' is handled here in EVERY state, not
            # only in "session". The grouping is spelled out to keep the
            # original behaviour; confirm whether the state check was meant
            # to cover both alternatives.
            if (expect == "session" and line.startswith("| colspan")) or (
                line.startswith('| rowspan="')
            ):
                room = rooms[col - 1]
                if line.startswith('| rowspan="'):
                    colspan = 1
                else:
                    m = re_colspan.match(line)
                    assert m
                    colspan = int(m.group(1))
                col += colspan

                # Cells without session content only advance the column.
                if re.match(r'\| rowspan="\d" \|\n', line):
                    continue
                if line == '| colspan="1" |\n':
                    continue
                if 'style="background-color:#' in line:
                    continue

                m = re_session.match(line)
                if not m:
                    print(repr(line))
                assert m
                content = m.group(2)
                if not content:
                    continue
                # Each spanned table row is counted as 15 minutes.
                duration = 15 * int(m.group(1))
                assert start_time
                current = {
                    "name": content,
                    "start": start_time,
                    "duration": duration,
                    "room": room,
                    "speakers": [],
                }
                group_by_day_and_room[day_index][room].append(current)

            if expect == "session" and line == "|}\n":
                # End of the day's table: wait for the next day heading.
                expect = "h2"
                continue

    return group_by_day_and_room
def generate_schedule(grouped: GroupedSlots) -> lxml.etree._Element:
    """Generate the schedule XML.

    Builds a ``<schedule>`` root holding a ``<conference>`` element (one
    child per ``meta`` entry) followed by one ``<day>`` element per
    conference day. Every day gets a ``<room>`` element for each name in
    ``rooms``; the sessions found in *grouped* for that day and room are
    added to it through ``build_event``.

    Args:
        grouped: Sessions keyed by day index and then by room name.

    Returns:
        The ``<schedule>`` root element.

    Raises:
        KeyError: *grouped* lacks one of the day indexes or room names.
    """
    # Local import: the module itself only imports ``time`` from datetime.
    from datetime import date, timedelta

    root = lxml.etree.Element("schedule")
    conf = lxml.etree.SubElement(root, "conference")
    for key, value in meta:
        lxml.etree.SubElement(conf, key).text = str(value)

    # Day dates are derived from the conference metadata so they cannot drift
    # from it. The previous hard-coded "2023-11-{16 + index}" dated day 1 as
    # 17 November although ``meta`` starts the conference on the 16th.
    conference = dict(meta)
    first_day = date.fromisoformat(str(conference["start"]))
    day_count = int(str(conference["days"]))

    for index in range(1, day_count + 1):
        day_date = first_day + timedelta(days=index - 1)
        day = lxml.etree.SubElement(root, "day")
        day.set("index", str(index))
        day.set("date", day_date.isoformat())
        for room_name in rooms:
            room = lxml.etree.SubElement(day, "room", name=room_name)
            for slot in grouped[index][room_name]:
                build_event(slot, room)

    return root
def main(filename: str) -> None:
    """Convert the schedule wikitext in *filename* to XML printed on stdout."""
    grouped = read_wikitext(filename)
    schedule = generate_schedule(grouped)
    xml_bytes = lxml.etree.tostring(
        schedule,
        xml_declaration=True,
        encoding="utf-8",
        pretty_print=True,
    )
    # The serialised document already ends with a newline, hence end="".
    print(xml_bytes.decode("utf-8"), end="")
if __name__ == "__main__":
    # Exit with a usage message instead of an IndexError traceback when the
    # wikitext filename argument is missing.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} WIKITEXT_FILE")
    main(sys.argv[1])