From e48acab387f994594826b3e6e2e5a1df354863f7 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 27 Sep 2023 17:35:16 +0100 Subject: [PATCH] Code for importing data from pretalx --- confarchive/model.py | 18 +- confarchive/templates/import/event.html | 83 ++++++++ confarchive/templates/import/start.html | 40 ++++ confarchive/templates/person.html | 2 +- confarchive/view.py | 254 +++++++++++++++++++++++- 5 files changed, 392 insertions(+), 5 deletions(-) create mode 100644 confarchive/templates/import/event.html create mode 100644 confarchive/templates/import/start.html diff --git a/confarchive/model.py b/confarchive/model.py index 70394cf..720037e 100644 --- a/confarchive/model.py +++ b/confarchive/model.py @@ -19,6 +19,12 @@ from .database import session Base: sqlalchemy.orm.decl_api.DeclarativeMeta = declarative_base() Base.query = session.query_property() +content_type_to_extension = { + "image/jpeg": "jpg", + "image/png": "png", + "image/gif": "gif", +} + class TimeStampedModel(Base): """Time stamped model.""" @@ -140,6 +146,7 @@ class ConferencePerson(Base): url = Column(String) affiliation = Column(String) photo_url = Column(String) + photo_url_content_type = Column(String) person = relationship("Person", back_populates="conferences_association") conference = relationship("Conference", back_populates="people_detail") @@ -181,7 +188,6 @@ class Event(TimeStampedModel): people_detail = relationship( "EventPerson", order_by="EventPerson.position", - lazy="dynamic", back_populates="event", collection_class=ordering_list("position"), ) @@ -273,15 +279,21 @@ class Person(TimeStampedModel): return typing.cast(ConferencePerson, best) def photo_filename(self) -> str | None: + """Speaker photo filename.""" if self.wikidata_photo: + assert isinstance(self.wikidata_photo[0], str) return os.path.join("wikidata_photo", "thumb", self.wikidata_photo[0]) - q = self.conferences_association.filter(ConferencePerson.photo_url.isnot(None)) + q = 
self.conferences_association.filter( + ConferencePerson.photo_url.isnot(None), + ConferencePerson.photo_url_content_type.isnot(None), + ) if q.count() == 0: return None best = max(q, key=lambda cp: cp.conference.start) - ext = best.photo_url.rpartition(".")[-1] + ext = content_type_to_extension[best.photo_url_content_type] + filename = f"{best.conference_id}_{self.id}.{ext}" return os.path.join("conference_photo", filename) diff --git a/confarchive/templates/import/event.html b/confarchive/templates/import/event.html new file mode 100644 index 0000000..e251d70 --- /dev/null +++ b/confarchive/templates/import/event.html @@ -0,0 +1,83 @@ +{% extends "base.html" %} + +{% block title %}Import – Conference archive{% endblock %} + +{% block content %} +

Import

+ +
+ +
+ +

event

+
+{{ event.slug }} – +{{ prefer_en_label(event.name) }} +({{ event.date_from }} to {{ event.date_to }}) +
+ + +

rooms

+

room count: {{ rooms.count }}

+ + +

speakers

+

speaker count: {{ speakers | count }}

+{% for speaker in speakers %} +
+

{{ speaker.name }}

+ {% if speaker.avatar %} +
+ {% endif %} + {% if speaker.biography %} +
+ {% for paragraph in speaker.biography.splitlines() %} +

{{ paragraph }}

+ {% endfor %} +
+ {% else %} +

No biography

+ {% endif %} +

{{ plural(person_candidates[speaker.code].count(), "candidate") }} found

+ {% for candidate in person_candidates[speaker.code] %} +
{{ candidate.name }}
+ {% endfor %} +
+ {#
{{ speaker | pprint }}
#} +{% endfor %} + +

talks

+

talk count: {{ talks.count }}

+{% for talk in talks["results"] %} +
+

{{ talk.title }}

+ {% if talk.speakers %} +
speakers: + {% for speaker in talk.speakers %} + {{ speaker.name }} ({{ speaker.code }}) + {% endfor %} +
+ {% else %} +
no speakers
+ {% endif %} +
start: {{ talk.slot.start }}
+
duration: {{ talk.duration }}
+
room: {{ prefer_en_label(talk.slot.room) }}
+
track: {{ prefer_en_label(talk.track) }}
+

abstract

+
{{ talk.abstract }}
+

description

+
{{ talk.description }}
+
+ {#
{{ talk | pprint }}
#} +{% endfor %} + +
+ +
+ +{% endblock %} diff --git a/confarchive/templates/import/start.html b/confarchive/templates/import/start.html new file mode 100644 index 0000000..38135fc --- /dev/null +++ b/confarchive/templates/import/start.html @@ -0,0 +1,40 @@ +{% extends "base.html" %} + +{% block title %}Import – Conference archive{% endblock %} + +{% block content %} +

Import

+ +
+
+ + +
+ +
+ +{% if events %} +
+ {% for event in events %} +
+

+ {% set href = url_for("import_event", hostname=hostname, slug=event.slug) %} + + {{ prefer_en_label(event.name) or "[ name missing ]" }} + +

+
+ {{event.slug}} ({{ event.date_from }} to {{ event.date_to }}) +
+ {% if event.existing %} + {% set conf = event.existing %} +
+   Already loaded: + {{ conf.title }} +
+ {% endif %} + {% endfor %} +
+{% endif %} + +{% endblock %} diff --git a/confarchive/templates/person.html b/confarchive/templates/person.html index 520b496..a26c1ff 100644 --- a/confarchive/templates/person.html +++ b/confarchive/templates/person.html @@ -154,7 +154,7 @@ {% endif %} {% endif %} - {% if event.people_detail.count() > 1 %} + {% if event.people_detail | count > 1 %}
Other people: {% for p in event.people %} diff --git a/confarchive/view.py b/confarchive/view.py index ea8cdc4..faeac65 100644 --- a/confarchive/view.py +++ b/confarchive/view.py @@ -1,12 +1,17 @@ """Flask views.""" +import json import os +import re +import typing +from datetime import datetime, timedelta import flask +import requests from sqlalchemy import func, or_, update from werkzeug.wrappers import Response -from confarchive import database, model, wikidata, query, utils +from confarchive import database, model, query, utils, wikidata app = flask.Flask(__name__) app.debug = True @@ -14,6 +19,8 @@ app.debug = True app.config.from_object("config.default") database.init_app(app) +user_agent = "conference-archive/0.1 (contact: edward@4angle.com)" + @app.route("/person/", methods=["GET", "POST"]) def person(person_id: int) -> str | Response: @@ -425,6 +432,251 @@ def github_wikidata() -> str: return flask.render_template("github.html", items=items) +@app.route("/import") +def import_start() -> str | Response: + """Begin import.""" + assert check_admin_mode() + url = flask.request.args.get("url") + if not url: + return flask.render_template("import/start.html") + + m = re.match("https?://([^/]+)/", url) + assert m + hostname = m.group(1) + + return flask.redirect(flask.url_for("import_hostname", hostname=hostname)) + + +def pretalx_api(hostname: str, path: str) -> typing.Any: + """Call pretalx API and cache results.""" + import_dir = os.path.join(app.config["DATA_DIR"], "import") + api_url = f"https://{hostname}/api/{path}" + + cache_start = hostname + "_" + path.strip("/").replace("/", "_") + existing = [ + f for f in os.listdir(import_dir) if f.startswith(cache_start + "_2023") + ] + if existing: + recent_filename = max(existing) + recent = datetime.strptime( + recent_filename, cache_start + "_%Y-%m-%d_%H:%M.json" + ) + delta = flask.g.now - recent + if not existing or delta > timedelta(hours=1): + filename = os.path.join(import_dir, 
f"{cache_start}_{flask.g.now_str}.json") + r = requests.get(api_url, params={"limit": 500}) + with open(filename, "w") as out: + out.write(r.text) + events = r.json() + else: + events = json.load(open(os.path.join(import_dir, recent_filename))) + + return typing.cast(typing.Any, events) + + +def set_now() -> None: + """Record current datetime in flask globals.""" + flask.g.now = datetime.utcnow() + flask.g.now_str = flask.g.now.strftime("%Y-%m-%d_%H:%M") + + +@app.route("/import/") +def import_hostname(hostname: str) -> str: + """Import from hostname.""" + assert check_admin_mode() + set_now() + events = pretalx_api(hostname, "events/") + + slugs = [event["slug"] for event in events] + titles = [prefer_en_label(event["name"]) for event in events] + slug_lookup = {event["slug"]: event for event in events} + name_lookup = {prefer_en_label(event["name"]): event for event in events} + + print(slugs) + + q = model.Conference.query.filter( + or_(model.Conference.short_name.in_(slugs), model.Conference.title.in_(titles)) + ) + print(q.count()) + + for conf in q: + slug = conf.short_name + if slug in slug_lookup: + slug_lookup[slug]["existing"] = conf + if conf.title in name_lookup: + name_lookup[conf.title]["existing"] = conf + + return flask.render_template( + "import/start.html", + events=events, + hostname=hostname, + prefer_en_label=prefer_en_label, + ) + + +def run_import( + event: dict[str, typing.Any], + speakers: list[dict[str, typing.Any]], + talks: list[dict[str, typing.Any]], + speaker_lookup: dict[str, typing.Any], +) -> model.Conference: + if True: + conf = model.Conference( + short_name=event["slug"], + title=prefer_en_label(event["name"]), + start=event["date_from"], + end=event["date_to"], + ) + database.session.add(conf) + + code_to_speaker = {} + for speaker in speakers: + code, name, photo_url = speaker["code"], speaker["name"], speaker["avatar"] + if code == "9NX3NE": + continue + + if speaker_lookup[code].count(): + person = speaker_lookup[code].one() 
+ else: + person = model.Person(name=name) + database.session.add(person) + code_to_speaker[code] = person + + cp = model.ConferencePerson( + person=person, + conference=conf, + named_as=name, + photo_url=photo_url or None, + ) + database.session.add(cp) + + for talk in talks: + if not talk["speakers"]: + continue + + for s in talk["speakers"]: + if s["code"] == "9NX3NE": + s["code"] = "3BRWWP" + + start_time = talk["slot"]["start"][11:16] + assert re.match(r"\d\d:\d\d", start_time) + event = model.Event( + title=talk["title"], + conference=conf, + event_date=talk["slot"]["start"], + duration=talk["duration"], + room=prefer_en_label(talk["slot"]["room"]), + track=prefer_en_label(talk["track"]), + abstract=talk["abstract"], + description=talk["description"], + start=start_time, + people=[code_to_speaker[s["code"]] for s in talk["speakers"]], + ) + database.session.add(event) + + database.session.commit() + + if False: + conf = model.Conference.query.filter_by(short_name="osfc2018").one() + + q = model.ConferencePerson.query.filter( + model.ConferencePerson.conference == conf, + model.ConferencePerson.photo_url.isnot(None), + ) + + photo_dir = "confarchive/static/conference_photo" + web_session = requests.Session() + web_session.headers.update({"User-Agent": user_agent}) + + for cp in q: + if not cp.photo_url: + continue + print(cp.photo_url) + r = web_session.get(cp.photo_url) + content_type = r.headers.get("Content-Type") + print(content_type) + assert content_type in model.content_type_to_extension + image_ext = model.content_type_to_extension[content_type] + photo_filename = f"{conf.id}_{cp.person_id}.{image_ext}" + print((cp.person_id, cp.named_as, content_type, image_ext, photo_filename)) + full_photo = os.path.join(photo_dir, photo_filename) + + with open(full_photo, "wb") as out: + out.write(r.content) + cp.photo_url_content_type = content_type + + database.session.commit() + + return conf + + +def prefer_en_label(labels: dict[str, str] | None) -> str | None: 
+ if labels is None: + return None + if "en" in labels: + return labels["en"] + if len(labels) == 1: + return list(labels.values())[0] + else: + return " / ".join(f"{lang}: {label}" for lang, label in labels.items()) + + +def find_matching_name(name: str): + """People with a matching name.""" + name_parts = name.split() + q1 = model.Person.query.filter(model.Person.name == name) + if q1.count(): + return q1 + name_pattern = "%" + "%".join(name_parts) + "%" + q2 = model.Person.query.filter(model.Person.name.ilike(name_pattern)) + if len(name_parts) == 1 and q2.count() > 1: + return q1 + return q2 + + +@app.route("/import//", methods=["GET", "POST"]) +def import_event(hostname: str, slug: str) -> str | Response: + """Import event.""" + set_now() + + event = pretalx_api(hostname, f"events/{slug}") + rooms = pretalx_api(hostname, f"events/{slug}/rooms") + speakers = pretalx_api(hostname, f"events/{slug}/speakers") + talks = pretalx_api(hostname, f"events/{slug}/talks") + + print(len(speakers["results"]), speakers["count"]) + print((speakers["next"], speakers["previous"])) + assert len(speakers["results"]) == speakers["count"] + assert len(talks["results"]) == talks["count"] + + all_talk_speakers: set[str] = set() + for talk in talks["results"]: + all_talk_speakers.update(speaker["code"] for speaker in talk["speakers"]) + + person_candidates = { + speaker["code"]: find_matching_name(speaker["name"]) + for speaker in speakers["results"] + } + + if flask.request.method == "GET": + return flask.render_template( + "import/event.html", + hostname=hostname, + slug=slug, + event=event, + rooms=rooms, + speakers=[s for s in speakers["results"] if s["code"] in all_talk_speakers], + talks=talks, + person_candidates=person_candidates, + plural=utils.plural, + prefer_en_label=prefer_en_label, + ) + + conf = run_import(event, speakers["results"], talks["results"], person_candidates) + + return flask.redirect(flask.url_for("conference_page", short_name=conf.short_name)) + + 
@app.route("/reports") def reports_page() -> str: """Page showing statistics."""