#!/usr/bin/python3
"""Add a new conference to conferences.yaml.

Fetch a conference web page, ask an LLM to summarise it into the YAML
record format used in ~/src/personal-data/conferences.yaml, then insert
the new record sorted by start date, skipping duplicates.
"""

import configparser
import json
import os
import re
import sys
import typing
from datetime import date, datetime, time, timezone
from urllib.parse import parse_qs, urlparse

import html2text
import openai
import pycountry
import requests
import yaml
from bs4 import BeautifulSoup

user_agent = "add-new-conference/0.1"

# Patterns that pull a "lat,lon" pair out of Google Maps style URLs:
# "@lat,lon", "?q=lat,lon", "?ll=/center=lat,lon", "!3dlat!4dlon",
# and "?destination=lat,lon".  Group 1 is latitude, group 2 longitude.
coordinate_patterns = (
    re.compile(r"@(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    re.compile(r"[?&]q=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    re.compile(r"[?&](?:ll|center)=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    re.compile(r"!3d(-?\d+(?:\.\d+)?)!4d(-?\d+(?:\.\d+)?)"),
    re.compile(r"[?&]destination=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
)


def read_api_key() -> str:
    """Read the OpenAI API key from ~/.config/openai/config."""
    config_path = os.path.expanduser("~/.config/openai/config")
    parser = configparser.ConfigParser()
    parser.read(config_path)
    return parser["openai"]["api_key"]


def build_prompt(
    url: str,
    source_text: str,
    detected_coordinates: tuple[float, float] | None,
) -> str:
    """Build the LLM prompt with embedded YAML few-shot examples.

    :param url: conference page URL, quoted verbatim in the prompt
    :param source_text: readable text extracted from the page
    :param detected_coordinates: optional (lat, lon) found on the page
    :return: complete prompt string
    """
    examples = """
- name: Geomob London
  topic: Maps
  location: London
  country: gb
  start: 2026-01-28 18:00:00+00:00
  end: 2026-01-28 23:00:00+00:00
  url: https://thegeomob.com/post/jan-28th-2026-geomoblon-details
  venue: Geovation Hub
  address: Sutton Yard, 65 Goswell Rd, London EC1V 7EN
  latitude: 51.5242464
  longitude: -0.0997024
  free: true
  going: true
  hashtag: '#geomobLON'

- name: DebConf 25
  topic: Debian
  location: Plouzané (Brest)
  country: fr
  start: 2025-07-07
  end: 2025-07-20
  url: https://wiki.debian.org/DebConf/25
  going: true
  cfp_url: https://debconf25.debconf.org/talks/new/
  venue: École nationale supérieure Mines-Télécom Atlantique Bretagne Pays de la Loire
    campus de Brest
  latitude: 48.35934
  longitude: -4.569889

- name: Wikimedia Hackathon
  topic: Wikimedia
  location: Istanbul
  country: tr
  start: 2025-05-02
  end: 2025-05-04
  venue: Renaissance Polat Istanbul Hotel
  address: Yeşilyurt, Sahil Yolu Cd. No:2, 34149 Bakırköy/İstanbul
  latitude: 40.959946
  longitude: 28.838763
  url: https://www.mediawiki.org/wiki/Wikimedia_Hackathon_2025
  going: true
  free: true
  hackathon: true
  registered: true
"""
    coordinate_note = ""
    if detected_coordinates is not None:
        coordinate_note = (
            "\nDetected venue coordinates from a map link on the page:\n"
            f"latitude: {detected_coordinates[0]}\n"
            f"longitude: {detected_coordinates[1]}\n"
        )

    prompt = f"""
I keep a record of interesting conferences in a YAML file.

Here are some examples of the format I use:

{examples}

Now here is a new conference of interest:

Conference URL: {url}

Return the YAML representation for this conference following the
same style and keys as the examples. Only include keys if the
information is available. Do not invent details.

Important: the `country` field must always be a valid ISO 3166-1 alpha-2
country code (two lowercase letters, e.g. `ca` for Canada, `gb` for United Kingdom).
Do not output full country names.

Important: always include an `end` field. If the event is a single-day event,
the `end` can be the same date as `start`, or a same-day datetime if the page
provides an end time.

Important: if this is a Geomob event, use an `end` datetime of 22:00 local time
on the event date unless the page explicitly provides a different end time.
{coordinate_note}
Wrap your answer in a JSON object with a single key "yaml".
===
{source_text}
"""
    return prompt


def get_from_open_ai(prompt: str, model: str = "gpt-5.4") -> dict[str, str]:
    """Pass prompt to OpenAI and return the parsed JSON reply.

    :raises ValueError: if the model returns no message content
    """
    client = openai.OpenAI(api_key=read_api_key())

    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
        response_format={"type": "json_object"},
    )

    reply = response.choices[0].message.content
    # explicit check instead of assert: asserts vanish under `python -O`
    if not isinstance(reply, str):
        raise ValueError("OpenAI reply has no message content")
    return typing.cast(dict[str, str], json.loads(reply))


def fetch_webpage(url: str) -> BeautifulSoup:
    """Fetch webpage HTML and parse it.

    A timeout is set so a stalled server cannot hang the script forever.
    """
    response = requests.get(url, headers={"User-Agent": user_agent}, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.content, "lxml")


def webpage_to_text(soup: BeautifulSoup) -> str:
    """Convert parsed HTML into readable text content."""
    # work on a copy so the caller's tree is not mutated by decompose()
    soup_copy = BeautifulSoup(str(soup), "lxml")

    for script_or_style in soup_copy(["script", "style"]):
        script_or_style.decompose()

    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.ignore_images = True
    return text_maker.handle(str(soup_copy))


def parse_osm_url(url: str) -> tuple[float, float] | None:
    """Extract latitude/longitude from an OpenStreetMap URL.

    Supports "?mlat=...&mlon=..." marker parameters and the
    "#map=zoom/lat/lon" fragment.  Returns None when the URL carries no
    coordinates or the numbers are malformed.
    """
    parsed = urlparse(url)
    query = parse_qs(parsed.query)

    mlat = query.get("mlat")
    mlon = query.get("mlon")
    try:
        if mlat and mlon:
            return float(mlat[0]), float(mlon[0])

        if parsed.fragment.startswith("map="):
            parts = parsed.fragment.split("/")
            if len(parts) >= 3:
                # fragment is "map=zoom/lat/lon"
                return float(parts[-2]), float(parts[-1])
    except ValueError:
        # malformed numbers: treat as "no coordinates" rather than crash
        return None

    return None


def extract_google_maps_latlon(url: str) -> tuple[float, float] | None:
    """Extract latitude/longitude from a Google Maps URL (or page text)."""
    for pattern in coordinate_patterns:
        match = pattern.search(url)
        if match:
            return float(match.group(1)), float(match.group(2))

    return None


def latlon_from_google_maps_url(
    url: str, timeout: int = 10
) -> tuple[float, float] | None:
    """Resolve a Google Maps URL (following redirects) and extract lat/lon."""
    response = requests.get(
        url,
        allow_redirects=True,
        timeout=timeout,
        # use the module-wide user agent for consistency with fetch_webpage
        headers={"User-Agent": user_agent},
    )
    response.raise_for_status()

    # short links usually redirect to a URL that embeds the coordinates
    coordinates = extract_google_maps_latlon(response.url)
    if coordinates is not None:
        return coordinates

    # otherwise look for coordinates embedded in the page body
    return extract_google_maps_latlon(response.text)


def parse_coordinates_from_url(url: str) -> tuple[float, float] | None:
    """Extract latitude/longitude from a supported map URL."""
    lower_url = url.lower()

    if "openstreetmap.org" in lower_url:
        return parse_osm_url(url)

    if "google." in lower_url or "maps.app.goo.gl" in lower_url:
        coordinates = extract_google_maps_latlon(url)
        if coordinates is not None:
            return coordinates

        try:
            return latlon_from_google_maps_url(url)
        except requests.RequestException:
            # network failure on a map link is non-fatal
            return None

    return None


def detect_page_coordinates(soup: BeautifulSoup) -> tuple[float, float] | None:
    """Detect venue coordinates from the first Google Maps or OSM link."""
    for link in soup.find_all("a", href=True):
        href = str(link["href"]).strip()
        if not href:
            continue

        coordinates = parse_coordinates_from_url(href)
        if coordinates is not None:
            return coordinates

    return None


def parse_date(date_str: str) -> datetime:
    """Parse an ISO date or datetime into a naive datetime (UTC if tz-aware)."""
    try:
        dt = datetime.fromisoformat(date_str)
    except ValueError:
        # fallback: just take the leading YYYY-MM-DD part (works for both
        # "T"- and space-separated strings with unparseable time suffixes)
        dt = datetime.fromisoformat(date_str[:10])

    if dt.tzinfo is not None:
        # normalise tz-aware datetimes to UTC, then strip tzinfo so that
        # naive and aware values can be compared when sorting
        dt = dt.astimezone(timezone.utc).replace(tzinfo=None)

    return dt


def url_has_year_component(url: str) -> bool:
    """Return True if the URL contains any digit (assume year-specific)."""
    return any(ch.isdigit() for ch in url)


def insert_sorted(
    conferences: list[dict[str, typing.Any]], new_conf: dict[str, typing.Any]
) -> list[dict[str, typing.Any]]:
    """Insert new_conf sorted by start date, skipping duplicate URLs.

    A URL containing digits is treated as year-specific, so any exact match
    is a duplicate.  A digit-free URL may legitimately be reused across
    years, so it only counts as a duplicate within the same year.
    Mutates and returns *conferences*.
    """
    new_url = new_conf.get("url")
    new_start = parse_date(str(new_conf["start"]))
    new_year = new_start.year

    if new_url:
        for conf in conferences:
            if conf.get("url") == new_url:
                existing_start = parse_date(str(conf["start"]))
                existing_year = existing_start.year

                if url_has_year_component(new_url):
                    # URL has a year in it: exact URL match means duplicate
                    print(f"⚠️ Conference with URL {new_url} already exists, skipping.")
                    return conferences
                elif existing_year == new_year:
                    # same URL, same year → definitely duplicate
                    print(
                        f"⚠️ Conference already exists in YAML "
                        f"(url={new_url}, year={existing_year}), skipping."
                    )
                    return conferences
                else:
                    # same URL reused for a different year → allow new entry
                    continue

    # insert sorted by start date
    for idx, conf in enumerate(conferences):
        existing_start = parse_date(str(conf["start"]))
        if new_start < existing_start:
            conferences.insert(idx, new_conf)
            return conferences
    conferences.append(new_conf)
    return conferences


def validate_country(conf: dict[str, typing.Any]) -> None:
    """Ensure country is a valid ISO 3166-1 alpha-2 code, normalising in place.

    :raises ValueError: when the value is neither a valid alpha-2 code nor a
        recognisable country name
    """
    country = conf.get("country")
    if not country:
        return

    country = country.strip()
    # already a 2-letter code?
    if len(country) == 2:
        if pycountry.countries.get(alpha_2=country.upper()):
            conf["country"] = country.lower()
            return
        else:
            raise ValueError(f"❌ Invalid ISO 3166-1 code '{country}'")

    # try lookup by name
    match = pycountry.countries.get(name=country)
    if not match:
        # fuzzy lookup (handles “United States” vs “United States of America”)
        try:
            match = pycountry.countries.search_fuzzy(country)[0]
        except LookupError:
            raise ValueError(
                f"❌ Country '{country}' not recognised as ISO 3166-1"
            ) from None

    conf["country"] = match.alpha_2.lower()


def parse_yaml_datetime(value: typing.Any) -> datetime | None:
    """Convert YAML date/datetime values to a datetime (None if unparseable)."""
    if isinstance(value, datetime):
        return value

    # note: datetime is a subclass of date, so this check must come second
    if isinstance(value, date):
        return datetime.combine(value, time())

    if isinstance(value, str):
        try:
            return datetime.fromisoformat(value)
        except ValueError:
            return datetime.combine(date.fromisoformat(value[:10]), time())

    return None


def same_type_as_start(
    start_value: typing.Any,
    new_dt: datetime,
    keep_timezone: bool = True,
    prefer_datetime: bool = False,
) -> typing.Any:
    """Return an end value shaped like the start value when possible.

    Keeps date-only starts as dates (unless *prefer_datetime*), datetime
    starts as datetimes, and string starts as equivalently formatted strings.
    """
    if isinstance(start_value, datetime):
        if keep_timezone:
            return new_dt
        return new_dt.replace(tzinfo=None)

    if isinstance(start_value, date):
        if prefer_datetime:
            return new_dt
        return new_dt.date()

    if isinstance(start_value, str):
        if prefer_datetime or " " in start_value or "T" in start_value:
            return new_dt.isoformat(sep=" ")
        return new_dt.date().isoformat()

    return new_dt


def maybe_extract_explicit_end_time(source_text: str) -> int | None:
    """Extract an explicit 12-hour clock end time for Geomob-style pages.

    Only 22:00 and 23:00 are recognised; returns the hour or None.
    """
    lowered = source_text.lower()

    if "10pm" in lowered or "10 pm" in lowered or "22:00" in lowered:
        return 22

    if "11pm" in lowered or "11 pm" in lowered or "23:00" in lowered:
        return 23

    return None


def normalise_end_field(new_conf: dict[str, typing.Any], source_text: str) -> None:
    """Ensure an end value exists, with a Geomob-specific fallback.

    Geomob events always get an end datetime (22:00 unless the page says
    otherwise); other events default to an end equal to the start.
    Mutates *new_conf* in place.
    """
    start_value = new_conf.get("start")
    if start_value is None:
        return

    start_dt = parse_yaml_datetime(start_value)
    if start_dt is None:
        return

    name = str(new_conf.get("name", ""))
    url = str(new_conf.get("url", ""))
    is_geomob = "geomob" in name.lower() or "thegeomob.com" in url.lower()

    if is_geomob:
        end_hour = maybe_extract_explicit_end_time(source_text)
        if end_hour is None:
            end_hour = 22  # Geomob default end time

        geomob_end = start_dt.replace(hour=end_hour, minute=0, second=0, microsecond=0)
        new_conf["end"] = same_type_as_start(
            start_value, geomob_end, prefer_datetime=True
        )
        return

    if "end" not in new_conf:
        new_conf["end"] = same_type_as_start(start_value, start_dt)


def main() -> None:
    """Fetch page, generate YAML via LLM, update conferences.yaml."""
    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} URL")
    url = sys.argv[1]
    yaml_path = os.path.expanduser("~/src/personal-data/conferences.yaml")

    # load conferences first; an empty file parses as None
    with open(yaml_path) as f:
        conferences = yaml.safe_load(f) or []

    # early exit: if URL contains a year and already exists, skip the API call
    if url_has_year_component(url):
        for conf in conferences:
            if conf.get("url") == url:
                print(
                    "⚠️ Conference already exists in YAML "
                    + f"(url={url}), skipping before API call."
                )
                return

    # otherwise proceed with the full workflow
    soup = fetch_webpage(url)
    source_text = webpage_to_text(soup)
    detected_coordinates = detect_page_coordinates(soup)
    prompt = build_prompt(url, source_text, detected_coordinates)
    new_yaml_text = get_from_open_ai(prompt)["yaml"]

    new_conf = yaml.safe_load(new_yaml_text)
    if isinstance(new_conf, list):
        new_conf = new_conf[0]
    if not isinstance(new_conf, dict):
        sys.exit("❌ LLM reply did not parse to a YAML mapping")

    validate_country(new_conf)
    normalise_end_field(new_conf, source_text)

    # coordinates scraped from the page are more trustworthy than the LLM's
    if detected_coordinates is not None:
        new_conf["latitude"] = detected_coordinates[0]
        new_conf["longitude"] = detected_coordinates[1]

    updated = insert_sorted(conferences, new_conf)

    with open(yaml_path, "w") as f:
        text = yaml.dump(updated, sort_keys=False, allow_unicode=True)
        text = text.replace("\n- name:", "\n\n- name:")  # keep blank lines
        f.write(text.lstrip())


if __name__ == "__main__":
    main()