agenda/scripts/add-new-conference

463 lines
14 KiB
Python
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/python3
import configparser
import json
import os
import re
import sys
import typing
from datetime import date, datetime, time, timezone
from urllib.parse import parse_qs, urlparse
import html2text
import openai
import pycountry
import requests
import yaml
from bs4 import BeautifulSoup
# User-Agent header sent with this script's outgoing HTTP requests.
user_agent = "add-new-conference/0.1"

# Regexes that extract a "latitude,longitude" pair from map URLs.
# Each pattern captures latitude in group 1 and longitude in group 2.
coordinate_patterns = (
    # Google Maps "@lat,lon" path segment
    re.compile(r"@(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    # "?q=lat,lon" query parameter
    re.compile(r"[?&]q=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    # "?ll=lat,lon" or "?center=lat,lon" query parameters
    re.compile(r"[?&](?:ll|center)=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    # "!3d<lat>!4d<lon>" data segments in expanded Google Maps URLs
    re.compile(r"!3d(-?\d+(?:\.\d+)?)!4d(-?\d+(?:\.\d+)?)"),
    # "?destination=lat,lon" in directions links
    re.compile(r"[?&]destination=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
)
def read_api_key() -> str:
    """Return the OpenAI API key stored in ~/.config/openai/config.

    Expects an INI file with an ``[openai]`` section containing an
    ``api_key`` entry; raises KeyError if either is missing.
    """
    parser = configparser.ConfigParser()
    parser.read(os.path.expanduser("~/.config/openai/config"))
    return parser["openai"]["api_key"]
def build_prompt(
    url: str,
    source_text: str,
    detected_coordinates: tuple[float, float] | None,
) -> str:
    """Build prompt with embedded YAML examples.

    :param url: the conference page URL being processed
    :param source_text: plain-text rendering of the conference web page
    :param detected_coordinates: optional (latitude, longitude) found on
        the page via map links, passed along as a hint to the model
    :return: the full prompt string, asking for a JSON object with a
        single "yaml" key
    """
    # Few-shot examples demonstrating the exact YAML keys and formatting
    # the model should reproduce for the new conference.
    examples = """
- name: Geomob London
  topic: Maps
  location: London
  country: gb
  start: 2026-01-28 18:00:00+00:00
  end: 2026-01-28 23:00:00+00:00
  url: https://thegeomob.com/post/jan-28th-2026-geomoblon-details
  venue: Geovation Hub
  address: Sutton Yard, 65 Goswell Rd, London EC1V 7EN
  latitude: 51.5242464
  longitude: -0.0997024
  free: true
  going: true
  hashtag: '#geomobLON'
- name: DebConf 25
  topic: Debian
  location: Plouzané (Breast)
  country: fr
  start: 2025-07-07
  end: 2025-07-20
  url: https://wiki.debian.org/DebConf/25
  going: true
  cfp_url: https://debconf25.debconf.org/talks/new/
  venue: École nationale supérieure Mines-Télécom Atlantique Bretagne Pays de la Loire
    campus de Brest
  latitude: 48.35934
  longitude: -4.569889
- name: Wikimedia Hackathon
  topic: Wikimedia
  location: Istanbul
  country: tr
  start: 2025-05-02
  end: 2025-05-04
  venue: Renaissance Polat Istanbul Hotel
  address: Yeşilyurt, Sahil Yolu Cd. No:2, 34149 Bakırköy/İstanbul
  latitude: 40.959946
  longitude: 28.838763
  url: https://www.mediawiki.org/wiki/Wikimedia_Hackathon_2025
  going: true
  free: true
  hackathon: true
  registered: true
"""
    # Only mention coordinates when a map link on the page yielded them;
    # otherwise the model is told nothing and must not invent any.
    coordinate_note = ""
    if detected_coordinates is not None:
        coordinate_note = (
            "\nDetected venue coordinates from a map link on the page:\n"
            f"latitude: {detected_coordinates[0]}\n"
            f"longitude: {detected_coordinates[1]}\n"
        )
    prompt = f"""
I keep a record of interesting conferences in a YAML file.
Here are some examples of the format I use:
{examples}
Now here is a new conference of interest:
Conference URL: {url}
Return the YAML representation for this conference following the
same style and keys as the examples. Only include keys if the
information is available. Do not invent details.
Important: the `country` field must always be a valid ISO 3166-1 alpha-2
country code (two lowercase letters, e.g. `ca` for Canada, `gb` for United Kingdom).
Do not output full country names.
Important: always include an `end` field. If the event is a single-day event,
the `end` can be the same date as `start`, or a same-day datetime if the page
provides an end time.
Important: if this is a Geomob event, use an `end` datetime of 22:00 local time
on the event date unless the page explicitly provides a different end time.
{coordinate_note}
Wrap your answer in a JSON object with a single key "yaml".
===
{source_text}
"""
    return prompt
def get_from_open_ai(prompt: str, model: str = "gpt-5.4") -> dict[str, str]:
    """Send the prompt to OpenAI and return the parsed JSON reply.

    Uses response_format=json_object so the model is constrained to
    return a JSON document, which is then decoded into a dict.
    """
    client = openai.OpenAI(api_key=read_api_key())
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
    )
    content = completion.choices[0].message.content
    assert isinstance(content, str)
    return typing.cast(dict[str, str], json.loads(content))
def fetch_webpage(url: str, timeout: int = 10) -> BeautifulSoup:
    """Fetch webpage HTML and parse it.

    :param url: address of the page to download
    :param timeout: seconds to wait for the HTTP response; without a
        timeout, requests.get() can block indefinitely on a stalled
        server (matches the default used by latlon_from_google_maps_url)
    :return: the parsed document as a BeautifulSoup tree (lxml parser)
    :raises requests.HTTPError: on a non-2xx response
    :raises requests.Timeout: if the server does not respond in time
    """
    response = requests.get(
        url, headers={"User-Agent": user_agent}, timeout=timeout
    )
    response.raise_for_status()
    return BeautifulSoup(response.content, "lxml")
def webpage_to_text(soup: BeautifulSoup) -> str:
    """Render a parsed HTML tree as readable plain text.

    Script and style elements are stripped first, and links/images are
    dropped so the LLM sees only the human-visible content.
    """
    # Re-parse a serialized copy so the caller's tree is left untouched.
    cleaned = BeautifulSoup(str(soup), "lxml")
    for element in cleaned(["script", "style"]):
        element.decompose()
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True
    return converter.handle(str(cleaned))
def parse_osm_url(url: str) -> tuple[float, float] | None:
"""Extract latitude/longitude from an OpenStreetMap URL."""
parsed = urlparse(url)
query = parse_qs(parsed.query)
mlat = query.get("mlat")
mlon = query.get("mlon")
if mlat and mlon:
return float(mlat[0]), float(mlon[0])
if parsed.fragment.startswith("map="):
parts = parsed.fragment.split("/")
if len(parts) >= 3:
return float(parts[-2]), float(parts[-1])
return None
def extract_google_maps_latlon(url: str) -> tuple[float, float] | None:
    """Extract (latitude, longitude) from a Google Maps URL or page text.

    Tries each known coordinate pattern in turn; the first match wins.
    Returns None when no pattern matches.
    """
    for pattern in coordinate_patterns:
        if (found := pattern.search(url)) is not None:
            return float(found.group(1)), float(found.group(2))
    return None
def latlon_from_google_maps_url(
    url: str, timeout: int = 10
) -> tuple[float, float] | None:
    """Resolve a Google Maps URL (following redirects) and extract lat/lon.

    Short links (e.g. maps.app.goo.gl) redirect to a full URL that
    usually embeds the coordinates; if the final URL lacks them, the
    response body is scanned as a fallback.

    :param url: the Google Maps (possibly short) URL
    :param timeout: seconds to wait for the HTTP response
    :return: (latitude, longitude), or None if nothing could be extracted
    :raises requests.HTTPError: on a non-2xx response
    """
    response = requests.get(
        url,
        allow_redirects=True,
        timeout=timeout,
        # Use the module-wide User-Agent constant instead of the stale
        # hard-coded "lookup.py/1.0" copied from another script.
        headers={"User-Agent": user_agent},
    )
    response.raise_for_status()
    coordinates = extract_google_maps_latlon(response.url)
    if coordinates is not None:
        return coordinates
    # Coordinates not in the final URL — scan the page body instead.
    return extract_google_maps_latlon(response.text)
def parse_coordinates_from_url(url: str) -> tuple[float, float] | None:
    """Extract (latitude, longitude) from a supported map URL.

    Handles OpenStreetMap links directly, Google Maps links via pattern
    matching, and short Google links by resolving redirects over the
    network. Returns None for unsupported hosts or on network failure.
    """
    normalised = url.lower()
    if "openstreetmap.org" in normalised:
        return parse_osm_url(url)
    if "google." not in normalised and "maps.app.goo.gl" not in normalised:
        return None
    found = extract_google_maps_latlon(url)
    if found is not None:
        return found
    # Short/opaque link: resolve it over the network, best-effort.
    try:
        return latlon_from_google_maps_url(url)
    except requests.RequestException:
        return None
def detect_page_coordinates(soup: BeautifulSoup) -> tuple[float, float] | None:
    """Scan a page's anchors for a map link that yields venue coordinates.

    Returns the first (latitude, longitude) pair extracted from a Google
    Maps or OpenStreetMap href, or None if no anchor produces one.
    """
    for anchor in soup.find_all("a", href=True):
        target = str(anchor["href"]).strip()
        if not target:
            continue
        found = parse_coordinates_from_url(target)
        if found is not None:
            return found
    return None
def parse_date(date_str: str) -> datetime:
    """Parse an ISO date or datetime string into a naive datetime.

    Timezone-aware values are converted to UTC and stripped of tzinfo so
    that all results compare cleanly. If full parsing fails, only the
    YYYY-MM-DD prefix is used.
    """
    try:
        parsed = datetime.fromisoformat(date_str)
    except ValueError:
        # Unparseable tail — fall back to the date portion only.
        parsed = datetime.fromisoformat(date_str.split("T")[0])
    if parsed.tzinfo is None:
        return parsed
    # Normalise aware datetimes to UTC, then drop the tzinfo.
    return parsed.astimezone(timezone.utc).replace(tzinfo=None)
def url_has_year_component(url: str) -> bool:
    """Return True if the URL contains any digit (assume year-specific)."""
    return bool(re.search(r"\d", url))
def insert_sorted(
    conferences: list[dict[str, typing.Any]], new_conf: dict[str, typing.Any]
) -> list[dict[str, typing.Any]]:
    """Insert new_conf into conferences sorted by start date, skip if duplicate URL (with year awareness)."""
    new_url = new_conf.get("url")
    new_start = parse_date(str(new_conf["start"]))
    if new_url:
        # Duplicate detection: same URL may mean the same event, unless
        # the URL is year-agnostic and reused across editions.
        for existing in conferences:
            if existing.get("url") != new_url:
                continue
            existing_start = parse_date(str(existing["start"]))
            if url_has_year_component(new_url):
                # A year-bearing URL identifies one specific edition.
                print(f"⚠️ Conference with URL {new_url} already exists, skipping.")
                return conferences
            if existing_start.year == new_start.year:
                # Same URL, same year → definitely a duplicate.
                print(
                    f"⚠️ Conference already exists in YAML "
                    f"(url={new_url}, year={existing_start.year}), skipping."
                )
                return conferences
            # Same URL reused for a different year → allow the new entry.
    # Insert before the first entry that starts later, keeping the list
    # ordered by start date.
    for position, existing in enumerate(conferences):
        if new_start < parse_date(str(existing["start"])):
            conferences.insert(position, new_conf)
            return conferences
    conferences.append(new_conf)
    return conferences
def validate_country(conf: dict[str, typing.Any]) -> None:
    """Normalise conf["country"] to a lowercase ISO 3166-1 alpha-2 code.

    Accepts an existing alpha-2 code (any case) or a country name, which
    is resolved via exact and then fuzzy pycountry lookup. Does nothing
    when the key is missing or empty; raises ValueError when the value
    cannot be recognised.
    """
    raw = conf.get("country")
    if not raw:
        return
    raw = raw.strip()
    if len(raw) == 2:
        # Looks like an alpha-2 code already; verify and lowercase it.
        if not pycountry.countries.get(alpha_2=raw.upper()):
            raise ValueError(f"❌ Invalid ISO 3166-1 code '{raw}'")
        conf["country"] = raw.lower()
        return
    # Full country name: exact lookup first, then fuzzy matching
    # (handles “United States” vs “United States of America”).
    found = pycountry.countries.get(name=raw)
    if not found:
        try:
            found = pycountry.countries.search_fuzzy(raw)[0]
        except LookupError:
            raise ValueError(f"❌ Country '{raw}' not recognised as ISO 3166-1")
    conf["country"] = found.alpha_2.lower()
def parse_yaml_datetime(value: typing.Any) -> datetime | None:
"""Convert YAML date/datetime values to a datetime."""
if isinstance(value, datetime):
return value
if isinstance(value, date):
return datetime.combine(value, time())
if isinstance(value, str):
try:
return datetime.fromisoformat(value)
except ValueError:
return datetime.combine(date.fromisoformat(value.split("T")[0]), time())
return None
def same_type_as_start(
    start_value: typing.Any,
    new_dt: datetime,
    keep_timezone: bool = True,
    prefer_datetime: bool = False,
) -> typing.Any:
    """Shape an end datetime to match the representation of the start value.

    datetimes stay datetimes (optionally tz-stripped), dates collapse to
    dates unless prefer_datetime is set, strings come back as ISO
    strings with or without a time component, and anything else returns
    new_dt unchanged.
    """
    # datetime before date: datetime is a date subclass.
    if isinstance(start_value, datetime):
        return new_dt if keep_timezone else new_dt.replace(tzinfo=None)
    if isinstance(start_value, date):
        return new_dt if prefer_datetime else new_dt.date()
    if isinstance(start_value, str):
        looks_like_datetime = " " in start_value or "T" in start_value
        if prefer_datetime or looks_like_datetime:
            return new_dt.isoformat(sep=" ")
        return new_dt.date().isoformat()
    return new_dt
def maybe_extract_explicit_end_time(source_text: str) -> int | None:
"""Extract an explicit 12-hour clock end time for Geomob-style pages."""
lowered = source_text.lower()
if "10pm" in lowered or "10 pm" in lowered or "22:00" in lowered:
return 22
if "11pm" in lowered or "11 pm" in lowered or "23:00" in lowered:
return 23
return None
def normalise_end_field(new_conf: dict[str, typing.Any], source_text: str) -> None:
    """Make sure new_conf carries an end value, mutating it in place.

    Geomob events always get an end datetime (22:00 local, or an explicit
    end hour found in the page text). Other events only get an end — a
    copy of start — when the model left it out. Does nothing when start
    is missing or unparseable.
    """
    start_value = new_conf.get("start")
    if start_value is None:
        return
    start_dt = parse_yaml_datetime(start_value)
    if start_dt is None:
        return
    event_name = str(new_conf.get("name", "")).lower()
    event_url = str(new_conf.get("url", "")).lower()
    if "geomob" in event_name or "thegeomob.com" in event_url:
        # Geomob default: finish at 22:00 unless the page says otherwise.
        hour = maybe_extract_explicit_end_time(source_text)
        if hour is None:
            hour = 22
        end_dt = start_dt.replace(hour=hour, minute=0, second=0, microsecond=0)
        new_conf["end"] = same_type_as_start(
            start_value, end_dt, prefer_datetime=True
        )
        return
    if "end" not in new_conf:
        # Single-day fallback: mirror the start value as the end.
        new_conf["end"] = same_type_as_start(start_value, start_dt)
def main() -> None:
    """Fetch page, generate YAML via LLM, update conferences.yaml.

    Usage: add-new-conference <conference-url>

    Exits with a usage message when no URL argument is given (the
    original raised a bare IndexError).
    """
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <conference-url>")
    url = sys.argv[1]
    yaml_path = os.path.expanduser("~/src/personal-data/conferences.yaml")
    # Load existing conferences first; safe_load returns None for an
    # empty file, which would crash the loops below — treat it as [].
    with open(yaml_path) as f:
        conferences = yaml.safe_load(f) or []
    # Early exit before the (paid) API call: a year-bearing URL that is
    # already recorded cannot be a new edition.
    if url_has_year_component(url):
        for conf in conferences:
            if conf.get("url") == url:
                print(
                    "⚠️ Conference already exists in YAML "
                    + f"(url={url}), skipping before API call."
                )
                return
    # Full workflow: fetch the page, extract text and coordinates,
    # ask the model for a YAML entry.
    soup = fetch_webpage(url)
    source_text = webpage_to_text(soup)
    detected_coordinates = detect_page_coordinates(soup)
    prompt = build_prompt(url, source_text, detected_coordinates)
    new_yaml_text = get_from_open_ai(prompt)["yaml"]
    new_conf = yaml.safe_load(new_yaml_text)
    if isinstance(new_conf, list):
        # The model occasionally wraps the single entry in a list.
        new_conf = new_conf[0]
    validate_country(new_conf)
    normalise_end_field(new_conf, source_text)
    # Coordinates scraped from the page override anything the model said.
    if detected_coordinates is not None:
        new_conf["latitude"] = detected_coordinates[0]
        new_conf["longitude"] = detected_coordinates[1]
    updated = insert_sorted(conferences, new_conf)
    with open(yaml_path, "w") as f:
        text = yaml.dump(updated, sort_keys=False, allow_unicode=True)
        text = text.replace("\n- name:", "\n\n- name:")  # keep blank lines
        f.write(text.lstrip())
# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()