add-new-conference script
This commit is contained in:
parent
1fff1f136d
commit
2cc25cd203
1 changed files with 463 additions and 0 deletions
463
scripts/add-new-conference
Executable file
463
scripts/add-new-conference
Executable file
|
|
@ -0,0 +1,463 @@
|
||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
import configparser
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import typing
|
||||||
|
from datetime import date, datetime, time, timezone
|
||||||
|
from urllib.parse import parse_qs, urlparse
|
||||||
|
|
||||||
|
import html2text
|
||||||
|
import openai
|
||||||
|
import pycountry
|
||||||
|
import requests
|
||||||
|
import yaml
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# User-Agent string sent with every outgoing HTTP request made by this script.
user_agent = "add-new-conference/0.1"

# Regular expressions for pulling "latitude,longitude" pairs out of map URLs
# (primarily Google Maps).  Each pattern captures latitude in group 1 and
# longitude in group 2; they are tried in order by extract_google_maps_latlon.
coordinate_patterns = (
    # "@51.5,-0.1" path segment used by Google Maps place URLs
    re.compile(r"@(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    # "?q=lat,lon" query parameter
    re.compile(r"[?&]q=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    # "?ll=lat,lon" or "?center=lat,lon" query parameters
    re.compile(r"[?&](?:ll|center)=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    # "!3d<lat>!4d<lon>" data segments embedded in long Google Maps URLs
    re.compile(r"!3d(-?\d+(?:\.\d+)?)!4d(-?\d+(?:\.\d+)?)"),
    # "?destination=lat,lon" used by directions URLs
    re.compile(r"[?&]destination=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
)
|
||||||
|
|
||||||
|
|
||||||
|
def read_api_key() -> str:
    """Read the OpenAI API key from ~/.config/openai/config.

    The file is expected to be an INI file with an ``[openai]`` section
    containing an ``api_key`` entry.

    Returns:
        The API key string.

    Raises:
        RuntimeError: if the config file or the api_key entry is missing.
    """
    config_path = os.path.expanduser("~/.config/openai/config")
    parser = configparser.ConfigParser()
    # ConfigParser.read() silently ignores missing files and returns the
    # list of files actually read; fail with a helpful message instead of
    # a bare KeyError further down.
    if not parser.read(config_path):
        raise RuntimeError(f"OpenAI config file not found: {config_path}")
    try:
        return parser["openai"]["api_key"]
    except KeyError as err:
        raise RuntimeError(
            f"Missing [openai] api_key entry in {config_path}"
        ) from err
|
||||||
|
|
||||||
|
|
||||||
|
def build_prompt(
|
||||||
|
url: str,
|
||||||
|
source_text: str,
|
||||||
|
detected_coordinates: tuple[float, float] | None,
|
||||||
|
) -> str:
|
||||||
|
"""Build prompt with embedded YAML examples."""
|
||||||
|
examples = """
|
||||||
|
- name: Geomob London
|
||||||
|
topic: Maps
|
||||||
|
location: London
|
||||||
|
country: gb
|
||||||
|
start: 2026-01-28 18:00:00+00:00
|
||||||
|
end: 2026-01-28 23:00:00+00:00
|
||||||
|
url: https://thegeomob.com/post/jan-28th-2026-geomoblon-details
|
||||||
|
venue: Geovation Hub
|
||||||
|
address: Sutton Yard, 65 Goswell Rd, London EC1V 7EN
|
||||||
|
latitude: 51.5242464
|
||||||
|
longitude: -0.0997024
|
||||||
|
free: true
|
||||||
|
going: true
|
||||||
|
hashtag: '#geomobLON'
|
||||||
|
|
||||||
|
- name: DebConf 25
|
||||||
|
topic: Debian
|
||||||
|
location: Plouzané (Breast)
|
||||||
|
country: fr
|
||||||
|
start: 2025-07-07
|
||||||
|
end: 2025-07-20
|
||||||
|
url: https://wiki.debian.org/DebConf/25
|
||||||
|
going: true
|
||||||
|
cfp_url: https://debconf25.debconf.org/talks/new/
|
||||||
|
venue: École nationale supérieure Mines-Télécom Atlantique Bretagne Pays de la Loire
|
||||||
|
campus de Brest
|
||||||
|
latitude: 48.35934
|
||||||
|
longitude: -4.569889
|
||||||
|
|
||||||
|
- name: Wikimedia Hackathon
|
||||||
|
topic: Wikimedia
|
||||||
|
location: Istanbul
|
||||||
|
country: tr
|
||||||
|
start: 2025-05-02
|
||||||
|
end: 2025-05-04
|
||||||
|
venue: Renaissance Polat Istanbul Hotel
|
||||||
|
address: Yeşilyurt, Sahil Yolu Cd. No:2, 34149 Bakırköy/İstanbul
|
||||||
|
latitude: 40.959946
|
||||||
|
longitude: 28.838763
|
||||||
|
url: https://www.mediawiki.org/wiki/Wikimedia_Hackathon_2025
|
||||||
|
going: true
|
||||||
|
free: true
|
||||||
|
hackathon: true
|
||||||
|
registered: true
|
||||||
|
"""
|
||||||
|
coordinate_note = ""
|
||||||
|
if detected_coordinates is not None:
|
||||||
|
coordinate_note = (
|
||||||
|
"\nDetected venue coordinates from a map link on the page:\n"
|
||||||
|
f"latitude: {detected_coordinates[0]}\n"
|
||||||
|
f"longitude: {detected_coordinates[1]}\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
prompt = f"""
|
||||||
|
I keep a record of interesting conferences in a YAML file.
|
||||||
|
|
||||||
|
Here are some examples of the format I use:
|
||||||
|
|
||||||
|
{examples}
|
||||||
|
|
||||||
|
Now here is a new conference of interest:
|
||||||
|
|
||||||
|
Conference URL: {url}
|
||||||
|
|
||||||
|
Return the YAML representation for this conference following the
|
||||||
|
same style and keys as the examples. Only include keys if the
|
||||||
|
information is available. Do not invent details.
|
||||||
|
|
||||||
|
Important: the `country` field must always be a valid ISO 3166-1 alpha-2
|
||||||
|
country code (two lowercase letters, e.g. `ca` for Canada, `gb` for United Kingdom).
|
||||||
|
Do not output full country names.
|
||||||
|
|
||||||
|
Important: always include an `end` field. If the event is a single-day event,
|
||||||
|
the `end` can be the same date as `start`, or a same-day datetime if the page
|
||||||
|
provides an end time.
|
||||||
|
|
||||||
|
Important: if this is a Geomob event, use an `end` datetime of 22:00 local time
|
||||||
|
on the event date unless the page explicitly provides a different end time.
|
||||||
|
{coordinate_note}
|
||||||
|
|
||||||
|
Wrap your answer in a JSON object with a single key "yaml".
|
||||||
|
===
|
||||||
|
{source_text}
|
||||||
|
"""
|
||||||
|
return prompt
|
||||||
|
|
||||||
|
|
||||||
|
def get_from_open_ai(prompt: str, model: str = "gpt-5.4") -> dict[str, str]:
    """Send *prompt* to the OpenAI chat API and return the decoded JSON reply."""
    client = openai.OpenAI(api_key=read_api_key())

    completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
        response_format={"type": "json_object"},
    )

    # With a json_object response format the assistant message content is a
    # string of JSON; decode it into a dict for the caller.
    content = completion.choices[0].message.content
    assert isinstance(content, str)
    parsed = json.loads(content)
    return typing.cast(dict[str, str], parsed)
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_webpage(url: str) -> BeautifulSoup:
    """Download *url* and return the parsed HTML document."""
    reply = requests.get(url, headers={"User-Agent": user_agent})
    reply.raise_for_status()
    document = BeautifulSoup(reply.content, "lxml")
    return document
|
||||||
|
|
||||||
|
|
||||||
|
def webpage_to_text(soup: BeautifulSoup) -> str:
    """Convert parsed HTML into readable text content."""
    # Work on a re-parsed copy so the caller's tree is left untouched
    # when script/style tags are removed below.
    working_copy = BeautifulSoup(str(soup), "lxml")
    for tag in working_copy(["script", "style"]):
        tag.decompose()

    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True
    return converter.handle(str(working_copy))
|
||||||
|
|
||||||
|
|
||||||
|
def parse_osm_url(url: str) -> tuple[float, float] | None:
|
||||||
|
"""Extract latitude/longitude from an OpenStreetMap URL."""
|
||||||
|
parsed = urlparse(url)
|
||||||
|
query = parse_qs(parsed.query)
|
||||||
|
|
||||||
|
mlat = query.get("mlat")
|
||||||
|
mlon = query.get("mlon")
|
||||||
|
if mlat and mlon:
|
||||||
|
return float(mlat[0]), float(mlon[0])
|
||||||
|
|
||||||
|
if parsed.fragment.startswith("map="):
|
||||||
|
parts = parsed.fragment.split("/")
|
||||||
|
if len(parts) >= 3:
|
||||||
|
return float(parts[-2]), float(parts[-1])
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_google_maps_latlon(url: str) -> tuple[float, float] | None:
    """Extract latitude/longitude from a Google Maps URL."""
    # Try each known URL shape in order; the first pattern that matches wins.
    for pattern in coordinate_patterns:
        found = pattern.search(url)
        if found is None:
            continue
        return float(found.group(1)), float(found.group(2))
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def latlon_from_google_maps_url(
    url: str, timeout: int = 10
) -> tuple[float, float] | None:
    """Resolve a (possibly shortened) Google Maps URL and extract lat/lon.

    Follows redirects (e.g. maps.app.goo.gl short links), then looks for
    coordinates first in the final URL and, failing that, in the page body.

    Args:
        url: the Google Maps URL to resolve.
        timeout: request timeout in seconds.

    Returns:
        (latitude, longitude), or None when no coordinates are found.

    Raises:
        requests.RequestException: on network errors or HTTP error status.
    """
    response = requests.get(
        url,
        allow_redirects=True,
        timeout=timeout,
        # Use the script-wide User-Agent for consistency with fetch_webpage
        # (previously a stale hard-coded "lookup.py/1.0").
        headers={"User-Agent": user_agent},
    )
    response.raise_for_status()

    # The redirected final URL usually embeds the coordinates directly.
    coordinates = extract_google_maps_latlon(response.url)
    if coordinates is not None:
        return coordinates

    # Fall back to scanning the fetched HTML for the same patterns.
    return extract_google_maps_latlon(response.text)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_coordinates_from_url(url: str) -> tuple[float, float] | None:
    """Extract latitude/longitude from a supported map URL."""
    lowered = url.lower()

    if "openstreetmap.org" in lowered:
        return parse_osm_url(url)

    # Anything that is not OSM or Google-flavoured is unsupported.
    if "google." not in lowered and "maps.app.goo.gl" not in lowered:
        return None

    # Google-style URL: a cheap regex scan first, then resolve the link
    # over the network only as a last resort.
    found = extract_google_maps_latlon(url)
    if found is not None:
        return found

    try:
        return latlon_from_google_maps_url(url)
    except requests.RequestException:
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def detect_page_coordinates(soup: BeautifulSoup) -> tuple[float, float] | None:
    """Detect venue coordinates from Google Maps or OSM links on the page."""
    # Scan anchors in document order and return the first link that yields
    # a coordinate pair.
    for anchor in soup.find_all("a", href=True):
        target = str(anchor["href"]).strip()
        if not target:
            continue
        found = parse_coordinates_from_url(target)
        if found is not None:
            return found
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_date(date_str: str) -> datetime:
    """Parse ISO date or datetime into a naive datetime (UTC if tz-aware)."""
    try:
        parsed = datetime.fromisoformat(date_str)
    except ValueError:
        # fallback: keep only the YYYY-MM-DD portion of the string
        parsed = datetime.fromisoformat(date_str.split("T")[0])

    # All comparisons in this script are between naive datetimes, so
    # convert tz-aware values to UTC and drop the tzinfo.
    if parsed.tzinfo is None:
        return parsed
    return parsed.astimezone(timezone.utc).replace(tzinfo=None)
|
||||||
|
|
||||||
|
|
||||||
|
def url_has_year_component(url: str) -> bool:
    """Return True if the URL contains any digit (assume year-specific)."""
    # Heuristic: a digit anywhere in the URL is treated as a year marker.
    for character in url:
        if character.isdigit():
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def insert_sorted(
    conferences: list[dict[str, typing.Any]], new_conf: dict[str, typing.Any]
) -> list[dict[str, typing.Any]]:
    """Insert new_conf into conferences sorted by start date, skip if duplicate URL (with year awareness)."""
    new_url = new_conf.get("url")
    new_start = parse_date(str(new_conf["start"]))
    new_year = new_start.year

    if new_url:
        # Duplicate detection: a matching URL is a duplicate when the URL
        # itself looks year-specific, or when the stored entry is for the
        # same year.  A year-free URL reused across years is allowed.
        for existing in conferences:
            if existing.get("url") != new_url:
                continue
            if url_has_year_component(new_url):
                print(f"⚠️ Conference with URL {new_url} already exists, skipping.")
                return conferences
            existing_year = parse_date(str(existing["start"])).year
            if existing_year == new_year:
                print(
                    f"⚠️ Conference already exists in YAML "
                    f"(url={new_url}, year={existing_year}), skipping."
                )
                return conferences
            # Same URL, different year: keep looking / allow the new entry.

    # Keep the list ordered by start date: insert before the first entry
    # that starts later, otherwise append at the end.
    position = len(conferences)
    for idx, existing in enumerate(conferences):
        if new_start < parse_date(str(existing["start"])):
            position = idx
            break
    conferences.insert(position, new_conf)
    return conferences
|
||||||
|
|
||||||
|
|
||||||
|
def validate_country(conf: dict[str, typing.Any]) -> None:
    """Ensure country is a valid ISO 3166-1 alpha-2 code, normalising in place.

    Accepts an existing alpha-2 code (any case) or a country name and
    rewrites conf["country"] to the lowercase alpha-2 code.  A missing or
    empty country is left untouched.

    Args:
        conf: conference mapping, possibly containing a "country" key.

    Raises:
        ValueError: if the value is neither a valid alpha-2 code nor a
            recognisable country name.
    """
    country = conf.get("country")
    if not country:
        return

    country = country.strip()

    # Already a 2-letter code: validate it and normalise the case.
    if len(country) == 2:
        if pycountry.countries.get(alpha_2=country.upper()):
            conf["country"] = country.lower()
            return
        raise ValueError(f"❌ Invalid ISO 3166-1 code '{country}'")

    # Try an exact name lookup first.
    match = pycountry.countries.get(name=country)
    if not match:
        # fuzzy lookup (handles “United States” vs “United States of America”)
        try:
            match = pycountry.countries.search_fuzzy(country)[0]
        except LookupError as err:
            # Chain the original lookup failure for easier debugging
            # (previously the cause was silently discarded).
            raise ValueError(
                f"❌ Country '{country}' not recognised as ISO 3166-1"
            ) from err

    conf["country"] = match.alpha_2.lower()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_yaml_datetime(value: typing.Any) -> datetime | None:
|
||||||
|
"""Convert YAML date/datetime values to a datetime."""
|
||||||
|
if isinstance(value, datetime):
|
||||||
|
return value
|
||||||
|
|
||||||
|
if isinstance(value, date):
|
||||||
|
return datetime.combine(value, time())
|
||||||
|
|
||||||
|
if isinstance(value, str):
|
||||||
|
try:
|
||||||
|
return datetime.fromisoformat(value)
|
||||||
|
except ValueError:
|
||||||
|
return datetime.combine(date.fromisoformat(value.split("T")[0]), time())
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def same_type_as_start(
    start_value: typing.Any,
    new_dt: datetime,
    keep_timezone: bool = True,
    prefer_datetime: bool = False,
) -> typing.Any:
    """Return end value shaped like the start value when possible."""
    # Check datetime before date: datetime is a date subclass.
    if isinstance(start_value, datetime):
        return new_dt if keep_timezone else new_dt.replace(tzinfo=None)

    if isinstance(start_value, date):
        return new_dt if prefer_datetime else new_dt.date()

    if isinstance(start_value, str):
        # A space or "T" in the start string means it carried a time part,
        # so the end should too.
        wants_time = prefer_datetime or " " in start_value or "T" in start_value
        if wants_time:
            return new_dt.isoformat(sep=" ")
        return new_dt.date().isoformat()

    # Unknown start type: hand back the datetime unchanged.
    return new_dt
|
||||||
|
|
||||||
|
|
||||||
|
def maybe_extract_explicit_end_time(source_text: str) -> int | None:
|
||||||
|
"""Extract an explicit 12-hour clock end time for Geomob-style pages."""
|
||||||
|
lowered = source_text.lower()
|
||||||
|
|
||||||
|
if "10pm" in lowered or "10 pm" in lowered or "22:00" in lowered:
|
||||||
|
return 22
|
||||||
|
|
||||||
|
if "11pm" in lowered or "11 pm" in lowered or "23:00" in lowered:
|
||||||
|
return 23
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def normalise_end_field(new_conf: dict[str, typing.Any], source_text: str) -> None:
    """Ensure an end value exists, with a Geomob-specific fallback."""
    start_value = new_conf.get("start")
    if start_value is None:
        return

    start_dt = parse_yaml_datetime(start_value)
    if start_dt is None:
        return

    conf_name = str(new_conf.get("name", "")).lower()
    conf_url = str(new_conf.get("url", "")).lower()

    if "geomob" in conf_name or "thegeomob.com" in conf_url:
        # Geomob events end at 22:00 local time unless the page explicitly
        # names a different end hour.
        end_hour = maybe_extract_explicit_end_time(source_text)
        if end_hour is None:
            end_hour = 22
        geomob_end = start_dt.replace(
            hour=end_hour, minute=0, second=0, microsecond=0
        )
        new_conf["end"] = same_type_as_start(
            start_value, geomob_end, prefer_datetime=True
        )
        return

    # Non-Geomob fallback: mirror the start value as the end of a
    # single-day event, but never overwrite an existing end.
    if "end" not in new_conf:
        new_conf["end"] = same_type_as_start(start_value, start_dt)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Fetch page, generate YAML via LLM, update conferences.yaml."""
    # Guard against a missing URL argument with a clear usage message
    # instead of an IndexError traceback.
    if len(sys.argv) < 2:
        print("Usage: add-new-conference <conference-url>", file=sys.stderr)
        sys.exit(1)
    url = sys.argv[1]
    yaml_path = os.path.expanduser("~/src/personal-data/conferences.yaml")

    # Load conferences first
    with open(yaml_path, encoding="utf-8") as f:
        conferences = yaml.safe_load(f)

    # Early exit: if URL contains a year and already exists, skip before
    # spending an OpenAI API call.
    if url_has_year_component(url):
        for conf in conferences:
            if conf.get("url") == url:
                print(
                    "⚠️ Conference already exists in YAML "
                    + f"(url={url}), skipping before API call."
                )
                return

    # Otherwise proceed with full workflow
    soup = fetch_webpage(url)
    source_text = webpage_to_text(soup)
    detected_coordinates = detect_page_coordinates(soup)
    prompt = build_prompt(url, source_text, detected_coordinates)
    new_yaml_text = get_from_open_ai(prompt)["yaml"]

    new_conf = yaml.safe_load(new_yaml_text)
    if isinstance(new_conf, list):
        # The model sometimes wraps the single entry in a one-element list.
        new_conf = new_conf[0]

    validate_country(new_conf)
    normalise_end_field(new_conf, source_text)

    # Coordinates scraped from the page are more trustworthy than whatever
    # the model produced, so they take precedence.
    if detected_coordinates is not None:
        new_conf["latitude"] = detected_coordinates[0]
        new_conf["longitude"] = detected_coordinates[1]

    updated = insert_sorted(conferences, new_conf)

    with open(yaml_path, "w", encoding="utf-8") as f:
        text = yaml.dump(updated, sort_keys=False, allow_unicode=True)
        text = text.replace("\n- name:", "\n\n- name:")  # keep blank lines
        f.write(text.lstrip())
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Loading…
Add table
Add a link
Reference in a new issue