463 lines
14 KiB
Python
Executable file
463 lines
14 KiB
Python
Executable file
#!/usr/bin/python3
|
||
|
||
import configparser
|
||
import json
|
||
import os
|
||
import re
|
||
import sys
|
||
import typing
|
||
from datetime import date, datetime, time, timezone
|
||
from urllib.parse import parse_qs, urlparse
|
||
|
||
import html2text
|
||
import openai
|
||
import pycountry
|
||
import requests
|
||
import yaml
|
||
from bs4 import BeautifulSoup
|
||
|
||
# User-Agent header sent with HTTP requests made by this script.
user_agent = "add-new-conference/0.1"

# Regexes that pull a "latitude,longitude" pair out of Google Maps URLs.
# Each pattern captures latitude in group 1 and longitude in group 2.
coordinate_patterns = (
    # "@lat,lon" path segment, e.g. .../@51.52,-0.09,15z
    re.compile(r"@(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    # "q=lat,lon" query parameter
    re.compile(r"[?&]q=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    # "ll=lat,lon" or "center=lat,lon" query parameters
    re.compile(r"[?&](?:ll|center)=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    # "!3d<lat>!4d<lon>" segments inside Google Maps data blobs
    re.compile(r"!3d(-?\d+(?:\.\d+)?)!4d(-?\d+(?:\.\d+)?)"),
    # "destination=lat,lon" directions parameter
    re.compile(r"[?&]destination=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
)
|
||
|
||
|
||
def read_api_key() -> str:
    """Read the OpenAI API key from ~/.config/openai/config.

    Expects an INI file with an ``[openai]`` section containing ``api_key``.
    Raises KeyError if the section or key is absent.
    """
    parser = configparser.ConfigParser()
    parser.read(os.path.expanduser("~/.config/openai/config"))
    return parser["openai"]["api_key"]
|
||
|
||
|
||
def build_prompt(
|
||
url: str,
|
||
source_text: str,
|
||
detected_coordinates: tuple[float, float] | None,
|
||
) -> str:
|
||
"""Build prompt with embedded YAML examples."""
|
||
examples = """
|
||
- name: Geomob London
|
||
topic: Maps
|
||
location: London
|
||
country: gb
|
||
start: 2026-01-28 18:00:00+00:00
|
||
end: 2026-01-28 23:00:00+00:00
|
||
url: https://thegeomob.com/post/jan-28th-2026-geomoblon-details
|
||
venue: Geovation Hub
|
||
address: Sutton Yard, 65 Goswell Rd, London EC1V 7EN
|
||
latitude: 51.5242464
|
||
longitude: -0.0997024
|
||
free: true
|
||
going: true
|
||
hashtag: '#geomobLON'
|
||
|
||
- name: DebConf 25
|
||
topic: Debian
|
||
location: Plouzané (Breast)
|
||
country: fr
|
||
start: 2025-07-07
|
||
end: 2025-07-20
|
||
url: https://wiki.debian.org/DebConf/25
|
||
going: true
|
||
cfp_url: https://debconf25.debconf.org/talks/new/
|
||
venue: École nationale supérieure Mines-Télécom Atlantique Bretagne Pays de la Loire
|
||
campus de Brest
|
||
latitude: 48.35934
|
||
longitude: -4.569889
|
||
|
||
- name: Wikimedia Hackathon
|
||
topic: Wikimedia
|
||
location: Istanbul
|
||
country: tr
|
||
start: 2025-05-02
|
||
end: 2025-05-04
|
||
venue: Renaissance Polat Istanbul Hotel
|
||
address: Yeşilyurt, Sahil Yolu Cd. No:2, 34149 Bakırköy/İstanbul
|
||
latitude: 40.959946
|
||
longitude: 28.838763
|
||
url: https://www.mediawiki.org/wiki/Wikimedia_Hackathon_2025
|
||
going: true
|
||
free: true
|
||
hackathon: true
|
||
registered: true
|
||
"""
|
||
coordinate_note = ""
|
||
if detected_coordinates is not None:
|
||
coordinate_note = (
|
||
"\nDetected venue coordinates from a map link on the page:\n"
|
||
f"latitude: {detected_coordinates[0]}\n"
|
||
f"longitude: {detected_coordinates[1]}\n"
|
||
)
|
||
|
||
prompt = f"""
|
||
I keep a record of interesting conferences in a YAML file.
|
||
|
||
Here are some examples of the format I use:
|
||
|
||
{examples}
|
||
|
||
Now here is a new conference of interest:
|
||
|
||
Conference URL: {url}
|
||
|
||
Return the YAML representation for this conference following the
|
||
same style and keys as the examples. Only include keys if the
|
||
information is available. Do not invent details.
|
||
|
||
Important: the `country` field must always be a valid ISO 3166-1 alpha-2
|
||
country code (two lowercase letters, e.g. `ca` for Canada, `gb` for United Kingdom).
|
||
Do not output full country names.
|
||
|
||
Important: always include an `end` field. If the event is a single-day event,
|
||
the `end` can be the same date as `start`, or a same-day datetime if the page
|
||
provides an end time.
|
||
|
||
Important: if this is a Geomob event, use an `end` datetime of 22:00 local time
|
||
on the event date unless the page explicitly provides a different end time.
|
||
{coordinate_note}
|
||
|
||
Wrap your answer in a JSON object with a single key "yaml".
|
||
===
|
||
{source_text}
|
||
"""
|
||
return prompt
|
||
|
||
|
||
def get_from_open_ai(prompt: str, model: str = "gpt-5.4") -> dict[str, str]:
    """Send the prompt to OpenAI and return the parsed JSON reply.

    Args:
        prompt: The full prompt text (expects a JSON-object answer).
        model: OpenAI model name to use.

    Returns:
        The decoded JSON object from the model's reply.

    Raises:
        TypeError: If the API returns no text content.
    """
    client = openai.OpenAI(api_key=read_api_key())

    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
        # Forces the model to emit a single valid JSON object.
        response_format={"type": "json_object"},
    )

    reply = response.choices[0].message.content
    # `assert` is stripped under `python -O`; validate explicitly so a null
    # reply fails with a clear error instead of crashing inside json.loads.
    if not isinstance(reply, str):
        raise TypeError(
            f"expected string content in OpenAI reply, got {type(reply).__name__}"
        )
    return typing.cast(dict[str, str], json.loads(reply))
|
||
|
||
|
||
def fetch_webpage(url: str) -> BeautifulSoup:
    """Download *url* and return the parsed HTML document.

    Raises requests.HTTPError on a non-2xx response.
    """
    page = requests.get(url, headers={"User-Agent": user_agent})
    page.raise_for_status()
    return BeautifulSoup(page.content, "lxml")
|
||
|
||
|
||
def webpage_to_text(soup: BeautifulSoup) -> str:
    """Convert parsed HTML into readable plain text.

    Works on a re-parsed copy so the caller's soup is left untouched;
    drops <script>/<style> content and strips links and images.
    """
    cleaned = BeautifulSoup(str(soup), "lxml")

    for tag in cleaned(["script", "style"]):
        tag.decompose()

    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True
    return converter.handle(str(cleaned))
|
||
|
||
|
||
def parse_osm_url(url: str) -> tuple[float, float] | None:
|
||
"""Extract latitude/longitude from an OpenStreetMap URL."""
|
||
parsed = urlparse(url)
|
||
query = parse_qs(parsed.query)
|
||
|
||
mlat = query.get("mlat")
|
||
mlon = query.get("mlon")
|
||
if mlat and mlon:
|
||
return float(mlat[0]), float(mlon[0])
|
||
|
||
if parsed.fragment.startswith("map="):
|
||
parts = parsed.fragment.split("/")
|
||
if len(parts) >= 3:
|
||
return float(parts[-2]), float(parts[-1])
|
||
|
||
return None
|
||
|
||
|
||
def extract_google_maps_latlon(url: str) -> tuple[float, float] | None:
    """Extract latitude/longitude from a Google Maps URL.

    Tries each known coordinate pattern in order and returns the first
    match as (latitude, longitude), or None if nothing matches.
    """
    for pattern in coordinate_patterns:
        if (found := pattern.search(url)) is not None:
            return float(found.group(1)), float(found.group(2))
    return None
|
||
|
||
|
||
def latlon_from_google_maps_url(
    url: str, timeout: int = 10
) -> tuple[float, float] | None:
    """Resolve a Google Maps URL (following redirects) and extract lat/lon.

    Short links (maps.app.goo.gl) only reveal coordinates after the
    redirect; checks the final URL first, then the page body.
    """
    resp = requests.get(
        url,
        allow_redirects=True,
        timeout=timeout,
        headers={"User-Agent": "lookup.py/1.0"},
    )
    resp.raise_for_status()

    # A coordinate tuple is always truthy, so `or` falls through to the
    # body scan only when the final URL yields nothing.
    return extract_google_maps_latlon(resp.url) or extract_google_maps_latlon(
        resp.text
    )
|
||
|
||
|
||
def parse_coordinates_from_url(url: str) -> tuple[float, float] | None:
    """Extract latitude/longitude from a supported map URL.

    Handles OpenStreetMap links directly, and Google Maps links either
    inline or (for short links) via a resolving HTTP request. Returns
    None for unsupported hosts or on network failure.
    """
    normalized = url.lower()

    if "openstreetmap.org" in normalized:
        return parse_osm_url(url)

    if "google." not in normalized and "maps.app.goo.gl" not in normalized:
        return None

    direct = extract_google_maps_latlon(url)
    if direct is not None:
        return direct

    try:
        # Short links need a network round-trip; treat any request
        # failure as "no coordinates found".
        return latlon_from_google_maps_url(url)
    except requests.RequestException:
        return None
|
||
|
||
|
||
def detect_page_coordinates(soup: BeautifulSoup) -> tuple[float, float] | None:
    """Detect venue coordinates from Google Maps or OSM links on the page.

    Scans anchors in document order and returns the first link that
    yields coordinates, or None when no link does.
    """
    for anchor in soup.find_all("a", href=True):
        target = str(anchor["href"]).strip()
        if not target:
            continue
        if (coords := parse_coordinates_from_url(target)) is not None:
            return coords
    return None
|
||
|
||
|
||
def parse_date(date_str: str) -> datetime:
    """Parse an ISO date or datetime string into a naive datetime.

    Timezone-aware inputs are normalised to UTC and then stripped of
    tzinfo. A trailing "Z" (Zulu/UTC) is accepted on all supported
    Python versions (fromisoformat only understands it natively from
    3.11). If full parsing fails, falls back to just the YYYY-MM-DD
    part (before a "T" or space separator).

    Raises:
        ValueError: If not even a date can be extracted.
    """
    text = date_str.strip()
    if text.endswith(("Z", "z")):
        text = text[:-1] + "+00:00"

    try:
        dt = datetime.fromisoformat(text)
    except ValueError:
        # fallback: just take the YYYY-MM-DD part
        dt = datetime.fromisoformat(re.split(r"[T ]", text)[0])

    if dt.tzinfo is not None:
        # normalise tz-aware datetimes to UTC, then strip tzinfo
        dt = dt.astimezone(timezone.utc).replace(tzinfo=None)

    return dt
|
||
|
||
|
||
def url_has_year_component(url: str) -> bool:
    """Return True if the URL contains any digit (assumed year-specific).

    Deliberately loose: a single digit anywhere is treated as evidence
    the URL is tied to one edition of the event.
    """
    return bool(re.search(r"\d", url))
|
||
|
||
|
||
def insert_sorted(
    conferences: list[dict[str, typing.Any]], new_conf: dict[str, typing.Any]
) -> list[dict[str, typing.Any]]:
    """Insert new_conf into conferences sorted by start date, skip if duplicate URL (with year awareness)."""
    candidate_url = new_conf.get("url")
    candidate_start = parse_date(str(new_conf["start"]))

    if candidate_url:
        for existing in conferences:
            if existing.get("url") != candidate_url:
                continue

            if url_has_year_component(candidate_url):
                # A year-bearing URL identifies one edition: any exact
                # match is a duplicate.
                print(f"⚠️ Conference with URL {candidate_url} already exists, skipping.")
                return conferences

            existing_year = parse_date(str(existing["start"])).year
            if existing_year == candidate_start.year:
                # Same URL, same year → definitely the same event.
                print(
                    f"⚠️ Conference already exists in YAML "
                    f"(url={candidate_url}, year={existing_year}), skipping."
                )
                return conferences
            # Same URL reused for a different year → allow the new entry.

    # Find the first entry that starts later and insert just before it;
    # if none does, the new entry goes at the end.
    position = next(
        (
            idx
            for idx, existing in enumerate(conferences)
            if candidate_start < parse_date(str(existing["start"]))
        ),
        len(conferences),
    )
    conferences.insert(position, new_conf)
    return conferences
|
||
|
||
|
||
def validate_country(conf: dict[str, typing.Any]) -> None:
    """Ensure country is a valid ISO 3166-1 alpha-2 code, normalise if possible.

    Mutates ``conf["country"]`` in place to the lowercase alpha-2 code.
    A missing/empty country is left untouched.

    Raises:
        ValueError: If the value cannot be resolved to a country.
    """
    country = conf.get("country")
    if not country:
        return

    # YAML may hand back a non-string (e.g. a bare scalar); normalise
    # before .strip() instead of crashing.
    country = str(country).strip()

    # Already a 2-letter code: verify it and normalise the case.
    if len(country) == 2:
        if pycountry.countries.get(alpha_2=country.upper()):
            conf["country"] = country.lower()
            return
        raise ValueError(f"❌ Invalid ISO 3166-1 code '{country}'")

    # Try lookup by exact name first.
    match = pycountry.countries.get(name=country)
    if not match:
        # fuzzy lookup (handles “United States” vs “United States of America”)
        try:
            match = pycountry.countries.search_fuzzy(country)[0]
        except LookupError as err:
            # Chain the original lookup failure for easier debugging (B904).
            raise ValueError(
                f"❌ Country '{country}' not recognised as ISO 3166-1"
            ) from err

    conf["country"] = match.alpha_2.lower()
|
||
|
||
|
||
def parse_yaml_datetime(value: typing.Any) -> datetime | None:
|
||
"""Convert YAML date/datetime values to a datetime."""
|
||
if isinstance(value, datetime):
|
||
return value
|
||
|
||
if isinstance(value, date):
|
||
return datetime.combine(value, time())
|
||
|
||
if isinstance(value, str):
|
||
try:
|
||
return datetime.fromisoformat(value)
|
||
except ValueError:
|
||
return datetime.combine(date.fromisoformat(value.split("T")[0]), time())
|
||
|
||
return None
|
||
|
||
|
||
def same_type_as_start(
    start_value: typing.Any,
    new_dt: datetime,
    keep_timezone: bool = True,
    prefer_datetime: bool = False,
) -> typing.Any:
    """Return an end value shaped like the start value when possible.

    Args:
        start_value: The existing start field (datetime, date, str, or other).
        new_dt: The computed end datetime.
        keep_timezone: When start is a datetime, keep new_dt's tzinfo.
        prefer_datetime: Force a datetime-shaped result even for date/str starts.

    Returns:
        new_dt reshaped to match start_value's type; new_dt unchanged for
        unrecognised start types.
    """
    # datetime must be tested before date — it is a subclass of date.
    if isinstance(start_value, datetime):
        return new_dt if keep_timezone else new_dt.replace(tzinfo=None)

    if isinstance(start_value, date):
        return new_dt if prefer_datetime else new_dt.date()

    if isinstance(start_value, str):
        wants_time = prefer_datetime or " " in start_value or "T" in start_value
        return new_dt.isoformat(sep=" ") if wants_time else new_dt.date().isoformat()

    return new_dt
|
||
|
||
|
||
def maybe_extract_explicit_end_time(source_text: str) -> int | None:
|
||
"""Extract an explicit 12-hour clock end time for Geomob-style pages."""
|
||
lowered = source_text.lower()
|
||
|
||
if "10pm" in lowered or "10 pm" in lowered or "22:00" in lowered:
|
||
return 22
|
||
|
||
if "11pm" in lowered or "11 pm" in lowered or "23:00" in lowered:
|
||
return 23
|
||
|
||
return None
|
||
|
||
|
||
def normalise_end_field(new_conf: dict[str, typing.Any], source_text: str) -> None:
    """Ensure an end value exists, with a Geomob-specific fallback.

    Mutates new_conf in place. Geomob events always get an end datetime
    (22:00 unless the page states otherwise); other events without an
    end get one mirroring the start value.
    """
    start_value = new_conf.get("start")
    if start_value is None:
        return

    start_dt = parse_yaml_datetime(start_value)
    if start_dt is None:
        return

    event_name = str(new_conf.get("name", "")).lower()
    event_url = str(new_conf.get("url", "")).lower()

    if "geomob" in event_name or "thegeomob.com" in event_url:
        # Geomob events run to 22:00 local time unless the page says
        # otherwise (hours are 22/23, never 0, so `or` is safe here).
        end_hour = maybe_extract_explicit_end_time(source_text) or 22
        end_dt = start_dt.replace(hour=end_hour, minute=0, second=0, microsecond=0)
        new_conf["end"] = same_type_as_start(start_value, end_dt, prefer_datetime=True)
        return

    if "end" not in new_conf:
        new_conf["end"] = same_type_as_start(start_value, start_dt)
|
||
|
||
|
||
def main() -> None:
    """Fetch page, generate YAML via LLM, update conferences.yaml.

    Usage: script URL. Exits with status 1 and a usage message when no
    URL argument is supplied.
    """
    # Guard the argv access: previously a missing argument raised a raw
    # IndexError traceback.
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} URL", file=sys.stderr)
        sys.exit(1)

    url = sys.argv[1]
    yaml_path = os.path.expanduser("~/src/personal-data/conferences.yaml")

    # Load conferences first
    with open(yaml_path) as f:
        # An empty YAML file parses as None; fall back to an empty list
        # so the duplicate scan and insert still work.
        conferences = yaml.safe_load(f) or []

    # Early exit: if URL contains a year and already exists, skip
    # (saves an API call).
    if url_has_year_component(url):
        for conf in conferences:
            if conf.get("url") == url:
                print(
                    "⚠️ Conference already exists in YAML "
                    + f"(url={url}), skipping before API call."
                )
                return

    # Otherwise proceed with full workflow
    soup = fetch_webpage(url)
    source_text = webpage_to_text(soup)
    detected_coordinates = detect_page_coordinates(soup)
    prompt = build_prompt(url, source_text, detected_coordinates)
    new_yaml_text = get_from_open_ai(prompt)["yaml"]

    new_conf = yaml.safe_load(new_yaml_text)
    if isinstance(new_conf, list):
        # The model sometimes wraps the single entry in a one-item list.
        new_conf = new_conf[0]

    validate_country(new_conf)
    normalise_end_field(new_conf, source_text)

    # Coordinates scraped from actual map links override whatever the
    # model produced — they are more trustworthy.
    if detected_coordinates is not None:
        new_conf["latitude"] = detected_coordinates[0]
        new_conf["longitude"] = detected_coordinates[1]

    updated = insert_sorted(conferences, new_conf)

    with open(yaml_path, "w") as f:
        text = yaml.dump(updated, sort_keys=False, allow_unicode=True)
        text = text.replace("\n- name:", "\n\n- name:")  # keep blank lines
        f.write(text.lstrip())
|
||
|
||
|
||
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    main()
|