Refactor add-new-conference into agenda module

parent 2cc25cd203
commit 76360c25f3
3 changed files with 634 additions and 455 deletions

agenda/add_new_conference.py (new file, 467 lines)
@@ -0,0 +1,467 @@
"""Helpers for adding conferences to the YAML data file."""
|
||||
|
||||
import configparser
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import typing
|
||||
from datetime import date, datetime, time, timezone
|
||||
from urllib.parse import parse_qs, urlparse
|
||||
|
||||
import html2text
|
||||
import lxml.html # type: ignore[import-untyped]
|
||||
import openai
|
||||
import pycountry
|
||||
import requests
|
||||
import yaml
|
||||
|
||||
USER_AGENT = "add-new-conference/0.1"
|
||||
COORDINATE_PATTERNS = (
|
||||
re.compile(r"@(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
|
||||
re.compile(r"[?&]q=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
|
||||
re.compile(r"[?&](?:ll|center)=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
|
||||
re.compile(r"!3d(-?\d+(?:\.\d+)?)!4d(-?\d+(?:\.\d+)?)"),
|
||||
re.compile(r"[?&]destination=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
|
||||
)
|
||||
|
||||
|
||||
def read_api_key() -> str:
|
||||
"""Read API key from ~/.config/openai/config."""
|
||||
config_path = os.path.expanduser("~/.config/openai/config")
|
||||
parser = configparser.ConfigParser()
|
||||
parser.read(config_path)
|
||||
return parser["openai"]["api_key"]
|
||||
|
||||
|
||||
def build_prompt(
|
||||
url: str,
|
||||
source_text: str,
|
||||
detected_coordinates: tuple[float, float] | None,
|
||||
) -> str:
|
||||
"""Build prompt with embedded YAML examples."""
|
||||
examples = """
|
||||
- name: Geomob London
|
||||
topic: Maps
|
||||
location: London
|
||||
country: gb
|
||||
start: 2026-01-28 18:00:00+00:00
|
||||
end: 2026-01-28 23:00:00+00:00
|
||||
url: https://thegeomob.com/post/jan-28th-2026-geomoblon-details
|
||||
venue: Geovation Hub
|
||||
address: Sutton Yard, 65 Goswell Rd, London EC1V 7EN
|
||||
latitude: 51.5242464
|
||||
longitude: -0.0997024
|
||||
free: true
|
||||
going: true
|
||||
hashtag: '#geomobLON'
|
||||
|
||||
- name: DebConf 25
|
||||
topic: Debian
|
||||
location: Plouzané (Breast)
|
||||
country: fr
|
||||
start: 2025-07-07
|
||||
end: 2025-07-20
|
||||
url: https://wiki.debian.org/DebConf/25
|
||||
going: true
|
||||
cfp_url: https://debconf25.debconf.org/talks/new/
|
||||
venue: École nationale supérieure Mines-Télécom Atlantique Bretagne Pays de la Loire
|
||||
campus de Brest
|
||||
latitude: 48.35934
|
||||
longitude: -4.569889
|
||||
|
||||
- name: Wikimedia Hackathon
|
||||
topic: Wikimedia
|
||||
location: Istanbul
|
||||
country: tr
|
||||
start: 2025-05-02
|
||||
end: 2025-05-04
|
||||
venue: Renaissance Polat Istanbul Hotel
|
||||
address: Yeşilyurt, Sahil Yolu Cd. No:2, 34149 Bakırköy/İstanbul
|
||||
latitude: 40.959946
|
||||
longitude: 28.838763
|
||||
url: https://www.mediawiki.org/wiki/Wikimedia_Hackathon_2025
|
||||
going: true
|
||||
free: true
|
||||
hackathon: true
|
||||
registered: true
|
||||
"""
|
||||
coordinate_note = ""
|
||||
if detected_coordinates is not None:
|
||||
coordinate_note = (
|
||||
"\nDetected venue coordinates from a map link on the page:\n"
|
||||
f"latitude: {detected_coordinates[0]}\n"
|
||||
f"longitude: {detected_coordinates[1]}\n"
|
||||
)
|
||||
|
||||
prompt = f"""
|
||||
I keep a record of interesting conferences in a YAML file.
|
||||
|
||||
Here are some examples of the format I use:
|
||||
|
||||
{examples}
|
||||
|
||||
Now here is a new conference of interest:
|
||||
|
||||
Conference URL: {url}
|
||||
|
||||
Return the YAML representation for this conference following the
|
||||
same style and keys as the examples. Only include keys if the
|
||||
information is available. Do not invent details.
|
||||
|
||||
Important: the `country` field must always be a valid ISO 3166-1 alpha-2
|
||||
country code (two lowercase letters, e.g. `ca` for Canada, `gb` for United Kingdom).
|
||||
Do not output full country names.
|
||||
|
||||
Important: always include an `end` field. If the event is a single-day event,
|
||||
the `end` can be the same date as `start`, or a same-day datetime if the page
|
||||
provides an end time.
|
||||
|
||||
Important: if this is a Geomob event, use an `end` datetime of 22:00 local time
|
||||
on the event date unless the page explicitly provides a different end time.
|
||||
{coordinate_note}
|
||||
|
||||
Wrap your answer in a JSON object with a single key "yaml".
|
||||
===
|
||||
{source_text}
|
||||
"""
|
||||
return prompt
|
||||
|
||||
|
||||
def get_from_open_ai(prompt: str, model: str = "gpt-5.4") -> dict[str, str]:
|
||||
"""Pass prompt to OpenAI and get reply."""
|
||||
client = openai.OpenAI(api_key=read_api_key())
|
||||
|
||||
response = client.chat.completions.create(
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
model=model,
|
||||
response_format={"type": "json_object"},
|
||||
)
|
||||
|
||||
reply = response.choices[0].message.content
|
||||
assert isinstance(reply, str)
|
||||
return typing.cast(dict[str, str], json.loads(reply))
|
||||
|
||||
|
||||
def fetch_webpage(url: str) -> lxml.html.HtmlElement:
|
||||
"""Fetch webpage HTML and parse it."""
|
||||
response = requests.get(url, headers={"User-Agent": USER_AGENT})
|
||||
response.raise_for_status()
|
||||
return lxml.html.fromstring(response.content)
|
||||
|
||||
|
||||
def webpage_to_text(root: lxml.html.HtmlElement) -> str:
|
||||
"""Convert parsed HTML into readable text content."""
|
||||
root_copy = lxml.html.fromstring(lxml.html.tostring(root))
|
||||
|
||||
for script_or_style in root_copy.xpath("//script|//style"):
|
||||
script_or_style.drop_tree()
|
||||
|
||||
text_maker = html2text.HTML2Text()
|
||||
text_maker.ignore_links = True
|
||||
text_maker.ignore_images = True
|
||||
return text_maker.handle(lxml.html.tostring(root_copy, encoding="unicode"))
|
||||
|
||||
|
||||
def parse_osm_url(url: str) -> tuple[float, float] | None:
|
||||
"""Extract latitude/longitude from an OpenStreetMap URL."""
|
||||
parsed = urlparse(url)
|
||||
query = parse_qs(parsed.query)
|
||||
|
||||
mlat = query.get("mlat")
|
||||
mlon = query.get("mlon")
|
||||
if mlat and mlon:
|
||||
return float(mlat[0]), float(mlon[0])
|
||||
|
||||
if parsed.fragment.startswith("map="):
|
||||
parts = parsed.fragment.split("/")
|
||||
if len(parts) >= 3:
|
||||
return float(parts[-2]), float(parts[-1])
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def extract_google_maps_latlon(url: str) -> tuple[float, float] | None:
|
||||
"""Extract latitude/longitude from a Google Maps URL."""
|
||||
for pattern in COORDINATE_PATTERNS:
|
||||
match = pattern.search(url)
|
||||
if match:
|
||||
return float(match.group(1)), float(match.group(2))
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def latlon_from_google_maps_url(
|
||||
url: str, timeout: int = 10
|
||||
) -> tuple[float, float] | None:
|
||||
"""Resolve a Google Maps URL and extract latitude/longitude."""
|
||||
response = requests.get(
|
||||
url,
|
||||
allow_redirects=True,
|
||||
timeout=timeout,
|
||||
headers={"User-Agent": "lookup.py/1.0"},
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
coordinates = extract_google_maps_latlon(response.url)
|
||||
if coordinates is not None:
|
||||
return coordinates
|
||||
|
||||
return extract_google_maps_latlon(response.text)
|
||||
|
||||
|
||||
def parse_coordinates_from_url(url: str) -> tuple[float, float] | None:
|
||||
"""Extract latitude/longitude from a supported map URL."""
|
||||
lower_url = url.lower()
|
||||
|
||||
if "openstreetmap.org" in lower_url:
|
||||
return parse_osm_url(url)
|
||||
|
||||
if "google." in lower_url or "maps.app.goo.gl" in lower_url:
|
||||
coordinates = extract_google_maps_latlon(url)
|
||||
if coordinates is not None:
|
||||
return coordinates
|
||||
|
||||
try:
|
||||
return latlon_from_google_maps_url(url)
|
||||
except requests.RequestException:
|
||||
return None
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def detect_page_coordinates(root: lxml.html.HtmlElement) -> tuple[float, float] | None:
|
||||
"""Detect venue coordinates from Google Maps or OSM links."""
|
||||
for link in root.xpath("//a[@href]"):
|
||||
href = str(link.get("href", "")).strip()
|
||||
if not href:
|
||||
continue
|
||||
|
||||
coordinates = parse_coordinates_from_url(href)
|
||||
if coordinates is not None:
|
||||
return coordinates
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def parse_date(date_str: str) -> datetime:
|
||||
"""Parse ISO date or datetime into a naive datetime (UTC if tz-aware)."""
|
||||
try:
|
||||
dt = datetime.fromisoformat(date_str)
|
||||
except ValueError:
|
||||
dt = datetime.fromisoformat(date_str.split("T")[0])
|
||||
|
||||
if dt.tzinfo is not None:
|
||||
dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
|
||||
|
||||
return dt
|
||||
|
||||
|
||||
def url_has_year_component(url: str) -> bool:
|
||||
"""Return True if the URL contains any digit."""
|
||||
return any(ch.isdigit() for ch in url)
|
||||
|
||||
|
||||
def insert_sorted(
|
||||
conferences: list[dict[str, typing.Any]], new_conf: dict[str, typing.Any]
|
||||
) -> list[dict[str, typing.Any]]:
|
||||
"""Insert a conference sorted by start date and skip duplicate URLs."""
|
||||
new_url = new_conf.get("url")
|
||||
new_start = parse_date(str(new_conf["start"]))
|
||||
new_year = new_start.year
|
||||
|
||||
if new_url:
|
||||
for conf in conferences:
|
||||
if conf.get("url") == new_url:
|
||||
existing_start = parse_date(str(conf["start"]))
|
||||
existing_year = existing_start.year
|
||||
|
||||
if url_has_year_component(new_url):
|
||||
print(f"⚠️ Conference with URL {new_url} already exists, skipping.")
|
||||
return conferences
|
||||
if existing_year == new_year:
|
||||
print(
|
||||
f"⚠️ Conference already exists in YAML "
|
||||
f"(url={new_url}, year={existing_year}), skipping."
|
||||
)
|
||||
return conferences
|
||||
|
||||
for idx, conf in enumerate(conferences):
|
||||
existing_start = parse_date(str(conf["start"]))
|
||||
if new_start < existing_start:
|
||||
conferences.insert(idx, new_conf)
|
||||
return conferences
|
||||
conferences.append(new_conf)
|
||||
return conferences
|
||||
|
||||
|
||||
def validate_country(conf: dict[str, typing.Any]) -> None:
|
||||
"""Ensure country is a valid ISO 3166-1 alpha-2 code, normalise if possible."""
|
||||
country = conf.get("country")
|
||||
if not country:
|
||||
return
|
||||
|
||||
country = country.strip()
|
||||
if len(country) == 2:
|
||||
if pycountry.countries.get(alpha_2=country.upper()):
|
||||
conf["country"] = country.lower()
|
||||
return
|
||||
raise ValueError(f"❌ Invalid ISO 3166-1 code '{country}'")
|
||||
|
||||
match = pycountry.countries.get(name=country)
|
||||
if not match:
|
||||
try:
|
||||
match = pycountry.countries.search_fuzzy(country)[0]
|
||||
except LookupError as exc:
|
||||
raise ValueError(
|
||||
f"❌ Country '{country}' not recognised as ISO 3166-1"
|
||||
) from exc
|
||||
|
||||
conf["country"] = match.alpha_2.lower()
|
||||
|
||||
|
||||
def parse_yaml_datetime(value: typing.Any) -> datetime | None:
|
||||
"""Convert YAML date/datetime values to a datetime."""
|
||||
if isinstance(value, datetime):
|
||||
return value
|
||||
|
||||
if isinstance(value, date):
|
||||
return datetime.combine(value, time())
|
||||
|
||||
if isinstance(value, str):
|
||||
try:
|
||||
return datetime.fromisoformat(value)
|
||||
except ValueError:
|
||||
return datetime.combine(date.fromisoformat(value.split("T")[0]), time())
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def same_type_as_start(
|
||||
start_value: typing.Any,
|
||||
new_dt: datetime,
|
||||
keep_timezone: bool = True,
|
||||
prefer_datetime: bool = False,
|
||||
) -> typing.Any:
|
||||
"""Return end value shaped like the start value when possible."""
|
||||
if isinstance(start_value, datetime):
|
||||
if keep_timezone:
|
||||
return new_dt
|
||||
return new_dt.replace(tzinfo=None)
|
||||
|
||||
if isinstance(start_value, date):
|
||||
if prefer_datetime:
|
||||
return new_dt
|
||||
return new_dt.date()
|
||||
|
||||
if isinstance(start_value, str):
|
||||
if prefer_datetime or " " in start_value or "T" in start_value:
|
||||
return new_dt.isoformat(sep=" ")
|
||||
return new_dt.date().isoformat()
|
||||
|
||||
return new_dt
|
||||
|
||||
|
||||
def maybe_extract_explicit_end_time(source_text: str) -> int | None:
|
||||
"""Extract an explicit 12-hour clock end time for Geomob-style pages."""
|
||||
lowered = source_text.lower()
|
||||
|
||||
if "10pm" in lowered or "10 pm" in lowered or "22:00" in lowered:
|
||||
return 22
|
||||
|
||||
if "11pm" in lowered or "11 pm" in lowered or "23:00" in lowered:
|
||||
return 23
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def normalise_end_field(new_conf: dict[str, typing.Any], source_text: str) -> None:
|
||||
"""Ensure an end value exists, with a Geomob-specific fallback."""
|
||||
start_value = new_conf.get("start")
|
||||
if start_value is None:
|
||||
return
|
||||
|
||||
start_dt = parse_yaml_datetime(start_value)
|
||||
if start_dt is None:
|
||||
return
|
||||
|
||||
name = str(new_conf.get("name", ""))
|
||||
url = str(new_conf.get("url", ""))
|
||||
is_geomob = "geomob" in name.lower() or "thegeomob.com" in url.lower()
|
||||
|
||||
if is_geomob:
|
||||
end_hour = maybe_extract_explicit_end_time(source_text)
|
||||
if end_hour is None:
|
||||
end_hour = 22
|
||||
|
||||
geomob_end = start_dt.replace(hour=end_hour, minute=0, second=0, microsecond=0)
|
||||
new_conf["end"] = same_type_as_start(
|
||||
start_value, geomob_end, prefer_datetime=True
|
||||
)
|
||||
return
|
||||
|
||||
if "end" not in new_conf:
|
||||
new_conf["end"] = same_type_as_start(start_value, start_dt)
|
||||
|
||||
|
||||
def load_conferences(yaml_path: str) -> list[dict[str, typing.Any]]:
|
||||
"""Load conference YAML."""
|
||||
with open(yaml_path) as file:
|
||||
loaded = yaml.safe_load(file)
|
||||
assert isinstance(loaded, list)
|
||||
return typing.cast(list[dict[str, typing.Any]], loaded)
|
||||
|
||||
|
||||
def dump_conferences(yaml_path: str, conferences: list[dict[str, typing.Any]]) -> None:
|
||||
"""Write conference YAML."""
|
||||
with open(yaml_path, "w") as file:
|
||||
text = yaml.dump(conferences, sort_keys=False, allow_unicode=True)
|
||||
text = text.replace("\n- name:", "\n\n- name:")
|
||||
file.write(text.lstrip())
|
||||
|
||||
|
||||
def add_new_conference(url: str, yaml_path: str) -> bool:
|
||||
"""Fetch, generate and insert a conference into the YAML file."""
|
||||
conferences = load_conferences(yaml_path)
|
||||
|
||||
if url_has_year_component(url):
|
||||
for conf in conferences:
|
||||
if conf.get("url") == url:
|
||||
print(
|
||||
"⚠️ Conference already exists in YAML "
|
||||
+ f"(url={url}), skipping before API call."
|
||||
)
|
||||
return False
|
||||
|
||||
soup = fetch_webpage(url)
|
||||
source_text = webpage_to_text(soup)
|
||||
detected_coordinates = detect_page_coordinates(soup)
|
||||
prompt = build_prompt(url, source_text, detected_coordinates)
|
||||
new_yaml_text = get_from_open_ai(prompt)["yaml"]
|
||||
|
||||
new_conf = yaml.safe_load(new_yaml_text)
|
||||
if isinstance(new_conf, list):
|
||||
new_conf = new_conf[0]
|
||||
assert isinstance(new_conf, dict)
|
||||
|
||||
validate_country(new_conf)
|
||||
normalise_end_field(new_conf, source_text)
|
||||
|
||||
if detected_coordinates is not None:
|
||||
new_conf["latitude"] = detected_coordinates[0]
|
||||
new_conf["longitude"] = detected_coordinates[1]
|
||||
|
||||
updated = insert_sorted(conferences, new_conf)
|
||||
dump_conferences(yaml_path, updated)
|
||||
return True
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
|
||||
"""CLI entrypoint."""
|
||||
args = argv if argv is not None else sys.argv[1:]
|
||||
if not args:
|
||||
raise SystemExit("Usage: add-new-conference URL")
|
||||
|
||||
yaml_path = os.path.expanduser("~/src/personal-data/conferences.yaml")
|
||||
add_new_conference(args[0], yaml_path)
|
||||
return 0
|
||||
|
|
@@ -1,463 +1,15 @@
#!/usr/bin/python3

import configparser
import json
import os
import re
import sys
import typing
from datetime import date, datetime, time, timezone
from urllib.parse import parse_qs, urlparse

import html2text
import openai
import pycountry
import requests
import yaml
from bs4 import BeautifulSoup

user_agent = "add-new-conference/0.1"
coordinate_patterns = (
    re.compile(r"@(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    re.compile(r"[?&]q=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    re.compile(r"[?&](?:ll|center)=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    re.compile(r"!3d(-?\d+(?:\.\d+)?)!4d(-?\d+(?:\.\d+)?)"),
    re.compile(r"[?&]destination=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
)


def read_api_key() -> str:
    """Read API key from ~/.config/openai/config."""
    config_path = os.path.expanduser("~/.config/openai/config")
    parser = configparser.ConfigParser()
    parser.read(config_path)
    return parser["openai"]["api_key"]


def build_prompt(
    url: str,
    source_text: str,
    detected_coordinates: tuple[float, float] | None,
) -> str:
    """Build prompt with embedded YAML examples."""
    examples = """
- name: Geomob London
  topic: Maps
  location: London
  country: gb
  start: 2026-01-28 18:00:00+00:00
  end: 2026-01-28 23:00:00+00:00
  url: https://thegeomob.com/post/jan-28th-2026-geomoblon-details
  venue: Geovation Hub
  address: Sutton Yard, 65 Goswell Rd, London EC1V 7EN
  latitude: 51.5242464
  longitude: -0.0997024
  free: true
  going: true
  hashtag: '#geomobLON'

- name: DebConf 25
  topic: Debian
  location: Plouzané (Breast)
  country: fr
  start: 2025-07-07
  end: 2025-07-20
  url: https://wiki.debian.org/DebConf/25
  going: true
  cfp_url: https://debconf25.debconf.org/talks/new/
  venue: École nationale supérieure Mines-Télécom Atlantique Bretagne Pays de la Loire
    campus de Brest
  latitude: 48.35934
  longitude: -4.569889

- name: Wikimedia Hackathon
  topic: Wikimedia
  location: Istanbul
  country: tr
  start: 2025-05-02
  end: 2025-05-04
  venue: Renaissance Polat Istanbul Hotel
  address: Yeşilyurt, Sahil Yolu Cd. No:2, 34149 Bakırköy/İstanbul
  latitude: 40.959946
  longitude: 28.838763
  url: https://www.mediawiki.org/wiki/Wikimedia_Hackathon_2025
  going: true
  free: true
  hackathon: true
  registered: true
"""
    coordinate_note = ""
    if detected_coordinates is not None:
        coordinate_note = (
            "\nDetected venue coordinates from a map link on the page:\n"
            f"latitude: {detected_coordinates[0]}\n"
            f"longitude: {detected_coordinates[1]}\n"
        )

    prompt = f"""
I keep a record of interesting conferences in a YAML file.

Here are some examples of the format I use:

{examples}

Now here is a new conference of interest:

Conference URL: {url}

Return the YAML representation for this conference following the
same style and keys as the examples. Only include keys if the
information is available. Do not invent details.

Important: the `country` field must always be a valid ISO 3166-1 alpha-2
country code (two lowercase letters, e.g. `ca` for Canada, `gb` for United Kingdom).
Do not output full country names.

Important: always include an `end` field. If the event is a single-day event,
the `end` can be the same date as `start`, or a same-day datetime if the page
provides an end time.

Important: if this is a Geomob event, use an `end` datetime of 22:00 local time
on the event date unless the page explicitly provides a different end time.
{coordinate_note}

Wrap your answer in a JSON object with a single key "yaml".
===
{source_text}
"""
    return prompt


def get_from_open_ai(prompt: str, model: str = "gpt-5.4") -> dict[str, str]:
    """Pass prompt to OpenAI and get reply."""
    client = openai.OpenAI(api_key=read_api_key())

    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
        response_format={"type": "json_object"},
    )

    reply = response.choices[0].message.content
    assert isinstance(reply, str)
    return typing.cast(dict[str, str], json.loads(reply))


def fetch_webpage(url: str) -> BeautifulSoup:
    """Fetch webpage HTML and parse it."""
    response = requests.get(url, headers={"User-Agent": user_agent})
    response.raise_for_status()
    return BeautifulSoup(response.content, "lxml")


def webpage_to_text(soup: BeautifulSoup) -> str:
    """Convert parsed HTML into readable text content."""
    soup_copy = BeautifulSoup(str(soup), "lxml")

    for script_or_style in soup_copy(["script", "style"]):
        script_or_style.decompose()

    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.ignore_images = True
    return text_maker.handle(str(soup_copy))


def parse_osm_url(url: str) -> tuple[float, float] | None:
    """Extract latitude/longitude from an OpenStreetMap URL."""
    parsed = urlparse(url)
    query = parse_qs(parsed.query)

    mlat = query.get("mlat")
    mlon = query.get("mlon")
    if mlat and mlon:
        return float(mlat[0]), float(mlon[0])

    if parsed.fragment.startswith("map="):
        parts = parsed.fragment.split("/")
        if len(parts) >= 3:
            return float(parts[-2]), float(parts[-1])

    return None


def extract_google_maps_latlon(url: str) -> tuple[float, float] | None:
    """Extract latitude/longitude from a Google Maps URL."""
    for pattern in coordinate_patterns:
        match = pattern.search(url)
        if match:
            return float(match.group(1)), float(match.group(2))

    return None


def latlon_from_google_maps_url(
    url: str, timeout: int = 10
) -> tuple[float, float] | None:
    """Resolve a Google Maps URL and extract latitude/longitude."""
    response = requests.get(
        url,
        allow_redirects=True,
        timeout=timeout,
        headers={"User-Agent": "lookup.py/1.0"},
    )
    response.raise_for_status()

    coordinates = extract_google_maps_latlon(response.url)
    if coordinates is not None:
        return coordinates

    return extract_google_maps_latlon(response.text)


def parse_coordinates_from_url(url: str) -> tuple[float, float] | None:
    """Extract latitude/longitude from a supported map URL."""
    lower_url = url.lower()

    if "openstreetmap.org" in lower_url:
        return parse_osm_url(url)

    if "google." in lower_url or "maps.app.goo.gl" in lower_url:
        coordinates = extract_google_maps_latlon(url)
        if coordinates is not None:
            return coordinates

        try:
            return latlon_from_google_maps_url(url)
        except requests.RequestException:
            return None

    return None


def detect_page_coordinates(soup: BeautifulSoup) -> tuple[float, float] | None:
    """Detect venue coordinates from Google Maps or OSM links."""
    for link in soup.find_all("a", href=True):
        href = str(link["href"]).strip()
        if not href:
            continue

        coordinates = parse_coordinates_from_url(href)
        if coordinates is not None:
            return coordinates

    return None


def parse_date(date_str: str) -> datetime:
    """Parse ISO date or datetime into a naive datetime (UTC if tz-aware)."""
    try:
        dt = datetime.fromisoformat(date_str)
    except ValueError:
        # fallback: just take the YYYY-MM-DD part
        dt = datetime.fromisoformat(date_str.split("T")[0])

    if dt.tzinfo is not None:
        # normalise tz-aware datetimes to UTC, then strip tzinfo
        dt = dt.astimezone(timezone.utc).replace(tzinfo=None)

    return dt


def url_has_year_component(url: str) -> bool:
    """Return True if the URL contains any digit (assume year-specific)."""
    return any(ch.isdigit() for ch in url)


def insert_sorted(
    conferences: list[dict[str, typing.Any]], new_conf: dict[str, typing.Any]
) -> list[dict[str, typing.Any]]:
    """Insert new_conf into conferences sorted by start date, skip if duplicate URL (with year awareness)."""
    new_url = new_conf.get("url")
    new_start = parse_date(str(new_conf["start"]))
    new_year = new_start.year

    if new_url:
        for conf in conferences:
            if conf.get("url") == new_url:
                existing_start = parse_date(str(conf["start"]))
                existing_year = existing_start.year

                if url_has_year_component(new_url):
                    # If URL has a year in it, treat exact URL as unique
                    print(f"⚠️ Conference with URL {new_url} already exists, skipping.")
                    return conferences
                elif existing_year == new_year:
                    # Same URL, same year → definitely duplicate
                    print(
                        f"⚠️ Conference already exists in YAML "
                        f"(url={new_url}, year={existing_year}), skipping."
                    )
                    return conferences
                else:
                    # Same URL reused for different year → allow new entry
                    continue

    # Insert sorted by start date
    for idx, conf in enumerate(conferences):
        existing_start = parse_date(str(conf["start"]))
        if new_start < existing_start:
            conferences.insert(idx, new_conf)
            return conferences
    conferences.append(new_conf)
    return conferences


def validate_country(conf: dict[str, typing.Any]) -> None:
    """Ensure country is a valid ISO 3166-1 alpha-2 code, normalise if possible."""
    country = conf.get("country")
    if not country:
        return

    country = country.strip()
    # Already a 2-letter code
    if len(country) == 2:
        if pycountry.countries.get(alpha_2=country.upper()):
            conf["country"] = country.lower()
            return
        else:
            raise ValueError(f"❌ Invalid ISO 3166-1 code '{country}'")

    # Try lookup by name
    match = pycountry.countries.get(name=country)
    if not match:
        # fuzzy lookup (handles “United States” vs “United States of America”)
        try:
            match = pycountry.countries.search_fuzzy(country)[0]
        except LookupError:
            raise ValueError(f"❌ Country '{country}' not recognised as ISO 3166-1")

    conf["country"] = match.alpha_2.lower()


def parse_yaml_datetime(value: typing.Any) -> datetime | None:
    """Convert YAML date/datetime values to a datetime."""
    if isinstance(value, datetime):
        return value

    if isinstance(value, date):
        return datetime.combine(value, time())

    if isinstance(value, str):
        try:
            return datetime.fromisoformat(value)
        except ValueError:
            return datetime.combine(date.fromisoformat(value.split("T")[0]), time())

    return None


def same_type_as_start(
    start_value: typing.Any,
    new_dt: datetime,
    keep_timezone: bool = True,
    prefer_datetime: bool = False,
) -> typing.Any:
    """Return end value shaped like the start value when possible."""
    if isinstance(start_value, datetime):
        if keep_timezone:
            return new_dt
        return new_dt.replace(tzinfo=None)

    if isinstance(start_value, date):
        if prefer_datetime:
            return new_dt
        return new_dt.date()

    if isinstance(start_value, str):
        if prefer_datetime or " " in start_value or "T" in start_value:
            return new_dt.isoformat(sep=" ")
        return new_dt.date().isoformat()

    return new_dt


def maybe_extract_explicit_end_time(source_text: str) -> int | None:
    """Extract an explicit 12-hour clock end time for Geomob-style pages."""
    lowered = source_text.lower()

    if "10pm" in lowered or "10 pm" in lowered or "22:00" in lowered:
        return 22

    if "11pm" in lowered or "11 pm" in lowered or "23:00" in lowered:
        return 23

    return None


def normalise_end_field(new_conf: dict[str, typing.Any], source_text: str) -> None:
    """Ensure an end value exists, with a Geomob-specific fallback."""
    start_value = new_conf.get("start")
    if start_value is None:
        return

    start_dt = parse_yaml_datetime(start_value)
    if start_dt is None:
        return

    name = str(new_conf.get("name", ""))
    url = str(new_conf.get("url", ""))
    is_geomob = "geomob" in name.lower() or "thegeomob.com" in url.lower()

    if is_geomob:
        end_hour = maybe_extract_explicit_end_time(source_text)
        if end_hour is None:
            end_hour = 22

        geomob_end = start_dt.replace(hour=end_hour, minute=0, second=0, microsecond=0)
        new_conf["end"] = same_type_as_start(
            start_value, geomob_end, prefer_datetime=True
        )
        return

    if "end" not in new_conf:
        new_conf["end"] = same_type_as_start(start_value, start_dt)


def main() -> None:
    """Fetch page, generate YAML via LLM, update conferences.yaml."""
    url = sys.argv[1]
    yaml_path = os.path.expanduser("~/src/personal-data/conferences.yaml")

    # Load conferences first
    with open(yaml_path) as f:
        conferences = yaml.safe_load(f)

    # Early exit: if URL contains a year and already exists, skip
    if url_has_year_component(url):
        for conf in conferences:
            if conf.get("url") == url:
                print(
                    "⚠️ Conference already exists in YAML "
                    + f"(url={url}), skipping before API call."
                )
                return

    # Otherwise proceed with full workflow
    soup = fetch_webpage(url)
    source_text = webpage_to_text(soup)
    detected_coordinates = detect_page_coordinates(soup)
    prompt = build_prompt(url, source_text, detected_coordinates)
    new_yaml_text = get_from_open_ai(prompt)["yaml"]

    new_conf = yaml.safe_load(new_yaml_text)
    if isinstance(new_conf, list):
        new_conf = new_conf[0]

    validate_country(new_conf)
    normalise_end_field(new_conf, source_text)

    if detected_coordinates is not None:
        new_conf["latitude"] = detected_coordinates[0]
        new_conf["longitude"] = detected_coordinates[1]

    updated = insert_sorted(conferences, new_conf)

    with open(yaml_path, "w") as f:
        text = yaml.dump(updated, sort_keys=False, allow_unicode=True)
        text = text.replace("\n- name:", "\n\n- name:")  # keep blank lines
        f.write(text.lstrip())
SCRIPT_PATH = os.path.realpath(__file__)
SCRIPT_DIR = os.path.dirname(SCRIPT_PATH)
REPO_ROOT = os.path.dirname(SCRIPT_DIR)
if REPO_ROOT not in sys.path:
    sys.path.insert(0, REPO_ROOT)

from agenda.add_new_conference import main

if __name__ == "__main__":
    main()
    raise SystemExit(main())

tests/test_add_new_conference.py (new file, 160 lines)
@@ -0,0 +1,160 @@
"""Tests for agenda.add_new_conference."""
|
||||
|
||||
from datetime import date, datetime
|
||||
import typing
|
||||
|
||||
import lxml.html # type: ignore[import-untyped]
|
||||
import pytest
|
||||
import yaml
|
||||
|
||||
from agenda import add_new_conference
|
||||
|
||||
|
||||
def test_parse_osm_url_mlat_mlon() -> None:
|
||||
"""OpenStreetMap URLs with mlat/mlon should parse."""
|
||||
result = add_new_conference.parse_osm_url(
|
||||
"https://www.openstreetmap.org/?mlat=51.5&mlon=-0.12"
|
||||
)
|
||||
assert result == (51.5, -0.12)
|
||||
|
||||
|
||||
def test_extract_google_maps_latlon_at_pattern() -> None:
|
||||
"""Google Maps @lat,lon URLs should parse."""
|
||||
result = add_new_conference.extract_google_maps_latlon(
|
||||
"https://www.google.com/maps/place/Venue/@51.5242464,-0.0997024,17z/"
|
||||
)
|
||||
assert result == (51.5242464, -0.0997024)
|
||||
|
||||
|
||||
def test_insert_sorted_allows_same_url_different_year_without_year_component() -> None:
|
||||
"""The same non-year-specific URL can be reused for a different year."""
|
||||
conferences: list[dict[str, typing.Any]] = [
|
||||
{
|
||||
"name": "OldConf",
|
||||
"start": date(2025, 6, 1),
|
||||
"url": "https://example.com/conf",
|
||||
}
|
||||
]
|
||||
new_conf: dict[str, typing.Any] = {
|
||||
"name": "NewConf",
|
||||
"start": date(2026, 6, 1),
|
||||
"url": "https://example.com/conf",
|
||||
}
|
||||
|
||||
updated = add_new_conference.insert_sorted(conferences, new_conf)
|
||||
|
||||
assert len(updated) == 2
|
||||
assert updated[1]["name"] == "NewConf"
|
||||
|
||||
|
||||
def test_validate_country_normalises_name() -> None:
|
||||
"""Country names should be normalised to alpha-2 codes."""
|
||||
conf: dict[str, typing.Any] = {"country": "United Kingdom"}
|
||||
|
||||
add_new_conference.validate_country(conf)
|
||||
|
||||
assert conf["country"] == "gb"
|
||||
|
||||
|
||||
def test_normalise_end_field_defaults_single_day_date() -> None:
|
||||
"""Non-Geomob conferences should default end to the start date."""
|
||||
conf: dict[str, typing.Any] = {
|
||||
"name": "PyCon",
|
||||
"start": date(2026, 4, 10),
|
||||
}
|
||||
|
||||
add_new_conference.normalise_end_field(conf, "plain text")
|
||||
|
||||
assert conf["end"] == date(2026, 4, 10)
|
||||
|
||||
|
||||
def test_normalise_end_field_sets_geomob_end_time() -> None:
|
||||
"""Geomob conferences should default to a 22:00 end time."""
|
||||
conf: dict[str, typing.Any] = {
|
||||
"name": "Geomob London",
|
||||
"start": date(2026, 1, 28),
|
||||
"url": "https://thegeomob.com/post/jan-28th-2026-geomoblon-details",
|
||||
}
|
||||
|
||||
add_new_conference.normalise_end_field(conf, "see you there")
|
||||
|
||||
assert conf["end"] == datetime(2026, 1, 28, 22, 0)
|
||||
|
||||
|
||||
def test_detect_page_coordinates_uses_first_supported_link() -> None:
|
||||
"""Page coordinate detection should inspect anchor hrefs."""
|
||||
root = lxml.html.fromstring(
|
||||
(
|
||||
"<html><body>"
|
||||
'<a href="https://example.com">Example</a>'
|
||||
'<a href="https://www.openstreetmap.org/?mlat=51.5&mlon=-0.12">Map</a>'
|
||||
"</body></html>"
|
||||
)
|
||||
)
|
||||
|
||||
assert add_new_conference.detect_page_coordinates(root) == (51.5, -0.12)
|
||||
|
||||
|
||||
def test_add_new_conference_updates_yaml(
|
||||
tmp_path: typing.Any, monkeypatch: pytest.MonkeyPatch
|
||||
) -> None:
|
||||
"""The end-to-end import flow should append a generated conference."""
|
||||
yaml_path = tmp_path / "conferences.yaml"
|
||||
yaml_path.write_text(
|
||||
yaml.dump(
|
||||
[
|
||||
{
|
||||
"name": "ExistingConf",
|
||||
"start": date(2026, 4, 1),
|
||||
"end": date(2026, 4, 2),
|
||||
"url": "https://example.com/existing",
|
||||
}
|
||||
],
|
||||
sort_keys=False,
|
||||
)
|
||||
)
|
||||
|
||||
root = lxml.html.fromstring(
|
||||
(
|
||||
"<html><body>"
|
||||
'<a href="https://www.openstreetmap.org/?mlat=40.0&mlon=-74.0">Map</a>'
|
||||
"</body></html>"
|
||||
)
|
||||
)
|
||||
|
||||
monkeypatch.setattr(add_new_conference, "fetch_webpage", lambda url: root)
|
||||
monkeypatch.setattr(
|
||||
add_new_conference,
|
||||
"webpage_to_text",
|
||||
lambda parsed: "Conference details",
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
add_new_conference,
|
||||
"get_from_open_ai",
|
||||
lambda prompt: {
|
||||
"yaml": yaml.dump(
|
||||
{
|
||||
"name": "NewConf",
|
||||
"topic": "Tech",
|
||||
"location": "New York",
|
||||
"country": "United States",
|
||||
"start": date(2026, 5, 3),
|
||||
"url": "https://example.com/newconf",
|
||||
},
|
||||
sort_keys=False,
|
||||
)
|
||||
},
|
||||
)
|
||||
|
||||
added = add_new_conference.add_new_conference(
|
||||
"https://example.com/newconf", str(yaml_path)
|
||||
)
|
||||
|
||||
assert added is True
|
||||
written = yaml.safe_load(yaml_path.read_text())
|
||||
assert len(written) == 2
|
||||
assert written[1]["name"] == "NewConf"
|
||||
assert written[1]["country"] == "us"
|
||||
assert written[1]["end"] == date(2026, 5, 3)
|
||||
assert written[1]["latitude"] == 40.0
|
||||
assert written[1]["longitude"] == -74.0
|
||||