#!/usr/bin/python3
"""Add a new conference to conferences.yaml.

Given a conference URL on the command line: fetch the page, extract venue
coordinates from any Google Maps / OpenStreetMap links, ask OpenAI to emit a
YAML entry in the house style, validate/normalise it, and insert it into
~/src/personal-data/conferences.yaml sorted by start date.
"""
import configparser
import json
import os
import re
import sys
import typing
from datetime import date, datetime, time, timezone
from urllib.parse import parse_qs, urlparse

import html2text
import openai
import pycountry
import requests
import yaml
from bs4 import BeautifulSoup

user_agent = "add-new-conference/0.1"

# URL patterns that carry an explicit latitude/longitude pair, in decreasing
# order of reliability (map centre, q=, ll=/center=, !3d/!4d data blob,
# directions destination).
coordinate_patterns = (
    re.compile(r"@(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    re.compile(r"[?&]q=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    re.compile(r"[?&](?:ll|center)=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    re.compile(r"!3d(-?\d+(?:\.\d+)?)!4d(-?\d+(?:\.\d+)?)"),
    re.compile(r"[?&]destination=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
)


def read_api_key() -> str:
    """Read the OpenAI API key from ~/.config/openai/config.

    Expects an INI file with an ``[openai]`` section containing ``api_key``.
    Raises KeyError if the section or key is missing.
    """
    config_path = os.path.expanduser("~/.config/openai/config")
    parser = configparser.ConfigParser()
    parser.read(config_path)
    return parser["openai"]["api_key"]


def build_prompt(
    url: str,
    source_text: str,
    detected_coordinates: tuple[float, float] | None,
) -> str:
    """Build the LLM prompt with embedded YAML examples.

    :param url: the conference page URL (included verbatim in the prompt)
    :param source_text: plain-text rendering of the conference page
    :param detected_coordinates: (latitude, longitude) scraped from a map
        link on the page, or None if none was found
    :return: complete prompt asking for a JSON object with a "yaml" key
    """
    examples = """
- name: Geomob London
  topic: Maps
  location: London
  country: gb
  start: 2026-01-28 18:00:00+00:00
  end: 2026-01-28 23:00:00+00:00
  url: https://thegeomob.com/post/jan-28th-2026-geomoblon-details
  venue: Geovation Hub
  address: Sutton Yard, 65 Goswell Rd, London EC1V 7EN
  latitude: 51.5242464
  longitude: -0.0997024
  free: true
  going: true
  hashtag: '#geomobLON'

- name: DebConf 25
  topic: Debian
  location: Plouzané (Brest)
  country: fr
  start: 2025-07-07
  end: 2025-07-20
  url: https://wiki.debian.org/DebConf/25
  going: true
  cfp_url: https://debconf25.debconf.org/talks/new/
  venue: École nationale supérieure Mines-Télécom Atlantique Bretagne Pays de la Loire campus de Brest
  latitude: 48.35934
  longitude: -4.569889

- name: Wikimedia Hackathon
  topic: Wikimedia
  location: Istanbul
  country: tr
  start: 2025-05-02
  end: 2025-05-04
  venue: Renaissance Polat Istanbul Hotel
  address: Yeşilyurt, Sahil Yolu Cd. No:2, 34149 Bakırköy/İstanbul
  latitude: 40.959946
  longitude: 28.838763
  url: https://www.mediawiki.org/wiki/Wikimedia_Hackathon_2025
  going: true
  free: true
  hackathon: true
  registered: true
"""
    coordinate_note = ""
    if detected_coordinates is not None:
        coordinate_note = (
            "\nDetected venue coordinates from a map link on the page:\n"
            f"latitude: {detected_coordinates[0]}\n"
            f"longitude: {detected_coordinates[1]}\n"
        )

    prompt = f"""
I keep a record of interesting conferences in a YAML file. Here are some examples of the format I use:

{examples}

Now here is a new conference of interest:

Conference URL: {url}

Return the YAML representation for this conference following the same style and keys as the examples. Only include keys if the information is available. Do not invent details.

Important: the `country` field must always be a valid ISO 3166-1 alpha-2 country code (two lowercase letters, e.g. `ca` for Canada, `gb` for United Kingdom). Do not output full country names.

Important: always include an `end` field. If the event is a single-day event, the `end` can be the same date as `start`, or a same-day datetime if the page provides an end time.

Important: if this is a Geomob event, use an `end` datetime of 22:00 local time on the event date unless the page explicitly provides a different end time.
{coordinate_note}
Wrap your answer in a JSON object with a single key "yaml".

===

{source_text}
"""
    return prompt


def get_from_open_ai(prompt: str, model: str = "gpt-5.4") -> dict[str, str]:
    """Send the prompt to OpenAI and return the parsed JSON reply.

    Uses JSON response mode, so the reply is expected to be a JSON object
    (here: ``{"yaml": "..."}``).
    """
    client = openai.OpenAI(api_key=read_api_key())
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
        response_format={"type": "json_object"},
    )
    reply = response.choices[0].message.content
    assert isinstance(reply, str)
    return typing.cast(dict[str, str], json.loads(reply))


def fetch_webpage(url: str, timeout: int = 30) -> BeautifulSoup:
    """Fetch webpage HTML and parse it.

    :param url: page to download
    :param timeout: request timeout in seconds (fix: without a timeout a
        stalled server would hang this script forever)
    :raises requests.HTTPError: on non-2xx responses
    """
    response = requests.get(
        url, headers={"User-Agent": user_agent}, timeout=timeout
    )
    response.raise_for_status()
    return BeautifulSoup(response.content, "lxml")


def webpage_to_text(soup: BeautifulSoup) -> str:
    """Convert parsed HTML into readable text content.

    Works on a copy so the caller's soup (still needed for link scanning)
    is not mutated by the script/style removal.
    """
    soup_copy = BeautifulSoup(str(soup), "lxml")
    for script_or_style in soup_copy(["script", "style"]):
        script_or_style.decompose()
    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.ignore_images = True
    return text_maker.handle(str(soup_copy))


def parse_osm_url(url: str) -> tuple[float, float] | None:
    """Extract latitude/longitude from an OpenStreetMap URL.

    Handles both ``?mlat=..&mlon=..`` marker parameters and the
    ``#map=zoom/lat/lon`` fragment form. Returns None if neither is present.
    """
    parsed = urlparse(url)
    query = parse_qs(parsed.query)
    mlat = query.get("mlat")
    mlon = query.get("mlon")
    if mlat and mlon:
        return float(mlat[0]), float(mlon[0])
    if parsed.fragment.startswith("map="):
        parts = parsed.fragment.split("/")
        if len(parts) >= 3:
            # fragment is map=<zoom>/<lat>/<lon>
            return float(parts[-2]), float(parts[-1])
    return None


def extract_google_maps_latlon(url: str) -> tuple[float, float] | None:
    """Extract latitude/longitude from a Google Maps URL or page text."""
    for pattern in coordinate_patterns:
        match = pattern.search(url)
        if match:
            return float(match.group(1)), float(match.group(2))
    return None


def latlon_from_google_maps_url(
    url: str, timeout: int = 10
) -> tuple[float, float] | None:
    """Resolve a (possibly shortened) Google Maps URL and extract lat/lon.

    Follows redirects, then tries the final URL first and falls back to
    scanning the response body for coordinate patterns.
    """
    response = requests.get(
        url,
        allow_redirects=True,
        timeout=timeout,
        # fix: use the shared user_agent constant instead of a stray
        # "lookup.py/1.0" string left over from another script
        headers={"User-Agent": user_agent},
    )
    response.raise_for_status()
    coordinates = extract_google_maps_latlon(response.url)
    if coordinates is not None:
        return coordinates
    return extract_google_maps_latlon(response.text)


def parse_coordinates_from_url(url: str) -> tuple[float, float] | None:
    """Extract latitude/longitude from a supported map URL.

    Supports openstreetmap.org and Google Maps (including maps.app.goo.gl
    short links, which require a network round-trip to resolve). Network
    failures during resolution are treated as "no coordinates found".
    """
    lower_url = url.lower()
    if "openstreetmap.org" in lower_url:
        return parse_osm_url(url)
    if "google." in lower_url or "maps.app.goo.gl" in lower_url:
        coordinates = extract_google_maps_latlon(url)
        if coordinates is not None:
            return coordinates
        try:
            return latlon_from_google_maps_url(url)
        except requests.RequestException:
            return None
    return None


def detect_page_coordinates(soup: BeautifulSoup) -> tuple[float, float] | None:
    """Detect venue coordinates from the first usable map link on the page."""
    for link in soup.find_all("a", href=True):
        href = str(link["href"]).strip()
        if not href:
            continue
        coordinates = parse_coordinates_from_url(href)
        if coordinates is not None:
            return coordinates
    return None


def parse_date(date_str: str) -> datetime:
    """Parse an ISO date or datetime into a naive datetime.

    Timezone-aware values are normalised to UTC and then stripped of tzinfo
    so all comparisons in insert_sorted() are between naive datetimes.
    """
    try:
        dt = datetime.fromisoformat(date_str)
    except ValueError:
        # fallback: just take the YYYY-MM-DD part
        dt = datetime.fromisoformat(date_str.split("T")[0])
    if dt.tzinfo is not None:
        dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
    return dt


def url_has_year_component(url: str) -> bool:
    """Return True if the URL contains any digit (assume year-specific).

    Deliberately loose: a URL like .../debconf25 or .../2026-jan is assumed
    to identify one specific edition of the event.
    """
    return any(ch.isdigit() for ch in url)


def insert_sorted(
    conferences: list[dict[str, typing.Any]], new_conf: dict[str, typing.Any]
) -> list[dict[str, typing.Any]]:
    """Insert new_conf into conferences sorted by start date.

    Duplicate handling: if an entry with the same URL exists, skip when the
    URL looks edition-specific (contains a digit) or when the existing entry
    is for the same year; a year-less URL reused for a different year is
    allowed (annual events with a stable landing page).
    """
    new_url = new_conf.get("url")
    new_start = parse_date(str(new_conf["start"]))
    new_year = new_start.year

    if new_url:
        for conf in conferences:
            if conf.get("url") == new_url:
                existing_start = parse_date(str(conf["start"]))
                existing_year = existing_start.year
                if url_has_year_component(new_url):
                    # If URL has a year in it, treat exact URL as unique
                    print(f"⚠️ Conference with URL {new_url} already exists, skipping.")
                    return conferences
                elif existing_year == new_year:
                    # Same URL, same year → definitely duplicate
                    print(
                        f"⚠️ Conference already exists in YAML "
                        f"(url={new_url}, year={existing_year}), skipping."
                    )
                    return conferences
                else:
                    # Same URL reused for different year → allow new entry
                    continue

    # Insert sorted by start date
    for idx, conf in enumerate(conferences):
        existing_start = parse_date(str(conf["start"]))
        if new_start < existing_start:
            conferences.insert(idx, new_conf)
            return conferences
    conferences.append(new_conf)
    return conferences


def validate_country(conf: dict[str, typing.Any]) -> None:
    """Ensure country is a valid ISO 3166-1 alpha-2 code, normalise if possible.

    Mutates conf["country"] in place to a lowercase alpha-2 code.
    Raises ValueError if the value cannot be resolved to a country.
    """
    country = conf.get("country")
    if not country:
        return
    country = country.strip()

    # Already a 2-letter code
    if len(country) == 2:
        if pycountry.countries.get(alpha_2=country.upper()):
            conf["country"] = country.lower()
            return
        else:
            raise ValueError(f"❌ Invalid ISO 3166-1 code '{country}'")

    # Try lookup by name
    match = pycountry.countries.get(name=country)
    if not match:
        # fuzzy lookup (handles “United States” vs “United States of America”)
        try:
            match = pycountry.countries.search_fuzzy(country)[0]
        except LookupError as exc:
            # fix: chain the original exception (B904) instead of hiding it
            raise ValueError(
                f"❌ Country '{country}' not recognised as ISO 3166-1"
            ) from exc

    conf["country"] = match.alpha_2.lower()


def parse_yaml_datetime(value: typing.Any) -> datetime | None:
    """Convert YAML date/datetime values to a datetime.

    PyYAML may hand back datetime, date, or str depending on the document;
    returns None for anything else.
    """
    if isinstance(value, datetime):
        return value
    if isinstance(value, date):
        return datetime.combine(value, time())
    if isinstance(value, str):
        try:
            return datetime.fromisoformat(value)
        except ValueError:
            return datetime.combine(date.fromisoformat(value.split("T")[0]), time())
    return None


def same_type_as_start(
    start_value: typing.Any,
    new_dt: datetime,
    keep_timezone: bool = True,
    prefer_datetime: bool = False,
) -> typing.Any:
    """Return an end value shaped like the start value when possible.

    Keeps the YAML round-trip tidy: a date start yields a date end, a string
    start yields a string end, etc. ``prefer_datetime`` forces a full
    datetime even when start is date-only (used for Geomob end times).
    """
    if isinstance(start_value, datetime):
        if keep_timezone:
            return new_dt
        return new_dt.replace(tzinfo=None)
    if isinstance(start_value, date):
        if prefer_datetime:
            return new_dt
        return new_dt.date()
    if isinstance(start_value, str):
        if prefer_datetime or " " in start_value or "T" in start_value:
            return new_dt.isoformat(sep=" ")
        return new_dt.date().isoformat()
    return new_dt


def maybe_extract_explicit_end_time(source_text: str) -> int | None:
    """Extract an explicit 12-hour clock end time for Geomob-style pages.

    Returns the hour (22 or 23) if the page text mentions a known end time,
    otherwise None.
    """
    lowered = source_text.lower()
    if "10pm" in lowered or "10 pm" in lowered or "22:00" in lowered:
        return 22
    if "11pm" in lowered or "11 pm" in lowered or "23:00" in lowered:
        return 23
    return None


def normalise_end_field(new_conf: dict[str, typing.Any], source_text: str) -> None:
    """Ensure an end value exists, with a Geomob-specific fallback.

    Geomob events default to a 22:00 end (overridden by an explicit time on
    the page); anything else without an end gets end == start.
    Mutates new_conf in place.
    """
    start_value = new_conf.get("start")
    if start_value is None:
        return
    start_dt = parse_yaml_datetime(start_value)
    if start_dt is None:
        return

    name = str(new_conf.get("name", ""))
    url = str(new_conf.get("url", ""))
    is_geomob = "geomob" in name.lower() or "thegeomob.com" in url.lower()

    if is_geomob:
        end_hour = maybe_extract_explicit_end_time(source_text)
        if end_hour is None:
            end_hour = 22
        geomob_end = start_dt.replace(hour=end_hour, minute=0, second=0, microsecond=0)
        new_conf["end"] = same_type_as_start(
            start_value, geomob_end, prefer_datetime=True
        )
        return

    if "end" not in new_conf:
        new_conf["end"] = same_type_as_start(start_value, start_dt)


def main() -> None:
    """Fetch page, generate YAML via LLM, update conferences.yaml."""
    # fix: exit with a usage message instead of an IndexError traceback
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} CONFERENCE_URL")
    url = sys.argv[1]
    yaml_path = os.path.expanduser("~/src/personal-data/conferences.yaml")

    # Load conferences first (fix: an empty file yields None, not a list)
    with open(yaml_path) as f:
        conferences = yaml.safe_load(f) or []

    # Early exit: if URL contains a year and already exists, skip before
    # spending an API call
    if url_has_year_component(url):
        for conf in conferences:
            if conf.get("url") == url:
                print(
                    "⚠️ Conference already exists in YAML "
                    + f"(url={url}), skipping before API call."
                )
                return

    # Otherwise proceed with full workflow
    soup = fetch_webpage(url)
    source_text = webpage_to_text(soup)
    detected_coordinates = detect_page_coordinates(soup)
    prompt = build_prompt(url, source_text, detected_coordinates)
    new_yaml_text = get_from_open_ai(prompt)["yaml"]
    new_conf = yaml.safe_load(new_yaml_text)
    if isinstance(new_conf, list):
        new_conf = new_conf[0]
    validate_country(new_conf)
    normalise_end_field(new_conf, source_text)
    # Scraped map-link coordinates are more trustworthy than LLM output
    if detected_coordinates is not None:
        new_conf["latitude"] = detected_coordinates[0]
        new_conf["longitude"] = detected_coordinates[1]
    updated = insert_sorted(conferences, new_conf)
    with open(yaml_path, "w") as f:
        text = yaml.dump(updated, sort_keys=False, allow_unicode=True)
        text = text.replace("\n- name:", "\n\n- name:")  # keep blank lines
        f.write(text.lstrip())


if __name__ == "__main__":
    main()