# paddington-eurostar/scraper/realtime_trains.py

"""
Scrape direct trains between a selected station and London Paddington using
Realtime Trains.

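Two fetches per direction, matched by train ID (div.tid).

Towards Paddington (shown for the default station, BRI):
  BRI/to/PAD   → departure times from Bristol (div.time.plan.d)
  PAD/from/BRI → arrival times at Paddington (div.time.plan.a)
From Paddington the same pair is mirrored:
  PAD/to/BRI   → departure times from Paddington (div.time.plan.d)
  BRI/from/PAD → arrival times at the selected station (div.time.plan.a)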
"""

import re
from typing import Any

import httpx
import lxml.html  # type: ignore[import-untyped]

# Every search URL shares the same prefix and the same date / time-window /
# query-string suffix; only the station pair in the middle differs.
_SEARCH_PREFIX = "https://www.realtimetrains.co.uk/search/detailed/"
_SEARCH_SUFFIX = "/{date}/0000-2359?stp=WVS&show=pax-calls&order=wtt"

# Towards Paddington: the first page is read for departure times at {crs},
# the second for arrival times at PAD.
_TO_PAD_TMPL = _SEARCH_PREFIX + "gb-nr:{crs}/to/gb-nr:PAD" + _SEARCH_SUFFIX
_PAD_FROM_TMPL = _SEARCH_PREFIX + "gb-nr:PAD/from/gb-nr:{crs}" + _SEARCH_SUFFIX

# From Paddington: the first page is read for departure times at PAD,
# the second for arrival times at {crs}.
_PAD_TO_TMPL = _SEARCH_PREFIX + "gb-nr:PAD/to/gb-nr:{crs}" + _SEARCH_SUFFIX
_FROM_PAD_TMPL = _SEARCH_PREFIX + "gb-nr:{crs}/from/gb-nr:PAD" + _SEARCH_SUFFIX

# User-Agent sent when the caller does not supply one.
DEFAULT_UA = (
    "Mozilla/5.0 (X11; Linux x86_64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/122.0.0.0 Safari/537.36"
)


def _browser_headers(user_agent: str) -> dict[str, str]:
    """Build the browser-like request headers sent with every page fetch.

    Args:
        user_agent: Value placed in the ``User-Agent`` header.

    Returns:
        A fresh dict of header names to values; only ``User-Agent`` varies
        between calls.
    """
    headers = {"User-Agent": user_agent}
    # NOTE(review): advertising "br" assumes the HTTP client has a Brotli
    # decoder available to decode such responses — confirm.
    headers.update(
        {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-GB,en;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
        }
    )
    return headers


def _fmt(hhmm: str) -> str:
    """Normalise a scraped time such as '0830' to '08:30'.

    Every character other than an ASCII digit is discarded first. When
    exactly four digits remain a colon is inserted after the second one;
    otherwise the bare digit string is returned unchanged.
    """
    digits = "".join(re.findall(r"[0-9]", hhmm))
    if len(digits) != 4:
        return digits
    hours, minutes = digits[:2], digits[2:]
    return hours + ":" + minutes


def _parse_services(html: str, time_selector: str) -> dict[str, str]:
    """Return {train_id: time_string} from a servicelist page.

    Args:
        html: Markup of a search results page.
        time_selector: CSS selector, evaluated inside each ``a.service`` row,
            locating the element whose text holds the wanted time.

    Returns:
        Mapping of each row's stripped ``div.tid`` text to its time as
        formatted by ``_fmt``. Rows lacking either element, or whose time
        text is blank, are skipped. The mapping is empty when the markup is
        blank or contains no ``div.servicelist``.
    """
    # lxml raises ParserError("Document is empty") on blank input; treat a
    # blank body the same way as a page without a service list.
    if not html or not html.strip():
        return {}

    root = lxml.html.fromstring(html)
    service_lists = root.cssselect("div.servicelist")
    if not service_lists:
        return {}

    result: dict[str, str] = {}
    # Only the first service list on the page is read.
    for svc in service_lists[0].cssselect("a.service"):
        tid_els = svc.cssselect("div.tid")
        time_els = svc.cssselect(time_selector)
        if not (tid_els and time_els):
            continue
        time_text = time_els[0].text_content().strip()
        if not time_text:
            continue
        result[tid_els[0].text_content().strip()] = _fmt(time_text)
    return result


def _parse_arrivals(html: str) -> dict[str, dict[str, str]]:
    """Return {train_id: {'time': ..., 'platform': ...}} from an arrivals page.

    Args:
        html: Markup of a search results page.

    Returns:
        Mapping of each row's stripped ``div.tid`` text to a dict holding the
        ``div.time.plan.a`` text formatted by ``_fmt`` under ``'time'`` and
        the stripped ``div.platform`` text under ``'platform'`` (``''`` when
        the row has no platform element). Rows lacking a train ID or time
        element, or whose time text is blank, are skipped. The mapping is
        empty when the markup is blank or contains no ``div.servicelist``.
    """
    # lxml raises ParserError("Document is empty") on blank input; treat a
    # blank body the same way as a page without a service list.
    if not html or not html.strip():
        return {}

    root = lxml.html.fromstring(html)
    service_lists = root.cssselect("div.servicelist")
    if not service_lists:
        return {}

    result: dict[str, dict[str, str]] = {}
    # Only the first service list on the page is read.
    for svc in service_lists[0].cssselect("a.service"):
        tid_els = svc.cssselect("div.tid")
        time_els = svc.cssselect("div.time.plan.a")
        if not (tid_els and time_els):
            continue
        time_text = time_els[0].text_content().strip()
        if not time_text:
            continue
        plat_els = svc.cssselect("div.platform")
        platform = plat_els[0].text_content().strip() if plat_els else ""
        result[tid_els[0].text_content().strip()] = {
            "time": _fmt(time_text),
            "platform": platform,
        }
    return result


def fetch(
    date: str, user_agent: str = DEFAULT_UA, station_crs: str = "BRI"
) -> list[dict[str, Any]]:
    """Fetch trains from station_crs to PAD.

    Two pages are downloaded: one read for departure times at ``station_crs``
    and one read for arrival times (and platforms) at PAD. Services are paired
    by train ID, and only IDs present on both pages are returned.

    Args:
        date: Date segment substituted into the search URLs.
        user_agent: ``User-Agent`` header value for both requests.
        station_crs: Station code substituted into the search URLs.

    Returns:
        Dicts with keys ``depart_bristol``, ``arrive_paddington``,
        ``arrive_platform`` and ``headcode``, ordered by ``depart_bristol``.
        The ``depart_bristol`` key is used whatever ``station_crs`` is.
    """
    origin_url = _TO_PAD_TMPL.format(crs=station_crs, date=date)
    paddington_url = _PAD_FROM_TMPL.format(crs=station_crs, date=date)

    # NOTE(review): response status codes are not checked, so an error page
    # without a service list simply produces an empty result.
    with httpx.Client(
        headers=_browser_headers(user_agent), follow_redirects=True, timeout=30
    ) as session:
        origin_page = session.get(origin_url)
        paddington_page = session.get(paddington_url)

    departure_by_id = _parse_services(origin_page.text, "div.time.plan.d")
    arrival_by_id = _parse_arrivals(paddington_page.text)

    matched: list[dict[str, Any]] = []
    for headcode, departs_at in departure_by_id.items():
        arrival = arrival_by_id.get(headcode)
        if arrival is None:
            # Seen departing but not listed arriving at Paddington.
            continue
        matched.append(
            {
                "depart_bristol": departs_at,
                "arrive_paddington": arrival["time"],
                "arrive_platform": arrival["platform"],
                "headcode": headcode,
            }
        )

    matched.sort(key=lambda row: row["depart_bristol"])
    return matched


def fetch_to_paddington(
    date: str, user_agent: str = DEFAULT_UA, station_crs: str = "BRI"
) -> list[dict[str, Any]]:
    """Fetch trains from station_crs to PAD using generic field names.

    Delegates to ``fetch`` and returns a copy of each of its rows with a
    ``depart_origin`` key mirroring ``depart_bristol``. All original keys are
    kept; ``arrive_platform`` and ``headcode`` fall back to ``''`` if a row
    lacks them.
    """
    generic_rows: list[dict[str, Any]] = []
    for legacy in fetch(date, user_agent, station_crs):
        row = dict(legacy)
        row["depart_origin"] = legacy["depart_bristol"]
        row["arrive_paddington"] = legacy["arrive_paddington"]
        row["arrive_platform"] = legacy.get("arrive_platform", "")
        row["headcode"] = legacy.get("headcode", "")
        generic_rows.append(row)
    return generic_rows


def fetch_from_paddington(
    date: str, user_agent: str = DEFAULT_UA, station_crs: str = "BRI"
) -> list[dict[str, Any]]:
    """Fetch trains from PAD to station_crs.

    Two pages are downloaded: one read for departure times at PAD and one
    read for arrival times (and platforms) at ``station_crs``. Services are
    paired by train ID, and only IDs present on both pages are returned.

    Args:
        date: Date segment substituted into the search URLs.
        user_agent: ``User-Agent`` header value for both requests.
        station_crs: Station code substituted into the search URLs.

    Returns:
        Dicts with keys ``depart_paddington``, ``arrive_destination``,
        ``arrive_platform`` and ``headcode``, ordered by ``depart_paddington``.
    """
    paddington_url = _PAD_TO_TMPL.format(crs=station_crs, date=date)
    destination_url = _FROM_PAD_TMPL.format(crs=station_crs, date=date)

    # NOTE(review): response status codes are not checked, so an error page
    # without a service list simply produces an empty result.
    with httpx.Client(
        headers=_browser_headers(user_agent), follow_redirects=True, timeout=30
    ) as session:
        paddington_page = session.get(paddington_url)
        destination_page = session.get(destination_url)

    departure_by_id = _parse_services(paddington_page.text, "div.time.plan.d")
    arrival_by_id = _parse_arrivals(destination_page.text)

    matched: list[dict[str, Any]] = []
    for headcode, departs_at in departure_by_id.items():
        arrival = arrival_by_id.get(headcode)
        if arrival is None:
            # Seen departing Paddington but not listed arriving at the station.
            continue
        matched.append(
            {
                "depart_paddington": departs_at,
                "arrive_destination": arrival["time"],
                "arrive_platform": arrival["platform"],
                "headcode": headcode,
            }
        )

    matched.sort(key=lambda row: row["depart_paddington"])
    return matched