Add multi-station support, GWR fares API, and Circle line improvements

- Support any station with direct trains to Paddington; station CRS code is now part of the URL (/results/<crs>/<slug>/<date>)
- Load station list from data/direct_to_paddington.tsv; show dropdown on index page; 404 for unknown station codes
- Fetch live GWR walk-on fares via api.gwr.com for all stations (SSS/SVS/SDS with restrictions already applied per train); cache 30 days
- Scrape Paddington arrival platform numbers from RTT
- Show unreachable morning Eurostars (before first reachable service only)
- Circle line: show actual KX St Pancras arrival times (not check-in estimate) and add a second backup service in the transfer column
- Widen page max-width to 1100px for longer station names

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-06 20:22:44 +01:00 · 2026-04-06 20:22:44 +01:00 · 3c787b33d3
commit 3c787b33d3
parent 71be0dd8cf
12 changed files with 810 additions and 262 deletions
--- a/scraper/gwr_fares.py
+++ b/scraper/gwr_fares.py
@ -0,0 +1,125 @@
+"""
+Fetch GWR walk-on single fares from any station to London Paddington.
+
+Uses the GWR journey search API (same API as www.gwr.com ticket search).
+Returns per-train cheapest standard-class fare with restrictions already applied.
+Cache for 30 days — fares rarely change.
+"""
+
+import httpx
+
+_API_URL = "https://api.gwr.com/api/shopping/journeysearch"
+# API key is embedded in the GWR web app (appvalues.prod.json)
+_API_KEY = "OgovGqAlLp4gWAhL7DQLo7pMCt8GHi2U4SPFiZgG"
+_PAD_CODE = "GBQQP"  # London Paddington cluster code as used by GWR website
+_WANTED_CODES = {"SSS", "SVS", "SDS"}  # ticketTypeCode values fetch() keeps; every other ticket type is ignored
+_MAX_PAGES = 20  # hard cap on journey-search requests made by a single fetch() call
+
+
+def _headers() -> dict:
+    return {
+        "user-agent": (
+            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+            "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
+        ),
+        "accept": "application/json, text/plain, */*",
+        "channel": "WEB",
+        "content-type": "application/json",
+        "apikey": _API_KEY,
+        "origin": "https://www.gwr.com",
+        "referer": "https://www.gwr.com/",
+    }
+
+
+def _request_body(
+    station_crs: str,
+    travel_date: str,
+    conversation_token: str | None,
+    later: bool,
+) -> dict:
+    return {
+        "IsNextOutward": False,
+        "IsPreviousOutward": False,
+        "IsNextReturn": False,
+        "IsPreviousReturn": False,
+        "campaignCode": "",
+        "validationCode": "",
+        "locfrom": f"GB{station_crs}",
+        "locto": _PAD_CODE,
+        "datetimedepart": f"{travel_date}T00:00:00",
+        "outwarddepartafter": True,
+        "datetimereturn": None,
+        "returndepartafter": False,
+        "directServicesOnly": False,
+        "firstclass": False,
+        "standardclass": True,
+        "adults": 1,
+        "children": 0,
+        "openreturn": False,
+        "via": None,
+        "avoid": None,
+        "isEarlierSearch": False,
+        "isLaterSearch": later,
+        "isEarlierSearchReturn": False,
+        "isLaterSearchReturn": False,
+        "railcards": [],
+        "conversationToken": conversation_token,
+    }
+
+
+def fetch(station_crs: str, travel_date: str) -> dict[str, dict]:
+    """
+    Fetch GWR single fares from station_crs to London Paddington on travel_date.
+
+    Returns {departure_time: {'ticket': name, 'price': float, 'code': code}}
+    where price is in £ and only the cheapest available standard-class ticket
+    per departure (with restrictions already applied by GWR) is kept.
+    """
+    result: dict[str, dict] = {}
+
+    with httpx.Client(headers=_headers(), timeout=30) as client:
+        conversation_token = None
+        later = False
+
+        for _ in range(_MAX_PAGES):
+            body = _request_body(station_crs, travel_date, conversation_token, later)
+            resp = client.post(_API_URL, json=body)
+            resp.raise_for_status()
+            data = resp.json().get("data", {})
+
+            conversation_token = data.get("conversationToken")
+
+            for journey in data.get("outwardOpenPureReturnFare", []):
+                dep_iso = journey.get("departureTime", "")
+                dep_time = dep_iso[11:16]  # "HH:MM" from "2026-04-10T09:08:00"
+                if not dep_time or dep_time in result:
+                    continue
+
+                cheapest = None
+                for fare in journey.get("journeyFareDetails", []):
+                    code = fare.get("ticketTypeCode")
+                    if code not in _WANTED_CODES:
+                        continue
+                    if not fare.get("isStandardClass"):
+                        continue
+                    price_pence = fare.get("fare", 0)
+                    if cheapest is None or price_pence < cheapest["price_pence"]:
+                        cheapest = {
+                            "ticket": fare.get("ticketType", ""),
+                            "price": price_pence / 100,
+                            "price_pence": price_pence,
+                            "code": code,
+                        }
+
+                if cheapest:
+                    result[dep_time] = {
+                        "ticket": cheapest["ticket"],
+                        "price": cheapest["price"],
+                        "code": cheapest["code"],
+                    }
+
+            if not data.get("showLaterOutward", False):
+                break
+            later = True
+
+    return result
--- a/scraper/realtime_trains.py
+++ b/scraper/realtime_trains.py
@ -10,14 +10,14 @@ import re
 import httpx
 import lxml.html

-BRI_TO_PAD = (
+_TO_PAD_TMPL = (  # page fetch() parses for departure times; {crs} and {date} are filled in there
    "https://www.realtimetrains.co.uk/search/detailed/"
-    "gb-nr:BRI/to/gb-nr:PAD/{date}/0000-2359"
+    "gb-nr:{crs}/to/gb-nr:PAD/{date}/0000-2359"
    "?stp=WVS&show=pax-calls&order=wtt"
 )
-PAD_FROM_BRI = (
+_PAD_FROM_TMPL = (  # page fetch() parses for PAD arrival times and platforms
    "https://www.realtimetrains.co.uk/search/detailed/"
-    "gb-nr:PAD/from/gb-nr:BRI/{date}/0000-2359"
+    "gb-nr:PAD/from/gb-nr:{crs}/{date}/0000-2359"
    "?stp=WVS&show=pax-calls&order=wtt"
 )

@ -68,19 +68,48 @@ def _parse_services(html: str, time_selector: str) -> dict[str, str]:
    return result


-def fetch(date: str, user_agent: str = DEFAULT_UA) -> list[dict]:
-    """Fetch GWR trains; returns [{'depart_bristol', 'arrive_paddington', 'headcode'}]."""
+def _parse_arrivals(html: str) -> dict[str, dict]:
+    """Return {train_id: {'time': ..., 'platform': ...}} from a PAD arrivals page."""
+    root = lxml.html.fromstring(html)
+    sl = root.cssselect('div.servicelist')
+    if not sl:
+        return {}
+    result = {}
+    for svc in sl[0].cssselect('a.service'):
+        tid_els  = svc.cssselect('div.tid')
+        time_els = svc.cssselect('div.time.plan.a')
+        if not (tid_els and time_els):
+            continue
+        time_text = time_els[0].text_content().strip()
+        if not time_text:
+            continue
+        plat_els = svc.cssselect('div.platform')
+        platform = plat_els[0].text_content().strip() if plat_els else ''
+        result[tid_els[0].text_content().strip()] = {
+            'time': _fmt(time_text),
+            'platform': platform,
+        }
+    return result
+
+
+def fetch(date: str, user_agent: str = DEFAULT_UA, station_crs: str = 'BRI') -> list[dict]:
+    """Fetch trains from station_crs to PAD, sorted by departure time.
+
+    Returns [{'depart_bristol', 'arrive_paddington', 'arrive_platform', 'headcode'}].
+    'depart_bristol' holds the departure time from station_crs, whichever
+    station that is; the key name is unchanged from the Bristol-only version.
+    Departures and PAD arrivals are matched on train ID, and a service that is
+    missing from the arrivals page is dropped.
+    """
    headers = _browser_headers(user_agent)
    with httpx.Client(headers=headers, follow_redirects=True, timeout=30) as client:
-        r_bri = client.get(BRI_TO_PAD.format(date=date))
-        r_pad = client.get(PAD_FROM_BRI.format(date=date))
+        r_bri = client.get(_TO_PAD_TMPL.format(crs=station_crs, date=date))  # departures page for station_crs, despite the name
+        r_pad = client.get(_PAD_FROM_TMPL.format(crs=station_crs, date=date))  # NOTE(review): neither response is status-checked — confirm error pages should parse as "no trains"

    departures = _parse_services(r_bri.text, 'div.time.plan.d')
-    arrivals   = _parse_services(r_pad.text, 'div.time.plan.a')
+    arrivals   = _parse_arrivals(r_pad.text)

    trains = [
-        {'depart_bristol': dep, 'arrive_paddington': arr, 'headcode': tid}
+        {
+            'depart_bristol': dep,
+            'arrive_paddington': arrivals[tid]['time'],
+            'arrive_platform': arrivals[tid]['platform'],
+            'headcode': tid,
+        }
        for tid, dep in departures.items()
-        if (arr := arrivals.get(tid))
+        if tid in arrivals
    ]
    return sorted(trains, key=lambda t: t['depart_bristol'])