Add multi-station support, GWR fares API, and Circle line improvements

- Support any station with direct trains to Paddington; station CRS code
  is now part of the URL (/results/<crs>/<slug>/<date>)
- Load station list from data/direct_to_paddington.tsv; show dropdown on
  index page; 404 for unknown station codes
- Fetch live GWR walk-on fares via api.gwr.com for all stations (SSS/SVS/SDS
  with restrictions already applied per train); cache 30 days
- Scrape Paddington arrival platform numbers from RTT
- Show unreachable morning Eurostars (before first reachable service only)
- Circle line: show actual KX St Pancras arrival times (not check-in estimate)
  and add a second backup service in the transfer column
- Widen page max-width to 1100px for longer station names

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Edward Betts 2026-04-06 20:22:44 +01:00
parent 71be0dd8cf
commit 3c787b33d3
12 changed files with 810 additions and 262 deletions

125
scraper/gwr_fares.py Normal file
View file

@ -0,0 +1,125 @@
"""
Fetch GWR walk-on single fares from any station to London Paddington.
Uses the GWR journey search API (same API as www.gwr.com ticket search).
Returns per-train cheapest standard-class fare with restrictions already applied.
Cache for 30 days — fares rarely change.
"""
import httpx
# Endpoint that fetch() POSTs each journey-search request to.
_API_URL = "https://api.gwr.com/api/shopping/journeysearch"
# API key is embedded in the GWR web app (appvalues.prod.json)
# It is sent as the "apikey" request header.
_API_KEY = "OgovGqAlLp4gWAhL7DQLo7pMCt8GHi2U4SPFiZgG"
_PAD_CODE = "GBQQP" # London Paddington cluster code as used by GWR website
# Ticket type codes eligible when choosing the cheapest fare for a departure;
# fares carrying any other code are ignored.
_WANTED_CODES = {"SSS", "SVS", "SDS"}
# Upper bound on the number of result pages requested in one fetch() call.
_MAX_PAGES = 20
def _headers() -> dict:
    """Return the HTTP headers sent with every GWR journey-search request.

    The values present the client as a desktop Chrome browser coming from
    www.gwr.com and carry the module's API key in the ``apikey`` header.
    """
    chrome_ua = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    )
    # Built step by step so the header order matches what is sent on the wire.
    headers = {"user-agent": chrome_ua}
    headers["accept"] = "application/json, text/plain, */*"
    headers["channel"] = "WEB"
    headers["content-type"] = "application/json"
    headers["apikey"] = _API_KEY
    headers["origin"] = "https://www.gwr.com"
    headers["referer"] = "https://www.gwr.com/"
    return headers
def _request_body(
    station_crs: str,
    travel_date: str,
    conversation_token: str | None,
    later: bool,
) -> dict:
    """Build the JSON payload for one page of a journey search to Paddington.

    Args:
        station_crs: Origin station code; sent as ``locfrom`` prefixed with
            ``GB``.
        travel_date: Date of travel; the search starts at ``T00:00:00`` on
            this date (presumably ``YYYY-MM-DD`` - confirm against callers).
        conversation_token: Token passed through as ``conversationToken``;
            ``None`` when there is none yet.
        later: Value sent as ``isLaterSearch``.

    Returns:
        The request body for one adult, standard class, outward-only search.
    """
    origin = f"GB{station_crs}"
    depart_from = f"{travel_date}T00:00:00"
    # Keyword form keeps the keys in the same order as the GWR web client
    # payload this mirrors.
    return dict(
        IsNextOutward=False,
        IsPreviousOutward=False,
        IsNextReturn=False,
        IsPreviousReturn=False,
        campaignCode="",
        validationCode="",
        locfrom=origin,
        locto=_PAD_CODE,
        datetimedepart=depart_from,
        outwarddepartafter=True,
        datetimereturn=None,
        returndepartafter=False,
        directServicesOnly=False,
        firstclass=False,
        standardclass=True,
        adults=1,
        children=0,
        openreturn=False,
        via=None,
        avoid=None,
        isEarlierSearch=False,
        isLaterSearch=later,
        isEarlierSearchReturn=False,
        isLaterSearchReturn=False,
        railcards=[],
        conversationToken=conversation_token,
    )
def _cheapest_fare(journey: dict) -> dict | None:
    """Return the cheapest wanted standard-class fare of one journey, or None.

    Only fares whose ``ticketTypeCode`` is in ``_WANTED_CODES`` and that are
    flagged ``isStandardClass`` are considered. Fares without a price are
    skipped: a missing price cannot be compared and must not be reported as
    a free ticket.
    """
    candidates = [
        fare
        for fare in journey.get("journeyFareDetails") or []
        if fare.get("ticketTypeCode") in _WANTED_CODES
        and fare.get("isStandardClass")
        and fare.get("fare") is not None
    ]
    if not candidates:
        return None
    # min() returns the first of several equally priced fares, so ties are
    # resolved in the order the API lists them.
    best = min(candidates, key=lambda fare: fare["fare"])
    return {
        "ticket": best.get("ticketType", ""),
        "price": best["fare"] / 100,  # API prices are in pence
        "code": best["ticketTypeCode"],
    }


def fetch(station_crs: str, travel_date: str) -> dict[str, dict]:
    """
    Fetch GWR single fares from station_crs to London Paddington on travel_date.

    Requests up to ``_MAX_PAGES`` pages of results, following the API's
    ``showLaterOutward`` flag, and keeps the first fare seen for each
    departure time.

    Returns {departure_time: {'ticket': name, 'price': float, 'code': code}}
    where departure_time is "HH:MM", price is in £ and only the cheapest
    available standard-class ticket per departure (with restrictions already
    applied by GWR) is kept. Departures with no matching fare are omitted.

    Raises httpx.HTTPStatusError if the API answers with an error status.
    """
    result: dict[str, dict] = {}
    with httpx.Client(headers=_headers(), timeout=30) as client:
        conversation_token = None
        later = False
        for _ in range(_MAX_PAGES):
            body = _request_body(station_crs, travel_date, conversation_token, later)
            resp = client.post(_API_URL, json=body)
            resp.raise_for_status()
            # "or {}" also covers an explicit null, which .get(key, {}) does not.
            data = resp.json().get("data") or {}
            conversation_token = data.get("conversationToken")
            for journey in data.get("outwardOpenPureReturnFare") or []:
                # "HH:MM" from "2026-04-10T09:08:00"
                dep_time = (journey.get("departureTime") or "")[11:16]
                if not dep_time or dep_time in result:
                    continue
                cheapest = _cheapest_fare(journey)
                if cheapest is not None:
                    result[dep_time] = cheapest
            if not data.get("showLaterOutward", False):
                break
            later = True
    return result

View file

@ -10,14 +10,14 @@ import re
import httpx
import lxml.html
BRI_TO_PAD = (
_TO_PAD_TMPL = (
"https://www.realtimetrains.co.uk/search/detailed/"
"gb-nr:BRI/to/gb-nr:PAD/{date}/0000-2359"
"gb-nr:{crs}/to/gb-nr:PAD/{date}/0000-2359"
"?stp=WVS&show=pax-calls&order=wtt"
)
PAD_FROM_BRI = (
_PAD_FROM_TMPL = (
"https://www.realtimetrains.co.uk/search/detailed/"
"gb-nr:PAD/from/gb-nr:BRI/{date}/0000-2359"
"gb-nr:PAD/from/gb-nr:{crs}/{date}/0000-2359"
"?stp=WVS&show=pax-calls&order=wtt"
)
@ -68,19 +68,48 @@ def _parse_services(html: str, time_selector: str) -> dict[str, str]:
return result
def fetch(date: str, user_agent: str = DEFAULT_UA) -> list[dict]:
"""Fetch GWR trains; returns [{'depart_bristol', 'arrive_paddington', 'headcode'}]."""
def _parse_arrivals(html: str) -> dict[str, dict]:
    """Extract planned arrival time and platform per train from a PAD arrivals page.

    Looks at the ``a.service`` entries of the first ``div.servicelist`` in
    *html*. Entries lacking a train id element or a non-empty planned
    arrival time are skipped; a missing platform element yields ''.

    Returns {train_id: {'time': ..., 'platform': ...}}, with the time passed
    through ``_fmt``. An empty dict is returned when the page has no
    service list.
    """
    service_lists = lxml.html.fromstring(html).cssselect('div.servicelist')
    arrivals: dict[str, dict] = {}
    if not service_lists:
        return arrivals

    def first_text(nodes) -> str:
        # Stripped text of the first matched element, '' when nothing matched.
        return nodes[0].text_content().strip() if nodes else ''

    for service in service_lists[0].cssselect('a.service'):
        train_ids = service.cssselect('div.tid')
        if not train_ids:
            continue
        planned = first_text(service.cssselect('div.time.plan.a'))
        if not planned:
            continue
        arrivals[first_text(train_ids)] = {
            'time': _fmt(planned),
            'platform': first_text(service.cssselect('div.platform')),
        }
    return arrivals
def fetch(date: str, user_agent: str = DEFAULT_UA, station_crs: str = 'BRI') -> list[dict]:
"""Fetch trains from station_crs to PAD; returns [{'depart_bristol', 'arrive_paddington', 'headcode', 'arrive_platform'}]."""
headers = _browser_headers(user_agent)
with httpx.Client(headers=headers, follow_redirects=True, timeout=30) as client:
r_bri = client.get(BRI_TO_PAD.format(date=date))
r_pad = client.get(PAD_FROM_BRI.format(date=date))
r_bri = client.get(_TO_PAD_TMPL.format(crs=station_crs, date=date))
r_pad = client.get(_PAD_FROM_TMPL.format(crs=station_crs, date=date))
departures = _parse_services(r_bri.text, 'div.time.plan.d')
arrivals = _parse_services(r_pad.text, 'div.time.plan.a')
arrivals = _parse_arrivals(r_pad.text)
trains = [
{'depart_bristol': dep, 'arrive_paddington': arr, 'headcode': tid}
{
'depart_bristol': dep,
'arrive_paddington': arrivals[tid]['time'],
'arrive_platform': arrivals[tid]['platform'],
'headcode': tid,
}
for tid, dep in departures.items()
if (arr := arrivals.get(tid))
if tid in arrivals
]
return sorted(trains, key=lambda t: t['depart_bristol'])