- Support any station with direct trains to Paddington; station CRS code is now part of the URL (/results/<crs>/<slug>/<date>) - Load station list from data/direct_to_paddington.tsv; show dropdown on index page; 404 for unknown station codes - Fetch live GWR walk-on fares via api.gwr.com for all stations (SSS/SVS/SDS with restrictions already applied per train); cache 30 days - Scrape Paddington arrival platform numbers from RTT - Show unreachable morning Eurostars (before first reachable service only) - Circle line: show actual KX St Pancras arrival times (not check-in estimate) and add a second backup service in the transfer column - Widen page max-width to 1100px for longer station names Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
115 lines
3.8 KiB
Python
115 lines
3.8 KiB
Python
"""
|
|
Scrape GWR trains from a chosen station (Bristol Temple Meads, CRS code BRI,
by default) to London Paddington using Realtime Trains.

Two fetches per date, where {crs} is the origin station's CRS code:
  {crs}/to/PAD   → departure times from the origin station (div.time.plan.d)
  PAD/from/{crs} → arrival times (div.time.plan.a) and platforms (div.platform) at Paddington
Matched by train ID (div.tid).
|
|
"""
|
|
import re
|
|
import httpx
|
|
import lxml.html
|
|
|
|
# Realtime Trains detailed-search URL templates. Both are filled in with
# str.format(crs=<origin station code>, date=<date path segment>); the pieces
# below are split as endpoint / route / date + time window / query string.

# Services calling at {crs} that go on to PAD.
_TO_PAD_TMPL = (
    "https://www.realtimetrains.co.uk/search/detailed"
    "/gb-nr:{crs}/to/gb-nr:PAD"
    "/{date}/0000-2359"
    "?stp=WVS&show=pax-calls&order=wtt"
)

# Services calling at PAD that came from {crs}.
_PAD_FROM_TMPL = (
    "https://www.realtimetrains.co.uk/search/detailed"
    "/gb-nr:PAD/from/gb-nr:{crs}"
    "/{date}/0000-2359"
    "?stp=WVS&show=pax-calls&order=wtt"
)
|
|
|
|
# Default User-Agent header value: a desktop Chrome 122 on Linux string,
# assembled from its space-separated product tokens.
DEFAULT_UA = " ".join(
    (
        "Mozilla/5.0 (X11; Linux x86_64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/122.0.0.0",
        "Safari/537.36",
    )
)
|
|
|
|
|
|
def _browser_headers(user_agent: str) -> dict:
    """Build the browser-like request headers used for page fetches.

    Args:
        user_agent: Value to send as the ``User-Agent`` header.

    Returns:
        A new dict of header name → value. ``User-Agent`` comes first and is
        the only entry that varies; every other header is a fixed string.
    """
    # NOTE(review): advertising "br" presumes the HTTP client can decode
    # Brotli responses; with httpx that needs the optional brotli/brotlicffi
    # package — confirm it is installed where this runs.
    fixed_headers = (
        ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"),
        ("Accept-Language", "en-GB,en;q=0.9"),
        ("Accept-Encoding", "gzip, deflate, br"),
        ("Connection", "keep-alive"),
        ("Upgrade-Insecure-Requests", "1"),
        ("Sec-Fetch-Dest", "document"),
        ("Sec-Fetch-Mode", "navigate"),
        ("Sec-Fetch-Site", "none"),
        ("Sec-Fetch-User", "?1"),
    )
    headers = {"User-Agent": user_agent}
    headers.update(fixed_headers)
    return headers
|
|
|
|
|
|
def _fmt(hhmm: str) -> str:
    """Render a scraped clock time such as '0830' as '08:30'.

    Every character that is not an ASCII digit is dropped first. If exactly
    four digits remain they are returned as 'HH:MM'; otherwise the remaining
    digits are returned unchanged (possibly an empty string).
    """
    digits = re.sub(r'[^0-9]', '', hhmm)
    return digits if len(digits) != 4 else f"{digits[:2]}:{digits[2:]}"
|
|
|
|
|
|
def _parse_services(html: str, time_selector: str) -> dict[str, str]:
    """Return {train_id: time_string} from a servicelist page.

    Args:
        html: Page markup to parse.
        time_selector: CSS selector, evaluated inside each ``a.service``
            element, that locates the element holding the wanted time.

    Returns:
        Mapping of the stripped ``div.tid`` text to the time passed through
        ``_fmt``. Services lacking a train ID element, a time element, or
        non-blank time text are skipped. Returns ``{}`` when the page is
        blank or contains no ``div.servicelist``.
    """
    # lxml raises ParserError ("Document is empty") on blank input. Treat a
    # blank response the same way as a page that has no service list.
    if not html or not html.strip():
        return {}

    service_lists = lxml.html.fromstring(html).cssselect('div.servicelist')
    if not service_lists:
        return {}

    result: dict[str, str] = {}
    # Only the first service list on the page is read.
    for svc in service_lists[0].cssselect('a.service'):
        tid_els = svc.cssselect('div.tid')
        time_els = svc.cssselect(time_selector)
        if not (tid_els and time_els):
            continue
        time_text = time_els[0].text_content().strip()
        if not time_text:
            continue
        result[tid_els[0].text_content().strip()] = _fmt(time_text)
    return result
|
|
|
|
|
|
def _parse_arrivals(html: str) -> dict[str, dict]:
    """Return {train_id: {'time': ..., 'platform': ...}} from a PAD arrivals page.

    Args:
        html: Page markup to parse.

    Returns:
        Mapping of the stripped ``div.tid`` text to a dict with:
          - 'time': the ``div.time.plan.a`` text passed through ``_fmt``;
          - 'platform': the stripped ``div.platform`` text, or '' when the
            service has no such element.
        Services lacking a train ID element, an arrival-time element, or
        non-blank arrival-time text are skipped. Returns ``{}`` when the page
        is blank or contains no ``div.servicelist``.
    """
    # lxml raises ParserError ("Document is empty") on blank input. Treat a
    # blank response the same way as a page that has no service list.
    if not html or not html.strip():
        return {}

    service_lists = lxml.html.fromstring(html).cssselect('div.servicelist')
    if not service_lists:
        return {}

    result: dict[str, dict] = {}
    # Only the first service list on the page is read.
    for svc in service_lists[0].cssselect('a.service'):
        tid_els = svc.cssselect('div.tid')
        time_els = svc.cssselect('div.time.plan.a')
        if not (tid_els and time_els):
            continue
        time_text = time_els[0].text_content().strip()
        if not time_text:
            continue
        plat_els = svc.cssselect('div.platform')
        platform = plat_els[0].text_content().strip() if plat_els else ''
        result[tid_els[0].text_content().strip()] = {
            'time': _fmt(time_text),
            'platform': platform,
        }
    return result
|
|
|
|
|
|
def fetch(date: str, user_agent: str = DEFAULT_UA, station_crs: str = 'BRI') -> list[dict]:
    """Fetch the trains running from ``station_crs`` to PAD on ``date``.

    Two pages are requested through one ``httpx.Client`` (redirects followed,
    30 s timeout): the origin station's departures towards PAD and PAD's
    arrivals from the origin. Services are paired by train ID, and only those
    present on both pages are returned.

    Args:
        date: Date segment substituted into both search URLs.
        user_agent: ``User-Agent`` header value sent with both requests.
        station_crs: Origin station code substituted into both search URLs.

    Returns:
        A list of dicts ordered by the 'depart_bristol' string, each with
        keys 'depart_bristol', 'arrive_paddington', 'arrive_platform' and
        'headcode'. The 'depart_bristol' key keeps that name whatever the
        origin station is.
    """
    request_headers = _browser_headers(user_agent)
    with httpx.Client(headers=request_headers, follow_redirects=True, timeout=30) as client:
        # NOTE(review): the response status is never checked, so an error
        # page is parsed like any other page — confirm this is intended.
        departures_page = client.get(_TO_PAD_TMPL.format(crs=station_crs, date=date))
        arrivals_page = client.get(_PAD_FROM_TMPL.format(crs=station_crs, date=date))

    departures = _parse_services(departures_page.text, 'div.time.plan.d')
    arrivals = _parse_arrivals(arrivals_page.text)

    trains = []
    for headcode, depart_time in departures.items():
        arrival = arrivals.get(headcode)
        if arrival is None:
            # Seen leaving the origin but not listed arriving at PAD.
            continue
        trains.append({
            'depart_bristol': depart_time,
            'arrive_paddington': arrival['time'],
            'arrive_platform': arrival['platform'],
            'headcode': headcode,
        })

    trains.sort(key=lambda train: train['depart_bristol'])
    return trains
|