Initial commit.

This commit is contained in:
Edward Betts 2026-03-30 19:34:46 +01:00
commit a8e0bd39e5
16 changed files with 981 additions and 0 deletions

View file

@ -0,0 +1,96 @@
"""
Scrape GWR trains from Bristol Temple Meads to London Paddington using Realtime Trains.
Uses httpx (not Playwright) with browser-like headers.
Two fetches run concurrently:
  - BRI/to/PAD: departure times from Bristol (div.time.plan.d)
  - PAD/from/BRI: arrival times at Paddington (div.time.plan.a)
The two result sets are matched by train ID (div.tid).
"""
import asyncio
import re
import httpx
import lxml.html
# Realtime Trains "detailed search" URL template listing services at Bristol
# (BRI) that run to Paddington (PAD). ``{date}`` is filled in with
# ``str.format`` by ``fetch``; ``0000-2359`` is the requested time window.
# NOTE(review): the meaning of the ``stp``/``show``/``order`` query values is
# defined by the remote site, not by this module -- confirm before changing.
BRI_TO_PAD = (
"https://www.realtimetrains.co.uk/search/detailed/"
"gb-nr:BRI/to/gb-nr:PAD/{date}/0000-2359"
"?stp=WVS&show=pax-calls&order=wtt"
)
# Counterpart template listing services at Paddington (PAD) that come from
# Bristol (BRI); same ``{date}`` placeholder and query string as above.
PAD_FROM_BRI = (
"https://www.realtimetrains.co.uk/search/detailed/"
"gb-nr:PAD/from/gb-nr:BRI/{date}/0000-2359"
"?stp=WVS&show=pax-calls&order=wtt"
)
# Default ``User-Agent`` value (a desktop Chrome-on-Linux string) used when
# callers of ``fetch`` / ``get_gwr_trains`` do not supply their own.
DEFAULT_UA = (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
)
def _browser_headers(user_agent: str) -> dict[str, str]:
    """Build browser-like request headers for a top-level page navigation.

    Args:
        user_agent: Value sent as the ``User-Agent`` header.

    Returns:
        A freshly built dict mapping header names to values.
    """
    # Function-scope import so this helper does not depend on a new
    # module-level import.
    from importlib.util import find_spec

    # httpx can only decode Brotli responses when the optional ``brotli`` or
    # ``brotlicffi`` package is installed; otherwise a ``br``-encoded body is
    # left undecoded and the HTML parser receives compressed bytes. Only
    # advertise ``br`` when a decoder is actually importable.
    try:
        can_decode_brotli = any(
            find_spec(module) is not None
            for module in ("brotli", "brotlicffi")
        )
    except (ImportError, ValueError):
        can_decode_brotli = False

    encodings = ["gzip", "deflate"]
    if can_decode_brotli:
        encodings.append("br")

    return {
        "User-Agent": user_agent,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-GB,en;q=0.9",
        "Accept-Encoding": ", ".join(encodings),
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }
def _fmt(hhmm: str) -> str:
    """Normalise a scraped time such as '0830' to '08:30'.

    Every character other than an ASCII digit is discarded first. If exactly
    four digits remain they are returned as ``HH:MM``; otherwise the
    digits-only string is returned unchanged.
    """
    digits = re.sub(r'[^0-9]', '', hhmm)
    if len(digits) != 4:
        return digits
    hours, minutes = digits[:2], digits[2:]
    return f"{hours}:{minutes}"
def _parse_services(html: str, time_selector: str) -> dict[str, str]:
    """Return {train_id: time_string} from a servicelist page.

    Args:
        html: Page markup to parse.
        time_selector: CSS selector locating the time element inside each
            ``a.service`` row (e.g. ``'div.time.plan.d'``).

    Returns:
        Mapping of the stripped ``div.tid`` text to the time formatted by
        ``_fmt``. Empty when the markup is blank or has no
        ``div.servicelist``. Rows lacking a train ID or a time are skipped;
        if a train ID repeats, the last row wins.
    """
    # lxml raises ParserError ("Document is empty") for blank input; treat
    # that the same as a page without a service list.
    if not html or not html.strip():
        return {}

    root = lxml.html.fromstring(html)
    service_lists = root.cssselect('div.servicelist')
    if not service_lists:
        return {}

    result: dict[str, str] = {}
    # Only the first service list on the page is considered.
    for svc in service_lists[0].cssselect('a.service'):
        tid_els = svc.cssselect('div.tid')
        time_els = svc.cssselect(time_selector)
        if not (tid_els and time_els):
            continue
        tid = tid_els[0].text_content().strip()
        time_text = time_els[0].text_content().strip()
        # A blank train ID cannot identify a service and would let unrelated
        # rows be paired under the same empty key, so skip it.
        if tid and time_text:
            result[tid] = _fmt(time_text)
    return result
async def fetch(date: str, user_agent: str = DEFAULT_UA) -> list[dict]:
    """Fetch both search pages concurrently and pair services by train ID.

    Args:
        date: Inserted verbatim into both URL templates.
            NOTE(review): presumably ``YYYY-MM-DD`` -- confirm against the
            site's URL scheme.
        user_agent: ``User-Agent`` header value sent with both requests.

    Returns:
        A list of ``{'depart_bristol': ..., 'arrive_paddington': ...}`` dicts,
        one per train ID found on both pages with a non-empty arrival time,
        ordered by the departure time string.
    """
    departures_url = BRI_TO_PAD.format(date=date)
    arrivals_url = PAD_FROM_BRI.format(date=date)

    async with httpx.AsyncClient(
        headers=_browser_headers(user_agent),
        follow_redirects=True,
        timeout=30,
    ) as client:
        # Issue both GETs at once and wait for the pair of responses.
        bristol_page, paddington_page = await asyncio.gather(
            client.get(departures_url),
            client.get(arrivals_url),
        )
        departures = _parse_services(bristol_page.text, 'div.time.plan.d')
        arrivals = _parse_services(paddington_page.text, 'div.time.plan.a')

    trains = []
    for train_id, depart in departures.items():
        arrive = arrivals.get(train_id)
        # Keep only services that also have an arrival time at Paddington.
        if arrive:
            trains.append({'depart_bristol': depart, 'arrive_paddington': arrive})
    trains.sort(key=lambda t: t['depart_bristol'])
    return trains
def get_gwr_trains(date: str, user_agent: str = DEFAULT_UA) -> list[dict]:
    """Blocking convenience wrapper: run ``fetch`` to completion.

    Intended for CLI/testing use. The coroutine is driven with
    ``asyncio.run``, so this must not be called from code that is already
    running inside an event loop.

    Args:
        date: Passed through to ``fetch``.
        user_agent: Passed through to ``fetch``.

    Returns:
        Whatever ``fetch`` returns for the given arguments.
    """
    coroutine = fetch(date, user_agent)
    return asyncio.run(coroutine)