""" Scrape GWR trains from Bristol Temple Meads to London Paddington using Realtime Trains. Two fetches: BRI/to/PAD → departure times from Bristol (div.time.plan.d) PAD/from/BRI → arrival times at Paddington (div.time.plan.a) Matched by train ID (div.tid). """ import re import httpx import lxml.html _TO_PAD_TMPL = ( "https://www.realtimetrains.co.uk/search/detailed/" "gb-nr:{crs}/to/gb-nr:PAD/{date}/0000-2359" "?stp=WVS&show=pax-calls&order=wtt" ) _PAD_FROM_TMPL = ( "https://www.realtimetrains.co.uk/search/detailed/" "gb-nr:PAD/from/gb-nr:{crs}/{date}/0000-2359" "?stp=WVS&show=pax-calls&order=wtt" ) DEFAULT_UA = ( "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" ) def _browser_headers(user_agent: str) -> dict: return { "User-Agent": user_agent, "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language": "en-GB,en;q=0.9", "Accept-Encoding": "gzip, deflate, br", "Connection": "keep-alive", "Upgrade-Insecure-Requests": "1", "Sec-Fetch-Dest": "document", "Sec-Fetch-Mode": "navigate", "Sec-Fetch-Site": "none", "Sec-Fetch-User": "?1", } def _fmt(hhmm: str) -> str: """Convert '0830' → '08:30'.""" hhmm = re.sub(r'[^0-9]', '', hhmm) if len(hhmm) == 4: return f"{hhmm[:2]}:{hhmm[2:]}" return hhmm def _parse_services(html: str, time_selector: str) -> dict[str, str]: """Return {train_id: time_string} from a servicelist page.""" root = lxml.html.fromstring(html) sl = root.cssselect('div.servicelist') if not sl: return {} result = {} for svc in sl[0].cssselect('a.service'): tid_els = svc.cssselect('div.tid') time_els = svc.cssselect(time_selector) if tid_els and time_els: tid = tid_els[0].text_content().strip() time_text = time_els[0].text_content().strip() if time_text: result[tid] = _fmt(time_text) return result def _parse_arrivals(html: str) -> dict[str, dict]: """Return {train_id: {'time': ..., 'platform': ...}} from a PAD arrivals page.""" root = lxml.html.fromstring(html) sl = root.cssselect('div.servicelist') if not sl: return {} result = {} for svc in sl[0].cssselect('a.service'): tid_els = svc.cssselect('div.tid') time_els = svc.cssselect('div.time.plan.a') if not (tid_els and time_els): continue time_text = time_els[0].text_content().strip() if not time_text: continue plat_els = svc.cssselect('div.platform') platform = plat_els[0].text_content().strip() if plat_els else '' result[tid_els[0].text_content().strip()] = { 'time': _fmt(time_text), 'platform': platform, } return result def fetch(date: str, user_agent: str = DEFAULT_UA, station_crs: str = 'BRI') -> list[dict]: """Fetch trains from station_crs to PAD; returns [{'depart_bristol', 'arrive_paddington', 'headcode', 'arrive_platform'}].""" headers = _browser_headers(user_agent) with httpx.Client(headers=headers, follow_redirects=True, timeout=30) as client: r_bri = client.get(_TO_PAD_TMPL.format(crs=station_crs, date=date)) r_pad = client.get(_PAD_FROM_TMPL.format(crs=station_crs, date=date)) departures = _parse_services(r_bri.text, 'div.time.plan.d') arrivals = _parse_arrivals(r_pad.text) trains = [ { 'depart_bristol': dep, 'arrive_paddington': arrivals[tid]['time'], 'arrive_platform': arrivals[tid]['platform'], 'headcode': tid, } for tid, dep in departures.items() if tid in arrivals ] return sorted(trains, key=lambda t: t['depart_bristol'])