Various improvements

This commit is contained in:
Edward Betts 2026-03-31 10:42:30 +01:00
parent 2090268754
commit 876eb6a759
5 changed files with 98 additions and 72 deletions

View file

@@ -13,7 +13,6 @@ Data path: props.pageProps.pageData.liveDepartures[]
.destination.model.scheduledArrivalDateTime destination arrival
(already filtered to the requested stop, not the final stop)
"""
import asyncio
import json
import re
import httpx
@@ -62,29 +61,26 @@ def _parse(html: str, destination: str) -> list[dict]:
dep_time = _hhmm(dep['origin']['model']['scheduledDepartureDateTime'])
arr_time = _hhmm(dep['destination']['model']['scheduledArrivalDateTime'])
if dep_time and arr_time:
carrier = dep.get('model', {}).get('carrier', 'ES')
number = dep.get('model', {}).get('trainNumber', '')
services.append({
'depart_st_pancras': dep_time,
'arrive_destination': arr_time,
'destination': destination,
'train_number': f"{carrier} {number}" if number else '',
})
return sorted(services, key=lambda s: s['depart_st_pancras'])
async def fetch(destination: str, travel_date: str,
user_agent: str = DEFAULT_UA) -> list[dict]:
def fetch(destination: str, travel_date: str,
user_agent: str = DEFAULT_UA) -> list[dict]:
url = ROUTE_URLS[destination]
headers = {
'User-Agent': user_agent,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-GB,en;q=0.9',
}
async with httpx.AsyncClient(headers=headers, follow_redirects=True, timeout=20) as client:
r = await client.get(url, params={'date': travel_date})
with httpx.Client(headers=headers, follow_redirects=True, timeout=20) as client:
r = client.get(url, params={'date': travel_date})
r.raise_for_status()
return _parse(r.text, destination)
def get_eurostar_times(destination: str, travel_date: str,
user_agent: str = DEFAULT_UA) -> list[dict]:
"""Synchronous wrapper for CLI/testing."""
return asyncio.run(fetch(destination, travel_date, user_agent))

View file

@@ -1,14 +1,11 @@
"""
Scrape GWR trains from Bristol Temple Meads to London Paddington using Realtime Trains.
Uses httpx (not Playwright) with browser-like headers.
Two fetches run concurrently:
Two fetches:
BRI/to/PAD departure times from Bristol (div.time.plan.d)
PAD/from/BRI arrival times at Paddington (div.time.plan.a)
Matched by train ID (div.tid).
"""
import asyncio
import re
import httpx
import lxml.html
@@ -71,26 +68,19 @@ def _parse_services(html: str, time_selector: str) -> dict[str, str]:
return result
async def fetch(date: str, user_agent: str = DEFAULT_UA) -> list[dict]:
"""Fetch GWR trains concurrently; returns [{'depart_bristol', 'arrive_paddington'}]."""
def fetch(date: str, user_agent: str = DEFAULT_UA) -> list[dict]:
"""Fetch GWR trains; returns [{'depart_bristol', 'arrive_paddington', 'headcode'}]."""
headers = _browser_headers(user_agent)
async with httpx.AsyncClient(headers=headers, follow_redirects=True, timeout=30) as client:
r_bri, r_pad = await asyncio.gather(
client.get(BRI_TO_PAD.format(date=date)),
client.get(PAD_FROM_BRI.format(date=date)),
)
with httpx.Client(headers=headers, follow_redirects=True, timeout=30) as client:
r_bri = client.get(BRI_TO_PAD.format(date=date))
r_pad = client.get(PAD_FROM_BRI.format(date=date))
departures = _parse_services(r_bri.text, 'div.time.plan.d')
arrivals = _parse_services(r_pad.text, 'div.time.plan.a')
trains = [
{'depart_bristol': dep, 'arrive_paddington': arr}
{'depart_bristol': dep, 'arrive_paddington': arr, 'headcode': tid}
for tid, dep in departures.items()
if (arr := arrivals.get(tid))
]
return sorted(trains, key=lambda t: t['depart_bristol'])
def get_gwr_trains(date: str, user_agent: str = DEFAULT_UA) -> list[dict]:
"""Synchronous wrapper around fetch() for CLI/testing use."""
return asyncio.run(fetch(date, user_agent))