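Various improvements (including: GWR scraper fetch made synchronous with asyncio removed, headcode added to each result, get_gwr_trains wrapper dropped)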
This commit is contained in:
parent
2090268754
commit
876eb6a759
5 changed files with 98 additions and 72 deletions
|
|
@ -1,14 +1,11 @@
|
|||
"""
|
||||
Scrape GWR trains from Bristol Temple Meads to London Paddington using Realtime Trains.
|
||||
|
||||
Uses httpx (not Playwright) with browser-like headers.
|
||||
|
||||
Two fetches run concurrently:
|
||||
Two fetches:
|
||||
BRI/to/PAD → departure times from Bristol (div.time.plan.d)
|
||||
PAD/from/BRI → arrival times at Paddington (div.time.plan.a)
|
||||
Matched by train ID (div.tid).
|
||||
"""
|
||||
import asyncio
|
||||
import re
|
||||
import httpx
|
||||
import lxml.html
|
||||
|
|
@ -71,26 +68,19 @@ def _parse_services(html: str, time_selector: str) -> dict[str, str]:
|
|||
return result
|
||||
|
||||
|
||||
async def fetch(date: str, user_agent: str = DEFAULT_UA) -> list[dict]:
|
||||
"""Fetch GWR trains concurrently; returns [{'depart_bristol', 'arrive_paddington'}]."""
|
||||
def fetch(date: str, user_agent: str = DEFAULT_UA) -> list[dict]:
|
||||
"""Fetch GWR trains; returns [{'depart_bristol', 'arrive_paddington', 'headcode'}]."""
|
||||
headers = _browser_headers(user_agent)
|
||||
async with httpx.AsyncClient(headers=headers, follow_redirects=True, timeout=30) as client:
|
||||
r_bri, r_pad = await asyncio.gather(
|
||||
client.get(BRI_TO_PAD.format(date=date)),
|
||||
client.get(PAD_FROM_BRI.format(date=date)),
|
||||
)
|
||||
with httpx.Client(headers=headers, follow_redirects=True, timeout=30) as client:
|
||||
r_bri = client.get(BRI_TO_PAD.format(date=date))
|
||||
r_pad = client.get(PAD_FROM_BRI.format(date=date))
|
||||
|
||||
departures = _parse_services(r_bri.text, 'div.time.plan.d')
|
||||
arrivals = _parse_services(r_pad.text, 'div.time.plan.a')
|
||||
|
||||
trains = [
|
||||
{'depart_bristol': dep, 'arrive_paddington': arr}
|
||||
{'depart_bristol': dep, 'arrive_paddington': arr, 'headcode': tid}
|
||||
for tid, dep in departures.items()
|
||||
if (arr := arrivals.get(tid))
|
||||
]
|
||||
return sorted(trains, key=lambda t: t['depart_bristol'])
|
||||
|
||||
|
||||
def get_gwr_trains(date: str, user_agent: str = DEFAULT_UA) -> list[dict]:
|
||||
"""Synchronous wrapper around fetch() for CLI/testing use."""
|
||||
return asyncio.run(fetch(date, user_agent))
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue