Initial commit.
commit a8e0bd39e5
16 changed files with 981 additions and 0 deletions
0  scraper/__init__.py  Normal file
90  scraper/eurostar.py  Normal file
@@ -0,0 +1,90 @@
"""
Scrape Eurostar timetable via httpx.

The route-specific timetable pages are Next.js SSR — all departure data is
embedded in <script id="__NEXT_DATA__"> as JSON, so no browser / JS needed.

URL pattern:
    https://www.eurostar.com/uk-en/travel-info/timetable/
        {origin_id}/{dest_id}/{origin_slug}/{dest_slug}?date=YYYY-MM-DD

Data path: props.pageProps.pageData.liveDepartures[]
    .origin.model.scheduledDepartureDateTime    → London departure
    .destination.model.scheduledArrivalDateTime → destination arrival
    (already filtered to the requested stop, not the final stop)
"""
import asyncio
import json
import re
import httpx

DEFAULT_UA = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
)

ROUTE_URLS = {
    'Paris Gare du Nord': (
        'https://www.eurostar.com/uk-en/travel-info/timetable/'
        '7015400/8727100/london-st-pancras-intl/paris-gare-du-nord'
    ),
    'Brussels Midi': (
        'https://www.eurostar.com/uk-en/travel-info/timetable/'
        '7015400/8814001/london-st-pancras-intl/brussels-midi'
    ),
    'Lille Europe': (
        'https://www.eurostar.com/uk-en/travel-info/timetable/'
        '7015400/8722326/london-st-pancras-intl/lille-europe'
    ),
    'Amsterdam Centraal': (
        'https://www.eurostar.com/uk-en/travel-info/timetable/'
        '7015400/8400058/london-st-pancras-intl/amsterdam-centraal'
    ),
}


def _hhmm(dt_str: str | None) -> str | None:
    """'2026-03-30 09:34:00' → '09:34'"""
    if not dt_str:
        return None
    m = re.search(r'(\d{2}):(\d{2}):\d{2}$', dt_str)
    return f"{m.group(1)}:{m.group(2)}" if m else None


def _parse(html: str, destination: str) -> list[dict]:
    m = re.search(r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL)
    if not m:
        return []
    data = json.loads(m.group(1))
    departures = data['props']['pageProps']['pageData']['liveDepartures']
    services = []
    for dep in departures:
        dep_time = _hhmm(dep['origin']['model']['scheduledDepartureDateTime'])
        arr_time = _hhmm(dep['destination']['model']['scheduledArrivalDateTime'])
        if dep_time and arr_time:
            services.append({
                'depart_st_pancras': dep_time,
                'arrive_destination': arr_time,
                'destination': destination,
            })
    return sorted(services, key=lambda s: s['depart_st_pancras'])


async def fetch(destination: str, travel_date: str,
                user_agent: str = DEFAULT_UA) -> list[dict]:
    url = ROUTE_URLS[destination]
    headers = {
        'User-Agent': user_agent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-GB,en;q=0.9',
    }
    async with httpx.AsyncClient(headers=headers, follow_redirects=True, timeout=20) as client:
        r = await client.get(url, params={'date': travel_date})
        r.raise_for_status()
        return _parse(r.text, destination)


def get_eurostar_times(destination: str, travel_date: str,
                       user_agent: str = DEFAULT_UA) -> list[dict]:
    """Synchronous wrapper for CLI/testing."""
    return asyncio.run(fetch(destination, travel_date, user_agent))
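Usage sketch (not part of the commit): one way to call scraper/eurostar.py from a script, assuming the package is importable and using an illustrative date. The destination key must be one of the ROUTE_URLS keys above, and the printed fields are the ones built in _parse().

# Hypothetical usage example; the date below is illustrative only.
from scraper.eurostar import get_eurostar_times

services = get_eurostar_times('Paris Gare du Nord', '2026-03-30')
for svc in services:
    # Each service dict carries depart_st_pancras / arrive_destination / destination.
    print(f"{svc['depart_st_pancras']} -> {svc['arrive_destination']} ({svc['destination']})")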
96  scraper/realtime_trains.py  Normal file
@@ -0,0 +1,96 @@
"""
Scrape GWR trains from Bristol Temple Meads to London Paddington using Realtime Trains.

Uses httpx (not Playwright) with browser-like headers.

Two fetches run concurrently:
    BRI/to/PAD   → departure times from Bristol (div.time.plan.d)
    PAD/from/BRI → arrival times at Paddington (div.time.plan.a)
Matched by train ID (div.tid).
"""
import asyncio
import re
import httpx
import lxml.html

BRI_TO_PAD = (
    "https://www.realtimetrains.co.uk/search/detailed/"
    "gb-nr:BRI/to/gb-nr:PAD/{date}/0000-2359"
    "?stp=WVS&show=pax-calls&order=wtt"
)
PAD_FROM_BRI = (
    "https://www.realtimetrains.co.uk/search/detailed/"
    "gb-nr:PAD/from/gb-nr:BRI/{date}/0000-2359"
    "?stp=WVS&show=pax-calls&order=wtt"
)

DEFAULT_UA = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
)


def _browser_headers(user_agent: str) -> dict:
    return {
        "User-Agent": user_agent,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-GB,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }


def _fmt(hhmm: str) -> str:
    """Convert '0830' → '08:30'."""
    hhmm = re.sub(r'[^0-9]', '', hhmm)
    if len(hhmm) == 4:
        return f"{hhmm[:2]}:{hhmm[2:]}"
    return hhmm


def _parse_services(html: str, time_selector: str) -> dict[str, str]:
    """Return {train_id: time_string} from a servicelist page."""
    root = lxml.html.fromstring(html)
    sl = root.cssselect('div.servicelist')
    if not sl:
        return {}
    result = {}
    for svc in sl[0].cssselect('a.service'):
        tid_els = svc.cssselect('div.tid')
        time_els = svc.cssselect(time_selector)
        if tid_els and time_els:
            tid = tid_els[0].text_content().strip()
            time_text = time_els[0].text_content().strip()
            if time_text:
                result[tid] = _fmt(time_text)
    return result


async def fetch(date: str, user_agent: str = DEFAULT_UA) -> list[dict]:
    """Fetch GWR trains concurrently; returns [{'depart_bristol', 'arrive_paddington'}]."""
    headers = _browser_headers(user_agent)
    async with httpx.AsyncClient(headers=headers, follow_redirects=True, timeout=30) as client:
        r_bri, r_pad = await asyncio.gather(
            client.get(BRI_TO_PAD.format(date=date)),
            client.get(PAD_FROM_BRI.format(date=date)),
        )

    departures = _parse_services(r_bri.text, 'div.time.plan.d')
    arrivals = _parse_services(r_pad.text, 'div.time.plan.a')

    trains = [
        {'depart_bristol': dep, 'arrive_paddington': arr}
        for tid, dep in departures.items()
        if (arr := arrivals.get(tid))
    ]
    return sorted(trains, key=lambda t: t['depart_bristol'])


def get_gwr_trains(date: str, user_agent: str = DEFAULT_UA) -> list[dict]:
    """Synchronous wrapper around fetch() for CLI/testing use."""
    return asyncio.run(fetch(date, user_agent))
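Usage sketch (not part of the commit): calling scraper/realtime_trains.py the same way. The format of the value slotted into the {date} segment of the Realtime Trains URLs is an assumption here (shown as YYYY-MM-DD); the returned keys are the ones built in fetch() above.

# Hypothetical usage example; the date format for the {date} URL slot is an assumption.
from scraper.realtime_trains import get_gwr_trains

for train in get_gwr_trains('2026-03-30'):
    # Each dict pairs a Bristol departure with its Paddington arrival, matched by train ID.
    print(f"{train['depart_bristol']} Bristol Temple Meads -> "
          f"{train['arrive_paddington']} London Paddington")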