Add return and inbound journey support

This commit is contained in:
Edward Betts 2026-05-21 08:46:35 +01:00
parent 6ba71447ef
commit 9691632f65
12 changed files with 1687 additions and 486 deletions

View file

@ -16,7 +16,8 @@ DEFAULT_UA = (
"(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
)
ORIGIN_STATION_ID = '7015400'
ST_PANCRAS_STATION_ID = '7015400'
ORIGIN_STATION_ID = ST_PANCRAS_STATION_ID
DESTINATION_STATION_IDS = {
'Paris Gare du Nord': '8727100',
@ -35,11 +36,11 @@ _GATEWAY_URL = 'https://site-api.eurostar.com/gateway'
_GQL_QUERY = (
"query NewBookingSearch("
"$origin:String!,$destination:String!,$outbound:String!,"
"$currency:Currency!,$adult:Int,"
"$inbound:String,$currency:Currency!,$adult:Int,"
"$filteredClassesOfService:[ClassOfServiceEnum]"
"){"
"journeySearch("
"outboundDate:$outbound origin:$origin destination:$destination"
"outboundDate:$outbound inboundDate:$inbound origin:$origin destination:$destination"
" adults:$adult currency:$currency"
" productFamilies:[\"PUB\"] contractCode:\"EIL_ALL\""
" adults16Plus:0 children:0 youths:0 children4Only:0 children5To11:0"
@ -64,6 +65,22 @@ _GQL_QUERY = (
"}"
"}"
"}"
"inbound{"
"journeys("
"hideIndirectTrainsWhenDisruptedAndCancelled:false"
" hideDepartedTrains:true"
" hideExternalCarrierTrains:true"
" hideDirectExternalCarrierTrains:true"
"){"
"timing{departureTime:departs arrivalTime:arrives}"
"fares(filteredClassesOfService:$filteredClassesOfService){"
"classOfService{code}"
"prices{displayPrice}"
"seats "
"legs{serviceName serviceType{code}}"
"}"
"}"
"}"
"}"
"}"
)
@ -72,11 +89,19 @@ _STANDARD = 'STANDARD'
_STANDARD_PLUS = 'PLUS'
def search_url(
    destination: str,
    travel_date: str,
    direction: str = "outbound",
    return_date: str | None = None,
) -> str:
    """
    Build the eurostar.com search-page URL for a journey.

    destination is a key of DESTINATION_STATION_IDS; travel_date is placed in
    the ``outbound`` query parameter exactly as given.

    direction selects which way the journey runs:
      * "inbound": from the destination station to St Pancras. This is
        always a one-way search, so return_date is ignored.
      * anything else: from St Pancras to the destination station. A truthy
        return_date is appended as the ``inbound`` query parameter.

    Raises KeyError if destination is not in DESTINATION_STATION_IDS.
    """
    dest_id = DESTINATION_STATION_IDS[destination]
    if direction == "inbound":
        # The journey home is searched as its own one-way trip with the
        # stations swapped, so there is never a return leg to request.
        origin, destination_id = dest_id, ST_PANCRAS_STATION_ID
        inbound = None
    else:
        origin, destination_id = ST_PANCRAS_STATION_ID, dest_id
        inbound = return_date

    url = (
        'https://www.eurostar.com/search/uk-en'
        f'?adult=1&origin={origin}&destination={destination_id}&outbound={travel_date}'
    )
    if inbound:
        url += f'&inbound={inbound}'
    return url
@ -85,7 +110,7 @@ def _generate_cid() -> str:
return 'SRCH-' + ''.join(random.choices(chars, k=22))
def _parse_graphql(data: dict, destination: str) -> list[dict]:
def _parse_journeys(journeys: list[dict], destination: str, direction: str) -> list[dict]:
"""
Parse a NewBookingSearch GraphQL response into a list of service dicts.
@ -97,7 +122,6 @@ def _parse_graphql(data: dict, destination: str) -> list[dict]:
Multi-leg train numbers are joined with ' + ' (e.g. 'ES 9116 + ER 9329').
"""
best: dict[str, dict] = {}
journeys = data['data']['journeySearch']['outbound']['journeys']
for journey in journeys:
dep = journey['timing']['departureTime']
arr = journey['timing']['arrivalTime']
@ -118,8 +142,21 @@ def _parse_graphql(data: dict, destination: str) -> list[dict]:
std_price, std_seats = price, seats
elif cos == _STANDARD_PLUS:
plus_price, plus_seats = price, seats
if dep not in best or arr < best[dep]['arrive_destination']:
best[dep] = {
if direction == 'inbound':
service = {
'depart_destination': dep,
'arrive_st_pancras': arr,
'destination': destination,
'train_number': train_number,
'price': std_price,
'seats': std_seats,
'plus_price': plus_price,
'plus_seats': plus_seats,
}
key = dep
arrive_key = 'arrive_st_pancras'
else:
service = {
'depart_st_pancras': dep,
'arrive_destination': arr,
'destination': destination,
@ -129,18 +166,43 @@ def _parse_graphql(data: dict, destination: str) -> list[dict]:
'plus_price': plus_price,
'plus_seats': plus_seats,
}
return sorted(best.values(), key=lambda s: s['depart_st_pancras'])
key = dep
arrive_key = 'arrive_destination'
if key not in best or arr < best[key][arrive_key]:
best[key] = service
sort_key = 'depart_destination' if direction == 'inbound' else 'depart_st_pancras'
return sorted(best.values(), key=lambda s: s[sort_key])
def fetch(destination: str, travel_date: str) -> list[dict]:
"""
Return all Eurostar services for destination on travel_date.
def _parse_graphql(data: dict, destination: str) -> list[dict]:
    """
    Parse the outbound leg of a NewBookingSearch response.

    Kept as the one-way entry point: reads the ``outbound`` journeys from
    the response and returns them with outbound field names.
    """
    return _parse_graphql_leg(data, destination, 'outbound', 'outbound')
Each dict contains timetable info (depart_st_pancras, arrive_destination,
train_number) plus pricing (price, seats) from a single GraphQL call.
"""
dest_id = DESTINATION_STATION_IDS[destination]
headers = {
def _parse_graphql_leg(data: dict, destination: str, leg: str, direction: str) -> list[dict]:
    """
    Parse one leg of a NewBookingSearch response.

    leg is the key under ``data.journeySearch`` to read ('outbound' or
    'inbound' in the query this module sends); direction is passed through
    to _parse_journeys and decides which field names the service dicts use.
    The two are separate because a one-way search from the destination back
    to St Pancras arrives in the response's 'outbound' leg but must be
    labelled as an inbound journey.
    """
    search = data['data']['journeySearch']
    return _parse_journeys(search[leg]['journeys'], destination, direction)
def _payload(origin: str, destination_id: str, outbound: str, inbound: str | None = None) -> dict:
    """
    Assemble the JSON body for a NewBookingSearch gateway request.

    origin and destination_id are station ids, outbound is the travel date
    and inbound the optional return date. inbound is always included in the
    variables, as None for a one-way search. The search is fixed to one
    adult, GBP, and the Standard / Standard Plus classes of service.
    """
    return {
        'operationName': 'NewBookingSearch',
        'variables': {
            'origin': origin,
            'destination': destination_id,
            'outbound': outbound,
            'inbound': inbound,
            'currency': 'GBP',
            'adult': 1,
            'filteredClassesOfService': [_STANDARD, _STANDARD_PLUS],
        },
        'query': _GQL_QUERY,
    }
def _headers() -> dict:
return {
'User-Agent': DEFAULT_UA,
'Content-Type': 'application/json',
'Accept': '*/*',
@ -151,18 +213,42 @@ def fetch(destination: str, travel_date: str) -> list[dict]:
'x-source-url': 'search-app/',
'cid': _generate_cid(),
}
payload = {
'operationName': 'NewBookingSearch',
'variables': {
'origin': ORIGIN_STATION_ID,
'destination': dest_id,
'outbound': travel_date,
'currency': 'GBP',
'adult': 1,
'filteredClassesOfService': [_STANDARD, _STANDARD_PLUS],
},
'query': _GQL_QUERY,
}
resp = requests.post(_GATEWAY_URL, json=payload, headers=headers, timeout=20)
def fetch(destination: str, travel_date: str, direction: str = 'outbound') -> list[dict]:
    """
    Return all Eurostar services between St Pancras and destination on travel_date.

    direction selects which way the one-way search runs:
      * 'inbound': destination -> St Pancras; each service dict is keyed
        with depart_destination / arrive_st_pancras.
      * anything else: St Pancras -> destination; each service dict is keyed
        with depart_st_pancras / arrive_destination.

    Every dict also carries the train number and pricing fields, all taken
    from a single GraphQL call.

    Raises KeyError if destination is not in DESTINATION_STATION_IDS, and
    whatever resp.raise_for_status() raises for an HTTP error response.
    """
    dest_id = DESTINATION_STATION_IDS[destination]
    is_inbound = direction == 'inbound'
    if is_inbound:
        origin, destination_id = dest_id, ST_PANCRAS_STATION_ID
    else:
        origin, destination_id = ST_PANCRAS_STATION_ID, dest_id

    resp = requests.post(
        _GATEWAY_URL,
        json=_payload(origin, destination_id, travel_date),
        headers=_headers(),
        timeout=20,
    )
    resp.raise_for_status()

    # A one-way search always comes back in the response's 'outbound' leg,
    # even when it runs towards St Pancras; only the field naming of the
    # parsed services depends on the requested direction.
    leg_direction = 'inbound' if is_inbound else 'outbound'
    return _parse_graphql_leg(resp.json(), destination, 'outbound', leg_direction)
def fetch_return(destination: str, outbound_date: str, return_date: str) -> dict[str, list[dict]]:
    """
    Fetch both legs of a St Pancras return journey in one GraphQL call.

    Returns {'outbound': [...], 'inbound': [...]}, where each list holds the
    parsed services for that leg of the response, using that leg's field
    names.

    Raises KeyError if destination is not in DESTINATION_STATION_IDS, and
    whatever resp.raise_for_status() raises for an HTTP error response.
    """
    body = _payload(
        ST_PANCRAS_STATION_ID,
        DESTINATION_STATION_IDS[destination],
        outbound_date,
        return_date,
    )
    resp = requests.post(_GATEWAY_URL, json=body, headers=_headers(), timeout=20)
    resp.raise_for_status()
    data = resp.json()
    # For a return search the leg name and the journey direction coincide.
    return {
        leg: _parse_graphql_leg(data, destination, leg, leg)
        for leg in ('outbound', 'inbound')
    }

View file

@ -32,7 +32,8 @@ def _headers() -> dict:
def _request_body(
station_crs: str,
from_code: str,
to_code: str,
travel_date: str,
conversation_token: str | None,
later: bool,
@ -44,8 +45,8 @@ def _request_body(
"IsPreviousReturn": False,
"campaignCode": "",
"validationCode": "",
"locfrom": f"GB{station_crs}",
"locto": _PAD_CODE,
"locfrom": from_code,
"locto": to_code,
"datetimedepart": f"{travel_date}T00:00:00",
"outwarddepartafter": True,
"datetimereturn": None,
@ -67,7 +68,22 @@ def _request_body(
}
def _run_pages(station_crs: str, travel_date: str, first_class: bool = False):
def _station_code(station_crs: str) -> str:
return f"GB{station_crs}"
def _od_codes(station_crs: str, direction: str) -> tuple[str, str]:
    """
    Return the (from_code, to_code) pair for a Paddington journey search.

    direction "from_paddington" gives Paddington -> station; any other value
    gives station -> Paddington.
    """
    station = _station_code(station_crs)
    if direction == "from_paddington":
        return _PAD_CODE, station
    return station, _PAD_CODE
def _run_pages(
station_crs: str,
travel_date: str,
first_class: bool = False,
direction: str = "to_paddington",
):
"""
Iterate all pages of GWR journey search results.
@ -78,8 +94,9 @@ def _run_pages(station_crs: str, travel_date: str, first_class: bool = False):
with httpx.Client(headers=_headers(), timeout=30) as client:
conversation_token = None
later = False
from_code, to_code = _od_codes(station_crs, direction)
for _ in range(_MAX_PAGES):
body = _request_body(station_crs, travel_date, conversation_token, later)
body = _request_body(from_code, to_code, travel_date, conversation_token, later)
if first_class:
body["firstclass"] = True
body["standardclass"] = False
@ -99,7 +116,12 @@ def _run_pages(station_crs: str, travel_date: str, first_class: bool = False):
later = True
def _run_pages_batched(station_crs: str, travel_date: str, first_class: bool = False):
def _run_pages_batched(
station_crs: str,
travel_date: str,
first_class: bool = False,
direction: str = "to_paddington",
):
"""
Like _run_pages but yields one list of (dep_time, fares_list) per API page call,
allowing callers to stream results a page at a time.
@ -108,8 +130,9 @@ def _run_pages_batched(station_crs: str, travel_date: str, first_class: bool = F
with httpx.Client(headers=_headers(), timeout=30) as client:
conversation_token = None
later = False
from_code, to_code = _od_codes(station_crs, direction)
for _ in range(_MAX_PAGES):
body = _request_body(station_crs, travel_date, conversation_token, later)
body = _request_body(from_code, to_code, travel_date, conversation_token, later)
if first_class:
body["firstclass"] = True
body["standardclass"] = False
@ -132,16 +155,18 @@ def _run_pages_batched(station_crs: str, travel_date: str, first_class: bool = F
later = True
def fetch(station_crs: str, travel_date: str) -> dict[str, dict]:
def fetch(
station_crs: str, travel_date: str, direction: str = "to_paddington"
) -> dict[str, dict]:
"""
Fetch GWR walk-on single fares from station_crs to London Paddington on travel_date.
Fetch GWR walk-on single fares for the selected Paddington direction.
Returns {departure_time: {'ticket': name, 'price': float, 'code': code}}
where price is in £ and only the cheapest available standard-class walk-on
ticket per departure (with restrictions already applied by GWR) is kept.
"""
result: dict[str, dict] = {}
for dep_time, fares in _run_pages(station_crs, travel_date):
for dep_time, fares in _run_pages(station_crs, travel_date, direction=direction):
cheapest = None
for fare in fares:
code = fare.get("ticketTypeCode")
@ -166,7 +191,9 @@ def fetch(station_crs: str, travel_date: str) -> dict[str, dict]:
return result
def fetch_advance(station_crs: str, travel_date: str) -> dict[str, dict]:
def fetch_advance(
station_crs: str, travel_date: str, direction: str = "to_paddington"
) -> dict[str, dict]:
"""
Fetch advance fares: cheapest standard advance and first-class advance per departure.
@ -175,7 +202,9 @@ def fetch_advance(station_crs: str, travel_date: str) -> dict[str, dict]:
where each sub-dict has keys 'ticket', 'price', 'code'.
"""
std_advance: dict[str, dict] = {}
for dep_time, fares in _run_pages(station_crs, travel_date, first_class=False):
for dep_time, fares in _run_pages(
station_crs, travel_date, first_class=False, direction=direction
):
cheapest = None
for fare in fares:
code = fare.get("ticketTypeCode")
@ -199,7 +228,9 @@ def fetch_advance(station_crs: str, travel_date: str) -> dict[str, dict]:
}
first_advance: dict[str, dict] = {}
for dep_time, fares in _run_pages(station_crs, travel_date, first_class=True):
for dep_time, fares in _run_pages(
station_crs, travel_date, first_class=True, direction=direction
):
cheapest = None
for fare in fares:
price_pence = fare.get("fare", 0)
@ -227,7 +258,9 @@ def fetch_advance(station_crs: str, travel_date: str) -> dict[str, dict]:
}
def fetch_advance_streaming(station_crs: str, travel_date: str):
def fetch_advance_streaming(
station_crs: str, travel_date: str, direction: str = "to_paddington"
):
"""
Generator yielding partial advance fare dicts one GWR API page at a time.
@ -236,7 +269,9 @@ def fetch_advance_streaming(station_crs: str, travel_date: str):
yielded immediately so callers can stream prices to clients as they arrive.
"""
# Pass 1: standard class advance fares
for batch in _run_pages_batched(station_crs, travel_date, first_class=False):
for batch in _run_pages_batched(
station_crs, travel_date, first_class=False, direction=direction
):
page: dict[str, dict] = {}
for dep_time, fares in batch:
cheapest = None
@ -267,7 +302,9 @@ def fetch_advance_streaming(station_crs: str, travel_date: str):
yield page
# Pass 2: first class advance fares
for batch in _run_pages_batched(station_crs, travel_date, first_class=True):
for batch in _run_pages_batched(
station_crs, travel_date, first_class=True, direction=direction
):
page = {}
for dep_time, fares in batch:
cheapest = None

View file

@ -1,5 +1,6 @@
"""
Scrape GWR trains from Bristol Temple Meads to London Paddington using Realtime Trains.
Scrape direct trains between a selected station and London Paddington using
Realtime Trains.
Two fetches:
BRI/to/PAD departure times from Bristol (div.time.plan.d)
@ -20,6 +21,16 @@ _PAD_FROM_TMPL = (
"gb-nr:PAD/from/gb-nr:{crs}/{date}/0000-2359"
"?stp=WVS&show=pax-calls&order=wtt"
)
_PAD_TO_TMPL = (
"https://www.realtimetrains.co.uk/search/detailed/"
"gb-nr:PAD/to/gb-nr:{crs}/{date}/0000-2359"
"?stp=WVS&show=pax-calls&order=wtt"
)
_FROM_PAD_TMPL = (
"https://www.realtimetrains.co.uk/search/detailed/"
"gb-nr:{crs}/from/gb-nr:PAD/{date}/0000-2359"
"?stp=WVS&show=pax-calls&order=wtt"
)
DEFAULT_UA = (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
@ -69,7 +80,7 @@ def _parse_services(html: str, time_selector: str) -> dict[str, str]:
def _parse_arrivals(html: str) -> dict[str, dict]:
"""Return {train_id: {'time': ..., 'platform': ...}} from a PAD arrivals page."""
"""Return {train_id: {'time': ..., 'platform': ...}} from an arrivals page."""
root = lxml.html.fromstring(html)
sl = root.cssselect('div.servicelist')
if not sl:
@ -93,7 +104,7 @@ def _parse_arrivals(html: str) -> dict[str, dict]:
def fetch(date: str, user_agent: str = DEFAULT_UA, station_crs: str = 'BRI') -> list[dict]:
"""Fetch trains from station_crs to PAD; returns [{'depart_bristol', 'arrive_paddington', 'headcode', 'arrive_platform'}]."""
"""Fetch trains from station_crs to PAD."""
headers = _browser_headers(user_agent)
with httpx.Client(headers=headers, follow_redirects=True, timeout=30) as client:
r_bri = client.get(_TO_PAD_TMPL.format(crs=station_crs, date=date))
@ -113,3 +124,44 @@ def fetch(date: str, user_agent: str = DEFAULT_UA, station_crs: str = 'BRI') ->
if tid in arrivals
]
return sorted(trains, key=lambda t: t['depart_bristol'])
def fetch_to_paddington(
    date: str, user_agent: str = DEFAULT_UA, station_crs: str = 'BRI'
) -> list[dict]:
    """
    Fetch trains from station_crs to PAD using generic field names.

    Wraps fetch(): every key fetch() returns is kept (including
    'depart_bristol'), the departure time is also exposed as
    'depart_origin', and 'arrive_platform' / 'headcode' default to '' when
    fetch() did not supply them.
    """
    return [
        train
        | {
            "depart_origin": train["depart_bristol"],
            "arrive_paddington": train["arrive_paddington"],
            "arrive_platform": train.get("arrive_platform", ""),
            "headcode": train.get("headcode", ""),
        }
        for train in fetch(date, user_agent, station_crs)
    ]
def fetch_from_paddington(
    date: str, user_agent: str = DEFAULT_UA, station_crs: str = 'BRI'
) -> list[dict]:
    """
    Fetch trains from PAD to station_crs.

    Two pages are requested: the PAD departures page towards station_crs,
    and the station_crs arrivals page from PAD. Services are matched by
    train id and those missing from the arrivals page are dropped.

    Returns a list of {'depart_paddington', 'arrive_destination',
    'arrive_platform', 'headcode'} dicts sorted by 'depart_paddington'.
    """
    with httpx.Client(
        headers=_browser_headers(user_agent), follow_redirects=True, timeout=30
    ) as client:
        pad_page = client.get(_PAD_TO_TMPL.format(crs=station_crs, date=date))
        station_page = client.get(_FROM_PAD_TMPL.format(crs=station_crs, date=date))

    departures = _parse_services(pad_page.text, 'div.time.plan.d')
    arrivals = _parse_arrivals(station_page.text)

    trains = []
    for headcode, depart in departures.items():
        if headcode not in arrivals:
            # No matching arrival at the station, so skip this service.
            continue
        arrival = arrivals[headcode]
        trains.append(
            {
                "depart_paddington": depart,
                "arrive_destination": arrival["time"],
                "arrive_platform": arrival["platform"],
                "headcode": headcode,
            }
        )
    trains.sort(key=lambda t: t["depart_paddington"])
    return trains