Add Airbnb trip detail parser.
This commit is contained in:
parent
155569419a
commit
6b3e8e31eb
210
parse_airbnb.py
Executable file
210
parse_airbnb.py
Executable file
|
@ -0,0 +1,210 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import typing
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
import lxml.html
|
||||
import pycountry
|
||||
import yaml
|
||||
|
||||
# Alias for JSON-like mappings with string keys and values of any type.
StrDict = dict[str, typing.Any]
|
||||
|
||||
|
||||
def build_datetime(date_str: str, time_str: str, tz_name: str) -> datetime:
    """Build a timezone-aware datetime from separate date and time strings.

    Args:
        date_str: Calendar date in ISO format (``YYYY-MM-DD``).
        time_str: Wall-clock time as ``HH:MM``.
        tz_name: Timezone name understood by ``ZoneInfo``.

    Returns:
        The given wall-clock moment with ``tz_name`` attached as its timezone.
        The clock reading itself is not shifted.
    """
    wall_clock = datetime.fromisoformat(date_str + "T" + time_str)
    zone = ZoneInfo(tz_name)
    return wall_clock.replace(tzinfo=zone)
|
||||
|
||||
|
||||
def list_to_dict(items: list) -> dict[Any, Any]:
    """Convert a flat ``[key, value, key, value, ...]`` list to a dict.

    Args:
        items: Sequence of alternating keys and values.

    Returns:
        A dict mapping each even-indexed element to the element that follows
        it. If a key appears more than once, the last value wins.

    Raises:
        ValueError: If ``items`` has an odd number of elements, i.e. the last
            key has no value.
    """
    if len(items) % 2:
        raise ValueError(
            f"expected an even number of items (key/value pairs), got {len(items)}"
        )
    # Even positions are keys, odd positions are the matching values.
    return dict(zip(items[::2], items[1::2]))
|
||||
|
||||
|
||||
def extract_country_code(address: str) -> str | None:
    """Return the ISO 3166-1 alpha-2 country code found in a free-text address.

    Every country's ``name`` (and ``official_name`` when it has one) is looked
    up in the address case-insensitively. A name only counts when it appears
    as a whole word, so "oman" does not match inside "romania". When several
    names match, the longest one wins, so "Papua New Guinea" beats "Guinea".

    Args:
        address: Free-text postal address.

    Returns:
        The lowercase alpha-2 code of the best-matching country, or ``None``
        when no country name occurs in the address.
    """
    address_lower = address.lower()
    best_code: str | None = None
    best_length = 0

    for country in pycountry.countries:
        candidates = [country.name]
        # Not every country record carries an official name.
        official_name = getattr(country, "official_name", None)
        if official_name:
            candidates.append(official_name)

        for candidate in candidates:
            needle = candidate.lower()
            if len(needle) <= best_length:
                # Cannot beat the match we already have.
                continue
            # Require non-word characters (or string edges) around the name.
            pattern = rf"(?<!\w){re.escape(needle)}(?!\w)"
            if re.search(pattern, address_lower):
                best_code = str(country.alpha_2.lower())
                best_length = len(needle)

    return best_code
|
||||
|
||||
|
||||
def get_json_blob(tree) -> str:
    """Return the text of the element whose id is ``data-injector-instances``.

    ``tree`` must provide an ``xpath`` method. The first text node found for
    that element is returned; an ``IndexError`` is raised when there is none.
    """
    element_id = "data-injector-instances"
    text_nodes = tree.xpath(f'//*[@id="{element_id}"]/text()')
    return text_nodes[0]
|
||||
|
||||
|
||||
def get_ui_state(tree) -> StrDict:
    """Return the page's ``uiState`` data as a dict.

    The JSON text embedded in the page (see ``get_json_blob``) is decoded and
    searched for the first ``uiState`` entry. The first element of that entry,
    a flat list of alternating keys and values, is converted to a dict.

    Args:
        tree: Parsed HTML document providing an ``xpath`` method.

    Returns:
        The UI state as a dict.

    Raises:
        IndexError: If the page has no embedded data element.
        ValueError: If the embedded text is not valid JSON or contains no
            ``uiState`` entry.
    """
    big_blob = json.loads(get_json_blob(tree))
    ui_state = walk_tree(big_blob, "uiState")
    if ui_state is None:
        raise ValueError('no "uiState" entry found in the page data')
    return list_to_dict(ui_state[0])
|
||||
|
||||
|
||||
def get_reservation_data(ui_state: StrDict) -> StrDict:
    """Index the reservation's scheduled-event rows by their ``id``.

    Reads ``ui_state["reservation"]["scheduled_event"]["rows"]`` and returns a
    dict mapping each row's ``"id"`` value to the row itself. If two rows
    share an id, the later one is kept.
    """
    rows = ui_state["reservation"]["scheduled_event"]["rows"]
    rows_by_id: StrDict = {}
    for entry in rows:
        rows_by_id[entry["id"]] = entry
    return rows_by_id
|
||||
|
||||
|
||||
def get_room_url(tree) -> str:
|
||||
for e in tree.xpath('//a[@data-testid="reservation-destination-link"]'):
|
||||
href = e.get("href")
|
||||
assert isinstance(href, str)
|
||||
if not href.startswith("/room"):
|
||||
continue
|
||||
return "https://www.airbnb.co.uk" + href
|
||||
|
||||
|
||||
def get_price_from_reservation(reservation: StrDict) -> str:
    """Return the total price of a reservation without its currency symbol.

    The price is read from ``reservation["payment_summary"]["subtitle"]``,
    which must have the form ``"Total cost: £<amount>"``.

    Args:
        reservation: Reservation rows keyed by row id.

    Returns:
        The amount as it appears after the pound sign, e.g. ``"123.45"``.

    Raises:
        TypeError: If the subtitle is not a string.
        ValueError: If the subtitle does not start with ``"Total cost: £"``.
    """
    price_string = reservation["payment_summary"]["subtitle"]
    if not isinstance(price_string, str):
        raise TypeError(
            "payment summary subtitle must be a string, "
            f"got {type(price_string).__name__}"
        )

    # Explicit checks rather than asserts, which are stripped under ``-O``.
    prefix = "Total cost: " + "£"
    if not price_string.startswith(prefix):
        raise ValueError(f"unexpected payment summary subtitle: {price_string!r}")
    return price_string[len(prefix) :]
|
||||
|
||||
|
||||
def extract_booking_from_html(html_file: str) -> StrDict:
    """Extract booking information from a saved Airbnb reservation page.

    Args:
        html_file: Path to the HTML file, read as UTF-8.

    Returns:
        A dict describing the booking: type, operator, name, location,
        booking reference and URL, address, country, coordinates, timezone,
        timezone-aware check-in (``from``) and check-out (``to``) datetimes,
        price, currency, number of adults and the listing ``url``.

    Raises:
        ValueError: If the confirmation code cannot be found in the page or
            the guest count text has an unexpected form.
        KeyError: If an expected entry is missing from the page data.
    """
    with open(html_file, "r", encoding="utf-8") as f:
        text_content = f.read()

    m_code = re.search(
        r"/trips/v1/reservation-details/ro/RESERVATION2_CHECKIN/([A-Z0-9]+)",
        text_content,
    )
    if m_code is None:
        raise ValueError(f"{html_file}: reservation confirmation code not found")
    confirmation_code = m_code.group(1)

    tree = lxml.html.parse(html_file)
    root = tree.getroot()
    try:
        ui_state = get_ui_state(tree)
    except Exception:
        # Name the offending file on stderr (stdout carries the YAML output)
        # and let the original error propagate.
        print(html_file, file=sys.stderr)
        raise

    reservation = get_reservation_data(ui_state)

    guests_text = reservation["guests"]["subtitle"]
    m_guests = re.match(r"^(\d+) guests?$", guests_text)
    if m_guests is None:
        raise ValueError(f"{html_file}: unexpected guests text: {guests_text!r}")
    number_of_adults = int(m_guests.group(1))

    price = get_price_from_reservation(reservation)

    # The country comes from the reservation metadata rather than being
    # guessed from the free-text address.
    metadata = ui_state["reservation"]["metadata"]
    country_code = metadata["country"].lower()

    # Everything after the last " in " of the title is used as the location;
    # a title without " in " is used whole.
    title = reservation["dynamic_marquee_title_image_v3"]["title"]
    location = title.rpartition(" in ")[2]

    checkin_checkout = reservation["checkin_checkout_arrival_guide"]
    check_in_time = checkin_checkout["leading_subtitle"]
    check_out_time = checkin_checkout["trailing_subtitle"]

    check_in = build_datetime(
        metadata["check_in_date"], check_in_time, metadata["timezone"]
    )
    check_out = build_datetime(
        metadata["check_out_date"], check_out_time, metadata["timezone"]
    )

    address = reservation["map"]["address"]

    # Fall back to the page's first <h1> when the listing row is absent.
    if "header_action.pdp" in reservation:
        name = reservation["header_action.pdp"]["subtitle"]
    else:
        name = root.findtext(".//h1")

    return {
        "type": "apartment",
        "operator": "airbnb",
        "name": name,
        "location": location,
        "booking_reference": confirmation_code,
        "booking_url": f"https://www.airbnb.co.uk/trips/v1/reservation-details/ro/RESERVATION2_CHECKIN/{confirmation_code}",
        "address": address,
        "country": country_code,
        "latitude": metadata["lat"],
        "longitude": metadata["lng"],
        "timezone": metadata["timezone"],
        "from": check_in,
        "to": check_out,
        "price": price,
        "currency": "GBP",
        "number_of_adults": number_of_adults,
        "url": get_room_url(tree),
    }
|
||||
|
||||
|
||||
def walk_tree(data: Any, want_key: str) -> Any:
    """Recursively search nested dicts and lists for ``want_key``.

    The search is depth-first. A dict that contains ``want_key`` directly
    yields that value at once; otherwise its values are searched in order.
    Lists are searched element by element. Anything else is a dead end.

    Args:
        data: Arbitrarily nested structure of dicts, lists and scalars.
        want_key: Dict key to look for.

    Returns:
        The first non-``None`` value stored under ``want_key``, or ``None``
        when the key is not found. A value of ``None`` stored under the key
        is indistinguishable from "not found", and the search continues in
        sibling branches.
    """
    if isinstance(data, dict):
        if want_key in data:
            return data[want_key]
        children = data.values()
    elif isinstance(data, list):
        children = data
    else:
        return None

    for child in children:
        found = walk_tree(child, want_key)
        if found is not None:
            return found
    return None
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the HTML files named on the command line and print them as YAML.

    Files are processed in sorted filename order, and the resulting bookings
    are dumped as one YAML list with the keys of each booking left in place.
    """
    bookings = []
    for path in sorted(sys.argv[1:]):
        parsed = extract_booking_from_html(path)
        assert parsed
        bookings.append(parsed)

    document = yaml.dump(bookings, sort_keys=False)
    print(document)
|
||||
|
||||
|
||||
# Run the parser only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
Loading…
Reference in a new issue