#!/usr/bin/python3
"""Extract booking details from saved Airbnb reservation pages as YAML.

Each command-line argument is the path of an HTML file. One booking record
is built per file and the list of records is printed to stdout as YAML.
"""

import json
import re
import sys
import typing
from datetime import datetime
from typing import Any
from zoneinfo import ZoneInfo

import lxml.html
import pycountry
import yaml

StrDict = dict[str, typing.Any]

AIRBNB_BASE_URL = "https://www.airbnb.co.uk"
RESERVATION_PATH = "/trips/v1/reservation-details/ro/RESERVATION2_CHECKIN/"

# The confirmation code is the path segment that follows RESERVATION_PATH.
_CONFIRMATION_CODE_RE = re.compile(re.escape(RESERVATION_PATH) + r"([A-Z0-9]+)")
_GUESTS_RE = re.compile(r"^(\d+) guests?$")
_PRICE_PREFIX = "Total cost: £"


def build_datetime(date_str: str, time_str: str, tz_name: str) -> datetime:
    """Combine a date, a time and a timezone name into an aware datetime.

    Args:
        date_str: Date in ISO format.
        time_str: Time of day; joined to the date with "T" and handed to
            ``datetime.fromisoformat``.
        tz_name: Timezone name understood by ``zoneinfo.ZoneInfo``.

    Returns:
        A timezone-aware datetime in the given timezone.

    Raises:
        ValueError: If the combined string is not a valid ISO datetime.
    """
    naive = datetime.fromisoformat(f"{date_str}T{time_str}")
    # The wall-clock value is already local to tz_name, so the zone is
    # attached rather than converted to.
    return naive.replace(tzinfo=ZoneInfo(tz_name))


def list_to_dict(items: list) -> StrDict:
    """Convert a flat list of alternating keys and values into a dict.

    Raises:
        ValueError: If the list has an odd number of items, i.e. the last
            key has no value.
    """
    if len(items) % 2:
        raise ValueError(
            f"Expected alternating keys and values, got {len(items)} items"
        )
    return dict(zip(items[::2], items[1::2]))


def extract_country_code(address: str) -> str | None:
    """Return a lower-cased ISO 3166-1 alpha-2 code found in a free-text address.

    Country names and official names are matched as case-insensitive
    substrings. When several names match, the longest one wins, so that a
    name contained in another one (e.g. "Oman" inside "Romania") does not
    shadow the more specific match.

    Returns:
        The two-letter code, or None if no country name occurs in the address.
    """
    address_lower = address.lower()
    best_code: str | None = None
    best_length = 0
    for country in pycountry.countries:
        for name in (country.name, getattr(country, "official_name", None)):
            if not name or len(name) <= best_length:
                continue
            if name.lower() in address_lower:
                best_code = str(country.alpha_2).lower()
                best_length = len(name)
    return best_code


def get_json_blob(tree: Any) -> str:
    """Return the text of the element whose id is "data-injector-instances".

    Args:
        tree: Parsed lxml document.

    Raises:
        ValueError: If the document has no such element or it has no text.
    """
    data_id = "data-injector-instances"
    texts = tree.xpath(f'//*[@id="{data_id}"]/text()')
    if not texts:
        raise ValueError(f'No element with id "{data_id}" containing text')
    return texts[0]


def get_ui_state(tree: Any) -> StrDict:
    """Return the page's "uiState" data as a dict.

    The embedded JSON is searched for the first "uiState" key; the first
    item of its value is treated as a flat list of alternating keys and
    values.

    Raises:
        ValueError: If the embedded data is missing or has no "uiState".
    """
    big_blob = json.loads(get_json_blob(tree))
    ui_state = walk_tree(big_blob, "uiState")
    if not ui_state:
        raise ValueError('No "uiState" entry found in the embedded page data')
    return list_to_dict(ui_state[0])


def get_reservation_data(ui_state: StrDict) -> StrDict:
    """Index the reservation's scheduled-event rows by their "id" field."""
    rows = ui_state["reservation"]["scheduled_event"]["rows"]
    return {row["id"]: row for row in rows}


def get_room_url(tree: Any) -> str | None:
    """Return the absolute URL of the first "/room..." destination link.

    Returns:
        The URL, or None when the page has no such link.
    """
    for link in tree.xpath('//a[@data-testid="reservation-destination-link"]'):
        href = link.get("href")
        if isinstance(href, str) and href.startswith("/room"):
            return AIRBNB_BASE_URL + href
    return None


def get_price_from_reservation(reservation: StrDict) -> str:
    """Return the amount from the payment summary, without the "£" sign.

    Raises:
        ValueError: If the subtitle is not of the form "Total cost: £...".
    """
    price_string = reservation["payment_summary"]["subtitle"]
    if not isinstance(price_string, str) or not price_string.startswith(
        _PRICE_PREFIX
    ):
        raise ValueError(f"Unexpected payment summary: {price_string!r}")
    return price_string.removeprefix(_PRICE_PREFIX)


def _find_confirmation_code(text_content: str) -> str:
    """Return the confirmation code from the first reservation-details path."""
    match = _CONFIRMATION_CODE_RE.search(text_content)
    if match is None:
        raise ValueError("No reservation-details link with a confirmation code")
    return match.group(1)


def _parse_guest_count(subtitle: str) -> int:
    """Return the number from a subtitle of the form "N guest(s)"."""
    match = _GUESTS_RE.match(subtitle)
    if match is None:
        raise ValueError(f"Unexpected guests subtitle: {subtitle!r}")
    return int(match.group(1))


def extract_booking_from_html(html_file: str) -> StrDict:
    """Extract booking information from an Airbnb HTML file.

    Args:
        html_file: Path of the HTML file to read.

    Returns:
        A dict describing the booking. "from" and "to" are timezone-aware
        datetimes; "url" is None when the page has no room link.

    Raises:
        ValueError: If an expected piece of the page is missing or malformed.
        KeyError: If the embedded data lacks an expected field.
    """
    with open(html_file, "r", encoding="utf-8") as f:
        text_content = f.read()

    confirmation_code = _find_confirmation_code(text_content)

    tree = lxml.html.parse(html_file)

    try:
        ui_state = get_ui_state(tree)
    except Exception:
        # Name the offending file on stderr (stdout carries the YAML output),
        # then let the original error propagate.
        print(html_file, file=sys.stderr)
        raise

    reservation = get_reservation_data(ui_state)
    number_of_adults = _parse_guest_count(reservation["guests"]["subtitle"])
    price = get_price_from_reservation(reservation)

    metadata = ui_state["reservation"]["metadata"]
    country_code = metadata["country"].lower()

    # The location is whatever follows the last " in " of the marquee title.
    title = reservation["dynamic_marquee_title_image_v3"]["title"]
    location = title.rpartition(" in ")[2]

    checkin_checkout = reservation["checkin_checkout_arrival_guide"]
    timezone = metadata["timezone"]
    check_in = build_datetime(
        metadata["check_in_date"], checkin_checkout["leading_subtitle"], timezone
    )
    check_out = build_datetime(
        metadata["check_out_date"], checkin_checkout["trailing_subtitle"], timezone
    )

    address = reservation["map"]["address"]

    if "header_action.pdp" in reservation:
        name = reservation["header_action.pdp"]["subtitle"]
    else:
        # Fall back to the text of the page's first <h1>.
        name = tree.getroot().findtext(".//h1")

    # Key order is the order of the YAML output (dumped with sort_keys=False).
    booking = {
        "type": "apartment",
        "operator": "airbnb",
        "name": name,
        "location": location,
        "booking_reference": confirmation_code,
        "booking_url": f"{AIRBNB_BASE_URL}{RESERVATION_PATH}{confirmation_code}",
        "address": address,
        "country": country_code,
        "latitude": metadata["lat"],
        "longitude": metadata["lng"],
        "timezone": timezone,
        "from": check_in,
        "to": check_out,
        "price": price,
        "currency": "GBP",
        "number_of_adults": number_of_adults,
    }
    booking["url"] = get_room_url(tree)

    return booking


def walk_tree(data: Any, want_key: str) -> Any:
    """Search nested dicts and lists depth-first for a key.

    Args:
        data: Arbitrarily nested structure of dicts, lists and scalars.
        want_key: The dict key to look for.

    Returns:
        The value stored under the first occurrence of want_key, or None if
        the key is not found. A key whose value is None is indistinguishable
        from a missing key.
    """
    if isinstance(data, dict):
        if want_key in data:
            return data[want_key]
        children = data.values()
    elif isinstance(data, list):
        children = data
    else:
        return None

    for child in children:
        found = walk_tree(child, want_key)
        if found is not None:
            return found
    return None


def main() -> None:
    """Print the bookings from the HTML files named on the command line."""
    bookings = [extract_booking_from_html(name) for name in sorted(sys.argv[1:])]
    print(yaml.dump(bookings, sort_keys=False))


if __name__ == "__main__":
    main()