#!/usr/bin/python3
"""Extract booking details from saved Airbnb reservation pages.

Each command-line argument is the path of a saved reservation HTML page.
The bookings found are printed to standard output as a YAML list.
"""

import json
import re
import sys
import typing
from datetime import datetime
from typing import Any
from zoneinfo import ZoneInfo

import lxml.html
import pycountry
import yaml

StrDict = dict[str, typing.Any]

# id of the element whose text holds the page's embedded JSON state.
_DATA_ELEMENT_ID = "data-injector-instances"

_BASE_URL = "https://www.airbnb.co.uk"
_RESERVATION_PATH = "/trips/v1/reservation-details/ro/RESERVATION2_CHECKIN/"

# The confirmation code is the path component following the reservation path.
_CONFIRMATION_RE = re.compile(re.escape(_RESERVATION_PATH) + r"([A-Z0-9]+)")
_GUESTS_RE = re.compile(r"^(\d+) guests?$")

_PRICE_PREFIX = "Total cost: £"


def build_datetime(date_str: str, time_str: str, tz_name: str) -> datetime:
    """
    Combine an ISO date string, HH:MM time string, and a timezone name
    into a timezone-aware datetime in the specified timezone.
    """
    naive_dt = datetime.fromisoformat(f"{date_str}T{time_str}")
    # The wall-clock time is kept as given; only the zone is attached.
    return naive_dt.replace(tzinfo=ZoneInfo(tz_name))


def list_to_dict(items: list[Any]) -> dict[Any, Any]:
    """Convert a flat list to a dict, assuming alternating keys and values.

    If a key occurs more than once, the last value wins.

    Raises:
        ValueError: if the list has an odd number of items, i.e. the final
            key has no value.
    """
    if len(items) % 2:
        raise ValueError(
            f"expected alternating keys and values, got {len(items)} items"
        )
    return dict(zip(items[::2], items[1::2]))


def extract_country_code(address: str) -> str | None:
    """Return ISO 3166-1 alpha-2 country code from a free-text address.

    Country names (and official names, where pycountry has one) are matched
    case-insensitively as whole names, so "Oman" is not found inside
    "Romania". When several names match, the longest one wins. The code is
    returned in lower case; None is returned when no country name is found.
    """
    address_lower = address.lower()
    best_name = ""
    best_code: str | None = None
    for country in pycountry.countries:
        for name in (country.name, getattr(country, "official_name", None)):
            if not name or len(name) <= len(best_name):
                continue
            # Lookarounds rather than \b so that names ending in punctuation
            # still match at the end of the address.
            pattern = rf"(?<!\w){re.escape(name.lower())}(?!\w)"
            if re.search(pattern, address_lower):
                best_name = name
                best_code = str(country.alpha_2).lower()
    return best_code


def get_json_blob(tree: Any) -> str:
    """Return the raw JSON text embedded in the parsed page.

    Raises:
        ValueError: if the page has no text under the expected element id.
    """
    matches = tree.xpath(f'//*[@id="{_DATA_ELEMENT_ID}"]/text()')
    if not matches:
        raise ValueError(f'no element with id "{_DATA_ELEMENT_ID}" in page')
    return matches[0]


def get_ui_state(tree: Any) -> StrDict:
    """Return the page's "uiState" data as a dict.

    The first value stored under a "uiState" key anywhere in the embedded
    JSON is used; its first element is a flat list of alternating keys and
    values, which is converted to a dict.

    Raises:
        ValueError: if the JSON element or the "uiState" entry is missing.
    """
    big_blob = json.loads(get_json_blob(tree))
    ui_state = walk_tree(big_blob, "uiState")
    if not ui_state:
        raise ValueError('no "uiState" entry found in page data')
    return list_to_dict(ui_state[0])


def get_reservation_data(ui_state: StrDict) -> StrDict:
    """Return the reservation's scheduled-event rows keyed by row id."""
    rows = ui_state["reservation"]["scheduled_event"]["rows"]
    return {row["id"]: row for row in rows}


def get_room_url(tree: Any) -> str | None:
    """Return the absolute URL of the listing, or None if the page has none.

    Only reservation destination links whose href starts with "/room" are
    considered; the first such link is used.
    """
    for link in tree.xpath('//a[@data-testid="reservation-destination-link"]'):
        href = link.get("href")
        if isinstance(href, str) and href.startswith("/room"):
            return _BASE_URL + href
    return None


def get_price_from_reservation(reservation: StrDict) -> str:
    """Return the total cost as a string without the currency symbol.

    Raises:
        ValueError: if the payment summary subtitle is not of the form
            "Total cost: £<amount>".
    """
    price_string = reservation["payment_summary"]["subtitle"]
    if not isinstance(price_string, str) or not price_string.startswith(
        _PRICE_PREFIX
    ):
        raise ValueError(f"unexpected payment summary: {price_string!r}")
    return price_string[len(_PRICE_PREFIX) :]


def extract_booking_from_html(html_file: str) -> StrDict:
    """Extract booking information from Airbnb HTML file.

    Raises:
        ValueError: if the confirmation code, the embedded page state, the
            guest count or the price cannot be found in the expected form.
        KeyError: if an expected entry is missing from the page state.
    """
    with open(html_file, encoding="utf-8") as f:
        text_content = f.read()

    m_code = _CONFIRMATION_RE.search(text_content)
    if m_code is None:
        raise ValueError(f"{html_file}: no reservation confirmation code found")
    confirmation_code = m_code.group(1)

    tree = lxml.html.parse(html_file)
    try:
        ui_state = get_ui_state(tree)
    except Exception:
        # Name the offending file on stderr (stdout carries the YAML output)
        # and let the original error propagate.
        print(html_file, file=sys.stderr)
        raise

    reservation = get_reservation_data(ui_state)
    metadata = ui_state["reservation"]["metadata"]

    guests_subtitle = reservation["guests"]["subtitle"]
    m_guests = _GUESTS_RE.match(guests_subtitle)
    if m_guests is None:
        raise ValueError(
            f"{html_file}: unexpected guests subtitle: {guests_subtitle!r}"
        )
    # The page only gives a total guest count; it is reported as adults.
    number_of_adults = int(m_guests.group(1))

    price = get_price_from_reservation(reservation)

    # The location is whatever follows the last " in " of the title; if the
    # title has no " in ", the whole title is used.
    title = reservation["dynamic_marquee_title_image_v3"]["title"]
    location = title.rpartition(" in ")[2]

    checkin_checkout = reservation["checkin_checkout_arrival_guide"]
    timezone = metadata["timezone"]
    check_in = build_datetime(
        metadata["check_in_date"], checkin_checkout["leading_subtitle"], timezone
    )
    check_out = build_datetime(
        metadata["check_out_date"], checkin_checkout["trailing_subtitle"], timezone
    )

    if "header_action.pdp" in reservation:
        name = reservation["header_action.pdp"]["subtitle"]
    else:
        # Fall back to the text of the page's first <h1>.
        name = tree.getroot().findtext(".//h1")

    booking: StrDict = {
        "type": "apartment",
        "operator": "airbnb",
        "name": name,
        "location": location,
        "booking_reference": confirmation_code,
        "booking_url": f"{_BASE_URL}{_RESERVATION_PATH}{confirmation_code}",
        "address": reservation["map"]["address"],
        "country": metadata["country"].lower(),
        "latitude": metadata["lat"],
        "longitude": metadata["lng"],
        "timezone": timezone,
        "from": check_in,
        "to": check_out,
        "price": price,
        "currency": "GBP",
        "number_of_adults": number_of_adults,
    }

    # May be None when the page has no link to the listing.
    booking["url"] = get_room_url(tree)

    return booking


def walk_tree(data: Any, want_key: str) -> Any:
    """Recursively search nested dicts and lists for a dict containing
    want_key and return its value.

    The search is depth-first. A dict's own keys are checked before its
    values are descended into. None is returned when the key is not found.
    """
    if isinstance(data, dict):
        if want_key in data:
            return data[want_key]
        children = data.values()
    elif isinstance(data, list):
        children = data
    else:
        return None

    for child in children:
        result = walk_tree(child, want_key)
        if result is not None:
            return result
    return None


def main() -> None:
    """Parse every HTML file named on the command line and print the
    bookings as YAML, in sorted filename order."""
    filenames = sorted(sys.argv[1:])
    bookings = [extract_booking_from_html(html_file) for html_file in filenames]

    print(yaml.dump(bookings, sort_keys=False))


if __name__ == "__main__":
    main()