#!/usr/bin/python3
"""Check if conference websites are live."""

import os
import re
import smtplib
import warnings
from datetime import date
from email.mime.text import MIMEText
from email.utils import formatdate, make_msgid
from urllib.parse import urlparse, urlunparse

import requests
import yaml
from requests.adapters import HTTPAdapter
from urllib3.exceptions import InsecureRequestWarning
from urllib3.util.url import parse_url

from conference import LiveConference, config, load_yaml


class AbsoluteDNSAdapter(HTTPAdapter):
    """A custom adapter for requests to ensure hostnames are treated as absolute.

    Every outgoing request has a trailing dot appended to its hostname
    (``example.com`` becomes ``example.com.``) before it is handed to the
    stock ``HTTPAdapter``. Note that the rewritten URL is what the adapter
    sends, so code comparing URLs afterwards should ignore the trailing dot.
    """

    def add_dot_to_hostname(self, url: str) -> str:
        """Append a dot to the hostname to treat it as an absolute domain name.

        Returns the URL unchanged apart from the hostname, which is left alone
        if it already ends with a dot. The URL must contain a hostname.
        """
        parsed_url = parse_url(url)

        # Append a dot to the hostname if it's not already there.
        hostname = parsed_url.host
        assert hostname
        if not hostname.endswith("."):
            hostname += "."

        # Reconstruct the URL with the modified hostname.
        new_url: str = parsed_url._replace(host=hostname).url
        return new_url

    def send(self, request, **kwargs):  # type: ignore
        """Override the send method to modify the request URL before sending.

        All keyword arguments are passed through untouched to
        ``HTTPAdapter.send``.
        """
        # Modify the request URL to ensure the hostname is treated as absolute.
        request.url = self.add_dot_to_hostname(request.url)
        return super().send(request, **kwargs)


# Suppress only the single InsecureRequestWarning from urllib3
warnings.filterwarnings("ignore", category=InsecureRequestWarning)

# Capture the contents of the HTML <title> element. The explicit delimiters
# matter: a bare lazy group would always match the empty string at position 0.
# DOTALL lets the title span several lines; IGNORECASE accepts <TITLE>.
re_title = re.compile(r"<title\b[^>]*>(.*?)</title>", re.DOTALL | re.IGNORECASE)

AGENT = config["browser"]["User-Agent"]
headers = {"User-Agent": AGENT, "Accept": "text/html"}

s = requests.Session()
s.headers.update(headers)

# Create a session and mount the custom adapter for both HTTP and HTTPS requests.
adapter = AbsoluteDNSAdapter()
s.mount("http://", adapter)
s.mount("https://", adapter)

# Seconds to wait for a web server before giving up on it. Without a timeout
# a single unresponsive site would stall the whole run indefinitely.
REQUEST_TIMEOUT = 30

# Phrases that indicate a placeholder or error page rather than a real
# conference site. Matching is a plain case-sensitive substring test.
not_here_list = [
    "The specified URL was not found.",
    "There is currently no text in this page.",
    "This page does not exist yet",
    "404 Not Found",
    "500 Internal Server Error",
    "Test Page for the Apache HTTP Server",
    "Site not found · GitHub Pages",
    "504: Gateway time-out",
    "504 Gateway Time-out",
    "502 Bad Gateway",
    "This page doesn't exist (404)",
    "Coming soon",
    "NOT_FOUND",
    "Resource Not Found",
    "Wikimedia Error",
    "The page you requested could not be found",
    "Ooops! Could Not Find It",
    "OpenStreetMap Authentication Proxy",
    "Error 404",
    "Under Construction",
    "Page not found",
    "Error 404: Page not found",
    "Barcamptools",
    "That page can’t be found.",
    "looks like there's no page here",
    "404 page",
    "Database Error",
    "You are not authorized to access this page",
    "Attention Required! | Cloudflare",
]


def find_not_here_message(html: str) -> str | None:
    """Return the first entry of ``not_here_list`` found in *html*, else None."""
    return next((not_here for not_here in not_here_list if not_here in html), None)


def get_title(html: str) -> str:
    """Return the stripped page title, or ``"no title"`` if none is found."""
    m = re_title.search(html)
    return m.group(1).strip() if m and m.group(1) else "no title"


def normalize_url(url: str) -> str:
    """
    Normalize a URL so that equivalent host spellings compare equal.

    The network location is lowercased and any trailing dots are removed from
    it; every other component (scheme, path, query, ...) is left untouched.
    """
    parsed_url = urlparse(url)

    # The trailing dot is what AbsoluteDNSAdapter adds to outgoing requests.
    normalized_netloc = parsed_url.netloc.lower().rstrip(".")

    return urlunparse(parsed_url._replace(netloc=normalized_netloc))


def check_conference(
    name: str, src_url: str, year: int
) -> tuple[bool, str, str | None]:
    """Check if conference is live.

    *src_url* is a template containing ``{year}``; it is fetched with *year*
    substituted. *name* is accepted for symmetry with the callers but is not
    used here.

    Returns ``(live, message, final_url)``. When *live* is true, *message* is
    the page title. Otherwise it is the reason the site was rejected, and
    *final_url* is None if no response was received at all.
    """
    url = src_url.format(year=year)
    past_url = src_url.format(year=year - 1)
    try:
        # SotM Baltics has an invalid TLS certificate, but we don't care
        r = s.get(url, verify=False, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.ConnectionError:
        # Also covers connect timeouts, which subclass ConnectionError.
        return (False, "connection refused", None)
    except requests.exceptions.Timeout:
        # The server accepted the connection but never finished answering.
        return (False, "timeout", None)

    not_here = find_not_here_message(r.text)

    # A tiny page that only meta-refreshes elsewhere and never mentions the
    # year is treated as a generic landing page, not this year's site.
    if (
        len(r.text) < 2048
        and 'http-equiv="refresh"' in r.text
        and str(year) not in r.text
    ):
        return (False, "redirect to URL without year", r.url)

    if normalize_url(r.url) == normalize_url(past_url):
        return (False, "redirect to previous year", r.url)

    if not_here:
        return (False, not_here, r.url)
    return (True, get_title(r.text), r.url)


def send_mail(subject: str, body: str) -> None:
    """Send a plain-text e-mail using the addresses and SMTP host in config."""
    mail_from_address = config["mail"]["from_address"]
    mail_from_name = config["mail"]["from_name"]
    mail_to_address = config["mail"]["to_address"]
    mail_to_name = config["mail"]["to_name"]

    msg = MIMEText(body, "plain", "UTF-8")

    msg["Subject"] = subject
    msg["To"] = f"{mail_to_name} <{mail_to_address}>"
    msg["From"] = f"{mail_from_name} <{mail_from_address}>"
    msg["Date"] = formatdate()
    msg["Message-ID"] = make_msgid()

    # extra mail headers from config
    for header_name, value in config["mail_headers"].items():
        msg[header_name] = value

    # The context manager closes the connection even if sending fails.
    with smtplib.SMTP(config["mail"]["smtp_host"]) as smtp:
        smtp.sendmail(mail_from_address, [mail_to_address], msg.as_string())


def check_conference_web_site(name: str, src_url: str, year: int) -> bool:
    """Check if an individual web site is live and send a mail if it is.

    Returns True when the site for *year* is live.
    """
    assert "{year}" in src_url
    live, msg, redirect_to_url = check_conference(name, src_url, year)
    if not live:
        return live

    url = src_url.format(year=year)
    # Compare normalized URLs: the response URL carries the trailing-dot
    # hostname added by AbsoluteDNSAdapter, so a raw comparison never matches.
    redirected = redirect_to_url is not None and normalize_url(
        redirect_to_url
    ) != normalize_url(url)

    if redirected:
        body = (
            f"{name}\n{url} redirects to {redirect_to_url}\nWeb page title: {msg}"
        )
    else:
        body = f"{name}\n{url}\nWeb page title: {msg}"
    send_mail(f"Conference site live: {name} - {year}", body)

    return live


def find_new_conference_web_sites(
    today: date, live: list[LiveConference]
) -> list[LiveConference]:
    """Check for new conference web sites going live.

    Every conference from the ``conferences`` YAML is checked for this year
    and next year, skipping (conference, year) pairs already present in
    *live*. Returns records for the sites that were found live.
    """
    this_year = today.year

    new: list[LiveConference] = []

    live_set = {(c["conference"], c["year"]) for c in live}
    for name, src_url in load_yaml("conferences").items():
        for year in (this_year, this_year + 1):
            if (name, year) in live_set:
                continue
            # Has side effects: fetches the site and mails a notification.
            if check_conference_web_site(name, src_url, year):
                new.append({"conference": name, "year": year, "live": today})
    return new


def main(show_not_live: bool = False) -> None:
    """Check for new conference web sites.

    Newly live sites are appended to the ``live`` YAML file; the file is left
    untouched when nothing new is found. *show_not_live* is currently unused.
    """
    live: list[LiveConference] = load_yaml("live")
    if not (new := find_new_conference_web_sites(date.today(), live)):
        return

    live_filename = os.path.expanduser(config["data"]["live"])
    with open(live_filename, "w") as out:
        yaml.dump(live + new, stream=out, sort_keys=False)


if __name__ == "__main__":
    main()