From 28865b078363c1023fe272c8caa99b39c32a80a1 Mon Sep 17 00:00:00 2001
From: Edward Betts
Date: Sun, 13 Oct 2024 16:27:38 +0100
Subject: [PATCH] Detect and handle redirects

Closes: #6
---
 check.py | 36 ++++++++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/check.py b/check.py
index 79d9dc2..efcfa08 100755
--- a/check.py
+++ b/check.py
@@ -9,6 +9,7 @@ import warnings
 from datetime import date
 from email.mime.text import MIMEText
 from email.utils import formatdate, make_msgid
+from urllib.parse import urlparse, urlunparse
 
 import requests
 import yaml
@@ -105,8 +106,25 @@ def get_title(html: str) -> str:
     return m.group(1).strip() if m and m.group(1) else "no title"
 
 
-def check_conference(name: str, url: str) -> tuple[bool, str]:
+def normalize_url(url: str) -> str:
+    """
+    Normalize the URL so that equivalent host spellings compare equal.
+
+    Only the netloc is rewritten: it is lowercased and trailing dots are
+    stripped. Scheme differences and trailing slashes are not reconciled.
+    """
+    # Parse the URL into components
+    parsed_url = urlparse(url)
+    # Normalize the domain to lowercase and remove any trailing dot
+    normalized_netloc = parsed_url.netloc.lower().rstrip(".")
+    # Reconstruct the URL with normalized components
+    return urlunparse(parsed_url._replace(netloc=normalized_netloc))
+
+
+def check_conference(name: str, src_url: str, year: int) -> tuple[bool, str]:
     """Check if conference is live."""
+    url = src_url.format(year=year)
+    past_url = src_url.format(year=year - 1)
     try:
         # SotM Baltics has an invalid TLS certificate, but we don't care
         r = s.get(url, verify=False)
@@ -114,6 +132,19 @@ def check_conference(name: str, url: str) -> tuple[bool, str]:
         return (False, "connection refused")
 
     not_here = find_not_here_message(r.text)
+    if (
+        len(r.text) < 2048
+        and 'http-equiv="refresh"' in r.text
+        and str(year) not in r.text
+    ):
+        return (False, "redirect to URL without year")
+
+    # Test last year's URL first: it would otherwise be caught by the year check.
+    if normalize_url(r.url) == normalize_url(past_url):
+        return (False, "redirect to previous year")
+    if str(year) not in r.url:
+        return (False, "redirect to URL without year")
+
     return (False, not_here) if not_here else (True, get_title(r.text))
 
 
@@ -143,7 +174,8 @@ def send_mail(subject: str, body: str) -> None:
 def check_conference_web_site(name: str, src_url: str, year: int) -> bool:
     """Check if an individual web site is live."""
     assert "{year}" in src_url
-    live, msg = check_conference(name, url := src_url.format(year=year))
+    live, msg = check_conference(name, src_url, year)
+    url = src_url.format(year=year)
     if live:
         body = f"{name}\n{url}\nWeb page title: {msg}"
         send_mail(f"Conference site live: {name} - {year}", body)