conference-check/check.py

#!/usr/bin/python3
"""Check if conference websites are live."""
import os
import re
import smtplib
import warnings
from datetime import date
from email.mime.text import MIMEText
from email.utils import formatdate, make_msgid
from urllib.parse import urlparse, urlunparse

import requests
import yaml
from requests.adapters import HTTPAdapter
from urllib3.exceptions import InsecureRequestWarning
from urllib3.util.url import parse_url

from conference import LiveConference, config, load_yaml


class AbsoluteDNSAdapter(HTTPAdapter):
    """A custom adapter for requests to ensure hostnames are treated as absolute."""

    def add_dot_to_hostname(self, url: str) -> str:
        """Append a dot to the hostname to treat it as an absolute domain name."""
        parsed_url = parse_url(url)

        # Append a dot to the hostname if it's not already there.
        hostname = parsed_url.host
        assert hostname
        if not hostname.endswith("."):
            hostname += "."

        # Reconstruct the URL with the modified hostname.
        new_url: str = parsed_url._replace(host=hostname).url
        return new_url

    def send(self, request, **kwargs):  # type: ignore
        """Override the send method to modify the request URL before sending."""
        # Modify the request URL to ensure the hostname is treated as absolute.
        request.url = self.add_dot_to_hostname(request.url)
        return super().send(request, **kwargs)
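
# Illustrative sketch (hypothetical URL) of what the adapter does to each request:
#   AbsoluteDNSAdapter().add_dot_to_hostname("https://2024.example.org/path")
#   -> "https://2024.example.org./path"
# The trailing dot marks the hostname as fully qualified, so the resolver looks it
# up as-is instead of appending any local DNS search domains.
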
# Suppress only the single InsecureRequestWarning from urllib3
warnings.filterwarnings("ignore", category=InsecureRequestWarning)

re_title = re.compile("<title>(.*?)</title>", re.DOTALL)

AGENT = config["browser"]["User-Agent"]

headers = {"User-Agent": AGENT, "Accept": "text/html"}
s = requests.Session()
s.headers.update(headers)

# Create a session and mount the custom adapter for both HTTP and HTTPS requests.
adapter = AbsoluteDNSAdapter()
s.mount("http://", adapter)
s.mount("https://", adapter)

not_here_list = [
    "The specified URL was not found.",
    "There is currently no text in this page.",
    "This page does not exist yet",
    "404 Not Found",
    "500 Internal Server Error",
    "Test Page for the Apache HTTP Server",
    "Site not found &middot; GitHub Pages",
    "504: Gateway time-out",
    "504 Gateway Time-out",
    "502 Bad Gateway",
    "This page doesn't exist (404)",
    "Coming soon",
    "NOT_FOUND",
    "Resource Not Found",
    "Wikimedia Error",
    "The page you requested could not be found",
    "Ooops! Could Not Find It",
    "OpenStreetMap Authentication Proxy",
    "Error 404",
    "Under Construction",
    "Page not found",
    "Error 404: Page not found",
    "Barcamptools",
    "That page cant be found.",
    "looks like there's no page here",
    "404 page",
    "Database Error",
    "You are not authorized to access this page",
    "Attention Required! | Cloudflare",
]


def find_not_here_message(html: str) -> str | None:
    """Find not here message in web page."""
    return next((not_here for not_here in not_here_list if not_here in html), None)


def get_title(html: str) -> str:
    """Title from web page."""
    m = re_title.search(html)
    return m.group(1).strip() if m and m.group(1) else "no title"


def normalize_url(url: str) -> str:
    """
    Normalize the URL by parsing and reconstructing it to ensure uniformity.

    This handles differences in domain casing and the trailing dot that
    AbsoluteDNSAdapter appends to hostnames.
    """
    # Parse the URL into components
    parsed_url = urlparse(url)

    # Normalize the domain to lowercase and remove any trailing dot
    normalized_netloc = parsed_url.netloc.lower().rstrip(".")

    # Reconstruct the URL with normalized components
    return urlunparse(parsed_url._replace(netloc=normalized_netloc))
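
# Illustrative example (hypothetical URL): the domain is lowercased and the
# trailing dot removed, so both spellings compare equal.
#   normalize_url("https://2024.Example.ORG./")  ->  "https://2024.example.org/"
#   normalize_url("https://2024.example.org/")   ->  "https://2024.example.org/"
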


def check_conference(
    name: str, src_url: str, year: int
) -> tuple[bool, str, str | None]:
    """Check if conference is live."""
    url = src_url.format(year=year)
    past_url = src_url.format(year=year - 1)

    try:
        # SotM Baltics has an invalid TLS certificate, but we don't care
        r = s.get(url, verify=False)
    except requests.exceptions.ConnectionError:
        return (False, "connection refused", None)

    not_here = find_not_here_message(r.text)

    if (
        len(r.text) < 2048
        and 'http-equiv="refresh"' in r.text
        and str(year) not in r.text
    ):
        return (False, "redirect to URL without year", r.url)

    if normalize_url(r.url) == normalize_url(past_url):
        return (False, "redirect to previous year", r.url)

    return (False, not_here, r.url) if not_here else (True, get_title(r.text), r.url)
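
# For reference, the tuple returned above is (live, message, final URL); a
# placeholder page might look like (False, "Coming soon", <final URL>) and a live
# site like (True, <page title>, <final URL>).  The example values are hypothetical.
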


def send_mail(subject: str, body: str) -> None:
    """Send an e-mail."""
    mail_from_address = config["mail"]["from_address"]
    mail_from_name = config["mail"]["from_name"]
    mail_to_address = config["mail"]["to_address"]
    mail_to_name = config["mail"]["to_name"]

    msg = MIMEText(body, "plain", "UTF-8")
    msg["Subject"] = subject
    msg["To"] = f"{mail_to_name} <{mail_to_address}>"
    msg["From"] = f"{mail_from_name} <{mail_from_address}>"
    msg["Date"] = formatdate()
    msg["Message-ID"] = make_msgid()

    # extra mail headers from config
    for header_name, value in config["mail_headers"].items():
        msg[header_name] = value

    s = smtplib.SMTP(config["mail"]["smtp_host"])
    s.sendmail(mail_from_address, [mail_to_address], msg.as_string())
    s.quit()
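
# The keys read above come from the shared config in the conference module.  A
# minimal sketch of the assumed layout, shown as YAML purely for illustration
# (key names are taken from this file, the values are placeholders):
#
#   mail:
#     from_address: bot@example.org
#     from_name: Conference check
#     to_address: you@example.org
#     to_name: Your Name
#     smtp_host: localhost
#   mail_headers:
#     X-Mailer: conference-check
#   browser:
#     User-Agent: "Mozilla/5.0 (compatible; conference-check)"
#   data:
#     live: ~/conference-check/live.yaml
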


def check_conference_web_site(name: str, src_url: str, year: int) -> bool:
    """Check if an individual web site is live."""
    assert "{year}" in src_url
    live, msg, redirect_to_url = check_conference(name, src_url, year)
    url = src_url.format(year=year)

    if live:
        if redirect_to_url == url:
            body = f"{name}\n{url}\nWeb page title: {msg}"
        else:
            body = f"""{name}
{url} redirects to {redirect_to_url}
Web page title: {msg}"""
        send_mail(f"Conference site live: {name} - {year}", body)

    return live


def find_new_conference_web_sites(
    today: date, live: list[LiveConference]
) -> list[LiveConference]:
    """Check for new conference web sites going live."""
    this_year = today.year
    new: list[LiveConference] = []
    live_set = {(c["conference"], c["year"]) for c in live}

    for name, src_url in load_yaml("conferences").items():
        new += [
            {"conference": name, "year": year, "live": today}
            for year in (this_year, this_year + 1)
            if (name, year) not in live_set
            and check_conference_web_site(name, src_url, year)
        ]

    return new
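
# load_yaml("conferences") is expected to map a conference name to a URL template
# containing a "{year}" placeholder (enforced by the assert in
# check_conference_web_site).  A hypothetical entry might look like:
#
#   FOSS Example Conf: "https://{year}.example.org/"
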


def main(show_not_live: bool = False) -> None:
    """Check for new conference web sites."""
    live: list[LiveConference] = load_yaml("live")
    if not (new := find_new_conference_web_sites(date.today(), live)):
        return

    live_filename = os.path.expanduser(config["data"]["live"])
    with open(live_filename, "w") as out:
        yaml.dump(live + new, stream=out, sort_keys=False)


if __name__ == "__main__":
    main()