Compare commits

...

3 commits

Author SHA1 Message Date
Edward Betts 28865b0783 Detect and handle redirects
Closes: #6
2024-10-13 16:27:38 +01:00
Edward Betts f49741ddbc Use dotenv 2024-10-13 16:27:19 +01:00
Edward Betts ca933191f5 More 'not live' strings 2024-10-13 16:27:01 +01:00
2 changed files with 40 additions and 2 deletions

View file

@ -9,6 +9,7 @@ import warnings
from datetime import date
from email.mime.text import MIMEText
from email.utils import formatdate, make_msgid
from urllib.parse import urlparse, urlunparse
import requests
import yaml
@ -70,6 +71,7 @@ not_here_list = [
"Test Page for the Apache HTTP Server",
"Site not found · GitHub Pages",
"504: Gateway time-out",
"504 Gateway Time-out",
"502 Bad Gateway",
"This page doesn't exist (404)",
"Coming soon",
@ -88,6 +90,8 @@ not_here_list = [
"looks like there's no page here",
"404 page",
"Database Error",
"You are not authorized to access this page",
"Attention Required! | Cloudflare",
]
@ -102,8 +106,25 @@ def get_title(html: str) -> str:
return m.group(1).strip() if m and m.group(1) else "no title"
def check_conference(name: str, url: str) -> tuple[bool, str]:
def normalize_url(url: str) -> str:
    """
    Normalize a URL so that trivially different spellings compare equal.

    The network location is lowercased and a trailing dot after it is
    removed, and trailing slashes are stripped from the path, so
    ``https://Example.COM./2024/`` and ``https://example.com/2024`` give
    the same result. ``urlparse`` already lowercases the scheme; ``http``
    and ``https`` are still treated as different URLs.

    The result is meant for equality comparison between URLs.
    """
    # Parse the URL into components
    parsed_url = urlparse(url)

    # Lowercase the network location and drop the root-zone trailing dot
    normalized_netloc = parsed_url.netloc.lower().rstrip(".")

    # Fold "/2024/" into "/2024" (and "/" into "") so that a redirect which
    # only adds or removes a trailing slash still compares equal
    normalized_path = parsed_url.path.rstrip("/")

    # Reconstruct the URL with normalized components
    return urlunparse(
        parsed_url._replace(netloc=normalized_netloc, path=normalized_path)
    )
def check_conference(name: str, src_url: str, year: int) -> tuple[bool, str]:
"""Check if conference is live."""
url = src_url.format(year=year)
past_url = src_url.format(year=year - 1)
try:
# SotM Baltics has an invalid TLS certificate, but we don't care
r = s.get(url, verify=False)
@ -111,6 +132,19 @@ def check_conference(name: str, url: str) -> tuple[bool, str]:
return (False, "connection refused")
not_here = find_not_here_message(r.text)
if (
len(r.text) < 2048
and 'http-equiv="refresh"' in r.text
and str(year) not in r.text
):
return (False, "redirect to URL without year")
if str(year) not in r.url:
return (False, "redirect to URL without year")
if normalize_url(r.url) == normalize_url(past_url):
return (False, "redirect to previous year")
return (False, not_here) if not_here else (True, get_title(r.text))
@ -140,7 +174,8 @@ def send_mail(subject: str, body: str) -> None:
def check_conference_web_site(name: str, src_url: str, year: int) -> bool:
"""Check if an individual web site is live."""
assert "{year}" in src_url
live, msg = check_conference(name, url := src_url.format(year=year))
live, msg = check_conference(name, src_url, year)
url = src_url.format(year=year)
if live:
body = f"{name}\n{url}\nWeb page title: {msg}"
send_mail(f"Conference site live: {name} - {year}", body)

View file

@ -6,6 +6,9 @@ import typing
from datetime import date
import yaml
from dotenv import load_dotenv
load_dotenv()
config_file_path = os.path.expanduser(
os.path.join(