diff --git a/check.py b/check.py
index 62349ec..5d3bb74 100755
--- a/check.py
+++ b/check.py
@@ -6,15 +6,13 @@ import html
import os
import re
import smtplib
-import sys
import warnings
-from dataclasses import dataclass
from datetime import date
from email.mime.text import MIMEText
from email.utils import formatdate, make_msgid
from urllib.parse import urlparse, urlunparse
-import cloudscraper # type: ignore
+import cloudscraper
import requests
import yaml
from requests.adapters import HTTPAdapter
@@ -23,8 +21,6 @@ from urllib3.util.url import parse_url
from conference import LiveConference, config, load_yaml
-IS_TTY = sys.stdout.isatty()
-
class AbsoluteDNSAdapter(HTTPAdapter):
"""A custom adapter for requests to ensure hostnames are treated as absolute."""
@@ -80,7 +76,6 @@ not_here_list = [
"This page does not exist yet",
"404 Not Found",
"500 Internal Server Error",
- "500: Internal Server Error",
"Test Page for the Apache HTTP Server",
"Site not found · GitHub Pages",
"504: Gateway time-out",
@@ -108,17 +103,9 @@ not_here_list = [
"This page doesn't currently exist",
"ERROR 503 - Service Unavailable",
"ERROR 503",
- "503 Service Unavailable",
"401 Authorization Required",
"Authorization Required",
"used Cloudflare to restrict access",
- "Error 1014",
- "CNAME Cross-User Banned",
- "looks like there's no page here",
- "404 page can’t be found either",
- "503 self-signed certificate",
- "504 Gateway Timeout",
- "
Pages
",
]
@@ -148,133 +135,31 @@ def normalize_url(url: str) -> str:
return urlunparse(parsed_url._replace(netloc=normalized_netloc))
-def url_to_filename(url: str) -> str:
- """
- Convert a URL to a valid filename by replacing invalid characters with underscores.
+def check_conference(
+ name: str, src_url: str, year: int
+) -> tuple[bool, str, str | None]:
+ """Check if conference is live."""
+ url = src_url.format(year=year)
+ past_url = src_url.format(year=year - 1)
+ no_dot = {"bsideskbh.dk", "pif.camp"}
+ session = s if all(hostname not in url for hostname in no_dot) else s_no_dot
+ try:
+ r = session.get(url)
+ except requests.exceptions.ConnectionError:
+ return (False, "connection refused", None)
- Args:
- url (str): The URL to be converted.
+ not_here = find_not_here_message(r.text)
+ if (
+ len(r.text) < 2048
+ and 'http-equiv="refresh"' in r.text
+ and str(year) not in r.text
+ ):
+ return (False, "redirect to URL without year", r.url)
- Returns:
- str: A valid filename.
- """
- # Replace invalid characters with underscores
- return re.sub(r'[\/:*?"<>|]', "_", url)
+ if normalize_url(r.url) == normalize_url(past_url):
+ return (False, "redirect to previous year", r.url)
-
-def parse_opengraph_tags(html: str) -> dict[str, str]:
- """
- Locate Open Graph meta tags, and return them as a dictionary.
-
- Keys will match the name following 'og:', e.g. 'title', 'image', 'url', 'site_name'.
- """
- pattern = re.compile(
- r'', flags=re.IGNORECASE
- )
- matches = pattern.findall(html)
-
- og_tags = {}
- for prop, content in matches:
- og_tags[prop] = content
-
- return og_tags
-
-
-@dataclass
-class Conference:
- """Conference."""
-
- name: str
- src_url: str
- year: int
- response: requests.models.Response | None = None
- redirect_to_url: str | None = None
-
- @property
- def url(self) -> str:
- """Conference URL."""
- return self.build_url(year=self.year)
-
- def build_url(self, year: int) -> str:
- """Build conference URL."""
- return self.src_url.format(year=year, two_digit_year=year % 2000)
-
- @property
- def past_url(self) -> str:
- """URL for previous year."""
- return self.build_url(year=self.year - 1)
-
- def check(self) -> tuple[bool, str, str | None]:
- """Check if conference is live."""
- no_dot = {"bsideskbh.dk", "pif.camp"}
- url = self.url
- session = s if all(hostname not in url for hostname in no_dot) else s_no_dot
- try:
- r = session.get(url)
- except requests.exceptions.ConnectionError:
- return (False, "connection refused", None)
-
- self.response = r
-
- if r.url.endswith("404.html") or r.url.endswith("404.htm"):
- return (False, "URL ends with 404.html/404.htm", r.url)
-
- if not r.text:
- return (False, "empty response", r.url)
-
- not_here = find_not_here_message(r.text)
- if (
- len(r.text) < 2048
- and 'http-equiv="refresh"' in r.text
- and str(self.year) not in r.text
- ):
- return (False, "redirect to URL without year", r.url)
-
- if normalize_url(r.url) == normalize_url(self.past_url):
- return (False, "redirect to previous year", r.url)
-
- if not_here:
- return (False, not_here, r.url)
-
- return (True, get_title(r.text), r.url)
-
- def og_tags(self) -> dict[str, str]:
- """Open Graph tags."""
- return parse_opengraph_tags(self.response.text) if self.response else {}
-
- def check_web_site(self) -> bool:
- """Check if an individual web site is live."""
- assert "{year}" in self.src_url or "{two_digit_year}" in self.src_url
- if IS_TTY:
- print(f"Checking {self.name} {self.year}: {self.url}")
- live, msg, redirect_to_url = self.check()
- if not live:
- if IS_TTY:
- print(f" Not live: {msg}")
- return False
-
- if IS_TTY:
- print(f" Live! Title: {msg}")
-
- og = "".join(f"\n{key}: {value}" for key, value in self.og_tags().items())
-
- if og:
- og = "\n\nOpen Graph\n\n" + og
-
- if redirect_to_url and normalize_url(redirect_to_url) == normalize_url(
- self.url
- ):
- body = f"{self.name}\n{self.url}\n"
- else:
- self.redirect_to_url = redirect_to_url
- body = f"{self.name}\n{self.url} redirects to {redirect_to_url}\n"
-
- body += f"Web page title: {msg}{og}" ""
- send_mail(f"Conference site live: {self.name} - {self.year}", body)
- if IS_TTY:
- print(f" Email sent")
-
- return True
+ return (False, not_here, r.url) if not_here else (True, get_title(r.text), r.url)
def send_mail(subject: str, body: str) -> None:
@@ -300,6 +185,23 @@ def send_mail(subject: str, body: str) -> None:
s.quit()
+def check_conference_web_site(name: str, src_url: str, year: int) -> bool:
+ """Check if an individual web site is live."""
+ assert "{year}" in src_url
+ live, msg, redirect_to_url = check_conference(name, src_url, year)
+ url = src_url.format(year=year)
+ if live:
+ if redirect_to_url and normalize_url(redirect_to_url) == normalize_url(url):
+ body = f"{name}\n{url}\nWeb page title: {msg}"
+ else:
+ body = f"""{name}
+{url} redirects to {redirect_to_url}
+Web page title: {msg}"""
+ send_mail(f"Conference site live: {name} - {year}", body)
+
+ return live
+
+
def find_new_conference_web_sites(
today: date, live: list[LiveConference]
) -> list[LiveConference]:
@@ -310,59 +212,24 @@ def find_new_conference_web_sites(
live_set = {(c["conference"], c["year"]) for c in live}
for name, src_url in load_yaml("conferences").items():
- for year in (this_year, this_year + 1):
- if (name, year) in live_set:
- continue
- conf = Conference(name, src_url, year)
- if not conf.check_web_site():
- continue
- c: LiveConference = {
- "conference": name,
- "year": year,
- "live": today,
- "url": conf.url,
- }
- if conf.redirect_to_url:
- c["redirect_to_url"] = conf.redirect_to_url
- new.append(c)
+ new += [
+ {"conference": name, "year": year, "live": today}
+ for year in (this_year, this_year + 1)
+ if (name, year) not in live_set
+ and check_conference_web_site(name, src_url, year)
+ ]
return new
-class NoAliasDumper(yaml.SafeDumper):
- """Dumper that disables YAML anchors and aliases."""
-
- def ignore_aliases(self, data):
- """Skip alias generation."""
- return True # disables anchors and aliases
-
-
def main(show_not_live: bool = False) -> None:
"""Check fow new conference web sites."""
- if IS_TTY:
- print("Loading existing live conferences...")
live: list[LiveConference] = load_yaml("live")
- if IS_TTY:
- print(f"Found {len(live)} existing live conferences")
- print("\nChecking for new conference websites...")
if not (new := find_new_conference_web_sites(date.today(), live)):
- if IS_TTY:
- print("\nNo new conference websites found")
return
- if IS_TTY:
- print(f"\n{len(new)} new conference(s) found! Updating live.yaml...")
live_filename = os.path.expanduser(config["data"]["live"])
with open(live_filename, "w") as out:
- yaml.dump(
- live + new,
- stream=out,
- sort_keys=False,
- Dumper=NoAliasDumper,
- allow_unicode=True,
- default_flow_style=False,
- )
- if IS_TTY:
- print(f"Updated {live_filename}")
+ yaml.dump(live + new, stream=out, sort_keys=False)
if __name__ == "__main__":
diff --git a/conference/__init__.py b/conference/__init__.py
index e1d57a5..415e3d2 100644
--- a/conference/__init__.py
+++ b/conference/__init__.py
@@ -28,7 +28,6 @@ class LiveConference(typing.TypedDict, total=False):
year: int
live: date
url: str | None
- redirect_to_url: str | None
def load_yaml(name: str) -> typing.Any:
diff --git a/requirements.txt b/requirements.txt
index f7b8e8f..f229360 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1 @@
requests
-cloudscraper