Compare commits

..

2 commits

Author SHA1 Message Date
fee54a37e7 User cloudscraper 2025-01-11 16:43:51 +00:00
35c213110d Add more strings to not_here_list. 2024-11-01 09:23:18 +00:00
3 changed files with 47 additions and 182 deletions

227
check.py
View file

@ -6,15 +6,13 @@ import html
import os import os
import re import re
import smtplib import smtplib
import sys
import warnings import warnings
from dataclasses import dataclass
from datetime import date from datetime import date
from email.mime.text import MIMEText from email.mime.text import MIMEText
from email.utils import formatdate, make_msgid from email.utils import formatdate, make_msgid
from urllib.parse import urlparse, urlunparse from urllib.parse import urlparse, urlunparse
import cloudscraper # type: ignore import cloudscraper
import requests import requests
import yaml import yaml
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
@ -23,8 +21,6 @@ from urllib3.util.url import parse_url
from conference import LiveConference, config, load_yaml from conference import LiveConference, config, load_yaml
IS_TTY = sys.stdout.isatty()
class AbsoluteDNSAdapter(HTTPAdapter): class AbsoluteDNSAdapter(HTTPAdapter):
"""A custom adapter for requests to ensure hostnames are treated as absolute.""" """A custom adapter for requests to ensure hostnames are treated as absolute."""
@ -80,7 +76,6 @@ not_here_list = [
"This page does not exist yet", "This page does not exist yet",
"404 Not Found", "404 Not Found",
"500 Internal Server Error", "500 Internal Server Error",
"500: Internal Server Error",
"Test Page for the Apache HTTP Server", "Test Page for the Apache HTTP Server",
"Site not found · GitHub Pages", "Site not found · GitHub Pages",
"504: Gateway time-out", "504: Gateway time-out",
@ -108,17 +103,9 @@ not_here_list = [
"This page doesn't currently exist", "This page doesn't currently exist",
"ERROR 503 - Service Unavailable", "ERROR 503 - Service Unavailable",
"ERROR 503", "ERROR 503",
"503 Service Unavailable",
"401 Authorization Required", "401 Authorization Required",
"Authorization Required", "Authorization Required",
"used Cloudflare to restrict access", "used Cloudflare to restrict access",
"Error 1014",
"CNAME Cross-User Banned",
"looks like there's no page here",
"404 page cant be found either",
"503 self-signed certificate",
"504 Gateway Timeout",
"<h2>Pages</h2>",
] ]
@ -148,133 +135,31 @@ def normalize_url(url: str) -> str:
return urlunparse(parsed_url._replace(netloc=normalized_netloc)) return urlunparse(parsed_url._replace(netloc=normalized_netloc))
def url_to_filename(url: str) -> str: def check_conference(
""" name: str, src_url: str, year: int
Convert a URL to a valid filename by replacing invalid characters with underscores. ) -> tuple[bool, str, str | None]:
"""Check if conference is live."""
url = src_url.format(year=year)
past_url = src_url.format(year=year - 1)
no_dot = {"bsideskbh.dk", "pif.camp"}
session = s if all(hostname not in url for hostname in no_dot) else s_no_dot
try:
r = session.get(url)
except requests.exceptions.ConnectionError:
return (False, "connection refused", None)
Args: not_here = find_not_here_message(r.text)
url (str): The URL to be converted. if (
len(r.text) < 2048
and 'http-equiv="refresh"' in r.text
and str(year) not in r.text
):
return (False, "redirect to URL without year", r.url)
Returns: if normalize_url(r.url) == normalize_url(past_url):
str: A valid filename. return (False, "redirect to previous year", r.url)
"""
# Replace invalid characters with underscores
return re.sub(r'[\/:*?"<>|]', "_", url)
return (False, not_here, r.url) if not_here else (True, get_title(r.text), r.url)
def parse_opengraph_tags(html: str) -> dict[str, str]:
    """
    Locate Open Graph meta tags, and return them as a dictionary.

    Keys will match the name following 'og:', e.g. 'title', 'image', 'url', 'site_name'.
    Values have HTML character references decoded, so ``content="Rock &amp; Roll"``
    is returned as ``"Rock & Roll"``.

    Only tags written as ``<meta property="og:..." content="...">`` (double
    quotes, ``property`` immediately followed by ``content``) are recognised;
    matching is case-insensitive. If a property occurs more than once, the
    last occurrence wins.

    Args:
        html (str): The page markup to scan.

    Returns:
        dict[str, str]: Mapping of Open Graph property name to decoded content.
    """
    # The ``html`` parameter shadows the standard-library module of the same
    # name inside this function, so import the one helper needed directly.
    from html import unescape

    pattern = re.compile(
        r'<meta\s+property="og:([^"]+)"\s+content="([^"]+)"\s*/?>', flags=re.IGNORECASE
    )
    # Attribute values are HTML-escaped in the markup; decode them so that
    # callers get the text a browser would see.
    return {prop: unescape(content) for prop, content in pattern.findall(html)}
@dataclass
class Conference:
    """One edition (name + year) of a conference whose web site is probed.

    ``src_url`` is a ``str.format`` template; it is formatted with the
    keyword arguments ``year`` and ``two_digit_year``.
    """

    name: str
    src_url: str
    year: int
    # Response of the last request that completed in check(), if any.
    response: requests.models.Response | None = None
    # Final URL after redirects; set by check_web_site() only when it differs
    # (after normalisation) from ``url``.
    redirect_to_url: str | None = None

    @property
    def url(self) -> str:
        """Conference URL."""
        return self.build_url(year=self.year)

    def build_url(self, year: int) -> str:
        """Build conference URL for the given year from the ``src_url`` template."""
        # ``% 100`` yields the last two digits for any century; the previous
        # ``% 2000`` only did so for 2000-2099.
        return self.src_url.format(year=year, two_digit_year=year % 100)

    @property
    def past_url(self) -> str:
        """URL for previous year."""
        return self.build_url(year=self.year - 1)

    def check(self) -> tuple[bool, str, str | None]:
        """Check if conference is live.

        Returns:
            A ``(live, message, final_url)`` tuple. When ``live`` is false,
            ``message`` is the reason; when true, it is the result of
            ``get_title`` on the page. ``final_url`` is the response URL, or
            ``None`` when the connection failed.
        """
        # Hosts listed here are fetched with ``s_no_dot`` instead of ``s``
        # (both sessions are defined elsewhere in this module).
        no_dot = {"bsideskbh.dk", "pif.camp"}
        url = self.url
        session = s_no_dot if any(hostname in url for hostname in no_dot) else s
        try:
            r = session.get(url)
        except requests.exceptions.ConnectionError:
            return (False, "connection refused", None)

        self.response = r

        if r.url.endswith(("404.html", "404.htm")):
            return (False, "URL ends with 404.html/404.htm", r.url)

        if not r.text:
            return (False, "empty response", r.url)

        not_here = find_not_here_message(r.text)

        # A short page that only meta-refreshes elsewhere and never mentions
        # the year is treated as a redirect to a year-less URL.
        if (
            len(r.text) < 2048
            and 'http-equiv="refresh"' in r.text
            and str(self.year) not in r.text
        ):
            return (False, "redirect to URL without year", r.url)

        if normalize_url(r.url) == normalize_url(self.past_url):
            return (False, "redirect to previous year", r.url)

        if not_here:
            return (False, not_here, r.url)

        return (True, get_title(r.text), r.url)

    def og_tags(self) -> dict[str, str]:
        """Open Graph tags of the stored response, or an empty dict if there is none."""
        return parse_opengraph_tags(self.response.text) if self.response else {}

    def check_web_site(self) -> bool:
        """Check if an individual web site is live.

        Sends a notification e-mail via ``send_mail`` when the site is live
        and prints progress when stdout is a TTY.

        Returns:
            ``True`` if the site is live (and the mail was sent), else ``False``.
        """
        assert "{year}" in self.src_url or "{two_digit_year}" in self.src_url

        if IS_TTY:
            print(f"Checking {self.name} {self.year}: {self.url}")

        live, msg, redirect_to_url = self.check()
        if not live:
            if IS_TTY:
                print(f" Not live: {msg}")
            return False

        if IS_TTY:
            print(f" Live! Title: {msg}")

        og = "".join(f"\n{key}: {value}" for key, value in self.og_tags().items())
        if og:
            og = "\n\nOpen Graph\n\n" + og

        if redirect_to_url and normalize_url(redirect_to_url) == normalize_url(
            self.url
        ):
            body = f"{self.name}\n{self.url}\n"
        else:
            # Remember the differing final URL so callers can record it.
            self.redirect_to_url = redirect_to_url
            body = f"{self.name}\n{self.url} redirects to {redirect_to_url}\n"

        body += f"Web page title: {msg}{og}"
        send_mail(f"Conference site live: {self.name} - {self.year}", body)

        if IS_TTY:
            print(" Email sent")

        return True
def send_mail(subject: str, body: str) -> None: def send_mail(subject: str, body: str) -> None:
@ -300,6 +185,23 @@ def send_mail(subject: str, body: str) -> None:
s.quit() s.quit()
def check_conference_web_site(name: str, src_url: str, year: int) -> bool:
    """Probe one conference/year and e-mail a notification when it is live.

    ``src_url`` must contain a ``{year}`` placeholder. Returns the liveness
    flag reported by ``check_conference``.
    """
    assert "{year}" in src_url

    is_live, title, final_url = check_conference(name, src_url, year)
    url = src_url.format(year=year)
    if not is_live:
        return is_live

    # Mention the redirect only when the final URL really differs from the
    # one that was requested (compared after normalisation).
    if final_url and normalize_url(final_url) == normalize_url(url):
        location = url
    else:
        location = f"{url} redirects to {final_url}"

    body = "\n".join((name, location, f"Web page title: {title}"))
    send_mail(f"Conference site live: {name} - {year}", body)
    return is_live
def find_new_conference_web_sites( def find_new_conference_web_sites(
today: date, live: list[LiveConference] today: date, live: list[LiveConference]
) -> list[LiveConference]: ) -> list[LiveConference]:
@ -310,59 +212,24 @@ def find_new_conference_web_sites(
live_set = {(c["conference"], c["year"]) for c in live} live_set = {(c["conference"], c["year"]) for c in live}
for name, src_url in load_yaml("conferences").items(): for name, src_url in load_yaml("conferences").items():
for year in (this_year, this_year + 1): new += [
if (name, year) in live_set: {"conference": name, "year": year, "live": today}
continue for year in (this_year, this_year + 1)
conf = Conference(name, src_url, year) if (name, year) not in live_set
if not conf.check_web_site(): and check_conference_web_site(name, src_url, year)
continue ]
c: LiveConference = {
"conference": name,
"year": year,
"live": today,
"url": conf.url,
}
if conf.redirect_to_url:
c["redirect_to_url"] = conf.redirect_to_url
new.append(c)
return new return new
class NoAliasDumper(yaml.SafeDumper):
    """SafeDumper variant with YAML anchors and aliases switched off."""

    def ignore_aliases(self, data: object) -> bool:
        """Report every value as alias-free, regardless of ``data``."""
        return True
def main(show_not_live: bool = False) -> None: def main(show_not_live: bool = False) -> None:
"""Check fow new conference web sites.""" """Check fow new conference web sites."""
if IS_TTY:
print("Loading existing live conferences...")
live: list[LiveConference] = load_yaml("live") live: list[LiveConference] = load_yaml("live")
if IS_TTY:
print(f"Found {len(live)} existing live conferences")
print("\nChecking for new conference websites...")
if not (new := find_new_conference_web_sites(date.today(), live)): if not (new := find_new_conference_web_sites(date.today(), live)):
if IS_TTY:
print("\nNo new conference websites found")
return return
if IS_TTY:
print(f"\n{len(new)} new conference(s) found! Updating live.yaml...")
live_filename = os.path.expanduser(config["data"]["live"]) live_filename = os.path.expanduser(config["data"]["live"])
with open(live_filename, "w") as out: with open(live_filename, "w") as out:
yaml.dump( yaml.dump(live + new, stream=out, sort_keys=False)
live + new,
stream=out,
sort_keys=False,
Dumper=NoAliasDumper,
allow_unicode=True,
default_flow_style=False,
)
if IS_TTY:
print(f"Updated {live_filename}")
if __name__ == "__main__": if __name__ == "__main__":

View file

@ -28,7 +28,6 @@ class LiveConference(typing.TypedDict, total=False):
year: int year: int
live: date live: date
url: str | None url: str | None
redirect_to_url: str | None
def load_yaml(name: str) -> typing.Any: def load_yaml(name: str) -> typing.Any:

View file

@ -1,2 +1 @@
requests requests
cloudscraper