Compare commits

..

2 commits

Author SHA1 Message Date
Edward Betts fee54a37e7 User cloudscraper 2025-01-11 16:43:51 +00:00
Edward Betts 35c213110d Add more strings to not_here_list. 2024-11-01 09:23:18 +00:00
3 changed files with 46 additions and 129 deletions

173
check.py
View file

@ -7,13 +7,12 @@ import os
import re import re
import smtplib import smtplib
import warnings import warnings
from dataclasses import dataclass
from datetime import date from datetime import date
from email.mime.text import MIMEText from email.mime.text import MIMEText
from email.utils import formatdate, make_msgid from email.utils import formatdate, make_msgid
from urllib.parse import urlparse, urlunparse from urllib.parse import urlparse, urlunparse
import cloudscraper # type: ignore import cloudscraper
import requests import requests
import yaml import yaml
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
@ -77,7 +76,6 @@ not_here_list = [
"This page does not exist yet", "This page does not exist yet",
"404 Not Found", "404 Not Found",
"500 Internal Server Error", "500 Internal Server Error",
"500: Internal Server Error",
"Test Page for the Apache HTTP Server", "Test Page for the Apache HTTP Server",
"Site not found · GitHub Pages", "Site not found · GitHub Pages",
"504: Gateway time-out", "504: Gateway time-out",
@ -137,118 +135,31 @@ def normalize_url(url: str) -> str:
return urlunparse(parsed_url._replace(netloc=normalized_netloc)) return urlunparse(parsed_url._replace(netloc=normalized_netloc))
def url_to_filename(url: str) -> str: def check_conference(
""" name: str, src_url: str, year: int
Convert a URL to a valid filename by replacing invalid characters with underscores. ) -> tuple[bool, str, str | None]:
"""Check if conference is live."""
url = src_url.format(year=year)
past_url = src_url.format(year=year - 1)
no_dot = {"bsideskbh.dk", "pif.camp"}
session = s if all(hostname not in url for hostname in no_dot) else s_no_dot
try:
r = session.get(url)
except requests.exceptions.ConnectionError:
return (False, "connection refused", None)
Args: not_here = find_not_here_message(r.text)
url (str): The URL to be converted. if (
len(r.text) < 2048
and 'http-equiv="refresh"' in r.text
and str(year) not in r.text
):
return (False, "redirect to URL without year", r.url)
Returns: if normalize_url(r.url) == normalize_url(past_url):
str: A valid filename. return (False, "redirect to previous year", r.url)
"""
# Replace invalid characters with underscores
return re.sub(r'[\/:*?"<>|]', "_", url)
return (False, not_here, r.url) if not_here else (True, get_title(r.text), r.url)
def parse_opengraph_tags(html: str) -> dict[str, str]:
    """
    Locate Open Graph meta tags, and return them as a dictionary.

    Keys will match the name following 'og:', e.g. 'title', 'image', 'url', 'site_name'.

    A tag is recognised when it is a ``<meta>`` element carrying both a
    ``property="og:..."`` and a non-empty ``content`` attribute. Attribute
    values may be single- or double-quoted, the two attributes may appear in
    either order, and other quoted attributes may be present. If the same
    property occurs more than once, the last occurrence wins.

    Args:
        html: The HTML text to scan.

    Returns:
        Mapping of Open Graph property name (without the 'og:' prefix) to its
        content value; empty if no tags are found.
    """
    # A <meta> tag made up solely of quoted name=value attributes. Quoted
    # values are matched as a unit so a '>' inside a value does not end the tag.
    meta_tag = re.compile(
        r"""<meta((?:\s+[\w:.-]+\s*=\s*(?:"[^"]*"|'[^']*'))+)\s*/?>""",
        flags=re.IGNORECASE,
    )
    attribute = re.compile(r"""([\w:.-]+)\s*=\s*(?:"([^"]*)"|'([^']*)')""")

    og_tags: dict[str, str] = {}
    for tag in meta_tag.finditer(html):
        # Attribute names are case-insensitive in HTML; values are kept as-is.
        attrs = {
            attr_name.lower(): double_quoted or single_quoted
            for attr_name, double_quoted, single_quoted in attribute.findall(
                tag.group(1)
            )
        }
        prop = attrs.get("property", "")
        content = attrs.get("content", "")
        if len(prop) > 3 and prop[:3].lower() == "og:" and content:
            og_tags[prop[3:]] = content
    return og_tags
@dataclass
class Conference:
"""Conference."""
name: str
src_url: str
year: int
response: requests.models.Response | None = None
redirect_to_url: str | None = None
@property
def url(self) -> str:
"""Conference URL."""
return self.build_url(year=self.year)
def build_url(self, year: int) -> str:
"""Build conference URL."""
return self.src_url.format(year=year, two_digit_year=year % 2000)
@property
def past_url(self) -> str:
"""URL for previous year."""
return self.build_url(year=self.year - 1)
def check(self) -> tuple[bool, str, str | None]:
"""Check if conference is live."""
no_dot = {"bsideskbh.dk", "pif.camp"}
url = self.url
session = s if all(hostname not in url for hostname in no_dot) else s_no_dot
try:
r = session.get(url)
except requests.exceptions.ConnectionError:
return (False, "connection refused", None)
self.response = r
not_here = find_not_here_message(r.text)
if (
len(r.text) < 2048
and 'http-equiv="refresh"' in r.text
and str(self.year) not in r.text
):
return (False, "redirect to URL without year", r.url)
if normalize_url(r.url) == normalize_url(self.past_url):
return (False, "redirect to previous year", r.url)
if not_here:
return (False, not_here, r.url)
return (True, get_title(r.text), r.url)
def og_tags(self) -> dict[str, str]:
"""Open Graph tags."""
return parse_opengraph_tags(self.response.text) if self.response else {}
def check_web_site(self) -> bool:
"""Check if an individual web site is live."""
assert "{year}" in self.src_url or "{two_digit_year}" in self.src_url
live, msg, redirect_to_url = self.check()
if not live:
return False
og = "".join(f"\n{key}: {value}" for key, value in self.og_tags().items())
if og:
og = "\n\nOpen Graph\n\n" + og
if redirect_to_url and normalize_url(redirect_to_url) == normalize_url(
self.url
):
body = f"{self.name}\n{self.url}\n"
else:
self.redirect_to_url = redirect_to_url
body = f"{self.name}\n{self.url} redirects to {redirect_to_url}\n"
body += f"Web page title: {msg}{og}" ""
send_mail(f"Conference site live: {self.name} - {self.year}", body)
return True
def send_mail(subject: str, body: str) -> None: def send_mail(subject: str, body: str) -> None:
@ -274,6 +185,23 @@ def send_mail(subject: str, body: str) -> None:
s.quit() s.quit()
def check_conference_web_site(name: str, src_url: str, year: int) -> bool:
    """Check if an individual web site is live.

    Sends a notification mail when the site is live.

    Args:
        name: Conference name, used in the mail subject and body.
        src_url: URL template; must contain ``{year}``.
        year: Year substituted into the template.

    Returns:
        ``True`` if the site is live, ``False`` otherwise.

    Raises:
        ValueError: If ``src_url`` has no ``{year}`` placeholder.
    """
    # Explicit check rather than ``assert`` so it still runs under ``-O``.
    if "{year}" not in src_url:
        raise ValueError(
            f"URL template for {name!r} has no {{year}} placeholder: {src_url!r}"
        )
    live, msg, redirect_to_url = check_conference(name, src_url, year)
    if not live:
        return live

    url = src_url.format(year=year)
    if redirect_to_url and normalize_url(redirect_to_url) == normalize_url(url):
        body = f"{name}\n{url}\nWeb page title: {msg}"
    else:
        # The final URL differs from the requested one, so report both.
        body = f"{name}\n{url} redirects to {redirect_to_url}\nWeb page title: {msg}"
    send_mail(f"Conference site live: {name} - {year}", body)

    return live
def find_new_conference_web_sites( def find_new_conference_web_sites(
today: date, live: list[LiveConference] today: date, live: list[LiveConference]
) -> list[LiveConference]: ) -> list[LiveConference]:
@ -284,21 +212,12 @@ def find_new_conference_web_sites(
live_set = {(c["conference"], c["year"]) for c in live} live_set = {(c["conference"], c["year"]) for c in live}
for name, src_url in load_yaml("conferences").items(): for name, src_url in load_yaml("conferences").items():
for year in (this_year, this_year + 1): new += [
if (name, year) in live_set: {"conference": name, "year": year, "live": today}
continue for year in (this_year, this_year + 1)
conf = Conference(name, src_url, year) if (name, year) not in live_set
if not conf.check_web_site(): and check_conference_web_site(name, src_url, year)
continue ]
c: LiveConference = {
"conference": name,
"year": year,
"live": today,
"url": conf.url,
}
if conf.redirect_to_url:
c["redirect_to_url"] = conf.redirect_to_url
new.append(c)
return new return new

View file

@ -28,7 +28,6 @@ class LiveConference(typing.TypedDict, total=False):
year: int year: int
live: date live: date
url: str | None url: str | None
redirect_to_url: str | None
def load_yaml(name: str) -> typing.Any: def load_yaml(name: str) -> typing.Any:

View file

@ -1,2 +1 @@
requests requests
cloudscraper