Compare commits

..

2 commits

Author SHA1 Message Date
Edward Betts fee54a37e7 User cloudscraper 2025-01-11 16:43:51 +00:00
Edward Betts 35c213110d Add more strings to not_here_list. 2024-11-01 09:23:18 +00:00
3 changed files with 46 additions and 129 deletions

173
check.py
View file

@ -7,13 +7,12 @@ import os
import re
import smtplib
import warnings
from dataclasses import dataclass
from datetime import date
from email.mime.text import MIMEText
from email.utils import formatdate, make_msgid
from urllib.parse import urlparse, urlunparse
import cloudscraper # type: ignore
import cloudscraper
import requests
import yaml
from requests.adapters import HTTPAdapter
@ -77,7 +76,6 @@ not_here_list = [
"This page does not exist yet",
"404 Not Found",
"500 Internal Server Error",
"500: Internal Server Error",
"Test Page for the Apache HTTP Server",
"Site not found · GitHub Pages",
"504: Gateway time-out",
@ -137,118 +135,31 @@ def normalize_url(url: str) -> str:
return urlunparse(parsed_url._replace(netloc=normalized_netloc))
def url_to_filename(url: str) -> str:
"""
Convert a URL to a valid filename by replacing invalid characters with underscores.
def check_conference(
name: str, src_url: str, year: int
) -> tuple[bool, str, str | None]:
"""Check if conference is live."""
url = src_url.format(year=year)
past_url = src_url.format(year=year - 1)
no_dot = {"bsideskbh.dk", "pif.camp"}
session = s if all(hostname not in url for hostname in no_dot) else s_no_dot
try:
r = session.get(url)
except requests.exceptions.ConnectionError:
return (False, "connection refused", None)
Args:
url (str): The URL to be converted.
not_here = find_not_here_message(r.text)
if (
len(r.text) < 2048
and 'http-equiv="refresh"' in r.text
and str(year) not in r.text
):
return (False, "redirect to URL without year", r.url)
Returns:
str: A valid filename.
"""
# Replace invalid characters with underscores
return re.sub(r'[\/:*?"<>|]', "_", url)
if normalize_url(r.url) == normalize_url(past_url):
return (False, "redirect to previous year", r.url)
def parse_opengraph_tags(html: str) -> dict[str, str]:
    """
    Locate Open Graph meta tags, and return them as a dictionary.

    Keys will match the name following 'og:', e.g. 'title', 'image', 'url', 'site_name'.

    Each ``<meta>`` tag is parsed attribute by attribute, so a tag is recognised
    whatever the order of ``property`` and ``content``, whether values use single
    or double quotes, and whether other attributes sit in between. Tags with an
    empty ``content`` or nothing after ``og:`` are ignored. When a property occurs
    more than once the last occurrence wins.

    Args:
        html: HTML source of the page.

    Returns:
        Mapping of Open Graph property name (without the 'og:' prefix) to its
        content value, exactly as written in the markup.
    """
    # A whole <meta ...> tag. Quoted values are consumed as units so that a '>'
    # inside an attribute value does not terminate the tag early.
    meta_tag = re.compile(
        r"""<meta\b(?:[^>"']|"[^"]*"|'[^']*')*>""", flags=re.IGNORECASE
    )
    # One quoted attribute: name, then either the double- or single-quoted value.
    attribute = re.compile(r"""([^\s"'<>/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)')""")

    og_tags: dict[str, str] = {}
    for tag in meta_tag.findall(html):
        # Attribute names are case-insensitive in HTML; values are kept verbatim.
        attrs = {
            attr_name.lower(): double_quoted or single_quoted
            for attr_name, double_quoted, single_quoted in attribute.findall(tag)
        }
        prop = attrs.get("property", "")
        content = attrs.get("content", "")
        if prop[:3].lower() != "og:":
            continue
        key = prop[3:]
        if key and content:
            og_tags[key] = content

    return og_tags
@dataclass
class Conference:
    """A conference web site to be checked for one particular year."""

    # Display name, used in the notification mail subject and body.
    name: str
    # URL template containing a "{year}" and/or "{two_digit_year}" placeholder.
    src_url: str
    # Year substituted into the template.
    year: int
    # HTTP response stored by check(); None until a request has succeeded.
    response: requests.models.Response | None = None
    # Final URL recorded by check_web_site() when it differs from `url`.
    redirect_to_url: str | None = None

    @property
    def url(self) -> str:
        """Conference URL for `self.year`."""
        return self.build_url(year=self.year)

    def build_url(self, year: int) -> str:
        """Build the conference URL for the given year from the template."""
        # Modulo 100 keeps the value within two digits for any century;
        # for 2000-2099 this is identical to the former `year % 2000`.
        return self.src_url.format(year=year, two_digit_year=year % 100)

    @property
    def past_url(self) -> str:
        """URL for the previous year."""
        return self.build_url(year=self.year - 1)

    def check(self) -> tuple[bool, str, str | None]:
        """Check if the conference site is live.

        Returns:
            A ``(live, message, final_url)`` tuple. When ``live`` is false,
            ``message`` is the reason; otherwise it is the page title as
            returned by ``get_title``. ``final_url`` is the URL of the response
            (after any redirects), or None when the connection failed.
        """
        # Hosts fetched with the alternate module-level session `s_no_dot`
        # instead of the default session `s` (both defined elsewhere).
        no_dot = {"bsideskbh.dk", "pif.camp"}
        url = self.url
        session = s if all(hostname not in url for hostname in no_dot) else s_no_dot
        try:
            r = session.get(url)
        except requests.exceptions.ConnectionError:
            return (False, "connection refused", None)

        # Keep the response so og_tags() can inspect the page later.
        self.response = r

        not_here = find_not_here_message(r.text)

        # A short page whose meta refresh does not mention the year is treated
        # as a redirect away from this year's site.
        if (
            len(r.text) < 2048
            and 'http-equiv="refresh"' in r.text
            and str(self.year) not in r.text
        ):
            return (False, "redirect to URL without year", r.url)

        # Landing on last year's URL means this year's site is not up yet.
        if normalize_url(r.url) == normalize_url(self.past_url):
            return (False, "redirect to previous year", r.url)

        if not_here:
            return (False, not_here, r.url)

        return (True, get_title(r.text), r.url)

    def og_tags(self) -> dict[str, str]:
        """Open Graph tags of the stored response, or {} if there is none."""
        return parse_opengraph_tags(self.response.text) if self.response else {}

    def check_web_site(self) -> bool:
        """Check if an individual web site is live and send a mail if it is.

        Returns:
            True when the site is live (a notification mail has been sent),
            False otherwise.
        """
        assert "{year}" in self.src_url or "{two_digit_year}" in self.src_url
        live, msg, redirect_to_url = self.check()
        if not live:
            return False

        og = "".join(f"\n{key}: {value}" for key, value in self.og_tags().items())
        if og:
            og = "\n\nOpen Graph\n\n" + og

        if redirect_to_url and normalize_url(redirect_to_url) == normalize_url(
            self.url
        ):
            body = f"{self.name}\n{self.url}\n"
        else:
            # Remember where the site actually lives so callers can record it.
            self.redirect_to_url = redirect_to_url
            body = f"{self.name}\n{self.url} redirects to {redirect_to_url}\n"

        body += f"Web page title: {msg}{og}"
        send_mail(f"Conference site live: {self.name} - {self.year}", body)

        return True
return (False, not_here, r.url) if not_here else (True, get_title(r.text), r.url)
def send_mail(subject: str, body: str) -> None:
@ -274,6 +185,23 @@ def send_mail(subject: str, body: str) -> None:
s.quit()
def check_conference_web_site(name: str, src_url: str, year: int) -> bool:
    """Report whether a conference site is live, mailing a notice when it is.

    The ``{year}`` placeholder in ``src_url`` is filled with ``year``. The mail
    body says whether the page was served from that URL or from another one it
    redirected to, followed by the page title.
    """
    assert "{year}" in src_url
    is_live, title, final_url = check_conference(name, src_url, year)
    expected_url = src_url.format(year=year)

    if not is_live:
        return is_live

    # The short-circuit on `final_url` keeps normalize_url away from a missing URL.
    served_in_place = bool(final_url) and normalize_url(final_url) == normalize_url(
        expected_url
    )
    body = (
        f"{name}\n{expected_url}\nWeb page title: {title}"
        if served_in_place
        else f"{name}\n{expected_url} redirects to {final_url}\nWeb page title: {title}"
    )
    send_mail(f"Conference site live: {name} - {year}", body)

    return is_live
def find_new_conference_web_sites(
today: date, live: list[LiveConference]
) -> list[LiveConference]:
@ -284,21 +212,12 @@ def find_new_conference_web_sites(
live_set = {(c["conference"], c["year"]) for c in live}
for name, src_url in load_yaml("conferences").items():
for year in (this_year, this_year + 1):
if (name, year) in live_set:
continue
conf = Conference(name, src_url, year)
if not conf.check_web_site():
continue
c: LiveConference = {
"conference": name,
"year": year,
"live": today,
"url": conf.url,
}
if conf.redirect_to_url:
c["redirect_to_url"] = conf.redirect_to_url
new.append(c)
new += [
{"conference": name, "year": year, "live": today}
for year in (this_year, this_year + 1)
if (name, year) not in live_set
and check_conference_web_site(name, src_url, year)
]
return new

View file

@ -28,7 +28,6 @@ class LiveConference(typing.TypedDict, total=False):
year: int
live: date
url: str | None
redirect_to_url: str | None
def load_yaml(name: str) -> typing.Any:

View file

@ -1,2 +1 @@
requests
cloudscraper