Compare commits

..

No commits in common. "d845ec805f9b2a9d22b99a7f24a8ac88671e026c" and "a523af16e196e59bfa0318d196a69af6f6db342b" have entirely different histories.

View file

@ -2,7 +2,6 @@
"""Check if conference websites are live.""" """Check if conference websites are live."""
import html
import os import os
import re import re
import smtplib import smtplib
@ -12,7 +11,6 @@ from email.mime.text import MIMEText
from email.utils import formatdate, make_msgid from email.utils import formatdate, make_msgid
from urllib.parse import urlparse, urlunparse from urllib.parse import urlparse, urlunparse
import cloudscraper
import requests import requests
import yaml import yaml
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
@ -55,14 +53,8 @@ re_title = re.compile("<title>(.*?)</title>", re.DOTALL)
AGENT = config["browser"]["User-Agent"] AGENT = config["browser"]["User-Agent"]
headers = {"User-Agent": AGENT, "Accept": "text/html"} headers = {"User-Agent": AGENT, "Accept": "text/html"}
s = cloudscraper.CloudScraper() s = requests.Session()
s_no_dot = cloudscraper.CloudScraper() s.headers.update(headers)
# s = requests.Session()
# s.headers.update(headers)
# s_no_dot = requests.Session()
# s_no_dot.headers.update(headers)
# Create a session and mount the custom adapter for both HTTP and HTTPS requests. # Create a session and mount the custom adapter for both HTTP and HTTPS requests.
adapter = AbsoluteDNSAdapter() adapter = AbsoluteDNSAdapter()
@ -90,7 +82,7 @@ not_here_list = [
"Ooops! Could Not Find It", "Ooops! Could Not Find It",
"OpenStreetMap Authentication Proxy", "OpenStreetMap Authentication Proxy",
"Error 404", "Error 404",
# "Under Construction", "Under Construction",
"Page not found", "Page not found",
"Error 404: Page not found", "Error 404: Page not found",
"Barcamptools", "Barcamptools",
@ -101,23 +93,18 @@ not_here_list = [
"You are not authorized to access this page", "You are not authorized to access this page",
"Attention Required! | Cloudflare", "Attention Required! | Cloudflare",
"This page doesn't currently exist", "This page doesn't currently exist",
"ERROR 503 - Service Unavailable",
"ERROR 503",
"401 Authorization Required",
"Authorization Required",
"used Cloudflare to restrict access",
] ]
def find_not_here_message(page_html: str) -> str | None: def find_not_here_message(html: str) -> str | None:
"""Find not here message in web page.""" """Find not here message in web page."""
return next((not_here for not_here in not_here_list if not_here in page_html), None) return next((not_here for not_here in not_here_list if not_here in html), None)
def get_title(page_html: str) -> str: def get_title(html: str) -> str:
"""Title from web page.""" """Title from web page."""
m = re_title.search(page_html) m = re_title.search(html)
return html.unescape(m.group(1).strip()) if m and m.group(1) else "no title" return m.group(1).strip() if m and m.group(1) else "no title"
def normalize_url(url: str) -> str: def normalize_url(url: str) -> str:
@ -194,17 +181,18 @@ class Conference:
def check(self) -> tuple[bool, str, str | None]: def check(self) -> tuple[bool, str, str | None]:
"""Check if conference is live.""" """Check if conference is live."""
no_dot = {"bsideskbh.dk", "pif.camp"}
session = (
s if all(hostname not in self.url for hostname in no_dot) else s_no_dot
)
try: try:
r = session.get(self.url) # SotM Baltics has an invalid TLS certificate, but we don't care
r = s.get(self.url, verify=False)
except requests.exceptions.ConnectionError: except requests.exceptions.ConnectionError:
return (False, "connection refused", None) return (False, "connection refused", None)
self.response = r self.response = r
filename = url_to_filename(self.url)
with open(os.path.join("sites", filename + ".html"), "w") as out:
out.write(r.text)
not_here = find_not_here_message(r.text) not_here = find_not_here_message(r.text)
if ( if (
len(r.text) < 2048 len(r.text) < 2048