Compare commits
No commits in common. "fee54a37e7407e75e8f482b720a109440033ab90" and "1d8c9eef7b53063201cbc817e0af659223353e3a" have entirely different histories.
fee54a37e7
...
1d8c9eef7b
34
check.py
34
check.py
|
@ -2,7 +2,6 @@
|
||||||
|
|
||||||
"""Check if conference websites are live."""
|
"""Check if conference websites are live."""
|
||||||
|
|
||||||
import html
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import smtplib
|
import smtplib
|
||||||
|
@ -12,7 +11,6 @@ from email.mime.text import MIMEText
|
||||||
from email.utils import formatdate, make_msgid
|
from email.utils import formatdate, make_msgid
|
||||||
from urllib.parse import urlparse, urlunparse
|
from urllib.parse import urlparse, urlunparse
|
||||||
|
|
||||||
import cloudscraper
|
|
||||||
import requests
|
import requests
|
||||||
import yaml
|
import yaml
|
||||||
from requests.adapters import HTTPAdapter
|
from requests.adapters import HTTPAdapter
|
||||||
|
@ -55,14 +53,8 @@ re_title = re.compile("<title>(.*?)</title>", re.DOTALL)
|
||||||
AGENT = config["browser"]["User-Agent"]
|
AGENT = config["browser"]["User-Agent"]
|
||||||
headers = {"User-Agent": AGENT, "Accept": "text/html"}
|
headers = {"User-Agent": AGENT, "Accept": "text/html"}
|
||||||
|
|
||||||
s = cloudscraper.CloudScraper()
|
s = requests.Session()
|
||||||
s_no_dot = cloudscraper.CloudScraper()
|
s.headers.update(headers)
|
||||||
|
|
||||||
# s = requests.Session()
|
|
||||||
# s.headers.update(headers)
|
|
||||||
|
|
||||||
# s_no_dot = requests.Session()
|
|
||||||
# s_no_dot.headers.update(headers)
|
|
||||||
|
|
||||||
# Create a session and mount the custom adapter for both HTTP and HTTPS requests.
|
# Create a session and mount the custom adapter for both HTTP and HTTPS requests.
|
||||||
adapter = AbsoluteDNSAdapter()
|
adapter = AbsoluteDNSAdapter()
|
||||||
|
@ -90,7 +82,7 @@ not_here_list = [
|
||||||
"Ooops! Could Not Find It",
|
"Ooops! Could Not Find It",
|
||||||
"OpenStreetMap Authentication Proxy",
|
"OpenStreetMap Authentication Proxy",
|
||||||
"Error 404",
|
"Error 404",
|
||||||
# "Under Construction",
|
"Under Construction",
|
||||||
"Page not found",
|
"Page not found",
|
||||||
"Error 404: Page not found",
|
"Error 404: Page not found",
|
||||||
"Barcamptools",
|
"Barcamptools",
|
||||||
|
@ -101,23 +93,18 @@ not_here_list = [
|
||||||
"You are not authorized to access this page",
|
"You are not authorized to access this page",
|
||||||
"Attention Required! | Cloudflare",
|
"Attention Required! | Cloudflare",
|
||||||
"This page doesn't currently exist",
|
"This page doesn't currently exist",
|
||||||
"ERROR 503 - Service Unavailable",
|
|
||||||
"ERROR 503",
|
|
||||||
"401 Authorization Required",
|
|
||||||
"Authorization Required",
|
|
||||||
"used Cloudflare to restrict access",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def find_not_here_message(page_html: str) -> str | None:
|
def find_not_here_message(html: str) -> str | None:
|
||||||
"""Find not here message in web page."""
|
"""Find not here message in web page."""
|
||||||
return next((not_here for not_here in not_here_list if not_here in page_html), None)
|
return next((not_here for not_here in not_here_list if not_here in html), None)
|
||||||
|
|
||||||
|
|
||||||
def get_title(page_html: str) -> str:
|
def get_title(html: str) -> str:
|
||||||
"""Title from web page."""
|
"""Title from web page."""
|
||||||
m = re_title.search(page_html)
|
m = re_title.search(html)
|
||||||
return html.unescape(m.group(1).strip()) if m and m.group(1) else "no title"
|
return m.group(1).strip() if m and m.group(1) else "no title"
|
||||||
|
|
||||||
|
|
||||||
def normalize_url(url: str) -> str:
|
def normalize_url(url: str) -> str:
|
||||||
|
@ -141,10 +128,9 @@ def check_conference(
|
||||||
"""Check if conference is live."""
|
"""Check if conference is live."""
|
||||||
url = src_url.format(year=year)
|
url = src_url.format(year=year)
|
||||||
past_url = src_url.format(year=year - 1)
|
past_url = src_url.format(year=year - 1)
|
||||||
no_dot = {"bsideskbh.dk", "pif.camp"}
|
|
||||||
session = s if all(hostname not in url for hostname in no_dot) else s_no_dot
|
|
||||||
try:
|
try:
|
||||||
r = session.get(url)
|
# SotM Baltics has an invalid TLS certificate, but we don't care
|
||||||
|
r = s.get(url, verify=False)
|
||||||
except requests.exceptions.ConnectionError:
|
except requests.exceptions.ConnectionError:
|
||||||
return (False, "connection refused", None)
|
return (False, "connection refused", None)
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue