Compare commits
	
		
			2 commits
		
	
	
		
			1d8c9eef7b
			...
			fee54a37e7
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
							
							
								
									
								
								 | 
						fee54a37e7 | ||
| 
							
							
								
									
								
								 | 
						35c213110d | 
							
								
								
									
										34
									
								
								check.py
									
									
									
									
									
								
							
							
						
						
									
										34
									
								
								check.py
									
									
									
									
									
								
							| 
						 | 
				
			
			@ -2,6 +2,7 @@
 | 
			
		|||
 | 
			
		||||
"""Check if conference websites are live."""
 | 
			
		||||
 | 
			
		||||
import html
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
import smtplib
 | 
			
		||||
| 
						 | 
				
			
			@ -11,6 +12,7 @@ from email.mime.text import MIMEText
 | 
			
		|||
from email.utils import formatdate, make_msgid
 | 
			
		||||
from urllib.parse import urlparse, urlunparse
 | 
			
		||||
 | 
			
		||||
import cloudscraper
 | 
			
		||||
import requests
 | 
			
		||||
import yaml
 | 
			
		||||
from requests.adapters import HTTPAdapter
 | 
			
		||||
| 
						 | 
				
			
			@ -53,8 +55,14 @@ re_title = re.compile("<title>(.*?)</title>", re.DOTALL)
 | 
			
		|||
AGENT = config["browser"]["User-Agent"]
 | 
			
		||||
headers = {"User-Agent": AGENT, "Accept": "text/html"}
 | 
			
		||||
 | 
			
		||||
s = requests.Session()
 | 
			
		||||
s.headers.update(headers)
 | 
			
		||||
s = cloudscraper.CloudScraper()
 | 
			
		||||
s_no_dot = cloudscraper.CloudScraper()
 | 
			
		||||
 | 
			
		||||
# s = requests.Session()
 | 
			
		||||
# s.headers.update(headers)
 | 
			
		||||
 | 
			
		||||
# s_no_dot = requests.Session()
 | 
			
		||||
# s_no_dot.headers.update(headers)
 | 
			
		||||
 | 
			
		||||
# Create a session and mount the custom adapter for both HTTP and HTTPS requests.
 | 
			
		||||
adapter = AbsoluteDNSAdapter()
 | 
			
		||||
| 
						 | 
				
			
			@ -82,7 +90,7 @@ not_here_list = [
 | 
			
		|||
    "Ooops! Could Not Find It",
 | 
			
		||||
    "OpenStreetMap Authentication Proxy",
 | 
			
		||||
    "Error 404",
 | 
			
		||||
    "Under Construction",
 | 
			
		||||
    # "Under Construction",
 | 
			
		||||
    "Page not found",
 | 
			
		||||
    "Error 404: Page not found",
 | 
			
		||||
    "Barcamptools",
 | 
			
		||||
| 
						 | 
				
			
			@ -93,18 +101,23 @@ not_here_list = [
 | 
			
		|||
    "You are not authorized to access this page",
 | 
			
		||||
    "Attention Required! | Cloudflare",
 | 
			
		||||
    "This page doesn't currently exist",
 | 
			
		||||
    "ERROR 503 - Service Unavailable",
 | 
			
		||||
    "ERROR 503",
 | 
			
		||||
    "401 Authorization Required",
 | 
			
		||||
    "Authorization Required",
 | 
			
		||||
    "used Cloudflare to restrict access",
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def find_not_here_message(html: str) -> str | None:
 | 
			
		||||
def find_not_here_message(page_html: str) -> str | None:
 | 
			
		||||
    """Find not here message in web page."""
 | 
			
		||||
    return next((not_here for not_here in not_here_list if not_here in html), None)
 | 
			
		||||
    return next((not_here for not_here in not_here_list if not_here in page_html), None)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_title(html: str) -> str:
 | 
			
		||||
def get_title(page_html: str) -> str:
 | 
			
		||||
    """Title from web page."""
 | 
			
		||||
    m = re_title.search(html)
 | 
			
		||||
    return m.group(1).strip() if m and m.group(1) else "no title"
 | 
			
		||||
    m = re_title.search(page_html)
 | 
			
		||||
    return html.unescape(m.group(1).strip()) if m and m.group(1) else "no title"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def normalize_url(url: str) -> str:
 | 
			
		||||
| 
						 | 
				
			
			@ -128,9 +141,10 @@ def check_conference(
 | 
			
		|||
    """Check if conference is live."""
 | 
			
		||||
    url = src_url.format(year=year)
 | 
			
		||||
    past_url = src_url.format(year=year - 1)
 | 
			
		||||
    no_dot = {"bsideskbh.dk", "pif.camp"}
 | 
			
		||||
    session = s if all(hostname not in url for hostname in no_dot) else s_no_dot
 | 
			
		||||
    try:
 | 
			
		||||
        # SotM Baltics has an invalid TLS certificate, but we don't care
 | 
			
		||||
        r = s.get(url, verify=False)
 | 
			
		||||
        r = session.get(url)
 | 
			
		||||
    except requests.exceptions.ConnectionError:
 | 
			
		||||
        return (False, "connection refused", None)
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue