From 777ede64f85e95eb324abbd16f2496dedca9ea77 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 17 Apr 2024 09:12:11 +0100 Subject: [PATCH] Reformat code with black --- get.py | 59 ++++++++++++++++++--------------- parse.py | 99 ++++++++++++++++++++++++++++++-------------------------- 2 files changed, 86 insertions(+), 72 deletions(-) diff --git a/get.py b/get.py index 7b1718b..e654472 100755 --- a/get.py +++ b/get.py @@ -8,64 +8,71 @@ import lxml.html from random import shuffle from time import sleep -re_recommend = re.compile(' ') +re_recommend = re.compile( + ' ' +) s = requests.Session() -cookie_dir = '/home/edward/lib/cookies' -cookie_file = os.path.join(cookie_dir, 'goodreads') +cookie_dir = "/home/edward/lib/cookies" +cookie_file = os.path.join(cookie_dir, "goodreads") cj = LWPCookieJar(cookie_file) if os.path.exists(cookie_file): cj.load() s.cookies = cj + def login(): - sign_in_page = 'https://www.goodreads.com/user/sign_in' + sign_in_page = "https://www.goodreads.com/user/sign_in" page = s.get(sign_in_page).text - open('sign_in.html', 'w').write(page) + open("sign_in.html", "w").write(page) if '"name":"Edward Betts"' in page: return # already signed in - re_token = re.compile('') + re_token = re.compile( + '' + ) re_n = re.compile("") token = re_token.search(page).group(1) data = { - 'utf8': u'\u2713', - 'authenticity_token': token, - 'user[email]': 'edward@4angle.com', - 'user[password]': '8V8~9:3~U!Ly', - 'remember_me': 1, - 'next': 'Sign in', - 'n': re_n.search(page).group(1), + "utf8": "\u2713", + "authenticity_token": token, + "user[email]": "edward@4angle.com", + "user[password]": "8V8~9:3~U!Ly", + "remember_me": 1, + "next": "Sign in", + "n": re_n.search(page).group(1), } print(token) - print(data['n']) + print(data["n"]) - r = s.post(sign_in_page, data=data, headers={'referer': sign_in_page}) + r = s.post(sign_in_page, data=data, headers={"referer": sign_in_page}) - open('signed_in.html', 'w').write(r.text) + open("signed_in.html", "w").write(r.text) root = lxml.html.fromstring(r.content) - flash = root.find_class('flash') + flash = root.find_class("flash") if flash: - print('flash:', flash[0].text) + print("flash:", flash[0].text) cj.save(ignore_discard=True) + def get_index(): # url = 'https://www.goodreads.com/recommendations' - url = 'https://www.goodreads.com/recommendations/?recs_current_view=list' + url = "https://www.goodreads.com/recommendations/?recs_current_view=list" r = s.get(url) - open('recommendations.html', 'w').write(r.text) + open("recommendations.html", "w").write(r.text) + def get_individual(): - for line in open('recommendations.html'): - if 'actionLinkLite' not in line: + for line in open("recommendations.html"): + if "actionLinkLite" not in line: continue m = re_recommend.match(line) if m: @@ -78,13 +85,13 @@ get_index() recommend_list = list(get_individual()) shuffle(recommend_list) -headers = {'Accept': 'text/html'} +headers = {"Accept": "text/html"} for a, b, c in recommend_list: print((b, c)) - url = 'https://www.goodreads.com' + a + url = "https://www.goodreads.com" + a r = s.get(url, headers=headers) - filename = os.path.join(b, c + '.html') - open(filename, 'w').write(r.text) + filename = os.path.join(b, c + ".html") + open(filename, "w").write(r.text) sleep(0.5) diff --git a/parse.py b/parse.py index 56efc73..f0f6294 100755 --- a/parse.py +++ b/parse.py @@ -9,89 +9,94 @@ import requests from time import sleep from pprint import pprint -parser = lxml.html.HTMLParser(encoding='utf-8') +parser = lxml.html.HTMLParser(encoding="utf-8") -env = jinja2.Environment(loader=jinja2.FileSystemLoader('templates')) +env = jinja2.Environment(loader=jinja2.FileSystemLoader("templates")) -re_book = re.compile('function refreshGroupBox(group_id, book_id) \{(.*?)\n *\}', re.DOTALL) -re_book = re.compile('function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}', re.DOTALL) +re_book = re.compile( + "function refreshGroupBox(group_id, book_id) \{(.*?)\n *\}", re.DOTALL +) +re_book = re.compile( + "function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}", re.DOTALL +) -re_tip = re.compile(br'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {') +re_tip = re.compile(rb'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {') -dirs = ['shelf', 'genre'] -start = 'https://www.goodreads.com/book/show/' +dirs = ["shelf", "genre"] +start = "https://www.goodreads.com/book/show/" + +existing = {book["title"] for book in json.load(open("calibre_book_list.json"))} -existing = {book['title'] for book in json.load(open('calibre_book_list.json'))} def iter_books(): for d in dirs: for f in sorted(os.listdir(d)): filename = os.path.join(d, f) root = lxml.html.parse(filename, parser=parser).getroot() - for div in root.find_class('bookInformation'): - link = div.find('.//a') + for div in root.find_class("bookInformation"): + link = div.find(".//a") rating = div.find('.//span[@class="minirating"]') description = div.find('./div[@class="bookDescription"]') r = rating[0].tail.strip() - cover = div.getnext().find('.//img').get('src') + cover = div.getnext().find(".//img").get("src") book = { - 'title': link.text, - 'url': link.get('href'), - 'rating': r, - 'r': float(r[:3]), - 'cover': cover, - 'authors': [a.text for a in div.find_class('authorName')], + "title": link.text, + "url": link.get("href"), + "rating": r, + "r": float(r[:3]), + "cover": cover, + "authors": [a.text for a in div.find_class("authorName")], } if description is not None: index = 1 if len(description) == 3 else 0 - book['description'] = description[index].text + book["description"] = description[index].text yield d, f, book continue # print(filename) - for line in open(filename, 'rb'): - if b'var newTip' not in line: + for line in open(filename, "rb"): + if b"var newTip" not in line: continue print(line) m = re_tip.search(line) - tip = m.group(1).decode('unicode_escape').replace('\/', '/') + tip = m.group(1).decode("unicode_escape").replace("\/", "/") # tip = m.group(1) # .replace('\/', '/') # print(tip) if '