From c1cbfd5f7954a9f1e277f89d110835d91b5b7176 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 17 Apr 2024 09:55:24 +0100 Subject: [PATCH] Split code into smaller functions --- parse.py | 191 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 107 insertions(+), 84 deletions(-) diff --git a/parse.py b/parse.py index f0f6294..f2da18b 100755 --- a/parse.py +++ b/parse.py @@ -1,26 +1,28 @@ #!/usr/bin/python3 + +import json import os import re -import lxml.html -import jinja2 -import json import sys -import requests +import typing from time import sleep -from pprint import pprint + +import jinja2 +import lxml.html +import requests parser = lxml.html.HTMLParser(encoding="utf-8") env = jinja2.Environment(loader=jinja2.FileSystemLoader("templates")) re_book = re.compile( - "function refreshGroupBox(group_id, book_id) \{(.*?)\n *\}", re.DOTALL + r"function refreshGroupBox(group_id, book_id) \{(.*?)\n *\}", re.DOTALL ) re_book = re.compile( - "function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}", re.DOTALL + r"function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}", re.DOTALL ) -re_tip = re.compile(rb'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {') +re_tip = re.compile(r'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {') dirs = ["shelf", "genre"] start = "https://www.goodreads.com/book/show/" @@ -28,30 +30,44 @@ start = "https://www.goodreads.com/book/show/" existing = {book["title"] for book in json.load(open("calibre_book_list.json"))} +def parse_book_div(div: lxml.html.HtmlElement) -> dict[str, typing.Any]: + """Parse book div.""" + link = div.find(".//a") + assert link is not None + rating = div.find('.//span[@class="minirating"]') + description = div.find('./div[@class="bookDescription"]') + assert rating is not None and rating[0] is not None + r = rating[0].tail + assert r + + div_next = div.getnext() + assert div_next is not None + cover_img = div_next.find(".//img") + assert cover_img is not None + cover = cover_img.get("src") + + book = { + "title": link.text, + "url": link.get("href"), + "rating": r, + "r": float(r[:3].strip()), + "cover": cover, + "authors": [a.text for a in div.find_class("authorName")], + } + if description is not None: + index = 1 if len(description) == 3 else 0 + book["description"] = description[index].text + + return book + + def iter_books(): for d in dirs: for f in sorted(os.listdir(d)): filename = os.path.join(d, f) root = lxml.html.parse(filename, parser=parser).getroot() for div in root.find_class("bookInformation"): - link = div.find(".//a") - rating = div.find('.//span[@class="minirating"]') - description = div.find('./div[@class="bookDescription"]') - r = rating[0].tail.strip() - - cover = div.getnext().find(".//img").get("src") - - book = { - "title": link.text, - "url": link.get("href"), - "rating": r, - "r": float(r[:3]), - "cover": cover, - "authors": [a.text for a in div.find_class("authorName")], - } - if description is not None: - index = 1 if len(description) == 3 else 0 - book["description"] = description[index].text + book = parse_book_div(div) yield d, f, book continue @@ -61,7 +77,7 @@ def iter_books(): continue print(line) m = re_tip.search(line) - tip = m.group(1).decode("unicode_escape").replace("\/", "/") + tip = m.group(1).decode("unicode_escape").replace(r"\/", "/") # tip = m.group(1) # .replace('\/', '/') # print(tip) if '