Split code into smaller functions

This commit is contained in:
Edward Betts 2024-04-17 09:55:24 +01:00
parent 4ac90301d5
commit c1cbfd5f79

View file

@ -1,26 +1,28 @@
#!/usr/bin/python3
import json
import os
import re
import lxml.html
import jinja2
import json
import sys
import requests
import typing
from time import sleep
from pprint import pprint
import jinja2
import lxml.html
import requests
parser = lxml.html.HTMLParser(encoding="utf-8")
env = jinja2.Environment(loader=jinja2.FileSystemLoader("templates"))
re_book = re.compile(
"function refreshGroupBox(group_id, book_id) \{(.*?)\n *\}", re.DOTALL
r"function refreshGroupBox(group_id, book_id) \{(.*?)\n *\}", re.DOTALL
)
re_book = re.compile(
"function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}", re.DOTALL
r"function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}", re.DOTALL
)
re_tip = re.compile(rb'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {')
re_tip = re.compile(r'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {')
dirs = ["shelf", "genre"]
start = "https://www.goodreads.com/book/show/"
@ -28,30 +30,44 @@ start = "https://www.goodreads.com/book/show/"
existing = {book["title"] for book in json.load(open("calibre_book_list.json"))}
def parse_book_div(div: lxml.html.HtmlElement) -> dict[str, typing.Any]:
    """Parse a ``bookInformation`` div into a book dict.

    Reads the first link in *div* (title and URL), the ``minirating`` span,
    the first image inside the element that follows *div* (cover), the
    ``authorName`` elements and, when present, the ``bookDescription`` child.

    Returns a dict with the keys ``title``, ``url``, ``rating`` (the rating
    text with surrounding whitespace removed), ``r`` (a float parsed from the
    first three characters of that text), ``cover`` and ``authors``, plus
    ``description`` when a description div exists.

    Raises AssertionError if the link, the rating span, the rating text, the
    following sibling or the cover image is missing, and ValueError if the
    rating text does not start with a number.
    """
    link = div.find(".//a")
    assert link is not None

    rating = div.find('.//span[@class="minirating"]')
    description = div.find('./div[@class="bookDescription"]')
    assert rating is not None and rating[0] is not None

    # The rating text is the tail of the span's first child element.
    rating_tail = rating[0].tail
    assert rating_tail
    # Strip *before* slicing: with leading whitespace in the tail, taking the
    # first three characters of the raw text would cut the number short
    # (" 4.25"[:3] -> " 4." -> 4.0 instead of 4.2).
    r = rating_tail.strip()

    # The cover image is looked up in the element right after the book div.
    div_next = div.getnext()
    assert div_next is not None
    cover_img = div_next.find(".//img")
    assert cover_img is not None
    cover = cover_img.get("src")

    book = {
        "title": link.text,
        "url": link.get("href"),
        "rating": r,
        "r": float(r[:3]),
        "cover": cover,
        "authors": [a.text for a in div.find_class("authorName")],
    }

    if description is not None:
        # A description with exactly three children keeps its text in the
        # second child; otherwise the first child is used.
        index = 1 if len(description) == 3 else 0
        book["description"] = description[index].text

    return book
def iter_books():
for d in dirs:
for f in sorted(os.listdir(d)):
filename = os.path.join(d, f)
root = lxml.html.parse(filename, parser=parser).getroot()
for div in root.find_class("bookInformation"):
link = div.find(".//a")
rating = div.find('.//span[@class="minirating"]')
description = div.find('./div[@class="bookDescription"]')
r = rating[0].tail.strip()
cover = div.getnext().find(".//img").get("src")
book = {
"title": link.text,
"url": link.get("href"),
"rating": r,
"r": float(r[:3]),
"cover": cover,
"authors": [a.text for a in div.find_class("authorName")],
}
if description is not None:
index = 1 if len(description) == 3 else 0
book["description"] = description[index].text
book = parse_book_div(div)
yield d, f, book
continue
@ -61,7 +77,7 @@ def iter_books():
continue
print(line)
m = re_tip.search(line)
tip = m.group(1).decode("unicode_escape").replace("\/", "/")
tip = m.group(1).decode("unicode_escape").replace(r"\/", "/")
# tip = m.group(1) # .replace('\/', '/')
# print(tip)
if '<ul class="formatting_tips recommendation_tip">' in tip:
@ -74,6 +90,8 @@ def iter_books():
yield (d, f, lxml.html.fromstring(tip))
def main() -> None:
"""Download books."""
template = env.get_template("books.html")
seen = set()
books = []
@ -134,5 +152,10 @@ sys.exit(0)
for a in sorted(first_authors):
print(a)
# authors = u' OR '.join(u'"{}"'.format(a) for a in sorted(first_authors) if a not in {'Hugh Howey', 'Elizabeth Moon', 'Max Hastings'})
# authors = u' OR '.join(u'"{}"'.format(a) for a in sorted(first_authors)
# if a not in {'Hugh Howey', 'Elizabeth Moon', 'Max Hastings'})
# print(authors.encode('utf-8'))
if __name__ == "__main__":
main()