Compare commits

...

2 commits

Author SHA1 Message Date
Edward Betts 99844005d6 Update code style 2024-04-17 10:02:26 +01:00
Edward Betts c1cbfd5f79 Split code into smaller functions 2024-04-17 09:55:24 +01:00
2 changed files with 144 additions and 106 deletions

59
get.py
View file

@@ -1,13 +1,16 @@
#!/usr/bin/python3
import requests
from http.cookiejar import LWPCookieJar
"""Download shelves from goodreads."""
import os
import re
import lxml.html
from http.cookiejar import LWPCookieJar
from random import shuffle
from time import sleep
import lxml.html
import requests
re_recommend = re.compile(
' <a class="actionLinkLite " href="(/recommendations/([^/]*?)/([^/]*?))">'
)
@@ -20,10 +23,11 @@ cookie_file = os.path.join(cookie_dir, "goodreads")
cj = LWPCookieJar(cookie_file)
if os.path.exists(cookie_file):
cj.load()
s.cookies = cj
s.cookies = cj # type: ignore
def login():
def login() -> None:
"""Login."""
sign_in_page = "https://www.goodreads.com/user/sign_in"
page = s.get(sign_in_page).text
open("sign_in.html", "w").write(page)
@@ -33,9 +37,12 @@ def login():
re_token = re.compile(
'<input type="hidden" name="authenticity_token" value="([^"]*?)" />'
)
re_n = re.compile("<input name='n' type='hidden' value='(\d+)'>")
re_n = re.compile(r"<input name='n' type='hidden' value='(\d+)'>")
m_n = re_n.search(page)
m_token = re_token.search(page)
token = re_token.search(page).group(1)
assert m_token and m_n
token = m_token.group(1)
data = {
"utf8": "\u2713",
@@ -44,7 +51,7 @@ def login():
"user[password]": "8V8~9:3~U!Ly",
"remember_me": 1,
"next": "Sign in",
"n": re_n.search(page).group(1),
"n": m_n.group(1),
}
print(token)
@@ -62,7 +69,8 @@ def login():
cj.save(ignore_discard=True)
def get_index():
def get_index() -> None:
"""Get index."""
# url = 'https://www.goodreads.com/recommendations'
url = "https://www.goodreads.com/recommendations/?recs_current_view=list"
@@ -71,6 +79,7 @@ def get_index():
def get_individual():
"""Get individual page."""
for line in open("recommendations.html"):
if "actionLinkLite" not in line:
continue
@@ -79,19 +88,25 @@ def get_individual():
yield m.groups()
# art = 'https://www.goodreads.com/recommendations/genre/art'
login()
get_index()
recommend_list = list(get_individual())
shuffle(recommend_list)
def main() -> None:
"""Login and download shelves."""
# art = 'https://www.goodreads.com/recommendations/genre/art'
login()
get_index()
recommend_list = list(get_individual())
shuffle(recommend_list)
headers = {"Accept": "text/html"}
headers = {"Accept": "text/html"}
for a, b, c in recommend_list:
print((b, c))
url = "https://www.goodreads.com" + a
for a, b, c in recommend_list:
print((b, c))
url = "https://www.goodreads.com" + a
r = s.get(url, headers=headers)
filename = os.path.join(b, c + ".html")
open(filename, "w").write(r.text)
sleep(0.5)
r = s.get(url, headers=headers)
filename = os.path.join(b, c + ".html")
open(filename, "w").write(r.text)
sleep(0.5)
if __name__ == "__main__":
main()

191
parse.py
View file

@@ -1,26 +1,28 @@
#!/usr/bin/python3
import json
import os
import re
import lxml.html
import jinja2
import json
import sys
import requests
import typing
from time import sleep
from pprint import pprint
import jinja2
import lxml.html
import requests
parser = lxml.html.HTMLParser(encoding="utf-8")
env = jinja2.Environment(loader=jinja2.FileSystemLoader("templates"))
re_book = re.compile(
"function refreshGroupBox(group_id, book_id) \{(.*?)\n *\}", re.DOTALL
r"function refreshGroupBox(group_id, book_id) \{(.*?)\n *\}", re.DOTALL
)
re_book = re.compile(
"function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}", re.DOTALL
r"function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}", re.DOTALL
)
re_tip = re.compile(rb'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {')
re_tip = re.compile(r'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {')
dirs = ["shelf", "genre"]
start = "https://www.goodreads.com/book/show/"
@@ -28,30 +30,44 @@ start = "https://www.goodreads.com/book/show/"
existing = {book["title"] for book in json.load(open("calibre_book_list.json"))}
def parse_book_div(div: lxml.html.HtmlElement) -> dict[str, typing.Any]:
"""Parse book div."""
link = div.find(".//a")
assert link is not None
rating = div.find('.//span[@class="minirating"]')
description = div.find('./div[@class="bookDescription"]')
assert rating is not None and rating[0] is not None
r = rating[0].tail
assert r
div_next = div.getnext()
assert div_next is not None
cover_img = div_next.find(".//img")
assert cover_img is not None
cover = cover_img.get("src")
book = {
"title": link.text,
"url": link.get("href"),
"rating": r,
"r": float(r[:3].strip()),
"cover": cover,
"authors": [a.text for a in div.find_class("authorName")],
}
if description is not None:
index = 1 if len(description) == 3 else 0
book["description"] = description[index].text
return book
def iter_books():
for d in dirs:
for f in sorted(os.listdir(d)):
filename = os.path.join(d, f)
root = lxml.html.parse(filename, parser=parser).getroot()
for div in root.find_class("bookInformation"):
link = div.find(".//a")
rating = div.find('.//span[@class="minirating"]')
description = div.find('./div[@class="bookDescription"]')
r = rating[0].tail.strip()
cover = div.getnext().find(".//img").get("src")
book = {
"title": link.text,
"url": link.get("href"),
"rating": r,
"r": float(r[:3]),
"cover": cover,
"authors": [a.text for a in div.find_class("authorName")],
}
if description is not None:
index = 1 if len(description) == 3 else 0
book["description"] = description[index].text
book = parse_book_div(div)
yield d, f, book
continue
@@ -61,7 +77,7 @@ def iter_books():
continue
print(line)
m = re_tip.search(line)
tip = m.group(1).decode("unicode_escape").replace("\/", "/")
tip = m.group(1).decode("unicode_escape").replace(r"\/", "/")
# tip = m.group(1) # .replace('\/', '/')
# print(tip)
if '<ul class="formatting_tips recommendation_tip">' in tip:
@@ -74,65 +90,72 @@ def iter_books():
yield (d, f, lxml.html.fromstring(tip))
template = env.get_template("books.html")
seen = set()
books = []
first_authors = set()
for d, f, book in sorted(iter_books(), key=lambda i: i[2]["r"], reverse=True):
# pprint(book)
# print(repr(book.get('description')))
# continue
def main() -> None:
"""Download books."""
template = env.get_template("books.html")
seen = set()
books = []
first_authors = set()
for d, f, book in sorted(iter_books(), key=lambda i: i[2]["r"], reverse=True):
# pprint(book)
# print(repr(book.get('description')))
# continue
# title_link = book.find_class('bookTitle')[0]
url = book["url"]
url = url[: url.find("?")]
if url in seen:
continue
seen.add(url)
title = book["title"]
authors = book["authors"]
first_authors.add(authors[0])
main_title = title
# for sep in ['(']:
for sep in ":", " - ", "(", ",":
if sep in title:
main_title = title[: title.find(sep)]
break
# print((main_title + ' by ' + u', '.join(authors)).encode('utf-8'))
if len(main_title) < 10:
continue
if main_title in existing:
continue
# print(u'{} by {}'.format(main_title, authors[0]).encode('utf-8'))
print("{}".format(main_title))
# print(main_title.encode('utf-8'))
assert url.startswith(start)
# title_link = book.find_class('bookTitle')[0]
url = book["url"]
url = url[: url.find("?")]
if url in seen:
continue
seen.add(url)
title = book["title"]
authors = book["authors"]
first_authors.add(authors[0])
main_title = title
# for sep in ['(']:
for sep in ":", " - ", "(", ",":
if sep in title:
main_title = title[: title.find(sep)]
break
# print((main_title + ' by ' + u', '.join(authors)).encode('utf-8'))
if len(main_title) < 10:
continue
if main_title in existing:
continue
# print(u'{} by {}'.format(main_title, authors[0]).encode('utf-8'))
print("{}".format(main_title))
# print(main_title.encode('utf-8'))
assert url.startswith(start)
filename = "books/" + url[len(start) :] + ".html"
# print(filename)
if False and not os.path.exists(filename):
open(filename, "w").write(requests.get(url).content)
sleep(1)
books.append(
{
"dir": d,
"file": f[:-5],
"title": title,
"main_title": main_title,
"authors": authors,
"url": url,
"rating": book["rating"],
"cover": book["cover"],
"description": book.get("description"),
}
)
filename = "books/" + url[len(start) :] + ".html"
# print(filename)
if False and not os.path.exists(filename):
open(filename, "w").write(requests.get(url).content)
sleep(1)
books.append(
{
"dir": d,
"file": f[:-5],
"title": title,
"main_title": main_title,
"authors": authors,
"url": url,
"rating": book["rating"],
"cover": book["cover"],
"description": book.get("description"),
}
)
page = template.render(books=books)
open("book_list.html", "w").write(page)
sys.exit(0)
page = template.render(books=books)
open("book_list.html", "w").write(page)
sys.exit(0)
for a in sorted(first_authors):
print(a)
for a in sorted(first_authors):
print(a)
# authors = u' OR '.join(u'"{}"'.format(a) for a in sorted(first_authors) if a not in {'Hugh Howey', 'Elizabeth Moon', 'Max Hastings'})
# print(authors.encode('utf-8'))
# authors = u' OR '.join(u'"{}"'.format(a) for a in sorted(first_authors)
# if a not in {'Hugh Howey', 'Elizabeth Moon', 'Max Hastings'})
# print(authors.encode('utf-8'))
if __name__ == "__main__":
main()