Compare commits

..

No commits in common. "99844005d67ac36ecc5c4bff174a286ed61b00d9" and "4ac90301d5103c0ded1c8b04c3c0578811d9f052" have entirely different histories.

2 changed files with 106 additions and 144 deletions

33
get.py
View file

@@ -1,16 +1,13 @@
#!/usr/bin/python3 #!/usr/bin/python3
"""Download shelves from goodreads.""" import requests
from http.cookiejar import LWPCookieJar
import os import os
import re import re
from http.cookiejar import LWPCookieJar import lxml.html
from random import shuffle from random import shuffle
from time import sleep from time import sleep
import lxml.html
import requests
re_recommend = re.compile( re_recommend = re.compile(
' <a class="actionLinkLite " href="(/recommendations/([^/]*?)/([^/]*?))">' ' <a class="actionLinkLite " href="(/recommendations/([^/]*?)/([^/]*?))">'
) )
@@ -23,11 +20,10 @@ cookie_file = os.path.join(cookie_dir, "goodreads")
cj = LWPCookieJar(cookie_file) cj = LWPCookieJar(cookie_file)
if os.path.exists(cookie_file): if os.path.exists(cookie_file):
cj.load() cj.load()
s.cookies = cj # type: ignore s.cookies = cj
def login() -> None: def login():
"""Login."""
sign_in_page = "https://www.goodreads.com/user/sign_in" sign_in_page = "https://www.goodreads.com/user/sign_in"
page = s.get(sign_in_page).text page = s.get(sign_in_page).text
open("sign_in.html", "w").write(page) open("sign_in.html", "w").write(page)
@@ -37,12 +33,9 @@ def login() -> None:
re_token = re.compile( re_token = re.compile(
'<input type="hidden" name="authenticity_token" value="([^"]*?)" />' '<input type="hidden" name="authenticity_token" value="([^"]*?)" />'
) )
re_n = re.compile(r"<input name='n' type='hidden' value='(\d+)'>") re_n = re.compile("<input name='n' type='hidden' value='(\d+)'>")
m_n = re_n.search(page)
m_token = re_token.search(page)
assert m_token and m_n token = re_token.search(page).group(1)
token = m_token.group(1)
data = { data = {
"utf8": "\u2713", "utf8": "\u2713",
@@ -51,7 +44,7 @@ def login() -> None:
"user[password]": "8V8~9:3~U!Ly", "user[password]": "8V8~9:3~U!Ly",
"remember_me": 1, "remember_me": 1,
"next": "Sign in", "next": "Sign in",
"n": m_n.group(1), "n": re_n.search(page).group(1),
} }
print(token) print(token)
@@ -69,8 +62,7 @@ def login() -> None:
cj.save(ignore_discard=True) cj.save(ignore_discard=True)
def get_index() -> None: def get_index():
"""Get index."""
# url = 'https://www.goodreads.com/recommendations' # url = 'https://www.goodreads.com/recommendations'
url = "https://www.goodreads.com/recommendations/?recs_current_view=list" url = "https://www.goodreads.com/recommendations/?recs_current_view=list"
@@ -79,7 +71,6 @@ def get_index() -> None:
def get_individual(): def get_individual():
"""Get individual page."""
for line in open("recommendations.html"): for line in open("recommendations.html"):
if "actionLinkLite" not in line: if "actionLinkLite" not in line:
continue continue
@@ -88,8 +79,6 @@ def get_individual():
yield m.groups() yield m.groups()
def main() -> None:
"""Login and download shelves."""
# art = 'https://www.goodreads.com/recommendations/genre/art' # art = 'https://www.goodreads.com/recommendations/genre/art'
login() login()
get_index() get_index()
@@ -106,7 +95,3 @@ def main() -> None:
filename = os.path.join(b, c + ".html") filename = os.path.join(b, c + ".html")
open(filename, "w").write(r.text) open(filename, "w").write(r.text)
sleep(0.5) sleep(0.5)
if __name__ == "__main__":
main()

View file

@@ -1,28 +1,26 @@
#!/usr/bin/python3 #!/usr/bin/python3
import json
import os import os
import re import re
import sys
import typing
from time import sleep
import jinja2
import lxml.html import lxml.html
import jinja2
import json
import sys
import requests import requests
from time import sleep
from pprint import pprint
parser = lxml.html.HTMLParser(encoding="utf-8") parser = lxml.html.HTMLParser(encoding="utf-8")
env = jinja2.Environment(loader=jinja2.FileSystemLoader("templates")) env = jinja2.Environment(loader=jinja2.FileSystemLoader("templates"))
re_book = re.compile( re_book = re.compile(
r"function refreshGroupBox(group_id, book_id) \{(.*?)\n *\}", re.DOTALL "function refreshGroupBox(group_id, book_id) \{(.*?)\n *\}", re.DOTALL
) )
re_book = re.compile( re_book = re.compile(
r"function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}", re.DOTALL "function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}", re.DOTALL
) )
re_tip = re.compile(r'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {') re_tip = re.compile(rb'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {')
dirs = ["shelf", "genre"] dirs = ["shelf", "genre"]
start = "https://www.goodreads.com/book/show/" start = "https://www.goodreads.com/book/show/"
@@ -30,44 +28,30 @@ start = "https://www.goodreads.com/book/show/"
existing = {book["title"] for book in json.load(open("calibre_book_list.json"))} existing = {book["title"] for book in json.load(open("calibre_book_list.json"))}
def parse_book_div(div: lxml.html.HtmlElement) -> dict[str, typing.Any]:
    """Extract one book's details from a book-information ``div``.

    The element is expected to contain a link to the book, a
    ``span.minirating`` whose first child is followed by the rating text,
    and to be immediately followed by a sibling element holding the cover
    ``<img>``.

    Args:
        div: The lxml element describing a single book.

    Returns:
        A dict with the keys ``title``, ``url``, ``rating`` (raw rating
        text), ``r`` (numeric rating parsed from the first three characters
        of that text), ``cover`` (the image ``src``) and ``authors`` (text of
        every ``authorName`` element).  ``description`` is added only when a
        non-empty ``bookDescription`` child div is present.

    Raises:
        ValueError: If a required element is missing, or if the rating text
            does not start with a number.
    """
    # Explicit checks instead of ``assert``: assertions are stripped when
    # Python runs with -O, which would turn malformed markup into an
    # obscure AttributeError/TypeError further down.
    link = div.find(".//a")
    if link is None:
        raise ValueError("book div contains no <a> link")

    rating = div.find('.//span[@class="minirating"]')
    # Use len() rather than truthiness; indexing a childless span would
    # otherwise raise a bare IndexError.
    if rating is None or len(rating) == 0:
        raise ValueError("book div has no usable minirating span")
    rating_text = rating[0].tail
    if not rating_text:
        raise ValueError("minirating span carries no rating text")

    div_next = div.getnext()
    if div_next is None:
        raise ValueError("book div has no following sibling for the cover")
    cover_img = div_next.find(".//img")
    if cover_img is None:
        raise ValueError("no cover <img> found after book div")

    book: dict[str, typing.Any] = {
        "title": link.text,
        "url": link.get("href"),
        "rating": rating_text,
        # The numeric rating is taken from the first three characters.
        "r": float(rating_text[:3].strip()),
        "cover": cover_img.get("src"),
        "authors": [a.text for a in div.find_class("authorName")],
    }

    description = div.find('./div[@class="bookDescription"]')
    if description is not None and len(description) > 0:
        # With exactly three children the text is read from the second
        # child; otherwise from the first.
        index = 1 if len(description) == 3 else 0
        book["description"] = description[index].text
    return book
def iter_books(): def iter_books():
for d in dirs: for d in dirs:
for f in sorted(os.listdir(d)): for f in sorted(os.listdir(d)):
filename = os.path.join(d, f) filename = os.path.join(d, f)
root = lxml.html.parse(filename, parser=parser).getroot() root = lxml.html.parse(filename, parser=parser).getroot()
for div in root.find_class("bookInformation"): for div in root.find_class("bookInformation"):
book = parse_book_div(div) link = div.find(".//a")
rating = div.find('.//span[@class="minirating"]')
description = div.find('./div[@class="bookDescription"]')
r = rating[0].tail.strip()
cover = div.getnext().find(".//img").get("src")
book = {
"title": link.text,
"url": link.get("href"),
"rating": r,
"r": float(r[:3]),
"cover": cover,
"authors": [a.text for a in div.find_class("authorName")],
}
if description is not None:
index = 1 if len(description) == 3 else 0
book["description"] = description[index].text
yield d, f, book yield d, f, book
continue continue
@@ -77,7 +61,7 @@ def iter_books():
continue continue
print(line) print(line)
m = re_tip.search(line) m = re_tip.search(line)
tip = m.group(1).decode("unicode_escape").replace(r"\/", "/") tip = m.group(1).decode("unicode_escape").replace("\/", "/")
# tip = m.group(1) # .replace('\/', '/') # tip = m.group(1) # .replace('\/', '/')
# print(tip) # print(tip)
if '<ul class="formatting_tips recommendation_tip">' in tip: if '<ul class="formatting_tips recommendation_tip">' in tip:
@@ -90,8 +74,6 @@ def iter_books():
yield (d, f, lxml.html.fromstring(tip)) yield (d, f, lxml.html.fromstring(tip))
def main() -> None:
"""Download books."""
template = env.get_template("books.html") template = env.get_template("books.html")
seen = set() seen = set()
books = [] books = []
@@ -152,10 +134,5 @@ def main() -> None:
for a in sorted(first_authors): for a in sorted(first_authors):
print(a) print(a)
# authors = u' OR '.join(u'"{}"'.format(a) for a in sorted(first_authors) # authors = u' OR '.join(u'"{}"'.format(a) for a in sorted(first_authors) if a not in {'Hugh Howey', 'Elizabeth Moon', 'Max Hastings'})
# if a not in {'Hugh Howey', 'Elizabeth Moon', 'Max Hastings'})
# print(authors.encode('utf-8')) # print(authors.encode('utf-8'))
if __name__ == "__main__":
main()