goodreads-backup/get.py

#!/usr/bin/python3

import requests
from http.cookiejar import LWPCookieJar
import os
import re
import lxml.html
from random import shuffle
from time import sleep

# Matches a line that starts with two spaces followed by an "actionLinkLite"
# anchor whose href is /recommendations/<first>/<second>.  The three groups
# are the whole href path, the first path segment and the second path segment.
_recommend_link_pattern = (
    '  <a class="actionLinkLite " href="(/recommendations/([^/]*?)/([^/]*?))">'
)
re_recommend = re.compile(_recommend_link_pattern)

# Cookie jar backed by a file on disk; cookies saved by an earlier run are
# loaded below when that file exists.
cookie_dir = "/home/edward/lib/cookies"
cookie_file = os.path.join(cookie_dir, "goodreads")
cj = LWPCookieJar(cookie_file)

# Single HTTP session shared by every request in this script; it stores its
# cookies in the jar above.
s = requests.Session()
s.cookies = cj

if os.path.exists(cookie_file):
    cj.load()


def login():
    """Sign in to Goodreads with the shared session unless already signed in.

    Fetches the sign-in page and writes it to ``sign_in.html``.  If that page
    already contains the account-name marker, nothing is posted and the
    function returns.  Otherwise the hidden ``authenticity_token`` and ``n``
    form fields are extracted, the sign-in form is posted, the response is
    written to ``signed_in.html``, the text of the first ``flash`` element (if
    any) is printed, and the cookie jar is saved to disk.

    Raises:
        RuntimeError: if a required hidden form field is not found in the
            sign-in page.
    """
    sign_in_page = "https://www.goodreads.com/user/sign_in"
    page = s.get(sign_in_page).text
    # Keep a copy of the page on disk for inspection.
    with open("sign_in.html", "w") as out:
        out.write(page)
    if '"name":"Edward Betts"' in page:
        return  # already signed in

    re_token = re.compile(
        '<input type="hidden" name="authenticity_token" value="([^"]*?)" />'
    )
    # Raw string: "\d" inside a plain string literal is an invalid escape.
    re_n = re.compile(r"<input name='n' type='hidden' value='(\d+)'>")

    token_match = re_token.search(page)
    if token_match is None:
        raise RuntimeError(
            "authenticity_token field not found in sign-in page (see sign_in.html)"
        )
    n_match = re_n.search(page)
    if n_match is None:
        raise RuntimeError("'n' field not found in sign-in page (see sign_in.html)")

    token = token_match.group(1)

    # NOTE(review): credentials are hard-coded in source; they should be moved
    # to a config file or environment variable and the password rotated.
    data = {
        "utf8": "\u2713",
        "authenticity_token": token,
        "user[email]": "edward@4angle.com",
        "user[password]": "8V8~9:3~U!Ly",
        "remember_me": 1,
        "next": "Sign in",
        "n": n_match.group(1),
    }

    print(token)
    print(data["n"])

    r = s.post(sign_in_page, data=data, headers={"referer": sign_in_page})

    # Keep a copy of the response on disk for inspection.
    with open("signed_in.html", "w") as out:
        out.write(r.text)

    root = lxml.html.fromstring(r.content)
    flash = root.find_class("flash")
    if flash:
        print("flash:", flash[0].text)

    # ignore_discard=True also writes cookies marked to be discarded at the
    # end of the session.
    cj.save(ignore_discard=True)


def get_index():
    """Download the recommendations list page to ``recommendations.html``.

    The page is fetched with the shared session and its text is written to
    the current working directory, replacing any existing file.
    """
    # url = 'https://www.goodreads.com/recommendations'
    url = "https://www.goodreads.com/recommendations/?recs_current_view=list"

    r = s.get(url)
    # Context manager so the file is flushed and closed deterministically.
    with open("recommendations.html", "w") as out:
        out.write(r.text)


def get_individual():
    """Yield the regex groups of each recommendation link in the saved index.

    Reads ``recommendations.html`` line by line and, for every line matched
    by ``re_recommend``, yields the tuple returned by ``groups()``.
    """
    # The file stays open only while the generator is being consumed and is
    # closed when the generator finishes or is closed.
    with open("recommendations.html") as index:
        for line in index:
            # Cheap substring test before running the regex.
            if "actionLinkLite" not in line:
                continue
            m = re_recommend.match(line)
            if m:
                yield m.groups()


# Script body: sign in, download the index page, then fetch every linked
# recommendations page in random order and save it as <b>/<c>.html.
# art = 'https://www.goodreads.com/recommendations/genre/art'
login()
get_index()
recommend_list = list(get_individual())
shuffle(recommend_list)

headers = {"Accept": "text/html"}

for a, b, c in recommend_list:
    print((b, c))
    url = "https://www.goodreads.com" + a

    r = s.get(url, headers=headers)
    # Create the target directory on first use; an empty first segment means
    # the file goes in the current directory, which needs no creation.
    if b:
        os.makedirs(b, exist_ok=True)
    filename = os.path.join(b, c + ".html")
    with open(filename, "w") as out:
        out.write(r.text)
    # Pause between requests.
    sleep(0.5)
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00			`#!/usr/bin/python3`

			`import requests`
			`from http.cookiejar import LWPCookieJar`
			`import os`
			`import re`
			`import lxml.html`
			`from random import shuffle`
			`from time import sleep`

Reformat code with black 2024-04-17 09:12:11 +01:00			`re_recommend = re.compile(`
			`'  <a class="actionLinkLite " href="(/recommendations/([^/]*?)/([^/]*?))">'`
			`)`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00
			`s = requests.Session()`

Reformat code with black 2024-04-17 09:12:11 +01:00			`cookie_dir = "/home/edward/lib/cookies"`
			`cookie_file = os.path.join(cookie_dir, "goodreads")`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00
			`cj = LWPCookieJar(cookie_file)`
			`if os.path.exists(cookie_file):`
			`cj.load()`
			`s.cookies = cj`

Reformat code with black 2024-04-17 09:12:11 +01:00
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00			`def login():`
Reformat code with black 2024-04-17 09:12:11 +01:00			`sign_in_page = "https://www.goodreads.com/user/sign_in"`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00			`page = s.get(sign_in_page).text`
Reformat code with black 2024-04-17 09:12:11 +01:00			`open("sign_in.html", "w").write(page)`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00			`if '"name":"Edward Betts"' in page:`
			`return # already signed in`

Reformat code with black 2024-04-17 09:12:11 +01:00			`re_token = re.compile(`
			`'<input type="hidden" name="authenticity_token" value="([^"]*?)" />'`
			`)`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00			`re_n = re.compile("<input name='n' type='hidden' value='(\d+)'>")`

			`token = re_token.search(page).group(1)`

			`data = {`
Reformat code with black 2024-04-17 09:12:11 +01:00			`"utf8": "\u2713",`
			`"authenticity_token": token,`
			`"user[email]": "edward@4angle.com",`
			`"user[password]": "8V8~9:3~U!Ly",`
			`"remember_me": 1,`
			`"next": "Sign in",`
			`"n": re_n.search(page).group(1),`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00			`}`

			`print(token)`
Reformat code with black 2024-04-17 09:12:11 +01:00			`print(data["n"])`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00
Reformat code with black 2024-04-17 09:12:11 +01:00			`r = s.post(sign_in_page, data=data, headers={"referer": sign_in_page})`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00
Reformat code with black 2024-04-17 09:12:11 +01:00			`open("signed_in.html", "w").write(r.text)`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00
			`root = lxml.html.fromstring(r.content)`
Reformat code with black 2024-04-17 09:12:11 +01:00			`flash = root.find_class("flash")`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00			`if flash:`
Reformat code with black 2024-04-17 09:12:11 +01:00			`print("flash:", flash[0].text)`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00
			`cj.save(ignore_discard=True)`

Reformat code with black 2024-04-17 09:12:11 +01:00
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00			`def get_index():`
			`# url = 'https://www.goodreads.com/recommendations'`
Reformat code with black 2024-04-17 09:12:11 +01:00			`url = "https://www.goodreads.com/recommendations/?recs_current_view=list"`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00
			`r = s.get(url)`
Reformat code with black 2024-04-17 09:12:11 +01:00			`open("recommendations.html", "w").write(r.text)`

Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00
			`def get_individual():`
Reformat code with black 2024-04-17 09:12:11 +01:00			`for line in open("recommendations.html"):`
			`if "actionLinkLite" not in line:`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00			`continue`
			`m = re_recommend.match(line)`
			`if m:`
			`yield m.groups()`


			`# art = 'https://www.goodreads.com/recommendations/genre/art'`
			`login()`
			`get_index()`
			`recommend_list = list(get_individual())`
			`shuffle(recommend_list)`

Reformat code with black 2024-04-17 09:12:11 +01:00			`headers = {"Accept": "text/html"}`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00
			`for a, b, c in recommend_list:`
			`print((b, c))`
Reformat code with black 2024-04-17 09:12:11 +01:00			`url = "https://www.goodreads.com" + a`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00
			`r = s.get(url, headers=headers)`
Reformat code with black 2024-04-17 09:12:11 +01:00			`filename = os.path.join(b, c + ".html")`
			`open(filename, "w").write(r.text)`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00			`sleep(0.5)`