Skip to content

Commit 49909fd

Browse files
feat: adjust project structure and add caching support for the cookie (#11)
* chore: move crawling to a dedicated package to improve the project structure * fix: make time adjustment independent of download * feat: add cookie validation and cookie caching * chore: adjust warning to reflect low download quota issue better * chore: clarify cookie authentication details in README * fix: allow cookie via command line flag * chore: add qa to makefile * chore: fix workflow qa make target * chore: adjust spelling
1 parent 824817c commit 49909fd

File tree

11 files changed

+407
-194
lines changed

11 files changed

+407
-194
lines changed

.github/workflows/qa.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ jobs:
2626
uses: actions/setup-go@v5
2727
with:
2828
go-version-file: ./go.mod
29+
- name: Run QA
30+
run:
31+
make qa
2932
- name: Build artifacts
3033
env:
3134
GOMODCACHE: /tmp/go/pkg/mod

.github/workflows/release.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ jobs:
2828
- name: Get next version
2929
id: get_next_version
3030
uses: thenativeweb/get-next-version@2.6.3
31+
- name: Run QA
32+
run:
33+
make qa
3134
- name: Create new version
3235
if: ${{ steps.get_next_version.outputs.hasNextVersion == 'true' }}
3336
run: |

Makefile

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,14 @@ build:
1111
@rm -rf build/
1212

1313
@GOOS=windows GOARCH=amd64 go build -o ./build/patreon-crawler.exe -ldflags "-X main.version=$(BUILD_VERSION)" ./main.go
14-
1514
@GOOS=linux GOARCH=amd64 go build -o ./build/patreon-crawler -ldflags "-X main.version=$(BUILD_VERSION)" ./main.go
1615

17-
.PHONY: build
16+
qa: analyze
17+
18+
analyze:
19+
@go vet
20+
@go run honnef.co/go/tools/cmd/staticcheck@latest --checks=all
21+
22+
.PHONY: build \
23+
analyze \
24+
qa

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ The simplest way to extract the cookies is by visiting the Patreon home page and
2121
3. Find a request starting with `current_user?include=....`. Click it and scroll to the `Request Headers` section.
2222
4. Copy the `Cookie` value to your clipboard.
2323

24+
> To authenticate against `patreon.com` it's actually sufficient to copy the `session_id=<id>` cookie, however, passing the entire cookie string won't affect functionality.
25+
2426
### Using the tool
2527

2628
To download all media from a creator, run the following command.

crawling/crawl_creator.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
package crawling
2+
3+
import (
4+
"fmt"
5+
6+
"patreon-crawler/patreon"
7+
"patreon-crawler/patreon/api"
8+
)
9+
10+
func CrawlCreator(apiClient *api.Client, creatorID string, downloadDir string, downloadInaccessibleMedia bool, groupingStrategy GroupingStrategy, downloadLimit int) error {
11+
12+
creatorDownloadDir := fmt.Sprintf("%s/%s", downloadDir, sanitizeFilename(creatorID))
13+
14+
fmt.Printf("Downloading posts from %s to %s\n", creatorID, creatorDownloadDir)
15+
16+
client, err := patreon.NewClient(apiClient, creatorID)
17+
if err != nil {
18+
return err
19+
}
20+
21+
err = CrawlPosts(client, creatorDownloadDir, downloadInaccessibleMedia, groupingStrategy, downloadLimit)
22+
if err != nil {
23+
return err
24+
}
25+
26+
return nil
27+
}

crawling/crawl_posts.go

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
package crawling
2+
3+
import (
4+
"fmt"
5+
"io"
6+
"net/http"
7+
"os"
8+
"regexp"
9+
"slices"
10+
"strings"
11+
12+
"patreon-crawler/patreon"
13+
)
14+
15+
// GroupingStrategy determines how downloaded media files are organized
// on disk.
type GroupingStrategy string

const (
	// GroupingStrategyNone places all media files in one flat directory.
	GroupingStrategyNone GroupingStrategy = "none"
	// GroupingStrategyByPost places each post's media in a subdirectory
	// named after the post title.
	GroupingStrategyByPost GroupingStrategy = "by-post"
)

// fileNameWindowsReservedNames are base names that Windows refuses as
// file names, regardless of case.
var fileNameWindowsReservedNames = []string{
	"CON", "PRN", "AUX", "NUL",
	"COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9",
	"LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9",
}

// fileNameInvalidChars matches characters rejected in Windows file
// names, plus NUL (also invalid on POSIX).
var fileNameInvalidChars = regexp.MustCompile(`[<>:"/\\|?*\x00]`)

// sanitizeFilename makes name safe to use as a single file or directory
// name across Windows, macOS and Linux: invalid characters become '_',
// trailing spaces and dots are trimmed (Windows strips them), reserved
// Windows device names get a '_' prefix, and overlong names are cut to
// at most 255 bytes without splitting a multi-byte UTF-8 rune.
// An empty result is replaced by "_".
func sanitizeFilename(name string) string {
	name = fileNameInvalidChars.ReplaceAllString(name, "_")
	name = strings.TrimRight(name, " .")
	upper := strings.ToUpper(name)
	if slices.Contains(fileNameWindowsReservedNames, upper) {
		name = "_" + name
	}
	if len(name) > 255 {
		cut := 255
		// Back up until the cut lands on a rune start; UTF-8
		// continuation bytes have the form 10xxxxxx (0x80..0xBF).
		// A plain name[:255] could split a rune and yield an
		// invalid-UTF-8 file name.
		for cut > 0 && name[cut]&0xC0 == 0x80 {
			cut--
		}
		name = name[:cut]
	}
	if name == "" {
		name = "_"
	}
	return name
}
45+
46+
func getMediaFile(downloadDirectory string, media patreon.Media) (string, error) {
47+
extension, err := getFileExtension(media.MimeType)
48+
if err != nil {
49+
return "", err
50+
}
51+
return fmt.Sprintf("%s/%s.%s", downloadDirectory, media.ID, extension), nil
52+
}
53+
54+
// getFileExtension derives a file extension from a mime type by taking
// the subtype, e.g. "image/png" -> "png". This is a quick-and-dirty
// mapping, but sufficient here. Strings that are not exactly
// "<type>/<subtype>" are rejected.
func getFileExtension(mimeType string) (string, error) {
	_, subtype, found := strings.Cut(mimeType, "/")
	if !found || strings.Contains(subtype, "/") {
		return "", fmt.Errorf("invalid mime type: %s", mimeType)
	}
	return subtype, nil
}
62+
63+
func downloadMedia(media patreon.Media, downloadDir string) error {
64+
downloadedFilePath, err := getMediaFile(downloadDir, media)
65+
if err != nil {
66+
return err
67+
}
68+
69+
_, err = os.Stat(downloadedFilePath)
70+
if err == nil {
71+
fmt.Printf("\t- skipped %s (already downloaded)\n", media.ID)
72+
return nil
73+
}
74+
75+
response, err := http.Get(media.DownloadURL)
76+
if err != nil {
77+
return fmt.Errorf("failed to download media: %w", err)
78+
}
79+
defer response.Body.Close()
80+
81+
if response.StatusCode != http.StatusOK {
82+
return fmt.Errorf("failed to download media: %s", response.Status)
83+
}
84+
85+
tempDownloadFilePath := downloadedFilePath + ".tmp"
86+
87+
out, err := os.Create(tempDownloadFilePath)
88+
if err != nil {
89+
return fmt.Errorf("failed to create file: %w", err)
90+
}
91+
defer out.Close()
92+
93+
_, err = io.Copy(out, response.Body)
94+
if err != nil {
95+
return fmt.Errorf("failed to write file: %w", err)
96+
}
97+
98+
out.Close()
99+
100+
err = os.Rename(tempDownloadFilePath, downloadedFilePath)
101+
if err != nil {
102+
return fmt.Errorf("failed to rename file: %w", err)
103+
}
104+
105+
fmt.Printf("\t- saved %s\n", media.ID)
106+
return nil
107+
}
108+
109+
func downloadPost(downloadDirectory string, post patreon.Post) error {
110+
if len(post.Media) == 0 {
111+
return nil
112+
}
113+
114+
err := os.MkdirAll(downloadDirectory, 0700)
115+
if err != nil {
116+
return fmt.Errorf("failed to create download directory: %w", err)
117+
}
118+
119+
for _, media := range post.Media {
120+
err := downloadMedia(media, downloadDirectory)
121+
if err != nil {
122+
return err
123+
}
124+
}
125+
return nil
126+
}
127+
128+
func adjustPostsFileTime(downloadDirectory string, post patreon.Post) error {
129+
if len(post.Media) == 0 {
130+
return nil
131+
}
132+
for _, media := range post.Media {
133+
file, err := getMediaFile(downloadDirectory, media)
134+
if err != nil {
135+
return err
136+
}
137+
138+
_, err = os.Stat(file)
139+
if err != nil {
140+
// File does not exist
141+
return nil
142+
}
143+
144+
err = os.Chtimes(file, post.PublishedAt, post.PublishedAt)
145+
if err != nil {
146+
return err
147+
}
148+
}
149+
150+
return nil
151+
}
152+
153+
func getDownloadDir(baseDownloadDir, postTitle string, groupingStrategy GroupingStrategy) (string, error) {
154+
switch groupingStrategy {
155+
case GroupingStrategyByPost:
156+
return fmt.Sprintf("%s/%s", baseDownloadDir, sanitizeFilename(postTitle)), nil
157+
case GroupingStrategyNone:
158+
return baseDownloadDir, nil
159+
default:
160+
return "", fmt.Errorf("invalid grouping strategy")
161+
}
162+
}
163+
164+
func CrawlPosts(client *patreon.Client, baseDownloadDir string, downloadInaccessibleMedia bool, groupingStrategy GroupingStrategy, downloadLimit int) error {
165+
posts := client.Posts()
166+
167+
postsDownloaded := 0
168+
postsCrawled := 0
169+
for post, err := range posts {
170+
if err != nil {
171+
return err
172+
}
173+
174+
if postsDownloaded >= downloadLimit && downloadLimit != 0 {
175+
break
176+
}
177+
178+
if len(post.Media) == 0 {
179+
fmt.Printf("[%d] Skipping post with no media '%s'\n", postsDownloaded, post.Title)
180+
continue
181+
}
182+
downloadDir, err := getDownloadDir(baseDownloadDir, post.Title, groupingStrategy)
183+
if err != nil {
184+
return err
185+
}
186+
187+
postsCrawled++
188+
189+
if post.CurrentUserCanView || downloadInaccessibleMedia {
190+
fmt.Printf("[%d] Saving post '%s'\n", postsDownloaded, post.Title)
191+
192+
err := downloadPost(downloadDir, post)
193+
if err != nil {
194+
return err
195+
}
196+
197+
postsDownloaded++
198+
}
199+
200+
// This can be done independent to downloading
201+
err = adjustPostsFileTime(downloadDir, post)
202+
if err != nil {
203+
return err
204+
}
205+
}
206+
if postsCrawled == 0 {
207+
fmt.Println("No posts found")
208+
}
209+
210+
downloadedFraction := float64(postsDownloaded) / float64(postsCrawled)
211+
if downloadedFraction < 0.8 {
212+
if postsDownloaded == 0 {
213+
fmt.Printf("Warning: No posts were downloaded. Did you provide the correct creator ID?\n")
214+
} else {
215+
fmt.Printf("Warning: Only %f%% of posts were downloaded. Did you provide correct creator ID?\n", downloadedFraction*100)
216+
}
217+
}
218+
219+
return nil
220+
}

0 commit comments

Comments
 (0)