Skip to content

Commit de232e7

Browse files
feat: add concurrency option for media downloading (#27)
* chore: add crawler struct
* feat: add concurrency support for media downloads
* docs: update README to include concurrency option for downloads
* feat: enhance media download concurrency and error handling
* chore: validate download and concurrency limits in crawler initialization
1 parent 2986160 commit de232e7

File tree

8 files changed

+481
-211
lines changed

8 files changed

+481
-211
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,4 +49,5 @@ The `patreon-crawler` supports the following command line flags.
4949
| `--download-limit <number>` | The maximum number of posts to download. |
5050
| `--download-inaccessible-media` | Whether to download media that is inaccessible (blurred images) |
5151
| `--grouping <none \| by-post>` | The strategy for grouping post media into folders. <br>`none` - Puts all media into the same folder (per creator)<br>`by-post` - Creates a folder for each post, containing its media |
52+
| `--concurrency <number>` | The number of concurrent downloads to perform (default `3`) |
5253

crawling/crawl_creator.go

Lines changed: 0 additions & 26 deletions
This file was deleted.

crawling/crawl_posts.go

Lines changed: 0 additions & 159 deletions
This file was deleted.

crawling/crawler.go

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
package crawling
2+
3+
import (
	"errors"
	"fmt"
	"io/fs"
	"iter"
	"os"
	"regexp"
	"slices"
	"strings"
	"time"
	"unicode/utf8"

	"patreon-crawler/crawling/download"
	"patreon-crawler/patreon"
	"patreon-crawler/patreon/api"
	"patreon-crawler/queue"
)
17+
18+
// GroupingStrategy selects how downloaded media is organized into folders.
type GroupingStrategy string

const (
	// GroupingStrategyNone puts all media into the same folder (per creator).
	GroupingStrategyNone GroupingStrategy = "none"
	// GroupingStrategyByPost creates one folder per post, containing its media.
	GroupingStrategyByPost GroupingStrategy = "by-post"
)
24+
25+
// fileNameWindowsReservedNames lists device names that Windows refuses as
// file names, case-insensitively and even when followed by an extension
// (e.g. "CON.txt" is just as invalid as "CON").
var fileNameWindowsReservedNames = []string{
	"CON", "PRN", "AUX", "NUL",
	"COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9",
	"LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9",
}

// fileNameInvalidChars matches characters that are invalid in Windows file
// names (plus NUL, which is invalid on every platform).
var fileNameInvalidChars = regexp.MustCompile(`[<>:"/\\|?*\x00]`)

// sanitizeFilename converts an arbitrary string into a name that is safe to
// use as a file or directory name across Windows, macOS, and Linux:
// invalid characters become "_", trailing spaces and dots are stripped,
// reserved Windows device names are prefixed with "_", and the result is
// truncated to at most 255 bytes without splitting a UTF-8 rune.
// An input that sanitizes to the empty string yields "_".
func sanitizeFilename(name string) string {
	name = fileNameInvalidChars.ReplaceAllString(name, "_")
	name = strings.TrimRight(name, " .")

	// Windows reserves device names both bare ("CON") and with any
	// extension ("CON.txt"), so compare the part before the first dot.
	base, _, _ := strings.Cut(strings.ToUpper(name), ".")
	if slices.Contains(fileNameWindowsReservedNames, base) {
		name = "_" + name
	}

	// Most file systems limit a name to 255 bytes. Cut on a rune boundary
	// so the result stays valid UTF-8 (a plain name[:255] could split a
	// multi-byte rune).
	if len(name) > 255 {
		cut := 255
		for cut > 0 && !utf8.RuneStart(name[cut]) {
			cut--
		}
		name = name[:cut]
	}

	if name == "" {
		name = "_"
	}
	return name
}
48+
49+
func adjustMediaFileTime(downloadDirectory string, media patreon.Media, publishedAt time.Time) error {
50+
file, err := download.GetMediaFile(downloadDirectory, media)
51+
if err != nil {
52+
return err
53+
}
54+
55+
_, err = os.Stat(file)
56+
if err != nil {
57+
// File does not exist
58+
return nil
59+
}
60+
61+
err = os.Chtimes(file, publishedAt, publishedAt)
62+
if err != nil {
63+
return err
64+
}
65+
66+
return nil
67+
}
68+
69+
func getDownloadDir(baseDownloadDir, postTitle string, groupingStrategy GroupingStrategy) (string, error) {
70+
switch groupingStrategy {
71+
case GroupingStrategyByPost:
72+
return fmt.Sprintf("%s/%s", baseDownloadDir, sanitizeFilename(postTitle)), nil
73+
case GroupingStrategyNone:
74+
return baseDownloadDir, nil
75+
default:
76+
return "", fmt.Errorf("invalid grouping strategy")
77+
}
78+
}
79+
80+
// mediaPair ties a media item to the post it belongs to, so a queued
// download knows which post's title and publish time to use.
type mediaPair struct {
	post  patreon.Post
	media patreon.Media
}
84+
85+
// Crawler downloads media from Patreon creators according to the configured
// grouping strategy, download limit, and concurrency settings.
type Crawler struct {
	apiClient *api.Client
	// downloadInaccessibleMedia also downloads media the current user
	// cannot view (blurred images).
	downloadInaccessibleMedia bool
	groupingStrategy          GroupingStrategy
	// downloadLimit caps how many media items are queued for download;
	// values <= 0 mean unlimited.
	downloadLimit int
	// concurrencyLimit is the number of simultaneous downloads.
	concurrencyLimit int
}
92+
93+
func NewCrawler(apiClient *api.Client, downloadInaccessibleMedia bool, groupingStrategy GroupingStrategy, downloadLimit int, concurrencyLimit int) *Crawler {
94+
return &Crawler{
95+
apiClient: apiClient,
96+
downloadInaccessibleMedia: downloadInaccessibleMedia,
97+
groupingStrategy: groupingStrategy,
98+
downloadLimit: downloadLimit,
99+
concurrencyLimit: concurrencyLimit,
100+
}
101+
}
102+
103+
// enumerateMedia flattens a sequence of posts into a sequence of
// (post, media) pairs. Errors from the post sequence are forwarded to the
// consumer with a zero mediaPair (enumeration continues after a forwarded
// error). Posts the current user cannot view are skipped unless
// downloadInaccessibleMedia is enabled.
func (c *Crawler) enumerateMedia(posts iter.Seq2[patreon.Post, error]) iter.Seq2[mediaPair, error] {
	return func(yield func(mediaPair, error) bool) {
		for post, err := range posts {
			if err != nil {
				// Forward the error; stop entirely if the consumer
				// has broken out of its range loop.
				if !yield(mediaPair{}, err) {
					return
				}
				continue
			}

			if !post.CurrentUserCanView && !c.downloadInaccessibleMedia {
				continue
			}

			for _, media := range post.Media {
				if !yield(mediaPair{post: post, media: media}, nil) {
					return
				}
			}
		}
	}
}
125+
126+
func (c *Crawler) downloadMedia(media patreon.Media, baseDownloadDir string, parentPost patreon.Post) error {
127+
downloadDir, err := getDownloadDir(baseDownloadDir, parentPost.Title, c.groupingStrategy)
128+
if err != nil {
129+
return err
130+
}
131+
132+
reportItem := download.Media(media, downloadDir)
133+
switch item := reportItem.(type) {
134+
case *download.ReportErrorItem:
135+
fmt.Printf("\t[error] %s from `%s`: %s\n", item.Media.ID, parentPost.Title, item.Err)
136+
case *download.ReportSkippedItem:
137+
fmt.Printf("\t[skipped] %s from `%s` (%s)\n", item.Media.ID, parentPost.Title, item.Reason)
138+
case *download.ReportSuccessItem:
139+
fmt.Printf("\t[downloaded] %s from `%s`\n", item.Media.ID, parentPost.Title)
140+
}
141+
142+
err = adjustMediaFileTime(downloadDir, media, parentPost.PublishedAt)
143+
if err != nil {
144+
fmt.Printf("\t[error] adjusting file time for %s from %s: %s\n", media.ID, parentPost.Title, err)
145+
}
146+
return nil
147+
}
148+
149+
func (c *Crawler) CrawlPosts(client *patreon.Client, baseDownloadDir string) error {
150+
posts := client.Posts()
151+
media := c.enumerateMedia(posts)
152+
153+
q, err := queue.New[patreon.Media](c.concurrencyLimit)
154+
discoveredMediaCount := 0
155+
156+
if err != nil {
157+
return err
158+
}
159+
160+
for pair, err := range media {
161+
if err != nil {
162+
return err
163+
}
164+
165+
parentPost := pair.post
166+
q.Enqueue(pair.media, func(media patreon.Media) error {
167+
return c.downloadMedia(media, baseDownloadDir, parentPost)
168+
})
169+
170+
discoveredMediaCount++
171+
if c.downloadLimit > 0 && discoveredMediaCount >= c.downloadLimit {
172+
break
173+
}
174+
}
175+
176+
fmt.Printf("Discovered %d media items to download.\n", discoveredMediaCount)
177+
178+
err = q.ProcessAll()
179+
180+
return err
181+
}
182+
183+
func (c *Crawler) CrawlCreator(creatorID string, downloadDir string) error {
184+
creatorDownloadDir := fmt.Sprintf("%s/%s", downloadDir, sanitizeFilename(creatorID))
185+
186+
fmt.Printf("Downloading posts from %s to %s\n", creatorID, creatorDownloadDir)
187+
188+
client, err := patreon.NewClient(c.apiClient, creatorID)
189+
if err != nil {
190+
return err
191+
}
192+
193+
err = c.CrawlPosts(client, creatorDownloadDir)
194+
if err != nil {
195+
return err
196+
}
197+
198+
return nil
199+
}

0 commit comments

Comments
 (0)