Skip to content

Commit 2986160

Browse files
feat: introduce download reporting strategy to gracefully handle errors (#23)
* fix: introduce download reporting strategy to gracefully handle download errors
* chore: adjust reporting and introduce iterators instead of channels
* Update crawling/download/report.go

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent b71f14d commit 2986160

File tree

3 files changed

+170
-88
lines changed

3 files changed

+170
-88
lines changed

crawling/crawl_posts.go

Lines changed: 25 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,12 @@ package crawling
22

33
import (
44
"fmt"
5-
"io"
6-
"net/http"
75
"os"
86
"regexp"
97
"slices"
108
"strings"
119

10+
"patreon-crawler/crawling/download"
1211
"patreon-crawler/patreon"
1312
)
1413

@@ -43,94 +42,12 @@ func sanitizeFilename(name string) string {
4342
return name
4443
}
4544

46-
func getMediaFile(downloadDirectory string, media patreon.Media) (string, error) {
47-
extension, err := getFileExtension(media.MimeType)
48-
if err != nil {
49-
return "", err
50-
}
51-
return fmt.Sprintf("%s/%s.%s", downloadDirectory, media.ID, extension), nil
52-
}
53-
54-
func getFileExtension(mimeType string) (string, error) {
55-
// This is a very quick and dirty method, but it should work here
56-
mimeTypeSplits := strings.Split(mimeType, "/")
57-
if len(mimeTypeSplits) != 2 {
58-
return "", fmt.Errorf("invalid mime type: %s", mimeType)
59-
}
60-
return mimeTypeSplits[1], nil
61-
}
62-
63-
func downloadMedia(media patreon.Media, downloadDir string) error {
64-
downloadedFilePath, err := getMediaFile(downloadDir, media)
65-
if err != nil {
66-
return err
67-
}
68-
69-
_, err = os.Stat(downloadedFilePath)
70-
if err == nil {
71-
fmt.Printf("\t- skipped %s (already downloaded)\n", media.ID)
72-
return nil
73-
}
74-
75-
response, err := http.Get(media.DownloadURL)
76-
if err != nil {
77-
return fmt.Errorf("failed to download media: %w", err)
78-
}
79-
defer response.Body.Close()
80-
81-
if response.StatusCode != http.StatusOK {
82-
return fmt.Errorf("failed to download media: %s", response.Status)
83-
}
84-
85-
tempDownloadFilePath := downloadedFilePath + ".tmp"
86-
87-
out, err := os.Create(tempDownloadFilePath)
88-
if err != nil {
89-
return fmt.Errorf("failed to create file: %w", err)
90-
}
91-
defer out.Close()
92-
93-
_, err = io.Copy(out, response.Body)
94-
if err != nil {
95-
return fmt.Errorf("failed to write file: %w", err)
96-
}
97-
98-
out.Close()
99-
100-
err = os.Rename(tempDownloadFilePath, downloadedFilePath)
101-
if err != nil {
102-
return fmt.Errorf("failed to rename file: %w", err)
103-
}
104-
105-
fmt.Printf("\t- saved %s\n", media.ID)
106-
return nil
107-
}
108-
109-
func downloadPost(downloadDirectory string, post patreon.Post) error {
110-
if len(post.Media) == 0 {
111-
return nil
112-
}
113-
114-
err := os.MkdirAll(downloadDirectory, 0700)
115-
if err != nil {
116-
return fmt.Errorf("failed to create download directory: %w", err)
117-
}
118-
119-
for _, media := range post.Media {
120-
err := downloadMedia(media, downloadDirectory)
121-
if err != nil {
122-
return err
123-
}
124-
}
125-
return nil
126-
}
127-
12845
func adjustPostsFileTime(downloadDirectory string, post patreon.Post) error {
12946
if len(post.Media) == 0 {
13047
return nil
13148
}
13249
for _, media := range post.Media {
133-
file, err := getMediaFile(downloadDirectory, media)
50+
file, err := download.GetMediaFile(downloadDirectory, media)
13451
if err != nil {
13552
return err
13653
}
@@ -161,6 +78,28 @@ func getDownloadDir(baseDownloadDir, postTitle string, groupingStrategy Grouping
16178
}
16279
}
16380

81+
func savePost(post patreon.Post, postsDownloaded int, downloadDir string) error {
82+
fmt.Printf("[%d] Saving post '%s'\n", postsDownloaded, post.Title)
83+
84+
err := os.MkdirAll(downloadDir, 0700)
85+
if err != nil {
86+
return err
87+
}
88+
89+
report := download.Post(downloadDir, post)
90+
for item := range report {
91+
switch item := item.(type) {
92+
case *download.ReportErrorItem:
93+
fmt.Printf("\t[error] %s: %s\n", item.Media.ID, item.Err)
94+
case *download.ReportSkippedItem:
95+
fmt.Printf("\t[skipped] %s (%s)\n", item.Media.ID, item.Reason)
96+
case *download.ReportSuccessItem:
97+
fmt.Printf("\t[downloaded] %s\n", item.Media.ID)
98+
}
99+
}
100+
return nil
101+
}
102+
164103
func CrawlPosts(client *patreon.Client, baseDownloadDir string, downloadInaccessibleMedia bool, groupingStrategy GroupingStrategy, downloadLimit int) error {
165104
posts := client.Posts()
166105

@@ -187,9 +126,7 @@ func CrawlPosts(client *patreon.Client, baseDownloadDir string, downloadInaccess
187126
postsCrawled++
188127

189128
if post.CurrentUserCanView || downloadInaccessibleMedia {
190-
fmt.Printf("[%d] Saving post '%s'\n", postsDownloaded, post.Title)
191-
192-
err := downloadPost(downloadDir, post)
129+
err := savePost(post, postsDownloaded, downloadDir)
193130
if err != nil {
194131
return err
195132
}

crawling/download/download.go

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
package download
2+
3+
import (
4+
"fmt"
5+
"io"
6+
"iter"
7+
"net/http"
8+
"os"
9+
"strings"
10+
11+
"patreon-crawler/patreon"
12+
)
13+
14+
func GetMediaFile(downloadDirectory string, media patreon.Media) (string, error) {
15+
extension, err := getFileExtension(media.MimeType)
16+
if err != nil {
17+
return "", err
18+
}
19+
return fmt.Sprintf("%s/%s.%s", downloadDirectory, media.ID, extension), nil
20+
}
21+
22+
// getFileExtension derives a file extension from a MIME type by returning its
// subtype, e.g. "image/png" yields "png".
//
// An error is returned unless mimeType has the form "type/subtype" with
// exactly one slash and both parts non-empty. Rejecting an empty subtype
// avoids producing file names that end in a bare dot.
func getFileExtension(mimeType string) (string, error) {
	// This is a very quick and dirty method, but it should work here
	mainType, subType, found := strings.Cut(mimeType, "/")
	if !found || mainType == "" || subType == "" || strings.Contains(subType, "/") {
		return "", fmt.Errorf("invalid mime type: %s", mimeType)
	}
	return subType, nil
}
30+
31+
func downloadMedia(media patreon.Media, downloadDir string) ReportItem {
32+
downloadedFilePath, err := GetMediaFile(downloadDir, media)
33+
if err != nil {
34+
return NewErrorItem(media, err)
35+
}
36+
37+
_, err = os.Stat(downloadedFilePath)
38+
if err == nil {
39+
return NewSkippedItem(media, "already downloaded")
40+
}
41+
42+
response, err := http.Get(media.DownloadURL)
43+
if err != nil {
44+
return NewErrorItem(media, err)
45+
}
46+
defer response.Body.Close()
47+
48+
if response.StatusCode != http.StatusOK {
49+
return NewErrorItem(media, fmt.Errorf("unexpected status code: %s", response.Status))
50+
}
51+
52+
tempDownloadFilePath := downloadedFilePath + ".tmp"
53+
54+
out, err := os.Create(tempDownloadFilePath)
55+
if err != nil {
56+
return NewErrorItem(media, fmt.Errorf("failed to create file: %w", err))
57+
}
58+
defer out.Close()
59+
60+
_, err = io.Copy(out, response.Body)
61+
if err != nil {
62+
return NewErrorItem(media, fmt.Errorf("failed to write file: %w", err))
63+
}
64+
65+
out.Close()
66+
67+
err = os.Rename(tempDownloadFilePath, downloadedFilePath)
68+
if err != nil {
69+
return NewErrorItem(media, fmt.Errorf("failed to rename file: %w", err))
70+
}
71+
72+
return NewSuccessItem(media)
73+
}
74+
75+
func Post(downloadDirectory string, post patreon.Post) iter.Seq[ReportItem] {
76+
return func(yield func(ReportItem) bool) {
77+
if len(post.Media) == 0 {
78+
return
79+
}
80+
81+
for _, media := range post.Media {
82+
if media.MimeType == "" {
83+
item := NewSkippedItem(media, "no mime type")
84+
if !yield(item) {
85+
return
86+
}
87+
continue
88+
}
89+
90+
item := downloadMedia(media, downloadDirectory)
91+
if !yield(item) {
92+
return
93+
}
94+
}
95+
}
96+
}

crawling/download/report.go

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
package download
2+
3+
import "patreon-crawler/patreon"
4+
5+
// ReportItem is the common type of the per-media results defined in this
// package: ReportSuccessItem, ReportSkippedItem and ReportErrorItem.
type ReportItem interface {
	// ReportItem is a marker method with no behavior; it only identifies a
	// type as a report item.
	ReportItem()
}

// reportItem is embedded by the concrete report types to supply the marker
// method of the ReportItem interface.
type reportItem struct{}

// ReportItem implements the marker method of the ReportItem interface.
func (*reportItem) ReportItem() {}
12+
13+
type ReportSuccessItem struct {
14+
reportItem
15+
Media patreon.Media
16+
}
17+
18+
type ReportSkippedItem struct {
19+
reportItem
20+
Media patreon.Media
21+
Reason string
22+
}
23+
24+
type ReportErrorItem struct {
25+
reportItem
26+
Media patreon.Media
27+
Err error
28+
}
29+
30+
func NewSuccessItem(media patreon.Media) ReportItem {
31+
return &ReportSuccessItem{
32+
Media: media,
33+
}
34+
}
35+
36+
func NewErrorItem(media patreon.Media, err error) ReportItem {
37+
return &ReportErrorItem{
38+
Media: media,
39+
Err: err,
40+
}
41+
}
42+
43+
func NewSkippedItem(media patreon.Media, message string) ReportItem {
44+
return &ReportSkippedItem{
45+
Media: media,
46+
Reason: message,
47+
}
48+
}
49+

0 commit comments

Comments
 (0)