Skip to content

Commit 675b941

Browse files
committed
Refactor: accept single url
1 parent 2591836 commit 675b941

File tree

6 files changed

+90
-195
lines changed

6 files changed

+90
-195
lines changed

cmd/archive.is/is.go

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
11
package main
22

33
import (
4+
"context"
45
"flag"
56
"fmt"
7+
"net/url"
68
"os"
9+
"sync"
10+
"time"
711

812
"github.com/wabarc/archive.is"
913
)
@@ -38,17 +42,35 @@ func main() {
3842
}
3943

4044
wbrc := &is.Archiver{}
41-
4245
if playback {
43-
collects, _ := wbrc.Playback(args)
44-
for orig, dest := range collects {
45-
fmt.Println(orig, "=>", dest)
46-
}
46+
process(wbrc.Playback, args)
4747
os.Exit(0)
4848
}
4949

50-
saved, _ := wbrc.Wayback(args)
51-
for orig, dest := range saved {
52-
fmt.Println(orig, "=>", dest)
50+
process(wbrc.Wayback, args)
51+
}
52+
53+
// process runs f once for every entry in args, each in its own goroutine,
// and prints one "<link> => <result or error>" line per entry to stdout.
//
// Each argument is parsed with url.Parse. Because url.Parse also accepts
// relative references such as "example" or "/path", an argument that parses
// without a scheme or host is reported as invalid and is never passed to f.
// Every call to f receives a fresh context that expires after one minute.
//
// process returns only after all goroutines have finished. Lines are printed
// in completion order, so the output order is not deterministic.
func process(f func(context.Context, *url.URL) (string, error), args []string) {
	var wg sync.WaitGroup
	for _, arg := range args {
		wg.Add(1)
		// Pass arg explicitly so each goroutine works on its own copy.
		go func(link string) {
			defer wg.Done()

			u, err := url.Parse(link)
			if err != nil {
				fmt.Println(link, "=>", err)
				return
			}
			// An absolute URL is required; a relative reference cannot be
			// archived or looked up.
			if u.Scheme == "" || u.Host == "" {
				fmt.Println(link, "=>", "invalid URL: missing scheme or host")
				return
			}

			ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
			defer cancel()

			r, err := f(ctx, u)
			if err != nil {
				fmt.Println(link, "=>", err)
				return
			}
			fmt.Println(link, "=>", r)
		}(arg)
	}
	wg.Wait()
}

go.mod

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,11 @@ require (
66
github.com/PuerkitoBio/goquery v1.6.1
77
github.com/andybalholm/cascadia v1.2.0 // indirect
88
github.com/cretz/bine v0.1.0
9+
github.com/kr/pretty v0.1.0 // indirect
910
github.com/stretchr/testify v1.7.0 // indirect
10-
github.com/wabarc/helper v0.0.0-20210407153720-1bfe98b427fe
1111
github.com/wabarc/logger v0.0.0-20210417045349-d0d82e8e99ee
1212
golang.org/x/crypto v0.0.0-20210415154028-4f45737414dc // indirect
1313
golang.org/x/net v0.0.0-20210415231046-e915ea6b2b7d
1414
golang.org/x/sys v0.0.0-20210415045647-66c3f260301c // indirect
15+
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect
1516
)

go.sum

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,9 @@ github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
1414
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
1515
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
1616
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
17-
github.com/rogpeppe/go-internal v1.5.2/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
1817
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
1918
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
2019
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
21-
github.com/wabarc/helper v0.0.0-20210407153720-1bfe98b427fe h1:V9yz2vQlSVLs51nlo0DAeETFOE57OvlYm98X1LKJA6U=
22-
github.com/wabarc/helper v0.0.0-20210407153720-1bfe98b427fe/go.mod h1:TuTZtoiOu984UWOf7FfX58JllKMjq7FCz701kB5W88E=
2320
github.com/wabarc/logger v0.0.0-20210417045349-d0d82e8e99ee h1:MMIp++7eem2CI1jIYDoPByMwXeZAjsFo2ciBNtvhB80=
2421
github.com/wabarc/logger v0.0.0-20210417045349-d0d82e8e99ee/go.mod h1:4uYr9fnQaQoDk1ttTzLnSB3lZm3i/vrJwN8EZIB2YuI=
2522
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
@@ -41,11 +38,7 @@ golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
4138
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
4239
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
4340
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
44-
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
4541
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
4642
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
47-
gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
4843
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo=
4944
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
50-
mvdan.cc/xurls/v2 v2.2.0 h1:NSZPykBXJFCetGZykLAxaL6SIpvbVy/UFEniIfHAa8A=
51-
mvdan.cc/xurls/v2 v2.2.0/go.mod h1:EV1RMtya9D6G5DMYPGD8zTQzaHet6Jh8gFlRgGRJeO8=

is.go

Lines changed: 41 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,8 @@ import (
1111
"os"
1212
"strconv"
1313
"strings"
14-
"sync"
15-
"time"
1614

1715
"github.com/PuerkitoBio/goquery"
18-
"github.com/wabarc/helper"
1916
"github.com/wabarc/logger"
2017
)
2118

@@ -42,7 +39,6 @@ var (
4239
scheme = "http"
4340
onion = "archiveiya74codqgiixo33q62qlrqtkgmcitqx5u2oeqnmn5bpcbiyd.onion" // archivecaslytosk.onion
4441
cookie = ""
45-
timeout = 120 * time.Second
4642
domains = []string{
4743
"archive.today",
4844
"archive.is",
@@ -62,123 +58,67 @@ func init() {
6258
}
6359

6460
// Wayback handles saving a webpage to archive.is.
65-
func (wbrc *Archiver) Wayback(links []string) (map[string]string, error) {
66-
collects, results := make(map[string]string), make(map[string]string)
67-
for _, link := range links {
68-
if helper.IsURL(link) {
69-
collects[link] = link
70-
}
71-
}
72-
if len(collects) == 0 {
73-
return results, fmt.Errorf("Not found")
74-
}
75-
76-
ctx, cancel := context.WithCancel(context.Background())
77-
defer cancel()
61+
func (wbrc *Archiver) Wayback(ctx context.Context, in *url.URL) (dst string, err error) {
7862
torClient, t, err := newTorClient(ctx)
79-
defer closeTor(t)
63+
defer closeTor(t) // nolint:errcheck
8064
if err != nil {
8165
logger.Error("%v", err)
8266
}
8367

8468
is := &IS{
8569
wbrc: wbrc,
86-
httpClient: &http.Client{Timeout: timeout, CheckRedirect: noRedirect},
70+
httpClient: &http.Client{CheckRedirect: noRedirect},
8771
torClient: torClient,
8872
}
8973

90-
ch := make(chan string, len(collects))
91-
defer close(ch)
92-
93-
var mu sync.Mutex
94-
var wg sync.WaitGroup
95-
for _, link := range collects {
96-
wg.Add(1)
97-
go func(link string) {
98-
mu.Lock()
99-
is.submitid = ""
100-
is.archive(link, ch)
101-
results[link] = strings.Replace(<-ch, onion, "archive.today", 1)
102-
mu.Unlock()
103-
wg.Done()
104-
}(link)
105-
}
106-
wg.Wait()
107-
108-
if len(results) == 0 {
109-
return results, fmt.Errorf("No results")
74+
dst, err = is.archive(ctx, in)
75+
if err != nil {
76+
return
11077
}
78+
dst = strings.Replace(dst, onion, "archive.today", 1)
11179

112-
return results, nil
80+
return
11381
}
11482

11583
// Playback handles searching for archived webpages on archive.is.
116-
func (wbrc *Archiver) Playback(links []string) (map[string]string, error) {
117-
collects, results := make(map[string]string), make(map[string]string)
118-
for _, link := range links {
119-
if helper.IsURL(link) {
120-
collects[link] = link
121-
}
122-
}
123-
if len(collects) == 0 {
124-
return results, fmt.Errorf("Not found")
125-
}
126-
127-
ctx, cancel := context.WithCancel(context.Background())
128-
defer cancel()
84+
func (wbrc *Archiver) Playback(ctx context.Context, in *url.URL) (dst string, err error) {
12985
torClient, t, err := newTorClient(ctx)
130-
defer closeTor(t)
86+
defer closeTor(t) // nolint:errcheck
13187
if err != nil {
13288
logger.Error("%v", err)
13389
}
13490

13591
is := &IS{
13692
wbrc: wbrc,
137-
httpClient: &http.Client{Timeout: timeout, CheckRedirect: noRedirect},
93+
httpClient: &http.Client{CheckRedirect: noRedirect},
13894
torClient: torClient,
13995
}
14096

141-
ch := make(chan string, len(collects))
142-
defer close(ch)
143-
144-
var mu sync.Mutex
145-
var wg sync.WaitGroup
146-
for _, link := range collects {
147-
wg.Add(1)
148-
go func(link string) {
149-
mu.Lock()
150-
is.submitid = ""
151-
is.search(link, ch)
152-
results[link] = strings.Replace(<-ch, onion, "archive.today", 1)
153-
mu.Unlock()
154-
wg.Done()
155-
}(link)
156-
}
157-
wg.Wait()
158-
159-
if len(results) == 0 {
160-
return results, fmt.Errorf("No results")
97+
dst, err = is.search(ctx, in)
98+
if err != nil {
99+
return
161100
}
101+
dst = strings.Replace(dst, onion, "archive.today", 1)
162102

163-
return results, nil
103+
return
164104
}
165-
func (is *IS) archive(uri string, ch chan<- string) {
105+
func (is *IS) archive(ctx context.Context, u *url.URL) (string, error) {
166106
endpoint, err := is.getValidDomain()
167107
if err != nil {
168-
ch <- fmt.Sprint("archive.today is unavailable.")
169-
return
108+
return "", fmt.Errorf("archive.today is unavailable.")
170109
}
171110

172111
if is.wbrc.Anyway != "" {
173112
anyway = is.wbrc.Anyway
174113
}
114+
uri := u.String()
175115
data := url.Values{
176116
"submitid": {is.submitid},
177117
"anyway": {anyway},
178118
"url": {uri},
179119
}
180120
domain := endpoint.String()
181-
req, err := http.NewRequest("POST", domain+"/submit/", strings.NewReader(data.Encode()))
121+
req, _ := http.NewRequestWithContext(ctx, http.MethodPost, domain+"/submit/", strings.NewReader(data.Encode()))
182122
req.Header.Add("Content-Type", "application/x-www-form-urlencoded")
183123
req.Header.Add("Content-Length", strconv.Itoa(len(data.Encode())))
184124
req.Header.Add("User-Agent", userAgent)
@@ -188,46 +128,40 @@ func (is *IS) archive(uri string, ch chan<- string) {
188128
req.Header.Add("Cookie", is.getCookie())
189129
resp, err := is.httpClient.Do(req)
190130
if err != nil {
191-
ch <- fmt.Sprint(err)
192-
return
131+
return "", err
193132
}
194133
defer resp.Body.Close()
195134

196135
code := resp.StatusCode / 100
197136
if code == 1 || code == 4 || code == 5 {
198137
final := fmt.Sprintf("%s?url=%s", domain, uri)
199-
ch <- final
200-
return
138+
return final, nil
201139
}
202140

203141
_, err = io.Copy(ioutil.Discard, resp.Body)
204142
if err != nil {
205-
ch <- fmt.Sprint(err)
206-
return
143+
return "", err
207144
}
208145

209146
// When the anyway parameter is used.
210147
refresh := resp.Header.Get("Refresh")
211148
if len(refresh) > 0 {
212149
r := strings.Split(refresh, ";url=")
213150
if len(r) == 2 {
214-
ch <- r[1]
215-
return
151+
return r[1], nil
216152
}
217153
}
218154
loc := resp.Header.Get("location")
219155
if len(loc) > 2 {
220-
ch <- loc
221-
return
156+
return loc, nil
222157
}
223158
// Redirect to final url if page saved.
224159
final := resp.Request.URL.String()
225-
if len(final) > 0 && strings.Contains(final, "/submit/") == false {
226-
ch <- final
227-
return
160+
if len(final) > 0 && !strings.Contains(final, "/submit/") {
161+
return final, nil
228162
}
229163

230-
ch <- fmt.Sprintf("%s/timegate/%s", domain, uri)
164+
return fmt.Sprintf("%s/timegate/%s", domain, uri), nil
231165
}
232166

233167
func noRedirect(req *http.Request, via []*http.Request) error {
@@ -248,12 +182,12 @@ func (is *IS) getCookie() string {
248182
}
249183

250184
func (is *IS) getSubmitID(url string) (string, error) {
251-
if strings.Contains(url, "http") == false {
185+
if !strings.Contains(url, "http") {
252186
return "", fmt.Errorf("missing protocol scheme")
253187
}
254188

255189
r := strings.NewReader("")
256-
req, err := http.NewRequest("GET", url, r)
190+
req, _ := http.NewRequest("GET", url, r)
257191
req.Header.Add("Content-Type", "application/x-www-form-urlencoded")
258192
req.Header.Add("User-Agent", userAgent)
259193
req.Header.Add("Cookie", is.getCookie())
@@ -313,36 +247,36 @@ func (is *IS) getValidDomain() (*url.URL, error) {
313247
return endpoint, nil
314248
}
315249

316-
func (is *IS) search(uri string, ch chan<- string) {
250+
func (is *IS) search(ctx context.Context, in *url.URL) (string, error) {
317251
endpoint, err := is.getValidDomain()
318252
if err != nil {
319-
ch <- fmt.Sprint("archive.today is unavailable.")
320-
return
253+
return "", fmt.Errorf("archive.today is unavailable.")
321254
}
322255

256+
uri := in.String()
323257
domain := endpoint.String()
324-
req, err := http.NewRequest("GET", fmt.Sprintf("%s/%s", domain, uri), nil)
258+
req, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("%s/%s", domain, uri), nil)
259+
if err != nil {
260+
return "", err
261+
}
325262
req.Header.Add("User-Agent", userAgent)
326263
req.Header.Add("Referer", domain)
327264
req.Header.Add("Host", endpoint.Hostname())
328265
resp, err := is.httpClient.Do(req)
329266
if err != nil {
330-
ch <- fmt.Sprint(err)
331-
return
267+
return "", err
332268
}
333269
defer resp.Body.Close()
334270

335271
doc, err := goquery.NewDocumentFromReader(resp.Body)
336272
if err != nil {
337-
ch <- fmt.Sprint(err)
338-
return
273+
return "", err
339274
}
340275

341276
target, exists := doc.Find("#row0 > .TEXT-BLOCK > a").Attr("href")
342277
if !exists {
343-
ch <- "Not found"
344-
return
278+
return "", fmt.Errorf("Not found")
345279
}
346280

347-
ch <- target
281+
return target, nil
348282
}

0 commit comments

Comments
 (0)