Skip to content

Commit 521d166

Browse files
committed
Improve archiving
1. Fixed archiving of multiple URIs returning only one result. 2. The Tor onion service now takes priority.
1 parent 14ab2ec commit 521d166

File tree

5 files changed

+42
-22
lines changed

5 files changed

+42
-22
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ func main() {
6060

6161
## FAQ
6262

63-
### Archive.today is unavailable?
63+
### archive.today is unavailable?
6464

6565
Archive.today may have enforced a strict CAPTCHA policy, causing the request to fail.
6666

go.mod

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,6 @@ go 1.15
55
require (
66
github.com/PuerkitoBio/goquery v1.6.1
77
github.com/cretz/bine v0.1.0
8-
golang.org/x/net v0.0.0-20200202094626-16171245cfb2
8+
github.com/stretchr/testify v1.7.0 // indirect
9+
github.com/wabarc/helper v0.0.0-20210127120855-10af37cc2616
910
)

go.sum

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,15 @@ github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5z
44
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
55
github.com/cretz/bine v0.1.0 h1:1/fvhLE+fk0bPzjdO5Ci+0ComYxEMuB1JhM4X5skT3g=
66
github.com/cretz/bine v0.1.0/go.mod h1:6PF6fWAvYtwjRGkAuDEJeWNOv3a2hUouSP/yRYXmvHw=
7+
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
8+
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
9+
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
10+
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
11+
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
12+
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
13+
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
14+
github.com/wabarc/helper v0.0.0-20210127120855-10af37cc2616 h1:wZ5HtpmZAVUq0Im5Sm92ycJrTeLJk5lB/Kvh55Rd+Ps=
15+
github.com/wabarc/helper v0.0.0-20210127120855-10af37cc2616/go.mod h1:N9P4r7Rn46p4nkWtXV6ztN3p5ACVnp++bgfwjTqSxQ8=
716
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2 h1:VklqNMn3ovrHsnt90PveolxSbWFaJdECFbxSq0Mqo2M=
817
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
918
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
@@ -12,3 +21,7 @@ golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLL
1221
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a h1:1BGLXjeY4akVXGgbC9HugT3Jv3hCI0z56oJR5vAMgBU=
1322
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
1423
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
24+
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
25+
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
26+
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo=
27+
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

pkg/http.go

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ import (
99
"net/http"
1010
"net/url"
1111
"os"
12-
"regexp"
1312
"strconv"
1413
"strings"
1514
"time"
@@ -26,7 +25,6 @@ type Archiver struct {
2625

2726
final string
2827
submitid string
29-
isTor bool
3028

3129
httpClient *http.Client
3230
torClient *http.Client
@@ -69,17 +67,17 @@ func (wbrc *Archiver) fetch(s string, ch chan<- string) {
6967
break
7068
}
7169
}
72-
r(domains)
7370

74-
if baseuri == nil || wbrc.submitid == "" {
75-
// Try request over Tor hidden service.
76-
if wbrc.torClient == nil {
77-
ch <- fmt.Sprint("Tor network unreachable.")
78-
return
79-
}
71+
// Try request over Tor hidden service.
72+
if wbrc.torClient != nil {
8073
wbrc.httpClient = wbrc.torClient
8174

8275
r([]string{onion})
76+
}
77+
defer wbrc.clear()
78+
79+
if baseuri == nil || wbrc.submitid == "" {
80+
r(domains)
8381
if baseuri == nil || wbrc.submitid == "" {
8482
ch <- fmt.Sprint("archive.today is unavailable.")
8583
return
@@ -190,9 +188,8 @@ func (wbrc *Archiver) getSubmitID(url string) (string, error) {
190188
return id, nil
191189
}
192190

193-
func isURL(str string) bool {
194-
re := regexp.MustCompile(`https?://?[-a-zA-Z0-9@:%._\+~#=]{1,255}\.[a-z]{0,63}\b(?:[-a-zA-Z0-9@:%_\+.~#?&//=]*)`)
195-
match := re.FindAllString(str, -1)
196-
197-
return len(match) >= 1
191+
func (wbrc *Archiver) clear() {
192+
baseuri = nil
193+
wbrc.final = ""
194+
wbrc.submitid = ""
198195
}

pkg/is.go

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,33 +3,42 @@ package is
33
import (
44
"log"
55
"strings"
6+
"sync"
7+
8+
"github.com/wabarc/helper"
69
)
710

811
// Wayback saves the given webpages to archive.is and returns a map of each link to its result.
912
func (wbrc *Archiver) Wayback(links []string) (map[string]string, error) {
1013
collect := make(map[string]string)
1114
for _, link := range links {
12-
if !isURL(link) {
15+
if !helper.IsURL(link) {
1316
log.Print(link + " is invalid url.")
1417
continue
1518
}
1619
collect[link] = link
1720
}
1821

19-
ch := make(chan string, len(collect))
20-
defer close(ch)
21-
2222
if client, tor, err := wbrc.newTorClient(); err != nil {
2323
log.Println(err)
2424
} else {
2525
wbrc.torClient = client
2626
defer tor.Close()
2727
}
2828

29+
ch := make(chan string, len(collect))
30+
defer close(ch)
31+
32+
var wg sync.WaitGroup
2933
for link := range collect {
30-
go wbrc.fetch(link, ch)
31-
collect[link] = strings.Replace(<-ch, onion, "archive.today", 1)
34+
wg.Add(1)
35+
go func(link string, ch chan string) {
36+
wbrc.fetch(link, ch)
37+
collect[link] = strings.Replace(<-ch, onion, "archive.today", 1)
38+
wg.Done()
39+
}(link, ch)
3240
}
41+
wg.Wait()
3342

3443
return collect, nil
3544
}

0 commit comments

Comments
 (0)