forked from internetarchive/gowarc
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdedupe.go
More file actions
142 lines (119 loc) · 3.24 KB
/
dedupe.go
File metadata and controls
142 lines (119 loc) · 3.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
package warc
import (
"encoding/json"
"io"
"net"
"net/http"
"net/url"
"strconv"
"strings"
"time"
)
// TODO: Add stats on how long dedupe HTTP requests take

// DedupeHTTPClient is the shared HTTP client used by the remote dedupe
// lookups in this file (checkCDXRevisit and checkDoppelgangerRevisit).
//
// A whole request is bounded to 10 seconds; establishing the TCP connection
// and completing the TLS handshake are each bounded to 5 seconds.
var DedupeHTTPClient = http.Client{
	Timeout: 10 * time.Second,
	Transport: &http.Transport{
		// DialContext supersedes the deprecated Transport.Dial and lets the
		// dial phase be aborted when the request's context is cancelled.
		DialContext: (&net.Dialer{
			Timeout: 5 * time.Second,
		}).DialContext,
		TLSHandshakeTimeout: 5 * time.Second,
	},
}
// DedupeOptions groups the settings that configure deduplication.
//
// NOTE(review): these fields are read outside this file; the per-field notes
// are inferred from the names and from the helpers defined here, so confirm
// them against the callers before relying on them.
type DedupeOptions struct {
	// Remote endpoints and credentials.
	CDXURL           string // presumably the CDX server base URL given to checkCDXRevisit
	DoppelgangerHost string // presumably the base URL given to checkDoppelgangerRevisit
	CDXCookie        string // presumably sent as the Cookie header on CDX requests

	// Tuning knobs.
	SizeThreshold   int // presumably a payload size limit for dedupe; units and direction to confirm
	DedupeCacheSize int // presumably the capacity of the local dedupe table; confirm

	// Feature toggles, one per dedupe source.
	LocalDedupe        bool
	CDXDedupe          bool
	DoppelgangerDedupe bool
}
// revisitRecord describes an earlier capture that a new response can be
// recorded as a revisit of. Its zero value is what the lookups in this file
// return when no matching capture is found.
type revisitRecord struct {
	// responseUUID is left empty by the CDX and Doppelganger lookups.
	responseUUID string
	// targetURI is the URI of the earlier capture as reported by the source.
	targetURI string
	// date is the capture time, parsed from a 14-digit YYYYMMDDhhmmss stamp
	// by the remote lookups.
	date time.Time
	// size is the record length reported by CDX; the Doppelganger lookup
	// always sets it to 0.
	size int
}
// checkLocalRevisit looks digest up in the client's dedupe hash table.
// It returns the stored record when the digest is present and a zero-value
// revisitRecord otherwise.
func (d *customDialer) checkLocalRevisit(digest string) revisitRecord {
	if known, found := d.client.dedupeHashTable.Get(digest); found {
		return known
	}

	return revisitRecord{}
}
// checkCDXRevisit queries the CDX server at CDXURL for targetURI and reports
// whether the capture it returns has the same payload digest.
//
// digest may carry an algorithm prefix such as "sha1:"; everything up to and
// including the first ':' is stripped before comparison. A digest without a
// prefix is compared as-is. When cookie is non-empty it is sent as the Cookie
// header.
//
// The reply is split on whitespace and the first seven fields are read as:
// index 1 a 14-digit timestamp, index 2 the original URI, index 3 the MIME
// type, index 5 the digest and index 6 the record length. A populated
// revisitRecord is returned only when field 3 is not "warc/revisit" and
// field 5 equals the digest; otherwise a zero-value record with a nil error
// is returned.
//
// A non-nil error is returned when the request cannot be built or performed,
// the body cannot be read, or the timestamp of a matching capture cannot be
// parsed.
func checkCDXRevisit(CDXURL string, digest string, targetURI string, cookie string) (revisitRecord, error) {
	// CDX expects no hash header, so strip it when present. Indexing the
	// result of SplitN directly would panic on a digest without a ':'.
	if i := strings.IndexByte(digest, ':'); i >= 0 {
		digest = digest[i+1:]
	}

	req, err := http.NewRequest("GET", CDXURL+"/web/timemap/cdx?url="+url.QueryEscape(targetURI)+"&limit=-1", nil)
	if err != nil {
		return revisitRecord{}, err
	}

	if cookie != "" {
		req.Header.Add("Cookie", cookie)
	}

	resp, err := DedupeHTTPClient.Do(req)
	if err != nil {
		return revisitRecord{}, err
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return revisitRecord{}, err
	}

	fields := strings.Fields(string(body))
	if len(fields) < 7 || fields[3] == "warc/revisit" || fields[5] != digest {
		// No usable capture: too few fields, the capture is itself a
		// revisit, or the payload digest differs.
		return revisitRecord{}, nil
	}

	// Best effort: a non-numeric length leaves the size at 0 instead of
	// discarding an otherwise valid match.
	recordSize, _ := strconv.Atoi(fields[6])

	captured, err := time.Parse("20060102150405", fields[1])
	if err != nil {
		return revisitRecord{}, err
	}

	return revisitRecord{
		responseUUID: "",
		size:         recordSize,
		targetURI:    fields[2],
		date:         captured,
	}, nil
}
// checkDoppelgangerRevisit asks the Doppelganger service at DoppelgangerHost
// whether it already holds a record for the given payload digest and URI.
//
// digest may carry an algorithm prefix such as "sha1:"; everything up to and
// including the first ':' is stripped before it is placed in the request
// path. A digest without a prefix is used as-is. targetURI is query-escaped
// so that characters such as '&', '#' or spaces cannot corrupt the "uri"
// parameter.
//
// On a 200 response the JSON body is decoded and its "uri" and "date" fields
// populate the returned revisitRecord; "date" is an integer read as a
// 14-digit YYYYMMDDhhmmss timestamp. The record's size is always 0 and its
// responseUUID is always empty. Any other status yields a zero-value record
// with a nil error.
//
// A non-nil error is returned when the request cannot be built or performed,
// the body cannot be read, the JSON is invalid, or the date cannot be parsed.
func checkDoppelgangerRevisit(DoppelgangerHost string, digest string, targetURI string) (revisitRecord, error) {
	// Doppelganger is not expecting a hash header either, so strip it when
	// present. Indexing the result of SplitN directly would panic on a
	// digest without a ':'.
	if i := strings.IndexByte(digest, ':'); i >= 0 {
		digest = digest[i+1:]
	}

	req, err := http.NewRequest("GET", DoppelgangerHost+"/api/records/"+digest+"?uri="+url.QueryEscape(targetURI), nil)
	if err != nil {
		return revisitRecord{}, err
	}

	// The shared dedupe client is reused here; no dedicated client is needed.
	resp, err := DedupeHTTPClient.Do(req)
	if err != nil {
		return revisitRecord{}, err
	}
	defer resp.Body.Close()

	// The body is read even for non-200 replies, as before, so read errors
	// are still reported to the caller.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return revisitRecord{}, err
	}

	if resp.StatusCode != 200 {
		return revisitRecord{}, nil
	}

	var reply struct {
		ID   string `json:"id"`
		URI  string `json:"uri"`
		Date int64  `json:"date"`
	}
	if err := json.Unmarshal(body, &reply); err != nil {
		return revisitRecord{}, err
	}

	captured, err := time.Parse("20060102150405", strconv.FormatInt(reply.Date, 10))
	if err != nil {
		return revisitRecord{}, err
	}

	return revisitRecord{
		responseUUID: "",
		size:         0,
		targetURI:    reply.URI,
		date:         captured,
	}, nil
}