@@ -5,10 +5,14 @@ import { Deflate } from "pako";
5
5
import { v5 as uuidv5 } from "uuid" ;
6
6
7
7
import { createSHA256 } from "hash-wasm" ;
8
+ import { type IHasher } from "hash-wasm/dist/lib/WASMInterface.js" ;
8
9
9
10
import { getSurt , WARCRecord , WARCSerializer } from "warcio" ;
10
11
11
- import { getTSMillis , getStatusText , digestMessage } from "@webrecorder/wabac/src/utils.js" ;
12
+ import { getTSMillis , getStatusText , digestMessage } from "@webrecorder/wabac" ;
13
+ import { ArchiveDB , ResourceEntry } from "@webrecorder/wabac/swlib" ;
14
+ import { Signer } from "./keystore" ;
15
+ import { Collection } from "../../wabac.js/dist/types/collection" ;
12
16
13
17
14
18
// ===========================================================================
@@ -28,11 +32,11 @@ const encoder = new TextEncoder();
28
32
29
33
const EMPTY = new Uint8Array ( [ ] ) ;
30
34
31
/**
 * Expose a single in-memory buffer as an async iterable.
 *
 * The generator produces exactly one chunk: the very same `payload`
 * object that was passed in (no copy is made).
 *
 * @param payload - bytes to emit as the only chunk
 */
async function* getPayload(payload: Uint8Array) {
  yield payload;
}
34
38
35
- async function * hashingGen ( gen , stats , hasher , sizeCallback , zipMarker ) {
39
+ async function * hashingGen ( gen : AsyncIterable < Uint8Array | string > , stats : any , hasher : IHasher , sizeCallback : any , zipMarker : Uint8Array ) {
36
40
stats . size = 0 ;
37
41
38
42
hasher . init ( ) ;
@@ -61,31 +65,89 @@ async function* hashingGen(gen, stats, hasher, sizeCallback, zipMarker) {
61
65
stats . hash = hasher . digest ( "hex" ) ;
62
66
}
63
67
68
/**
 * Options bag accepted by the `Downloader` constructor.
 *
 * Every optional field may be omitted; the constructor substitutes the
 * fallback noted on each field.
 */
type DownloaderOpts = {
  /** Collection to export; the constructor reads its `store`, `name` and `config`. */
  coll: Collection;

  /**
   * Output format. The constructor's destructuring default is "wacz";
   * the value "warc1.0" selects "WARC/1.0" records, anything else "WARC/1.1".
   */
  format: string;

  /**
   * Base name for the download. When absent the constructor derives one
   * from the collection's metadata title, or falls back to "webarchive".
   */
  filename?: string;

  /** Page ids used to filter resources (matched against each resource's `pageId`). */
  pageList?: string[];

  /** Signer stored on the instance; `null` is stored when omitted. */
  signer?: Signer;

  /** Software identification string; falls back to "ArchiveWeb.page". */
  softwareString?: string;

  /**
   * Constructor default is `true`. Chooses the WARC file name:
   * "data.warc.gz" when true, "data.warc" otherwise.
   */
  gzip?: boolean;

  /** UUID namespace; falls back to `DEFAULT_UUID_NAMESPACE`. */
  uuidNamespace?: string;

  /** Named byte markers; falls back to an empty object. */
  markers?: Record<string, Uint8Array>;
};
79
+
64
80
// ===========================================================================
65
81
class Downloader
66
82
{
67
- constructor ( { coll, format = "wacz" , filename = null , pageList = null , signer = null ,
68
- softwareString = null , gzip = true , uuidNamespace = null , markers = null } ) {
83
+ db : ArchiveDB ;
84
+ pageList : string [ ] | null ;
85
+ collId : string ;
86
+ metadata : Record < string , string > ;
87
+ gzip : boolean ;
88
+
89
+ markers : Record < string , Uint8Array > ;
90
+ warcName : string ;
91
+ alreadyDecoded : boolean ;
92
+
93
+ softwareString : string ;
94
+ uuidNamespace : string ;
95
+
96
+ createdDateDt : Date ;
97
+ createdDate : string ;
98
+ modifiedDate : string | null ;
99
+
100
+ format : string ;
101
+ warcVersion : string ;
102
+
103
+ digestOpts : {
104
+ algo : string ;
105
+ prefix : string ;
106
+ base32 ?: boolean ;
107
+ } ;
108
+
109
+ filename : string ;
110
+
111
+ signer : Signer | null ;
112
+
113
+ offset = 0 ;
114
+ firstResources = [ ] ;
115
+ textResources = [ ] ;
116
+ cdxjLines = [ ] ;
117
+
118
+ // compressed index (idx) entries
119
+ indexLines = [ ] ;
120
+
121
+ digestsVisted = { } ;
122
+ fileHasher = null ;
123
+ recordHasher = null ;
124
+
125
+ datapackageDigest = null ;
126
+
127
+ fileStats = [ ] ;
128
+
129
+ constructor ( { coll, format = "wacz" , filename, pageList, signer,
130
+ softwareString, gzip = true , uuidNamespace, markers} : DownloaderOpts ) {
69
131
70
132
this . db = coll . store ;
71
- this . pageList = pageList ;
133
+ this . pageList = pageList || [ ] ;
72
134
this . collId = coll . name ;
73
- this . metadata = coll . config . metadata ;
135
+ this . metadata = coll . config [ " metadata" ] ;
74
136
this . gzip = gzip ;
75
137
76
138
this . markers = markers || { } ;
77
139
78
140
this . warcName = this . gzip ? "data.warc.gz" : "data.warc" ;
79
141
80
- this . alreadyDecoded = ! coll . config . decode && ! coll . config . loadUrl ;
142
+ this . alreadyDecoded = ! coll . config [ " decode" ] && ! coll . config [ " loadUrl" ] ;
81
143
82
144
this . softwareString = softwareString || "ArchiveWeb.page" ;
83
145
84
146
this . uuidNamespace = uuidNamespace || DEFAULT_UUID_NAMESPACE ;
85
147
86
- this . createdDateDt = new Date ( coll . config . ctime ) ;
148
+ this . createdDateDt = new Date ( coll . config [ " ctime" ] ) ;
87
149
this . createdDate = this . createdDateDt . toISOString ( ) ;
88
- this . modifiedDate = coll . config . metadata . mtime ? new Date ( coll . config . metadata . mtime ) . toISOString ( ) : null ;
150
+ this . modifiedDate = coll . config [ " metadata" ] . mtime ? new Date ( coll . config [ " metadata" ] . mtime ) . toISOString ( ) : null ;
89
151
90
152
this . format = format ;
91
153
this . warcVersion = ( format === "warc1.0" ) ? "WARC/1.0" : "WARC/1.1" ;
@@ -96,33 +158,17 @@ class Downloader
96
158
this . digestOpts = { algo : "sha-256" , prefix : "sha256:" } ;
97
159
}
98
160
99
- this . filename = filename ;
100
-
101
161
// determine filename from title, if it exists
102
- if ( ! this . filename && coll . config . metadata . title ) {
103
- this . filename = encodeURIComponent ( coll . config . metadata . title . toLowerCase ( ) . replace ( / \s / g, "-" ) ) ;
162
+ if ( ! filename && coll . config [ " metadata" ] . title ) {
163
+ filename = encodeURIComponent ( coll . config [ " metadata" ] . title . toLowerCase ( ) . replace ( / \s / g, "-" ) ) ;
104
164
}
105
165
106
- if ( ! this . filename ) {
107
- this . filename = "webarchive" ;
166
+ if ( ! filename ) {
167
+ filename = "webarchive" ;
108
168
}
169
+ this . filename = filename ;
109
170
110
- this . offset = 0 ;
111
- this . firstResources = [ ] ;
112
- this . textResources = [ ] ;
113
- this . cdxjLines = [ ] ;
114
-
115
- // compressed index (idx) entries
116
- this . indexLines = [ ] ;
117
-
118
- this . digestsVisted = { } ;
119
- this . fileHasher = null ;
120
- this . recordHasher = null ;
121
-
122
- this . datapackageDigest = null ;
123
- this . signer = signer ;
124
-
125
- this . fileStats = [ ] ;
171
+ this . signer = signer || null ;
126
172
}
127
173
128
174
download ( sizeCallback = null ) {
@@ -139,7 +185,7 @@ class Downloader
139
185
}
140
186
}
141
187
142
- downloadWARC ( filename , sizeCallback = null ) {
188
+ downloadWARC ( filename : string , sizeCallback = null ) {
143
189
filename = ( filename || "webarchive" ) . split ( "." ) [ 0 ] + ".warc" ;
144
190
145
191
const dl = this ;
@@ -160,19 +206,19 @@ class Downloader
160
206
return resp ;
161
207
}
162
208
163
/**
 * Fetch the next batch of entries from the "resources" object store.
 *
 * @param start - key after which to resume; the bound is exclusive, so
 *   the entry stored under `start` itself is not returned again. The
 *   default `[]` is used for the first batch.
 * @returns at most `RESOURCE_BATCH_SIZE` entries whose keys follow `start`
 */
async loadResourcesBlock(start: [string, number] | [] = []) {
  // `true` makes the lower bound open, i.e. strictly greater than `start`.
  const afterStart = IDBKeyRange.lowerBound(start, true);
  const batch = await this.db.db!.getAll("resources", afterStart, RESOURCE_BATCH_SIZE);
  return batch;
}
166
212
167
- async * iterResources ( resources ) {
168
- let start = [ ] ;
213
+ async * iterResources ( resources : ResourceEntry [ ] ) {
214
+ let start : [ string , number ] | [ ] = [ ] ;
169
215
//let count = 0;
170
216
171
217
while ( resources . length ) {
172
- const last = resources [ resources . length - 1 ] ;
218
+ const last : ResourceEntry = resources [ resources . length - 1 ] as ResourceEntry ;
173
219
174
220
if ( this . pageList ) {
175
- resources = resources . filter ( ( res ) => this . pageList . includes ( res . pageId ) ) ;
221
+ resources = resources . filter ( ( res ) => this . pageList ! . includes ( res . pageId || "" ) ) ;
176
222
}
177
223
//count += resources.length;
178
224
yield * resources ;
@@ -185,7 +231,7 @@ class Downloader
185
231
// }
186
232
}
187
233
188
- async queueWARC ( controller , filename , sizeCallback ) {
234
+ async queueWARC ( controller , filename : string , sizeCallback : any ) {
189
235
this . firstResources = await this . loadResourcesBlock ( ) ;
190
236
191
237
for await ( const chunk of this . generateWARC ( filename ) ) {
0 commit comments