Skip to content

Commit 8002234

Browse files
committed
Resizable gzip output buffer
1 parent 76dc980 commit 8002234

File tree

3 files changed

+39
-7
lines changed

3 files changed

+39
-7
lines changed

README.md

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,16 +48,23 @@ Parquet compression types supported with `hyparquet-compressors`:
4848

4949
### Snappy
5050

51-
Snappy compression uses [hysnappy](https://github.com/hyparam/hysnappy) for fast snappy decompression using minimal wasm.
51+
Snappy compression uses [hysnappy](https://github.com/hyparam/hysnappy) for fast snappy decompression using a minimal [WASM](https://en.wikipedia.org/wiki/WebAssembly) module.
52+
53+
We load the WASM module _synchronously_ from a base64 string embedded in the JS file. This avoids a network request and greatly simplifies bundling and serving the WASM.
5254

5355
### Gzip
5456

5557
New gzip implementation adapted from [fflate](https://github.com/101arrowz/fflate).
56-
Includes modifications to handle repeated back-to-back gzip streams that sometimes occur in parquet files (but was not supported by fflate).
58+
Includes modifications to handle repeated back-to-back gzip streams that sometimes occur in parquet files (but are not supported by fflate).
59+
60+
For gzip, the `output` buffer argument is optional:
61+
- If `output` is defined, the decompressor will write to `output` until it is full.
62+
- If `output` is undefined, the decompressor will allocate a new buffer and expand it as needed to fit the uncompressed gzip data. Because the buffer is reallocated as it grows and the result is trimmed to the decompressed length, the caller must use the _returned_ buffer.
5763

5864
### Brotli
5965

60-
Includes a minimal port of [brotli.js](https://github.com/foliojs/brotli.js) which pre-compresses the brotli dictionary using gzip to minimize the distribution bundle size.
66+
Includes a minimal port of [brotli.js](https://github.com/foliojs/brotli.js).
67+
Our implementation uses gzip to pre-compress the brotli dictionary, in order to minimize the bundle size.
6168

6269
### LZ4
6370

src/gzip.js

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -68,14 +68,16 @@ function gzipStart(input, i) {
6868
/**
6969
* GZip decompression
7070
* @param {Uint8Array} input
71-
* @param {Uint8Array} out
71+
* @param {Uint8Array} [output]
7272
* @param {number} [inputIndex]
7373
* @param {number} [outputIndex]
74+
* @returns {Uint8Array}
7475
*/
75-
export function gunzip(input, out, inputIndex = 0, outputIndex = 0) {
76-
if (!(input.length - inputIndex)) return
76+
export function gunzip(input, output, inputIndex = 0, outputIndex = 0) {
77+
let out = output ?? new Uint8Array(1024) // initial size
78+
if (!(input.length - inputIndex)) return out
7779
const payloadStart = gzipStart(input, inputIndex)
78-
if (payloadStart === input.length - 8) return
80+
if (payloadStart === input.length - 8) return out
7981
if (payloadStart > input.length - 8) throw new Error('unexpected EOF')
8082
let pos = payloadStart * 8 // position in bits
8183
let final = 0 // last chunk?
@@ -84,6 +86,16 @@ export function gunzip(input, out, inputIndex = 0, outputIndex = 0) {
8486
let lengthMap
8587
let distMap
8688
const totalBits = input.length * 8
89+
90+
/** @param {number} length */
91+
function ensureSize(length) {
92+
if (!output && length > out.length) {
93+
const old = out
94+
out = new Uint8Array(Math.max(old.length * 2, length))
95+
out.set(old)
96+
}
97+
}
98+
8799
do {
88100
if (!lengthMap) {
89101
// final chunk is next?
@@ -98,6 +110,7 @@ export function gunzip(input, out, inputIndex = 0, outputIndex = 0) {
98110
const t = s + l
99111
if (t > input.length) throw new Error('unexpected EOF')
100112
// copy uncompressed data
113+
ensureSize(outputIndex + l)
101114
out.set(input.subarray(s, t), outputIndex)
102115
outputIndex += l
103116
pos = t * 8
@@ -160,6 +173,8 @@ export function gunzip(input, out, inputIndex = 0, outputIndex = 0) {
160173
} else throw new Error('invalid block type')
161174
if (pos > totalBits) throw new Error('unexpected EOF')
162175
}
176+
177+
ensureSize(outputIndex + 131072) // max chunk size?
163178
const lms = (1 << lengthBits) - 1
164179
const dms = (1 << distBits) - 1
165180
let lpos = pos
@@ -199,6 +214,7 @@ export function gunzip(input, out, inputIndex = 0, outputIndex = 0) {
199214
if (pos > totalBits) throw new Error('unexpected EOF')
200215
const end = outputIndex + add
201216
if (outputIndex < dt) throw new Error('unexpected dictionary case')
217+
ensureSize(end)
202218
for (; outputIndex < end; outputIndex++) out[outputIndex] = out[outputIndex - dt]
203219
}
204220
}
@@ -211,4 +227,7 @@ export function gunzip(input, out, inputIndex = 0, outputIndex = 0) {
211227
const nextBlock = Math.ceil(pos / 8) + 8 // 8 byte gzip footer
212228
gunzip(input, out, nextBlock, outputIndex)
213229
}
230+
231+
if (!output) return out.subarray(0, outputIndex)
232+
return out
214233
}

test/gzip.test.js

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,4 +54,10 @@ describe('gzip compressor', () => {
5454
expect(toJson(data)).toEqual(JSON.parse(expected))
5555
} })
5656
})
57+
58+
it('read gzip with unknown length', () => {
59+
const input = new Uint8Array([31, 139, 8, 0, 77, 204, 77, 102, 0, 3, 227, 230, 22, 83, 4, 0, 117, 18, 225, 170, 4, 0, 0, 0])
60+
const resized = gunzip(input)
61+
expect(resized).toEqual(new Uint8Array([11, 11, 22, 33]))
62+
})
5763
})

0 commit comments

Comments
 (0)