Skip to content

Commit a2f4bd2

Browse files
committed
Inflate: Simpler, slightly faster NEON chunk-copy
1 parent 315b663 commit a2f4bd2

File tree

2 files changed

+55
-3
lines changed

2 files changed

+55
-3
lines changed

inffast.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,6 @@
2121
length/distance code pair can output up to 258 bytes, which is the maximum
2222
length that can be coded.
2323
*/
24-
#define INFLATE_FAST_MIN_OUTPUT 258
24+
#define INFLATE_FAST_MIN_OUTPUT 298
2525

2626
void ZLIB_INTERNAL inflate_fast OF((z_streamp strm, unsigned start));

inffast_chunk.c

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,31 @@
3939
# error INFLATE_CHUNK_SIMD_* requires INFLATE_CHUNK_READ_64LE
4040
#endif
4141

42+
#ifdef __aarch64__
43+
#include <arm_neon.h>
44+
static uint8x16_t distance_table[] = {
45+
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
46+
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
47+
{0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
48+
{0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},
49+
{0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},
50+
{0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0},
51+
{0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3},
52+
{0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1},
53+
{0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7},
54+
{0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6},
55+
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5},
56+
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4},
57+
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3},
58+
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2},
59+
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1},
60+
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0},
61+
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
62+
};
63+
64+
static uint64_t distance_offsets[] = {16, 16, 16, 15, 16, 15, 12, 14, 16, 9, 10, 11, 12, 13, 14, 15, 16};
65+
#endif
66+
4267
/*
4368
Decode literal, length, and distance codes and write out the resulting
4469
literal and match bytes until either not enough input or output is
@@ -322,12 +347,39 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
322347
else {
323348
/* Whole reference is in range of current output. No
324349
range checks are necessary because we start with room
325-
for at least 258 bytes of output, so unroll and roundoff
350+
for at least 296 bytes of output, so unroll and roundoff
326351
operations can write beyond `out+len` so long as they
327-
stay within 258 bytes of `out`.
352+
stay within 296 bytes of `out`.
328353
*/
354+
#ifdef __aarch64__
355+
uint8_t *p = out - dist;
356+
if (dist <= 16) {
357+
uint8x16_t rep = vqtbl1q_u8(vld1q_u8(p), distance_table[dist]);
358+
uint64_t size = distance_offsets[dist];
359+
uint8_t *o = out;
360+
int64_t n = len;
361+
do {
362+
vst1q_u8(o, rep);
363+
vst1q_u8(o+size, rep);
364+
vst1q_u8(o+size*2, rep);
365+
o += size*3;
366+
n -= size*3;
367+
} while (n > 0);
368+
out += len;
369+
} else {
370+
int64_t i = 0;
371+
do {
372+
vst1q_u8(out + i, vld1q_u8(p + i));
373+
vst1q_u8(out + i+16, vld1q_u8(p + i+16));
374+
vst1q_u8(out + i+32, vld1q_u8(p + i+32));
375+
i += 48;
376+
} while (i < len);
377+
out += len;
378+
}
379+
#else
329380
out = chunkcopy_lapped_relaxed(out, dist, len);
330381

382+
#endif
331383
}
332384

333385
chunk_continue:

0 commit comments

Comments
 (0)