|
39 | 39 | # error INFLATE_CHUNK_SIMD_* requires INFLATE_CHUNK_READ_64LE |
40 | 40 | #endif |
41 | 41 |
|
| 42 | +#ifdef __aarch64__ |
| 43 | +#include <arm_neon.h> |
| 44 | +static uint8x16_t distance_table[] = { |
| 45 | + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, |
| 46 | + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, |
| 47 | + {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}, |
| 48 | + {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0}, |
| 49 | + {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}, |
| 50 | + {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0}, |
| 51 | + {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3}, |
| 52 | + {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1}, |
| 53 | + {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7}, |
| 54 | + {0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6}, |
| 55 | + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5}, |
| 56 | + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4}, |
| 57 | + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3}, |
| 58 | + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2}, |
| 59 | + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1}, |
| 60 | + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0}, |
| 61 | + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, |
| 62 | +}; |
| 63 | + |
| 64 | +static uint64_t distance_offsets[] = {16, 16, 16, 15, 16, 15, 12, 14, 16, 9, 10, 11, 12, 13, 14, 15, 16}; |
| 65 | +#endif |
| 66 | + |
42 | 67 | /* |
43 | 68 | Decode literal, length, and distance codes and write out the resulting |
44 | 69 | literal and match bytes until either not enough input or output is |
@@ -322,12 +347,39 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ |
322 | 347 | else { |
323 | 348 | /* Whole reference is in range of current output. No |
324 | 349 | range checks are necessary because we start with room |
325 | | - for at least 258 bytes of output, so unroll and roundoff |
| 350 | + for at least 296 bytes of output, so unroll and roundoff |
326 | 351 | operations can write beyond `out+len` so long as they |
327 | | - stay within 258 bytes of `out`. |
| 352 | + stay within 296 bytes of `out`. |
328 | 353 | */ |
| 354 | +#ifdef __aarch64__ |
| 355 | + uint8_t *p = out - dist; |
| 356 | + if (dist <= 16) { |
| 357 | + uint8x16_t rep = vqtbl1q_u8(vld1q_u8(p), distance_table[dist]); |
| 358 | + uint64_t size = distance_offsets[dist]; |
| 359 | + uint8_t *o = out; |
| 360 | + int64_t n = len; |
| 361 | + do { |
| 362 | + vst1q_u8(o, rep); |
| 363 | + vst1q_u8(o+size, rep); |
| 364 | + vst1q_u8(o+size*2, rep); |
| 365 | + o += size*3; |
| 366 | + n -= size*3; |
| 367 | + } while (n > 0); |
| 368 | + out += len; |
| 369 | + } else { |
| 370 | + int64_t i = 0; |
| 371 | + do { |
| 372 | + vst1q_u8(out + i, vld1q_u8(p + i)); |
| 373 | + vst1q_u8(out + i+16, vld1q_u8(p + i+16)); |
| 374 | + vst1q_u8(out + i+32, vld1q_u8(p + i+32)); |
| 375 | + i += 48; |
| 376 | + } while (i < len); |
| 377 | + out += len; |
| 378 | + } |
| 379 | +#else |
329 | 380 | out = chunkcopy_lapped_relaxed(out, dist, len); |
330 | 381 |
|
| 382 | +#endif |
331 | 383 | } |
332 | 384 |
|
333 | 385 | chunk_continue: |
|
0 commit comments