@@ -229,6 +229,8 @@ SZ_PUBLIC void sz_hash_state_stream_serial(sz_hash_state_t *state, sz_cptr_t tex
229229/* * @copydoc sz_hash_state_fold */
230230SZ_PUBLIC sz_u64_t sz_hash_state_fold_serial (sz_hash_state_t const *state);
231231
232+ #if SZ_USE_HASWELL
233+
232234/* * @copydoc sz_bytesum */
233235SZ_PUBLIC sz_u64_t sz_bytesum_haswell (sz_cptr_t text, sz_size_t length);
234236
@@ -247,6 +249,10 @@ SZ_PUBLIC void sz_hash_state_stream_haswell(sz_hash_state_t *state, sz_cptr_t te
247249/* * @copydoc sz_hash_state_fold */
248250SZ_PUBLIC sz_u64_t sz_hash_state_fold_haswell (sz_hash_state_t const *state);
249251
252+ #endif
253+
254+ #if SZ_USE_SKYLAKE
255+
250256/* * @copydoc sz_bytesum */
251257SZ_PUBLIC sz_u64_t sz_bytesum_skylake (sz_cptr_t text, sz_size_t length);
252258
@@ -265,6 +271,10 @@ SZ_PUBLIC void sz_hash_state_stream_skylake(sz_hash_state_t *state, sz_cptr_t te
265271/* * @copydoc sz_hash_state_fold */
266272SZ_PUBLIC sz_u64_t sz_hash_state_fold_skylake (sz_hash_state_t const *state);
267273
274+ #endif
275+
276+ #if SZ_USE_ICE
277+
268278/* * @copydoc sz_bytesum */
269279SZ_PUBLIC sz_u64_t sz_bytesum_ice (sz_cptr_t text, sz_size_t length);
270280
@@ -283,9 +293,17 @@ SZ_PUBLIC void sz_hash_state_stream_ice(sz_hash_state_t *state, sz_cptr_t text,
283293/* * @copydoc sz_hash_state_fold */
284294SZ_PUBLIC sz_u64_t sz_hash_state_fold_ice (sz_hash_state_t const *state);
285295
296+ #endif
297+
298+ #if SZ_USE_NEON
299+
286300/* * @copydoc sz_bytesum */
287301SZ_PUBLIC sz_u64_t sz_bytesum_neon (sz_cptr_t text, sz_size_t length);
288302
303+ #endif
304+
305+ #if SZ_USE_NEON_AES
306+
289307/* * @copydoc sz_hash */
290308SZ_PUBLIC sz_u64_t sz_hash_neon (sz_cptr_t text, sz_size_t length, sz_u64_t seed);
291309
@@ -301,6 +319,41 @@ SZ_PUBLIC void sz_hash_state_stream_neon(sz_hash_state_t *state, sz_cptr_t text,
301319/* * @copydoc sz_hash_state_fold */
302320SZ_PUBLIC sz_u64_t sz_hash_state_fold_neon (sz_hash_state_t const *state);
303321
322+ #endif
323+
324+ #if SZ_USE_SVE
325+
326+ /* * @copydoc sz_bytesum */
327+ SZ_PUBLIC sz_u64_t sz_bytesum_sve (sz_cptr_t text, sz_size_t length);
328+
329+ #endif
330+
331+ #if SZ_USE_SVE2
332+
333+ /* * @copydoc sz_bytesum */
334+ SZ_PUBLIC sz_u64_t sz_bytesum_sve2 (sz_cptr_t text, sz_size_t length);
335+
336+ #endif
337+
338+ #if SZ_USE_SVE2_AES
339+
340+ /* * @copydoc sz_hash */
341+ SZ_PUBLIC sz_u64_t sz_hash_sve2 (sz_cptr_t text, sz_size_t length, sz_u64_t seed);
342+
343+ /* * @copydoc sz_fill_random */
344+ SZ_PUBLIC void sz_fill_random_sve2 (sz_ptr_t text, sz_size_t length, sz_u64_t nonce);
345+
346+ /* * @copydoc sz_hash_state_init */
347+ SZ_PUBLIC void sz_hash_state_init_sve2 (sz_hash_state_t *state, sz_u64_t seed);
348+
349+ /* * @copydoc sz_hash_state_stream */
350+ SZ_PUBLIC void sz_hash_state_stream_sve2 (sz_hash_state_t *state, sz_cptr_t text, sz_size_t length);
351+
352+ /* * @copydoc sz_hash_state_fold */
353+ SZ_PUBLIC sz_u64_t sz_hash_state_fold_sve2 (sz_hash_state_t const *state);
354+
355+ #endif
356+
304357#pragma endregion // Core API
305358
306359#pragma region Helper Methods
@@ -1922,8 +1975,8 @@ SZ_INTERNAL void sz_hash_minimal_x4_update_ice_(sz_hash_minimal_x4_t_ *state, __
19221975#pragma region NEON Implementation
19231976#if SZ_USE_NEON
19241977#pragma GCC push_options
1925- #pragma GCC target("arch=armv8.2-a+simd+crypto ")
1926- #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+crypto "))), apply_to = function)
1978+ #pragma GCC target("arch=armv8.2-a+simd")
1979+ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function)
19271980
19281981SZ_PUBLIC sz_u64_t sz_bytesum_neon (sz_cptr_t text, sz_size_t length) {
19291982 uint64x2_t sum_vec = vdupq_n_u64 (0 );
@@ -1943,6 +1996,17 @@ SZ_PUBLIC sz_u64_t sz_bytesum_neon(sz_cptr_t text, sz_size_t length) {
19431996 return sum;
19441997}
19451998
1999+ #pragma clang attribute pop
2000+ #pragma GCC pop_options
2001+ #endif // SZ_USE_NEON
2002+ #pragma endregion // NEON Implementation
2003+
2004+ #pragma region NEON AES Implementation
2005+ #if SZ_USE_NEON_AES
2006+ #pragma GCC push_options
2007+ #pragma GCC target("arch=armv8.2-a+simd+crypto+aes")
2008+ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+crypto+aes"))), apply_to = function)
2009+
19462010/* *
19472011 * @brief Emulates the Intel's AES-NI `AESENC` instruction on Arm NEON.
19482012 * @see "Emulating x86 AES Intrinsics on ARMv8-A" by Michael Brase:
@@ -2303,7 +2367,7 @@ SZ_PUBLIC void sz_fill_random_neon(sz_ptr_t text, sz_size_t length, sz_u64_t non
23032367#pragma clang attribute pop
23042368#pragma GCC pop_options
23052369#endif // SZ_USE_NEON_AES
2306- #pragma endregion // NEON Implementation
2370+ #pragma endregion // NEON AES Implementation
23072371
23082372/* Implementation of the byte-sum and hashing algorithms using the Arm SVE variable-length registers,
23092373 * available in Arm v9 processors, like in Apple M4+ and Graviton 3+ CPUs.
@@ -2340,11 +2404,11 @@ SZ_PUBLIC sz_u64_t sz_bytesum_sve(sz_cptr_t text, sz_size_t length) {
23402404 *
23412405 * @see https://stackoverflow.com/a/73218637/2766161
23422406 */
2343- #pragma region SVE Implementation
2407+ #pragma region SVE2 Implementation
23442408#if SZ_USE_SVE2
23452409#pragma GCC push_options
2346- #pragma GCC target("arch=armv8.2-a+sve+sve2+sve2-aes ")
2347- #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve+sve2+sve2-aes "))), apply_to = function)
2410+ #pragma GCC target("arch=armv8.2-a+sve+sve2")
2411+ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve+sve2"))), apply_to = function)
23482412
23492413SZ_PUBLIC sz_u64_t sz_bytesum_sve2 (sz_cptr_t text, sz_size_t length) {
23502414 sz_u64_t sum = 0 ;
@@ -2371,6 +2435,17 @@ SZ_PUBLIC sz_u64_t sz_bytesum_sve2(sz_cptr_t text, sz_size_t length) {
23712435 return sum;
23722436}
23732437
2438+ #pragma clang attribute pop
2439+ #pragma GCC pop_options
2440+ #endif // SZ_USE_SVE2
2441+ #pragma endregion // SVE2 Implementation
2442+
2443+ #pragma region SVE2 AES Implementation
2444+ #if SZ_USE_SVE2_AES
2445+ #pragma GCC push_options
2446+ #pragma GCC target("arch=armv8.2-a+sve+sve2+sve2-aes")
2447+ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve+sve2+sve2-aes"))), apply_to = function)
2448+
23742449/* *
23752450 * @brief Emulates the Intel's AES-NI `AESENC` instruction with Arm SVE2.
23762451 * @see "Emulating x86 AES Intrinsics on ARMv8-A" by Michael Brase:
@@ -2504,7 +2579,7 @@ SZ_PUBLIC void sz_hash_sve2_upto16x16_(char texts[16][16], sz_size_t length[16],
25042579#pragma clang attribute pop
25052580#pragma GCC pop_options
25062581#endif // SZ_USE_SVE2_AES
2507- #pragma endregion // SVE Implementation
2582+ #pragma endregion // SVE2 AES Implementation
25082583
25092584/* Pick the right implementation for the hashing and random-generation algorithms.
25102585 * To override this behavior and precompile all backends - set `SZ_DYNAMIC_DISPATCH` to 1.
@@ -2537,9 +2612,9 @@ SZ_DYNAMIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length, sz_u64_t seed) {
25372612 return sz_hash_skylake (text, length, seed);
25382613#elif SZ_USE_HASWELL
25392614 return sz_hash_haswell (text, length, seed);
2540- #elif SZ_USE_SVE2
2615+ #elif SZ_USE_SVE2_AES
25412616 return sz_hash_sve2 (text, length, seed);
2542- #elif SZ_USE_NEON
2617+ #elif SZ_USE_NEON_AES
25432618 return sz_hash_neon (text, length, seed);
25442619#else
25452620 return sz_hash_serial (text, length, seed);
@@ -2553,9 +2628,9 @@ SZ_DYNAMIC void sz_fill_random(sz_ptr_t text, sz_size_t length, sz_u64_t nonce)
25532628 sz_fill_random_skylake (text, length, nonce);
25542629#elif SZ_USE_HASWELL
25552630 sz_fill_random_haswell (text, length, nonce);
2556- #elif SZ_USE_SVE2
2631+ #elif SZ_USE_SVE2_AES
25572632 sz_fill_random_sve2 (text, length, nonce);
2558- #elif SZ_USE_NEON
2633+ #elif SZ_USE_NEON_AES
25592634 sz_fill_random_neon (text, length, nonce);
25602635#else
25612636 sz_fill_random_serial (text, length, nonce);
@@ -2569,9 +2644,9 @@ SZ_DYNAMIC void sz_hash_state_init(sz_hash_state_t *state, sz_u64_t seed) {
25692644 sz_hash_state_init_skylake (state, seed);
25702645#elif SZ_USE_HASWELL
25712646 sz_hash_state_init_haswell (state, seed);
2572- #elif SZ_USE_SVE2
2647+ #elif SZ_USE_SVE2_AES
25732648 sz_hash_state_init_sve2 (state, seed);
2574- #elif SZ_USE_NEON
2649+ #elif SZ_USE_NEON_AES
25752650 sz_hash_state_init_neon (state, seed);
25762651#else
25772652 sz_hash_state_init_serial (state, seed);
@@ -2585,9 +2660,9 @@ SZ_DYNAMIC void sz_hash_state_stream(sz_hash_state_t *state, sz_cptr_t text, sz_
25852660 sz_hash_state_stream_skylake (state, text, length);
25862661#elif SZ_USE_HASWELL
25872662 sz_hash_state_stream_haswell (state, text, length);
2588- #elif SZ_USE_SVE2
2663+ #elif SZ_USE_SVE2_AES
25892664 sz_hash_state_stream_sve2 (state, text, length);
2590- #elif SZ_USE_NEON
2665+ #elif SZ_USE_NEON_AES
25912666 sz_hash_state_stream_neon (state, text, length);
25922667#else
25932668 sz_hash_state_stream_serial (state, text, length);
@@ -2601,9 +2676,9 @@ SZ_DYNAMIC sz_u64_t sz_hash_state_fold(sz_hash_state_t const *state) {
26012676 return sz_hash_state_fold_skylake (state);
26022677#elif SZ_USE_HASWELL
26032678 return sz_hash_state_fold_haswell (state);
2604- #elif SZ_USE_SVE2
2679+ #elif SZ_USE_SVE2_AES
26052680 return sz_hash_state_fold_sve2 (state);
2606- #elif SZ_USE_NEON
2681+ #elif SZ_USE_NEON_AES
26072682 return sz_hash_state_fold_neon (state);
26082683#else
26092684 return sz_hash_state_fold_serial (state);
0 commit comments