@@ -953,14 +953,30 @@ SZ_PUBLIC void sz_hash_state_init_haswell(sz_hash_state_t *state, sz_u64_t seed)
953953
954954SZ_INTERNAL void _sz_hash_state_update_haswell (sz_hash_state_t *state) {
955955 __m128i const shuffle_mask = _mm_load_si128 ((__m128i const *)_sz_hash_u8x16x4_shuffle ());
956- state->aes .xmms [0 ] = _mm_aesenc_si128 (state->aes .xmms [0 ], state->ins .xmms [0 ]);
957- state->sum .xmms [0 ] = _mm_add_epi64 (_mm_shuffle_epi8 (state->sum .xmms [0 ], shuffle_mask), state->ins .xmms [0 ]);
958- state->aes .xmms [1 ] = _mm_aesenc_si128 (state->aes .xmms [1 ], state->ins .xmms [1 ]);
959- state->sum .xmms [1 ] = _mm_add_epi64 (_mm_shuffle_epi8 (state->sum .xmms [1 ], shuffle_mask), state->ins .xmms [1 ]);
960- state->aes .xmms [2 ] = _mm_aesenc_si128 (state->aes .xmms [2 ], state->ins .xmms [2 ]);
961- state->sum .xmms [2 ] = _mm_add_epi64 (_mm_shuffle_epi8 (state->sum .xmms [2 ], shuffle_mask), state->ins .xmms [2 ]);
962- state->aes .xmms [3 ] = _mm_aesenc_si128 (state->aes .xmms [3 ], state->ins .xmms [3 ]);
963- state->sum .xmms [3 ] = _mm_add_epi64 (_mm_shuffle_epi8 (state->sum .xmms [3 ], shuffle_mask), state->ins .xmms [3 ]);
956+ _mm_storeu_si128 ( //
957+ &state->aes .xmms [0 ],
958+ _mm_aesenc_si128 (_mm_lddqu_si128 (&state->aes .xmms [0 ]), _mm_lddqu_si128 (&state->ins .xmms [0 ])));
959+ _mm_storeu_si128 ( //
960+ &state->sum .xmms [0 ], _mm_add_epi64 (_mm_shuffle_epi8 (_mm_lddqu_si128 (&state->sum .xmms [0 ]), shuffle_mask),
961+ _mm_lddqu_si128 (&state->ins .xmms [0 ])));
962+ _mm_storeu_si128 ( //
963+ &state->aes .xmms [1 ],
964+ _mm_aesenc_si128 (_mm_lddqu_si128 (&state->aes .xmms [1 ]), _mm_lddqu_si128 (&state->ins .xmms [1 ])));
965+ _mm_storeu_si128 ( //
966+ &state->sum .xmms [1 ], _mm_add_epi64 (_mm_shuffle_epi8 (_mm_lddqu_si128 (&state->sum .xmms [1 ]), shuffle_mask),
967+ _mm_lddqu_si128 (&state->ins .xmms [1 ])));
968+ _mm_storeu_si128 ( //
969+ &state->aes .xmms [2 ],
970+ _mm_aesenc_si128 (_mm_lddqu_si128 (&state->aes .xmms [2 ]), _mm_lddqu_si128 (&state->ins .xmms [2 ])));
971+ _mm_storeu_si128 ( //
972+ &state->sum .xmms [2 ], _mm_add_epi64 (_mm_shuffle_epi8 (_mm_lddqu_si128 (&state->sum .xmms [2 ]), shuffle_mask),
973+ _mm_lddqu_si128 (&state->ins .xmms [2 ])));
974+ _mm_storeu_si128 ( //
975+ &state->aes .xmms [3 ],
976+ _mm_aesenc_si128 (_mm_lddqu_si128 (&state->aes .xmms [3 ]), _mm_lddqu_si128 (&state->ins .xmms [3 ])));
977+ _mm_storeu_si128 ( //
978+ &state->sum .xmms [3 ], _mm_add_epi64 (_mm_shuffle_epi8 (_mm_lddqu_si128 (&state->sum .xmms [3 ]), shuffle_mask),
979+ _mm_lddqu_si128 (&state->ins .xmms [3 ])));
964980}
965981
966982SZ_INTERNAL sz_u64_t _sz_hash_state_finalize_haswell (sz_hash_state_t const *state) {
@@ -1074,10 +1090,10 @@ SZ_PUBLIC void sz_hash_state_stream_haswell(sz_hash_state_t *state, sz_cptr_t te
10741090 while (length) {
10751091 // Append to the internal buffer until it's full
10761092 if (state->ins_length % 64 == 0 && length >= 64 ) {
1077- state->ins .xmms [0 ] = _mm_lddqu_si128 ((__m128i const *)(text + 0 ));
1078- state->ins .xmms [1 ] = _mm_lddqu_si128 ((__m128i const *)(text + 16 ));
1079- state->ins .xmms [2 ] = _mm_lddqu_si128 ((__m128i const *)(text + 32 ));
1080- state->ins .xmms [3 ] = _mm_lddqu_si128 ((__m128i const *)(text + 48 ));
1093+ _mm_storeu_si128 (& state->ins .xmms [0 ], _mm_lddqu_si128 ((__m128i const *)(text + 0 ) ));
1094+ _mm_storeu_si128 (& state->ins .xmms [1 ], _mm_lddqu_si128 ((__m128i const *)(text + 16 ) ));
1095+ _mm_storeu_si128 (& state->ins .xmms [2 ], _mm_lddqu_si128 ((__m128i const *)(text + 32 ) ));
1096+ _mm_storeu_si128 (& state->ins .xmms [3 ], _mm_lddqu_si128 ((__m128i const *)(text + 48 ) ));
10811097 _sz_hash_state_update_haswell (state);
10821098 state->ins_length += 64 ;
10831099 text += 64 ;
@@ -1623,9 +1639,11 @@ SZ_INTERNAL void _sz_hash_state_update_ice(sz_hash_state_t *state) {
16231639 __m512i const shuffle_mask = _mm512_load_si512 ((__m512i const *)_sz_hash_u8x16x4_shuffle ());
16241640 // ! In this kernel, assuming it may be called on arbitrarily misaligned `state`,
16251641 // ! we must use `_mm512_storeu_si512` stores to update the state.
1626- _mm512_storeu_si512 (&state->aes .zmm , _mm512_aesenc_epi128 (state->aes .zmm , state->ins .zmm ));
1642+ _mm512_storeu_si512 (&state->aes .zmm ,
1643+ _mm512_aesenc_epi128 (_mm512_loadu_si512 (&state->aes .zmm ), _mm512_loadu_si512 (&state->ins .zmm )));
16271644 _mm512_storeu_si512 (&state->sum .zmm ,
1628- _mm512_add_epi64 (_mm512_shuffle_epi8 (state->sum .zmm , shuffle_mask), state->ins .zmm ));
1645+ _mm512_add_epi64 (_mm512_shuffle_epi8 (_mm512_loadu_si512 (&state->sum .zmm ), shuffle_mask),
1646+ _mm512_loadu_si512 (&state->ins .zmm )));
16291647}
16301648
16311649SZ_PUBLIC sz_u64_t sz_hash_ice (sz_cptr_t start, sz_size_t length, sz_u64_t seed) {
0 commit comments