Skip to content

Commit 37863c9

Browse files
committed
Fix: Unaligned loads/stores of hash state
1 parent 2727a87 commit 37863c9

File tree

1 file changed

+32
-14
lines changed

1 file changed

+32
-14
lines changed

include/stringzilla/hash.h

Lines changed: 32 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -953,14 +953,30 @@ SZ_PUBLIC void sz_hash_state_init_haswell(sz_hash_state_t *state, sz_u64_t seed)
953953

954954
SZ_INTERNAL void _sz_hash_state_update_haswell(sz_hash_state_t *state) {
955955
__m128i const shuffle_mask = _mm_load_si128((__m128i const *)_sz_hash_u8x16x4_shuffle());
956-
state->aes.xmms[0] = _mm_aesenc_si128(state->aes.xmms[0], state->ins.xmms[0]);
957-
state->sum.xmms[0] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[0], shuffle_mask), state->ins.xmms[0]);
958-
state->aes.xmms[1] = _mm_aesenc_si128(state->aes.xmms[1], state->ins.xmms[1]);
959-
state->sum.xmms[1] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[1], shuffle_mask), state->ins.xmms[1]);
960-
state->aes.xmms[2] = _mm_aesenc_si128(state->aes.xmms[2], state->ins.xmms[2]);
961-
state->sum.xmms[2] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[2], shuffle_mask), state->ins.xmms[2]);
962-
state->aes.xmms[3] = _mm_aesenc_si128(state->aes.xmms[3], state->ins.xmms[3]);
963-
state->sum.xmms[3] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[3], shuffle_mask), state->ins.xmms[3]);
956+
_mm_storeu_si128( //
957+
&state->aes.xmms[0],
958+
_mm_aesenc_si128(_mm_lddqu_si128(&state->aes.xmms[0]), _mm_lddqu_si128(&state->ins.xmms[0])));
959+
_mm_storeu_si128( //
960+
&state->sum.xmms[0], _mm_add_epi64(_mm_shuffle_epi8(_mm_lddqu_si128(&state->sum.xmms[0]), shuffle_mask),
961+
_mm_lddqu_si128(&state->ins.xmms[0])));
962+
_mm_storeu_si128( //
963+
&state->aes.xmms[1],
964+
_mm_aesenc_si128(_mm_lddqu_si128(&state->aes.xmms[1]), _mm_lddqu_si128(&state->ins.xmms[1])));
965+
_mm_storeu_si128( //
966+
&state->sum.xmms[1], _mm_add_epi64(_mm_shuffle_epi8(_mm_lddqu_si128(&state->sum.xmms[1]), shuffle_mask),
967+
_mm_lddqu_si128(&state->ins.xmms[1])));
968+
_mm_storeu_si128( //
969+
&state->aes.xmms[2],
970+
_mm_aesenc_si128(_mm_lddqu_si128(&state->aes.xmms[2]), _mm_lddqu_si128(&state->ins.xmms[2])));
971+
_mm_storeu_si128( //
972+
&state->sum.xmms[2], _mm_add_epi64(_mm_shuffle_epi8(_mm_lddqu_si128(&state->sum.xmms[2]), shuffle_mask),
973+
_mm_lddqu_si128(&state->ins.xmms[2])));
974+
_mm_storeu_si128( //
975+
&state->aes.xmms[3],
976+
_mm_aesenc_si128(_mm_lddqu_si128(&state->aes.xmms[3]), _mm_lddqu_si128(&state->ins.xmms[3])));
977+
_mm_storeu_si128( //
978+
&state->sum.xmms[3], _mm_add_epi64(_mm_shuffle_epi8(_mm_lddqu_si128(&state->sum.xmms[3]), shuffle_mask),
979+
_mm_lddqu_si128(&state->ins.xmms[3])));
964980
}
965981

966982
SZ_INTERNAL sz_u64_t _sz_hash_state_finalize_haswell(sz_hash_state_t const *state) {
@@ -1074,10 +1090,10 @@ SZ_PUBLIC void sz_hash_state_stream_haswell(sz_hash_state_t *state, sz_cptr_t te
10741090
while (length) {
10751091
// Append to the internal buffer until it's full
10761092
if (state->ins_length % 64 == 0 && length >= 64) {
1077-
state->ins.xmms[0] = _mm_lddqu_si128((__m128i const *)(text + 0));
1078-
state->ins.xmms[1] = _mm_lddqu_si128((__m128i const *)(text + 16));
1079-
state->ins.xmms[2] = _mm_lddqu_si128((__m128i const *)(text + 32));
1080-
state->ins.xmms[3] = _mm_lddqu_si128((__m128i const *)(text + 48));
1093+
_mm_storeu_si128(&state->ins.xmms[0], _mm_lddqu_si128((__m128i const *)(text + 0)));
1094+
_mm_storeu_si128(&state->ins.xmms[1], _mm_lddqu_si128((__m128i const *)(text + 16)));
1095+
_mm_storeu_si128(&state->ins.xmms[2], _mm_lddqu_si128((__m128i const *)(text + 32)));
1096+
_mm_storeu_si128(&state->ins.xmms[3], _mm_lddqu_si128((__m128i const *)(text + 48)));
10811097
_sz_hash_state_update_haswell(state);
10821098
state->ins_length += 64;
10831099
text += 64;
@@ -1623,9 +1639,11 @@ SZ_INTERNAL void _sz_hash_state_update_ice(sz_hash_state_t *state) {
16231639
__m512i const shuffle_mask = _mm512_load_si512((__m512i const *)_sz_hash_u8x16x4_shuffle());
16241640
// ! In this kernel, assuming it may be called on arbitrarily misaligned `state`,
16251641
// ! we must use `_mm512_storeu_si512` stores to update the state.
1626-
_mm512_storeu_si512(&state->aes.zmm, _mm512_aesenc_epi128(state->aes.zmm, state->ins.zmm));
1642+
_mm512_storeu_si512(&state->aes.zmm,
1643+
_mm512_aesenc_epi128(_mm512_loadu_si512(&state->aes.zmm), _mm512_loadu_si512(&state->ins.zmm)));
16271644
_mm512_storeu_si512(&state->sum.zmm,
1628-
_mm512_add_epi64(_mm512_shuffle_epi8(state->sum.zmm, shuffle_mask), state->ins.zmm));
1645+
_mm512_add_epi64(_mm512_shuffle_epi8(_mm512_loadu_si512(&state->sum.zmm), shuffle_mask),
1646+
_mm512_loadu_si512(&state->ins.zmm)));
16291647
}
16301648

16311649
SZ_PUBLIC sz_u64_t sz_hash_ice(sz_cptr_t start, sz_size_t length, sz_u64_t seed) {

0 commit comments

Comments
 (0)