Skip to content

Commit 47c3c0b

Browse files
committed
arm neon ld1{,q_x[234]}: slight speedups on SSE[32] & WASM
Consolidate and propagate the use of these speedups in all _xN functions. Speedups for SSE3+ confirmed with the libjpeg-turbo benchmarks: entropy encoding went from 309 to 883 Mcoefficients/sec on GCC 12.2.
1 parent 0bfcce4 commit 47c3c0b

File tree

5 files changed

+268
-553
lines changed

5 files changed

+268
-553
lines changed

simde/arm/neon/ld1.h

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,8 @@ simde_vld1q_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
267267
r_.v128 = wasm_v128_load(ptr);
268268
#elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH)
269269
r_.sv128 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 8);
270+
#elif defined(SIMDE_X86_AVX512FP16_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
271+
r_.m128h = _mm_loadu_ph(SIMDE_ALIGN_CAST(__m128h const *, ptr));
270272
#else
271273
simde_memcpy(&r_, ptr, sizeof(r_));
272274
#endif
@@ -289,6 +291,8 @@ simde_vld1q_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(4)]) {
289291
r_.v128 = wasm_v128_load(ptr);
290292
#elif defined(SIMDE_RISCV_V_NATIVE)
291293
r_.sv128 = __riscv_vle32_v_f32m1(ptr , 4);
294+
#elif defined(SIMDE_X86_SSE_NATIVE)
295+
r_.m128 = _mm_loadu_ps(ptr);
292296
#else
293297
simde_memcpy(&r_, ptr, sizeof(r_));
294298
#endif
@@ -311,6 +315,8 @@ simde_vld1q_f64(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(2)]) {
311315
r_.v128 = wasm_v128_load(ptr);
312316
#elif defined(SIMDE_RISCV_V_NATIVE)
313317
r_.sv128 = __riscv_vle64_v_f64m1(ptr , 2);
318+
#elif defined(SIMDE_X86_SSE2_NATIVE)
319+
r_.m128d = _mm_loadu_pd(ptr);
314320
#else
315321
simde_memcpy(&r_, ptr, sizeof(r_));
316322
#endif
@@ -333,6 +339,10 @@ simde_vld1q_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
333339
r_.v128 = wasm_v128_load(ptr);
334340
#elif defined(SIMDE_RISCV_V_NATIVE)
335341
r_.sv128 = __riscv_vle8_v_i8m1(ptr , 16);
342+
#elif defined(SIMDE_X86_SSE3_NATIVE)
343+
r_.m128i = _mm_lddqu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr));
344+
#elif defined(SIMDE_X86_SSE2_NATIVE)
345+
r_.m128i = _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr));
336346
#else
337347
simde_memcpy(&r_, ptr, sizeof(r_));
338348
#endif
@@ -355,6 +365,10 @@ simde_vld1q_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
355365
r_.v128 = wasm_v128_load(ptr);
356366
#elif defined(SIMDE_RISCV_V_NATIVE)
357367
r_.sv128 = __riscv_vle16_v_i16m1(ptr , 8);
368+
#elif defined(SIMDE_X86_SSE3_NATIVE)
369+
r_.m128i = _mm_lddqu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr));
370+
#elif defined(SIMDE_X86_SSE2_NATIVE)
371+
r_.m128i = _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr));
358372
#else
359373
simde_memcpy(&r_, ptr, sizeof(r_));
360374
#endif
@@ -377,6 +391,10 @@ simde_vld1q_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
377391
r_.v128 = wasm_v128_load(ptr);
378392
#elif defined(SIMDE_RISCV_V_NATIVE)
379393
r_.sv128 = __riscv_vle32_v_i32m1(ptr , 4);
394+
#elif defined(SIMDE_X86_SSE3_NATIVE)
395+
r_.m128i = _mm_lddqu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr));
396+
#elif defined(SIMDE_X86_SSE2_NATIVE)
397+
r_.m128i = _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr));
380398
#else
381399
simde_memcpy(&r_, ptr, sizeof(r_));
382400
#endif
@@ -399,6 +417,10 @@ simde_vld1q_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) {
399417
r_.v128 = wasm_v128_load(ptr);
400418
#elif defined(SIMDE_RISCV_V_NATIVE)
401419
r_.sv128 = __riscv_vle64_v_i64m1(ptr , 2);
420+
#elif defined(SIMDE_X86_SSE3_NATIVE)
421+
r_.m128i = _mm_lddqu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr));
422+
#elif defined(SIMDE_X86_SSE2_NATIVE)
423+
r_.m128i = _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr));
402424
#else
403425
simde_memcpy(&r_, ptr, sizeof(r_));
404426
#endif
@@ -421,6 +443,10 @@ simde_vld1q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
421443
r_.v128 = wasm_v128_load(ptr);
422444
#elif defined(SIMDE_RISCV_V_NATIVE)
423445
r_.sv128 = __riscv_vle8_v_u8m1(ptr , 16);
446+
#elif defined(SIMDE_X86_SSE3_NATIVE)
447+
r_.m128i = _mm_lddqu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr));
448+
#elif defined(SIMDE_X86_SSE2_NATIVE)
449+
r_.m128i = _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr));
424450
#else
425451
simde_memcpy(&r_, ptr, sizeof(r_));
426452
#endif
@@ -443,6 +469,10 @@ simde_vld1q_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
443469
r_.v128 = wasm_v128_load(ptr);
444470
#elif defined(SIMDE_RISCV_V_NATIVE)
445471
r_.sv128 = __riscv_vle16_v_u16m1(ptr , 8);
472+
#elif defined(SIMDE_X86_SSE3_NATIVE)
473+
r_.m128i = _mm_lddqu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr));
474+
#elif defined(SIMDE_X86_SSE2_NATIVE)
475+
r_.m128i = _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr));
446476
#else
447477
simde_memcpy(&r_, ptr, sizeof(r_));
448478
#endif
@@ -465,6 +495,10 @@ simde_vld1q_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
465495
r_.v128 = wasm_v128_load(ptr);
466496
#elif defined(SIMDE_RISCV_V_NATIVE)
467497
r_.sv128 = __riscv_vle32_v_u32m1(ptr , 4);
498+
#elif defined(SIMDE_X86_SSE3_NATIVE)
499+
r_.m128i = _mm_lddqu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr));
500+
#elif defined(SIMDE_X86_SSE2_NATIVE)
501+
r_.m128i = _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr));
468502
#else
469503
simde_memcpy(&r_, ptr, sizeof(r_));
470504
#endif
@@ -487,6 +521,10 @@ simde_vld1q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) {
487521
r_.v128 = wasm_v128_load(ptr);
488522
#elif defined(SIMDE_RISCV_V_NATIVE)
489523
r_.sv128 = __riscv_vle64_v_u64m1(ptr , 2);
524+
#elif defined(SIMDE_X86_SSE3_NATIVE)
525+
r_.m128i = _mm_lddqu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr));
526+
#elif defined(SIMDE_X86_SSE2_NATIVE)
527+
r_.m128i = _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr));
490528
#else
491529
simde_memcpy(&r_, ptr, sizeof(r_));
492530
#endif

0 commit comments

Comments
 (0)