Commit 0246621

REVIEWED: SIMD intrinsics checks and usage
1 parent dfc94f6 commit 0246621

File tree

1 file changed (+100 additions, -92 deletions)


src/external/rlsw.h

Lines changed: 100 additions & 92 deletions
@@ -40,6 +40,14 @@
 * If not defined, the library is in header only mode and can be included in other headers
 * or source files without problems. But only ONE file should hold the implementation
 *
+* #define RLSW_USE_SIMD_INTRINSICS
+*     Detect and use SIMD intrinsics on the host compilation platform
+*     SIMD can improve rendering considerably by vectorizing some raster operations,
+*     but the target platforms running the compiled program must support the SIMD
+*     level it has been built for, so this flag is only recommended in specific
+*     situations and only if the developers know what they are doing;
+*     this flag is not defined by default
+*
 * rlsw capabilities could be customized just defining some internal
 * values before library inclusion (default values listed):
 *
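Not part of the commit, but for context: a minimal sketch of how a build could opt in to this flag. The compiler options shown (-mavx2 -mfma) and the include path are assumptions about the project setup; RLSW_USE_SIMD_INTRINSICS only reacts to whatever SIMD support the compiler itself has enabled.

    /* Sketch: compile this translation unit with SIMD enabled for the host, e.g.
       cc -O2 -mavx2 -mfma -c rlsw_unit.c  (flags are an assumption, match your targets) */
    #define RLSW_USE_SIMD_INTRINSICS       // opt in; not defined by default
    #include "external/rlsw.h"             // include path is an assumption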
@@ -636,59 +644,58 @@ SWAPI void swBindTexture(uint32_t id);
 #define SW_ARCH_RISCV
 #endif
 
-// Check for SIMD vector instructions
-#if defined(__FMA__) && defined(__AVX2__)
-    #define SW_HAS_FMA_AVX2
-    #include <immintrin.h>
-#endif
-#if defined(__FMA__) && defined(__AVX__)
-    #define SW_HAS_FMA_AVX
-    #include <immintrin.h>
-#endif
-#if defined(__AVX2__)
-    #define SW_HAS_AVX2
-    #include <immintrin.h>
-#endif
-#if defined(__AVX__)
-    #define SW_HAS_AVX
-    #include <immintrin.h>
-#endif
-#if defined(__SSE4_2__)
-    #define SW_HAS_SSE42
-    #include <nmmintrin.h>
-#endif
-#if defined(__SSE4_1__)
-    #define SW_HAS_SSE41
-    #include <smmintrin.h>
-#endif
-#if defined(__SSSE3__)
-    #define SW_HAS_SSSE3
-    #include <tmmintrin.h>
-#endif
-#if defined(__SSE3__)
-    #define SW_HAS_SSE3
-    #include <pmmintrin.h>
-#endif
-#if defined(__SSE2__) || (defined(_M_AMD64) || defined(_M_X64)) // SSE2 x64
-    #define SW_HAS_SSE2
-    #include <emmintrin.h>
-#endif
-#if defined(__SSE__)
-    #define SW_HAS_SSE
-    #include <xmmintrin.h>
-#endif
-#if defined(__ARM_NEON) || defined(__aarch64__)
-    #if defined(__ARM_FEATURE_FMA)
-        #define SW_HAS_NEON_FMA
-    #else
-        #define SW_HAS_NEON
+#if defined(RLSW_USE_SIMD_INTRINSICS)
+    // Check for SIMD vector instructions
+    // NOTE: The compiler is responsible for enabling the required flags for the host device;
+    // supported features are detected at compile time but vary depending on the compiler
+    // TODO: This logic must be reviewed to avoid the inclusion of multiple headers
+    // and to enable the highest level of SIMD available
+    #if defined(__FMA__) && defined(__AVX2__)
+        #define SW_HAS_FMA_AVX2
+        #include <immintrin.h>
+    #elif defined(__FMA__) && defined(__AVX__)
+        #define SW_HAS_FMA_AVX
+        #include <immintrin.h>
+    #elif defined(__AVX2__)
+        #define SW_HAS_AVX2
+        #include <immintrin.h>
+    #elif defined(__AVX__)
+        #define SW_HAS_AVX
+        #include <immintrin.h>
     #endif
-    #include <arm_neon.h>
-#endif
-#if defined(__riscv_vector)
-    #define SW_HAS_RVV
-    #include <riscv_vector.h>
-#endif
+    #if defined(__SSE4_2__)
+        #define SW_HAS_SSE42
+        #include <nmmintrin.h>
+    #elif defined(__SSE4_1__)
+        #define SW_HAS_SSE41
+        #include <smmintrin.h>
+    #elif defined(__SSSE3__)
+        #define SW_HAS_SSSE3
+        #include <tmmintrin.h>
+    #elif defined(__SSE3__)
+        #define SW_HAS_SSE3
+        #include <pmmintrin.h>
+    #elif defined(__SSE2__) || (defined(_M_AMD64) || defined(_M_X64)) // SSE2 x64
+        #define SW_HAS_SSE2
+        #include <emmintrin.h>
+    #elif defined(__SSE__)
+        #define SW_HAS_SSE
+        #include <xmmintrin.h>
+    #endif
+    #if defined(__ARM_NEON) || defined(__aarch64__)
+        #if defined(__ARM_FEATURE_FMA)
+            #define SW_HAS_NEON_FMA
+        #else
+            #define SW_HAS_NEON
+        #endif
+        #include <arm_neon.h>
+    #endif
+    #if defined(__riscv_vector)
+        // NOTE: Requires compilation flags: -march=rv64gcv -mabi=lp64d
+        #define SW_HAS_RVV
+        #include <riscv_vector.h>
+    #endif
+#endif // RLSW_USE_SIMD_INTRINSICS
 
 #ifdef __cplusplus
     #define SW_CURLY_INIT(name) name
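With the new elif chains, at most one FMA/AVX macro and one SSE-level macro are defined per build. A quick way to confirm which path a particular build selected is a compile-time message after including the header; this is a diagnostics sketch only, not part of rlsw.h (#pragma message is supported by GCC, Clang and MSVC).

    #if defined(SW_HAS_FMA_AVX2)
        #pragma message("rlsw: FMA + AVX2 path")
    #elif defined(SW_HAS_SSE2)
        #pragma message("rlsw: SSE2 path")
    #elif defined(SW_HAS_NEON_FMA) || defined(SW_HAS_NEON)
        #pragma message("rlsw: NEON path")
    #elif defined(SW_HAS_RVV)
        #pragma message("rlsw: RVV path")
    #else
        #pragma message("rlsw: no SIMD macro defined, scalar fallback")
    #endif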
@@ -749,31 +756,31 @@ SWAPI void swBindTexture(uint32_t id);
 #endif
 
 #if (SW_DEPTH_BUFFER_BITS == 16)
-#define SW_DEPTH_TYPE uint16_t
-#define SW_DEPTH_IS_PACKED 1
-#define SW_DEPTH_PACK_COMP 1
-#define SW_DEPTH_MAX UINT16_MAX
-#define SW_DEPTH_SCALE (1.0f/UINT16_MAX)
-#define SW_PACK_DEPTH(d) ((SW_DEPTH_TYPE)((d)*SW_DEPTH_MAX))
-#define SW_UNPACK_DEPTH(p) (p)
+    #define SW_DEPTH_TYPE uint16_t
+    #define SW_DEPTH_IS_PACKED 1
+    #define SW_DEPTH_PACK_COMP 1
+    #define SW_DEPTH_MAX UINT16_MAX
+    #define SW_DEPTH_SCALE (1.0f/UINT16_MAX)
+    #define SW_PACK_DEPTH(d) ((SW_DEPTH_TYPE)((d)*SW_DEPTH_MAX))
+    #define SW_UNPACK_DEPTH(p) (p)
 #elif (SW_DEPTH_BUFFER_BITS == 24)
-#define SW_DEPTH_TYPE uint8_t
-#define SW_DEPTH_IS_PACKED 0
-#define SW_DEPTH_PACK_COMP 3
-#define SW_DEPTH_MAX 0xFFFFFFU
-#define SW_DEPTH_SCALE (1.0f/0xFFFFFFU)
-#define SW_PACK_DEPTH_0(d) ((uint8_t)(((uint32_t)((d)*SW_DEPTH_MAX)>>16)&0xFFU))
-#define SW_PACK_DEPTH_1(d) ((uint8_t)(((uint32_t)((d)*SW_DEPTH_MAX)>>8)&0xFFU))
-#define SW_PACK_DEPTH_2(d) ((uint8_t)((uint32_t)((d)*SW_DEPTH_MAX)&0xFFU))
-#define SW_UNPACK_DEPTH(p) ((((uint32_t)(p)[0]<<16)|((uint32_t)(p)[1]<<8)|(uint32_t)(p)[2]))
+    #define SW_DEPTH_TYPE uint8_t
+    #define SW_DEPTH_IS_PACKED 0
+    #define SW_DEPTH_PACK_COMP 3
+    #define SW_DEPTH_MAX 0xFFFFFFU
+    #define SW_DEPTH_SCALE (1.0f/0xFFFFFFU)
+    #define SW_PACK_DEPTH_0(d) ((uint8_t)(((uint32_t)((d)*SW_DEPTH_MAX)>>16)&0xFFU))
+    #define SW_PACK_DEPTH_1(d) ((uint8_t)(((uint32_t)((d)*SW_DEPTH_MAX)>>8)&0xFFU))
+    #define SW_PACK_DEPTH_2(d) ((uint8_t)((uint32_t)((d)*SW_DEPTH_MAX)&0xFFU))
+    #define SW_UNPACK_DEPTH(p) ((((uint32_t)(p)[0]<<16)|((uint32_t)(p)[1]<<8)|(uint32_t)(p)[2]))
 #else // 32 bits
-#define SW_DEPTH_TYPE float
-#define SW_DEPTH_IS_PACKED 1
-#define SW_DEPTH_PACK_COMP 1
-#define SW_DEPTH_MAX 1.0f
-#define SW_DEPTH_SCALE 1.0f
-#define SW_PACK_DEPTH(d) ((SW_DEPTH_TYPE)(d))
-#define SW_UNPACK_DEPTH(p) (p)
+    #define SW_DEPTH_TYPE float
+    #define SW_DEPTH_IS_PACKED 1
+    #define SW_DEPTH_PACK_COMP 1
+    #define SW_DEPTH_MAX 1.0f
+    #define SW_DEPTH_SCALE 1.0f
+    #define SW_PACK_DEPTH(d) ((SW_DEPTH_TYPE)(d))
+    #define SW_UNPACK_DEPTH(p) (p)
 #endif
 
 #define SW_STATE_CHECK(flags) (SW_STATE_CHECK_EX(RLSW.stateFlags, (flags)))
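For the 24-bit case, a depth value is split across three bytes, most significant byte first. A short worked sketch of the round trip with the macros above (the zbuf name and the 0.5f sample value are illustrative, and this only applies when SW_DEPTH_BUFFER_BITS == 24):

    uint8_t zbuf[3];                        // one packed 24-bit depth sample
    float d = 0.5f;                         // normalized depth in [0.0f, 1.0f]
    zbuf[0] = SW_PACK_DEPTH_0(d);           // bits 23..16 of (uint32_t)(d*SW_DEPTH_MAX)
    zbuf[1] = SW_PACK_DEPTH_1(d);           // bits 15..8
    zbuf[2] = SW_PACK_DEPTH_2(d);           // bits 7..0
    float back = (float)SW_UNPACK_DEPTH(zbuf)*SW_DEPTH_SCALE;   // ~0.5f after 24-bit quantization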
@@ -1136,25 +1143,26 @@ static inline void sw_float_to_unorm8_simd(uint8_t dst[4], const float src[4])
     *(uint32_t*)dst = _mm_cvtsi128_si32(clamped);
 #elif defined(SW_HAS_RVV)
     // TODO: Sample code generated by AI, needs testing and review
-    size_t vl = vsetvl_e32m1(4);                    // Load up to 4 floats into a vector register
-    vfloat32m1_t vsrc = vle32_v_f32m1(src, vl);     // Load float32 values
+    // NOTE: RVV 1.0 specs define the use of the __riscv_ prefix for intrinsic functions
+    size_t vl = __riscv_vsetvl_e32m1(4);                    // Load up to 4 floats into a vector register
+    vfloat32m1_t vsrc = __riscv_vle32_v_f32m1(src, vl);     // Load float32 values
 
     // Clamp to [0.0f, 1.0f]
-    vfloat32m1_t vzero = vfmv_v_f_f32m1(0.0f, vl);
-    vfloat32m1_t vone = vfmv_v_f_f32m1(1.0f, vl);
-    vsrc = vfmin_vv_f32m1(vsrc, vone, vl);
-    vsrc = vfmax_vv_f32m1(vsrc, vzero, vl);
+    vfloat32m1_t vzero = __riscv_vfmv_v_f_f32m1(0.0f, vl);
+    vfloat32m1_t vone = __riscv_vfmv_v_f_f32m1(1.0f, vl);
+    vsrc = __riscv_vfmin_vv_f32m1(vsrc, vone, vl);
+    vsrc = __riscv_vfmax_vv_f32m1(vsrc, vzero, vl);
 
     // Multiply by 255.0f and add 0.5f for rounding
-    vfloat32m1_t vscaled = vfmul_vf_f32m1(vsrc, 255.0f, vl);
-    vscaled = vfadd_vf_f32m1(vscaled, 0.5f, vl);
+    vfloat32m1_t vscaled = __riscv_vfmul_vf_f32m1(vsrc, 255.0f, vl);
+    vscaled = __riscv_vfadd_vf_f32m1(vscaled, 0.5f, vl);
 
     // Convert to unsigned integer (truncate toward zero)
-    vuint32m1_t vu32 = vfcvt_xu_f_v_u32m1(vscaled, vl);
+    vuint32m1_t vu32 = __riscv_vfcvt_xu_f_v_u32m1(vscaled, vl);
 
     // Narrow from u32 -> u8
-    vuint8m1_t vu8 = vnclipu_wx_u8m1(vu32, 0, vl);  // Round toward zero
-    vse8_v_u8m1(dst, vu8, vl);                      // Store result
+    vuint8m1_t vu8 = __riscv_vnclipu_wx_u8m1(vu32, 0, vl);  // Round toward zero
+    __riscv_vse8_v_u8m1(dst, vu8, vl);                      // Store result
 #else
     for (int i = 0; i < 4; i++)
     {
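The RVV block above implements the same conversion as the scalar path: clamp to [0, 1], scale by 255, add 0.5 and truncate. A plain-C reference of that computation, useful for validating the intrinsics (the function name is hypothetical, not part of rlsw.h):

    static inline void sw_float_to_unorm8_ref(uint8_t dst[4], const float src[4])
    {
        for (int i = 0; i < 4; i++)
        {
            float x = src[i];
            if (x < 0.0f) x = 0.0f;                     // clamp lower bound
            if (x > 1.0f) x = 1.0f;                     // clamp upper bound
            dst[i] = (uint8_t)(x*255.0f + 0.5f);        // scale, round to nearest, truncate
        }
    }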
@@ -1190,12 +1198,12 @@ static inline void sw_float_from_unorm8_simd(float dst[4], const uint8_t src[4])
     _mm_storeu_ps(dst, floats);
 #elif defined(SW_HAS_RVV)
     // TODO: Sample code generated by AI, needs testing and review
-    size_t vl = vsetvl_e8m1(4);                                     // Set vector length for 8-bit input elements
-    vuint8m1_t vsrc_u8 = vle8_v_u8m1(src, vl);                      // Load 4 unsigned 8-bit integers
-    vuint32m1_t vsrc_u32 = vwcvt_xu_u_v_u32m1(vsrc_u8, vl);         // Widen to 32-bit unsigned integers
-    vfloat32m1_t vsrc_f32 = vfcvt_f_xu_v_f32m1(vsrc_u32, vl);       // Convert to float32
-    vfloat32m1_t vnorm = vfmul_vf_f32m1(vsrc_f32, SW_INV_255, vl);  // Multiply by 1/255.0 to normalize
-    vse32_v_f32m1(dst, vnorm, vl);                                  // Store result
+    size_t vl = __riscv_vsetvl_e8m1(4);                                     // Set vector length for 8-bit input elements
+    vuint8m1_t vsrc_u8 = __riscv_vle8_v_u8m1(src, vl);                      // Load 4 unsigned 8-bit integers
+    vuint32m1_t vsrc_u32 = __riscv_vwcvt_xu_u_v_u32m1(vsrc_u8, vl);         // Widen to 32-bit unsigned integers
+    vfloat32m1_t vsrc_f32 = __riscv_vfcvt_f_xu_v_f32m1(vsrc_u32, vl);       // Convert to float32
+    vfloat32m1_t vnorm = __riscv_vfmul_vf_f32m1(vsrc_f32, SW_INV_255, vl);  // Multiply by 1/255.0 to normalize
+    __riscv_vse32_v_f32m1(dst, vnorm, vl);                                  // Store result
 #else
     dst[0] = (float)src[0]*SW_INV_255;
     dst[1] = (float)src[1]*SW_INV_255;
0 commit comments