|
40 | 40 | * If not defined, the library is in header only mode and can be included in other headers |
41 | 41 | * or source files without problems. But only ONE file should hold the implementation |
42 | 42 | * |
| 43 | +* #define RLSW_USE_SIMD_INTRINSICS |
| 44 | +* Detect and use SIMD intrinsics on the host compilation platform |
| 45 | +* SIMD can improve rendering considerably by vectorizing some raster operations,
| 46 | +* but any target platform that runs a program compiled with SIMD enabled
| 47 | +* must support the SIMD level the program was built for, so this flag is only
| 48 | +* recommended in specific situations and only if the developers know
| 49 | +* what they are doing; the flag is not defined by default
| 50 | +* |
43 | 51 | * rlsw capabilities can be customized by defining some internal
44 | 52 | * values before library inclusion (default values listed): |
45 | 53 | * |
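For illustration, a minimal sketch of how a project might opt in to these SIMD paths from its single implementation file. The implementation macro name `RLSW_IMPLEMENTATION` is an assumption here (use whatever macro this header actually defines for implementation mode), and the compiler must still be given matching architecture flags:

```c
// Hypothetical implementation file (rlsw_impl.c); macro names per the assumptions above
#define RLSW_USE_SIMD_INTRINSICS    // opt in to the SIMD code paths (not defined by default)
#define RLSW_IMPLEMENTATION         // assumed name of the implementation macro
#include "rlsw.h"

// Example build (assumed flags): cc -O2 -mavx2 -mfma -c rlsw_impl.c
```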
@@ -636,59 +644,58 @@ SWAPI void swBindTexture(uint32_t id); |
636 | 644 | #define SW_ARCH_RISCV |
637 | 645 | #endif |
638 | 646 |
|
639 | | -// Check for SIMD vector instructions |
640 | | -#if defined(__FMA__) && defined(__AVX2__) |
641 | | - #define SW_HAS_FMA_AVX2 |
642 | | - #include <immintrin.h> |
643 | | -#endif |
644 | | -#if defined(__FMA__) && defined(__AVX__) |
645 | | - #define SW_HAS_FMA_AVX |
646 | | - #include <immintrin.h> |
647 | | -#endif |
648 | | -#if defined(__AVX2__) |
649 | | - #define SW_HAS_AVX2 |
650 | | - #include <immintrin.h> |
651 | | -#endif |
652 | | -#if defined(__AVX__) |
653 | | - #define SW_HAS_AVX |
654 | | - #include <immintrin.h> |
655 | | -#endif |
656 | | -#if defined(__SSE4_2__) |
657 | | - #define SW_HAS_SSE42 |
658 | | - #include <nmmintrin.h> |
659 | | -#endif |
660 | | -#if defined(__SSE4_1__) |
661 | | - #define SW_HAS_SSE41 |
662 | | - #include <smmintrin.h> |
663 | | -#endif |
664 | | -#if defined(__SSSE3__) |
665 | | - #define SW_HAS_SSSE3 |
666 | | - #include <tmmintrin.h> |
667 | | -#endif |
668 | | -#if defined(__SSE3__) |
669 | | - #define SW_HAS_SSE3 |
670 | | - #include <pmmintrin.h> |
671 | | -#endif |
672 | | -#if defined(__SSE2__) || (defined(_M_AMD64) || defined(_M_X64)) // SSE2 x64 |
673 | | - #define SW_HAS_SSE2 |
674 | | - #include <emmintrin.h> |
675 | | -#endif |
676 | | -#if defined(__SSE__) |
677 | | - #define SW_HAS_SSE |
678 | | - #include <xmmintrin.h> |
679 | | -#endif |
680 | | -#if defined(__ARM_NEON) || defined(__aarch64__) |
681 | | - #if defined(__ARM_FEATURE_FMA) |
682 | | - #define SW_HAS_NEON_FMA |
683 | | - #else |
684 | | - #define SW_HAS_NEON |
| 647 | +#if defined(RLSW_USE_SIMD_INTRINSICS) |
| 648 | + // Check for SIMD vector instructions |
| 649 | + // NOTE: The compiler is responsible for enabling the required flags for the host device;
| 650 | + // supported features are detected at compile time but vary depending on the compiler
| 651 | + // TODO: This logic must be reviewed to avoid the inclusion of multiple headers
| 652 | + // and enable the highest level of SIMD available
| 653 | + #if defined(__FMA__) && defined(__AVX2__) |
| 654 | + #define SW_HAS_FMA_AVX2 |
| 655 | + #include <immintrin.h> |
| 656 | + #elif defined(__FMA__) && defined(__AVX__) |
| 657 | + #define SW_HAS_FMA_AVX |
| 658 | + #include <immintrin.h> |
| 659 | + #elif defined(__AVX2__) |
| 660 | + #define SW_HAS_AVX2 |
| 661 | + #include <immintrin.h> |
| 662 | + #elif defined(__AVX__) |
| 663 | + #define SW_HAS_AVX |
| 664 | + #include <immintrin.h> |
685 | 665 | #endif |
686 | | - #include <arm_neon.h> |
687 | | -#endif |
688 | | -#if defined(__riscv_vector) |
689 | | - #define SW_HAS_RVV |
690 | | - #include <riscv_vector.h> |
691 | | -#endif |
| 666 | + #if defined(__SSE4_2__) |
| 667 | + #define SW_HAS_SSE42 |
| 668 | + #include <nmmintrin.h> |
| 669 | + #elif defined(__SSE4_1__) |
| 670 | + #define SW_HAS_SSE41 |
| 671 | + #include <smmintrin.h> |
| 672 | + #elif defined(__SSSE3__) |
| 673 | + #define SW_HAS_SSSE3 |
| 674 | + #include <tmmintrin.h> |
| 675 | + #elif defined(__SSE3__) |
| 676 | + #define SW_HAS_SSE3 |
| 677 | + #include <pmmintrin.h> |
| 678 | + #elif defined(__SSE2__) || (defined(_M_AMD64) || defined(_M_X64)) // SSE2 x64 |
| 679 | + #define SW_HAS_SSE2 |
| 680 | + #include <emmintrin.h> |
| 681 | + #elif defined(__SSE__) |
| 682 | + #define SW_HAS_SSE |
| 683 | + #include <xmmintrin.h> |
| 684 | + #endif |
| 685 | + #if defined(__ARM_NEON) || defined(__aarch64__) |
| 686 | + #if defined(__ARM_FEATURE_FMA) |
| 687 | + #define SW_HAS_NEON_FMA |
| 688 | + #else |
| 689 | + #define SW_HAS_NEON |
| 690 | + #endif |
| 691 | + #include <arm_neon.h> |
| 692 | + #endif |
| 693 | + #if defined(__riscv_vector) |
| 694 | + // NOTE: Requires compilation flags: -march=rv64gcv -mabi=lp64d |
| 695 | + #define SW_HAS_RVV |
| 696 | + #include <riscv_vector.h> |
| 697 | + #endif |
| 698 | +#endif // RLSW_USE_SIMD_INTRINSICS |
692 | 699 |
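To see which path the detection block above actually selected, a small compile-time probe can be placed after the header inclusion. This is a sketch relying only on the `SW_HAS_*` macros defined above; `#pragma message` is supported by GCC, Clang, and MSVC, though its output format varies:

```c
// Compile-time report of the SIMD path chosen by the detection block above
#if defined(SW_HAS_FMA_AVX2)
    #pragma message("rlsw: FMA + AVX2 path selected")
#elif defined(SW_HAS_AVX2)
    #pragma message("rlsw: AVX2 path selected")
#elif defined(SW_HAS_SSE2)
    #pragma message("rlsw: SSE2 path selected")
#elif defined(SW_HAS_NEON_FMA) || defined(SW_HAS_NEON)
    #pragma message("rlsw: NEON path selected")
#elif defined(SW_HAS_RVV)
    #pragma message("rlsw: RISC-V Vector path selected")
#else
    #pragma message("rlsw: no SIMD path, scalar fallback in use")
#endif
```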
|
693 | 700 | #ifdef __cplusplus |
694 | 701 | #define SW_CURLY_INIT(name) name |
@@ -749,31 +756,31 @@ SWAPI void swBindTexture(uint32_t id); |
749 | 756 | #endif |
750 | 757 |
|
751 | 758 | #if (SW_DEPTH_BUFFER_BITS == 16) |
752 | | - #define SW_DEPTH_TYPE uint16_t |
753 | | - #define SW_DEPTH_IS_PACKED 1 |
754 | | - #define SW_DEPTH_PACK_COMP 1 |
755 | | - #define SW_DEPTH_MAX UINT16_MAX |
756 | | - #define SW_DEPTH_SCALE (1.0f/UINT16_MAX) |
757 | | - #define SW_PACK_DEPTH(d) ((SW_DEPTH_TYPE)((d)*SW_DEPTH_MAX)) |
758 | | - #define SW_UNPACK_DEPTH(p) (p) |
| 759 | + #define SW_DEPTH_TYPE uint16_t |
| 760 | + #define SW_DEPTH_IS_PACKED 1 |
| 761 | + #define SW_DEPTH_PACK_COMP 1 |
| 762 | + #define SW_DEPTH_MAX UINT16_MAX |
| 763 | + #define SW_DEPTH_SCALE (1.0f/UINT16_MAX) |
| 764 | + #define SW_PACK_DEPTH(d) ((SW_DEPTH_TYPE)((d)*SW_DEPTH_MAX)) |
| 765 | + #define SW_UNPACK_DEPTH(p) (p) |
759 | 766 | #elif (SW_DEPTH_BUFFER_BITS == 24) |
760 | | - #define SW_DEPTH_TYPE uint8_t |
761 | | - #define SW_DEPTH_IS_PACKED 0 |
762 | | - #define SW_DEPTH_PACK_COMP 3 |
763 | | - #define SW_DEPTH_MAX 0xFFFFFFU |
764 | | - #define SW_DEPTH_SCALE (1.0f/0xFFFFFFU) |
765 | | - #define SW_PACK_DEPTH_0(d) ((uint8_t)(((uint32_t)((d)*SW_DEPTH_MAX)>>16)&0xFFU)) |
766 | | - #define SW_PACK_DEPTH_1(d) ((uint8_t)(((uint32_t)((d)*SW_DEPTH_MAX)>>8)&0xFFU)) |
767 | | - #define SW_PACK_DEPTH_2(d) ((uint8_t)((uint32_t)((d)*SW_DEPTH_MAX)&0xFFU)) |
768 | | - #define SW_UNPACK_DEPTH(p) ((((uint32_t)(p)[0]<<16)|((uint32_t)(p)[1]<<8)|(uint32_t)(p)[2])) |
| 767 | + #define SW_DEPTH_TYPE uint8_t |
| 768 | + #define SW_DEPTH_IS_PACKED 0 |
| 769 | + #define SW_DEPTH_PACK_COMP 3 |
| 770 | + #define SW_DEPTH_MAX 0xFFFFFFU |
| 771 | + #define SW_DEPTH_SCALE (1.0f/0xFFFFFFU) |
| 772 | + #define SW_PACK_DEPTH_0(d) ((uint8_t)(((uint32_t)((d)*SW_DEPTH_MAX)>>16)&0xFFU)) |
| 773 | + #define SW_PACK_DEPTH_1(d) ((uint8_t)(((uint32_t)((d)*SW_DEPTH_MAX)>>8)&0xFFU)) |
| 774 | + #define SW_PACK_DEPTH_2(d) ((uint8_t)((uint32_t)((d)*SW_DEPTH_MAX)&0xFFU)) |
| 775 | + #define SW_UNPACK_DEPTH(p) ((((uint32_t)(p)[0]<<16)|((uint32_t)(p)[1]<<8)|(uint32_t)(p)[2])) |
769 | 776 | #else // 32 bits |
770 | | - #define SW_DEPTH_TYPE float |
771 | | - #define SW_DEPTH_IS_PACKED 1 |
772 | | - #define SW_DEPTH_PACK_COMP 1 |
773 | | - #define SW_DEPTH_MAX 1.0f |
774 | | - #define SW_DEPTH_SCALE 1.0f |
775 | | - #define SW_PACK_DEPTH(d) ((SW_DEPTH_TYPE)(d)) |
776 | | - #define SW_UNPACK_DEPTH(p) (p) |
| 777 | + #define SW_DEPTH_TYPE float |
| 778 | + #define SW_DEPTH_IS_PACKED 1 |
| 779 | + #define SW_DEPTH_PACK_COMP 1 |
| 780 | + #define SW_DEPTH_MAX 1.0f |
| 781 | + #define SW_DEPTH_SCALE 1.0f |
| 782 | + #define SW_PACK_DEPTH(d) ((SW_DEPTH_TYPE)(d)) |
| 783 | + #define SW_UNPACK_DEPTH(p) (p) |
777 | 784 | #endif |
778 | 785 |
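To make the 24-bit case concrete, a short sketch of a depth round trip using the macros above, assuming `SW_DEPTH_BUFFER_BITS == 24` (three bytes per sample, most significant byte first):

```c
// Pack a normalized depth value into 3 bytes, then recover it (24-bit mode)
float d = 0.75f;
uint8_t px[SW_DEPTH_PACK_COMP];     // 3 components in 24-bit mode
px[0] = SW_PACK_DEPTH_0(d);         // bits 23..16
px[1] = SW_PACK_DEPTH_1(d);         // bits 15..8
px[2] = SW_PACK_DEPTH_2(d);         // bits 7..0
float back = SW_UNPACK_DEPTH(px)*SW_DEPTH_SCALE;    // ~0.75f again
```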
|
779 | 786 | #define SW_STATE_CHECK(flags) (SW_STATE_CHECK_EX(RLSW.stateFlags, (flags))) |
@@ -1136,25 +1143,26 @@ static inline void sw_float_to_unorm8_simd(uint8_t dst[4], const float src[4]) |
1136 | 1143 | *(uint32_t*)dst = _mm_cvtsi128_si32(clamped); |
1137 | 1144 | #elif defined(SW_HAS_RVV) |
1138 | 1145 | // TODO: Sample code generated by AI, needs testing and review |
1139 | | - size_t vl = vsetvl_e32m1(4); // Load up to 4 floats into a vector register |
1140 | | - vfloat32m1_t vsrc = vle32_v_f32m1(src, vl); // Load float32 values |
| 1146 | + // NOTE: RVV 1.0 specs define the use of the __riscv_ prefix for intrinsic functions
| 1147 | + size_t vl = __riscv_vsetvl_e32m1(4); // Load up to 4 floats into a vector register |
| 1148 | + vfloat32m1_t vsrc = __riscv_vle32_v_f32m1(src, vl); // Load float32 values |
1141 | 1149 |
|
1142 | 1150 | // Clamp to [0.0f, 1.0f] |
1143 | | - vfloat32m1_t vzero = vfmv_v_f_f32m1(0.0f, vl); |
1144 | | - vfloat32m1_t vone = vfmv_v_f_f32m1(1.0f, vl); |
1145 | | - vsrc = vfmin_vv_f32m1(vsrc, vone, vl); |
1146 | | - vsrc = vfmax_vv_f32m1(vsrc, vzero, vl); |
| 1151 | + vfloat32m1_t vzero = __riscv_vfmv_v_f_f32m1(0.0f, vl); |
| 1152 | + vfloat32m1_t vone = __riscv_vfmv_v_f_f32m1(1.0f, vl); |
| 1153 | + vsrc = __riscv_vfmin_vv_f32m1(vsrc, vone, vl); |
| 1154 | + vsrc = __riscv_vfmax_vv_f32m1(vsrc, vzero, vl); |
1147 | 1155 |
|
1148 | 1156 | // Multiply by 255.0f and add 0.5f for rounding |
1149 | | - vfloat32m1_t vscaled = vfmul_vf_f32m1(vsrc, 255.0f, vl); |
1150 | | - vscaled = vfadd_vf_f32m1(vscaled, 0.5f, vl); |
| 1157 | + vfloat32m1_t vscaled = __riscv_vfmul_vf_f32m1(vsrc, 255.0f, vl); |
| 1158 | + vscaled = __riscv_vfadd_vf_f32m1(vscaled, 0.5f, vl); |
1151 | 1159 |
|
1152 | 1160 | // Convert to unsigned integer (truncate toward zero) |
1153 | | - vuint32m1_t vu32 = vfcvt_xu_f_v_u32m1(vscaled, vl); |
| 1161 | + vuint32m1_t vu32 = __riscv_vfcvt_xu_f_v_u32m1(vscaled, vl); |
1154 | 1162 |
|
1155 | 1163 | // Narrow from u32 -> u8 |
1156 | | - vuint8m1_t vu8 = vnclipu_wx_u8m1(vu32, 0, vl); // Round toward zero |
1157 | | - vse8_v_u8m1(dst, vu8, vl); // Store result |
| 1164 | + vuint8m1_t vu8 = __riscv_vnclipu_wx_u8m1(vu32, 0, vl); // Round toward zero |
| 1165 | + __riscv_vse8_v_u8m1(dst, vu8, vl); // Store result |
1158 | 1166 | #else |
1159 | 1167 | for (int i = 0; i < 4; i++) |
1160 | 1168 | { |
@@ -1190,12 +1198,12 @@ static inline void sw_float_from_unorm8_simd(float dst[4], const uint8_t src[4]) |
1190 | 1198 | _mm_storeu_ps(dst, floats); |
1191 | 1199 | #elif defined(SW_HAS_RVV) |
1192 | 1200 | // TODO: Sample code generated by AI, needs testing and review |
1193 | | - size_t vl = vsetvl_e8m1(4); // Set vector length for 8-bit input elements |
1194 | | - vuint8m1_t vsrc_u8 = vle8_v_u8m1(src, vl); // Load 4 unsigned 8-bit integers |
1195 | | - vuint32m1_t vsrc_u32 = vwcvt_xu_u_v_u32m1(vsrc_u8, vl); // Widen to 32-bit unsigned integers |
1196 | | - vfloat32m1_t vsrc_f32 = vfcvt_f_xu_v_f32m1(vsrc_u32, vl); // Convert to float32 |
1197 | | - vfloat32m1_t vnorm = vfmul_vf_f32m1(vsrc_f32, SW_INV_255, vl); // Multiply by 1/255.0 to normalize |
1198 | | - vse32_v_f32m1(dst, vnorm, vl); // Store result |
| 1201 | + size_t vl = __riscv_vsetvl_e8m1(4); // Set vector length for 8-bit input elements |
| 1202 | + vuint8m1_t vsrc_u8 = __riscv_vle8_v_u8m1(src, vl); // Load 4 unsigned 8-bit integers |
| 1203 | + vuint32m1_t vsrc_u32 = __riscv_vwcvt_xu_u_v_u32m1(vsrc_u8, vl); // Widen to 32-bit unsigned integers |
| 1204 | + vfloat32m1_t vsrc_f32 = __riscv_vfcvt_f_xu_v_f32m1(vsrc_u32, vl); // Convert to float32 |
| 1205 | + vfloat32m1_t vnorm = __riscv_vfmul_vf_f32m1(vsrc_f32, SW_INV_255, vl); // Multiply by 1/255.0 to normalize |
| 1206 | + __riscv_vse32_v_f32m1(dst, vnorm, vl); // Store result |
1199 | 1207 | #else |
1200 | 1208 | dst[0] = (float)src[0]*SW_INV_255; |
1201 | 1209 | dst[1] = (float)src[1]*SW_INV_255; |
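A quick round trip through the two conversion helpers makes a handy smoke test for the RVV paths flagged as TODO above. The expected bytes are an assumption, matching the scale-by-255, add-0.5f, truncate rounding visible in the SIMD paths shown:

```c
// Round-trip RGBA through the unorm8 helpers (expected values assume the
// +0.5f-then-truncate rounding used in the SIMD code above)
float rgba[4] = { 0.0f, 0.25f, 0.5f, 1.0f };
uint8_t packed[4];
float back[4];
sw_float_to_unorm8_simd(packed, rgba);      // expected: { 0, 64, 128, 255 }
sw_float_from_unorm8_simd(back, packed);    // each back[i] within 1/255 of rgba[i]
```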
|