@@ -616,69 +616,74 @@ SWAPI void swBindTexture(uint32_t id);
616616#include <math.h> // Required for: floorf(), fabsf()
617617
618618#if defined(__FMA__ ) && defined(__AVX2__ )
619- # define SW_HAS_FMA_AVX2
620- # include <immintrin.h>
619+ # define SW_HAS_FMA_AVX2
620+ # include <immintrin.h>
621621#endif
622622
623623#if defined(__FMA__ ) && defined(__AVX__ )
624- # define SW_HAS_FMA_AVX
625- # include <immintrin.h>
624+ # define SW_HAS_FMA_AVX
625+ # include <immintrin.h>
626626#endif
627627
628628#if defined(__AVX2__ )
629- # define SW_HAS_AVX2
630- # include <immintrin.h>
629+ # define SW_HAS_AVX2
630+ # include <immintrin.h>
631631#endif
632632
633633#if defined(__AVX__ )
634- # define SW_HAS_AVX
635- # include <immintrin.h>
634+ # define SW_HAS_AVX
635+ # include <immintrin.h>
636636#endif
637637
638638#if defined(__SSE4_2__ )
639- # define SW_HAS_SSE42
640- # include <nmmintrin.h>
639+ # define SW_HAS_SSE42
640+ # include <nmmintrin.h>
641641#endif
642642
643643#if defined(__SSE4_1__ )
644- # define SW_HAS_SSE41
645- # include <smmintrin.h>
644+ # define SW_HAS_SSE41
645+ # include <smmintrin.h>
646646#endif
647647
648648#if defined(__SSSE3__ )
649- # define SW_HAS_SSSE3
650- # include <tmmintrin.h>
649+ # define SW_HAS_SSSE3
650+ # include <tmmintrin.h>
651651#endif
652652
653653#if defined(__SSE3__ )
654- # define SW_HAS_SSE3
655- # include <pmmintrin.h>
654+ # define SW_HAS_SSE3
655+ # include <pmmintrin.h>
656656#endif
657657
658658#if defined(__SSE2__ )
659- # define SW_HAS_SSE2
660- # include <emmintrin.h>
659+ # define SW_HAS_SSE2
660+ # include <emmintrin.h>
661661#endif
662662
663663#if defined(__SSE__ )
664- # define SW_HAS_SSE
665- # include <xmmintrin.h>
664+ # define SW_HAS_SSE
665+ # include <xmmintrin.h>
666666#endif
667667
668668#if defined(__ARM_NEON ) || defined(__aarch64__ )
669- # if defined(__ARM_FEATURE_FMA )
670- # define SW_HAS_NEON_FMA
671- # else
672- # define SW_HAS_NEON
673- # endif
674- # include <arm_neon.h>
669+ #if defined(__ARM_FEATURE_FMA )
670+ #define SW_HAS_NEON_FMA
671+ #else
672+ #define SW_HAS_NEON
673+ #endif
674+ #include <arm_neon.h>
675+ #endif
676+
677+ #ifdef __riscv_vector
678+ #define SW_HAS_RVV
679+ #include <riscv_vector.h>
675680#endif
676681
677682//----------------------------------------------------------------------------------
678683// Defines and Macros
679684//----------------------------------------------------------------------------------
680685#define SW_PI 3.14159265358979323846f
681- #define SW_INV_255 0.00392156862745098f
686+ #define SW_INV_255 0.00392156862745098f // 1.0f/255.0f
682687#define SW_DEG2RAD (SW_PI/180.0f)
683688#define SW_RAD2DEG (180.0f/SW_PI)
684689
@@ -1102,6 +1107,27 @@ static inline void sw_float_to_unorm8_simd(uint8_t dst[4], const float src[4])
11021107 clamped = _mm_packs_epi32 (clamped , clamped ); // s32 -> s16 (saturated)
11031108 clamped = _mm_packus_epi16 (clamped , clamped ); // s16 -> u8 (saturated < 0 to 0)
11041109 * (uint32_t * )dst = _mm_cvtsi128_si32 (clamped );
1110+ #elif defined(SW_HAS_RVV )
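1111+ // TODO: AI-generated sample, untested. Known problems: pre-v0.11 intrinsic names (current toolchains require the __riscv_ prefix), and vnclipu narrows one step only (u16m2 -> u8m1), so it cannot take the u32m1 value below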
1112+ size_t vl = vsetvl_e32m1 (4 ); // Load up to 4 floats into a vector register
1113+ vfloat32m1_t vsrc = vle32_v_f32m1 (src , vl ); // Load float32 values
1114+
1115+ // Clamp to [0.0f, 1.0f]
1116+ vfloat32m1_t vzero = vfmv_v_f_f32m1 (0.0f , vl );
1117+ vfloat32m1_t vone = vfmv_v_f_f32m1 (1.0f , vl );
1118+ vsrc = vfmin_vv_f32m1 (vsrc , vone , vl );
1119+ vsrc = vfmax_vv_f32m1 (vsrc , vzero , vl );
1120+
1121+ // Multiply by 255.0f and add 0.5f for rounding
1122+ vfloat32m1_t vscaled = vfmul_vf_f32m1 (vsrc , 255.0f , vl );
1123+ vscaled = vfadd_vf_f32m1 (vscaled , 0.5f , vl );
1124+
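1125+ // Convert to unsigned integer; NOTE: vfcvt_xu_f_v rounds per the dynamic rounding mode (frm), it does not truncate (the truncating form is vfcvt_rtz_xu_f_v)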
1126+ vuint32m1_t vu32 = vfcvt_xu_f_v_u32m1 (vscaled , vl );
1127+
1128+ // Narrow from u32 -> u8
1129+ vuint8m1_t vu8 = vnclipu_wx_u8m1 (vu32 , 0 , vl ); // Round toward zero
1130+ vse8_v_u8m1 (dst , vu8 , vl ); // Store result
11051131#else
11061132 for (int i = 0 ; i < 4 ; i ++ )
11071133 {
@@ -1123,18 +1149,26 @@ static inline void sw_float_from_unorm8_simd(float dst[4], const uint8_t src[4])
11231149 floats = vmulq_n_f32 (floats , SW_INV_255 );
11241150 vst1q_f32 (dst , floats );
11251151#elif defined(SW_HAS_SSE41 )
1126- __m128i bytes = _mm_cvtsi32_si128 (* (const uint32_t * )src );
1152+ __m128i bytes = _mm_cvtsi32_si128 (* (const uint32_t * )src );
11271153 __m128i ints = _mm_cvtepu8_epi32 (bytes );
11281154 __m128 floats = _mm_cvtepi32_ps (ints );
11291155 floats = _mm_mul_ps (floats , _mm_set1_ps (SW_INV_255 ));
11301156 _mm_storeu_ps (dst , floats );
11311157#elif defined(SW_HAS_SSE2 )
1132- __m128i bytes = _mm_cvtsi32_si128 (* (const uint32_t * )src );
1158+ __m128i bytes = _mm_cvtsi32_si128 (* (const uint32_t * )src );
11331159 bytes = _mm_unpacklo_epi8 (bytes , _mm_setzero_si128 ());
11341160 __m128i ints = _mm_unpacklo_epi16 (bytes , _mm_setzero_si128 ());
11351161 __m128 floats = _mm_cvtepi32_ps (ints );
11361162 floats = _mm_mul_ps (floats , _mm_set1_ps (SW_INV_255 ));
11371163 _mm_storeu_ps (dst , floats );
1164+ #elif defined(SW_HAS_RVV )
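1165+ // TODO: AI-generated sample, untested. Known problems: pre-v0.11 intrinsic names (current toolchains require the __riscv_ prefix), and vwcvt_xu_u_v_u32m1 is not an RVV intrinsic (u8 -> u32 zero-extension is vzext_vf4 from a u8mf4 source)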
1166+ size_t vl = vsetvl_e8m1 (4 ); // Set vector length for 8-bit input elements
1167+ vuint8m1_t vsrc_u8 = vle8_v_u8m1 (src , vl ); // Load 4 unsigned 8-bit integers
1168+ vuint32m1_t vsrc_u32 = vwcvt_xu_u_v_u32m1 (vsrc_u8 , vl ); // Widen to 32-bit unsigned integers
1169+ vfloat32m1_t vsrc_f32 = vfcvt_f_xu_v_f32m1 (vsrc_u32 , vl ); // Convert to float32
1170+ vfloat32m1_t vnorm = vfmul_vf_f32m1 (vsrc_f32 , SW_INV_255 , vl ); // Multiply by 1/255.0 to normalize
1171+ vse32_v_f32m1 (dst , vnorm , vl ); // Store result
11381172#else
11391173 dst [0 ] = (float )src [0 ]* SW_INV_255 ;
11401174 dst [1 ] = (float )src [1 ]* SW_INV_255 ;
@@ -2672,8 +2706,8 @@ static inline void FUNC_NAME(void)
26722706 float ySubstep = 1.0f - sw_fract (v0 -> screen [1 ]); \
26732707 \
26742708 /* Calculation of vertex gradients in X and Y */ \
2675- float dUdx , dVdx ; \
2676- float dUdy , dVdy ; \
2709+ float dUdx = 0.0f , dVdx = 0.0f ; \
2710+ float dUdy = 0.0f , dVdy = 0.0f ; \
26772711 if (ENABLE_TEXTURE ) { \
26782712 dUdx = (v1 -> texcoord [0 ] - v0 -> texcoord [0 ])* wRcp ; \
26792713 dVdx = (v1 -> texcoord [1 ] - v0 -> texcoord [1 ])* wRcp ; \
0 commit comments