Skip to content

Commit f106301

Browse files
committed
ADDED: Sample code for RISC-V RVV vector instructions (WIP)
1 parent 7887033 commit f106301

File tree

1 file changed

+65
-31
lines changed

1 file changed

+65
-31
lines changed

src/external/rlsw.h

Lines changed: 65 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -616,69 +616,74 @@ SWAPI void swBindTexture(uint32_t id);
616616
#include <math.h> // Required for: floorf(), fabsf()
617617

618618
#if defined(__FMA__) && defined(__AVX2__)
619-
# define SW_HAS_FMA_AVX2
620-
# include <immintrin.h>
619+
#define SW_HAS_FMA_AVX2
620+
#include <immintrin.h>
621621
#endif
622622

623623
#if defined(__FMA__) && defined(__AVX__)
624-
# define SW_HAS_FMA_AVX
625-
# include <immintrin.h>
624+
#define SW_HAS_FMA_AVX
625+
#include <immintrin.h>
626626
#endif
627627

628628
#if defined(__AVX2__)
629-
# define SW_HAS_AVX2
630-
# include <immintrin.h>
629+
#define SW_HAS_AVX2
630+
#include <immintrin.h>
631631
#endif
632632

633633
#if defined(__AVX__)
634-
# define SW_HAS_AVX
635-
# include <immintrin.h>
634+
#define SW_HAS_AVX
635+
#include <immintrin.h>
636636
#endif
637637

638638
#if defined(__SSE4_2__)
639-
# define SW_HAS_SSE42
640-
# include <nmmintrin.h>
639+
#define SW_HAS_SSE42
640+
#include <nmmintrin.h>
641641
#endif
642642

643643
#if defined(__SSE4_1__)
644-
# define SW_HAS_SSE41
645-
# include <smmintrin.h>
644+
#define SW_HAS_SSE41
645+
#include <smmintrin.h>
646646
#endif
647647

648648
#if defined(__SSSE3__)
649-
# define SW_HAS_SSSE3
650-
# include <tmmintrin.h>
649+
#define SW_HAS_SSSE3
650+
#include <tmmintrin.h>
651651
#endif
652652

653653
#if defined(__SSE3__)
654-
# define SW_HAS_SSE3
655-
# include <pmmintrin.h>
654+
#define SW_HAS_SSE3
655+
#include <pmmintrin.h>
656656
#endif
657657

658658
#if defined(__SSE2__)
659-
# define SW_HAS_SSE2
660-
# include <emmintrin.h>
659+
#define SW_HAS_SSE2
660+
#include <emmintrin.h>
661661
#endif
662662

663663
#if defined(__SSE__)
664-
# define SW_HAS_SSE
665-
# include <xmmintrin.h>
664+
#define SW_HAS_SSE
665+
#include <xmmintrin.h>
666666
#endif
667667

668668
#if defined(__ARM_NEON) || defined(__aarch64__)
669-
# if defined(__ARM_FEATURE_FMA)
670-
# define SW_HAS_NEON_FMA
671-
# else
672-
# define SW_HAS_NEON
673-
# endif
674-
# include <arm_neon.h>
669+
#if defined(__ARM_FEATURE_FMA)
670+
#define SW_HAS_NEON_FMA
671+
#else
672+
#define SW_HAS_NEON
673+
#endif
674+
#include <arm_neon.h>
675+
#endif
676+
677+
#ifdef __riscv_vector
678+
#define SW_HAS_RVV
679+
#include <riscv_vector.h>
675680
#endif
676681

677682
//----------------------------------------------------------------------------------
678683
// Defines and Macros
679684
//----------------------------------------------------------------------------------
680685
#define SW_PI 3.14159265358979323846f
681-
#define SW_INV_255 0.00392156862745098f
686+
#define SW_INV_255 0.00392156862745098f // 1.0f/255.0f
682687
#define SW_DEG2RAD (SW_PI/180.0f)
683688
#define SW_RAD2DEG (180.0f/SW_PI)
684689

@@ -1102,6 +1107,27 @@ static inline void sw_float_to_unorm8_simd(uint8_t dst[4], const float src[4])
11021107
clamped = _mm_packs_epi32(clamped, clamped); // s32 -> s16 (saturated)
11031108
clamped = _mm_packus_epi16(clamped, clamped); // s16 -> u8 (saturated < 0 to 0)
11041109
*(uint32_t*)dst = _mm_cvtsi128_si32(clamped);
1110+
#elif defined(SW_HAS_RVV)
1111+
// TODO: Sample code generated by AI, needs testing and review
1112+
size_t vl = vsetvl_e32m1(4); // Load up to 4 floats into a vector register
1113+
vfloat32m1_t vsrc = vle32_v_f32m1(src, vl); // Load float32 values
1114+
1115+
// Clamp to [0.0f, 1.0f]
1116+
vfloat32m1_t vzero = vfmv_v_f_f32m1(0.0f, vl);
1117+
vfloat32m1_t vone = vfmv_v_f_f32m1(1.0f, vl);
1118+
vsrc = vfmin_vv_f32m1(vsrc, vone, vl);
1119+
vsrc = vfmax_vv_f32m1(vsrc, vzero, vl);
1120+
1121+
// Multiply by 255.0f and add 0.5f for rounding
1122+
vfloat32m1_t vscaled = vfmul_vf_f32m1(vsrc, 255.0f, vl);
1123+
vscaled = vfadd_vf_f32m1(vscaled, 0.5f, vl);
1124+
1125+
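// Convert to unsigned integer (NOTE: vfcvt_xu_f_v rounds using the dynamic rounding mode in frm, it does not truncate; vfcvt_rtz_xu_f_v is the truncating form, which is what the +0.5f bias above expects)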
1126+
vuint32m1_t vu32 = vfcvt_xu_f_v_u32m1(vscaled, vl);
1127+
1128+
// Narrow from u32 -> u8 (NOTE: vnclipu narrows a single step, 2*SEW -> SEW, so this requires two steps: u32 -> u16 -> u8)
1129+
vuint8m1_t vu8 = vnclipu_wx_u8m1(vu32, 0, vl); // Round toward zero
1130+
vse8_v_u8m1(dst, vu8, vl); // Store result
11051131
#else
11061132
for (int i = 0; i < 4; i++)
11071133
{
@@ -1123,18 +1149,26 @@ static inline void sw_float_from_unorm8_simd(float dst[4], const uint8_t src[4])
11231149
floats = vmulq_n_f32(floats, SW_INV_255);
11241150
vst1q_f32(dst, floats);
11251151
#elif defined(SW_HAS_SSE41)
1126-
__m128i bytes = _mm_cvtsi32_si128(*(const uint32_t*)src);
1152+
__m128i bytes = _mm_cvtsi32_si128(*(const uint32_t *)src);
11271153
__m128i ints = _mm_cvtepu8_epi32(bytes);
11281154
__m128 floats = _mm_cvtepi32_ps(ints);
11291155
floats = _mm_mul_ps(floats, _mm_set1_ps(SW_INV_255));
11301156
_mm_storeu_ps(dst, floats);
11311157
#elif defined(SW_HAS_SSE2)
1132-
__m128i bytes = _mm_cvtsi32_si128(*(const uint32_t*)src);
1158+
__m128i bytes = _mm_cvtsi32_si128(*(const uint32_t *)src);
11331159
bytes = _mm_unpacklo_epi8(bytes, _mm_setzero_si128());
11341160
__m128i ints = _mm_unpacklo_epi16(bytes, _mm_setzero_si128());
11351161
__m128 floats = _mm_cvtepi32_ps(ints);
11361162
floats = _mm_mul_ps(floats, _mm_set1_ps(SW_INV_255));
11371163
_mm_storeu_ps(dst, floats);
1164+
#elif defined(SW_HAS_RVV)
1165+
// TODO: Sample code generated by AI, needs testing and review
1166+
size_t vl = vsetvl_e8m1(4); // Set vector length for 8-bit input elements
1167+
vuint8m1_t vsrc_u8 = vle8_v_u8m1(src, vl); // Load 4 unsigned 8-bit integers
1168+
vuint32m1_t vsrc_u32 = vwcvt_xu_u_v_u32m1(vsrc_u8, vl); // Widen to 32-bit unsigned integers
1169+
vfloat32m1_t vsrc_f32 = vfcvt_f_xu_v_f32m1(vsrc_u32, vl); // Convert to float32
1170+
vfloat32m1_t vnorm = vfmul_vf_f32m1(vsrc_f32, SW_INV_255, vl); // Multiply by 1/255.0 to normalize
1171+
vse32_v_f32m1(dst, vnorm, vl); // Store result
11381172
#else
11391173
dst[0] = (float)src[0]*SW_INV_255;
11401174
dst[1] = (float)src[1]*SW_INV_255;
@@ -2672,8 +2706,8 @@ static inline void FUNC_NAME(void)
26722706
float ySubstep = 1.0f - sw_fract(v0->screen[1]); \
26732707
\
26742708
/* Calculation of vertex gradients in X and Y */ \
2675-
float dUdx, dVdx; \
2676-
float dUdy, dVdy; \
2709+
float dUdx = 0.0f, dVdx = 0.0f; \
2710+
float dUdy = 0.0f, dVdy = 0.0f; \
26772711
if (ENABLE_TEXTURE) { \
26782712
dUdx = (v1->texcoord[0] - v0->texcoord[0])*wRcp; \
26792713
dVdx = (v1->texcoord[1] - v0->texcoord[1])*wRcp; \

0 commit comments

Comments
 (0)