Skip to content

Commit 4121a65

Browse files
committed
meshletcodec: Simplify vertex decoding tail
switch/case doesn't really help here: the typical lowering uses branches, and we always need to write the first component anyway. As such, it's simpler - and a little faster - to use explicit ifs/writes. Also change the more general repack shuffle to shufflelo with an immediate for 16-bit tail processing. The compiler can do this transform too, but it's better to be explicit.
1 parent 476953b commit 4121a65

File tree

1 file changed

+9
-21
lines changed

1 file changed

+9
-21
lines changed

src/meshletcodec.cpp

Lines changed: 9 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -787,31 +787,19 @@ static const unsigned char* decodeVerticesSimd(unsigned int* vertices, const uns
787 | 787 |
788 | 788 |   unsigned int* tail = &vertices[vertex_count & ~3];
789 | 789 |
790 |     | - switch (vertex_count & 3)
791 |     | - {
792 | 790 |   #if defined(SIMD_SSE)
793 |     | - case 3:
794 |     | - tail[2] = _mm_extract_epi32(last, 2);
795 |     | - // fallthrough
796 |     | - case 2:
    | 791 | + tail[0] = _mm_cvtsi128_si32(last);
    | 792 | + if ((vertex_count & 3) > 1)
797 | 793 |   tail[1] = _mm_extract_epi32(last, 1);
798 |     | - // fallthrough
799 |     | - case 1:
800 |     | - tail[0] = _mm_extract_epi32(last, 0);
801 |     | - // fallthrough
    | 794 | + if ((vertex_count & 3) > 2)
    | 795 | + tail[2] = _mm_extract_epi32(last, 2);
802 | 796 |   #elif defined(SIMD_NEON)
803 |     | - case 3:
804 |     | - vst1q_lane_u32(&tail[2], last, 2);
805 |     | - // fallthrough
806 |     | - case 2:
    | 797 | + vst1q_lane_u32(&tail[0], last, 0);
    | 798 | + if ((vertex_count & 3) > 1)
807 | 799 |   vst1q_lane_u32(&tail[1], last, 1);
808 |     | - // fallthrough
809 |     | - case 1:
810 |     | - vst1q_lane_u32(&tail[0], last, 0);
811 |     | - // fallthrough
    | 800 | + if ((vertex_count & 3) > 2)
    | 801 | + vst1q_lane_u32(&tail[2], last, 2);
812 | 802 |   #endif
813 |     | - default:;
814 |     | - }
815 | 803 |   }
816 | 804 |
817 | 805 |   return data;
@@ -863,7 +851,7 @@ static const unsigned char* decodeVerticesSimd(unsigned short* vertices, const u
863 | 851 |   unsigned short* tail = &vertices[vertex_count & ~3];
864 | 852 |
865 | 853 |   #if defined(SIMD_SSE)
866 |     | - __m128i r = _mm_shuffle_epi8(last, repack);
    | 854 | + __m128i r = _mm_shufflelo_epi16(last, 8);
867 | 855 |   *reinterpret_cast<unaligned_int*>(tail) = _mm_cvtsi128_si32(r);
868 | 856 |   #elif defined(SIMD_NEON)
869 | 857 |   uint16x4_t r = vmovn_u32(last);

0 commit comments

Comments
 (0)