Skip to content

Commit bbb1314

Browse files
committed
meshletcodec: Simplify triangle decoding tail
When decoding triangles into the 24-bit representation, the tail needs to write 4 or 8 bytes. We were processing this as either an 8-byte or a 4-byte write; however, that results in a more complex branching structure than writing 4 or 4+4 bytes does, and in practice, doing two writes (the second one conditionally) seems to be a little bit faster.
1 parent 4121a65 commit bbb1314

File tree

1 file changed

+6
-8
lines changed

1 file changed

+6
-8
lines changed

src/meshletcodec.cpp

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -731,17 +731,15 @@ static const unsigned char* decodeTrianglesSimd(unsigned char* triangles, const
731731
#if defined(SIMD_SSE)
732732
__m128i r = _mm_srli_si128(state, 9);
733733

734-
if ((triangle_count & 3) == 1)
735-
*reinterpret_cast<unaligned_int*>(tail) = _mm_cvtsi128_si32(r);
736-
else
737-
_mm_storel_epi64(reinterpret_cast<__m128i*>(tail), r);
734+
*reinterpret_cast<unaligned_int*>(tail) = _mm_cvtsi128_si32(r);
735+
if ((triangle_count & 3) > 1)
736+
*reinterpret_cast<unaligned_int*>(tail + 4) = _mm_extract_epi32(r, 1);
738737
#elif defined(SIMD_NEON)
739738
uint8x16_t r = vextq_u8(state, vdupq_n_u8(0), 9);
740739

741-
if ((triangle_count & 3) == 1)
742-
vst1q_lane_u32(reinterpret_cast<unsigned int*>(tail), vreinterpretq_u32_u8(r), 0);
743-
else
744-
vst1_u8(tail, vget_low_u8(r));
740+
vst1q_lane_u32(reinterpret_cast<unsigned int*>(tail), vreinterpretq_u32_u8(r), 0);
741+
if ((triangle_count & 3) > 1)
742+
vst1q_lane_u32(reinterpret_cast<unsigned int*>(tail + 4), vreinterpretq_u32_u8(r), 1);
745743
#endif
746744
}
747745

0 commit comments

Comments
 (0)