Skip to content

Commit 1801e3e

Browse files
committed
meshletcodec: Truncate tail offset to 32 bits to improve codegen
clang uses known-bits analysis, which has an unfortunate consequence when an offset with its low bits masked off is used for addressing. Specifically, instead of generating something like `and rcx, -4`, it notices that the offset is multiplied by 4/8 in pointer addressing, so the top bits of the offset are not used - and it propagates this knowledge about the top bits into the `and` mask, which results in an extra 10-byte `movabs` instruction to materialize the mask before and'ing with it. For now, work around this problem by using unsigned 32-bit masks, which improves the generated code without regressing other platforms or compilers.
1 parent bbb1314 commit 1801e3e

File tree

1 file changed

+4
-4
lines changed

1 file changed

+4
-4
lines changed

src/meshletcodec.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -654,7 +654,7 @@ static const unsigned char* decodeTrianglesSimd(unsigned int* triangles, const u
654654

655655
state = decodeTriangleGroup(state, code, extra);
656656

657-
unsigned int* tail = &triangles[triangle_count & ~1];
657+
unsigned int* tail = &triangles[triangle_count & ~1u];
658658

659659
#if defined(SIMD_SSE)
660660
__m128i r = _mm_shuffle_epi8(state, repack);
@@ -726,7 +726,7 @@ static const unsigned char* decodeTrianglesSimd(unsigned char* triangles, const
726726

727727
state = decodeTriangleGroup(state, code, extra);
728728

729-
unsigned char* tail = &triangles[(triangle_count & ~3) * 3];
729+
unsigned char* tail = &triangles[(triangle_count & ~3u) * 3];
730730

731731
#if defined(SIMD_SSE)
732732
__m128i r = _mm_srli_si128(state, 9);
@@ -783,7 +783,7 @@ static const unsigned char* decodeVerticesSimd(unsigned int* vertices, const uns
783783

784784
last = decodeVertexGroup(last, code, data);
785785

786-
unsigned int* tail = &vertices[vertex_count & ~3];
786+
unsigned int* tail = &vertices[vertex_count & ~3u];
787787

788788
#if defined(SIMD_SSE)
789789
tail[0] = _mm_cvtsi128_si32(last);
@@ -846,7 +846,7 @@ static const unsigned char* decodeVerticesSimd(unsigned short* vertices, const u
846846

847847
last = decodeVertexGroup(last, code, data);
848848

849-
unsigned short* tail = &vertices[vertex_count & ~3];
849+
unsigned short* tail = &vertices[vertex_count & ~3u];
850850

851851
#if defined(SIMD_SSE)
852852
__m128i r = _mm_shufflelo_epi16(last, 8);

0 commit comments

Comments
 (0)