Skip to content

Commit 3341797

Browse files
committed
meshletcodec: Switch from indexing output arrays to pointer adjustment
clang uses known-bits analysis that has an unfortunate consequence when using offsets with removed low bits for addressing. Specifically, instead of generating something like `and rcx, -3`, it notices that the offset is multiplied by 4/8 in pointer addressing, and as such the top bits of the offset are not used — so it propagates these top bits into the AND mask, resulting in an extra 10-byte `movabs` instruction to materialize the mask before ANDing with it. Notably, any extra address math here is wasteful — the tail code should be placed after the loop code, which has already computed the output address on the previous iteration. Instead of relying on the compiler to correctly synthesize the induction variables here, we can do it ourselves, which lets the tail code write directly to the already-adjusted output pointer.
1 parent bbb1314 commit 3341797

File tree

1 file changed

+32
-32
lines changed

1 file changed

+32
-32
lines changed

src/meshletcodec.cpp

Lines changed: 32 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -637,11 +637,13 @@ static const unsigned char* decodeTrianglesSimd(unsigned int* triangles, const u
637637
// write 6 bytes of new triangle data into output, formatted as 8 bytes with 0 padding
638638
#if defined(SIMD_SSE)
639639
__m128i r = _mm_shuffle_epi8(state, repack);
640-
_mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[i * 2]), r);
640+
_mm_storel_epi64(reinterpret_cast<__m128i*>(triangles), r);
641641
#elif defined(SIMD_NEON)
642642
uint32x2_t r = vreinterpret_u32_u8(vqtbl1_u8(state, repack));
643-
vst1_u32(&triangles[i * 2], r);
643+
vst1_u32(triangles, r);
644644
#endif
645+
646+
triangles += 2;
645647
}
646648

647649
// process a 1 triangle tail; to maintain the memory safety guarantee we have to write a 32-bit element
@@ -654,14 +656,12 @@ static const unsigned char* decodeTrianglesSimd(unsigned int* triangles, const u
654656

655657
state = decodeTriangleGroup(state, code, extra);
656658

657-
unsigned int* tail = &triangles[triangle_count & ~1];
658-
659659
#if defined(SIMD_SSE)
660660
__m128i r = _mm_shuffle_epi8(state, repack);
661-
*tail = unsigned(_mm_cvtsi128_si32(r));
661+
*triangles = unsigned(_mm_cvtsi128_si32(r));
662662
#elif defined(SIMD_NEON)
663663
uint32x2_t r = vreinterpret_u32_u8(vqtbl1_u8(state, repack));
664-
vst1_lane_u32(tail, r, 0);
664+
vst1_lane_u32(triangles, r, 0);
665665
#endif
666666
}
667667

@@ -697,10 +697,10 @@ static const unsigned char* decodeTrianglesSimd(unsigned char* triangles, const
697697
// write first decoded triangle and first index of second decoded triangle
698698
#if defined(SIMD_SSE)
699699
__m128i r0 = _mm_srli_si128(state, 9);
700-
*reinterpret_cast<unaligned_int*>(&triangles[i * 12]) = _mm_cvtsi128_si32(r0);
700+
*reinterpret_cast<unaligned_int*>(triangles) = _mm_cvtsi128_si32(r0);
701701
#elif defined(SIMD_NEON)
702702
uint8x16_t r0 = vextq_u8(state, vdupq_n_u8(0), 9);
703-
vst1q_lane_u32(reinterpret_cast<unsigned int*>(&triangles[i * 12]), vreinterpretq_u32_u8(r0), 0);
703+
vst1q_lane_u32(reinterpret_cast<unsigned int*>(triangles), vreinterpretq_u32_u8(r0), 0);
704704
#endif
705705

706706
state = decodeTriangleGroup(state, code1, extra);
@@ -709,11 +709,13 @@ static const unsigned char* decodeTrianglesSimd(unsigned char* triangles, const
709709
// note that the second decoded triangle has shifted down to 6-8 bytes, hence shift by 7
710710
#if defined(SIMD_SSE)
711711
__m128i r1 = _mm_srli_si128(state, 7);
712-
_mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[i * 12 + 4]), r1);
712+
_mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[4]), r1);
713713
#elif defined(SIMD_NEON)
714714
uint8x16_t r1 = vextq_u8(state, vdupq_n_u8(0), 7);
715-
vst1_u8(&triangles[i * 12 + 4], vget_low_u8(r1));
715+
vst1_u8(&triangles[4], vget_low_u8(r1));
716716
#endif
717+
718+
triangles += 12;
717719
}
718720

719721
// process a 1-2 triangle tail; to maintain the memory safety guarantee we have to write 1-2 32-bit elements
@@ -726,20 +728,18 @@ static const unsigned char* decodeTrianglesSimd(unsigned char* triangles, const
726728

727729
state = decodeTriangleGroup(state, code, extra);
728730

729-
unsigned char* tail = &triangles[(triangle_count & ~3) * 3];
730-
731731
#if defined(SIMD_SSE)
732732
__m128i r = _mm_srli_si128(state, 9);
733733

734-
*reinterpret_cast<unaligned_int*>(tail) = _mm_cvtsi128_si32(r);
734+
*reinterpret_cast<unaligned_int*>(triangles) = _mm_cvtsi128_si32(r);
735735
if ((triangle_count & 3) > 1)
736-
*reinterpret_cast<unaligned_int*>(tail + 4) = _mm_extract_epi32(r, 1);
736+
*reinterpret_cast<unaligned_int*>(triangles + 4) = _mm_extract_epi32(r, 1);
737737
#elif defined(SIMD_NEON)
738738
uint8x16_t r = vextq_u8(state, vdupq_n_u8(0), 9);
739739

740-
vst1q_lane_u32(reinterpret_cast<unsigned int*>(tail), vreinterpretq_u32_u8(r), 0);
740+
vst1q_lane_u32(reinterpret_cast<unsigned int*>(triangles), vreinterpretq_u32_u8(r), 0);
741741
if ((triangle_count & 3) > 1)
742-
vst1q_lane_u32(reinterpret_cast<unsigned int*>(tail + 4), vreinterpretq_u32_u8(r), 1);
742+
vst1q_lane_u32(reinterpret_cast<unsigned int*>(triangles + 4), vreinterpretq_u32_u8(r), 1);
743743
#endif
744744
}
745745

@@ -767,10 +767,12 @@ static const unsigned char* decodeVerticesSimd(unsigned int* vertices, const uns
767767
last = decodeVertexGroup(last, code, data);
768768

769769
#if defined(SIMD_SSE)
770-
_mm_storeu_si128(reinterpret_cast<__m128i*>(&vertices[i * 4]), last);
770+
_mm_storeu_si128(reinterpret_cast<__m128i*>(vertices), last);
771771
#elif defined(SIMD_NEON)
772-
vst1q_u32(&vertices[i * 4], last);
772+
vst1q_u32(vertices, last);
773773
#endif
774+
775+
vertices += 4;
774776
}
775777

776778
// process a 1-3 vertex tail; to maintain the memory safety guarantee we have to write individual elements
@@ -783,20 +785,18 @@ static const unsigned char* decodeVerticesSimd(unsigned int* vertices, const uns
783785

784786
last = decodeVertexGroup(last, code, data);
785787

786-
unsigned int* tail = &vertices[vertex_count & ~3];
787-
788788
#if defined(SIMD_SSE)
789-
tail[0] = _mm_cvtsi128_si32(last);
789+
vertices[0] = _mm_cvtsi128_si32(last);
790790
if ((vertex_count & 3) > 1)
791-
tail[1] = _mm_extract_epi32(last, 1);
791+
vertices[1] = _mm_extract_epi32(last, 1);
792792
if ((vertex_count & 3) > 2)
793-
tail[2] = _mm_extract_epi32(last, 2);
793+
vertices[2] = _mm_extract_epi32(last, 2);
794794
#elif defined(SIMD_NEON)
795-
vst1q_lane_u32(&tail[0], last, 0);
795+
vst1q_lane_u32(&vertices[0], last, 0);
796796
if ((vertex_count & 3) > 1)
797-
vst1q_lane_u32(&tail[1], last, 1);
797+
vst1q_lane_u32(&vertices[1], last, 1);
798798
if ((vertex_count & 3) > 2)
799-
vst1q_lane_u32(&tail[2], last, 2);
799+
vst1q_lane_u32(&vertices[2], last, 2);
800800
#endif
801801
}
802802

@@ -829,11 +829,13 @@ static const unsigned char* decodeVerticesSimd(unsigned short* vertices, const u
829829

830830
#if defined(SIMD_SSE)
831831
__m128i r = _mm_shuffle_epi8(last, repack);
832-
_mm_storel_epi64(reinterpret_cast<__m128i*>(&vertices[i * 4]), r);
832+
_mm_storel_epi64(reinterpret_cast<__m128i*>(vertices), r);
833833
#elif defined(SIMD_NEON)
834834
uint16x4_t r = vmovn_u32(last);
835-
vst1_u16(&vertices[i * 4], r);
835+
vst1_u16(vertices, r);
836836
#endif
837+
838+
vertices += 4;
837839
}
838840

839841
// process a 1-2 vertex tail; to maintain the memory safety guarantee we have to write a 32-bit element
@@ -846,14 +848,12 @@ static const unsigned char* decodeVerticesSimd(unsigned short* vertices, const u
846848

847849
last = decodeVertexGroup(last, code, data);
848850

849-
unsigned short* tail = &vertices[vertex_count & ~3];
850-
851851
#if defined(SIMD_SSE)
852852
__m128i r = _mm_shufflelo_epi16(last, 8);
853-
*reinterpret_cast<unaligned_int*>(tail) = _mm_cvtsi128_si32(r);
853+
*reinterpret_cast<unaligned_int*>(vertices) = _mm_cvtsi128_si32(r);
854854
#elif defined(SIMD_NEON)
855855
uint16x4_t r = vmovn_u32(last);
856-
vst1_lane_u32(reinterpret_cast<unsigned int*>(tail), vreinterpret_u32_u16(r), 0);
856+
vst1_lane_u32(reinterpret_cast<unsigned int*>(vertices), vreinterpret_u32_u16(r), 0);
857857
#endif
858858
}
859859

0 commit comments

Comments
 (0)