Skip to content

Commit 3341797

Browse files
committed
meshletcodec: Switch from indexing output arrays to pointer adjustment
clang uses known-bits analysis that has an unfortunate consequence when using offsets with removed low bits for addressing. Specifically, instead of generating something like `and rcx, -3`, it notices that the offset is multiplied by 4/8 in pointer addressing, and as such the top bits of the offset are not used — so it propagates these top bits into the AND mask, resulting in an extra 10-byte `movabs` instruction to materialize the mask before ANDing with it. Notably, any extra address math here is wasteful — the tail code should be placed after the loop code, which has already computed the output address on the previous iteration. Instead of relying on the compiler to correctly synthesize the induction variables here, we can do it ourselves, which lets the tail code write directly to the already-adjusted output pointer.
1 parent bbb1314 commit 3341797

File tree

1 file changed

+32
-32
lines changed

1 file changed

+32
-32
lines changed

src/meshletcodec.cpp

Lines changed: 32 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -637,11 +637,13 @@ static const unsigned char* decodeTrianglesSimd(unsigned int* triangles, const u
637637
// write 6 bytes of new triangle data into output, formatted as 8 bytes with 0 padding
638638
#if defined(SIMD_SSE)
639639
__m128i r = _mm_shuffle_epi8(state, repack);
640-
_mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[i * 2]), r);
640+
_mm_storel_epi64(reinterpret_cast<__m128i*>(triangles), r);
641641
#elif defined(SIMD_NEON)
642642
uint32x2_t r = vreinterpret_u32_u8(vqtbl1_u8(state, repack));
643-
vst1_u32(&triangles[i * 2], r);
643+
vst1_u32(triangles, r);
644644
#endif
645+
646+
triangles += 2;
645647
}
646648

647649
// process a 1 triangle tail; to maintain the memory safety guarantee we have to write a 32-bit element
@@ -654,14 +656,12 @@ static const unsigned char* decodeTrianglesSimd(unsigned int* triangles, const u
654656

655657
state = decodeTriangleGroup(state, code, extra);
656658

657-
unsigned int* tail = &triangles[triangle_count & ~1];
658-
659659
#if defined(SIMD_SSE)
660660
__m128i r = _mm_shuffle_epi8(state, repack);
661-
*tail = unsigned(_mm_cvtsi128_si32(r));
661+
*triangles = unsigned(_mm_cvtsi128_si32(r));
662662
#elif defined(SIMD_NEON)
663663
uint32x2_t r = vreinterpret_u32_u8(vqtbl1_u8(state, repack));
664-
vst1_lane_u32(tail, r, 0);
664+
vst1_lane_u32(triangles, r, 0);
665665
#endif
666666
}
667667

@@ -697,10 +697,10 @@ static const unsigned char* decodeTrianglesSimd(unsigned char* triangles, const
697697
// write first decoded triangle and first index of second decoded triangle
698698
#if defined(SIMD_SSE)
699699
__m128i r0 = _mm_srli_si128(state, 9);
700-
*reinterpret_cast<unaligned_int*>(&triangles[i * 12]) = _mm_cvtsi128_si32(r0);
700+
*reinterpret_cast<unaligned_int*>(triangles) = _mm_cvtsi128_si32(r0);
701701
#elif defined(SIMD_NEON)
702702
uint8x16_t r0 = vextq_u8(state, vdupq_n_u8(0), 9);
703-
vst1q_lane_u32(reinterpret_cast<unsigned int*>(&triangles[i * 12]), vreinterpretq_u32_u8(r0), 0);
703+
vst1q_lane_u32(reinterpret_cast<unsigned int*>(triangles), vreinterpretq_u32_u8(r0), 0);
704704
#endif
705705

706706
state = decodeTriangleGroup(state, code1, extra);
@@ -709,11 +709,13 @@ static const unsigned char* decodeTrianglesSimd(unsigned char* triangles, const
709709
// note that the second decoded triangle has shifted down to 6-8 bytes, hence shift by 7
710710
#if defined(SIMD_SSE)
711711
__m128i r1 = _mm_srli_si128(state, 7);
712-
_mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[i * 12 + 4]), r1);
712+
_mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[4]), r1);
713713
#elif defined(SIMD_NEON)
714714
uint8x16_t r1 = vextq_u8(state, vdupq_n_u8(0), 7);
715-
vst1_u8(&triangles[i * 12 + 4], vget_low_u8(r1));
715+
vst1_u8(&triangles[4], vget_low_u8(r1));
716716
#endif
717+
718+
triangles += 12;
717719
}
718720

719721
// process a 1-2 triangle tail; to maintain the memory safety guarantee we have to write 1-2 32-bit elements
@@ -726,20 +728,18 @@ static const unsigned char* decodeTrianglesSimd(unsigned char* triangles, const
726728

727729
state = decodeTriangleGroup(state, code, extra);
728730

729-
unsigned char* tail = &triangles[(triangle_count & ~3) * 3];
730-
731731
#if defined(SIMD_SSE)
732732
__m128i r = _mm_srli_si128(state, 9);
733733

734-
*reinterpret_cast<unaligned_int*>(tail) = _mm_cvtsi128_si32(r);
734+
*reinterpret_cast<unaligned_int*>(triangles) = _mm_cvtsi128_si32(r);
735735
if ((triangle_count & 3) > 1)
736-
*reinterpret_cast<unaligned_int*>(tail + 4) = _mm_extract_epi32(r, 1);
736+
*reinterpret_cast<unaligned_int*>(triangles + 4) = _mm_extract_epi32(r, 1);
737737
#elif defined(SIMD_NEON)
738738
uint8x16_t r = vextq_u8(state, vdupq_n_u8(0), 9);
739739

740-
vst1q_lane_u32(reinterpret_cast<unsigned int*>(tail), vreinterpretq_u32_u8(r), 0);
740+
vst1q_lane_u32(reinterpret_cast<unsigned int*>(triangles), vreinterpretq_u32_u8(r), 0);
741741
if ((triangle_count & 3) > 1)
742-
vst1q_lane_u32(reinterpret_cast<unsigned int*>(tail + 4), vreinterpretq_u32_u8(r), 1);
742+
vst1q_lane_u32(reinterpret_cast<unsigned int*>(triangles + 4), vreinterpretq_u32_u8(r), 1);
743743
#endif
744744
}
745745

@@ -767,10 +767,12 @@ static const unsigned char* decodeVerticesSimd(unsigned int* vertices, const uns
767767
last = decodeVertexGroup(last, code, data);
768768

769769
#if defined(SIMD_SSE)
770-
_mm_storeu_si128(reinterpret_cast<__m128i*>(&vertices[i * 4]), last);
770+
_mm_storeu_si128(reinterpret_cast<__m128i*>(vertices), last);
771771
#elif defined(SIMD_NEON)
772-
vst1q_u32(&vertices[i * 4], last);
772+
vst1q_u32(vertices, last);
773773
#endif
774+
775+
vertices += 4;
774776
}
775777

776778
// process a 1-3 vertex tail; to maintain the memory safety guarantee we have to write individual elements
@@ -783,20 +785,18 @@ static const unsigned char* decodeVerticesSimd(unsigned int* vertices, const uns
783785

784786
last = decodeVertexGroup(last, code, data);
785787

786-
unsigned int* tail = &vertices[vertex_count & ~3];
787-
788788
#if defined(SIMD_SSE)
789-
tail[0] = _mm_cvtsi128_si32(last);
789+
vertices[0] = _mm_cvtsi128_si32(last);
790790
if ((vertex_count & 3) > 1)
791-
tail[1] = _mm_extract_epi32(last, 1);
791+
vertices[1] = _mm_extract_epi32(last, 1);
792792
if ((vertex_count & 3) > 2)
793-
tail[2] = _mm_extract_epi32(last, 2);
793+
vertices[2] = _mm_extract_epi32(last, 2);
794794
#elif defined(SIMD_NEON)
795-
vst1q_lane_u32(&tail[0], last, 0);
795+
vst1q_lane_u32(&vertices[0], last, 0);
796796
if ((vertex_count & 3) > 1)
797-
vst1q_lane_u32(&tail[1], last, 1);
797+
vst1q_lane_u32(&vertices[1], last, 1);
798798
if ((vertex_count & 3) > 2)
799-
vst1q_lane_u32(&tail[2], last, 2);
799+
vst1q_lane_u32(&vertices[2], last, 2);
800800
#endif
801801
}
802802

@@ -829,11 +829,13 @@ static const unsigned char* decodeVerticesSimd(unsigned short* vertices, const u
829829

830830
#if defined(SIMD_SSE)
831831
__m128i r = _mm_shuffle_epi8(last, repack);
832-
_mm_storel_epi64(reinterpret_cast<__m128i*>(&vertices[i * 4]), r);
832+
_mm_storel_epi64(reinterpret_cast<__m128i*>(vertices), r);
833833
#elif defined(SIMD_NEON)
834834
uint16x4_t r = vmovn_u32(last);
835-
vst1_u16(&vertices[i * 4], r);
835+
vst1_u16(vertices, r);
836836
#endif
837+
838+
vertices += 4;
837839
}
838840

839841
// process a 1-2 vertex tail; to maintain the memory safety guarantee we have to write a 32-bit element
@@ -846,14 +848,12 @@ static const unsigned char* decodeVerticesSimd(unsigned short* vertices, const u
846848

847849
last = decodeVertexGroup(last, code, data);
848850

849-
unsigned short* tail = &vertices[vertex_count & ~3];
850-
851851
#if defined(SIMD_SSE)
852852
__m128i r = _mm_shufflelo_epi16(last, 8);
853-
*reinterpret_cast<unaligned_int*>(tail) = _mm_cvtsi128_si32(r);
853+
*reinterpret_cast<unaligned_int*>(vertices) = _mm_cvtsi128_si32(r);
854854
#elif defined(SIMD_NEON)
855855
uint16x4_t r = vmovn_u32(last);
856-
vst1_lane_u32(reinterpret_cast<unsigned int*>(tail), vreinterpret_u32_u16(r), 0);
856+
vst1_lane_u32(reinterpret_cast<unsigned int*>(vertices), vreinterpret_u32_u16(r), 0);
857857
#endif
858858
}
859859

0 commit comments

Comments
 (0)