Skip to content

Commit ff565a6

Browse files
authored
Merge pull request #1009 from zeux/mlc-neon
meshletcodec: Implement NEON decoding support
2 parents c0a5a6c + d814174 commit ff565a6

File tree

2 files changed

+156
-19
lines changed

2 files changed

+156
-19
lines changed

src/meshletcodec.cpp

Lines changed: 155 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@
2525
#define SIMD_TARGET __attribute__((target("sse4.1")))
2626
#endif
2727

28+
// When targeting AArch64, enable NEON SIMD unconditionally; we do not support SIMD decoding for 32-bit ARM
29+
#if defined(__aarch64__) || (defined(_MSC_VER) && defined(_M_ARM64) && _MSC_VER >= 1922)
30+
#define SIMD_NEON
31+
#endif
32+
2833
#ifndef SIMD_TARGET
2934
#define SIMD_TARGET
3035
#endif
@@ -35,6 +40,10 @@
3540
#include <smmintrin.h>
3641
#endif
3742

43+
#ifdef SIMD_NEON
44+
#include <arm_neon.h>
45+
#endif
46+
3847
#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
3948
#ifdef _MSC_VER
4049
#include <intrin.h> // __cpuid
@@ -215,7 +224,7 @@ static size_t encodeVertices(unsigned char* ctrl, unsigned char* data, const uns
215224
return data - start;
216225
}
217226

218-
#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE))
227+
#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON))
219228
inline void writeTriangle(unsigned int* triangles, size_t i, unsigned int fifo)
220229
{
221230
// output triangle is stored without extra edge vertex (0xcbac => 0xcba)
@@ -342,7 +351,7 @@ static int decodeMeshlet(void* vertices, void* triangles, const unsigned char* c
342351
}
343352
#endif
344353

345-
#if defined(SIMD_SSE)
354+
#if defined(SIMD_SSE) || defined(SIMD_NEON)
346355
// SIMD state is stored in a single 16b register as follows:
347356
// 0..5: 6 next extra bytes
348357
// 6..14: 9 bytes = 3 triangles worth of index data
@@ -523,13 +532,61 @@ inline __m128i decodeVertexGroup(__m128i last, unsigned char code, const unsigne
523532

524533
return x;
525534
}
535+
#endif
536+
537+
#if defined(SIMD_NEON)
538+
SIMD_TARGET
539+
inline uint8x16_t decodeTriangleGroup(uint8x16_t state, unsigned char code, const unsigned char*& extra)
540+
{
541+
uint8x16_t shuf = vld1q_u8(kDecodeTableMasks[code]);
542+
uint8x16_t next = vextq_u8(vdupq_n_u8(0), shuf, 6);
543+
544+
// patch first 6 bytes with current extra and roll state forward
545+
uint8x8_t extl = vld1_u8(extra);
546+
uint8x16_t ext = vcombine_u8(extl, vdup_n_u8(0));
547+
state = vbslq_u8(vcombine_u8(vcreate_u8(0xffffffffffffull), vdup_n_u8(0)), ext, state);
548+
state = vaddq_u8(vqtbl1q_u8(state, shuf), next);
549+
550+
extra += kDecodeTableExtra[code];
551+
552+
return state;
553+
}
554+
555+
SIMD_TARGET
556+
inline uint32x4_t decodeVertexGroup(uint32x4_t last, unsigned char code, const unsigned char*& data)
557+
{
558+
uint8x16_t word = vld1q_u8(data);
559+
uint8x16_t shuf = vld1q_u8(kDecodeTableVerts[code]);
560+
561+
uint32x4_t v = vreinterpretq_u32_u8(vqtbl1q_u8(word, shuf));
562+
563+
// unzigzag+1
564+
uint32x4_t xl = vsubq_u32(vdupq_n_u32(0), vandq_u32(v, vdupq_n_u32(1)));
565+
uint32x4_t xr = vshrq_n_u32(v, 1);
566+
uint32x4_t x = vaddq_u32(veorq_u32(xl, xr), vdupq_n_u32(1));
567+
568+
// prefix sum
569+
x = vaddq_u32(x, vextq_u32(vdupq_n_u32(0), x, 2));
570+
x = vaddq_u32(x, vextq_u32(vdupq_n_u32(0), x, 3));
571+
x = vaddq_u32(x, vdupq_n_u32(vgetq_lane_u32(last, 3)));
572+
573+
data += kDecodeTableLength[code];
574+
575+
return x;
576+
}
577+
#endif
526578

579+
#if defined(SIMD_SSE) || defined(SIMD_NEON)
527580
SIMD_TARGET
528581
static const unsigned char* decodeTrianglesRawSimd(unsigned int* triangles, const unsigned char* codes, const unsigned char* extra, const unsigned char* bound, size_t triangle_count)
529582
{
583+
#if defined(SIMD_SSE)
530584
__m128i repack = _mm_setr_epi8(9, 10, 11, -1, 12, 13, 14, -1, 0, 0, 0, 0, 0, 0, 0, 0);
531-
532585
__m128i state = _mm_setzero_si128();
586+
#elif defined(SIMD_NEON)
587+
uint8x8_t repack = vcreate_u8(0xff0e0d0cff0b0a09ull);
588+
uint8x16_t state = vdupq_n_u8(0);
589+
#endif
533590

534591
for (size_t i = 0; i < triangle_count; i += 2)
535592
{
@@ -542,8 +599,13 @@ static const unsigned char* decodeTrianglesRawSimd(unsigned int* triangles, cons
542599

543600
// copy 6 bytes of new triangle data into output, formatted as 8 bytes with 0 padding
544601
// safe to write 2 triangles as caller provides padded output buffer
602+
#if defined(SIMD_SSE)
545603
__m128i r = _mm_shuffle_epi8(state, repack);
546604
_mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[i]), r);
605+
#elif defined(SIMD_NEON)
606+
uint32x2_t r = vreinterpret_u32_u8(vqtbl1_u8(state, repack));
607+
vst1_u32(&triangles[i], r);
608+
#endif
547609
}
548610

549611
return extra;
@@ -552,9 +614,13 @@ static const unsigned char* decodeTrianglesRawSimd(unsigned int* triangles, cons
552614
SIMD_TARGET
553615
static const unsigned char* decodeTrianglesSimd(unsigned int* triangles, const unsigned char* codes, const unsigned char* extra, const unsigned char* bound, size_t triangle_count)
554616
{
617+
#if defined(SIMD_SSE)
555618
__m128i repack = _mm_setr_epi8(9, 10, 11, -1, 12, 13, 14, -1, 0, 0, 0, 0, 0, 0, 0, 0);
556-
557619
__m128i state = _mm_setzero_si128();
620+
#elif defined(SIMD_NEON)
621+
uint8x8_t repack = vcreate_u8(0xff0e0d0cff0b0a09ull);
622+
uint8x16_t state = vdupq_n_u8(0);
623+
#endif
558624

559625
size_t groups = triangle_count / 2;
560626

@@ -569,8 +635,13 @@ static const unsigned char* decodeTrianglesSimd(unsigned int* triangles, const u
569635
state = decodeTriangleGroup(state, code, extra);
570636

571637
// write 6 bytes of new triangle data into output, formatted as 8 bytes with 0 padding
638+
#if defined(SIMD_SSE)
572639
__m128i r = _mm_shuffle_epi8(state, repack);
573640
_mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[i * 2]), r);
641+
#elif defined(SIMD_NEON)
642+
uint32x2_t r = vreinterpret_u32_u8(vqtbl1_u8(state, repack));
643+
vst1_u32(&triangles[i * 2], r);
644+
#endif
574645
}
575646

576647
// process a 1 triangle tail; to maintain the memory safety guarantee we have to write a 32-bit element
@@ -585,23 +656,34 @@ static const unsigned char* decodeTrianglesSimd(unsigned int* triangles, const u
585656

586657
unsigned int* tail = &triangles[triangle_count & ~1];
587658

659+
#if defined(SIMD_SSE)
588660
__m128i r = _mm_shuffle_epi8(state, repack);
589661
*tail = unsigned(_mm_cvtsi128_si32(r));
662+
#elif defined(SIMD_NEON)
663+
uint32x2_t r = vreinterpret_u32_u8(vqtbl1_u8(state, repack));
664+
vst1_lane_u32(tail, r, 0);
665+
#endif
590666
}
591667

592668
return extra;
593669
}
594670

595-
SIMD_TARGET
596-
static const unsigned char* decodeTrianglesSimd(unsigned char* triangles, const unsigned char* codes, const unsigned char* extra, const unsigned char* bound, size_t triangle_count)
597-
{
671+
#if defined(SIMD_SSE)
598672
#ifdef __GNUC__
599-
typedef int __attribute__((aligned(1))) unaligned_int;
673+
typedef int __attribute__((aligned(1))) unaligned_int;
600674
#else
601-
typedef int unaligned_int;
675+
typedef int unaligned_int;
676+
#endif
602677
#endif
603678

679+
SIMD_TARGET
680+
static const unsigned char* decodeTrianglesSimd(unsigned char* triangles, const unsigned char* codes, const unsigned char* extra, const unsigned char* bound, size_t triangle_count)
681+
{
682+
#if defined(SIMD_SSE)
604683
__m128i state = _mm_setzero_si128();
684+
#elif defined(SIMD_NEON)
685+
uint8x16_t state = vdupq_n_u8(0);
686+
#endif
605687

606688
// because the output buffer is guaranteed to have 32-bit aligned size available, we can optimize writes and tail processing
607689
// instead of processing triangles 2 at a time, we process 2 *pairs* at a time (12-byte write) followed by a tail pair, if present
@@ -621,15 +703,25 @@ static const unsigned char* decodeTrianglesSimd(unsigned char* triangles, const
621703
state = decodeTriangleGroup(state, code0, extra);
622704

623705
// write first decoded triangle and first index of second decoded triangle
706+
#if defined(SIMD_SSE)
624707
__m128i r0 = _mm_srli_si128(state, 9);
625708
*reinterpret_cast<unaligned_int*>(&triangles[i * 12]) = _mm_cvtsi128_si32(r0);
709+
#elif defined(SIMD_NEON)
710+
uint8x16_t r0 = vextq_u8(state, vdupq_n_u8(0), 9);
711+
vst1q_lane_u32(reinterpret_cast<unsigned int*>(&triangles[i * 12]), vreinterpretq_u32_u8(r0), 0);
712+
#endif
626713

627714
state = decodeTriangleGroup(state, code1, extra);
628715

629716
// write last two indices of second decoded triangle that we didn't write above plus two new ones
630717
// note that the second decoded triangle has shifted down to 6-8 bytes, hence shift by 7
718+
#if defined(SIMD_SSE)
631719
__m128i r1 = _mm_srli_si128(state, 7);
632720
_mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[i * 12 + 4]), r1);
721+
#elif defined(SIMD_NEON)
722+
uint8x16_t r1 = vextq_u8(state, vdupq_n_u8(0), 7);
723+
vst1_u8(&triangles[i * 12 + 4], vget_low_u8(r1));
724+
#endif
633725
}
634726

635727
// process a 1-2 triangle tail; to maintain the memory safety guarantee we have to write 1-2 32-bit elements
@@ -644,12 +736,21 @@ static const unsigned char* decodeTrianglesSimd(unsigned char* triangles, const
644736

645737
unsigned char* tail = &triangles[(triangle_count & ~3) * 3];
646738

739+
#if defined(SIMD_SSE)
647740
__m128i r = _mm_srli_si128(state, 9);
648741

649742
if ((triangle_count & 3) == 1)
650743
*reinterpret_cast<unaligned_int*>(tail) = _mm_cvtsi128_si32(r);
651744
else
652745
_mm_storel_epi64(reinterpret_cast<__m128i*>(tail), r);
746+
#elif defined(SIMD_NEON)
747+
uint8x16_t r = vextq_u8(state, vdupq_n_u8(0), 9);
748+
749+
if ((triangle_count & 3) == 1)
750+
vst1q_lane_u32(reinterpret_cast<unsigned int*>(tail), vreinterpretq_u32_u8(r), 0);
751+
else
752+
vst1_u8(tail, vget_low_u8(r));
753+
#endif
653754
}
654755

655756
return extra;
@@ -658,7 +759,11 @@ static const unsigned char* decodeTrianglesSimd(unsigned char* triangles, const
658759
SIMD_TARGET
659760
static const unsigned char* decodeVerticesRawSimd(unsigned int* vertices, const unsigned char* ctrl, const unsigned char* data, const unsigned char* bound, size_t vertex_count)
660761
{
762+
#if defined(SIMD_SSE)
661763
__m128i last = _mm_set1_epi32(-1);
764+
#elif defined(SIMD_NEON)
765+
uint32x4_t last = vdupq_n_u32(~0u);
766+
#endif
662767

663768
for (size_t i = 0; i < vertex_count; i += 4)
664769
{
@@ -669,7 +774,11 @@ static const unsigned char* decodeVerticesRawSimd(unsigned int* vertices, const
669774
last = decodeVertexGroup(last, code, data);
670775

671776
// safe to write 4 vertices as caller provides padded output buffer
777+
#if defined(SIMD_SSE)
672778
_mm_storeu_si128(reinterpret_cast<__m128i*>(&vertices[i]), last);
779+
#elif defined(SIMD_NEON)
780+
vst1q_u32(&vertices[i], last);
781+
#endif
673782
}
674783

675784
return data;
@@ -678,7 +787,11 @@ static const unsigned char* decodeVerticesRawSimd(unsigned int* vertices, const
678787
SIMD_TARGET
679788
static const unsigned char* decodeVerticesSimd(unsigned int* vertices, const unsigned char* ctrl, const unsigned char* data, const unsigned char* bound, size_t vertex_count)
680789
{
790+
#if defined(SIMD_SSE)
681791
__m128i last = _mm_set1_epi32(-1);
792+
#elif defined(SIMD_NEON)
793+
uint32x4_t last = vdupq_n_u32(~0u);
794+
#endif
682795

683796
size_t groups = vertex_count / 4;
684797

@@ -691,7 +804,11 @@ static const unsigned char* decodeVerticesSimd(unsigned int* vertices, const uns
691804

692805
last = decodeVertexGroup(last, code, data);
693806

807+
#if defined(SIMD_SSE)
694808
_mm_storeu_si128(reinterpret_cast<__m128i*>(&vertices[i * 4]), last);
809+
#elif defined(SIMD_NEON)
810+
vst1q_u32(&vertices[i * 4], last);
811+
#endif
695812
}
696813

697814
// process a 1-3 vertex tail; to maintain the memory safety guarantee we have to write individual elements
@@ -708,6 +825,7 @@ static const unsigned char* decodeVerticesSimd(unsigned int* vertices, const uns
708825

709826
switch (vertex_count & 3)
710827
{
828+
#if defined(SIMD_SSE)
711829
case 3:
712830
tail[2] = _mm_extract_epi32(last, 2);
713831
// fallthrough
@@ -717,6 +835,17 @@ static const unsigned char* decodeVerticesSimd(unsigned int* vertices, const uns
717835
case 1:
718836
tail[0] = _mm_extract_epi32(last, 0);
719837
// fallthrough
838+
#elif defined(SIMD_NEON)
839+
case 3:
840+
vst1q_lane_u32(&tail[2], last, 2);
841+
// fallthrough
842+
case 2:
843+
vst1q_lane_u32(&tail[1], last, 1);
844+
// fallthrough
845+
case 1:
846+
vst1q_lane_u32(&tail[0], last, 0);
847+
// fallthrough
848+
#endif
720849
default:;
721850
}
722851
}
@@ -727,15 +856,12 @@ static const unsigned char* decodeVerticesSimd(unsigned int* vertices, const uns
727856
SIMD_TARGET
728857
static const unsigned char* decodeVerticesSimd(unsigned short* vertices, const unsigned char* ctrl, const unsigned char* data, const unsigned char* bound, size_t vertex_count)
729858
{
730-
#ifdef __GNUC__
731-
typedef int __attribute__((aligned(1))) unaligned_int;
732-
#else
733-
typedef int unaligned_int;
734-
#endif
735-
859+
#if defined(SIMD_SSE)
736860
__m128i repack = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 0, 0, 0, 0, 0, 0, 0);
737-
738861
__m128i last = _mm_set1_epi32(-1);
862+
#elif defined(SIMD_NEON)
863+
uint32x4_t last = vdupq_n_u32(~0u);
864+
#endif
739865

740866
// because the output buffer is guaranteed to have 32-bit aligned size available, we can simplify tail processing
741867
// if the number of vertices mod 4 is 3, we'd normally need to write 8+6 bytes, but we can instead overwrite up to 2 bytes in the main loop
@@ -751,8 +877,13 @@ static const unsigned char* decodeVerticesSimd(unsigned short* vertices, const u
751877

752878
last = decodeVertexGroup(last, code, data);
753879

880+
#if defined(SIMD_SSE)
754881
__m128i r = _mm_shuffle_epi8(last, repack);
755882
_mm_storel_epi64(reinterpret_cast<__m128i*>(&vertices[i * 4]), r);
883+
#elif defined(SIMD_NEON)
884+
uint16x4_t r = vmovn_u32(last);
885+
vst1_u16(&vertices[i * 4], r);
886+
#endif
756887
}
757888

758889
// process a 1-2 vertex tail; to maintain the memory safety guarantee we have to write a 32-bit element
@@ -767,8 +898,13 @@ static const unsigned char* decodeVerticesSimd(unsigned short* vertices, const u
767898

768899
unsigned short* tail = &vertices[vertex_count & ~3];
769900

901+
#if defined(SIMD_SSE)
770902
__m128i r = _mm_shuffle_epi8(last, repack);
771903
*reinterpret_cast<unaligned_int*>(tail) = _mm_cvtsi128_si32(r);
904+
#elif defined(SIMD_NEON)
905+
uint16x4_t r = vmovn_u32(last);
906+
vst1_lane_u32(reinterpret_cast<unsigned int*>(tail), vreinterpret_u32_u16(r), 0);
907+
#endif
772908
}
773909

774910
return data;
@@ -938,7 +1074,7 @@ int meshopt_decodeMeshlet(void* vertices, size_t vertex_count, size_t vertex_siz
9381074

9391075
#if defined(SIMD_FALLBACK)
9401076
return (gDecodeTablesInitialized ? decodeMeshletSimd : decodeMeshlet)(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size);
941-
#elif defined(SIMD_SSE)
1077+
#elif defined(SIMD_SSE) || defined(SIMD_NEON)
9421078
return decodeMeshletSimd(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size);
9431079
#else
9441080
return decodeMeshlet(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size);
@@ -970,13 +1106,14 @@ int meshopt_decodeMeshletRaw(unsigned int* vertices, size_t vertex_count, unsign
9701106

9711107
#if defined(SIMD_FALLBACK)
9721108
return (gDecodeTablesInitialized ? decodeMeshletRawSimd : decodeMeshlet)(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4, 4);
973-
#elif defined(SIMD_SSE)
1109+
#elif defined(SIMD_SSE) || defined(SIMD_NEON)
9741110
return decodeMeshletRawSimd(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4, 4);
9751111
#else
9761112
return decodeMeshlet(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4, 4);
9771113
#endif
9781114
}
9791115

9801116
#undef SIMD_SSE
1117+
#undef SIMD_NEON
9811118
#undef SIMD_FALLBACK
9821119
#undef SIMD_TARGET

tools/codecbench.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ void benchMeshlets(const std::vector<float>& positions, const std::vector<unsign
225225
int(max_vertices), int(max_triangles), int(meshlets.size()), int(packed.size()), double(packed.size() * 8) / double(indices.size() / 3));
226226

227227
unsigned int rv[256];
228-
unsigned int rt[256];
228+
unsigned char rt[256 * 3];
229229

230230
for (int attempt = 0; attempt < 50; ++attempt)
231231
{

0 commit comments

Comments
 (0)