2525#define SIMD_TARGET __attribute__ ((target(" sse4.1" )))
2626#endif
2727
28+ // When targeting AArch64, enable NEON SIMD unconditionally; we do not support SIMD decoding for 32-bit ARM
29+ #if defined(__aarch64__) || (defined(_MSC_VER) && defined(_M_ARM64) && _MSC_VER >= 1922)
30+ #define SIMD_NEON
31+ #endif
32+
2833#ifndef SIMD_TARGET
2934#define SIMD_TARGET
3035#endif
3540#include < smmintrin.h>
3641#endif
3742
43+ #ifdef SIMD_NEON
44+ #include < arm_neon.h>
45+ #endif
46+
3847#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
3948#ifdef _MSC_VER
4049#include < intrin.h> // __cpuid
@@ -215,7 +224,7 @@ static size_t encodeVertices(unsigned char* ctrl, unsigned char* data, const uns
215224 return data - start;
216225}
217226
218- #if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE))
227+ #if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) )
219228inline void writeTriangle (unsigned int * triangles, size_t i, unsigned int fifo)
220229{
221230 // output triangle is stored without extra edge vertex (0xcbac => 0xcba)
@@ -342,7 +351,7 @@ static int decodeMeshlet(void* vertices, void* triangles, const unsigned char* c
342351}
343352#endif
344353
345- #if defined(SIMD_SSE)
354+ #if defined(SIMD_SSE) || defined(SIMD_NEON)
346355// SIMD state is stored in a single 16b register as follows:
347356// 0..5: 6 next extra bytes
348357// 6..14: 9 bytes = 3 triangles worth of index data
@@ -523,13 +532,61 @@ inline __m128i decodeVertexGroup(__m128i last, unsigned char code, const unsigne
523532
524533 return x;
525534}
535+ #endif
536+
537+ #if defined(SIMD_NEON)
538+ SIMD_TARGET
539+ inline uint8x16_t decodeTriangleGroup (uint8x16_t state, unsigned char code, const unsigned char *& extra)
540+ {
541+ uint8x16_t shuf = vld1q_u8 (kDecodeTableMasks [code]);
542+ uint8x16_t next = vextq_u8 (vdupq_n_u8 (0 ), shuf, 6 );
543+
544+ // patch first 6 bytes with current extra and roll state forward
545+ uint8x8_t extl = vld1_u8 (extra);
546+ uint8x16_t ext = vcombine_u8 (extl, vdup_n_u8 (0 ));
547+ state = vbslq_u8 (vcombine_u8 (vcreate_u8 (0xffffffffffffull ), vdup_n_u8 (0 )), ext, state);
548+ state = vaddq_u8 (vqtbl1q_u8 (state, shuf), next);
549+
550+ extra += kDecodeTableExtra [code];
551+
552+ return state;
553+ }
554+
555+ SIMD_TARGET
556+ inline uint32x4_t decodeVertexGroup (uint32x4_t last, unsigned char code, const unsigned char *& data)
557+ {
558+ uint8x16_t word = vld1q_u8 (data);
559+ uint8x16_t shuf = vld1q_u8 (kDecodeTableVerts [code]);
560+
561+ uint32x4_t v = vreinterpretq_u32_u8 (vqtbl1q_u8 (word, shuf));
562+
563+ // unzigzag+1
564+ uint32x4_t xl = vsubq_u32 (vdupq_n_u32 (0 ), vandq_u32 (v, vdupq_n_u32 (1 )));
565+ uint32x4_t xr = vshrq_n_u32 (v, 1 );
566+ uint32x4_t x = vaddq_u32 (veorq_u32 (xl, xr), vdupq_n_u32 (1 ));
567+
568+ // prefix sum
569+ x = vaddq_u32 (x, vextq_u32 (vdupq_n_u32 (0 ), x, 2 ));
570+ x = vaddq_u32 (x, vextq_u32 (vdupq_n_u32 (0 ), x, 3 ));
571+ x = vaddq_u32 (x, vdupq_n_u32 (vgetq_lane_u32 (last, 3 )));
572+
573+ data += kDecodeTableLength [code];
574+
575+ return x;
576+ }
577+ #endif
526578
579+ #if defined(SIMD_SSE) || defined(SIMD_NEON)
527580SIMD_TARGET
528581static const unsigned char * decodeTrianglesRawSimd (unsigned int * triangles, const unsigned char * codes, const unsigned char * extra, const unsigned char * bound, size_t triangle_count)
529582{
583+ #if defined(SIMD_SSE)
530584 __m128i repack = _mm_setr_epi8 (9 , 10 , 11 , -1 , 12 , 13 , 14 , -1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
531-
532585 __m128i state = _mm_setzero_si128 ();
586+ #elif defined(SIMD_NEON)
587+ uint8x8_t repack = vcreate_u8 (0xff0e0d0cff0b0a09ull );
588+ uint8x16_t state = vdupq_n_u8 (0 );
589+ #endif
533590
534591 for (size_t i = 0 ; i < triangle_count; i += 2 )
535592 {
@@ -542,8 +599,13 @@ static const unsigned char* decodeTrianglesRawSimd(unsigned int* triangles, cons
542599
543600 // copy 6 bytes of new triangle data into output, formatted as 8 bytes with 0 padding
544601 // safe to write 2 triangles as caller provides padded output buffer
602+ #if defined(SIMD_SSE)
545603 __m128i r = _mm_shuffle_epi8 (state, repack);
546604 _mm_storel_epi64 (reinterpret_cast <__m128i*>(&triangles[i]), r);
605+ #elif defined(SIMD_NEON)
606+ uint32x2_t r = vreinterpret_u32_u8 (vqtbl1_u8 (state, repack));
607+ vst1_u32 (&triangles[i], r);
608+ #endif
547609 }
548610
549611 return extra;
@@ -552,9 +614,13 @@ static const unsigned char* decodeTrianglesRawSimd(unsigned int* triangles, cons
552614SIMD_TARGET
553615static const unsigned char * decodeTrianglesSimd (unsigned int * triangles, const unsigned char * codes, const unsigned char * extra, const unsigned char * bound, size_t triangle_count)
554616{
617+ #if defined(SIMD_SSE)
555618 __m128i repack = _mm_setr_epi8 (9 , 10 , 11 , -1 , 12 , 13 , 14 , -1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
556-
557619 __m128i state = _mm_setzero_si128 ();
620+ #elif defined(SIMD_NEON)
621+ uint8x8_t repack = vcreate_u8 (0xff0e0d0cff0b0a09ull );
622+ uint8x16_t state = vdupq_n_u8 (0 );
623+ #endif
558624
559625 size_t groups = triangle_count / 2 ;
560626
@@ -569,8 +635,13 @@ static const unsigned char* decodeTrianglesSimd(unsigned int* triangles, const u
569635 state = decodeTriangleGroup (state, code, extra);
570636
571637 // write 6 bytes of new triangle data into output, formatted as 8 bytes with 0 padding
638+ #if defined(SIMD_SSE)
572639 __m128i r = _mm_shuffle_epi8 (state, repack);
573640 _mm_storel_epi64 (reinterpret_cast <__m128i*>(&triangles[i * 2 ]), r);
641+ #elif defined(SIMD_NEON)
642+ uint32x2_t r = vreinterpret_u32_u8 (vqtbl1_u8 (state, repack));
643+ vst1_u32 (&triangles[i * 2 ], r);
644+ #endif
574645 }
575646
576647 // process a 1 triangle tail; to maintain the memory safety guarantee we have to write a 32-bit element
@@ -585,23 +656,34 @@ static const unsigned char* decodeTrianglesSimd(unsigned int* triangles, const u
585656
586657 unsigned int * tail = &triangles[triangle_count & ~1 ];
587658
659+ #if defined(SIMD_SSE)
588660 __m128i r = _mm_shuffle_epi8 (state, repack);
589661 *tail = unsigned (_mm_cvtsi128_si32 (r));
662+ #elif defined(SIMD_NEON)
663+ uint32x2_t r = vreinterpret_u32_u8 (vqtbl1_u8 (state, repack));
664+ vst1_lane_u32 (tail, r, 0 );
665+ #endif
590666 }
591667
592668 return extra;
593669}
594670
595- SIMD_TARGET
596- static const unsigned char * decodeTrianglesSimd (unsigned char * triangles, const unsigned char * codes, const unsigned char * extra, const unsigned char * bound, size_t triangle_count)
597- {
671+ #if defined(SIMD_SSE)
598672#ifdef __GNUC__
599- typedef int __attribute__ ((aligned (1 ))) unaligned_int;
673+ typedef int __attribute__ ((aligned(1 ))) unaligned_int;
600674#else
601- typedef int unaligned_int;
675+ typedef int unaligned_int;
676+ #endif
602677#endif
603678
679+ SIMD_TARGET
680+ static const unsigned char * decodeTrianglesSimd (unsigned char * triangles, const unsigned char * codes, const unsigned char * extra, const unsigned char * bound, size_t triangle_count)
681+ {
682+ #if defined(SIMD_SSE)
604683 __m128i state = _mm_setzero_si128 ();
684+ #elif defined(SIMD_NEON)
685+ uint8x16_t state = vdupq_n_u8 (0 );
686+ #endif
605687
606688 // because the output buffer is guaranteed to have 32-bit aligned size available, we can optimize writes and tail processing
607689 // instead of processing triangles 2 at a time, we process 2 *pairs* at a time (12-byte write) followed by a tail pair, if present
@@ -621,15 +703,25 @@ static const unsigned char* decodeTrianglesSimd(unsigned char* triangles, const
621703 state = decodeTriangleGroup (state, code0, extra);
622704
623705 // write first decoded triangle and first index of second decoded triangle
706+ #if defined(SIMD_SSE)
624707 __m128i r0 = _mm_srli_si128 (state, 9 );
625708 *reinterpret_cast <unaligned_int*>(&triangles[i * 12 ]) = _mm_cvtsi128_si32 (r0);
709+ #elif defined(SIMD_NEON)
710+ uint8x16_t r0 = vextq_u8 (state, vdupq_n_u8 (0 ), 9 );
711+ vst1q_lane_u32 (reinterpret_cast <unsigned int *>(&triangles[i * 12 ]), vreinterpretq_u32_u8 (r0), 0 );
712+ #endif
626713
627714 state = decodeTriangleGroup (state, code1, extra);
628715
629716 // write last two indices of second decoded triangle that we didn't write above plus two new ones
630717 // note that the second decoded triangle has shifted down to 6-8 bytes, hence shift by 7
718+ #if defined(SIMD_SSE)
631719 __m128i r1 = _mm_srli_si128 (state, 7 );
632720 _mm_storel_epi64 (reinterpret_cast <__m128i*>(&triangles[i * 12 + 4 ]), r1);
721+ #elif defined(SIMD_NEON)
722+ uint8x16_t r1 = vextq_u8 (state, vdupq_n_u8 (0 ), 7 );
723+ vst1_u8 (&triangles[i * 12 + 4 ], vget_low_u8 (r1));
724+ #endif
633725 }
634726
635727 // process a 1-2 triangle tail; to maintain the memory safety guarantee we have to write 1-2 32-bit elements
@@ -644,12 +736,21 @@ static const unsigned char* decodeTrianglesSimd(unsigned char* triangles, const
644736
645737 unsigned char * tail = &triangles[(triangle_count & ~3 ) * 3 ];
646738
739+ #if defined(SIMD_SSE)
647740 __m128i r = _mm_srli_si128 (state, 9 );
648741
649742 if ((triangle_count & 3 ) == 1 )
650743 *reinterpret_cast <unaligned_int*>(tail) = _mm_cvtsi128_si32 (r);
651744 else
652745 _mm_storel_epi64 (reinterpret_cast <__m128i*>(tail), r);
746+ #elif defined(SIMD_NEON)
747+ uint8x16_t r = vextq_u8 (state, vdupq_n_u8 (0 ), 9 );
748+
749+ if ((triangle_count & 3 ) == 1 )
750+ vst1q_lane_u32 (reinterpret_cast <unsigned int *>(tail), vreinterpretq_u32_u8 (r), 0 );
751+ else
752+ vst1_u8 (tail, vget_low_u8 (r));
753+ #endif
653754 }
654755
655756 return extra;
@@ -658,7 +759,11 @@ static const unsigned char* decodeTrianglesSimd(unsigned char* triangles, const
658759SIMD_TARGET
659760static const unsigned char * decodeVerticesRawSimd (unsigned int * vertices, const unsigned char * ctrl, const unsigned char * data, const unsigned char * bound, size_t vertex_count)
660761{
762+ #if defined(SIMD_SSE)
661763 __m128i last = _mm_set1_epi32 (-1 );
764+ #elif defined(SIMD_NEON)
765+ uint32x4_t last = vdupq_n_u32 (~0u );
766+ #endif
662767
663768 for (size_t i = 0 ; i < vertex_count; i += 4 )
664769 {
@@ -669,7 +774,11 @@ static const unsigned char* decodeVerticesRawSimd(unsigned int* vertices, const
669774 last = decodeVertexGroup (last, code, data);
670775
671776 // safe to write 4 vertices as caller provides padded output buffer
777+ #if defined(SIMD_SSE)
672778 _mm_storeu_si128 (reinterpret_cast <__m128i*>(&vertices[i]), last);
779+ #elif defined(SIMD_NEON)
780+ vst1q_u32 (&vertices[i], last);
781+ #endif
673782 }
674783
675784 return data;
@@ -678,7 +787,11 @@ static const unsigned char* decodeVerticesRawSimd(unsigned int* vertices, const
678787SIMD_TARGET
679788static const unsigned char * decodeVerticesSimd (unsigned int * vertices, const unsigned char * ctrl, const unsigned char * data, const unsigned char * bound, size_t vertex_count)
680789{
790+ #if defined(SIMD_SSE)
681791 __m128i last = _mm_set1_epi32 (-1 );
792+ #elif defined(SIMD_NEON)
793+ uint32x4_t last = vdupq_n_u32 (~0u );
794+ #endif
682795
683796 size_t groups = vertex_count / 4 ;
684797
@@ -691,7 +804,11 @@ static const unsigned char* decodeVerticesSimd(unsigned int* vertices, const uns
691804
692805 last = decodeVertexGroup (last, code, data);
693806
807+ #if defined(SIMD_SSE)
694808 _mm_storeu_si128 (reinterpret_cast <__m128i*>(&vertices[i * 4 ]), last);
809+ #elif defined(SIMD_NEON)
810+ vst1q_u32 (&vertices[i * 4 ], last);
811+ #endif
695812 }
696813
697814 // process a 1-3 vertex tail; to maintain the memory safety guarantee we have to write individual elements
@@ -708,6 +825,7 @@ static const unsigned char* decodeVerticesSimd(unsigned int* vertices, const uns
708825
709826 switch (vertex_count & 3 )
710827 {
828+ #if defined(SIMD_SSE)
711829 case 3 :
712830 tail[2 ] = _mm_extract_epi32 (last, 2 );
713831 // fallthrough
@@ -717,6 +835,17 @@ static const unsigned char* decodeVerticesSimd(unsigned int* vertices, const uns
717835 case 1 :
718836 tail[0 ] = _mm_extract_epi32 (last, 0 );
719837 // fallthrough
838+ #elif defined(SIMD_NEON)
839+ case 3 :
840+ vst1q_lane_u32 (&tail[2 ], last, 2 );
841+ // fallthrough
842+ case 2 :
843+ vst1q_lane_u32 (&tail[1 ], last, 1 );
844+ // fallthrough
845+ case 1 :
846+ vst1q_lane_u32 (&tail[0 ], last, 0 );
847+ // fallthrough
848+ #endif
720849 default :;
721850 }
722851 }
@@ -727,15 +856,12 @@ static const unsigned char* decodeVerticesSimd(unsigned int* vertices, const uns
727856SIMD_TARGET
728857static const unsigned char * decodeVerticesSimd (unsigned short * vertices, const unsigned char * ctrl, const unsigned char * data, const unsigned char * bound, size_t vertex_count)
729858{
730- #ifdef __GNUC__
731- typedef int __attribute__ ((aligned (1 ))) unaligned_int;
732- #else
733- typedef int unaligned_int;
734- #endif
735-
859+ #if defined(SIMD_SSE)
736860 __m128i repack = _mm_setr_epi8 (0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
737-
738861 __m128i last = _mm_set1_epi32 (-1 );
862+ #elif defined(SIMD_NEON)
863+ uint32x4_t last = vdupq_n_u32 (~0u );
864+ #endif
739865
740866 // because the output buffer is guaranteed to have 32-bit aligned size available, we can simplify tail processing
741867 // if the number of vertices mod 4 is 3, we'd normally need to write 8+6 bytes, but we can instead overwrite up to 2 bytes in the main loop
@@ -751,8 +877,13 @@ static const unsigned char* decodeVerticesSimd(unsigned short* vertices, const u
751877
752878 last = decodeVertexGroup (last, code, data);
753879
880+ #if defined(SIMD_SSE)
754881 __m128i r = _mm_shuffle_epi8 (last, repack);
755882 _mm_storel_epi64 (reinterpret_cast <__m128i*>(&vertices[i * 4 ]), r);
883+ #elif defined(SIMD_NEON)
884+ uint16x4_t r = vmovn_u32 (last);
885+ vst1_u16 (&vertices[i * 4 ], r);
886+ #endif
756887 }
757888
758889 // process a 1-2 vertex tail; to maintain the memory safety guarantee we have to write a 32-bit element
@@ -767,8 +898,13 @@ static const unsigned char* decodeVerticesSimd(unsigned short* vertices, const u
767898
768899 unsigned short * tail = &vertices[vertex_count & ~3 ];
769900
901+ #if defined(SIMD_SSE)
770902 __m128i r = _mm_shuffle_epi8 (last, repack);
771903 *reinterpret_cast <unaligned_int*>(tail) = _mm_cvtsi128_si32 (r);
904+ #elif defined(SIMD_NEON)
905+ uint16x4_t r = vmovn_u32 (last);
906+ vst1_lane_u32 (reinterpret_cast <unsigned int *>(tail), vreinterpret_u32_u16 (r), 0 );
907+ #endif
772908 }
773909
774910 return data;
@@ -938,7 +1074,7 @@ int meshopt_decodeMeshlet(void* vertices, size_t vertex_count, size_t vertex_siz
9381074
9391075#if defined(SIMD_FALLBACK)
9401076 return (gDecodeTablesInitialized ? decodeMeshletSimd : decodeMeshlet)(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size);
941- #elif defined(SIMD_SSE)
1077+ #elif defined(SIMD_SSE) || defined(SIMD_NEON)
9421078 return decodeMeshletSimd (vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size);
9431079#else
9441080 return decodeMeshlet (vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size);
@@ -970,13 +1106,14 @@ int meshopt_decodeMeshletRaw(unsigned int* vertices, size_t vertex_count, unsign
9701106
9711107#if defined(SIMD_FALLBACK)
9721108 return (gDecodeTablesInitialized ? decodeMeshletRawSimd : decodeMeshlet)(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4 , 4 );
973- #elif defined(SIMD_SSE)
1109+ #elif defined(SIMD_SSE) || defined(SIMD_NEON)
9741110 return decodeMeshletRawSimd (vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4 , 4 );
9751111#else
9761112 return decodeMeshlet (vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4 , 4 );
9771113#endif
9781114}
9791115
9801116#undef SIMD_SSE
1117+ #undef SIMD_NEON
9811118#undef SIMD_FALLBACK
9821119#undef SIMD_TARGET
0 commit comments