@@ -637,11 +637,13 @@ static const unsigned char* decodeTrianglesSimd(unsigned int* triangles, const u
 		// write 6 bytes of new triangle data into output, formatted as 8 bytes with 0 padding
 #if defined(SIMD_SSE)
 		__m128i r = _mm_shuffle_epi8(state, repack);
-		_mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[i * 2]), r);
+		_mm_storel_epi64(reinterpret_cast<__m128i*>(triangles), r);
 #elif defined(SIMD_NEON)
 		uint32x2_t r = vreinterpret_u32_u8(vqtbl1_u8(state, repack));
-		vst1_u32(&triangles[i * 2], r);
+		vst1_u32(triangles, r);
 #endif
+
+		triangles += 2;
 	}
 
 	// process a 1 triangle tail; to maintain the memory safety guarantee we have to write a 32-bit element
@@ -654,14 +656,12 @@ static const unsigned char* decodeTrianglesSimd(unsigned int* triangles, const u
 
 		state = decodeTriangleGroup(state, code, extra);
 
-		unsigned int* tail = &triangles[triangle_count & ~1];
-
 #if defined(SIMD_SSE)
 		__m128i r = _mm_shuffle_epi8(state, repack);
-		*tail = unsigned(_mm_cvtsi128_si32(r));
+		*triangles = unsigned(_mm_cvtsi128_si32(r));
 #elif defined(SIMD_NEON)
 		uint32x2_t r = vreinterpret_u32_u8(vqtbl1_u8(state, repack));
-		vst1_lane_u32(tail, r, 0);
+		vst1_lane_u32(triangles, r, 0);
 #endif
 	}
 
@@ -697,10 +697,10 @@ static const unsigned char* decodeTrianglesSimd(unsigned char* triangles, const
 		// write first decoded triangle and first index of second decoded triangle
 #if defined(SIMD_SSE)
 		__m128i r0 = _mm_srli_si128(state, 9);
-		*reinterpret_cast<unaligned_int*>(&triangles[i * 12]) = _mm_cvtsi128_si32(r0);
+		*reinterpret_cast<unaligned_int*>(triangles) = _mm_cvtsi128_si32(r0);
 #elif defined(SIMD_NEON)
 		uint8x16_t r0 = vextq_u8(state, vdupq_n_u8(0), 9);
-		vst1q_lane_u32(reinterpret_cast<unsigned int*>(&triangles[i * 12]), vreinterpretq_u32_u8(r0), 0);
+		vst1q_lane_u32(reinterpret_cast<unsigned int*>(triangles), vreinterpretq_u32_u8(r0), 0);
 #endif
 
 		state = decodeTriangleGroup(state, code1, extra);
@@ -709,11 +709,13 @@ static const unsigned char* decodeTrianglesSimd(unsigned char* triangles, const
 		// note that the second decoded triangle has shifted down to 6-8 bytes, hence shift by 7
 #if defined(SIMD_SSE)
 		__m128i r1 = _mm_srli_si128(state, 7);
-		_mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[i * 12 + 4]), r1);
+		_mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[4]), r1);
 #elif defined(SIMD_NEON)
 		uint8x16_t r1 = vextq_u8(state, vdupq_n_u8(0), 7);
-		vst1_u8(&triangles[i * 12 + 4], vget_low_u8(r1));
+		vst1_u8(&triangles[4], vget_low_u8(r1));
 #endif
+
+		triangles += 12;
 	}
 
 	// process a 1-2 triangle tail; to maintain the memory safety guarantee we have to write 1-2 32-bit elements
@@ -726,20 +728,18 @@ static const unsigned char* decodeTrianglesSimd(unsigned char* triangles, const
 
 		state = decodeTriangleGroup(state, code, extra);
 
-		unsigned char* tail = &triangles[(triangle_count & ~3) * 3];
-
 #if defined(SIMD_SSE)
 		__m128i r = _mm_srli_si128(state, 9);
 
-		*reinterpret_cast<unaligned_int*>(tail) = _mm_cvtsi128_si32(r);
+		*reinterpret_cast<unaligned_int*>(triangles) = _mm_cvtsi128_si32(r);
 		if ((triangle_count & 3) > 1)
-			*reinterpret_cast<unaligned_int*>(tail + 4) = _mm_extract_epi32(r, 1);
+			*reinterpret_cast<unaligned_int*>(triangles + 4) = _mm_extract_epi32(r, 1);
 #elif defined(SIMD_NEON)
 		uint8x16_t r = vextq_u8(state, vdupq_n_u8(0), 9);
 
-		vst1q_lane_u32(reinterpret_cast<unsigned int*>(tail), vreinterpretq_u32_u8(r), 0);
+		vst1q_lane_u32(reinterpret_cast<unsigned int*>(triangles), vreinterpretq_u32_u8(r), 0);
 		if ((triangle_count & 3) > 1)
-			vst1q_lane_u32(reinterpret_cast<unsigned int*>(tail + 4), vreinterpretq_u32_u8(r), 1);
+			vst1q_lane_u32(reinterpret_cast<unsigned int*>(triangles + 4), vreinterpretq_u32_u8(r), 1);
 #endif
 	}
 
@@ -767,10 +767,12 @@ static const unsigned char* decodeVerticesSimd(unsigned int* vertices, const uns
 		last = decodeVertexGroup(last, code, data);
 
 #if defined(SIMD_SSE)
-		_mm_storeu_si128(reinterpret_cast<__m128i*>(&vertices[i * 4]), last);
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(vertices), last);
 #elif defined(SIMD_NEON)
-		vst1q_u32(&vertices[i * 4], last);
+		vst1q_u32(vertices, last);
 #endif
+
+		vertices += 4;
 	}
 
 	// process a 1-3 vertex tail; to maintain the memory safety guarantee we have to write individual elements
@@ -783,20 +785,18 @@ static const unsigned char* decodeVerticesSimd(unsigned int* vertices, const uns
 
 		last = decodeVertexGroup(last, code, data);
 
-		unsigned int* tail = &vertices[vertex_count & ~3];
-
 #if defined(SIMD_SSE)
-		tail[0] = _mm_cvtsi128_si32(last);
+		vertices[0] = _mm_cvtsi128_si32(last);
 		if ((vertex_count & 3) > 1)
-			tail[1] = _mm_extract_epi32(last, 1);
+			vertices[1] = _mm_extract_epi32(last, 1);
 		if ((vertex_count & 3) > 2)
-			tail[2] = _mm_extract_epi32(last, 2);
+			vertices[2] = _mm_extract_epi32(last, 2);
 #elif defined(SIMD_NEON)
-		vst1q_lane_u32(&tail[0], last, 0);
+		vst1q_lane_u32(&vertices[0], last, 0);
 		if ((vertex_count & 3) > 1)
-			vst1q_lane_u32(&tail[1], last, 1);
+			vst1q_lane_u32(&vertices[1], last, 1);
 		if ((vertex_count & 3) > 2)
-			vst1q_lane_u32(&tail[2], last, 2);
+			vst1q_lane_u32(&vertices[2], last, 2);
 #endif
 	}
 
@@ -829,11 +829,13 @@ static const unsigned char* decodeVerticesSimd(unsigned short* vertices, const u
 
 #if defined(SIMD_SSE)
 		__m128i r = _mm_shuffle_epi8(last, repack);
-		_mm_storel_epi64(reinterpret_cast<__m128i*>(&vertices[i * 4]), r);
+		_mm_storel_epi64(reinterpret_cast<__m128i*>(vertices), r);
 #elif defined(SIMD_NEON)
 		uint16x4_t r = vmovn_u32(last);
-		vst1_u16(&vertices[i * 4], r);
+		vst1_u16(vertices, r);
 #endif
+
+		vertices += 4;
 	}
 
 	// process a 1-2 vertex tail; to maintain the memory safety guarantee we have to write a 32-bit element
@@ -846,14 +848,12 @@ static const unsigned char* decodeVerticesSimd(unsigned short* vertices, const u
 
 		last = decodeVertexGroup(last, code, data);
 
-		unsigned short* tail = &vertices[vertex_count & ~3];
-
 #if defined(SIMD_SSE)
 		__m128i r = _mm_shufflelo_epi16(last, 8);
-		*reinterpret_cast<unaligned_int*>(tail) = _mm_cvtsi128_si32(r);
+		*reinterpret_cast<unaligned_int*>(vertices) = _mm_cvtsi128_si32(r);
 #elif defined(SIMD_NEON)
 		uint16x4_t r = vmovn_u32(last);
-		vst1_lane_u32(reinterpret_cast<unsigned int*>(tail), vreinterpret_u32_u16(r), 0);
+		vst1_lane_u32(reinterpret_cast<unsigned int*>(vertices), vreinterpret_u32_u16(r), 0);
 #endif
 	}
 
0 commit comments