@@ -27,16 +27,14 @@ float FP32_InnerProductSIMD_SVE2(const void *pVect1v, const void *pVect2v, size_
 
     svfloat32_t sum0 = svdup_f32(0.0f);
     svfloat32_t sum1 = svdup_f32(0.0f);
-    svfloat32_t sum2 = svdup_f32(0.0f);
-    svfloat32_t sum3 = svdup_f32(0.0f);
 
     auto chunk_size = 4 * vl;
     size_t number_of_chunks = dimension / chunk_size;
     for (size_t i = 0; i < number_of_chunks; i++) {
         InnerProductStep(pVect1, pVect2, offset, sum0);
         InnerProductStep(pVect1, pVect2, offset, sum1);
-        InnerProductStep(pVect1, pVect2, offset, sum2);
-        InnerProductStep(pVect1, pVect2, offset, sum3);
+        InnerProductStep(pVect1, pVect2, offset, sum0);
+        InnerProductStep(pVect1, pVect2, offset, sum1);
     }
 
     if constexpr (additional_steps > 0) {
@@ -47,21 +45,19 @@ float FP32_InnerProductSIMD_SVE2(const void *pVect1v, const void *pVect2v, size_
             InnerProductStep(pVect1, pVect2, offset, sum1);
         }
         if constexpr (additional_steps >= 3) {
-            InnerProductStep(pVect1, pVect2, offset, sum2);
+            InnerProductStep(pVect1, pVect2, offset, sum0);
         }
     }
 
     if constexpr (partial_chunk) {
         svbool_t pg = svwhilelt_b32(offset, dimension);
         svfloat32_t v1 = svld1_f32(pg, pVect1 + offset);
         svfloat32_t v2 = svld1_f32(pg, pVect2 + offset);
-        sum0 = svmla_f32_m(pg, sum0, v1, v2);
+        sum1 = svmla_f32_m(pg, sum1, v1, v2);
     }
 
     // Combine the partial sums
     sum0 = svadd_f32_z(svptrue_b32(), sum0, sum1);
-    sum2 = svadd_f32_z(svptrue_b32(), sum2, sum3);
-    sum0 = svadd_f32_z(svptrue_b32(), sum0, sum2);
 
     // Horizontal sum
     float result = svaddv_f32(svptrue_b32(), sum0);
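For reference, below is a minimal standalone sketch of the two-accumulator reduction pattern this change ends up with: the unrolled loop still issues multiple multiply-accumulate steps per iteration, but they feed only two vector registers, which are combined with svadd and reduced with svaddv at the end, as in the diff above. This is not the repository's code: the function name `inner_product_sve_sketch`, the `a`/`b`/`dim` parameters, and the flat loop structure are illustrative assumptions, and it assumes `<arm_sve.h>` with an AArch64 SVE toolchain (e.g. `-march=armv8-a+sve`); the real kernel goes through `InnerProductStep()` and the `partial_chunk` / `additional_steps` template parameters.

```cpp
// Minimal sketch of a two-accumulator SVE FP32 inner product (illustrative only).
#include <arm_sve.h>
#include <cstddef>
#include <cstdint>

static float inner_product_sve_sketch(const float *a, const float *b, size_t dim) {
    svfloat32_t sum0 = svdup_f32(0.0f);
    svfloat32_t sum1 = svdup_f32(0.0f);
    const uint64_t vl = svcntw();        // number of 32-bit lanes per SVE vector
    const svbool_t all = svptrue_b32();

    uint64_t i = 0;
    // Main loop: two independent multiply-accumulate chains per iteration.
    for (; i + 2 * vl <= dim; i += 2 * vl) {
        sum0 = svmla_f32_m(all, sum0, svld1_f32(all, a + i), svld1_f32(all, b + i));
        sum1 = svmla_f32_m(all, sum1, svld1_f32(all, a + i + vl), svld1_f32(all, b + i + vl));
    }
    // Predicated tail: svwhilelt_b32 masks off lanes past the end of the vectors.
    for (; i < dim; i += vl) {
        svbool_t pg = svwhilelt_b32(i, (uint64_t)dim);
        sum0 = svmla_f32_m(pg, sum0, svld1_f32(pg, a + i), svld1_f32(pg, b + i));
    }
    // Combine the partial sums, then reduce horizontally across lanes.
    sum0 = svadd_f32_z(all, sum0, sum1);
    return svaddv_f32(all, sum0);
}
```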