Skip to content

Commit 417a95c

Browse files
committed
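Change to 2 sums: accumulate into two partial-sum vectors (sum0, sum1) instead of four in the SVE and SVE2 FP32 inner-product kernels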
1 parent 29688d9 commit 417a95c

File tree

2 files changed

+8
-16
lines changed

2 files changed

+8
-16
lines changed

src/VecSim/spaces/IP/IP_SVE2_FP32.h

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,16 +27,14 @@ float FP32_InnerProductSIMD_SVE2(const void *pVect1v, const void *pVect2v, size_
2727

2828
svfloat32_t sum0 = svdup_f32(0.0f);
2929
svfloat32_t sum1 = svdup_f32(0.0f);
30-
svfloat32_t sum2 = svdup_f32(0.0f);
31-
svfloat32_t sum3 = svdup_f32(0.0f);
3230

3331
auto chunk_size = 4 * vl;
3432
size_t number_of_chunks = dimension / chunk_size;
3533
for (size_t i = 0; i < number_of_chunks; i++) {
3634
InnerProductStep(pVect1, pVect2, offset, sum0);
3735
InnerProductStep(pVect1, pVect2, offset, sum1);
38-
InnerProductStep(pVect1, pVect2, offset, sum2);
39-
InnerProductStep(pVect1, pVect2, offset, sum3);
36+
InnerProductStep(pVect1, pVect2, offset, sum0);
37+
InnerProductStep(pVect1, pVect2, offset, sum1);
4038
}
4139

4240
if constexpr (additional_steps > 0) {
@@ -47,21 +45,19 @@ float FP32_InnerProductSIMD_SVE2(const void *pVect1v, const void *pVect2v, size_
4745
InnerProductStep(pVect1, pVect2, offset, sum1);
4846
}
4947
if constexpr (additional_steps >= 3) {
50-
InnerProductStep(pVect1, pVect2, offset, sum2);
48+
InnerProductStep(pVect1, pVect2, offset, sum0);
5149
}
5250
}
5351

5452
if constexpr (partial_chunk) {
5553
svbool_t pg = svwhilelt_b32(offset, dimension);
5654
svfloat32_t v1 = svld1_f32(pg, pVect1 + offset);
5755
svfloat32_t v2 = svld1_f32(pg, pVect2 + offset);
58-
sum0 = svmla_f32_m(pg, sum0, v1, v2);
56+
sum1 = svmla_f32_m(pg, sum1, v1, v2);
5957
}
6058

6159
// Combine the partial sums
6260
sum0 = svadd_f32_z(svptrue_b32(), sum0, sum1);
63-
sum2 = svadd_f32_z(svptrue_b32(), sum2, sum3);
64-
sum0 = svadd_f32_z(svptrue_b32(), sum0, sum2);
6561

6662
// Horizontal sum
6763
float result = svaddv_f32(svptrue_b32(), sum0);

src/VecSim/spaces/IP/IP_SVE_FP32.h

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,16 +27,14 @@ float FP32_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t
2727

2828
svfloat32_t sum0 = svdup_f32(0.0f);
2929
svfloat32_t sum1 = svdup_f32(0.0f);
30-
svfloat32_t sum2 = svdup_f32(0.0f);
31-
svfloat32_t sum3 = svdup_f32(0.0f);
3230

3331
auto chunk_size = 4 * vl;
3432
size_t number_of_chunks = dimension / chunk_size;
3533
for (size_t i = 0; i < number_of_chunks; i++) {
3634
InnerProductStep(pVect1, pVect2, offset, sum0);
3735
InnerProductStep(pVect1, pVect2, offset, sum1);
38-
InnerProductStep(pVect1, pVect2, offset, sum2);
39-
InnerProductStep(pVect1, pVect2, offset, sum3);
36+
InnerProductStep(pVect1, pVect2, offset, sum0);
37+
InnerProductStep(pVect1, pVect2, offset, sum1);
4038
}
4139

4240
if constexpr (additional_steps > 0) {
@@ -47,21 +45,19 @@ float FP32_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t
4745
InnerProductStep(pVect1, pVect2, offset, sum1);
4846
}
4947
if constexpr (additional_steps >= 3) {
50-
InnerProductStep(pVect1, pVect2, offset, sum2);
48+
InnerProductStep(pVect1, pVect2, offset, sum0);
5149
}
5250
}
5351

5452
if constexpr (partial_chunk) {
5553
svbool_t pg = svwhilelt_b32(offset, dimension);
5654
svfloat32_t v1 = svld1_f32(pg, pVect1 + offset);
5755
svfloat32_t v2 = svld1_f32(pg, pVect2 + offset);
58-
sum0 = svmla_f32_m(pg, sum0, v1, v2);
56+
sum1 = svmla_f32_m(pg, sum1, v1, v2);
5957
}
6058

6159
// Combine the partial sums
6260
sum0 = svadd_f32_z(svptrue_b32(), sum0, sum1);
63-
sum2 = svadd_f32_z(svptrue_b32(), sum2, sum3);
64-
sum0 = svadd_f32_z(svptrue_b32(), sum0, sum2);
6561

6662
// Horizontal sum
6763
float result = svaddv_f32(svptrue_b32(), sum0);

0 commit comments

Comments
 (0)