Skip to content

Commit f1487b8

Browse files
committed
Fix the L2 (squared Euclidean distance) ARMPL kernels
1 parent 45e8fdd commit f1487b8

File tree

3 files changed

+80
-26
lines changed

3 files changed

+80
-26
lines changed

src/VecSim/spaces/L2/L2_ARMPL_NEON_FP32.h

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,25 +7,43 @@
77
#include "VecSim/spaces/space_includes.h"
88
#include <armpl.h>
99

10+
template <unsigned char residual> // 0..15
1011
float FP32_L2Sqr_ARMPL_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
1112
const float *vec1 = static_cast<const float *>(pVect1v);
1213
const float *vec2 = static_cast<const float *>(pVect2v);
1314

1415
float result = 0.0f;
15-
constexpr const size_t blockSize = 1024;
16+
constexpr const size_t blockSize = 1024;
1617
float buffer[blockSize];
1718

18-
for (size_t i = 0; i < dimension; i += blockSize) {
19-
// Process in smaller chunks to improve cache behavior
20-
size_t currentBlock = std::min(blockSize, dimension - i);
19+
// Pre-calculate number of full blocks and the size of the last partial block
20+
const size_t fullBlockCount = dimension / blockSize;
21+
const size_t lastBlockSize = dimension % blockSize;
2122

22-
// Calculate difference vector in chunks
23-
for (size_t j = 0; j < currentBlock; j++) {
24-
buffer[j] = vec1[i + j] - vec2[i + j];
23+
// Process full blocks
24+
for (size_t i = 0; i < fullBlockCount; i++) {
25+
size_t offset = i * blockSize;
26+
27+
// Calculate difference vector for full block
28+
for (size_t j = 0; j < blockSize; j++) {
29+
buffer[j] = vec1[offset + j] - vec2[offset + j];
30+
}
31+
32+
// Use ARMPL to compute the dot product of the difference buffer with itself (this block's squared L2 contribution)
33+
result += cblas_sdot(blockSize, buffer, 1, buffer, 1);
34+
}
35+
36+
// Handle remaining elements (if any)
37+
if (lastBlockSize > 0) {
38+
size_t offset = fullBlockCount * blockSize;
39+
40+
// Calculate difference vector for remaining elements
41+
for (size_t j = 0; j < lastBlockSize; j++) {
42+
buffer[j] = vec1[offset + j] - vec2[offset + j];
2543
}
2644

27-
// Notice: Armpl can choose different implementation based on cpu features.
28-
result += cblas_sdot(currentBlock, buffer, 1, buffer, 1);
45+
// Use ARMPL to compute the dot product of the remaining differences with themselves (the tail's squared L2 contribution)
46+
result += cblas_sdot(lastBlockSize, buffer, 1, buffer, 1);
2947
}
3048

3149
return result;

src/VecSim/spaces/L2/L2_ARMPL_SVE2_FP32.h

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include "VecSim/spaces/space_includes.h"
88
#include "armpl.h"
99

10+
template <unsigned char residual> // 0..15
1011
float FP32_L2Sqr_ARMPL_SVE2(const void *pVect1v, const void *pVect2v, size_t dimension) {
1112
const float *vec1 = static_cast<const float *>(pVect1v);
1213
const float *vec2 = static_cast<const float *>(pVect2v);
@@ -15,17 +16,34 @@ float FP32_L2Sqr_ARMPL_SVE2(const void *pVect1v, const void *pVect2v, size_t dim
1516
constexpr const size_t blockSize = 1024;
1617
float buffer[blockSize];
1718

18-
for (size_t i = 0; i < dimension; i += blockSize) {
19-
// Process in smaller chunks to improve cache behavior
20-
size_t currentBlock = std::min(blockSize, dimension - i);
19+
// Pre-calculate number of full blocks and the size of the last partial block
20+
const size_t fullBlockCount = dimension / blockSize;
21+
const size_t lastBlockSize = dimension % blockSize;
2122

22-
// Calculate difference vector in chunks
23-
for (size_t j = 0; j < currentBlock; j++) {
24-
buffer[j] = vec1[i + j] - vec2[i + j];
23+
// Process full blocks
24+
for (size_t i = 0; i < fullBlockCount; i++) {
25+
size_t offset = i * blockSize;
26+
27+
// Calculate difference vector for full block
28+
for (size_t j = 0; j < blockSize; j++) {
29+
buffer[j] = vec1[offset + j] - vec2[offset + j];
30+
}
31+
32+
// Use ARMPL to compute the dot product of the difference buffer with itself (this block's squared L2 contribution)
33+
result += cblas_sdot(blockSize, buffer, 1, buffer, 1);
34+
}
35+
36+
// Handle remaining elements (if any)
37+
if (lastBlockSize > 0) {
38+
size_t offset = fullBlockCount * blockSize;
39+
40+
// Calculate difference vector for remaining elements
41+
for (size_t j = 0; j < lastBlockSize; j++) {
42+
buffer[j] = vec1[offset + j] - vec2[offset + j];
2543
}
2644

27-
// Notice: Armpl can choose different implementation based on cpu features.
28-
result += cblas_sdot(currentBlock, buffer, 1, buffer, 1);
45+
// Use ARMPL to compute the dot product of the remaining differences with themselves (the tail's squared L2 contribution)
46+
result += cblas_sdot(lastBlockSize, buffer, 1, buffer, 1);
2947
}
3048

3149
return result;

src/VecSim/spaces/L2/L2_ARMPL_SVE_FP32.h

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include "VecSim/spaces/space_includes.h"
88
#include "armpl.h"
99

10+
template <unsigned char residual>
1011
float FP32_L2Sqr_ARMPL_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
1112
const float *vec1 = static_cast<const float *>(pVect1v);
1213
const float *vec2 = static_cast<const float *>(pVect2v);
@@ -15,18 +16,35 @@ float FP32_L2Sqr_ARMPL_SVE(const void *pVect1v, const void *pVect2v, size_t dime
1516
constexpr const size_t blockSize = 1024;
1617
float buffer[blockSize];
1718

18-
for (size_t i = 0; i < dimension; i += blockSize) {
19-
// Process in smaller chunks to improve cache behavior
20-
size_t currentBlock = std::min(blockSize, dimension - i);
19+
// Pre-calculate number of full blocks and the size of the last partial block
20+
const size_t fullBlockCount = dimension / blockSize;
21+
const size_t lastBlockSize = dimension % blockSize;
2122

22-
// Calculate difference vector in chunks
23-
for (size_t j = 0; j < currentBlock; j++) {
24-
buffer[j] = vec1[i + j] - vec2[i + j];
23+
// Process full blocks
24+
for (size_t i = 0; i < fullBlockCount; i++) {
25+
size_t offset = i * blockSize;
26+
27+
// Calculate difference vector for full block
28+
for (size_t j = 0; j < blockSize; j++) {
29+
buffer[j] = vec1[offset + j] - vec2[offset + j];
30+
}
31+
32+
// Use ARMPL to compute the dot product of the difference buffer with itself (this block's squared L2 contribution)
33+
result += cblas_sdot(blockSize, buffer, 1, buffer, 1);
34+
}
35+
36+
// Handle remaining elements (if any)
37+
if (lastBlockSize > 0) {
38+
size_t offset = fullBlockCount * blockSize;
39+
40+
// Calculate difference vector for remaining elements
41+
for (size_t j = 0; j < lastBlockSize; j++) {
42+
buffer[j] = vec1[offset + j] - vec2[offset + j];
2543
}
2644

27-
// Notice: Armpl can choose different implementation based on cpu features.
28-
result += cblas_sdot(currentBlock, buffer, 1, buffer, 1);
45+
// Use ARMPL to compute the dot product of the remaining differences with themselves (the tail's squared L2 contribution)
46+
result += cblas_sdot(lastBlockSize, buffer, 1, buffer, 1);
2947
}
3048

3149
return result;
32-
}
50+
}

0 commit comments

Comments
 (0)