Skip to content

Commit 51017c6

Browse files
Gyuuul2milot-mirdita
authored andcommitted
Restore SIMD prefix_sum and remove unnecessary simd_float member
1 parent c0bff1e commit 51017c6

File tree

2 files changed

+39
-38
lines changed

2 files changed

+39
-38
lines changed

src/alignment/Fwbw.cpp

Lines changed: 35 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -44,26 +44,27 @@ inline void calculate_max4(float& max, float& term1, float& term2, float& term3,
4444
}
4545

4646
inline simd_float simdf32_prefixsum(simd_float a) {
47-
// a = simdf32_add(a, simdi_i2fcast(simdi8_shiftl(simdf_f2icast(a), 4)));
48-
// a = simdf32_add(a, simdi_i2fcast(simdi8_shiftl(simdf_f2icast(a), 8)));
49-
// #ifdef AVX2
50-
// a = simdf32_add(a, simdi_i2fcast(simdi8_shiftl(simdf_f2icast(a), 16)));
51-
// #endif
52-
// return a;
53-
float buf[8];
54-
simdf32_storeu(buf, a);
55-
56-
buf[1] += buf[0];
57-
buf[2] += buf[1];
58-
buf[3] += buf[2];
47+
a = simdf32_add(a, simdi_i2fcast(simdi8_shiftl(simdf_f2icast(a), 4)));
48+
a = simdf32_add(a, simdi_i2fcast(simdi8_shiftl(simdf_f2icast(a), 8)));
5949
#ifdef AVX2
60-
buf[4] += buf[3];
61-
buf[5] += buf[4];
62-
buf[6] += buf[5];
63-
buf[7] += buf[6];
50+
a = simdf32_add(a, simdi_i2fcast(simdi8_shiftl(simdf_f2icast(a), 16)));
6451
#endif
52+
return a;
53+
// Fallback scalar implementation
54+
// float buf[8];
55+
// simdf32_storeu(buf, a);
56+
57+
// buf[1] += buf[0];
58+
// buf[2] += buf[1];
59+
// buf[3] += buf[2];
60+
// #ifdef AVX2
61+
// buf[4] += buf[3];
62+
// buf[5] += buf[4];
63+
// buf[6] += buf[5];
64+
// buf[7] += buf[6];
65+
// #endif
6566

66-
return simdf32_loadu(buf);
67+
// return simdf32_loadu(buf);
6768
}
6869

6970
// FwBwAligner Constructor for general case: use profile scoring matrix
@@ -100,8 +101,8 @@ FwBwAligner::FwBwAligner(SubstitutionMatrix &subMat, float gapOpen, float gapExt
100101
exp_ge_arr[i] = exp((i * gapExtend + gapExtend) / temperature);
101102
}
102103
// Gap open and extend
103-
exp_go = simdf32_set(static_cast<float>(exp(gapOpen / temperature)));
104-
exp_ge = simdf32_set(static_cast<float>(exp(gapExtend / temperature)));
104+
exp_go = (static_cast<float>(exp(gapOpen / temperature)));
105+
exp_ge = (static_cast<float>(exp(gapExtend / temperature)));
105106
// Blosum matrix
106107
blosum = malloc_matrix<float>(21, 21);
107108
for (int i = 0; i < subMat.alphabetSize; ++i) {
@@ -149,8 +150,8 @@ FwBwAligner::FwBwAligner(float gapOpen, float gapExtend, float temperature, floa
149150
exp_ge_arr[i] = exp((i * gapExtend + gapExtend) / temperature);
150151
}
151152
// Gap open and extend
152-
exp_go = simdf32_set(static_cast<float>(exp(gapOpen / temperature)));
153-
exp_ge = simdf32_set(static_cast<float>(exp(gapExtend / temperature)));
153+
exp_go = (static_cast<float>(exp(gapOpen / temperature)));
154+
exp_ge = (static_cast<float>(exp(gapExtend / temperature)));
154155

155156
if (backtrace != 0) {
156157
blosum = nullptr;
@@ -333,8 +334,8 @@ void FwBwAligner::resetParams(float newGapOpen, float newGapExtend, float newTem
333334
gapOpen = newGapOpen;
334335
gapExtend = newGapExtend;
335336
temperature = newTemperature;
336-
exp_go = simdf32_set(static_cast<float>(exp(gapOpen / temperature)));
337-
exp_ge = simdf32_set(static_cast<float>(exp(gapExtend / temperature)));
337+
exp_go = (static_cast<float>(exp(gapOpen / temperature)));
338+
exp_ge = (static_cast<float>(exp(gapExtend / temperature)));
338339

339340
for (size_t i = 0; i < length; ++i) {
340341
vj[i] = exp(((length - 1) * gapExtend + gapOpen - i * gapExtend) / temperature);
@@ -420,7 +421,7 @@ void FwBwAligner::forward() {
420421
std::fill(zInit[i], zInit[i] + rowsCapacity, FLT_MIN_EXP); // rowsCapacity -> tlen
421422
}
422423
max_zm = -std::numeric_limits<float>::max();
423-
vMax_zm = simdf32_set(max_zm);
424+
simd_float vMax_zm = simdf32_set(max_zm);
424425
P = nullptr; // reset p. do we need this?
425426
for (size_t b = 0; b < blocks; ++b) {
426427
size_t start = b * length;
@@ -476,8 +477,8 @@ void FwBwAligner::forward() {
476477
simd_float vZmPrev = simdf32_loadu(&zmBlockPrev[j]);
477478
simd_float vZf = simdf32_loadu(&zfBlock[j]);
478479
simd_float vZfUpdate = simdf32_add(
479-
simdf32_mul(vZmPrev, exp_go),
480-
simdf32_mul(vZf, exp_ge)
480+
simdf32_mul(vZmPrev, simdf32_set(exp_go)),
481+
simdf32_mul(vZf, simdf32_set(exp_ge))
481482
);
482483
vZfUpdate = simdf32_div(vZfUpdate, vZmMaxRowBlock);
483484
simdf32_storeu(&zfBlock[j], vZfUpdate);
@@ -491,11 +492,11 @@ void FwBwAligner::forward() {
491492
vLastPrefixSum = simdf32_set(vCumsumZm[(VECSIZE_FLOAT - 1)]);
492493
simd_float vWj = simdf32_load(&wj[j]);
493494
simd_float vExp_ge_arr = simdf32_load(&exp_ge_arr[j]);
494-
// simd_float vZeUpdate = simdf32_add(
495-
// simdf32_div(vCumsumZm, vWj),
496-
// simdf32_mul(vZeI0, vExp_ge_arr)
497-
// );
498-
simd_float vZeUpdate = simdf32_fmadd(vZeI0, vExp_ge_arr, simdf32_div(vCumsumZm, vWj));
495+
simd_float vZeUpdate = simdf32_add(
496+
simdf32_div(vCumsumZm, vWj),
497+
simdf32_mul(vZeI0, vExp_ge_arr)
498+
);
499+
// simd_float vZeUpdate = simdf32_fmadd(vZeI0, vExp_ge_arr, simdf32_div(vCumsumZm, vWj));
499500
vZeUpdate = simdf32_div(vZeUpdate, vZmMaxRowBlock);
500501
simdf32_storeu(&zeBlock[j+1], vZeUpdate);
501502
}
@@ -693,8 +694,8 @@ void FwBwAligner::backward() {
693694
simd_float vZmPrev = simdf32_loadu(&zmBlockPrev[j]);
694695
simd_float vZf = simdf32_loadu(&zfBlock[j]);
695696
simd_float vZfUpdate = simdf32_add(
696-
simdf32_mul(vZmPrev, exp_go),
697-
simdf32_mul(vZf, exp_ge)
697+
simdf32_mul(vZmPrev, simdf32_set(exp_go)),
698+
simdf32_mul(vZf, simdf32_set(exp_ge))
698699
);
699700
vZfUpdate = simdf32_div(vZfUpdate, vZmMaxRowBlock);
700701
simdf32_storeu(&zfBlock[j], vZfUpdate);
@@ -1186,4 +1187,4 @@ int fwbw(int argc, const char **argv, const Command &command) {
11861187
tdbr.close();
11871188

11881189
return EXIT_SUCCESS;
1189-
}
1190+
}

src/alignment/Fwbw.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -115,14 +115,14 @@ class FwBwAligner {
115115
float** blosum= nullptr; // Profile true
116116
float* S_prev = nullptr; // backtrace true
117117
float* S_curr = nullptr; // backtrace true
118-
simd_float exp_go;
119-
simd_float exp_ge;
118+
float exp_go;
119+
float exp_ge;
120120
float max_zm;
121121
float sum_exp;
122122
// float maxP;
123123

124-
simd_float vMax_zm;
125-
simd_float vSum_exp;
124+
125+
//simd_float vSum_exp;
126126
size_t colSeqLen_padding;
127127

128128
s_align alignResult;

0 commit comments

Comments
 (0)