@@ -44,26 +44,27 @@ inline void calculate_max4(float& max, float& term1, float& term2, float& term3,
4444}
4545
4646inline simd_float simdf32_prefixsum (simd_float a) {
47- // a = simdf32_add(a, simdi_i2fcast(simdi8_shiftl(simdf_f2icast(a), 4)));
48- // a = simdf32_add(a, simdi_i2fcast(simdi8_shiftl(simdf_f2icast(a), 8)));
49- // #ifdef AVX2
50- // a = simdf32_add(a, simdi_i2fcast(simdi8_shiftl(simdf_f2icast(a), 16)));
51- // #endif
52- // return a;
53- float buf[8 ];
54- simdf32_storeu (buf, a);
55-
56- buf[1 ] += buf[0 ];
57- buf[2 ] += buf[1 ];
58- buf[3 ] += buf[2 ];
47+ a = simdf32_add (a, simdi_i2fcast (simdi8_shiftl (simdf_f2icast (a), 4 )));
48+ a = simdf32_add (a, simdi_i2fcast (simdi8_shiftl (simdf_f2icast (a), 8 )));
5949#ifdef AVX2
60- buf[4 ] += buf[3 ];
61- buf[5 ] += buf[4 ];
62- buf[6 ] += buf[5 ];
63- buf[7 ] += buf[6 ];
50+ a = simdf32_add (a, simdi_i2fcast (simdi8_shiftl (simdf_f2icast (a), 16 )));
6451#endif
52+ return a;
53+ // Fallback scalar implementation
54+ // float buf[8];
55+ // simdf32_storeu(buf, a);
56+
57+ // buf[1] += buf[0];
58+ // buf[2] += buf[1];
59+ // buf[3] += buf[2];
60+ // #ifdef AVX2
61+ // buf[4] += buf[3];
62+ // buf[5] += buf[4];
63+ // buf[6] += buf[5];
64+ // buf[7] += buf[6];
65+ // #endif
6566
66- return simdf32_loadu (buf);
67+ // return simdf32_loadu(buf);
6768}
6869
6970// FwBwAligner Constructor for general case: use profile scoring matrix
@@ -100,8 +101,8 @@ FwBwAligner::FwBwAligner(SubstitutionMatrix &subMat, float gapOpen, float gapExt
100101 exp_ge_arr[i] = exp ((i * gapExtend + gapExtend) / temperature);
101102 }
102103 // Gap open and extend
103- exp_go = simdf32_set (static_cast <float >(exp (gapOpen / temperature)));
104- exp_ge = simdf32_set (static_cast <float >(exp (gapExtend / temperature)));
104+ exp_go = (static_cast <float >(exp (gapOpen / temperature)));
105+ exp_ge = (static_cast <float >(exp (gapExtend / temperature)));
105106 // Blosum matrix
106107 blosum = malloc_matrix<float >(21 , 21 );
107108 for (int i = 0 ; i < subMat.alphabetSize ; ++i) {
@@ -149,8 +150,8 @@ FwBwAligner::FwBwAligner(float gapOpen, float gapExtend, float temperature, floa
149150 exp_ge_arr[i] = exp ((i * gapExtend + gapExtend) / temperature);
150151 }
151152 // Gap open and extend
152- exp_go = simdf32_set (static_cast <float >(exp (gapOpen / temperature)));
153- exp_ge = simdf32_set (static_cast <float >(exp (gapExtend / temperature)));
153+ exp_go = (static_cast <float >(exp (gapOpen / temperature)));
154+ exp_ge = (static_cast <float >(exp (gapExtend / temperature)));
154155
155156 if (backtrace != 0 ) {
156157 blosum = nullptr ;
@@ -333,8 +334,8 @@ void FwBwAligner::resetParams(float newGapOpen, float newGapExtend, float newTem
333334 gapOpen = newGapOpen;
334335 gapExtend = newGapExtend;
335336 temperature = newTemperature;
336- exp_go = simdf32_set (static_cast <float >(exp (gapOpen / temperature)));
337- exp_ge = simdf32_set (static_cast <float >(exp (gapExtend / temperature)));
337+ exp_go = (static_cast <float >(exp (gapOpen / temperature)));
338+ exp_ge = (static_cast <float >(exp (gapExtend / temperature)));
338339
339340 for (size_t i = 0 ; i < length; ++i) {
340341 vj[i] = exp (((length - 1 ) * gapExtend + gapOpen - i * gapExtend) / temperature);
@@ -420,7 +421,7 @@ void FwBwAligner::forward() {
420421 std::fill (zInit[i], zInit[i] + rowsCapacity, FLT_MIN_EXP); // rowsCapacity -> tlen
421422 }
422423 max_zm = -std::numeric_limits<float >::max ();
423- vMax_zm = simdf32_set (max_zm);
424+ simd_float vMax_zm = simdf32_set (max_zm);
424425 P = nullptr ; // reset p. do we need this?
425426 for (size_t b = 0 ; b < blocks; ++b) {
426427 size_t start = b * length;
@@ -476,8 +477,8 @@ void FwBwAligner::forward() {
476477 simd_float vZmPrev = simdf32_loadu (&zmBlockPrev[j]);
477478 simd_float vZf = simdf32_loadu (&zfBlock[j]);
478479 simd_float vZfUpdate = simdf32_add (
479- simdf32_mul (vZmPrev, exp_go),
480- simdf32_mul (vZf, exp_ge)
480+ simdf32_mul (vZmPrev, simdf32_set ( exp_go) ),
481+ simdf32_mul (vZf, simdf32_set ( exp_ge) )
481482 );
482483 vZfUpdate = simdf32_div (vZfUpdate, vZmMaxRowBlock);
483484 simdf32_storeu (&zfBlock[j], vZfUpdate);
@@ -491,11 +492,11 @@ void FwBwAligner::forward() {
491492 vLastPrefixSum = simdf32_set (vCumsumZm[(VECSIZE_FLOAT - 1 )]);
492493 simd_float vWj = simdf32_load (&wj[j]);
493494 simd_float vExp_ge_arr = simdf32_load (&exp_ge_arr[j]);
494- // simd_float vZeUpdate = simdf32_add(
495- // simdf32_div(vCumsumZm, vWj),
496- // simdf32_mul(vZeI0, vExp_ge_arr)
497- // );
498- simd_float vZeUpdate = simdf32_fmadd (vZeI0, vExp_ge_arr, simdf32_div (vCumsumZm, vWj));
495+ simd_float vZeUpdate = simdf32_add (
496+ simdf32_div (vCumsumZm, vWj),
497+ simdf32_mul (vZeI0, vExp_ge_arr)
498+ );
499+ // simd_float vZeUpdate = simdf32_fmadd(vZeI0, vExp_ge_arr, simdf32_div(vCumsumZm, vWj));
499500 vZeUpdate = simdf32_div (vZeUpdate, vZmMaxRowBlock);
500501 simdf32_storeu (&zeBlock[j+1 ], vZeUpdate);
501502 }
@@ -693,8 +694,8 @@ void FwBwAligner::backward() {
693694 simd_float vZmPrev = simdf32_loadu (&zmBlockPrev[j]);
694695 simd_float vZf = simdf32_loadu (&zfBlock[j]);
695696 simd_float vZfUpdate = simdf32_add (
696- simdf32_mul (vZmPrev, exp_go),
697- simdf32_mul (vZf, exp_ge)
697+ simdf32_mul (vZmPrev, simdf32_set ( exp_go) ),
698+ simdf32_mul (vZf, simdf32_set ( exp_ge) )
698699 );
699700 vZfUpdate = simdf32_div (vZfUpdate, vZmMaxRowBlock);
700701 simdf32_storeu (&zfBlock[j], vZfUpdate);
@@ -1186,4 +1187,4 @@ int fwbw(int argc, const char **argv, const Command &command) {
11861187 tdbr.close ();
11871188
11881189 return EXIT_SUCCESS;
1189- }
1190+ }
0 commit comments