From 6723d170171f4e2e0e8def0f6ba6090c071521b8 Mon Sep 17 00:00:00 2001 From: gsc74 Date: Mon, 14 Apr 2025 11:24:04 +0530 Subject: [PATCH 1/5] Added AVX512 intrinsics-based Ungapped SW alignent kernel --- CMakeLists.txt | 13 +- lib/simd/simd.h | 8 +- src/alignment/StripedSmithWaterman.cpp | 266 +++++++++++++++++++------ src/alignment/StripedSmithWaterman.h | 19 +- 4 files changed, 241 insertions(+), 65 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c14b98f3c..36ce7605e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,7 +64,14 @@ set(MMSEQS_CXX_FLAGS "-fsigned-char") # SIMD instruction sets support set(MMSEQS_ARCH "") -if (HAVE_AVX2) +if (HAVE_AVX512) + if (CMAKE_COMPILER_IS_CLANG) + set(MMSEQS_ARCH "${MMSEQS_ARCH} -mavx512bw -mcx16") + else () + set(MMSEQS_ARCH "${MMSEQS_ARCH} -mavx512bw -mcx16 -Wa,-q") + endif () + set(X64 1 CACHE INTERNAL "") +elseif (HAVE_AVX2) if (CMAKE_COMPILER_IS_CLANG) set(MMSEQS_ARCH "${MMSEQS_ARCH} -mavx2 -mcx16") else () @@ -182,10 +189,10 @@ if (PPC64 OR SPARC OR ZARCH) endif () set(MMSEQS_C_FLAGS "${MMSEQS_CXX_FLAGS}") -set(MMSEQS_CXX_FLAGS "${MMSEQS_CXX_FLAGS} -std=c++1y") +set(MMSEQS_CXX_FLAGS "${MMSEQS_CXX_FLAGS} -std=c++17") # Compiler-specific features if (CMAKE_COMPILER_IS_CLANG AND (NOT EMSCRIPTEN)) - set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++11") + set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++17") set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++") set(MMSEQS_CXX_FLAGS "${MMSEQS_CXX_FLAGS} -stdlib=libc++") endif () diff --git a/lib/simd/simd.h b/lib/simd/simd.h index 77eaf5101..7c9f277c3 100644 --- a/lib/simd/simd.h +++ b/lib/simd/simd.h @@ -46,7 +46,11 @@ //#define AVX512 //#endif -#if defined(AVX512) || defined(SIMDE_X86_AVX2_NATIVE) +#if defined(SIMDE_X86_AVX512BW_NATIVE) +#define AVX512BW +#endif + +#if defined(SIMDE_X86_AVX512BW_NATIVE) || defined(AVX512) || defined(SIMDE_X86_AVX2_NATIVE) #define AVX2 #endif @@ -694,4 +698,4 @@ inline float ScalarProd20(const float* qi, const float* tj) { // + tj[18] * qi[18] + tj[19] * qi[19]; } -#endif //SIMD_H +#endif //SIMD_H \ No newline at end of file diff --git a/src/alignment/StripedSmithWaterman.cpp b/src/alignment/StripedSmithWaterman.cpp index 29ca26def..4be90ff0d 100644 --- a/src/alignment/StripedSmithWaterman.cpp +++ b/src/alignment/StripedSmithWaterman.cpp @@ -42,6 +42,10 @@ SmithWaterman::SmithWaterman(size_t maxSequenceLength, int aaSize, bool aaBiasCo segSize = segmentSize; vHStore = (simd_int*) mem_align(ALIGN_INT, segSize * sizeof(simd_int)); vHLoad = (simd_int*) mem_align(ALIGN_INT, segSize * sizeof(simd_int)); +#ifdef AVX512BW + vHStoreBW = (__m512i*) mem_align(AVX512_ALIGN_INT, segSize * sizeof(__m512i)); + vHLoadBW = (__m512i*) mem_align(AVX512_ALIGN_INT, segSize * sizeof(__m512i)); +#endif vE = (simd_int*) mem_align(ALIGN_INT, segSize * sizeof(simd_int)); vHmax = (simd_int*) mem_align(ALIGN_INT, segSize * sizeof(simd_int)); @@ -56,6 +60,9 @@ SmithWaterman::SmithWaterman(size_t maxSequenceLength, int aaSize, bool aaBiasCo profile = new s_profile(); // query profile profile->profile_byte = (simd_int*)mem_align(ALIGN_INT, aaSize * segSize * sizeof(simd_int)); +#ifdef AVX512BW + profile->profile_byteBW = (__m512i*)mem_align(AVX512_ALIGN_INT, aaSize * segSize * sizeof(__m512i)); +#endif profile->profile_word = (simd_int*)mem_align(ALIGN_INT, aaSize * segSize * sizeof(simd_int)); profile->profile_rev_byte = (simd_int*)mem_align(ALIGN_INT, aaSize * segSize * sizeof(simd_int)); profile->profile_rev_word = (simd_int*)mem_align(ALIGN_INT, aaSize * segSize * sizeof(simd_int)); @@ -111,10 +118,17 @@ SmithWaterman::SmithWaterman(size_t maxSequenceLength, int aaSize, bool aaBiasCo SmithWaterman::~SmithWaterman(){ free(vHStore); free(vHLoad); +#ifdef AVX512BW + free(vHStoreBW); + free(vHLoadBW); +#endif free(vE); free(vHmax); free(target_profile_byte); free(profile->profile_byte); +#ifdef AVX512BW + free(profile->profile_byteBW); +#endif free(profile->profile_word); free(profile->profile_rev_byte); free(profile->profile_rev_word); @@ -157,6 +171,37 @@ SmithWaterman::~SmithWaterman(){ delete profile; } +#ifdef AVX512BW + /* Generate query profile rearrange query sequence & calculate the weight of match/mismatch. */ + template + void SmithWaterman::createQueryProfile_AVX512BW(__m512i *profile, const int8_t *query_sequence, const int8_t * composition_bias, const int8_t *mat, + const int32_t query_length, const int32_t aaSize, uint8_t bias, const int32_t offset, const int32_t entryLength) + { + const int32_t segLen = (query_length + Elements - 1) / Elements; + T* t = (T*) profile; + for (int32_t nt = 0; LIKELY(nt < aaSize); nt++) { + for (int32_t i = 0; i < segLen; i++) { + int32_t j = i; + for (size_t segNum = 0; LIKELY(segNum < Elements) ; segNum++) { + // if will be optmized out by compiler + if(type == SUBSTITUTIONMATRIX) { // substitution score for query_seq constrained by nt + // query_sequence starts from 1 to n + *t++ = ( j >= query_length) ? bias : mat[nt * aaSize + query_sequence[j + offset ]] + composition_bias[j + offset] + bias; // mat[nt][q[j]] mat eq 20*20 + } if(type == PROFILE) { + // profile starts by 0 + // *t++ = (j >= query_length) ? bias : (mat[nt * entryLength + (j + (offset - 1))] + bias); //mat eq L*20 // mat[nt][j] + *t++ = (j >= query_length) ? bias : mat[nt * entryLength + j + offset] + bias; + // // profile starts by 0 // TODO: offset? + // *t++ = (j >= query_length) ? bias : mat[nt * entryLength + j + offset] + bias; //mat eq L*20 // mat[nt][j] + // printf("(%1d, %1d) ", j , *(t-1)); + } + j += segLen; + } + } + } + } +#endif + /* Generate query profile rearrange query sequence & calculate the weight of match/mismatch. */ template @@ -214,6 +259,21 @@ void SmithWaterman::createTargetProfile(simd_int *profile, const int8_t *mat, co } } } + + +#ifdef AVX512BW + template + void SmithWaterman::updateQueryProfile_AVX512BW(__m512i *profile, const int32_t query_length, const int32_t aaSize, + uint8_t shift) { + const int32_t segLen = (query_length + Elements - 1) / Elements; + T* t = (T*) profile; + for (uint32_t i = 0; i < segLen * Elements * aaSize; i++) { + t[i] += shift; + } + } +#endif + + template void SmithWaterman::updateQueryProfile(simd_int *profile, const int32_t query_length, const int32_t aaSize, uint8_t shift) { @@ -347,6 +407,9 @@ s_align SmithWaterman::ssw_align_private ( if (db_bias > profile->bias) { uint8_t shift = abs(profile->bias - db_bias); updateQueryProfile(profile->profile_byte, profile->query_length, profile->alphabetSize, shift); + #ifdef AVX512BW + updateQueryProfile_AVX512BW(profile->profile_byteBW, profile->query_length, profile->alphabetSize, shift); + #endif } profile->bias = std::max(db_bias, profile->bias); createTargetProfile(db_profile_byte, db_mat, db_length, profile->alphabetSize - 1, profile->bias); @@ -1307,6 +1370,10 @@ void SmithWaterman::ssw_init(const Sequence* q, // create byte version of profiles createQueryProfile(profile->profile_byte, profile->query_sequence, NULL, profile->mat, q->L, alphabetSize, profile->bias, 0, q->L); + #ifdef AVX512BW + createQueryProfile_AVX512BW(profile->profile_byteBW, profile->query_sequence, NULL, + profile->mat, q->L, alphabetSize, profile->bias, 0, q->L); + #endif createConsensProfile(profile->consens_byte, profile->query_consens_sequence, q->L, 0); #ifdef GAP_POS_SCORING createGapProfile(profile->profile_gDelOpen_byte, profile->profile_gDelClose_byte, @@ -1330,7 +1397,10 @@ void SmithWaterman::ssw_init(const Sequence* q, } else { // create byte version of query profile createQueryProfile(profile->profile_byte, profile->query_sequence, profile->composition_bias, profile->mat, q->L, alphabetSize, bias, 0, 0); - // create word version of query profile + #ifdef AVX512BW + createQueryProfile_AVX512BW(profile->profile_byteBW, profile->query_sequence, profile->composition_bias, profile->mat, q->L, alphabetSize, bias, 0, 0); + #endif + // create word version of query profile createQueryProfile(profile->profile_word, profile->query_sequence, profile->composition_bias, profile->mat, q->L, alphabetSize, 0, 0, 0); // create linear version of word profile for (int32_t i = 0; i< alphabetSize; i++) { @@ -1735,63 +1805,141 @@ inline F simd_hmax(const F * in, unsigned int n) { return current; } -int SmithWaterman::ungapped_alignment(const unsigned char *db_sequence, int32_t db_length) { -#define SWAP(tmp, arg1, arg2) tmp = arg1; arg1 = arg2; arg2 = tmp; - - int i; // position in query bands (0,..,W-1) - int j; // position in db sequence (0,..,dbseq_length-1) - int element_count = (VECSIZE_INT * 4); - const int W = (profile->query_length + (element_count - 1)) / - element_count; // width of bands in query and score matrix = hochgerundetes LQ/16 - - simd_int *p; - simd_int S; // 16 unsigned bytes holding S(b*W+i,j) (b=0,..,15) - simd_int Smax = simdi_setzero(); - simd_int Soffset; // all scores in query profile are shifted up by Soffset to obtain pos values - simd_int *s_prev, *s_curr; // pointers to Score(i-1,j-1) and Score(i,j), resp. - simd_int *qji; // query profile score in row j (for residue x_j) - simd_int *s_prev_it, *s_curr_it; - simd_int *query_profile_it = (simd_int *) profile->profile_byte; - - // Load the score offset to all 16 unsigned byte elements of Soffset - Soffset = simdi8_set(profile->bias); - s_curr = vHStore; - s_prev = vHLoad; - - memset(vHStore, 0, W * sizeof(simd_int)); - memset(vHLoad, 0, W * sizeof(simd_int)); - - for (j = 0; j < db_length; ++j) // loop over db sequence positions - { - - // Get address of query scores for row j - qji = query_profile_it + db_sequence[j] * W; - - // Load the next S value - S = simdi_load(s_curr + W - 1); - S = simdi8_shiftl(S, 1); - - // Swap s_prev and s_curr, smax_prev and smax_curr - SWAP(p, s_prev, s_curr); - - s_curr_it = s_curr; - s_prev_it = s_prev; - - for (i = 0; i < W; ++i) // loop over query band positions - { - // Saturated addition and subtraction to score S(i,j) - S = simdui8_adds(S, *(qji++)); // S(i,j) = S(i-1,j-1) + (q(i,x_j) + Soffset) - S = simdui8_subs(S, Soffset); // S(i,j) = max(0, S(i,j) - Soffset) - simdi_store(s_curr_it++, S); // store S to s_curr[i] - Smax = simdui8_max(Smax, S); // Smax(i,j) = max(Smax(i,j), S(i,j)) - - // Load the next S and Smax values - S = simdi_load(s_prev_it++); - } - } - int score = simd_hmax((unsigned char *) &Smax, element_count); +#ifdef AVX512BW + template // https://stackoverflow.com/questions/58322652/emulating-shifts-on-64-bytes-with-avx-512 + __m512i shift_right(__m512i a, __m512i carry = _mm512_setzero_si512()) { + static_assert(0 <= N && N <= 64); + if constexpr(N == 0) return a; + if constexpr(N == 64) return carry; + if constexpr(N % 4 == 0) return _mm512_alignr_epi32(carry, a, N / 4); + else { + __m512i a0 = shift_right<(N / 16 + 1) * 16>(a, carry); // 16, 32, 48, 64 + __m512i a1 = shift_right<(N / 16) * 16>(a, carry); // 0, 16, 32, 48 + return _mm512_alignr_epi8(a0, a1, N % 16); + } + } - /* return largest score */ - return score; -#undef SWAP + template +__m512i shift_right_AVX512(__m512i a, __m512i carry = _mm512_setzero_si512()) { + return shift_right<64 - N>(carry, a); } + + int SmithWaterman::ungapped_alignment(const unsigned char *db_sequence, int32_t db_length) { + #define SWAP(tmp, arg1, arg2) tmp = arg1; arg1 = arg2; arg2 = tmp; + + int i; // position in query bands (0,..,W-1) + int j; // position in db sequence (0,..,dbseq_length-1) + int element_count = (AVX512_VECSIZE_INT * 4); // 8 bit datatype + const int W = (profile->query_length + (element_count - 1)) / + element_count; // width of bands in query and score matrix = hochgerundetes LQ/16 + + // simd_int *p; + __m512i *p; + __m512i S; // 16 unsigned bytes holding S(b*W+i,j) (b=0,..,15) + __m512i Smax = _mm512_setzero_si512(); + __m512i Soffset; // all scores in query profile are shifted up by Soffset to obtain pos values + __m512i *s_prev, *s_curr; // pointers to Score(i-1,j-1) and Score(i,j), resp. + __m512i *qji; // query profile score in row j (for residue x_j) + __m512i *s_prev_it, *s_curr_it; + __m512i *query_profile_it = (__m512i *) profile->profile_byteBW; + + // Load the score offset to all 16 unsigned byte elements of Soffset + Soffset = _mm512_set1_epi8(profile->bias); + s_curr = vHStoreBW; + s_prev = vHLoadBW; + + memset(vHStoreBW, 0, W * sizeof(__m512i)); + memset(vHLoadBW, 0, W * sizeof(__m512i)); + + for (j = 0; j < db_length; ++j) // loop over db sequence positions + { + // Get address of query scores for row j + qji = query_profile_it + db_sequence[j] * W; + + // Load the next S value + S = _mm512_load_si512(s_curr + W - 1); + S = shift_right_AVX512<1>(S); + + // Swap s_prev and s_curr, smax_prev and smax_curr + SWAP(p, s_prev, s_curr); + + s_curr_it = s_curr; + s_prev_it = s_prev; + + for (i = 0; i < W; ++i) // loop over query band positions + { + S = _mm512_adds_epu8(S, *(qji++)); + S = _mm512_subs_epu8(S, Soffset); + _mm512_store_si512(s_curr_it++, S); + Smax = _mm512_max_epu8(Smax, S); + + S = _mm512_load_si512(s_prev_it++); + } + } + int score = simd_hmax((unsigned char *) &Smax, element_count); // done + + return score; + #undef SWAP + } +#else + int SmithWaterman::ungapped_alignment(const unsigned char *db_sequence, int32_t db_length) { + #define SWAP(tmp, arg1, arg2) tmp = arg1; arg1 = arg2; arg2 = tmp; + + int i; // position in query bands (0,..,W-1) + int j; // position in db sequence (0,..,dbseq_length-1) + int element_count = (VECSIZE_INT * 4); + const int W = (profile->query_length + (element_count - 1)) / + element_count; // width of bands in query and score matrix = hochgerundetes LQ/16 + + simd_int *p; + simd_int S; // 16 unsigned bytes holding S(b*W+i,j) (b=0,..,15) + simd_int Smax = simdi_setzero(); + simd_int Soffset; // all scores in query profile are shifted up by Soffset to obtain pos values + simd_int *s_prev, *s_curr; // pointers to Score(i-1,j-1) and Score(i,j), resp. + simd_int *qji; // query profile score in row j (for residue x_j) + simd_int *s_prev_it, *s_curr_it; + simd_int *query_profile_it = (simd_int *) profile->profile_byte; + + // Load the score offset to all 16 unsigned byte elements of Soffset + Soffset = simdi8_set(profile->bias); + s_curr = vHStore; + s_prev = vHLoad; + + memset(vHStore, 0, W * sizeof(simd_int)); + memset(vHStore, 0, W * sizeof(simd_int)); + + for (j = 0; j < db_length; ++j) // loop over db sequence positions + { + + // Get address of query scores for row j + qji = query_profile_it + db_sequence[j] * W; + + // Load the next S value + S = simdi_load(s_curr + W - 1); + S = simdi8_shiftl(S, 1); + + // Swap s_prev and s_curr, smax_prev and smax_curr + SWAP(p, s_prev, s_curr); + + s_curr_it = s_curr; + s_prev_it = s_prev; + + for (i = 0; i < W; ++i) // loop over query band positions + { + // Saturated addition and subtraction to score S(i,j) + S = simdui8_adds(S, *(qji++)); // S(i,j) = S(i-1,j-1) + (q(i,x_j) + Soffset) + S = simdui8_subs(S, Soffset); // S(i,j) = max(0, S(i,j) - Soffset) + simdi_store(s_curr_it++, S); // store S to s_curr[i] + Smax = simdui8_max(Smax, S); // Smax(i,j) = max(Smax(i,j), S(i,j)) + + // Load the next S and Smax values + S = simdi_load(s_prev_it++); + } + } + int score = simd_hmax((unsigned char *) &Smax, element_count); // done + + /* return largest score */ + return score; + #undef SWAP + } +#endif \ No newline at end of file diff --git a/src/alignment/StripedSmithWaterman.h b/src/alignment/StripedSmithWaterman.h index d4d3aa555..fa49db38c 100644 --- a/src/alignment/StripedSmithWaterman.h +++ b/src/alignment/StripedSmithWaterman.h @@ -79,6 +79,9 @@ class SmithWaterman{ // TODO: private or public? struct s_profile{ simd_int* profile_byte; // 0: none + #ifdef AVX512BW + __m512i* profile_byteBW; // 0: none + #endif simd_int* profile_word; // 0: none simd_int* profile_rev_byte; // 0: none simd_int* profile_rev_word; // 0: none @@ -279,6 +282,10 @@ class SmithWaterman{ simd_int* vHStore; simd_int* vHLoad; +#ifdef AVX512BW + __m512i* vHStoreBW; + __m512i* vHLoadBW; +#endif simd_int* vE; simd_int* vHmax; uint8_t * maxColumn; @@ -387,6 +394,11 @@ class SmithWaterman{ size_t query_id; size_t target_id; + #ifdef AVX512BW + template + void createQueryProfile_AVX512BW(__m512i *profile, const int8_t *query_sequence, const int8_t * composition_bias, const int8_t *mat, + const int32_t query_length, const int32_t aaSize, uint8_t bias, const int32_t offset, const int32_t entryLength); + #endif template void createQueryProfile(simd_int *profile, const int8_t *query_sequence, const int8_t * composition_bias, @@ -403,6 +415,11 @@ class SmithWaterman{ void createTargetProfile(simd_int *profile, const int8_t *mat, const int target_length, const int32_t aaSize, uint8_t bias); +#ifdef AVX512BW + template + void updateQueryProfile_AVX512BW(__m512i *profile, const int32_t query_length, const int32_t aaSize, uint8_t shift); +#endif + template void updateQueryProfile(simd_int *profile, const int32_t query_length, const int32_t aaSize, uint8_t shift); @@ -410,4 +427,4 @@ class SmithWaterman{ void reverseMat(int8_t *rev_mat, const int8_t *mat, const int32_t aaSize, const int32_t target_length); }; -#endif /* SMITH_WATERMAN_SSE2_H */ +#endif /* SMITH_WATERMAN_SSE2_H */ \ No newline at end of file From a9b42f1311893930e6c8aadf1386037461c2426a Mon Sep 17 00:00:00 2001 From: gsc74 Date: Mon, 14 Apr 2025 11:56:31 +0530 Subject: [PATCH 2/5] removed -std=c++17 for compatibility with GCC-4.9 --- CMakeLists.txt | 4 +-- src/alignment/StripedSmithWaterman.cpp | 39 +++++++++++++++----------- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 36ce7605e..84c1d3579 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -189,10 +189,10 @@ if (PPC64 OR SPARC OR ZARCH) endif () set(MMSEQS_C_FLAGS "${MMSEQS_CXX_FLAGS}") -set(MMSEQS_CXX_FLAGS "${MMSEQS_CXX_FLAGS} -std=c++17") +set(MMSEQS_CXX_FLAGS "${MMSEQS_CXX_FLAGS} -std=c++1y") # Compiler-specific features if (CMAKE_COMPILER_IS_CLANG AND (NOT EMSCRIPTEN)) - set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++17") + set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++11") set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++") set(MMSEQS_CXX_FLAGS "${MMSEQS_CXX_FLAGS} -stdlib=libc++") endif () diff --git a/src/alignment/StripedSmithWaterman.cpp b/src/alignment/StripedSmithWaterman.cpp index 4be90ff0d..1339c1516 100644 --- a/src/alignment/StripedSmithWaterman.cpp +++ b/src/alignment/StripedSmithWaterman.cpp @@ -1806,24 +1806,29 @@ inline F simd_hmax(const F * in, unsigned int n) { } #ifdef AVX512BW - template // https://stackoverflow.com/questions/58322652/emulating-shifts-on-64-bytes-with-avx-512 - __m512i shift_right(__m512i a, __m512i carry = _mm512_setzero_si512()) { - static_assert(0 <= N && N <= 64); - if constexpr(N == 0) return a; - if constexpr(N == 64) return carry; - if constexpr(N % 4 == 0) return _mm512_alignr_epi32(carry, a, N / 4); - else { - __m512i a0 = shift_right<(N / 16 + 1) * 16>(a, carry); // 16, 32, 48, 64 - __m512i a1 = shift_right<(N / 16) * 16>(a, carry); // 0, 16, 32, 48 - return _mm512_alignr_epi8(a0, a1, N % 16); - } + __m512i shift_right_1_byte(__m512i a) { + __m128i zero = _mm_setzero_si128(); + __m128i lane0 = _mm512_extracti32x4_epi32(a, 0); + __m128i lane1 = _mm512_extracti32x4_epi32(a, 1); + __m128i lane2 = _mm512_extracti32x4_epi32(a, 2); + __m128i lane3 = _mm512_extracti32x4_epi32(a, 3); + + // For lane0: result[0]=0, result[1..15]=lane0[0..14] + __m128i res0 = _mm_alignr_epi8(lane0, zero, 15); + // For lane1: result[0]=lane0[15], result[1..15]=lane1[0..14] + __m128i res1 = _mm_alignr_epi8(lane1, lane0, 15); + // For lane2: result[0]=lane1[15], result[1..15]=lane2[0..14] + __m128i res2 = _mm_alignr_epi8(lane2, lane1, 15); + // For lane3: result[0]=lane2[15], result[1..15]=lane3[0..14] + __m128i res3 = _mm_alignr_epi8(lane3, lane2, 15); + + __m512i result = _mm512_castsi128_si512(res0); + result = _mm512_inserti32x4(result, res1, 1); + result = _mm512_inserti32x4(result, res2, 2); + result = _mm512_inserti32x4(result, res3, 3); + return result; } - template -__m512i shift_right_AVX512(__m512i a, __m512i carry = _mm512_setzero_si512()) { - return shift_right<64 - N>(carry, a); -} - int SmithWaterman::ungapped_alignment(const unsigned char *db_sequence, int32_t db_length) { #define SWAP(tmp, arg1, arg2) tmp = arg1; arg1 = arg2; arg2 = tmp; @@ -1858,7 +1863,7 @@ __m512i shift_right_AVX512(__m512i a, __m512i carry = _mm512_setzero_si512()) { // Load the next S value S = _mm512_load_si512(s_curr + W - 1); - S = shift_right_AVX512<1>(S); + S = shift_right_1_byte(S); // Swap s_prev and s_curr, smax_prev and smax_curr SWAP(p, s_prev, s_curr); From 0be4b581829d8b4267afe188573c7035982d48bf Mon Sep 17 00:00:00 2001 From: gsc74 Date: Sun, 25 May 2025 19:18:52 +0530 Subject: [PATCH 3/5] Added cleaner code --- src/alignment/StripedSmithWaterman.cpp | 234 +++++++++---------------- 1 file changed, 87 insertions(+), 147 deletions(-) diff --git a/src/alignment/StripedSmithWaterman.cpp b/src/alignment/StripedSmithWaterman.cpp index 1339c1516..3068a53ba 100644 --- a/src/alignment/StripedSmithWaterman.cpp +++ b/src/alignment/StripedSmithWaterman.cpp @@ -22,6 +22,10 @@ /* Written by Michael Farrar, 2006 (alignment), Mengyao Zhao (SSW Library) and Martin Steinegger (change structure add aa composition, profile and AVX2 support). Please send bug reports and/or suggestions to martin.steinegger@snu.ac.kr. + + AVX512BW support + Modified Copyright (C) 2025 Intel Corporation + Contacts: Ghanshyam Chandra */ #include "Parameters.h" #include "StripedSmithWaterman.h" @@ -42,10 +46,6 @@ SmithWaterman::SmithWaterman(size_t maxSequenceLength, int aaSize, bool aaBiasCo segSize = segmentSize; vHStore = (simd_int*) mem_align(ALIGN_INT, segSize * sizeof(simd_int)); vHLoad = (simd_int*) mem_align(ALIGN_INT, segSize * sizeof(simd_int)); -#ifdef AVX512BW - vHStoreBW = (__m512i*) mem_align(AVX512_ALIGN_INT, segSize * sizeof(__m512i)); - vHLoadBW = (__m512i*) mem_align(AVX512_ALIGN_INT, segSize * sizeof(__m512i)); -#endif vE = (simd_int*) mem_align(ALIGN_INT, segSize * sizeof(simd_int)); vHmax = (simd_int*) mem_align(ALIGN_INT, segSize * sizeof(simd_int)); @@ -118,10 +118,6 @@ SmithWaterman::SmithWaterman(size_t maxSequenceLength, int aaSize, bool aaBiasCo SmithWaterman::~SmithWaterman(){ free(vHStore); free(vHLoad); -#ifdef AVX512BW - free(vHStoreBW); - free(vHLoadBW); -#endif free(vE); free(vHmax); free(target_profile_byte); @@ -1806,145 +1802,89 @@ inline F simd_hmax(const F * in, unsigned int n) { } #ifdef AVX512BW - __m512i shift_right_1_byte(__m512i a) { - __m128i zero = _mm_setzero_si128(); - __m128i lane0 = _mm512_extracti32x4_epi32(a, 0); - __m128i lane1 = _mm512_extracti32x4_epi32(a, 1); - __m128i lane2 = _mm512_extracti32x4_epi32(a, 2); - __m128i lane3 = _mm512_extracti32x4_epi32(a, 3); - - // For lane0: result[0]=0, result[1..15]=lane0[0..14] - __m128i res0 = _mm_alignr_epi8(lane0, zero, 15); - // For lane1: result[0]=lane0[15], result[1..15]=lane1[0..14] - __m128i res1 = _mm_alignr_epi8(lane1, lane0, 15); - // For lane2: result[0]=lane1[15], result[1..15]=lane2[0..14] - __m128i res2 = _mm_alignr_epi8(lane2, lane1, 15); - // For lane3: result[0]=lane2[15], result[1..15]=lane3[0..14] - __m128i res3 = _mm_alignr_epi8(lane3, lane2, 15); - - __m512i result = _mm512_castsi128_si512(res0); - result = _mm512_inserti32x4(result, res1, 1); - result = _mm512_inserti32x4(result, res2, 2); - result = _mm512_inserti32x4(result, res3, 3); - return result; - } +template +__m512i shift_right(__m512i a, __m512i carry = _mm512_setzero_si512()) { + static_assert(0 <= N && N <= 64); + if constexpr (N == 0) return a; + if constexpr (N == 64) return carry; + if constexpr (N % 4 == 0) return _mm512_alignr_epi32(carry, a, N / 4); + else { + __m512i a0 = shift_right<(N / 16 + 1) * 16>(a, carry); + __m512i a1 = shift_right<(N / 16) * 16>(a, carry); + return _mm512_alignr_epi8(a0, a1, N % 16); + } +} +template +__m512i shift_right_AVX512(__m512i a, __m512i carry = _mm512_setzero_si512()) { + return shift_right<64 - N>(carry, a); +} +#endif - int SmithWaterman::ungapped_alignment(const unsigned char *db_sequence, int32_t db_length) { - #define SWAP(tmp, arg1, arg2) tmp = arg1; arg1 = arg2; arg2 = tmp; - - int i; // position in query bands (0,..,W-1) - int j; // position in db sequence (0,..,dbseq_length-1) - int element_count = (AVX512_VECSIZE_INT * 4); // 8 bit datatype - const int W = (profile->query_length + (element_count - 1)) / - element_count; // width of bands in query and score matrix = hochgerundetes LQ/16 - - // simd_int *p; - __m512i *p; - __m512i S; // 16 unsigned bytes holding S(b*W+i,j) (b=0,..,15) - __m512i Smax = _mm512_setzero_si512(); - __m512i Soffset; // all scores in query profile are shifted up by Soffset to obtain pos values - __m512i *s_prev, *s_curr; // pointers to Score(i-1,j-1) and Score(i,j), resp. - __m512i *qji; // query profile score in row j (for residue x_j) - __m512i *s_prev_it, *s_curr_it; - __m512i *query_profile_it = (__m512i *) profile->profile_byteBW; - - // Load the score offset to all 16 unsigned byte elements of Soffset - Soffset = _mm512_set1_epi8(profile->bias); - s_curr = vHStoreBW; - s_prev = vHLoadBW; - - memset(vHStoreBW, 0, W * sizeof(__m512i)); - memset(vHLoadBW, 0, W * sizeof(__m512i)); - - for (j = 0; j < db_length; ++j) // loop over db sequence positions - { - // Get address of query scores for row j - qji = query_profile_it + db_sequence[j] * W; - - // Load the next S value - S = _mm512_load_si512(s_curr + W - 1); - S = shift_right_1_byte(S); - - // Swap s_prev and s_curr, smax_prev and smax_curr - SWAP(p, s_prev, s_curr); - - s_curr_it = s_curr; - s_prev_it = s_prev; - - for (i = 0; i < W; ++i) // loop over query band positions - { - S = _mm512_adds_epu8(S, *(qji++)); - S = _mm512_subs_epu8(S, Soffset); - _mm512_store_si512(s_curr_it++, S); - Smax = _mm512_max_epu8(Smax, S); +int SmithWaterman::ungapped_alignment(const unsigned char *db_sequence, int32_t db_length) { + // AVX512BW + #ifdef AVX512BW + int element_count = AVX512_VECSIZE_INT * 4; + const int W = (profile->query_length + (element_count - 1)) / element_count; + __m512i *p; + __m512i S, Smax = _mm512_setzero_si512(); + __m512i *s_prev = (__m512i*) mem_align(AVX512_ALIGN_INT, segSize * sizeof(__m512i)); + __m512i *s_curr = (__m512i*) mem_align(AVX512_ALIGN_INT, segSize * sizeof(__m512i)); + memset(s_prev, 0, W * sizeof(__m512i)); + memset(s_curr, 0, W * sizeof(__m512i)); + __m512i *qji, *s_prev_it, *s_curr_it; + __m512i *query_profile_it = (__m512i*)profile->profile_byteBW; + __m512i Soffset = _mm512_set1_epi8(profile->bias); + #else // AVX2 or SSE2 + int element_count = VECSIZE_INT * 4; + const int W = (profile->query_length + (element_count - 1)) / element_count; + simd_int *p; + simd_int S, Smax = simdi_setzero(); + simd_int *s_prev = vHLoad, *s_curr = vHStore; + memset(vHLoad, 0, W * sizeof(simd_int)); + memset(vHStore, 0, W * sizeof(simd_int)); + simd_int *qji, *s_prev_it, *s_curr_it; + simd_int *query_profile_it = (simd_int*)profile->profile_byte; + simd_int Soffset = simdi8_set(profile->bias); + #endif - S = _mm512_load_si512(s_prev_it++); - } - } - int score = simd_hmax((unsigned char *) &Smax, element_count); // done + // main Smith-Waterman loop + for (int j = 0; j < db_length; ++j) { + qji = query_profile_it + db_sequence[j] * W; - return score; - #undef SWAP - } -#else - int SmithWaterman::ungapped_alignment(const unsigned char *db_sequence, int32_t db_length) { - #define SWAP(tmp, arg1, arg2) tmp = arg1; arg1 = arg2; arg2 = tmp; - - int i; // position in query bands (0,..,W-1) - int j; // position in db sequence (0,..,dbseq_length-1) - int element_count = (VECSIZE_INT * 4); - const int W = (profile->query_length + (element_count - 1)) / - element_count; // width of bands in query and score matrix = hochgerundetes LQ/16 - - simd_int *p; - simd_int S; // 16 unsigned bytes holding S(b*W+i,j) (b=0,..,15) - simd_int Smax = simdi_setzero(); - simd_int Soffset; // all scores in query profile are shifted up by Soffset to obtain pos values - simd_int *s_prev, *s_curr; // pointers to Score(i-1,j-1) and Score(i,j), resp. - simd_int *qji; // query profile score in row j (for residue x_j) - simd_int *s_prev_it, *s_curr_it; - simd_int *query_profile_it = (simd_int *) profile->profile_byte; - - // Load the score offset to all 16 unsigned byte elements of Soffset - Soffset = simdi8_set(profile->bias); - s_curr = vHStore; - s_prev = vHLoad; - - memset(vHStore, 0, W * sizeof(simd_int)); - memset(vHStore, 0, W * sizeof(simd_int)); - - for (j = 0; j < db_length; ++j) // loop over db sequence positions - { - - // Get address of query scores for row j - qji = query_profile_it + db_sequence[j] * W; - - // Load the next S value - S = simdi_load(s_curr + W - 1); - S = simdi8_shiftl(S, 1); - - // Swap s_prev and s_curr, smax_prev and smax_curr - SWAP(p, s_prev, s_curr); - - s_curr_it = s_curr; - s_prev_it = s_prev; - - for (i = 0; i < W; ++i) // loop over query band positions - { - // Saturated addition and subtraction to score S(i,j) - S = simdui8_adds(S, *(qji++)); // S(i,j) = S(i-1,j-1) + (q(i,x_j) + Soffset) - S = simdui8_subs(S, Soffset); // S(i,j) = max(0, S(i,j) - Soffset) - simdi_store(s_curr_it++, S); // store S to s_curr[i] - Smax = simdui8_max(Smax, S); // Smax(i,j) = max(Smax(i,j), S(i,j)) - - // Load the next S and Smax values - S = simdi_load(s_prev_it++); - } - } - int score = simd_hmax((unsigned char *) &Smax, element_count); // done - - /* return largest score */ - return score; - #undef SWAP - } -#endif \ No newline at end of file + #ifdef AVX512BW + S = _mm512_load_si512(s_curr + W - 1); + S = shift_right_AVX512<1>(S); + #else + S = simdi_load(s_curr + W - 1); + S = simdi8_shiftl(S, 1); + #endif + + // swap the buffers + std::swap(s_prev, s_curr); + + s_curr_it = s_curr; + s_prev_it = s_prev; + + for (int i = 0; i < W; ++i) { + #ifdef AVX512BW + S = _mm512_adds_epu8(S, *qji++); + S = _mm512_subs_epu8(S, Soffset); + _mm512_store_si512(s_curr_it++, S); + Smax = _mm512_max_epu8(Smax, S); + S = _mm512_load_si512(s_prev_it++); + #else + S = simdui8_adds(S, *qji++); + S = simdui8_subs(S, Soffset); + simdi_store(s_curr_it++, S); + Smax = simdui8_max(Smax, S); + S = simdi_load(s_prev_it++); + #endif + } + } + + int score = simd_hmax((unsigned char*)&Smax, element_count); + + free(s_curr); free(s_prev); + + return score; +} \ No newline at end of file From 52b22021add6377e4bd663b3c53255ce1766db7d Mon Sep 17 00:00:00 2001 From: gsc74 Date: Sun, 25 May 2025 19:27:56 +0530 Subject: [PATCH 4/5] Reuse memory for AVX2 --- src/alignment/StripedSmithWaterman.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/alignment/StripedSmithWaterman.cpp b/src/alignment/StripedSmithWaterman.cpp index 3068a53ba..07d03d365 100644 --- a/src/alignment/StripedSmithWaterman.cpp +++ b/src/alignment/StripedSmithWaterman.cpp @@ -1884,7 +1884,10 @@ int SmithWaterman::ungapped_alignment(const unsigned char *db_sequence, int32_t int score = simd_hmax((unsigned char*)&Smax, element_count); - free(s_curr); free(s_prev); + + #ifdef AVX512BW + free(s_curr); free(s_prev); + #endif return score; } \ No newline at end of file From ddf5dc0e63464751648016477e4a81c9914b9f20 Mon Sep 17 00:00:00 2001 From: gsc74 Date: Sun, 25 May 2025 19:42:50 +0530 Subject: [PATCH 5/5] removed unused variable --- src/alignment/StripedSmithWaterman.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/alignment/StripedSmithWaterman.cpp b/src/alignment/StripedSmithWaterman.cpp index 07d03d365..7807dc380 100644 --- a/src/alignment/StripedSmithWaterman.cpp +++ b/src/alignment/StripedSmithWaterman.cpp @@ -1825,7 +1825,6 @@ int SmithWaterman::ungapped_alignment(const unsigned char *db_sequence, int32_t #ifdef AVX512BW int element_count = AVX512_VECSIZE_INT * 4; const int W = (profile->query_length + (element_count - 1)) / element_count; - __m512i *p; __m512i S, Smax = _mm512_setzero_si512(); __m512i *s_prev = (__m512i*) mem_align(AVX512_ALIGN_INT, segSize * sizeof(__m512i)); __m512i *s_curr = (__m512i*) mem_align(AVX512_ALIGN_INT, segSize * sizeof(__m512i)); @@ -1837,7 +1836,6 @@ int SmithWaterman::ungapped_alignment(const unsigned char *db_sequence, int32_t #else // AVX2 or SSE2 int element_count = VECSIZE_INT * 4; const int W = (profile->query_length + (element_count - 1)) / element_count; - simd_int *p; simd_int S, Smax = simdi_setzero(); simd_int *s_prev = vHLoad, *s_curr = vHStore; memset(vHLoad, 0, W * sizeof(simd_int));