88 * - `sz::openmp::levenshtein_distances` & `sz::openmp::levenshtein_distances_utf8` for Levenshtein edit-distances.
99 * - `sz::openmp::needleman_wunsch_score` for weighted Needleman-Wunsch global alignment.
1010 *
11- * Those are mostly providing specialized overloads of the @b `sz::openmp::global_score_kernel` template.
12- *
11+ * Those are mostly providing specialized overloads of the @b `sz::openmp::score_diagonally` template.
1312 */
1413#ifndef STRINGZILLA_SIMILARITY_HPP_
1514#define STRINGZILLA_SIMILARITY_HPP_
@@ -25,23 +24,26 @@ struct uniform_substitution_cost_t {
2524};
2625
2726/* *
28- * @brief Levenshtein edit-distance computation using skewed diagonal order evaluation on CPU.
29- * Type-agnostic, unlike `sz_levenshtein_distance`, and annotated for OpenMP parallelization.
27+ * @brief Alignment Score and Edit Distance algorithm evaluating the Dynamic Programming matrix
28+ * @b three skewed (reverse) diagonals at a time on a CPU, leveraging OpenMP for parallelization.
29+ * @sa sz_levenshtein_distance, sz_levenshtein_distance_utf8, sz_needleman_wunsch_score
3030 *
3131 * @param[in] first The first string.
3232 * @param[in] first_length The length of the first string.
3333 * @param[in] second The second string.
3434 * @param[in] second_length The length of the second string.
3535 *
36- * There are smarter algorithms for computing the Levenshtein distance, but this one is extremely flexible.
36+ * There are smarter algorithms for computing the Levenshtein distance, mostly based on bit-level operations.
37+ * Those, however, don't generalize well to arbitrary length inputs or non-uniform substitution costs.
38+ * This algorithm provides a more flexible baseline implementation for future SIMD and GPGPU optimizations.
3739 */
3840template < //
3941 typename char_type_ = char , //
4042 typename distance_type_ = sz_size_t , //
4143 typename get_substitution_cost_ = uniform_substitution_cost_t , //
4244 sz_error_cost_t gap_cost_ = 1 //
4345 >
44- sz_status_t global_score_kernel ( //
46+ sz_status_t score_diagonally ( //
4547 char_type_ const *shorter, sz_size_t shorter_length, //
4648 char_type_ const *longer, sz_size_t longer_length, //
4749 get_substitution_cost_ &&get_substitution_cost, //
@@ -95,13 +97,16 @@ sz_status_t global_score_kernel( //
9597 for (sz_size_t offset_in_diagonal = 1 ; offset_in_diagonal + 1 < next_diagonal_length; ++offset_in_diagonal) {
9698 // ? Note that here we are still traversing both buffers in the same order,
9799 // ? because the shorter string has been reversed into `shorter_reversed`.
98- char_type_ shorter_char = shorter_reversed[shorter_length - next_diagonal_index + offset_in_diagonal - 1 ];
99- char_type_ longer_char = longer[offset_in_diagonal];
100+ char_type_ shorter_char = shorter_reversed[shorter_length - next_diagonal_index + offset_within_diagonal ];
101+ char_type_ longer_char = longer[offset_in_diagonal - 1 ];
100102 sz_error_cost_t cost_of_substitution = get_substitution_cost (shorter_char, longer_char);
101103 distance_type_ cost_if_substitution = previous_distances[offset_in_diagonal] + cost_of_substitution;
102- distance_type_ cost_if_deletion_or_insertion =
103- gap_cost_ +
104- sz_min_of_two (current_distances[offset_in_diagonal - 1 ], current_distances[offset_in_diagonal]);
104+ distance_type_ cost_if_deletion_or_insertion = //
105+ sz_min_of_two ( //
106+ current_distances[offset_in_diagonal - 1 ], //
107+ current_distances[offset_in_diagonal] //
108+ ) +
109+ gap_cost_;
105110 next_distances[offset_in_diagonal] = sz_min_of_two (cost_if_deletion_or_insertion, cost_if_substitution);
106111 }
107112 // Don't forget to populate the first row and the first column of the Levenshtein matrix.
@@ -116,19 +121,50 @@ sz_status_t global_score_kernel( //
116121 // Now let's handle the anti-diagonal band of the matrix, between the top and bottom triangles.
117122 for (; next_diagonal_index != longer_dim; ++next_diagonal_index) {
118123 sz_size_t const next_diagonal_length = shorter_dim;
119- for (sz_size_t offset_in_diagonal = 1 ; offset_in_diagonal + 1 < next_diagonal_length; ++offset_in_diagonal) {
120- char_type_ shorter_char = shorter[shorter_length - 1 - i]; // ! Walks in reverse order.
121- char_type_ longer_char = longer[next_diagonal_index - n + i];
124+ #pragma omp simd
125+ for (sz_size_t offset_in_diagonal = 0 ; offset_in_diagonal + 1 < next_diagonal_length; ++offset_in_diagonal) {
126+ char_type_ shorter_char = shorter_reversed[shorter_length - shorter_dim + offset_in_diagonal + 1 ];
127+ char_type_ longer_char = longer[next_diagonal_index - shorter_dim + offset_in_diagonal];
122128 sz_error_cost_t cost_of_substitution = get_substitution_cost (shorter_char, longer_char);
123- distance_type_ cost_if_substitution = previous_distances[i] + cost_of_substitution;
124- distance_type_ cost_if_deletion_or_insertion =
125- sz_min_of_two (current_distances[i], current_distances[i + 1 ]) + gap_cost_;
129+ distance_type_ cost_if_substitution = previous_distances[offset_in_diagonal] + cost_of_substitution;
130+ distance_type_ cost_if_deletion_or_insertion = //
131+ sz_min_of_two ( //
132+ current_distances[offset_in_diagonal], //
133+ current_distances[offset_in_diagonal + 1 ] //
134+ ) +
135+ gap_cost_;
136+ next_distances[i] = sz_min_of_two (cost_if_deletion_or_insertion, cost_if_substitution);
137+ }
138+ next_distances[next_diagonal_length - 1 ] = next_diagonal_index;
139+ // Perform a circular rotation of those buffers, to reuse the memory, this time, with a shift,
140+ // dropping the first element in the current array.
141+ distance_type_ *temporary = previous_distances;
142+ previous_distances = current_distances + 1 ; // ! Note how we shift forward here
143+ current_distances = next_distances;
144+ next_distances = temporary;
145+ }
146+
147+ // Now let's handle the bottom-right triangle of the matrix.
148+ for (; next_diagonal_index != longer_dim; ++next_diagonal_index) {
149+ sz_size_t const next_diagonal_length = diagonals_count - next_diagonal_index;
150+ #pragma omp simd
151+ for (sz_size_t offset_in_diagonal = 0 ; offset_in_diagonal < next_diagonal_length; ++offset_in_diagonal) {
152+ char_type_ shorter_char = shorter_reversed[shorter_length - shorter_dim + offset_in_diagonal + 1 ];
153+ char_type_ longer_char = longer[next_diagonal_index - shorter_dim + offset_in_diagonal];
154+ sz_error_cost_t cost_of_substitution = get_substitution_cost (shorter_char, longer_char);
155+ distance_type_ cost_if_substitution = previous_distances[offset_in_diagonal] + cost_of_substitution;
156+ distance_type_ cost_if_deletion_or_insertion = //
157+ sz_min_of_two ( //
158+ current_distances[offset_in_diagonal], //
159+ current_distances[offset_in_diagonal + 1 ] //
160+ ) +
161+ gap_cost_;
126162 next_distances[i] = sz_min_of_two (cost_if_deletion_or_insertion, cost_if_substitution);
127163 }
128164 // Perform a circular rotation of those buffers, to reuse the memory, this time, with a shift,
129165 // dropping the first element in the current array.
130166 distance_type_ *temporary = previous_distances;
131- previous_distances = current_distances + 1 ;
167+ previous_distances = current_distances + 1 ; // ! Note how we shift forward here
132168 current_distances = next_distances;
133169 next_distances = temporary;
134170 }
0 commit comments