1+ /* *
2+ * @brief OpenMP-accelerated string similarity utilities.
3+ * @file similarity.hpp
4+ * @author Ash Vardanian
5+ *
6+ * Includes core APIs:
7+ *
8+ * - `sz::openmp::levenshtein_distances` & `sz::openmp::levenshtein_distances_utf8` for Levenshtein edit-distances.
9+ * - `sz::openmp::needleman_wunsch_score` for weighted Needleman-Wunsch global alignment.
10+ *
11+ * Those are mostly providing specialized overloads of the @b `sz::openmp::global_score_kernel` template.
12+ *
13+ */
14+ #ifndef STRINGZILLA_SIMILARITY_HPP_
15+ #define STRINGZILLA_SIMILARITY_HPP_
16+
17+ #include " types.h"
18+
19+ namespace ashvardanian {
20+ namespace stringzilla {
21+ namespace openmp {
22+
23+ struct uniform_substitution_cost_t {
24+ sz_error_cost_t operator ()(char a, char b) const { return a == b ? 0 : 1 ; }
25+ };
26+
27+ /* *
28+ * @brief Levenshtein edit-distance computation using skewed diagonal order evaluation on CPU.
29+ * Type-agnostic, unlike `sz_levenshtein_distance`, and annotated for OpenMP parallelization.
30+ *
31+ * @param[in] first The first string.
32+ * @param[in] first_length The length of the first string.
33+ * @param[in] second The second string.
34+ * @param[in] second_length The length of the second string.
35+ *
36+ * There are smarter algorithms for computing the Levenshtein distance, but this one is extremely flexible.
37+ */
38+ template < //
39+ typename char_type_ = char , //
40+ typename distance_type_ = sz_size_t , //
41+ typename get_substitution_cost_ = uniform_substitution_cost_t , //
42+ sz_error_cost_t gap_cost_ = 1 //
43+ >
44+ sz_status_t global_score_kernel ( //
45+ char_type_ const *shorter, sz_size_t shorter_length, //
46+ char_type_ const *longer, sz_size_t longer_length, //
47+ get_substitution_cost_ &&get_substitution_cost, //
48+ sz_memory_allocator_t *alloc, //
49+ distance_type_ *result_ptr) {
50+
51+ // We are going to store 3 diagonals of the matrix, assuming each would fit into a single ZMM register.
52+ // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`.
53+ sz_size_t const shorter_dim = shorter_length + 1 ;
54+ sz_size_t const longer_dim = longer_length + 1 ;
55+
56+ // Let's say we are dealing with 3 and 5 letter words.
57+ // The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim).
58+ // It will have:
59+ // - 4 diagonals of increasing length, at positions: 0, 1, 2, 3.
60+ // - 2 diagonals of fixed length, at positions: 4, 5.
61+ // - 3 diagonals of decreasing length, at positions: 6, 7, 8.
62+ sz_size_t const diagonals_count = shorter_dim + longer_dim - 1 ;
63+ sz_size_t const max_diagonal_length = shorter_length + 1 ;
64+
65+ // We want to avoid reverse-order iteration over the shorter string.
66+ // Let's allocate a bit more memory and reverse-export our shorter string into that buffer.
67+ sz_size_t const buffer_length =
68+ sizeof (distance_type_) * max_diagonal_length * 3 + shorter_length * sizeof (char_type_);
69+ distance_type_ *const distances = (distance_type_ *)alloc->allocate (buffer_length, alloc->handle );
70+ if (!distances) return sz_bad_alloc_k;
71+
72+ // The next few pointers will be swapped around.
73+ distance_type_ *previous_distances = distances;
74+ distance_type_ *current_distances = previous_distances + longer_dim;
75+ distance_type_ *next_distances = current_distances + longer_dim;
76+ char_type_ const *const shorter_reversed = (char_type_ const *)(next_distances + longer_dim);
77+
78+ // Export the reversed string into the buffer.
79+ for (sz_size_t i = 0 ; i != shorter_length; ++i) shorter_reversed[i] = shorter[shorter_length - 1 - i];
80+
81+ // Initialize the first two diagonals:
82+ previous_distances[0 ] = 0 ;
83+ current_distances[0 ] = current_distances[1 ] = 1 ;
84+
85+ // We skip diagonals 0 and 1, as they are trivial.
86+ // We will start with diagonal 2, which has length 3, with the first and last elements being preset,
87+ // so we are effectively computing just one value, as will be marked by a single set bit in
88+ // the `next_diagonal_mask` on the very first iteration.
89+ sz_size_t next_diagonal_index = 2 ;
90+
91+ // Progress through the upper triangle of the Levenshtein matrix.
92+ for (; next_diagonal_index != shorter_dim; ++next_diagonal_index) {
93+ sz_size_t const next_diagonal_length = next_diagonal_index + 1 ;
94+ #pragma omp simd
95+ for (sz_size_t offset_in_diagonal = 1 ; offset_in_diagonal + 1 < next_diagonal_length; ++offset_in_diagonal) {
96+ // ? Note that here we are still traversing both buffers in the same order,
97+ // ? because the shorter string has been reversed into `shorter_reversed`.
98+ char_type_ shorter_char = shorter_reversed[shorter_length - next_diagonal_index + offset_in_diagonal - 1 ];
99+ char_type_ longer_char = longer[offset_in_diagonal];
100+ sz_error_cost_t cost_of_substitution = get_substitution_cost (shorter_char, longer_char);
101+ distance_type_ cost_if_substitution = previous_distances[offset_in_diagonal] + cost_of_substitution;
102+ distance_type_ cost_if_deletion_or_insertion =
103+ gap_cost_ +
104+ sz_min_of_two (current_distances[offset_in_diagonal - 1 ], current_distances[offset_in_diagonal]);
105+ next_distances[offset_in_diagonal] = sz_min_of_two (cost_if_deletion_or_insertion, cost_if_substitution);
106+ }
107+ // Don't forget to populate the first row and the first column of the Levenshtein matrix.
108+ next_distances[0 ] = next_distances[next_diagonal_length - 1 ] = next_diagonal_index;
109+ // Perform a circular rotation of those buffers, to reuse the memory.
110+ distance_type_ *temporary = previous_distances;
111+ previous_distances = current_distances;
112+ current_distances = next_distances;
113+ next_distances = temporary;
114+ }
115+
116+ // Now let's handle the anti-diagonal band of the matrix, between the top and bottom triangles.
117+ for (; next_diagonal_index != longer_dim; ++next_diagonal_index) {
118+ sz_size_t const next_diagonal_length = shorter_dim;
119+ for (sz_size_t offset_in_diagonal = 1 ; offset_in_diagonal + 1 < next_diagonal_length; ++offset_in_diagonal) {
120+ char_type_ shorter_char = shorter[shorter_length - 1 - i]; // ! Walks in reverse order.
121+ char_type_ longer_char = longer[next_diagonal_index - n + i];
122+ sz_error_cost_t cost_of_substitution = get_substitution_cost (shorter_char, longer_char);
123+ distance_type_ cost_if_substitution = previous_distances[i] + cost_of_substitution;
124+ distance_type_ cost_if_deletion_or_insertion =
125+ sz_min_of_two (current_distances[i], current_distances[i + 1 ]) + gap_cost_;
126+ next_distances[i] = sz_min_of_two (cost_if_deletion_or_insertion, cost_if_substitution);
127+ }
128+ // Perform a circular rotation of those buffers, to reuse the memory, this time, with a shift,
129+ // dropping the first element in the current array.
130+ distance_type_ *temporary = previous_distances;
131+ previous_distances = current_distances + 1 ;
132+ current_distances = next_distances;
133+ next_distances = temporary;
134+ }
135+
136+ // Cache scalar before `free` call.
137+ distance_type_ result = current_distances[0 ];
138+ alloc->free (distances, buffer_length, alloc->handle );
139+ *result_ptr = result;
140+ return sz_success_k;
141+ }
142+
143+ } // namespace openmp
144+ } // namespace stringzilla
145+ } // namespace ashvardanian
146+
147+ #endif // STRINGZILLA_SIMILARITY_HPP_
0 commit comments