55 *
66 * Includes core APIs:
77 *
8- * - `sz::openmp:: levenshtein_distances` & `sz::openmp ::levenshtein_distances_utf8` for Levenshtein edit-distances.
9- * - `sz::openmp:: needleman_wunsch_score` for weighted Needleman-Wunsch global alignment.
8+ * - `sz::levenshtein_distances` & `sz::levenshtein_distances_utf8` for Levenshtein edit-distances.
9+ * - `sz::needleman_wunsch_score` for weighted Needleman-Wunsch global alignment.
1010 *
11- * Those are mostly providing specialized overloads of the @b `sz::openmp:: score_diagonally` template.
11+ * Those are mostly providing specialized overloads of the @b `sz::score_diagonally` template.
1212 */
1313#ifndef STRINGZILLA_SIMILARITY_HPP_
1414#define STRINGZILLA_SIMILARITY_HPP_
@@ -23,10 +23,14 @@ struct uniform_substitution_cost_t {
2323 sz_error_cost_t operator ()(char a, char b) const { return a == b ? 0 : 1 ; }
2424};
2525
26+ struct lookup_substitution_cost_t {
27+ sz_error_cost_t const *costs;
28+ sz_error_cost_t operator ()(char a, char b) const { return costs[(sz_u8_t )a * 256 + (sz_u8_t )b]; }
29+ };
30+
2631/* *
2732 * @brief Alignment Score and Edit Distance algorithm evaluating the Dynamic Programming matrix
2833 * @b three skewed (reverse) diagonals at a time on a CPU, leveraging OpenMP for parallelization.
29- * @sa sz_levenshtein_distance, sz_levenshtein_distance_utf8, sz_needleman_wunsch_score
3034 *
3135 * @param[in] first The first string.
3236 * @param[in] first_length The length of the first string.
@@ -36,25 +40,37 @@ struct uniform_substitution_cost_t {
3640 * There are smarter algorithms for computing the Levenshtein distance, mostly based on bit-level operations.
3741 * Those, however, don't generalize well to arbitrary length inputs or non-uniform substitution costs.
3842 * This algorithm provides a more flexible baseline implementation for future SIMD and GPGPU optimizations.
43+ *
44+ * @note The API of this algorithm is a bit weird, but it's designed to minimize the reliance on the definitions
45+ * in the `stringzilla.hpp` header, making compilation times shorter for the end-user.
46+ * @sa For lower-level API, check `sz_levenshtein_distance[_utf8]` and `sz_needleman_wunsch_score`.
47+ * @sa For simplicity, use the `sz::levenshtein_distance[_utf8]` and `sz::needleman_wunsch_score`.
48+ * @sa For bulk API, use `sz::levenshtein_distances[_utf8]`.
3949 */
4050template < //
4151 typename char_type_ = char , //
4252 typename distance_type_ = sz_size_t , //
4353 typename get_substitution_cost_ = uniform_substitution_cost_t , //
44- sz_error_cost_t gap_cost_ = 1 //
54+ typename allocator_type_ = std::allocator< char > //
4555 >
46- sz_status_t score_diagonally ( //
47- char_type_ const *shorter, sz_size_t shorter_length, //
48- char_type_ const *longer, sz_size_t longer_length, //
49- get_substitution_cost_ &&get_substitution_cost, //
50- sz_memory_allocator_t *alloc, //
51- distance_type_ *result_ptr) {
52-
56+ sz_status_t score_diagonally ( //
57+ char_type_ const *first, sz_size_t first_length, //
58+ char_type_ const *second, sz_size_t second_length, //
59+ distance_type_ *result_ptr,
60+ sz_error_cost_t gap_cost = 1 , //
61+ get_substitution_cost_ &&get_substitution_cost = uniform_substitution_cost_t {}, //
62+ allocator_type_ &&alloc = allocator_type_ {} //
63+ ) {
5364 // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome.
54- sz_memory_allocator_t global_alloc;
55- if (!alloc) {
56- sz_memory_allocator_init_default (&global_alloc);
57- alloc = &global_alloc;
65+ using allocated_type = typename allocator_type_::value_type;
66+ static_assert (sizeof (allocated_type) == sizeof (char ), " Allocator must be byte-aligned" );
67+
68+ // Make sure the size relation between the strings is correct.
69+ char_type_ const *shorter = first, *longer = second;
70+ sz_size_t shorter_length = first_length, longer_length = second_length;
71+ if (shorter_length > longer_length) {
72+ std::swap (shorter, longer);
73+ std::swap (shorter_length, longer_length);
5874 }
5975
6076 // We are going to store 3 diagonals of the matrix, assuming each would fit into a single ZMM register.
@@ -75,14 +91,14 @@ sz_status_t score_diagonally( //
7591 // Let's allocate a bit more memory and reverse-export our shorter string into that buffer.
7692 sz_size_t const buffer_length =
7793 sizeof (distance_type_) * max_diagonal_length * 3 + shorter_length * sizeof (char_type_);
78- distance_type_ *const distances = (distance_type_ *)alloc-> allocate (buffer_length, alloc-> handle );
94+ distance_type_ *const distances = (distance_type_ *)alloc. allocate (buffer_length);
7995 if (!distances) return sz_bad_alloc_k;
8096
8197 // The next few pointers will be swapped around.
8298 distance_type_ *previous_distances = distances;
83- distance_type_ *current_distances = previous_distances + longer_dim ;
84- distance_type_ *next_distances = current_distances + longer_dim ;
85- char_type_ *const shorter_reversed = (char_type_ *)(next_distances + longer_dim );
99+ distance_type_ *current_distances = previous_distances + max_diagonal_length ;
100+ distance_type_ *next_distances = current_distances + max_diagonal_length ;
101+ char_type_ *const shorter_reversed = (char_type_ *)(next_distances + max_diagonal_length );
86102
87103 // Export the reversed string into the buffer.
88104 for (sz_size_t i = 0 ; i != shorter_length; ++i) shorter_reversed[i] = shorter[shorter_length - 1 - i];
@@ -107,13 +123,13 @@ sz_status_t score_diagonally( //
107123 char_type_ shorter_char = shorter_reversed[shorter_length - next_diagonal_index + offset_in_diagonal];
108124 char_type_ longer_char = longer[offset_in_diagonal - 1 ];
109125 sz_error_cost_t cost_of_substitution = get_substitution_cost (shorter_char, longer_char);
110- distance_type_ cost_if_substitution = previous_distances[offset_in_diagonal] + cost_of_substitution;
126+ distance_type_ cost_if_substitution = previous_distances[offset_in_diagonal - 1 ] + cost_of_substitution;
111127 distance_type_ cost_if_deletion_or_insertion = //
112128 sz_min_of_two ( //
113129 current_distances[offset_in_diagonal - 1 ], //
114130 current_distances[offset_in_diagonal] //
115131 ) +
116- gap_cost_ ;
132+ gap_cost ;
117133 next_distances[offset_in_diagonal] = sz_min_of_two (cost_if_deletion_or_insertion, cost_if_substitution);
118134 }
119135 // Don't forget to populate the first row and the first column of the Levenshtein matrix.
@@ -139,16 +155,19 @@ sz_status_t score_diagonally( //
139155 current_distances[offset_in_diagonal], //
140156 current_distances[offset_in_diagonal + 1 ] //
141157 ) +
142- gap_cost_ ;
158+ gap_cost ;
143159 next_distances[offset_in_diagonal] = sz_min_of_two (cost_if_deletion_or_insertion, cost_if_substitution);
144160 }
145161 next_distances[next_diagonal_length - 1 ] = next_diagonal_index;
146162 // Perform a circular rotation of those buffers, to reuse the memory, this time, with a shift,
147163 // dropping the first element in the current array.
148164 distance_type_ *temporary = previous_distances;
149- previous_distances = current_distances + 1 ; // ! Note how we shift forward here
165+ previous_distances = current_distances;
150166 current_distances = next_distances;
151167 next_distances = temporary;
168+ // ! Drop the first entry among the current distances.
169+ sz_move ((sz_ptr_t )(previous_distances), (sz_ptr_t )(previous_distances + 1 ),
170+ (max_diagonal_length - 1 ) * sizeof (distance_type_));
152171 }
153172
154173 // Now let's handle the bottom-right triangle of the matrix.
@@ -165,69 +184,168 @@ sz_status_t score_diagonally( //
165184 current_distances[offset_in_diagonal], //
166185 current_distances[offset_in_diagonal + 1 ] //
167186 ) +
168- gap_cost_ ;
187+ gap_cost ;
169188 next_distances[offset_in_diagonal] = sz_min_of_two (cost_if_deletion_or_insertion, cost_if_substitution);
170189 }
171190 // Perform a circular rotation of those buffers, to reuse the memory, this time, with a shift,
172191 // dropping the first element in the current array.
173192 distance_type_ *temporary = previous_distances;
174- previous_distances = current_distances + 1 ; // ! Note how we shift forward here
193+ previous_distances = current_distances;
175194 current_distances = next_distances;
176195 next_distances = temporary;
196+ // ! Drop the first entry among the current distances.
197+ sz_move ((sz_ptr_t )(previous_distances), (sz_ptr_t )(previous_distances + 1 ),
198+ (max_diagonal_length - 1 ) * sizeof (distance_type_));
177199 }
178200
179201 // Cache scalar before `free` call.
180202 distance_type_ result = current_distances[0 ];
181- alloc-> free ( distances, buffer_length, alloc-> handle );
203+ alloc. deallocate ((allocated_type *) distances, buffer_length);
182204 *result_ptr = result;
183205 return sz_success_k;
184206}
185207
186- template <typename first_string_type_, typename second_string_type_>
187- inline std::size_t levenshtein_distance ( //
188- first_string_type_ const &first, second_string_type_ const &second) {
208+ template < //
209+ typename first_type_, //
210+ typename second_type_, //
211+ typename allocator_type_ = std::allocator<char > //
212+ >
213+ inline sz_size_t levenshtein_distance ( //
214+ first_type_ const &first, second_type_ const &second,
215+ allocator_type_ &&alloc = allocator_type_ {}) noexcept (false ) {
189216
190- std:: size_t const first_length = first.length ();
191- std:: size_t const second_length = second.length ();
217+ sz_size_t const first_length = first.length ();
218+ sz_size_t const second_length = second.length ();
192219 if (first_length == 0 ) return second_length;
193220 if (second_length == 0 ) return first_length;
194221
195- sz_size_t max_length = sz_max_of_two (first_length, second_length);
196- if (max_length < 256u ) {
197-
222+ // Estimate the maximum dimension of the DP matrix
223+ sz_size_t const max_dim = sz_max_of_two (first_length, second_length) + 1 ;
224+ if (max_dim < 256u ) {
198225 sz_u8_t result_u8;
199- sz_status_t status = score_diagonally<char , sz_u8_t , uniform_substitution_cost_t , 1 >(
200- first.data (), first_length, second.data (), second_length, {}, NULL , &result_u8);
201- sz_unused (status);
226+ sz_status_t status = score_diagonally<char , sz_u8_t , uniform_substitution_cost_t , allocator_type_>(
227+ first.data (), first_length, second.data (), second_length, &result_u8, 1 , uniform_substitution_cost_t {},
228+ std::forward<allocator_type_>(alloc));
229+ if (status == sz_bad_alloc_k) throw std::bad_alloc ();
202230 return result_u8;
203231 }
204- else if (max_length < 65536u ) {
232+ else if (max_dim < 65536u ) {
205233 sz_u16_t result_u16;
206- sz_status_t status = score_diagonally<char , sz_u16_t , uniform_substitution_cost_t , 1 >(
207- first.data (), first_length, second.data (), second_length, {}, NULL , &result_u16);
208- sz_unused (status);
234+ sz_status_t status = score_diagonally<char , sz_u16_t , uniform_substitution_cost_t , allocator_type_>(
235+ first.data (), first_length, second.data (), second_length, &result_u16, 1 , uniform_substitution_cost_t {},
236+ std::forward<allocator_type_>(alloc));
237+ if (status == sz_bad_alloc_k) throw std::bad_alloc ();
209238 return result_u16;
210239 }
211240 else {
212- sz_size_t result_size_t ;
213- sz_status_t status = score_diagonally<char , sz_size_t , uniform_substitution_cost_t , 1 >(
214- first.data (), first_length, second.data (), second_length, {}, NULL , &result_size_t );
215- sz_unused (status);
216- return result_size_t ;
241+ sz_size_t result_size;
242+ sz_status_t status = score_diagonally<char , sz_size_t , uniform_substitution_cost_t , allocator_type_>(
243+ first.data (), first_length, second.data (), second_length, &result_size, 1 , uniform_substitution_cost_t {},
244+ std::forward<allocator_type_>(alloc));
245+ if (status == sz_bad_alloc_k) throw std::bad_alloc ();
246+ return result_size;
217247 }
218248}
219249
220- inline void levenshtein_distance_utf8 ( //
221- sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
222- sz_size_t bound, sz_memory_allocator_t *alloc, sz_size_t *result) {
223- sz_unused (a && b && a_length && b_length && alloc && result && bound);
250+ template < //
251+ typename first_type_, //
252+ typename second_type_, //
253+ typename allocator_type_ = std::allocator<char > //
254+ >
255+ inline sz_size_t levenshtein_distance_utf8 ( //
256+ first_type_ const &first, second_type_ const &second,
257+ allocator_type_ &&alloc = allocator_type_ {}) noexcept (false ) {
258+
259+ // Check if the strings are entirely composed of ASCII characters,
260+ // and default to a simpler algorithm in that case.
261+ if (sz_isascii (first.data (), first.length ()) && sz_isascii (second.data (), second.length ()))
262+ return levenshtein_distance (first, second);
263+
264+ // Allocate some memory to expand UTF-8 strings into UTF-32.
265+ sz_size_t const max_utf32_bytes = first.size () * 4 + second.size () * 4 ;
266+ sz_rune_t const *const first_utf32 = (sz_rune_t *)alloc.allocate (max_utf32_bytes);
267+ sz_rune_t const *const second_utf32 = first_utf32 + first.size ();
268+
269+ // Export into UTF-32 buffer.
270+ sz_rune_length_t rune_length;
271+ sz_size_t first_length_utf32 = 0 , second_length_utf32 = 0 ;
272+ for (sz_size_t progress_utf8 = 0 , progress_utf32 = 0 ; progress_utf8 < first.size ();
273+ progress_utf8 += rune_length, ++progress_utf32, ++first_length_utf32)
274+ sz_rune_parse (first.data () + progress_utf8, first_utf32 + progress_utf32, &rune_length);
275+ for (sz_size_t progress_utf8 = 0 , progress_utf32 = 0 ; progress_utf8 < second.size ();
276+ progress_utf8 += rune_length, ++progress_utf32, ++second_length_utf32)
277+ sz_rune_parse (second.data () + progress_utf8, second_utf32 + progress_utf32, &rune_length);
278+
279+ // Infer the largest distance type we may need fr aggregated error costs.
280+ // Estimate the maximum dimension of the DP matrix
281+ sz_size_t const max_dim = sz_max_of_two (first_length_utf32, second_length_utf32) + 1 ;
282+ if (max_dim < 256u ) {
283+ sz_u8_t result_u8;
284+ sz_status_t status = score_diagonally<sz_rune_t , sz_u8_t , uniform_substitution_cost_t , allocator_type_>(
285+ first_utf32, first_length_utf32, second_utf32, second_length_utf32, &result_u8, 1 ,
286+ uniform_substitution_cost_t {}, std::forward<allocator_type_>(alloc));
287+ if (status == sz_bad_alloc_k) throw std::bad_alloc ();
288+ return result_u8;
289+ }
290+ else if (max_dim < 65536u ) {
291+ sz_u16_t result_u16;
292+ sz_status_t status = score_diagonally<sz_rune_t , sz_u16_t , uniform_substitution_cost_t , allocator_type_>(
293+ first_utf32, first_length_utf32, second_utf32, second_length_utf32, &result_u16, 1 ,
294+ uniform_substitution_cost_t {}, std::forward<allocator_type_>(alloc));
295+ if (status == sz_bad_alloc_k) throw std::bad_alloc ();
296+ return result_u16;
297+ }
298+ else {
299+ sz_size_t result_size;
300+ sz_status_t status = score_diagonally<sz_rune_t , sz_size_t , uniform_substitution_cost_t , allocator_type_>(
301+ first_utf32, first_length_utf32, second_utf32, second_length_utf32, &result_size, 1 ,
302+ uniform_substitution_cost_t {}, std::forward<allocator_type_>(alloc));
303+ if (status == sz_bad_alloc_k) throw std::bad_alloc ();
304+ return result_size;
305+ }
224306}
225307
226- inline void needleman_wunsch_score ( //
227- sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
228- sz_error_cost_t const *subs, sz_error_cost_t gap, //
229- sz_memory_allocator_t *alloc, sz_ssize_t *result) {
230- sz_unused (a && b && a_length && b_length && subs && alloc && result && gap);
308+ template < //
309+ typename first_type_, //
310+ typename second_type_, //
311+ typename allocator_type_ = std::allocator<char > //
312+ >
313+ inline sz_ssize_t needleman_wunsch_score ( //
314+ first_type_ const &first, second_type_ const &second, //
315+ sz_error_cost_t const *subs, sz_error_cost_t gap, //
316+ allocator_type_ &&alloc = allocator_type_ {}) noexcept (false ) {
317+
318+ sz_size_t const first_length = first.length ();
319+ sz_size_t const second_length = second.length ();
320+ if (first_length == 0 ) return second_length;
321+ if (second_length == 0 ) return first_length;
322+
323+ // Estimate the maximum dimension of the DP matrix
324+ sz_size_t const max_dim = sz_max_of_two (first_length, second_length) + 1 ;
325+ if (max_dim < 256u ) {
326+ sz_u8_t result_u8;
327+ sz_status_t status = score_diagonally<char , sz_u8_t , lookup_substitution_cost_t , allocator_type_>(
328+ first.data (), first_length, second.data (), second_length, &result_u8, gap,
329+ lookup_substitution_cost_t {subs}, std::forward<allocator_type_>(alloc));
330+ if (status == sz_bad_alloc_k) throw std::bad_alloc ();
331+ return result_u8;
332+ }
333+ else if (max_dim < 65536u ) {
334+ sz_u16_t result_u16;
335+ sz_status_t status = score_diagonally<char , sz_u16_t , lookup_substitution_cost_t , allocator_type_>(
336+ first.data (), first_length, second.data (), second_length, &result_u16, gap,
337+ lookup_substitution_cost_t {subs}, std::forward<allocator_type_>(alloc));
338+ if (status == sz_bad_alloc_k) throw std::bad_alloc ();
339+ return result_u16;
340+ }
341+ else {
342+ sz_size_t result_size;
343+ sz_status_t status = score_diagonally<char , sz_size_t , lookup_substitution_cost_t , allocator_type_>(
344+ first.data (), first_length, second.data (), second_length, &result_size, gap,
345+ lookup_substitution_cost_t {subs}, std::forward<allocator_type_>(alloc));
346+ if (status == sz_bad_alloc_k) throw std::bad_alloc ();
347+ return result_size;
348+ }
231349}
232350
233351inline void levenshtein_distances () {}
0 commit comments