Skip to content

Commit 6b5ef98

Browse files
committed
Fix: Shifting Levenshtein diagonals in OpenMP
1 parent 37863c9 commit 6b5ef98

File tree

3 files changed

+419
-203
lines changed

3 files changed

+419
-203
lines changed

include/stringzilla/similarity.hpp

Lines changed: 174 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55
*
66
* Includes core APIs:
77
*
8-
* - `sz::openmp::levenshtein_distances` & `sz::openmp::levenshtein_distances_utf8` for Levenshtein edit-distances.
9-
* - `sz::openmp::needleman_wunsch_score` for weighted Needleman-Wunsch global alignment.
8+
* - `sz::levenshtein_distances` & `sz::levenshtein_distances_utf8` for Levenshtein edit-distances.
9+
* - `sz::needleman_wunsch_score` for weighted Needleman-Wunsch global alignment.
1010
*
11-
* Those are mostly providing specialized overloads of the @b `sz::openmp::score_diagonally` template.
11+
* Those are mostly providing specialized overloads of the @b `sz::score_diagonally` template.
1212
*/
1313
#ifndef STRINGZILLA_SIMILARITY_HPP_
1414
#define STRINGZILLA_SIMILARITY_HPP_
@@ -23,10 +23,14 @@ struct uniform_substitution_cost_t {
2323
sz_error_cost_t operator()(char a, char b) const { return a == b ? 0 : 1; }
2424
};
2525

26+
struct lookup_substitution_cost_t {
27+
sz_error_cost_t const *costs;
28+
sz_error_cost_t operator()(char a, char b) const { return costs[(sz_u8_t)a * 256 + (sz_u8_t)b]; }
29+
};
30+
2631
/**
2732
* @brief Alignment Score and Edit Distance algorithm evaluating the Dynamic Programming matrix
2833
* @b three skewed (reverse) diagonals at a time on a CPU, leveraging OpenMP for parallelization.
29-
* @sa sz_levenshtein_distance, sz_levenshtein_distance_utf8, sz_needleman_wunsch_score
3034
*
3135
* @param[in] first The first string.
3236
* @param[in] first_length The length of the first string.
@@ -36,25 +40,37 @@ struct uniform_substitution_cost_t {
3640
* There are smarter algorithms for computing the Levenshtein distance, mostly based on bit-level operations.
3741
* Those, however, don't generalize well to arbitrary length inputs or non-uniform substitution costs.
3842
* This algorithm provides a more flexible baseline implementation for future SIMD and GPGPU optimizations.
43+
*
44+
* @note The API of this algorithm is a bit weird, but it's designed to minimize the reliance on the definitions
45+
* in the `stringzilla.hpp` header, making compilation times shorter for the end-user.
46+
* @sa For lower-level API, check `sz_levenshtein_distance[_utf8]` and `sz_needleman_wunsch_score`.
47+
* @sa For simplicity, use the `sz::levenshtein_distance[_utf8]` and `sz::needleman_wunsch_score`.
48+
* @sa For bulk API, use `sz::levenshtein_distances[_utf8]`.
3949
*/
4050
template < //
4151
typename char_type_ = char, //
4252
typename distance_type_ = sz_size_t, //
4353
typename get_substitution_cost_ = uniform_substitution_cost_t, //
44-
sz_error_cost_t gap_cost_ = 1 //
54+
typename allocator_type_ = std::allocator<char> //
4555
>
46-
sz_status_t score_diagonally( //
47-
char_type_ const *shorter, sz_size_t shorter_length, //
48-
char_type_ const *longer, sz_size_t longer_length, //
49-
get_substitution_cost_ &&get_substitution_cost, //
50-
sz_memory_allocator_t *alloc, //
51-
distance_type_ *result_ptr) {
52-
56+
sz_status_t score_diagonally( //
57+
char_type_ const *first, sz_size_t first_length, //
58+
char_type_ const *second, sz_size_t second_length, //
59+
distance_type_ *result_ptr,
60+
sz_error_cost_t gap_cost = 1, //
61+
get_substitution_cost_ &&get_substitution_cost = uniform_substitution_cost_t {}, //
62+
allocator_type_ &&alloc = allocator_type_ {} //
63+
) {
5364
// Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome.
54-
sz_memory_allocator_t global_alloc;
55-
if (!alloc) {
56-
sz_memory_allocator_init_default(&global_alloc);
57-
alloc = &global_alloc;
65+
using allocated_type = typename allocator_type_::value_type;
66+
static_assert(sizeof(allocated_type) == sizeof(char), "Allocator must be byte-aligned");
67+
68+
// Make sure the size relation between the strings is correct.
69+
char_type_ const *shorter = first, *longer = second;
70+
sz_size_t shorter_length = first_length, longer_length = second_length;
71+
if (shorter_length > longer_length) {
72+
std::swap(shorter, longer);
73+
std::swap(shorter_length, longer_length);
5874
}
5975

6076
// We are going to store 3 diagonals of the matrix, assuming each would fit into a single ZMM register.
@@ -75,14 +91,14 @@ sz_status_t score_diagonally( //
7591
// Let's allocate a bit more memory and reverse-export our shorter string into that buffer.
7692
sz_size_t const buffer_length =
7793
sizeof(distance_type_) * max_diagonal_length * 3 + shorter_length * sizeof(char_type_);
78-
distance_type_ *const distances = (distance_type_ *)alloc->allocate(buffer_length, alloc->handle);
94+
distance_type_ *const distances = (distance_type_ *)alloc.allocate(buffer_length);
7995
if (!distances) return sz_bad_alloc_k;
8096

8197
// The next few pointers will be swapped around.
8298
distance_type_ *previous_distances = distances;
83-
distance_type_ *current_distances = previous_distances + longer_dim;
84-
distance_type_ *next_distances = current_distances + longer_dim;
85-
char_type_ *const shorter_reversed = (char_type_ *)(next_distances + longer_dim);
99+
distance_type_ *current_distances = previous_distances + max_diagonal_length;
100+
distance_type_ *next_distances = current_distances + max_diagonal_length;
101+
char_type_ *const shorter_reversed = (char_type_ *)(next_distances + max_diagonal_length);
86102

87103
// Export the reversed string into the buffer.
88104
for (sz_size_t i = 0; i != shorter_length; ++i) shorter_reversed[i] = shorter[shorter_length - 1 - i];
@@ -107,13 +123,13 @@ sz_status_t score_diagonally( //
107123
char_type_ shorter_char = shorter_reversed[shorter_length - next_diagonal_index + offset_in_diagonal];
108124
char_type_ longer_char = longer[offset_in_diagonal - 1];
109125
sz_error_cost_t cost_of_substitution = get_substitution_cost(shorter_char, longer_char);
110-
distance_type_ cost_if_substitution = previous_distances[offset_in_diagonal] + cost_of_substitution;
126+
distance_type_ cost_if_substitution = previous_distances[offset_in_diagonal - 1] + cost_of_substitution;
111127
distance_type_ cost_if_deletion_or_insertion = //
112128
sz_min_of_two( //
113129
current_distances[offset_in_diagonal - 1], //
114130
current_distances[offset_in_diagonal] //
115131
) +
116-
gap_cost_;
132+
gap_cost;
117133
next_distances[offset_in_diagonal] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution);
118134
}
119135
// Don't forget to populate the first row and the first column of the Levenshtein matrix.
@@ -139,16 +155,19 @@ sz_status_t score_diagonally( //
139155
current_distances[offset_in_diagonal], //
140156
current_distances[offset_in_diagonal + 1] //
141157
) +
142-
gap_cost_;
158+
gap_cost;
143159
next_distances[offset_in_diagonal] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution);
144160
}
145161
next_distances[next_diagonal_length - 1] = next_diagonal_index;
146162
// Perform a circular rotation of those buffers, to reuse the memory, this time, with a shift,
147163
// dropping the first element in the current array.
148164
distance_type_ *temporary = previous_distances;
149-
previous_distances = current_distances + 1; // ! Note how we shift forward here
165+
previous_distances = current_distances;
150166
current_distances = next_distances;
151167
next_distances = temporary;
168+
// ! Drop the first entry among the current distances.
169+
sz_move((sz_ptr_t)(previous_distances), (sz_ptr_t)(previous_distances + 1),
170+
(max_diagonal_length - 1) * sizeof(distance_type_));
152171
}
153172

154173
// Now let's handle the bottom-right triangle of the matrix.
@@ -165,69 +184,168 @@ sz_status_t score_diagonally( //
165184
current_distances[offset_in_diagonal], //
166185
current_distances[offset_in_diagonal + 1] //
167186
) +
168-
gap_cost_;
187+
gap_cost;
169188
next_distances[offset_in_diagonal] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution);
170189
}
171190
// Perform a circular rotation of those buffers, to reuse the memory, this time, with a shift,
172191
// dropping the first element in the current array.
173192
distance_type_ *temporary = previous_distances;
174-
previous_distances = current_distances + 1; // ! Note how we shift forward here
193+
previous_distances = current_distances;
175194
current_distances = next_distances;
176195
next_distances = temporary;
196+
// ! Drop the first entry among the current distances.
197+
sz_move((sz_ptr_t)(previous_distances), (sz_ptr_t)(previous_distances + 1),
198+
(max_diagonal_length - 1) * sizeof(distance_type_));
177199
}
178200

179201
// Cache scalar before `free` call.
180202
distance_type_ result = current_distances[0];
181-
alloc->free(distances, buffer_length, alloc->handle);
203+
alloc.deallocate((allocated_type *)distances, buffer_length);
182204
*result_ptr = result;
183205
return sz_success_k;
184206
}
185207

186-
template <typename first_string_type_, typename second_string_type_>
187-
inline std::size_t levenshtein_distance( //
188-
first_string_type_ const &first, second_string_type_ const &second) {
208+
template < //
209+
typename first_type_, //
210+
typename second_type_, //
211+
typename allocator_type_ = std::allocator<char> //
212+
>
213+
inline sz_size_t levenshtein_distance( //
214+
first_type_ const &first, second_type_ const &second,
215+
allocator_type_ &&alloc = allocator_type_ {}) noexcept(false) {
189216

190-
std::size_t const first_length = first.length();
191-
std::size_t const second_length = second.length();
217+
sz_size_t const first_length = first.length();
218+
sz_size_t const second_length = second.length();
192219
if (first_length == 0) return second_length;
193220
if (second_length == 0) return first_length;
194221

195-
sz_size_t max_length = sz_max_of_two(first_length, second_length);
196-
if (max_length < 256u) {
197-
222+
// Estimate the maximum dimension of the DP matrix
223+
sz_size_t const max_dim = sz_max_of_two(first_length, second_length) + 1;
224+
if (max_dim < 256u) {
198225
sz_u8_t result_u8;
199-
sz_status_t status = score_diagonally<char, sz_u8_t, uniform_substitution_cost_t, 1>(
200-
first.data(), first_length, second.data(), second_length, {}, NULL, &result_u8);
201-
sz_unused(status);
226+
sz_status_t status = score_diagonally<char, sz_u8_t, uniform_substitution_cost_t, allocator_type_>(
227+
first.data(), first_length, second.data(), second_length, &result_u8, 1, uniform_substitution_cost_t {},
228+
std::forward<allocator_type_>(alloc));
229+
if (status == sz_bad_alloc_k) throw std::bad_alloc();
202230
return result_u8;
203231
}
204-
else if (max_length < 65536u) {
232+
else if (max_dim < 65536u) {
205233
sz_u16_t result_u16;
206-
sz_status_t status = score_diagonally<char, sz_u16_t, uniform_substitution_cost_t, 1>(
207-
first.data(), first_length, second.data(), second_length, {}, NULL, &result_u16);
208-
sz_unused(status);
234+
sz_status_t status = score_diagonally<char, sz_u16_t, uniform_substitution_cost_t, allocator_type_>(
235+
first.data(), first_length, second.data(), second_length, &result_u16, 1, uniform_substitution_cost_t {},
236+
std::forward<allocator_type_>(alloc));
237+
if (status == sz_bad_alloc_k) throw std::bad_alloc();
209238
return result_u16;
210239
}
211240
else {
212-
sz_size_t result_size_t;
213-
sz_status_t status = score_diagonally<char, sz_size_t, uniform_substitution_cost_t, 1>(
214-
first.data(), first_length, second.data(), second_length, {}, NULL, &result_size_t);
215-
sz_unused(status);
216-
return result_size_t;
241+
sz_size_t result_size;
242+
sz_status_t status = score_diagonally<char, sz_size_t, uniform_substitution_cost_t, allocator_type_>(
243+
first.data(), first_length, second.data(), second_length, &result_size, 1, uniform_substitution_cost_t {},
244+
std::forward<allocator_type_>(alloc));
245+
if (status == sz_bad_alloc_k) throw std::bad_alloc();
246+
return result_size;
217247
}
218248
}
219249

220-
inline void levenshtein_distance_utf8( //
221-
sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
222-
sz_size_t bound, sz_memory_allocator_t *alloc, sz_size_t *result) {
223-
sz_unused(a && b && a_length && b_length && alloc && result && bound);
250+
template < //
251+
typename first_type_, //
252+
typename second_type_, //
253+
typename allocator_type_ = std::allocator<char> //
254+
>
255+
inline sz_size_t levenshtein_distance_utf8( //
256+
first_type_ const &first, second_type_ const &second,
257+
allocator_type_ &&alloc = allocator_type_ {}) noexcept(false) {
258+
259+
// Check if the strings are entirely composed of ASCII characters,
260+
// and default to a simpler algorithm in that case.
261+
if (sz_isascii(first.data(), first.length()) && sz_isascii(second.data(), second.length()))
262+
return levenshtein_distance(first, second);
263+
264+
// Allocate some memory to expand UTF-8 strings into UTF-32.
265+
sz_size_t const max_utf32_bytes = first.size() * 4 + second.size() * 4;
266+
sz_rune_t const *const first_utf32 = (sz_rune_t *)alloc.allocate(max_utf32_bytes);
267+
sz_rune_t const *const second_utf32 = first_utf32 + first.size();
268+
269+
// Export into UTF-32 buffer.
270+
sz_rune_length_t rune_length;
271+
sz_size_t first_length_utf32 = 0, second_length_utf32 = 0;
272+
for (sz_size_t progress_utf8 = 0, progress_utf32 = 0; progress_utf8 < first.size();
273+
progress_utf8 += rune_length, ++progress_utf32, ++first_length_utf32)
274+
sz_rune_parse(first.data() + progress_utf8, first_utf32 + progress_utf32, &rune_length);
275+
for (sz_size_t progress_utf8 = 0, progress_utf32 = 0; progress_utf8 < second.size();
276+
progress_utf8 += rune_length, ++progress_utf32, ++second_length_utf32)
277+
sz_rune_parse(second.data() + progress_utf8, second_utf32 + progress_utf32, &rune_length);
278+
279+
// Infer the largest distance type we may need fr aggregated error costs.
280+
// Estimate the maximum dimension of the DP matrix
281+
sz_size_t const max_dim = sz_max_of_two(first_length_utf32, second_length_utf32) + 1;
282+
if (max_dim < 256u) {
283+
sz_u8_t result_u8;
284+
sz_status_t status = score_diagonally<sz_rune_t, sz_u8_t, uniform_substitution_cost_t, allocator_type_>(
285+
first_utf32, first_length_utf32, second_utf32, second_length_utf32, &result_u8, 1,
286+
uniform_substitution_cost_t {}, std::forward<allocator_type_>(alloc));
287+
if (status == sz_bad_alloc_k) throw std::bad_alloc();
288+
return result_u8;
289+
}
290+
else if (max_dim < 65536u) {
291+
sz_u16_t result_u16;
292+
sz_status_t status = score_diagonally<sz_rune_t, sz_u16_t, uniform_substitution_cost_t, allocator_type_>(
293+
first_utf32, first_length_utf32, second_utf32, second_length_utf32, &result_u16, 1,
294+
uniform_substitution_cost_t {}, std::forward<allocator_type_>(alloc));
295+
if (status == sz_bad_alloc_k) throw std::bad_alloc();
296+
return result_u16;
297+
}
298+
else {
299+
sz_size_t result_size;
300+
sz_status_t status = score_diagonally<sz_rune_t, sz_size_t, uniform_substitution_cost_t, allocator_type_>(
301+
first_utf32, first_length_utf32, second_utf32, second_length_utf32, &result_size, 1,
302+
uniform_substitution_cost_t {}, std::forward<allocator_type_>(alloc));
303+
if (status == sz_bad_alloc_k) throw std::bad_alloc();
304+
return result_size;
305+
}
224306
}
225307

226-
inline void needleman_wunsch_score( //
227-
sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
228-
sz_error_cost_t const *subs, sz_error_cost_t gap, //
229-
sz_memory_allocator_t *alloc, sz_ssize_t *result) {
230-
sz_unused(a && b && a_length && b_length && subs && alloc && result && gap);
308+
template < //
309+
typename first_type_, //
310+
typename second_type_, //
311+
typename allocator_type_ = std::allocator<char> //
312+
>
313+
inline sz_ssize_t needleman_wunsch_score( //
314+
first_type_ const &first, second_type_ const &second, //
315+
sz_error_cost_t const *subs, sz_error_cost_t gap, //
316+
allocator_type_ &&alloc = allocator_type_ {}) noexcept(false) {
317+
318+
sz_size_t const first_length = first.length();
319+
sz_size_t const second_length = second.length();
320+
if (first_length == 0) return second_length;
321+
if (second_length == 0) return first_length;
322+
323+
// Estimate the maximum dimension of the DP matrix
324+
sz_size_t const max_dim = sz_max_of_two(first_length, second_length) + 1;
325+
if (max_dim < 256u) {
326+
sz_u8_t result_u8;
327+
sz_status_t status = score_diagonally<char, sz_u8_t, lookup_substitution_cost_t, allocator_type_>(
328+
first.data(), first_length, second.data(), second_length, &result_u8, gap,
329+
lookup_substitution_cost_t {subs}, std::forward<allocator_type_>(alloc));
330+
if (status == sz_bad_alloc_k) throw std::bad_alloc();
331+
return result_u8;
332+
}
333+
else if (max_dim < 65536u) {
334+
sz_u16_t result_u16;
335+
sz_status_t status = score_diagonally<char, sz_u16_t, lookup_substitution_cost_t, allocator_type_>(
336+
first.data(), first_length, second.data(), second_length, &result_u16, gap,
337+
lookup_substitution_cost_t {subs}, std::forward<allocator_type_>(alloc));
338+
if (status == sz_bad_alloc_k) throw std::bad_alloc();
339+
return result_u16;
340+
}
341+
else {
342+
sz_size_t result_size;
343+
sz_status_t status = score_diagonally<char, sz_size_t, lookup_substitution_cost_t, allocator_type_>(
344+
first.data(), first_length, second.data(), second_length, &result_size, gap,
345+
lookup_substitution_cost_t {subs}, std::forward<allocator_type_>(alloc));
346+
if (status == sz_bad_alloc_k) throw std::bad_alloc();
347+
return result_size;
348+
}
231349
}
232350

233351
inline void levenshtein_distances() {}

scripts/bench_similarity.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@
5151
#include "bench.hpp"
5252
#include "test.hpp" // `levenshtein_baseline`, `unary_substitution_costs`
5353

54+
#include "stringzilla/similarity.hpp"
55+
5456
using namespace ashvardanian::stringzilla::scripts;
5557

5658
#pragma region Hamming Distance
@@ -141,6 +143,26 @@ struct alignment_score_from_sz {
141143
}
142144
};
143145

146+
/** @brief Wraps a hardware-specific Levenshtein-distance backend into something @b `bench_unary`-compatible . */
147+
template <sz_levenshtein_distance_t levenshtein_distance_>
148+
struct score_from_sz_cpp {
149+
150+
environment_t const &env;
151+
sz_size_t bound = SZ_SIZE_MAX;
152+
153+
inline call_result_t operator()(std::size_t token_index) const noexcept {
154+
return operator()(env.tokens[token_index], env.tokens[env.tokens.size() - 1 - token_index]);
155+
}
156+
157+
inline call_result_t operator()(std::string_view a, std::string_view b) const noexcept(false) {
158+
sz_size_t result_distance = sz::openmp::levenshtein_distance(a, b);
159+
do_not_optimize(result_distance);
160+
std::size_t bytes_passed = std::min(a.size(), b.size());
161+
std::size_t cells_passed = a.size() * b.size();
162+
return {bytes_passed, static_cast<check_value_t>(result_distance), cells_passed};
163+
}
164+
};
165+
144166
void bench_edits(environment_t const &env) {
145167
auto base_call = levenshtein_from_sz<sz_levenshtein_distance_serial>(env);
146168
bench_result_t base = bench_unary(env, "sz_levenshtein_distance_serial", base_call).log();

0 commit comments

Comments
 (0)