Skip to content

Commit d44beb4

Browse files
committed
Break: sz::edit_distance -> Levenshtein
1 parent 4b3847d commit d44beb4

File tree

3 files changed

+47
-43
lines changed

3 files changed

+47
-43
lines changed

include/stringzilla/stringzilla.hpp

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2030,7 +2030,7 @@ class basic_string_slice {
20302030
* * `replace`, `insert`, `erase`, `append`, `push_back`, `pop_back`, `resize`, `shrink_to_fit`... from STL,
20312031
* * `try_` exception-free "try" operations that return non-zero values on success,
20322032
* * `replace_all` and `erase_all` similar to Boost,
2033-
* * `edit_distance` - Levenshtein distance computation reusing the allocator,
2033+
* * `levenshtein_distance` - Levenshtein distance computation reusing the allocator,
20342034
* * `translate` - character mapping,
20352035
* * `randomize`, `random` - for fast random string generation.
20362036
*
@@ -3360,11 +3360,12 @@ class basic_string {
33603360

33613361
concatenation<string_view, string_view> operator|(string_view other) const noexcept { return {view(), other}; }
33623362

3363-
size_type edit_distance(string_view other, size_type bound = 0) const noexcept {
3364-
size_type result;
3365-
_with_alloc([&](sz_alloc_type &alloc) {
3363+
size_type levenshtein_distance(string_view other, size_type bound = std::numeric_limits<size_type>::max()) const
3364+
noexcept(false) {
3365+
size_type result = std::numeric_limits<size_type>::max();
3366+
raise(_with_alloc([&](sz_alloc_type &alloc) {
33663367
return sz_levenshtein_distance(data(), size(), other.data(), other.size(), bound, &alloc, &result);
3367-
});
3368+
}));
33683369
return result;
33693370
}
33703371

@@ -3839,7 +3840,7 @@ typename concatenation_result<first_type, second_type, following_types...>::type
38393840
template <typename char_type_>
38403841
std::size_t hamming_distance( //
38413842
basic_string_slice<char_type_> const &a, basic_string_slice<char_type_> const &b, //
3842-
std::size_t bound = 0) noexcept {
3843+
std::size_t bound = SZ_SIZE_MAX) noexcept {
38433844
std::size_t result;
38443845
sz_hamming_distance(a.data(), a.size(), b.data(), b.size(), bound, &result);
38453846
return result;
@@ -3852,7 +3853,7 @@ std::size_t hamming_distance(
38523853
template <typename char_type_, typename allocator_type_ = std::allocator<typename std::remove_const<char_type_>::type>>
38533854
std::size_t hamming_distance( //
38543855
basic_string<char_type_, allocator_type_> const &a, basic_string<char_type_, allocator_type_> const &b, //
3855-
std::size_t bound = 0) noexcept {
3856+
std::size_t bound = SZ_SIZE_MAX) noexcept {
38563857
return ashvardanian::stringzilla::hamming_distance(a.view(), b.view(), bound);
38573858
}
38583859

@@ -3862,7 +3863,8 @@ std::size_t hamming_distance(
38623863
*/
38633864
template <typename char_type_>
38643865
std::size_t hamming_distance_utf8( //
3865-
basic_string_slice<char_type_> const &a, basic_string_slice<char_type_> const &b, std::size_t bound = 0) noexcept {
3866+
basic_string_slice<char_type_> const &a, basic_string_slice<char_type_> const &b,
3867+
std::size_t bound = SZ_SIZE_MAX) noexcept {
38663868
std::size_t result;
38673869
sz_hamming_distance_utf8(a.data(), a.size(), b.data(), b.size(), bound, &result);
38683870
return result;
@@ -3875,7 +3877,7 @@ std::size_t hamming_distance_utf8( //
38753877
template <typename char_type_, typename allocator_type_ = std::allocator<typename std::remove_const<char_type_>::type>>
38763878
std::size_t hamming_distance_utf8( //
38773879
basic_string<char_type_, allocator_type_> const &a, basic_string<char_type_, allocator_type_> const &b,
3878-
std::size_t bound = 0) noexcept {
3880+
std::size_t bound = SZ_SIZE_MAX) noexcept {
38793881
return ashvardanian::stringzilla::hamming_distance_utf8(a.view(), b.view(), bound);
38803882
}
38813883

@@ -3884,10 +3886,10 @@ std::size_t hamming_distance_utf8( //
38843886
* @sa sz_levenshtein_distance
38853887
*/
38863888
template <typename char_type_, typename allocator_type_ = std::allocator<typename std::remove_const<char_type_>::type>>
3887-
std::size_t edit_distance( //
3889+
std::size_t levenshtein_distance( //
38883890
basic_string_slice<char_type_> const &a, basic_string_slice<char_type_> const &b, std::size_t bound = SZ_SIZE_MAX,
38893891
allocator_type_ &&allocator = allocator_type_ {}) noexcept(false) {
3890-
std::size_t result;
3892+
std::size_t result = SZ_SIZE_MAX;
38913893
raise(_with_alloc(allocator, [&](sz_memory_allocator_t &alloc) {
38923894
return sz_levenshtein_distance(a.data(), a.size(), b.data(), b.size(), bound, &alloc, &result);
38933895
}));
@@ -3899,21 +3901,21 @@ std::size_t edit_distance( //
38993901
* @sa sz_levenshtein_distance
39003902
*/
39013903
template <typename char_type_, typename allocator_type_ = std::allocator<char_type_>>
3902-
std::size_t edit_distance( //
3904+
std::size_t levenshtein_distance( //
39033905
basic_string<char_type_, allocator_type_> const &a, basic_string<char_type_, allocator_type_> const &b, //
39043906
std::size_t bound = SZ_SIZE_MAX) noexcept(false) {
3905-
return ashvardanian::stringzilla::edit_distance(a.view(), b.view(), bound, a.get_allocator());
3907+
return ashvardanian::stringzilla::levenshtein_distance(a.view(), b.view(), bound, a.get_allocator());
39063908
}
39073909

39083910
/**
39093911
* @brief Calculates the Levenshtein edit distance in @b unicode codepoints between two strings.
39103912
* @sa sz_levenshtein_distance_utf8
39113913
*/
39123914
template <typename char_type_, typename allocator_type_ = std::allocator<typename std::remove_const<char_type_>::type>>
3913-
std::size_t edit_distance_utf8( //
3915+
std::size_t levenshtein_distance_utf8( //
39143916
basic_string_slice<char_type_> const &a, basic_string_slice<char_type_> const &b, //
39153917
std::size_t bound = SZ_SIZE_MAX, allocator_type_ &&allocator = allocator_type_ {}) noexcept(false) {
3916-
std::size_t result;
3918+
std::size_t result = SZ_SIZE_MAX;
39173919
raise(_with_alloc(allocator, [&](sz_memory_allocator_t &alloc) {
39183920
return sz_levenshtein_distance_utf8(a.data(), a.size(), b.data(), b.size(), bound, &alloc, &result);
39193921
}));
@@ -3925,10 +3927,10 @@ std::size_t edit_distance_utf8(
39253927
* @sa sz_levenshtein_distance_utf8
39263928
*/
39273929
template <typename char_type_, typename allocator_type_ = std::allocator<char_type_>>
3928-
std::size_t edit_distance_utf8( //
3930+
std::size_t levenshtein_distance_utf8( //
39293931
basic_string<char_type_, allocator_type_> const &a, basic_string<char_type_, allocator_type_> const &b, //
39303932
std::size_t bound = SZ_SIZE_MAX) noexcept(false) {
3931-
return ashvardanian::stringzilla::edit_distance_utf8(a.view(), b.view(), bound, a.get_allocator());
3933+
return ashvardanian::stringzilla::levenshtein_distance_utf8(a.view(), b.view(), bound, a.get_allocator());
39323934
}
39333935

39343936
/**
@@ -3945,7 +3947,7 @@ std::ptrdiff_t alignment_score(
39453947
static_assert(std::is_signed<sz_error_cost_t>() == std::is_signed<std::int8_t>(),
39463948
"sz_error_cost_t must be signed.");
39473949

3948-
std::ptrdiff_t result;
3950+
std::ptrdiff_t result = SZ_SSIZE_MIN;
39493951
raise(_with_alloc(allocator, [&](sz_memory_allocator_t &alloc) {
39503952
return sz_needleman_wunsch_score(a.data(), a.size(), b.data(), b.size(), &subs[0][0], gap, &alloc, &result);
39513953
}));

include/stringzilla/types.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -811,6 +811,7 @@ SZ_PUBLIC void sz_sequence_from_null_terminated_strings(sz_cptr_t *start, sz_siz
811811
#define SZ_CACHE_LINE_WIDTH (64) // bytes
812812
#define SZ_SIZE_MAX ((sz_size_t)(-1))
813813
#define SZ_SSIZE_MAX ((sz_ssize_t)(SZ_SIZE_MAX >> 1))
814+
#define SZ_SSIZE_MIN ((sz_ssize_t)(-SZ_SSIZE_MAX - 1))
814815

815816
SZ_INTERNAL sz_size_t _sz_size_max(void) { return SZ_SIZE_MAX; }
816817
SZ_INTERNAL sz_ssize_t _sz_ssize_max(void) { return SZ_SSIZE_MAX; }

scripts/test.cpp

Lines changed: 26 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -975,27 +975,28 @@ static void test_non_stl_extensions_for_reads() {
975975
assert(sz::hamming_distance_utf8(str("abcdefgh"), str("_bcdefg_")) == 2); // replace ASCII prefix and suffix
976976
assert(sz::hamming_distance_utf8(str("αβγδ"), str("αγγδ")) == 1); // replace Beta UTF8 codepoint
977977

978-
assert(sz::edit_distance(str("hello"), str("hello")) == 0);
979-
assert(sz::edit_distance(str("hello"), str("hell")) == 1);
980-
assert(sz::edit_distance(str(""), str("")) == 0);
981-
assert(sz::edit_distance(str(""), str("abc")) == 3);
982-
assert(sz::edit_distance(str("abc"), str("")) == 3);
983-
assert(sz::edit_distance(str("abc"), str("ac")) == 1); // one deletion
984-
assert(sz::edit_distance(str("abc"), str("a_bc")) == 1); // one insertion
985-
assert(sz::edit_distance(str("abc"), str("adc")) == 1); // one substitution
986-
assert(sz::edit_distance(str("ggbuzgjux{}l"), str("gbuzgjux{}l")) == 1); // one insertion (prepended)
987-
assert(sz::edit_distance(str("abcdefgABCDEFG"), str("ABCDEFGabcdefg")) == 14);
988-
989-
assert(sz::edit_distance_utf8(str("hello"), str("hell")) == 1); // no unicode symbols, just ASCII
990-
assert(sz::edit_distance_utf8(str("𠜎 𠜱 𠝹 𠱓"), str("𠜎𠜱𠝹𠱓")) == 3); // add 3 whitespaces in Chinese
991-
assert(sz::edit_distance_utf8(str("💖"), str("💗")) == 1);
992-
993-
assert(sz::edit_distance_utf8(str("αβγδ"), str("αγδ")) == 1); // insert Beta
994-
assert(sz::edit_distance_utf8(str("école"), str("école")) == 2); // etter "é" as a single character vs "e" + "´"
995-
assert(sz::edit_distance_utf8(str("façade"), str("facade")) == 1); // "ç" with cedilla vs. plain
996-
assert(sz::edit_distance_utf8(str("Schön"), str("Scho\u0308n")) == 2); // "ö" represented as "o" + "¨"
997-
assert(sz::edit_distance_utf8(str("München"), str("Muenchen")) == 2); // German with umlaut vs. transcription
998-
assert(sz::edit_distance_utf8(str("こんにちは世界"), str("こんばんは世界")) == 2);
978+
assert(sz::levenshtein_distance(str("hello"), str("hello")) == 0);
979+
assert(sz::levenshtein_distance(str("hello"), str("hell")) == 1);
980+
assert(sz::levenshtein_distance(str(""), str("")) == 0);
981+
assert(sz::levenshtein_distance(str(""), str("abc")) == 3);
982+
assert(sz::levenshtein_distance(str("abc"), str("")) == 3);
983+
assert(sz::levenshtein_distance(str("abc"), str("ac")) == 1); // one deletion
984+
assert(sz::levenshtein_distance(str("abc"), str("a_bc")) == 1); // one insertion
985+
assert(sz::levenshtein_distance(str("abc"), str("adc")) == 1); // one substitution
986+
assert(sz::levenshtein_distance(str("ggbuzgjux{}l"), str("gbuzgjux{}l")) == 1); // one insertion (prepended)
987+
assert(sz::levenshtein_distance(str("abcdefgABCDEFG"), str("ABCDEFGabcdefg")) == 14);
988+
989+
assert(sz::levenshtein_distance_utf8(str("hello"), str("hell")) == 1); // no unicode symbols, just ASCII
990+
assert(sz::levenshtein_distance_utf8(str("𠜎 𠜱 𠝹 𠱓"), str("𠜎𠜱𠝹𠱓")) == 3); // add 3 whitespaces in Chinese
991+
assert(sz::levenshtein_distance_utf8(str("💖"), str("💗")) == 1);
992+
993+
assert(sz::levenshtein_distance_utf8(str("αβγδ"), str("αγδ")) == 1); // insert Beta
994+
assert(sz::levenshtein_distance_utf8(str("école"), str("école")) ==
995+
2); // letter "é" as a single character vs "e" + "´"
996+
assert(sz::levenshtein_distance_utf8(str("façade"), str("facade")) == 1); // "ç" with cedilla vs. plain
997+
assert(sz::levenshtein_distance_utf8(str("Schön"), str("Scho\u0308n")) == 2); // "ö" represented as "o" + "¨"
998+
assert(sz::levenshtein_distance_utf8(str("München"), str("Muenchen")) == 2); // German with umlaut vs. transcription
999+
assert(sz::levenshtein_distance_utf8(str("こんにちは世界"), str("こんばんは世界")) == 2);
9991000

10001001
// Computing alignment scores.
10011002
using matrix_t = std::int8_t[256][256];
@@ -1645,20 +1646,20 @@ static void test_levenshtein_distances() {
16451646
};
16461647

16471648
auto test_distance = [&](sz::string const &l, sz::string const &r, std::size_t expected) {
1648-
auto received = sz::edit_distance(l, r);
1649+
auto received = sz::levenshtein_distance(l, r);
16491650
auto received_score = sz::alignment_score(l, r, costs, -1);
16501651
if (received != expected) print_failure("Levenshtein", l, r, expected, received);
16511652
if ((std::size_t)(-received_score) != expected) print_failure("Scoring", l, r, expected, received_score);
16521653
// The distance relation commutes
1653-
received = sz::edit_distance(r, l);
1654+
received = sz::levenshtein_distance(r, l);
16541655
received_score = sz::alignment_score(r, l, costs, -1);
16551656
if (received != expected) print_failure("Levenshtein", r, l, expected, received);
16561657
if ((std::size_t)(-received_score) != expected) print_failure("Scoring", r, l, expected, received_score);
16571658

16581659
// Validate the bounded variants:
16591660
if (received > 1) {
1660-
assert(sz::edit_distance(l, r, received) == received);
1661-
assert(sz::edit_distance(r, l, received - 1) >= (std::max)(l.size(), r.size()));
1661+
assert(sz::levenshtein_distance(l, r, received) == received);
1662+
assert(sz::levenshtein_distance(r, l, received - 1) >= (std::max)(l.size(), r.size()));
16621663
}
16631664
};
16641665

0 commit comments

Comments
 (0)