Skip to content

Commit b4eb6a4

Browse files
committed
Improve: Measure gap magnitude
In some scenarios the cost of a gap can be higher than a mismatch and must be included in estimates.
1 parent 9b2b4e5 commit b4eb6a4

File tree

2 files changed

+69
-63
lines changed

2 files changed

+69
-63
lines changed

include/stringcuzilla/similarity.hpp

Lines changed: 68 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -83,11 +83,15 @@ void rotate_three(value_type_ &a, value_type_ &b, value_type_ &c) noexcept {
8383

8484
struct linear_gap_costs_t {
8585
error_cost_t open_or_extend = 1;
86+
87+
constexpr sz_size_t magnitude() const noexcept { return std::abs(open_or_extend); }
8688
};
8789

8890
struct affine_gap_costs_t {
8991
error_cost_t open = 1;
9092
error_cost_t extend = 1;
93+
94+
constexpr sz_size_t magnitude() const noexcept { return std::max(std::abs(open), std::abs(extend)); }
9195
};
9296

9397
template <typename gap_costs_type_>
@@ -109,7 +113,7 @@ struct uniform_substitution_costs_t {
109113

110114
constexpr error_cost_t operator()(char a, char b) const noexcept { return a == b ? match : mismatch; }
111115
constexpr error_cost_t operator()(sz_rune_t a, sz_rune_t b) const noexcept { return a == b ? match : mismatch; }
112-
constexpr sz_size_t max_magnitude_change() const noexcept { return std::max(std::abs(match), std::abs(mismatch)); }
116+
constexpr sz_size_t magnitude() const noexcept { return std::max(std::abs(match), std::abs(mismatch)); }
113117
};
114118

115119
#pragma region - Algorithm Building Blocks
@@ -133,23 +137,22 @@ struct similarity_memory_requirements {
133137
size_t total = 0;
134138

135139
/**
136-
* @param[in] first_length The length of the first string in characters/codepoints.
137-
* @param[in] second_length The length of the second string in characters/codepoints.
138-
* @param[in] max_magnitude_change The absolute value of the maximum change in nearby cells.
140+
* @param[in] first_length,second_length The lengths of strings in characters/codepoints/runes.
141+
* @param[in] substitute_magnitude,gap_magnitude The absolute value of the maximum change in nearby cells.
139142
* @param[in] bytes_per_char The number of bytes per character, 4 for UTF-32, 1 for ASCII.
140143
* @param[in] register_width The alignment of the data in bytes, 4 for CUDA, 64 for AVX-512.
141144
*
142-
* To understand the @p max_magnitude_change parameter, consider the following example:
145+
* To understand the @p magnitude() parameter, consider the following example:
143146
* - substitution costs ranging from -16 to +15
144147
* - gap costs equal to -10
145148
* In that case, the biggest change will be `abs(-16) = 16`, so the passed argument should be 16.
146149
* In case of default Levenshtein distance, the maximum change is 1, so the passed argument should be 1.
147150
*/
148-
constexpr similarity_memory_requirements( //
149-
size_t first_length, size_t second_length, //
150-
sz_similarity_gaps_t gap_type, //
151-
size_t max_magnitude_change, //
152-
size_t bytes_per_char, //
151+
constexpr similarity_memory_requirements( //
152+
size_t first_length, size_t second_length, //
153+
sz_similarity_gaps_t gap_type, //
154+
size_t substitute_magnitude, size_t gap_magnitude, //
155+
size_t bytes_per_char, //
153156
size_t register_width) noexcept {
154157

155158
// Each diagonal in the DP matrix is only by 1 longer than the shorter string.
@@ -159,7 +162,8 @@ struct similarity_memory_requirements {
159162

160163
// The amount of memory we need per diagonal, depends on the maximum number of the differences
161164
// between 2 strings and the maximum cost of each change.
162-
size_t max_cell_value = (longer_length + 1) * max_magnitude_change;
165+
size_t magnitude = sz_max_of_two(substitute_magnitude, gap_magnitude);
166+
size_t max_cell_value = (longer_length + 1) * magnitude;
163167
if constexpr (!is_signed_k)
164168
this->bytes_per_cell = //
165169
max_cell_value < 256 ? 1
@@ -1401,9 +1405,9 @@ struct levenshtein_distance {
14011405

14021406
// Estimate the maximum dimension of the DP matrix and choose the best type for it.
14031407
using similarity_memory_requirements_t = similarity_memory_requirements<sz_size_t, false>;
1404-
similarity_memory_requirements_t requirements( //
1405-
first.size(), second.size(), //
1406-
gap_type<gap_costs_t>(), substituter_.max_magnitude_change(), //
1408+
similarity_memory_requirements_t requirements( //
1409+
first.size(), second.size(), //
1410+
gap_type<gap_costs_t>(), substituter_.magnitude(), gap_costs_.magnitude(), //
14071411
sizeof(char_t), SZ_MAX_REGISTER_WIDTH);
14081412

14091413
// When dealing with very small inputs, we may want to use a simpler Wagner-Fischer algorithm.
@@ -1531,9 +1535,9 @@ struct levenshtein_distance_utf8 {
15311535

15321536
// Estimate the maximum dimension of the DP matrix and choose the best type for it.
15331537
using similarity_memory_requirements_t = similarity_memory_requirements<sz_size_t, false>;
1534-
similarity_memory_requirements_t requirements( //
1535-
first.size(), second.size(), //
1536-
gap_type<gap_costs_t>(), substituter_.max_magnitude_change(), //
1538+
similarity_memory_requirements_t requirements( //
1539+
first.size(), second.size(), //
1540+
gap_type<gap_costs_t>(), substituter_.magnitude(), gap_costs_.magnitude(), //
15371541
sizeof(sz_rune_t), SZ_MAX_REGISTER_WIDTH);
15381542

15391543
span<sz_rune_t const> const first_utf32 {first_data_utf32, first_length_utf32};
@@ -1630,9 +1634,9 @@ struct needleman_wunsch_score {
16301634

16311635
// Estimate the maximum dimension of the DP matrix and choose the best type for it.
16321636
using similarity_memory_requirements_t = similarity_memory_requirements<sz_size_t, true>;
1633-
similarity_memory_requirements_t requirements( //
1634-
first.size(), second.size(), //
1635-
gap_type<gap_costs_t>(), substituter_.max_magnitude_change(), //
1637+
similarity_memory_requirements_t requirements( //
1638+
first.size(), second.size(), //
1639+
gap_type<gap_costs_t>(), substituter_.magnitude(), gap_costs_.magnitude(), //
16361640
sizeof(char_t), SZ_MAX_REGISTER_WIDTH);
16371641

16381642
// When dealing with very small inputs, we may want to use a simpler Wagner-Fischer algorithm.
@@ -1724,9 +1728,9 @@ struct smith_waterman_score {
17241728

17251729
// Estimate the maximum dimension of the DP matrix and choose the best type for it.
17261730
using similarity_memory_requirements_t = similarity_memory_requirements<sz_size_t, true>;
1727-
similarity_memory_requirements_t requirements( //
1728-
first.size(), second.size(), //
1729-
gap_type<gap_costs_t>(), substituter_.max_magnitude_change(), //
1731+
similarity_memory_requirements_t requirements( //
1732+
first.size(), second.size(), //
1733+
gap_type<gap_costs_t>(), substituter_.magnitude(), gap_costs_.magnitude(), //
17301734
sizeof(char_t), SZ_MAX_REGISTER_WIDTH);
17311735

17321736
// When dealing with very small inputs, we may want to use a simpler Wagner-Fischer algorithm.
@@ -1783,7 +1787,7 @@ status_t _score_in_parallel( //
17831787
core_per_input_type_ &&core_per_input, //
17841788
all_cores_per_input_type_ &&all_cores_per_input, //
17851789
first_strings_type_ const &first_strings, second_strings_type_ const &second_strings, results_type_ &&results,
1786-
sz_size_t max_magnitude_change, cpu_specs_t specs = {}) noexcept {
1790+
size_t substitute_magnitude, size_t gap_magnitude, cpu_specs_t specs = {}) noexcept {
17871791

17881792
using score_t = score_type_;
17891793
constexpr bool score_is_signed_k = std::is_signed_v<score_t>;
@@ -1808,9 +1812,9 @@ status_t _score_in_parallel( //
18081812
auto const &second = second_strings[i];
18091813

18101814
// ! Longer strings will be handled separately
1811-
similarity_memory_requirements_t requirements( //
1812-
first.size(), second.size(), //
1813-
gap_type<gap_costs_t>(), max_magnitude_change, //
1815+
similarity_memory_requirements_t requirements( //
1816+
first.size(), second.size(), //
1817+
gap_type<gap_costs_t>(), substitute_magnitude, gap_magnitude, //
18141818
sizeof(char_t), SZ_MAX_REGISTER_WIDTH);
18151819

18161820
if (requirements.total >= specs.l1_bytes) continue;
@@ -1824,9 +1828,9 @@ status_t _score_in_parallel( //
18241828
score_t result = 0;
18251829
auto const &first = first_strings[i];
18261830
auto const &second = second_strings[i];
1827-
similarity_memory_requirements_t requirements( //
1828-
first.size(), second.size(), //
1829-
gap_type<gap_costs_t>(), max_magnitude_change, //
1831+
similarity_memory_requirements_t requirements( //
1832+
first.size(), second.size(), //
1833+
gap_type<gap_costs_t>(), substitute_magnitude, gap_magnitude, //
18301834
sizeof(char_t), SZ_MAX_REGISTER_WIDTH);
18311835

18321836
if (requirements.total < specs.l1_bytes) continue;
@@ -1901,10 +1905,11 @@ struct levenshtein_distances {
19011905
results_type_ &&results, cpu_specs_t const &specs = {}) const noexcept {
19021906

19031907
if constexpr (capability_k & sz_cap_parallel_k)
1904-
return _score_in_parallel<sz_size_t>( //
1905-
core_per_input_t {substituter_, gap_costs_, alloc_}, //
1906-
all_cores_per_input_t {substituter_, gap_costs_, alloc_}, //
1907-
first_strings, second_strings, std::forward<results_type_>(results), 1, specs);
1908+
return _score_in_parallel<sz_size_t>( //
1909+
core_per_input_t {substituter_, gap_costs_, alloc_}, //
1910+
all_cores_per_input_t {substituter_, gap_costs_, alloc_}, //
1911+
first_strings, second_strings, std::forward<results_type_>(results), //
1912+
substituter_.magnitude(), gap_costs_.magnitude(), specs);
19081913
else
19091914
return _score_sequentially<sz_size_t>( //
19101915
all_cores_per_input_t {substituter_, gap_costs_, alloc_}, //
@@ -1945,10 +1950,11 @@ struct levenshtein_distances_utf8 {
19451950
results_type_ &&results, cpu_specs_t const &specs = {}) const noexcept {
19461951

19471952
if constexpr (capability_k & sz_cap_parallel_k)
1948-
return _score_in_parallel<sz_size_t>( //
1949-
core_per_input_t {substituter_, gap_costs_, alloc_}, //
1950-
all_cores_per_input_t {substituter_, gap_costs_, alloc_}, //
1951-
first_strings, second_strings, std::forward<results_type_>(results), 1, specs);
1953+
return _score_in_parallel<sz_size_t>( //
1954+
core_per_input_t {substituter_, gap_costs_, alloc_}, //
1955+
all_cores_per_input_t {substituter_, gap_costs_, alloc_}, //
1956+
first_strings, second_strings, std::forward<results_type_>(results), //
1957+
substituter_.magnitude(), gap_costs_.magnitude(), specs);
19521958
else
19531959
return _score_sequentially<sz_size_t>( //
19541960
all_cores_per_input_t {substituter_, gap_costs_, alloc_}, //
@@ -1991,11 +1997,11 @@ struct needleman_wunsch_scores {
19911997
results_type_ &&results, cpu_specs_t const &specs = {}) const noexcept {
19921998

19931999
if constexpr (capability_k & sz_cap_parallel_k)
1994-
return _score_in_parallel<sz_ssize_t>( //
1995-
core_per_input_t {substituter_, gap_costs_, alloc_}, //
1996-
all_cores_per_input_t {substituter_, gap_costs_, alloc_}, //
1997-
first_strings, second_strings, std::forward<results_type_>(results),
1998-
substituter_.max_magnitude_change(), specs);
2000+
return _score_in_parallel<sz_ssize_t>( //
2001+
core_per_input_t {substituter_, gap_costs_, alloc_}, //
2002+
all_cores_per_input_t {substituter_, gap_costs_, alloc_}, //
2003+
first_strings, second_strings, std::forward<results_type_>(results), //
2004+
substituter_.magnitude(), gap_costs_.magnitude(), specs);
19992005
else
20002006
return _score_sequentially<sz_ssize_t>( //
20012007
all_cores_per_input_t {substituter_, gap_costs_, alloc_}, //
@@ -2038,11 +2044,11 @@ struct smith_waterman_scores {
20382044
results_type_ &&results, cpu_specs_t const &specs = {}) const noexcept {
20392045

20402046
if constexpr (capability_k & sz_cap_parallel_k)
2041-
return _score_in_parallel<sz_ssize_t>( //
2042-
core_per_input_t {substituter_, gap_costs_, alloc_}, //
2043-
all_cores_per_input_t {substituter_, gap_costs_, alloc_}, //
2044-
first_strings, second_strings, std::forward<results_type_>(results),
2045-
substituter_.max_magnitude_change(), specs);
2047+
return _score_in_parallel<sz_ssize_t>( //
2048+
core_per_input_t {substituter_, gap_costs_, alloc_}, //
2049+
all_cores_per_input_t {substituter_, gap_costs_, alloc_}, //
2050+
first_strings, second_strings, std::forward<results_type_>(results), //
2051+
substituter_.magnitude(), gap_costs_.magnitude(), specs);
20462052
else
20472053
return _score_sequentially<sz_ssize_t>( //
20482054
all_cores_per_input_t {substituter_, gap_costs_, alloc_}, //
@@ -2081,7 +2087,7 @@ struct error_costs_256x256_t {
20812087
return result;
20822088
}
20832089

2084-
constexpr sz_size_t max_magnitude_change() const noexcept {
2090+
constexpr sz_size_t magnitude() const noexcept {
20852091
sz_size_t max_magnitude = 0;
20862092
for (int i = 0; i != 256; ++i)
20872093
for (int j = 0; j != 256; ++j) //
@@ -2170,7 +2176,7 @@ struct error_costs_26x26ascii_t {
21702176
return result;
21712177
}
21722178

2173-
constexpr sz_size_t max_magnitude_change() const noexcept {
2179+
constexpr sz_size_t magnitude() const noexcept {
21742180
sz_size_t max_magnitude = 0;
21752181
for (int i = 0; i != 26; ++i)
21762182
for (int j = 0; j != 26; ++j) //
@@ -2728,9 +2734,9 @@ struct levenshtein_distance<char, linear_gap_costs_t, allocator_type_, capabilit
27282734

27292735
// Estimate the maximum dimension of the DP matrix and choose the best type for it.
27302736
using similarity_memory_requirements_t = similarity_memory_requirements<sz_size_t, false>;
2731-
similarity_memory_requirements_t requirements( //
2732-
first.size(), second.size(), //
2733-
gap_type<gap_costs_t>(), substituter_.max_magnitude_change(), //
2737+
similarity_memory_requirements_t requirements( //
2738+
first.size(), second.size(), //
2739+
gap_type<gap_costs_t>(), substituter_.magnitude(), gap_costs_.magnitude(), //
27342740
sizeof(char_t), SZ_MAX_REGISTER_WIDTH);
27352741

27362742
// When dealing with larger arrays, we need to differentiate kernel with different cost aggregation types.
@@ -2833,9 +2839,9 @@ struct levenshtein_distance_utf8<char, linear_gap_costs_t, allocator_type_, capa
28332839

28342840
// Estimate the maximum dimension of the DP matrix and choose the best type for it.
28352841
using similarity_memory_requirements_t = similarity_memory_requirements<sz_size_t, false>;
2836-
similarity_memory_requirements_t requirements( //
2837-
first.size(), second.size(), //
2838-
gap_type<gap_costs_t>(), substituter_.max_magnitude_change(), //
2842+
similarity_memory_requirements_t requirements( //
2843+
first.size(), second.size(), //
2844+
gap_type<gap_costs_t>(), substituter_.magnitude(), gap_costs_.magnitude(), //
28392845
sizeof(sz_rune_t), SZ_MAX_REGISTER_WIDTH);
28402846

28412847
// When dealing with larger arrays, we need to differentiate kernel with different cost aggregation types.
@@ -3121,9 +3127,9 @@ struct needleman_wunsch_score<char, error_costs_256x256_t, linear_gap_costs_t, a
31213127

31223128
// Estimate the maximum dimension of the DP matrix and choose the best type for it.
31233129
using similarity_memory_requirements_t = similarity_memory_requirements<sz_size_t, true>;
3124-
similarity_memory_requirements_t requirements( //
3125-
first.size(), second.size(), //
3126-
gap_type<gap_costs_t>(), substituter_.max_magnitude_change(), //
3130+
similarity_memory_requirements_t requirements( //
3131+
first.size(), second.size(), //
3132+
gap_type<gap_costs_t>(), substituter_.magnitude(), gap_costs_.magnitude(), //
31273133
sizeof(char_t), SZ_MAX_REGISTER_WIDTH);
31283134

31293135
// When dealing with larger arrays, we need to differentiate kernel with different cost aggregation types.
@@ -3195,9 +3201,9 @@ struct smith_waterman_score<char, error_costs_256x256_t, linear_gap_costs_t, all
31953201

31963202
// Estimate the maximum dimension of the DP matrix and choose the best type for it.
31973203
using similarity_memory_requirements_t = similarity_memory_requirements<sz_size_t, true>;
3198-
similarity_memory_requirements_t requirements( //
3199-
first.size(), second.size(), //
3200-
gap_type<gap_costs_t>(), substituter_.max_magnitude_change(), //
3204+
similarity_memory_requirements_t requirements( //
3205+
first.size(), second.size(), //
3206+
gap_type<gap_costs_t>(), substituter_.magnitude(), gap_costs_.magnitude(), //
32013207
sizeof(char_t), SZ_MAX_REGISTER_WIDTH);
32023208

32033209
// When dealing with larger arrays, we need to differentiate kernel with different cost aggregation types.

include/stringzilla/types.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ enum class status_t {
130130
struct error_costs_unary_t {
131131
constexpr error_cost_t operator()(char a, char b) const noexcept { return a == b ? 0 : 1; }
132132
constexpr error_cost_t operator()(sz_rune_t a, sz_rune_t b) const noexcept { return a == b ? 0 : 1; }
133-
constexpr sz_size_t max_magnitude_change() const noexcept { return 1; }
133+
constexpr sz_size_t magnitude() const noexcept { return 1; }
134134
};
135135

136136
template <typename value_type_>

0 commit comments

Comments
 (0)