Skip to content

Commit fea39ed

Browse files
committed
Fix: NW/SW test correspondence
1 parent c96c5ed commit fea39ed

File tree

2 files changed

+64
-39
lines changed

2 files changed

+64
-39
lines changed

scripts/test.cu

Lines changed: 52 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,10 @@ template <typename value_type_>
5454
using unified_vector = std::vector<value_type_, sz::cuda::unified_alloc<value_type_>>;
5555
#endif
5656

57-
struct levenshtein_baseline_t {
57+
struct levenshtein_baselines_t {
58+
template <typename results_type_>
5859
sz::status_t operator()(arrow_strings_tape_t const &first, arrow_strings_tape_t const &second,
59-
sz_size_t *results) const {
60+
results_type_ *results) const {
6061
_sz_assert(first.size() == second.size());
6162
#pragma omp parallel for
6263
for (std::size_t i = 0; i != first.size(); ++i)
@@ -66,9 +67,9 @@ struct levenshtein_baseline_t {
6667
}
6768
};
6869

69-
struct needleman_wunsch_baseline_t {
70+
struct needleman_wunsch_baselines_t {
7071

71-
sz::scripts::error_costs_256x256_t substitution_costs = sz::scripts::error_costs_256x256_unary();
72+
sz::scripts::error_costs_256x256_t substitution_costs = sz::scripts::error_costs_256x256_diagonal();
7273
sz::error_cost_t gap_cost = -1;
7374

7475
sz::status_t operator()(arrow_strings_tape_t const &first, arrow_strings_tape_t const &second,
@@ -85,9 +86,9 @@ struct needleman_wunsch_baseline_t {
8586
}
8687
};
8788

88-
struct smith_waterman_baseline_t {
89+
struct smith_waterman_baselines_t {
8990

90-
sz::scripts::error_costs_256x256_t substitution_costs = sz::scripts::error_costs_256x256_unary();
91+
sz::scripts::error_costs_256x256_t substitution_costs = sz::scripts::error_costs_256x256_diagonal();
9192
sz::error_cost_t gap_cost = -1;
9293

9394
sz::status_t operator()(arrow_strings_tape_t const &first, arrow_strings_tape_t const &second,
@@ -146,7 +147,7 @@ void edit_distance_log_mismatch(std::string const &first, std::string const &sec
146147
format_string = "Edit Distance error (got %zd, expected %zd): \"%.22s%s\"\"%.22s%s\" \n";
147148
}
148149
else { format_string = "Edit Distance error (got %zu, expected %zu): \"%.22s%s\"\"%.22s%s\" \n"; }
149-
std::printf(format_string, first.c_str(), ellipsis, second.c_str(), ellipsis, result_base, result_simd);
150+
std::printf(format_string, result_simd, result_base, first.c_str(), ellipsis, second.c_str(), ellipsis);
150151
}
151152

152153
/**
@@ -262,46 +263,68 @@ static void edit_distances_compare(base_operator_ &&base_operator, simd_operator
262263

263264
static void test_equivalence(std::size_t batch_size = 1024, std::size_t max_string_length = 100) {
264265

266+
// Our logic of computing NW and SW alignment similarity scores differs in sign from most implementations.
267+
// It's similar to how the "cosine distance" is the inverse of the "cosine similarity".
268+
// In our case we compute the "distance" and by negating the sign, we can compute the "similarity".
269+
constexpr sz::error_cost_t unary_match_score = 1;
270+
constexpr sz::error_cost_t unary_mismatch_score = 0;
271+
constexpr sz::error_cost_t unary_gap_score = 0;
272+
using substituter_256x256_t = sz::error_costs_256x256_lookup_t;
273+
sz::scripts::error_costs_256x256_t costs_unary =
274+
sz::scripts::error_costs_256x256_diagonal(unary_match_score, unary_mismatch_score);
275+
substituter_256x256_t substituter_unary(costs_unary.data());
276+
{
277+
auto distance_l = levenshtein_baseline("abcdefg", 7, "abc_efg", 7);
278+
auto similarity_nw = needleman_wunsch_baseline("abcdefg", 7, "abc_efg", 7, substituter_unary, unary_gap_score);
279+
auto similarity_sw = smith_waterman_baseline("abcdefg", 7, "abc_efg", 7, substituter_unary, unary_gap_score);
280+
// Distance can be computed from the similarity, by inverting the sign around the length of the longest string:
281+
auto distance_nw = std::max(7, 7) - similarity_nw;
282+
auto distance_sw = std::max(7, 7) - similarity_sw;
283+
_sz_assert(distance_l == 1);
284+
_sz_assert(distance_nw == 1);
285+
_sz_assert(distance_sw == 1);
286+
}
287+
288+
// Now systematically compare the results of the baseline and SIMD implementations
265289
constexpr sz_capability_t serial_k = sz_cap_serial_k;
266290
constexpr sz_capability_t parallel_k = sz_cap_parallel_k;
267291

268292
edit_distances_compare<sz_size_t>( //
269-
levenshtein_baseline_t {}, //
293+
levenshtein_baselines_t {}, //
270294
sz::openmp::levenshtein_distances<serial_k, char, std::allocator<char>> {}, //
271295
batch_size, max_string_length);
272296

273297
edit_distances_compare<sz_size_t>( //
274-
levenshtein_baseline_t {}, //
298+
levenshtein_baselines_t {}, //
275299
sz::openmp::levenshtein_distances<parallel_k, char, std::allocator<char>> {}, //
276300
batch_size, max_string_length);
277301

278-
constexpr sz::error_cost_t blosum62_gap_opening_cost = 11;
279-
constexpr sz::error_cost_t blosum62_gap_extension_cost = 1;
302+
// Now let's take non-unary substitution costs, like BLOSUM62
303+
constexpr sz::error_cost_t blosum62_gap_extension_cost = 4; // ? The inverted typical (-4) value
280304
sz::scripts::error_costs_256x256_t blosum62 = sz::scripts::error_costs_256x256_blosum62();
281-
using substituter_t = sz::error_costs_256x256_lookup_t;
282305

283-
edit_distances_compare<sz_ssize_t>( //
284-
needleman_wunsch_baseline_t {blosum62, blosum62_gap_extension_cost}, //
285-
sz::openmp::needleman_wunsch_scores<serial_k, char, substituter_t, std::allocator<char>> {
286-
substituter_t {blosum62.data()}, blosum62_gap_extension_cost}, //
306+
edit_distances_compare<sz_ssize_t>( //
307+
needleman_wunsch_baselines_t {blosum62, blosum62_gap_extension_cost}, //
308+
sz::openmp::needleman_wunsch_scores<serial_k, char, substituter_256x256_t, std::allocator<char>> {
309+
substituter_256x256_t {blosum62.data()}, blosum62_gap_extension_cost}, //
287310
batch_size, max_string_length);
288311

289-
edit_distances_compare<sz_ssize_t>( //
290-
needleman_wunsch_baseline_t {blosum62, blosum62_gap_extension_cost}, //
291-
sz::openmp::needleman_wunsch_scores<parallel_k, char, substituter_t, std::allocator<char>> {
292-
substituter_t {blosum62.data()}, blosum62_gap_extension_cost}, //
312+
edit_distances_compare<sz_ssize_t>( //
313+
needleman_wunsch_baselines_t {blosum62, blosum62_gap_extension_cost}, //
314+
sz::openmp::needleman_wunsch_scores<parallel_k, char, substituter_256x256_t, std::allocator<char>> {
315+
substituter_256x256_t {blosum62.data()}, blosum62_gap_extension_cost}, //
293316
batch_size, max_string_length);
294317

295-
edit_distances_compare<sz_ssize_t>( //
296-
smith_waterman_baseline_t {blosum62, blosum62_gap_extension_cost}, //
297-
sz::openmp::smith_waterman_scores<serial_k, char, substituter_t, std::allocator<char>> {
298-
substituter_t {blosum62.data()}, blosum62_gap_extension_cost}, //
318+
edit_distances_compare<sz_ssize_t>( //
319+
smith_waterman_baselines_t {blosum62, blosum62_gap_extension_cost}, //
320+
sz::openmp::smith_waterman_scores<serial_k, char, substituter_256x256_t, std::allocator<char>> {
321+
substituter_256x256_t {blosum62.data()}, blosum62_gap_extension_cost}, //
299322
batch_size, max_string_length);
300323

301-
edit_distances_compare<sz_ssize_t>( //
302-
smith_waterman_baseline_t {blosum62, blosum62_gap_extension_cost}, //
303-
sz::openmp::smith_waterman_scores<parallel_k, char, substituter_t, std::allocator<char>> {
304-
substituter_t {blosum62.data()}, blosum62_gap_extension_cost}, //
324+
edit_distances_compare<sz_ssize_t>( //
325+
smith_waterman_baselines_t {blosum62, blosum62_gap_extension_cost}, //
326+
sz::openmp::smith_waterman_scores<parallel_k, char, substituter_256x256_t, std::allocator<char>> {
327+
substituter_256x256_t {blosum62.data()}, blosum62_gap_extension_cost}, //
305328
batch_size, max_string_length);
306329
};
307330

@@ -347,7 +370,7 @@ static void test_non_stl_extensions_for_reads() {
347370

348371
// Computing alignment scores.
349372
using matrix_t = std::int8_t[256][256];
350-
sz::scripts::error_costs_256x256_t substitution_costs = error_costs_256x256_unary();
373+
sz::scripts::error_costs_256x256_t substitution_costs = error_costs_256x256_diagonal();
351374
matrix_t &costs = *reinterpret_cast<matrix_t *>(substitution_costs.data());
352375

353376
_sz_assert(sz::alignment_score(str("listen"), str("silent"), costs, -1) == -4);

scripts/test.hpp

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -142,8 +142,9 @@ inline std::ptrdiff_t needleman_wunsch_baseline(char const *s1, std::size_t len1
142142
std::ptrdiff_t *row = &matrix_buffer[i * cols];
143143
for (std::size_t j = 1; j < cols; ++j) {
144144
std::ptrdiff_t substitution_cost = substitution_cost_for(s1[i - 1], s2[j - 1]);
145-
std::ptrdiff_t if_deletion_or_insertion = std::min(last_row[j], row[j - 1]) + gap_cost;
146-
row[j] = std::min(if_deletion_or_insertion, last_row[j - 1] + substitution_cost);
145+
std::ptrdiff_t if_substitution = last_row[j - 1] + substitution_cost;
146+
std::ptrdiff_t if_deletion_or_insertion = std::max(last_row[j], row[j - 1]) + gap_cost;
147+
row[j] = std::max(if_deletion_or_insertion, if_substitution);
147148
}
148149
}
149150

@@ -162,7 +163,7 @@ inline std::ptrdiff_t smith_waterman_baseline(char const *s1, std::size_t len1,
162163
std::vector<std::ptrdiff_t> matrix_buffer(rows * cols);
163164

164165
// Unlike the global alignment we need to track the largest score in the matrix.
165-
std::ptrdiff_t max_score = 0;
166+
std::ptrdiff_t best_score = 0;
166167

167168
// Initialize the borders of the matrix to 0.
168169
for (std::size_t i = 0; i < rows; ++i) matrix_buffer[i * cols + 0] /* [i][0] in 2D */ = 0;
@@ -175,15 +176,15 @@ inline std::ptrdiff_t smith_waterman_baseline(char const *s1, std::size_t len1,
175176
for (std::size_t j = 1; j < cols; ++j) {
176177
std::ptrdiff_t substitution_cost = substitution_cost_for(s1[i - 1], s2[j - 1]);
177178
std::ptrdiff_t if_substitution = last_row[j - 1] + substitution_cost;
178-
std::ptrdiff_t if_deletion = last_row[j] + gap_cost;
179-
std::ptrdiff_t if_insertion = row[j - 1] + gap_cost;
180-
std::ptrdiff_t score = std::max({std::ptrdiff_t(0), if_substitution, if_deletion, if_insertion});
179+
std::ptrdiff_t if_deletion_or_insertion = std::max(row[j - 1], last_row[j]) + gap_cost;
180+
std::ptrdiff_t if_substitution_or_reset = std::max<std::ptrdiff_t>(if_substitution, 0);
181+
std::ptrdiff_t score = std::max(if_deletion_or_insertion, if_substitution_or_reset);
181182
row[j] = score;
182-
max_score = std::max(max_score, score);
183+
best_score = std::max(best_score, score);
183184
}
184185
}
185186

186-
return max_score;
187+
return best_score;
187188
}
188189

189190
using error_costs_256x256_t = std::array<error_cost_t, 256 * 256>;
@@ -192,11 +193,12 @@ using error_costs_256x256_t = std::array<error_cost_t, 256 * 256>;
192193
* @brief Produces a substitution cost matrix for the Needleman-Wunsch alignment score,
193194
* that would yield the same result as the negative Levenshtein distance.
194195
*/
195-
inline error_costs_256x256_t error_costs_256x256_unary() noexcept {
196+
inline error_costs_256x256_t error_costs_256x256_diagonal(error_cost_t match_score = 0,
197+
error_cost_t mismatch_score = -1) noexcept {
196198
error_costs_256x256_t result;
197199
for (std::size_t i = 0; i != 256; ++i)
198200
for (std::size_t j = 0; j != 256; ++j) //
199-
result[i * 256 + j] = i == j ? 0 : -1;
201+
result[i * 256 + j] = i == j ? match_score : mismatch_score;
200202
return result;
201203
}
202204

0 commit comments

Comments
 (0)