@@ -40,7 +40,7 @@ struct similarities_callable {
4040 if (env.tokens .size () <= batch_size) throw std::runtime_error (" Batch size is too large." );
4141 }
4242
43- inline call_result_t operator ()(std::size_t batch_index) noexcept (false ) {
43+ call_result_t operator ()(std::size_t batch_index) noexcept (false ) {
4444 std::size_t const batch_size = results.size ();
4545 std::size_t const forward_token_index = (batch_index * batch_size) % (env.tokens .size () - batch_size);
4646 std::size_t const backward_token_index = env.tokens .size () - forward_token_index - batch_size;
@@ -49,10 +49,11 @@ struct similarities_callable {
4949 {env.tokens .data () + backward_token_index, batch_size});
5050 }
5151
52- inline call_result_t operator ()(std::span<token_view_t const > a, std::span<token_view_t const > b) noexcept (false ) {
52+ call_result_t operator ()(std::span<token_view_t const > a, std::span<token_view_t const > b) noexcept (false ) {
5353 // Unpack the extra arguments from `std::tuple` into the engine call using `std::apply`
5454 sz::status_t status =
5555 std::apply ([&](auto &&...rest ) { return engine (a, b, results.data (), rest...); }, extra_args);
56+ do_not_optimize (status);
5657
5758 if (status != sz::status_t ::success_k) throw std::runtime_error (" Failed to compute Levenshtein distance." );
5859 do_not_optimize (results);
@@ -71,7 +72,7 @@ struct similarities_callable {
7172};
7273
7374struct similarities_equality_t {
74- bool operator ()(check_value_t const &a, check_value_t const &b) const {
75+ bool operator ()(check_value_t const &a, check_value_t const &b) const noexcept {
7576 similarities_t const &a_ = *reinterpret_cast <similarities_t const *>(a);
7677 similarities_t const &b_ = *reinterpret_cast <similarities_t const *>(b);
7778 if (a_.size () != b_.size ()) return false ;
@@ -133,10 +134,56 @@ void bench_levenshtein(environment_t const &env) {
133134 bench_result_t utf8_baseline = bench_unary (env, name_utf8_baseline, call_utf8_baseline).log ();
134135
135136#if SZ_USE_ICE
136- bench_unary (env, " levenshtein_utf8_ice:batch" s + std::to_string (batch_size), call_baseline ,
137+ bench_unary (env, " levenshtein_utf8_ice:batch" s + std::to_string (batch_size), call_utf8_baseline ,
137138 similarities_callable<levenshtein_utf8_ice_t >(env, results_accelerated, batch_size),
138139 callable_no_op_t {}, // preprocessing
139140 similarities_equality_t {}) // equality check
141+ .log (utf8_baseline);
142+ scramble_accelerated_results ();
143+ #endif
144+ }
145+ }
146+
147+ void bench_needleman_wunsch (environment_t const &env) {
148+
149+ using namespace std ::string_literals; // for "s" suffix
150+
151+ #if SZ_USE_CUDA
152+ sz::gpu_specs_t specs = *sz::gpu_specs ();
153+ #endif
154+ std::vector<std::size_t > batch_sizes = {1024 / 32 , 1024 , 1024 * 32 };
155+ #if SZ_DEBUG
156+ batch_sizes = {1 , 2 , 32 };
157+ #endif
158+ similarities_t results_baseline, results_accelerated;
159+
160+ auto scramble_accelerated_results = [&] {
161+ std::shuffle (results_accelerated.begin (), results_accelerated.end (), global_random_generator ());
162+ };
163+
164+ for (std::size_t batch_size : batch_sizes) {
165+ results_baseline.resize (batch_size);
166+ results_accelerated.resize (batch_size);
167+
168+ auto call_baseline = similarities_callable<needleman_wunsch_serial_t >(env, results_baseline, batch_size);
169+ auto name_baseline = " needleman_wunsch_serial:batch" s + std::to_string (batch_size);
170+ bench_result_t baseline = bench_unary (env, name_baseline, call_baseline).log ();
171+
172+ #if SZ_USE_ICE
173+ bench_unary (env, " needleman_wunsch_ice:batch" s + std::to_string (batch_size), call_baseline,
174+ similarities_callable<needleman_wunsch_ice_t >(env, results_accelerated, batch_size),
175+ callable_no_op_t {}, // preprocessing
176+ similarities_equality_t {}) // equality check
177+ .log (baseline);
178+ scramble_accelerated_results ();
179+ #endif
180+
181+ #if SZ_USE_CUDA
182+ bench_unary (env, " needleman_wunsch_cuda:batch" s + std::to_string (batch_size), call_baseline,
183+ similarities_callable<needleman_wunsch_cuda_t , sz::gpu_specs_t >(env, results_accelerated,
184+ batch_size, specs),
185+ callable_no_op_t {}, // preprocessing
186+ similarities_equality_t {}) // equality check
140187 .log (baseline);
141188 scramble_accelerated_results ();
142189#endif
0 commit comments