Skip to content

Commit 9dd78c8

Browse files
committed
[cudaaligner] banded Myers: allow individual bandwidth
1 parent 44315a7 commit 9dd78c8

File tree

5 files changed

+68
-23
lines changed

5 files changed

+68
-23
lines changed

cudaaligner/include/claraparabricks/genomeworks/cudaaligner/aligner.hpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,20 @@ class FixedBandAligner : public Aligner
140140
/// Resets all data of the aligner and resets the bandwidth to the given argument.
141141
/// \param max_bandwidth The new maximal bandwidth to use for the fixed diagonal band of the Needleman-Wunsch matrix. Is not allowed to be a (multiple of 32) + 1. If such a value is passed it will throw and std::invalid_argument exception.
142142
virtual void reset_max_bandwidth(int32_t max_bandwidth) = 0;
143+
144+
using Aligner::add_alignment;
145+
/// \brief Add new alignment object. Only strings with characters
146+
/// from the alphabet [ACGT] are guaranteed to provide correct results.
147+
///
148+
/// \param max_bandwidth The new maximal bandwidth to use for the fixed diagonal band of the Needleman-Wunsch matrix of the alignment. Is not allowed to be a (multiple of 32) + 1. If such a value is passed it will throw and std::invalid_argument exception.
149+
/// \param query Query string
150+
/// \param query_length Query string length
151+
/// \param target Target string
152+
/// \param target_length Target string length
153+
/// \param reverse_complement_query Reverse complement the query string
154+
/// \param reverse_complement_target Reverse complement the target string
155+
virtual StatusType add_alignment(int32_t max_bandwidth, const char* query, int32_t query_length, const char* target, int32_t target_length,
156+
bool reverse_complement_query = false, bool reverse_complement_target = false) = 0;
143157
};
144158

145159
/// \brief Created Aligner object - DEPRECATED API

cudaaligner/src/aligner_global_myers_banded.cpp

Lines changed: 42 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ struct AlignerGlobalMyersBanded::InternalData
106106
InternalData(const DefaultDeviceAllocator& allocator, cudaStream_t stream)
107107
: seq_d(0, allocator, stream)
108108
, seq_starts_d(0, allocator, stream)
109+
, max_bandwidths_d(0, allocator, stream)
109110
, scheduling_index_d(0, allocator, stream)
110111
, scheduling_atomic_d(0, allocator, stream)
111112
, results_d(0, allocator, stream)
@@ -117,13 +118,15 @@ struct AlignerGlobalMyersBanded::InternalData
117118

118119
pinned_host_vector<char> seq_h;
119120
pinned_host_vector<int64_t> seq_starts_h;
121+
pinned_host_vector<int32_t> max_bandwidths_h;
120122
pinned_host_vector<int32_t> scheduling_index_h;
121123
pinned_host_vector<int8_t> results_h;
122124
pinned_host_vector<int32_t> result_counts_h;
123125
pinned_host_vector<int32_t> result_lengths_h;
124126
pinned_host_vector<int64_t> result_starts_h;
125127
device_buffer<char> seq_d;
126128
device_buffer<int64_t> seq_starts_d;
129+
device_buffer<int32_t> max_bandwidths_d;
127130
device_buffer<int32_t> scheduling_index_d;
128131
device_buffer<int32_t> scheduling_atomic_d;
129132
device_buffer<int8_t> results_d;
@@ -143,6 +146,7 @@ void AlignerGlobalMyersBanded::reallocate_internal_data(InternalData* const data
143146

144147
data->seq_h.clear();
145148
data->seq_starts_h.clear();
149+
data->max_bandwidths_h.clear();
146150
data->scheduling_index_h.clear();
147151
data->results_h.clear();
148152
data->result_counts_h.clear();
@@ -154,12 +158,14 @@ void AlignerGlobalMyersBanded::reallocate_internal_data(InternalData* const data
154158
data->seq_h.resize(2 * mem.sequence_memory / sizeof(char));
155159

156160
data->seq_starts_h.reserve(2 * n_alignments_initial + 1);
161+
data->max_bandwidths_h.reserve(n_alignments_initial);
157162
data->scheduling_index_h.reserve(n_alignments_initial);
158163
data->result_lengths_h.reserve(n_alignments_initial);
159164
data->result_starts_h.reserve(n_alignments_initial + 1);
160165

161166
data->seq_d.free();
162167
data->seq_starts_d.free();
168+
data->max_bandwidths_d.free();
163169
data->scheduling_index_d.free();
164170
data->scheduling_atomic_d.free();
165171
data->results_d.free();
@@ -186,6 +192,7 @@ void AlignerGlobalMyersBanded::reallocate_internal_data(InternalData* const data
186192

187193
data->seq_d.clear_and_resize(2 * mem.sequence_memory / sizeof(char));
188194
data->seq_starts_d.clear_and_resize(2 * n_alignments_initial + 1);
195+
data->max_bandwidths_d.clear_and_resize(n_alignments_initial);
189196
data->scheduling_index_d.clear_and_resize(n_alignments_initial);
190197
data->scheduling_atomic_d.clear_and_resize(1);
191198
data->results_d.clear_and_resize(mem.results_memory / sizeof(char));
@@ -215,16 +222,23 @@ AlignerGlobalMyersBanded::AlignerGlobalMyersBanded(int64_t max_device_memory, in
215222
AlignerGlobalMyersBanded::~AlignerGlobalMyersBanded() = default;
216223

217224
StatusType AlignerGlobalMyersBanded::add_alignment(const char* query, int32_t query_length, const char* target, int32_t target_length, bool reverse_complement_query, bool reverse_complement_target)
225+
{
226+
return AlignerGlobalMyersBanded::add_alignment(max_bandwidth_, query, query_length, target, target_length, reverse_complement_query, reverse_complement_target);
227+
}
228+
229+
StatusType AlignerGlobalMyersBanded::add_alignment(int32_t max_bandwidth, const char* query, int32_t query_length, const char* target, int32_t target_length, bool reverse_complement_query, bool reverse_complement_target)
218230
{
219231
GW_NVTX_RANGE(profiler, "AlignerGlobalMyersBanded::add_alignment");
232+
throw_on_negative(max_bandwidth, "query_length should not be negative");
220233
throw_on_negative(query_length, "query_length should not be negative");
221234
throw_on_negative(target_length, "target_length should not be negative");
222235
if (query == nullptr || target == nullptr)
223236
return StatusType::generic_error;
224237

225-
auto& seq_h = data_->seq_h;
226-
auto& seq_starts_h = data_->seq_starts_h;
227-
auto& result_starts_h = data_->result_starts_h;
238+
auto& seq_h = data_->seq_h;
239+
auto& seq_starts_h = data_->seq_starts_h;
240+
auto& max_bandwidths_h = data_->max_bandwidths_h;
241+
auto& result_starts_h = data_->result_starts_h;
228242

229243
batched_device_matrices<WordType>& pvs = data_->pvs;
230244
batched_device_matrices<WordType>& mvs = data_->mvs;
@@ -243,7 +257,13 @@ StatusType AlignerGlobalMyersBanded::add_alignment(const char* query, int32_t qu
243257
assert(mvs.number_of_matrices() == scores.number_of_matrices());
244258
assert(query_patterns.number_of_matrices() == scores.number_of_matrices());
245259

246-
const int64_t matrix_size = compute_matrix_size_for_alignment(query_length, target_length, max_bandwidth_);
260+
if (max_bandwidth > query_length)
261+
{
262+
// we need to quarantee max_bandwidth % word_size != 1.
263+
max_bandwidth = (query_length % word_size == 1 ? query_length + 1 : query_length);
264+
}
265+
266+
const int64_t matrix_size = compute_matrix_size_for_alignment(query_length, target_length, max_bandwidth);
247267
const int32_t n_words_query = ceiling_divide(query_length, word_size);
248268
const int32_t query_pattern_size = n_words_query * 4;
249269

@@ -266,6 +286,7 @@ StatusType AlignerGlobalMyersBanded::add_alignment(const char* query, int32_t qu
266286
genomeutils::copy_sequence(target, target_length, seq_h.data() + seq_start + query_length, reverse_complement_target);
267287
seq_starts_h.push_back(seq_start + query_length);
268288
seq_starts_h.push_back(seq_start + query_length + target_length);
289+
max_bandwidths_h.push_back(max_bandwidth);
269290
result_starts_h.push_back(result_starts_h.back() + query_length + target_length);
270291

271292
bool success = pvs.append_matrix(matrix_size);
@@ -299,13 +320,15 @@ StatusType AlignerGlobalMyersBanded::align_all()
299320
data_->scores.construct_device_matrices_async(stream_);
300321
data_->query_patterns.construct_device_matrices_async(stream_);
301322

302-
const auto& seq_h = data_->seq_h;
303-
const auto& seq_starts_h = data_->seq_starts_h;
304-
auto& scheduling_index_h = data_->scheduling_index_h;
305-
const auto& result_starts_h = data_->result_starts_h;
323+
const auto& seq_h = data_->seq_h;
324+
const auto& seq_starts_h = data_->seq_starts_h;
325+
const auto& max_bandwidths_h = data_->max_bandwidths_h;
326+
auto& scheduling_index_h = data_->scheduling_index_h;
327+
const auto& result_starts_h = data_->result_starts_h;
306328

307329
auto& seq_d = data_->seq_d;
308330
auto& seq_starts_d = data_->seq_starts_d;
331+
auto& max_bandwidths_d = data_->max_bandwidths_d;
309332
auto& scheduling_index_d = data_->scheduling_index_d;
310333
auto& results_d = data_->results_d;
311334
auto& result_counts_d = data_->result_counts_d;
@@ -322,6 +345,10 @@ StatusType AlignerGlobalMyersBanded::align_all()
322345
{
323346
result_starts_d.clear_and_resize(n_alignments + 1);
324347
}
348+
if (get_size(max_bandwidths_d) < n_alignments)
349+
{
350+
max_bandwidths_d.clear_and_resize(n_alignments);
351+
}
325352
if (get_size(result_lengths_d) < n_alignments)
326353
{
327354
result_lengths_d.clear_and_resize(n_alignments);
@@ -331,20 +358,22 @@ StatusType AlignerGlobalMyersBanded::align_all()
331358
scheduling_index_d.clear_and_resize(n_alignments);
332359
}
333360

361+
device_copy_n_async(seq_h.data(), seq_starts_h.back(), seq_d.data(), stream_);
362+
device_copy_n_async(seq_starts_h.data(), 2 * n_alignments + 1, seq_starts_d.data(), stream_);
363+
device_copy_n_async(max_bandwidths_h.data(), n_alignments, max_bandwidths_d.data(), stream_);
364+
device_copy_n_async(result_starts_h.data(), n_alignments + 1, result_starts_d.data(), stream_);
365+
334366
// Create an index of alignment tasks, which determines the processing
335367
// order. Sort this index by the sum of query and target lengths such
336368
// that large alignments get processed first.
337369
scheduling_index_h.resize(n_alignments);
338370
std::iota(begin(scheduling_index_h), end(scheduling_index_h), 0);
339371
std::sort(begin(scheduling_index_h), end(scheduling_index_h), [&seq_starts_h](int32_t i, int32_t j) { return seq_starts_h[2 * i + 2] - seq_starts_h[2 * i] > seq_starts_h[2 * j + 2] - seq_starts_h[2 * j]; });
340372

341-
device_copy_n_async(seq_h.data(), seq_starts_h.back(), seq_d.data(), stream_);
342-
device_copy_n_async(seq_starts_h.data(), 2 * n_alignments + 1, seq_starts_d.data(), stream_);
343373
device_copy_n_async(scheduling_index_h.data(), n_alignments, scheduling_index_d.data(), stream_);
344-
device_copy_n_async(result_starts_h.data(), n_alignments + 1, result_starts_d.data(), stream_);
345374

346375
myers_banded_gpu(results_d.data(), result_counts_d.data(), result_lengths_d.data(), result_starts_d.data(),
347-
seq_d.data(), seq_starts_d.data(), scheduling_index_d.data(), data_->scheduling_atomic_d.data(), n_alignments, max_bandwidth_,
376+
seq_d.data(), seq_starts_d.data(), max_bandwidths_d.data(), scheduling_index_d.data(), data_->scheduling_atomic_d.data(), n_alignments,
348377
data_->pvs, data_->mvs, data_->scores, data_->query_patterns,
349378
stream_);
350379
return StatusType::success;
@@ -434,6 +463,7 @@ void AlignerGlobalMyersBanded::reset_max_bandwidth(const int32_t max_bandwidth)
434463
void AlignerGlobalMyersBanded::reset_data()
435464
{
436465
data_->seq_starts_h.clear();
466+
data_->max_bandwidths_h.clear();
437467
data_->result_lengths_h.clear();
438468
data_->result_starts_h.clear();
439469

cudaaligner/src/aligner_global_myers_banded.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ class AlignerGlobalMyersBanded : public FixedBandAligner
3838
void reset() override;
3939

4040
StatusType add_alignment(const char* query, int32_t query_length, const char* target, int32_t target_length, bool reverse_complement_query, bool reverse_complement_target) override;
41+
StatusType add_alignment(int32_t max_bandwidth, const char* query, int32_t query_length, const char* target, int32_t target_length, bool reverse_complement_query, bool reverse_complement_target) override;
4142

4243
const std::vector<std::shared_ptr<Alignment>>& get_alignments() const override
4344
{

cudaaligner/src/myers_gpu.cu

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -854,26 +854,26 @@ __global__ void myers_banded_kernel(
854854
batched_device_matrices<WordType>::device_interface* mvi,
855855
batched_device_matrices<int32_t>::device_interface* scorei,
856856
batched_device_matrices<WordType>::device_interface* query_patternsi,
857-
char const* sequences_d, int64_t const* sequence_starts_d,
857+
char const* sequences_d, int64_t const* sequence_starts_d, int32_t const* max_bandwidths_d,
858858
const int32_t* scheduling_index_d, cuda::atomic<int32_t, cuda::thread_scope_device>* scheduling_atomic_d,
859-
const int32_t max_bandwidth,
860859
const int32_t n_alignments)
861860
{
862861
assert(warpSize == warp_size);
863862
assert(threadIdx.x < warp_size);
864-
assert(max_bandwidth % word_size != 1); // we need at least two bits in the last word
865863

866864
if (blockIdx.x >= n_alignments)
867865
return;
868866
const int32_t alignment_idx = get_alignment_task(scheduling_index_d, scheduling_atomic_d);
869867
assert(alignment_idx < n_alignments);
870868
if (alignment_idx >= n_alignments)
871869
return;
872-
const char* const query = sequences_d + sequence_starts_d[2 * alignment_idx];
873-
const char* const target = sequences_d + sequence_starts_d[2 * alignment_idx + 1];
874-
const int32_t query_size = target - query;
875-
const int32_t target_size = sequences_d + sequence_starts_d[2 * alignment_idx + 2] - target;
876-
const int32_t n_words = ceiling_divide(query_size, word_size);
870+
const char* const query = sequences_d + sequence_starts_d[2 * alignment_idx];
871+
const char* const target = sequences_d + sequence_starts_d[2 * alignment_idx + 1];
872+
const int32_t query_size = target - query;
873+
const int32_t target_size = sequences_d + sequence_starts_d[2 * alignment_idx + 2] - target;
874+
const int32_t n_words = ceiling_divide(query_size, word_size);
875+
const int32_t max_bandwidth = max_bandwidths_d[alignment_idx];
876+
assert(max_bandwidth % word_size != 1); // we need at least two bits in the last word
877877
if (max_bandwidth - 1 < abs(target_size - query_size) && query_size != 0 && target_size != 0)
878878
{
879879
if (threadIdx.x == 0)
@@ -1095,10 +1095,10 @@ void myers_gpu(int8_t* paths_d, int32_t* path_lengths_d, int32_t max_path_length
10951095
void myers_banded_gpu(int8_t* paths_d, int32_t* path_counts_d, int32_t* path_lengths_d, int64_t const* path_starts_d,
10961096
char const* sequences_d,
10971097
int64_t const* sequence_starts_d,
1098+
int32_t const* max_bandwidths_d,
10981099
int32_t const* scheduling_index_d,
10991100
int32_t* scheduling_atomic_int_d,
11001101
int32_t n_alignments,
1101-
int32_t max_bandwidth,
11021102
batched_device_matrices<myers::WordType>& pv,
11031103
batched_device_matrices<myers::WordType>& mv,
11041104
batched_device_matrices<int32_t>& score,
@@ -1116,7 +1116,7 @@ void myers_banded_gpu(int8_t* paths_d, int32_t* path_counts_d, int32_t* path_len
11161116
myers::init_atomic<<<1, 1, 0, stream>>>(scheduling_atomic_d);
11171117
myers::myers_banded_kernel<<<blocks, threads, 0, stream>>>(paths_d, path_counts_d, path_lengths_d, path_starts_d,
11181118
pv.get_device_interface(), mv.get_device_interface(), score.get_device_interface(), query_patterns.get_device_interface(),
1119-
sequences_d, sequence_starts_d, scheduling_index_d, scheduling_atomic_d, max_bandwidth, n_alignments);
1119+
sequences_d, sequence_starts_d, max_bandwidths_d, scheduling_index_d, scheduling_atomic_d, n_alignments);
11201120
GW_CU_CHECK_ERR(cudaPeekAtLastError());
11211121
}
11221122

cudaaligner/src/myers_gpu.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,10 @@ void myers_gpu(int8_t* paths_d, int32_t* path_lengths_d, int32_t max_path_length
5353
void myers_banded_gpu(int8_t* paths_d, int32_t* path_counts_d, int32_t* path_lengths_d, int64_t const* path_starts_d,
5454
char const* sequences_d,
5555
int64_t const* sequence_starts_d,
56+
int32_t const* max_bandwidths_d,
5657
int32_t const* scheduling_index_d,
5758
int32_t* scheduling_atomic_d,
5859
int32_t n_alignments,
59-
int32_t max_bandwidth,
6060
batched_device_matrices<myers::WordType>& pv,
6161
batched_device_matrices<myers::WordType>& mv,
6262
batched_device_matrices<int32_t>& score,

0 commit comments

Comments
 (0)