 #include "cudadecoder/cuda-decoder.h"
 
 #include <algorithm>
-#include <map>
-#include <sstream>
+#include <atomic>
+#include <functional>
+#include <iterator>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <stack>
+#include <string>
+#include <thread>
 #include <tuple>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
 
 #include <cuda_runtime_api.h>
 #include <nvToolsExt.h>
 
+#include "base/kaldi-utils.h"
 #include "cudadecoder/cuda-decoder-kernels.h"
+#include "cudamatrix/cu-common.h"
 #include "online2/online-endpoint.h"
 #include "util/text-utils.h"
 
@@ -62,10 +75,10 @@ CudaDecoder::CudaDecoder(const CudaFst &fst, const CudaDecoderConfig &config,
   KALDI_ASSERT(nchannels_ > 0);
   KALDI_ASSERT(nlanes_ <= nchannels_);
   // All GPU work in decoder will be sent to compute_st_
-  cudaStreamCreate(&compute_st_);
+  CU_SAFE_CALL(cudaStreamCreate(&compute_st_));
   // Copies D2H of tokens for storage on host are done on
   // copy_st_, in parallel with compute_st_
-  cudaStreamCreate(&copy_st_);
+  CU_SAFE_CALL(cudaStreamCreate(&copy_st_));
   // For all the allocating/initializing process
   // We create a special channel
   // containing the exact state a channel should have when starting a new
@@ -96,7 +109,7 @@ CudaDecoder::CudaDecoder(const CudaFst &fst, const CudaDecoderConfig &config,
   --nchannels_;  // removing the special initial channel from the count
 
   // Making sure that everything is ready to use
-  cudaStreamSynchronize(compute_st_);
+  CU_SAFE_CALL(cudaStreamSynchronize(compute_st_));
   KALDI_DECODER_CUDA_CHECK_ERROR();
 }
 
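Every raw CUDA runtime call above is now routed through `CU_SAFE_CALL` (from the newly included `cudamatrix/cu-common.h`), so a failing stream creation or synchronization aborts with a diagnostic instead of being silently dropped. The sketch below shows the general shape of such a wrapper; the macro name `CUDA_CHECK` and its message format are illustrative assumptions, not Kaldi's actual definition.

```cpp
// Hypothetical sketch of a CU_SAFE_CALL-style wrapper: every CUDA runtime
// call returns a cudaError_t, and the macro checks it at the call site.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime_api.h>

#define CUDA_CHECK(stmt)                                          \
  do {                                                            \
    cudaError_t err = (stmt);                                     \
    if (err != cudaSuccess) {                                     \
      std::fprintf(stderr, "CUDA error '%s' at %s:%d in %s\n",    \
                   cudaGetErrorString(err), __FILE__, __LINE__,   \
                   #stmt);                                        \
      std::abort();                                               \
    }                                                             \
  } while (0)

int main() {
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));      // fails loudly, not silently
  CUDA_CHECK(cudaStreamSynchronize(stream));
  CUDA_CHECK(cudaStreamDestroy(stream));
  return 0;
}
```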
@@ -252,8 +265,8 @@ void CudaDecoder::InitDeviceParams() {
   // Setting Kernel Params
   // Sent to cuda kernels by copy
   // Making sure we'll be able to send it to the kernels
-  KALDI_ASSERT((sizeof(KernelParams) + sizeof(DeviceParams)) <
-               KALDI_CUDA_DECODER_MAX_KERNEL_ARGUMENTS_BYTE_SIZE);
+  KALDI_COMPILE_TIME_ASSERT((sizeof(KernelParams) + sizeof(DeviceParams)) <
+                            KALDI_CUDA_DECODER_MAX_KERNEL_ARGUMENTS_BYTE_SIZE);
 
   h_device_params_->d_channels_counters = d_channels_counters_.GetView();
   h_device_params_->d_lanes_counters = d_lanes_counters_.GetView();
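`KernelParams` and `DeviceParams` are passed to kernels by value, so their combined size must stay within the CUDA kernel-parameter budget; promoting the check from `KALDI_ASSERT` to `KALDI_COMPILE_TIME_ASSERT` catches a violation when the code is built rather than when it first runs. In standalone C++ the same guard is a `static_assert`; the struct below and the 4 KB figure (the classic kernel-argument limit on most architectures) are illustrative stand-ins, not Kaldi's real types.

```cpp
// Sketch: enforce the kernel-argument size budget at compile time.
// KernelArgsDemo and kMaxKernelArgBytes are hypothetical stand-ins.
#include <cstddef>

struct KernelArgsDemo {
  float *counters;  // device pointers and scalars passed by value
  int nlanes;
  int nchannels;
};

constexpr std::size_t kMaxKernelArgBytes = 4096;

// Trips during compilation, not at runtime, if the struct ever grows too big.
static_assert(sizeof(KernelArgsDemo) < kMaxKernelArgBytes,
              "kernel arguments exceed the parameter-space budget");

int main() { return 0; }
```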
@@ -322,12 +335,20 @@ void CudaDecoder::InitDeviceParams() {
 }
 
 CudaDecoder::~CudaDecoder() noexcept(false) {
-  // Stopping h2h tasks
+  // Wait for D2H copies before stopping H2H tasks.
+  CU_SAFE_CALL(cudaStreamSynchronize(compute_st_));
+  CU_SAFE_CALL(cudaStreamSynchronize(copy_st_));
+  // Stop h2h tasks.
+  WaitForInitDecodingH2HCopies();
+  WaitForH2HCopies();
   h2h_threads_running_ = false;
   n_h2h_main_task_todo_cv_.notify_all();
   for (std::thread &thread : cpu_dedicated_threads_) thread.join();
-  cudaStreamDestroy(compute_st_);
-  cudaStreamDestroy(copy_st_);
+  KALDI_ASSERT(n_h2h_main_task_todo_ <= 0);
+  KALDI_ASSERT(n_h2h_task_not_done_ == 0);
+
+  CU_SAFE_CALL(cudaStreamDestroy(compute_st_));
+  CU_SAFE_CALL(cudaStreamDestroy(copy_st_));
 
   KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaFreeHost(h_channels_counters_));
   KALDI_DECODER_CUDA_API_CHECK_ERROR(
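The destructor now follows the standard shutdown order for a worker pool: finish the GPU streams, drain the outstanding host-to-host copy tasks, and only then flip `h2h_threads_running_`, wake every thread blocked on the condition variable, and join. The sketch below shows that drain/signal/join pattern in isolation; the class and member names are hypothetical, not the decoder's actual members.

```cpp
// Minimal sketch of the shutdown ordering used above: drain, signal, join.
#include <condition_variable>
#include <mutex>
#include <thread>
#include <vector>

class WorkerPool {
 public:
  WorkerPool() {
    for (int i = 0; i < 4; ++i)
      workers_.emplace_back([this] { Run(); });
  }
  ~WorkerPool() {
    {
      std::lock_guard<std::mutex> lk(m_);
      running_ = false;  // flip the flag under the lock...
    }
    cv_.notify_all();    // ...then wake every blocked worker
    for (std::thread &t : workers_) t.join();
  }

 private:
  void Run() {
    std::unique_lock<std::mutex> lk(m_);
    // Workers sleep here until there is work or shutdown is requested.
    cv_.wait(lk, [this] { return !running_; });
  }
  std::mutex m_;
  std::condition_variable cv_;
  bool running_ = true;
  std::vector<std::thread> workers_;
};

int main() { WorkerPool pool; }  // destructor performs the orderly shutdown
```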
@@ -354,26 +375,27 @@ CudaDecoder::~CudaDecoder() noexcept(false) {
 
 void CudaDecoder::ComputeInitialChannel() {
   KALDI_ASSERT(nlanes_ > 0);
-  const int32 ilane = 0;
-  KALDI_ASSERT(ilane == 0);
+  const LaneId kLane0 = 0;
+
   // Following kernels working channel_id
   std::vector<ChannelId> channels = {init_channel_id_};
   num_frames_decoded_[init_channel_id_] = 0;
   SetChannelsInKernelParams(channels);  // not calling LoadChannelsStateToLanes,
                                         // init_channel_id_ is a special case
-  h_lanes_counters_.lane(ilane)->channel_to_compute = init_channel_id_;
+  h_lanes_counters_.lane(kLane0)->channel_to_compute = init_channel_id_;
 
-  cudaMemcpyAsync(d_lanes_counters_.MutableData(), h_lanes_counters_.lane(0),
-                  1 * sizeof(*h_lanes_counters_.lane(0)),
-                  cudaMemcpyHostToDevice, compute_st_);
-  h_lanes_counters_.lane(ilane)->main_q_narcs_and_end.y = 0;
+  CU_SAFE_CALL(cudaMemcpyAsync(d_lanes_counters_.MutableData(),
+                               h_lanes_counters_.lane(0),
+                               1 * sizeof(*h_lanes_counters_.lane(0)),
+                               cudaMemcpyHostToDevice, compute_st_));
+  h_lanes_counters_.lane(kLane0)->main_q_narcs_and_end.y = 0;
 
   // Adding the start state to the initial token queue
   InitializeInitialLaneKernel(KaldiCudaDecoderNumBlocks(1, 1),
                               KALDI_CUDA_DECODER_ONE_THREAD_BLOCK, compute_st_,
                               *h_device_params_);
 
-  h_lanes_counters_.lane(ilane)->post_expand_aux_q_end = 1;
+  h_lanes_counters_.lane(kLane0)->post_expand_aux_q_end = 1;
 
   PruneAndPreprocess();
   FinalizeProcessNonEmittingKernel(
@@ -386,7 +408,7 @@ void CudaDecoder::ComputeInitialChannel() {
   CopyLaneCountersToHostSync();
 
   const int32 main_q_end =
-      h_lanes_counters_.lane(ilane)->main_q_narcs_and_end.y;
+      h_lanes_counters_.lane(kLane0)->main_q_narcs_and_end.y;
   KALDI_ASSERT(main_q_end > 0);
 
   // Moving all data linked to init_channel_id_ to host
@@ -406,9 +428,10 @@ void CudaDecoder::InitDecoding(const std::vector<ChannelId> &channels) {
   const int nlanes_used = channels.size();
   // Getting *h_kernel_params ready to use
   LoadChannelsStateToLanes(channels);
-  cudaMemcpyAsync(d_lanes_counters_.MutableData(), h_lanes_counters_.lane(0),
-                  nlanes_used_ * sizeof(*h_lanes_counters_.lane(0)),
-                  cudaMemcpyHostToDevice, compute_st_);
+  CU_SAFE_CALL(cudaMemcpyAsync(d_lanes_counters_.MutableData(),
+                               h_lanes_counters_.lane(0),
+                               nlanes_used_ * sizeof(*h_lanes_counters_.lane(0)),
+                               cudaMemcpyHostToDevice, compute_st_));
 
   // Size of the initial main_q
   ChannelCounters &init_channel_counters =
@@ -424,8 +447,7 @@ void CudaDecoder::InitDecoding(const std::vector<ChannelId> &channels) {
       *h_kernel_params_);
 
   {
-    std::lock_guard<std::mutex> n_h2h_not_done_lk(
-        n_init_decoding_h2h_task_not_done_mutex_);
+    std::lock_guard<std::mutex> lk(n_init_decoding_h2h_task_not_done_mutex_);
     n_init_decoding_h2h_task_not_done_ += channels.size();
   }
   for (ChannelId ichannel : channels) {
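These `cudaMemcpyAsync` calls can only overlap with other work because the counters live in page-locked (pinned) host memory — note the destructor releases them with `cudaFreeHost`. From ordinary pageable memory, an "async" copy silently degrades to a synchronous one. A stand-alone sketch of the pinned-buffer pattern, with illustrative names and sizes:

```cpp
// Sketch: truly asynchronous H2D copy requires page-locked host memory.
#include <cuda_runtime_api.h>

int main() {
  const size_t n = 1024;
  int *h_buf = nullptr;
  int *d_buf = nullptr;
  cudaStream_t stream;

  cudaStreamCreate(&stream);
  cudaMallocHost(reinterpret_cast<void **>(&h_buf), n * sizeof(int));  // pinned
  cudaMalloc(reinterpret_cast<void **>(&d_buf), n * sizeof(int));

  for (size_t i = 0; i < n; ++i) h_buf[i] = static_cast<int>(i);

  // Because h_buf is pinned, this copy can overlap with host work and with
  // kernels running in other streams.
  cudaMemcpyAsync(d_buf, h_buf, n * sizeof(int), cudaMemcpyHostToDevice,
                  stream);
  cudaStreamSynchronize(stream);  // wait before reusing h_buf

  cudaFree(d_buf);
  cudaFreeHost(h_buf);  // pinned memory is released with cudaFreeHost
  cudaStreamDestroy(stream);
  return 0;
}
```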
@@ -549,14 +571,15 @@ int32 CudaDecoder::GetMaxForAllLanes(
 }
 
 void CudaDecoder::CopyLaneCountersToHostAsync() {
-  cudaMemcpyAsync(h_lanes_counters_.lane(0), d_lanes_counters_.MutableData(),
-                  nlanes_used_ * sizeof(*h_lanes_counters_.lane(0)),
-                  cudaMemcpyDeviceToHost, compute_st_);
+  CU_SAFE_CALL(cudaMemcpyAsync(h_lanes_counters_.lane(0),
+                               d_lanes_counters_.MutableData(),
+                               nlanes_used_ * sizeof(*h_lanes_counters_.lane(0)),
+                               cudaMemcpyDeviceToHost, compute_st_));
 }
 
 void CudaDecoder::CopyLaneCountersToHostSync() {
   CopyLaneCountersToHostAsync();
-  cudaStreamSynchronize(compute_st_);
+  CU_SAFE_CALL(cudaStreamSynchronize(compute_st_));
 }
 
 // One sync has to happen between PerformConcatenatedCopy and
@@ -643,7 +666,7 @@ void CudaDecoder::PostProcessingMainQueue() {
   ComputeLaneOffsetsKernel(KaldiCudaDecoderNumBlocks(1, 1),  // One CTA
                            KALDI_CUDA_DECODER_1D_BLOCK, compute_st_,
                            *h_device_params_, *h_kernel_params_);
-  cudaEventRecord(lane_offsets_ready_evt_, compute_st_);
+  CU_SAFE_CALL(cudaEventRecord(lane_offsets_ready_evt_, compute_st_));
 
   EmittingPreprocessAndListExtraPrevTokensStep3Kernel(
       KaldiCudaDecoderNumBlocks(nlanes_used_), KALDI_CUDA_DECODER_1D_BLOCK,
@@ -659,11 +682,11 @@ void CudaDecoder::PostProcessingMainQueue() {
 }
 
 void CudaDecoder::CopyMainQueueDataToHost() {
-  cudaEventRecord(concatenated_data_ready_evt_, compute_st_);
-  cudaStreamWaitEvent(copy_st_, concatenated_data_ready_evt_,
-                      0);  // the copies on copy_st will wait on compute_st_
-  cudaEventSynchronize(lane_offsets_ready_evt_);  // we need the total
-                                                  // size of each segments
+  CU_SAFE_CALL(cudaEventRecord(concatenated_data_ready_evt_, compute_st_));
+  // The copies on copy_st will wait on compute_st_.
+  CU_SAFE_CALL(cudaStreamWaitEvent(copy_st_, concatenated_data_ready_evt_, 0));
+  // We need the total size of each segment on the host.
+  CU_SAFE_CALL(cudaEventSynchronize(lane_offsets_ready_evt_));
   LaunchD2HCopies();
 
   // Making sure the previous H2H copies are done
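`CopyMainQueueDataToHost` coordinates the two streams mostly on the GPU: an event recorded on `compute_st_` plus `cudaStreamWaitEvent` orders `copy_st_` behind it without blocking the host, and the only host-side wait (`cudaEventSynchronize`) is for the lane offsets the CPU actually needs. A minimal record/wait illustration using plain runtime-API calls; the buffer and names are stand-ins:

```cpp
// Sketch: GPU-side ordering between two streams via an event. The copy on
// copy_st is guaranteed to see the memset issued on compute_st, yet the
// host never blocks between the two submissions.
#include <cuda_runtime_api.h>

int main() {
  const size_t bytes = 1024;
  void *d_buf = nullptr;
  char h_buf[1024];
  cudaStream_t compute_st, copy_st;
  cudaEvent_t ready_evt;

  cudaStreamCreate(&compute_st);
  cudaStreamCreate(&copy_st);
  cudaEventCreate(&ready_evt);
  cudaMalloc(&d_buf, bytes);

  cudaMemsetAsync(d_buf, 0x2a, bytes, compute_st);  // "produce" on compute_st
  // Mark the point copy_st must not run past...
  cudaEventRecord(ready_evt, compute_st);
  // ...and enforce it on the device, without blocking the CPU.
  cudaStreamWaitEvent(copy_st, ready_evt, 0);
  cudaMemcpyAsync(h_buf, d_buf, bytes, cudaMemcpyDeviceToHost, copy_st);

  cudaStreamSynchronize(copy_st);  // h_buf now holds the produced bytes

  cudaFree(d_buf);
  cudaEventDestroy(ready_evt);
  cudaStreamDestroy(compute_st);
  cudaStreamDestroy(copy_st);
  return h_buf[0] == 0x2a ? 0 : 1;
}
```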
@@ -711,7 +734,7 @@ void CudaDecoder::LaunchD2HCopies() {
                     nelements_acoustic_costs * sizeof(*d_acoustic_cost_concat_),
                     cudaMemcpyDeviceToHost, copy_st_));
   }
-  cudaEventRecord(d2h_copy_acoustic_evt_, copy_st_);
+  CU_SAFE_CALL(cudaEventRecord(d2h_copy_acoustic_evt_, copy_st_));
 
   int32 nelements_infotoken =
       h_lanes_counters_.lane(nlanes_used_)->main_q_end_lane_offset;
@@ -721,7 +744,7 @@ void CudaDecoder::LaunchD2HCopies() {
                     nelements_infotoken * sizeof(*d_infotoken_concat_),
                     cudaMemcpyDeviceToHost, copy_st_));
   }
-  cudaEventRecord(d2h_copy_infotoken_evt_, copy_st_);
+  CU_SAFE_CALL(cudaEventRecord(d2h_copy_infotoken_evt_, copy_st_));
   int32 nelements_extra_prev_tokens =
       h_lanes_counters_.lane(nlanes_used_)
           ->main_q_n_extra_prev_tokens_lane_offset;
@@ -737,7 +760,7 @@ void CudaDecoder::LaunchD2HCopies() {
                  sizeof(*d_extra_and_acoustic_cost_concat_),
                  cudaMemcpyDeviceToHost, copy_st_));
   }
-  cudaEventRecord(d2h_copy_extra_prev_tokens_evt_, copy_st_);
+  CU_SAFE_CALL(cudaEventRecord(d2h_copy_extra_prev_tokens_evt_, copy_st_));
 }
 
 void CudaDecoder::ConcatenateData() {
@@ -817,12 +840,13 @@ void CudaDecoder::AdvanceDecoding(
   }
   LoadChannelsStateToLanes(channels);
   KALDI_ASSERT(nlanes_used_ > 0);
-  cudaMemcpyAsync(d_lanes_counters_.MutableData(), h_lanes_counters_.lane(0),
-                  nlanes_used_ * sizeof(*h_lanes_counters_.lane(0)),
-                  cudaMemcpyHostToDevice, compute_st_);
+  CU_SAFE_CALL(cudaMemcpyAsync(d_lanes_counters_.MutableData(),
+                               h_lanes_counters_.lane(0),
+                               nlanes_used_ * sizeof(*h_lanes_counters_.lane(0)),
+                               cudaMemcpyHostToDevice, compute_st_));
   // compute_st_ will wait for nnet3 to complete
-  cudaEventRecord(nnet3_done_evt_, cudaStreamPerThread);
-  cudaStreamWaitEvent(compute_st_, nnet3_done_evt_, 0);
+  CU_SAFE_CALL(cudaEventRecord(nnet3_done_evt_, cudaStreamPerThread));
+  CU_SAFE_CALL(cudaStreamWaitEvent(compute_st_, nnet3_done_evt_, 0));
 
   // Estimating cutoff using argmin from last frame
   ResetForFrameAndEstimateCutoffKernel(
@@ -874,7 +898,8 @@ void CudaDecoder::AdvanceDecoding(
     PostProcessingMainQueue();
 
     // Waiting on previous d2h before writing on same device memory
-    cudaStreamWaitEvent(compute_st_, d2h_copy_extra_prev_tokens_evt_, 0);
+    CU_SAFE_CALL(cudaStreamWaitEvent(compute_st_,
+                                     d2h_copy_extra_prev_tokens_evt_, 0));
     // Concatenating the data that will be moved to host into large arrays
     ConcatenateData();
     // Copying the final lane counters for that frame
@@ -902,9 +927,10 @@ void CudaDecoder::AdvanceDecoding(
 
 void CudaDecoder::WaitForPartialHypotheses() {
   if (!generate_partial_hypotheses_) return;
-  while (n_partial_traceback_threads_not_done_.load(std::memory_order_acquire) >
-         0)
+  while (n_partial_traceback_threads_not_done_
+             .load(std::memory_order_acquire) > 0) {
     Sleep(200e-6);
+  }
 }
 
 void CudaDecoder::CheckOverflow() {
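The wait loop above pairs an acquire load with the release decrements performed by the traceback threads, so everything those threads wrote is visible once the counter reaches zero. A minimal illustration of that acquire/release counter handshake (names hypothetical):

```cpp
// Sketch: waiting on worker completion via an acquire/release counter.
#include <atomic>
#include <chrono>
#include <thread>
#include <vector>

int main() {
  constexpr int kWorkers = 4;
  std::atomic<int> not_done{kWorkers};
  std::vector<int> results(kWorkers, 0);
  std::vector<std::thread> threads;

  for (int i = 0; i < kWorkers; ++i) {
    threads.emplace_back([&, i] {
      results[i] = i * i;  // work performed before the release decrement
      not_done.fetch_sub(1, std::memory_order_release);
    });
  }

  // The acquire load makes the workers' writes to `results` visible here.
  while (not_done.load(std::memory_order_acquire) > 0) {
    std::this_thread::sleep_for(std::chrono::microseconds(200));
  }

  for (std::thread &t : threads) t.join();
  return results[3] == 9 ? 0 : 1;  // every worker's result is visible
}
```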
@@ -918,27 +944,15 @@ void CudaDecoder::CheckOverflow() {
 
     if ((q_overflow & OVERFLOW_MAIN_Q) == OVERFLOW_MAIN_Q) {
       // overflowed main_q
-      KALDI_WARN << "Preventing overflow of main_q. "
-                    "Continuing "
-                 << "execution but the quality of "
-                    "the output may be decreased. "
-                 << "To prevent this from happening, "
-                    "please increase the "
-                    "parameter "
-                    "--main-q-capacity"
-                 << " and/or decrease --max-active";
+      KALDI_WARN << ("Preventing overflow of main_q. The quality of the"
+                     " output may be reduced. Increase --main-q-capacity"
+                     " and/or decrease --max-active");
     }
     if ((q_overflow & OVERFLOW_AUX_Q) == OVERFLOW_AUX_Q) {
       // overflowed aux_q
-      KALDI_WARN << "Preventing overflow of aux_q. "
-                    "Continuing "
-                 << "execution but the quality of "
-                    "the output may be decreased. "
-                 << "To prevent this from happening, "
-                    "please increase the "
-                    "parameter "
-                    "--aux-q-capacity"
-                 << " and/or decrease --beam";
+      KALDI_WARN << ("Preventing overflow of aux_q. The quality of the output"
+                     " may be reduced. Increase --aux-q-capacity and/or"
+                     " decrease --beam");
     }
 
     KALDI_ASSERT(lane_counters->main_q_narcs_and_end.y < main_q_capacity_);
@@ -967,9 +981,10 @@ void CudaDecoder::GetBestCost(const std::vector<ChannelId> &channels,
   if (channels.size() == 0) return;
   // Getting the lanes ready to be used with those channels
   LoadChannelsStateToLanes(channels);
-  cudaMemcpyAsync(d_lanes_counters_.MutableData(), h_lanes_counters_.lane(0),
-                  nlanes_used_ * sizeof(*h_lanes_counters_.lane(0)),
-                  cudaMemcpyHostToDevice, compute_st_);
+  CU_SAFE_CALL(cudaMemcpyAsync(d_lanes_counters_.MutableData(),
+                               h_lanes_counters_.lane(0),
+                               nlanes_used_ * sizeof(*h_lanes_counters_.lane(0)),
+                               cudaMemcpyHostToDevice, compute_st_));
 
   auto func_main_q_end = [](const LaneCounters &c) {
     return c.main_q_narcs_and_end.y;
@@ -1010,7 +1025,7 @@ void CudaDecoder::GetBestCost(const std::vector<ChannelId> &channels,
   //  best+lattice_beam]
   list_finals_token_idx_and_cost->resize(nlanes_used_);
   // Waiting for the copy
-  cudaStreamSynchronize(compute_st_);
+  CU_SAFE_CALL(cudaStreamSynchronize(compute_st_));
   for (int32 ilane = 0; ilane < nlanes_used_; ++ilane) {
     int ichannel = channels[ilane];
     KALDI_ASSERT(NumFramesDecoded(ichannel) > 0);
@@ -1076,8 +1091,7 @@ void CudaDecoder::GetBestPredecessor(int32 ichannel, int32 curr_token_idx,
                      .size());
     CostType arc_extra_cost =
         h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[ichannel]
-            [offset + i]
-                .x;
+            [offset + i].x;
     // Picking one arc on the best path
     // (extra_cost == 0)
     if (arc_extra_cost == 0.0f) {
@@ -1280,14 +1294,17 @@ void CudaDecoder::AddFinalTokensToLattice(
     // next_state[arc_idx] we just need a valid arc_idx
     int32 arc_idx;
     if (final_token.IsUniqueTokenForStateAndFrame()) {
-      // If unique, we can directly use this arc_idx
+      // If unique, we can directly use this arc_idx.
       arc_idx = final_token.arc_idx;
     } else {
-      // If we have multiple tokens associated to that
-      // fst state, just pick the first one from the
-      // list
+      // If we have multiple tokens associated to that FST state, just pick
+      // the first one from the list.
       int32 offset, size;
       std::tie(offset, size) = final_token.GetSameFSTStateTokensList();
+      KALDI_ASSERT(!h_all_tokens_extra_prev_tokens_.empty());
+      KALDI_ASSERT(ichannel < h_all_tokens_extra_prev_tokens_.size());
+      KALDI_ASSERT(!h_all_tokens_extra_prev_tokens_[ichannel].empty());
+      KALDI_ASSERT(offset < h_all_tokens_extra_prev_tokens_[ichannel].size());
       InfoToken prev_token =
           h_all_tokens_extra_prev_tokens_[ichannel][offset];
       arc_idx = prev_token.arc_idx;
@@ -1797,8 +1814,9 @@ void CudaDecoder::CheckStaticAsserts() {
 
   // We need that because we need to be able to do the scan in one pass in
   // the kernel update_beam_using_histogram_kernel
-  KALDI_ASSERT(KALDI_CUDA_DECODER_HISTO_NBINS < KALDI_CUDA_DECODER_1D_BLOCK);
-  KALDI_ASSERT(KALDI_CUDA_DECODER_NONEM_LT_MAX_NARCS > 0);
+  KALDI_COMPILE_TIME_ASSERT(
+      KALDI_CUDA_DECODER_HISTO_NBINS < KALDI_CUDA_DECODER_1D_BLOCK);
+  KALDI_COMPILE_TIME_ASSERT(KALDI_CUDA_DECODER_NONEM_LT_MAX_NARCS > 0);
 }
 
 void CudaDecoder::LaunchH2HCopies() {
@@ -2006,7 +2024,7 @@ void CudaDecoder::ComputeH2HCopies() {
   }
   // Waiting for the D2H copies. This is threadsafe
   // Step 1: acoustic costs
-  cudaEventSynchronize(d2h_copy_acoustic_evt_);
+  CU_SAFE_CALL(cudaEventSynchronize(d2h_copy_acoustic_evt_));
   while ((ilane = n_acoustic_h2h_copies_todo_.fetch_sub(1)) >= 0) {
     int32 ichannel = lanes2channels_todo_[ilane];
     // Lock Channel
@@ -2025,7 +2043,7 @@ void CudaDecoder::ComputeH2HCopies() {
   }
 
   // Step 2: infotoken
-  cudaEventSynchronize(d2h_copy_infotoken_evt_);
+  CU_SAFE_CALL(cudaEventSynchronize(d2h_copy_infotoken_evt_));
   while ((ilane = n_infotoken_h2h_copies_todo_.fetch_sub(1)) >= 0) {
     int32 ichannel = lanes2channels_todo_[ilane];
     // Lock Channel
@@ -2037,7 +2055,7 @@ void CudaDecoder::ComputeH2HCopies() {
   // Step 3:
   // - extra prev tokens
   // - partial path and endpointing
-  cudaEventSynchronize(d2h_copy_extra_prev_tokens_evt_);
+  CU_SAFE_CALL(cudaEventSynchronize(d2h_copy_extra_prev_tokens_evt_));
   while ((ilane = n_extra_prev_tokens_h2h_copies_todo_.fetch_sub(1)) >= 0) {
     int32 ichannel = lanes2channels_todo_[ilane];
     // Lock Channel
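Each `while ((ilane = counter.fetch_sub(1)) >= 0)` loop is a lock-free way to split a fixed batch of lane copies across the H2H worker threads: every thread atomically claims the next remaining index until the counter goes negative. A stand-alone sketch of that idiom (names hypothetical):

```cpp
// Sketch: distributing N tasks across threads with atomic fetch_sub.
// Each thread claims indices until the shared counter goes negative.
#include <atomic>
#include <thread>
#include <vector>

int main() {
  constexpr int kTasks = 100;
  std::atomic<int> todo{kTasks - 1};  // highest unclaimed index
  std::vector<int> done(kTasks, 0);
  std::vector<std::thread> threads;

  for (int t = 0; t < 4; ++t) {
    threads.emplace_back([&] {
      int i;
      // fetch_sub returns the previous value, so each index is claimed by
      // exactly one thread; threads stop once the counter is spent.
      while ((i = todo.fetch_sub(1)) >= 0) {
        done[i] = 1;  // stand-in for copying lane i's data to the host
      }
    });
  }
  for (std::thread &t : threads) t.join();

  for (int v : done)
    if (v != 1) return 1;  // every task ran exactly once
  return 0;
}
```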