
Commit 8e2bbd2

akshaysubr authored and danpovey committed
[src] Enable multiple threads for chain-generic-numerator to remove CPU bottleneck (#3766)
1 parent 2b30a1e commit 8e2bbd2

18 files changed, +190 -19 lines changed

src/chain/chain-denominator.cc

Lines changed: 7 additions & 0 deletions

@@ -108,6 +108,7 @@ void DenominatorComputation::AlphaFirstFrame() {
 
 // the alpha computation for some 0 < t <= num_time_steps_.
 void DenominatorComputation::AlphaGeneralFrame(int32 t) {
+  NVTX_RANGE(__func__);
   KALDI_ASSERT(t > 0 && t <= frames_per_sequence_);
   BaseFloat *this_alpha = alpha_.RowData(t);
   const BaseFloat *prev_alpha_dash = alpha_.RowData(t - 1);
@@ -186,6 +187,7 @@ void DenominatorComputation::AlphaGeneralFrame(int32 t) {
 }
 
 void DenominatorComputation::AlphaDash(int32 t) {
+  NVTX_RANGE(__func__);
   BaseFloat *this_alpha = alpha_.RowData(t);
 
   // create a 'fake matrix' for the regular alphas- view this row as a matrix.
@@ -209,6 +211,7 @@ void DenominatorComputation::AlphaDash(int32 t) {
 
 // compute beta from beta-dash.
 void DenominatorComputation::Beta(int32 t) {
+  NVTX_RANGE(__func__);
   BaseFloat *this_beta_dash = beta_.RowData(t % 2);
   // create a 'fake matrix' for the regular beta-dash (which is
   // the counterpart of alpha-dash)- view this row as a matrix.
@@ -231,6 +234,7 @@ void DenominatorComputation::Beta(int32 t) {
 }
 
 BaseFloat DenominatorComputation::Forward() {
+  NVTX_RANGE(__func__);
   AlphaFirstFrame();
   AlphaDash(0);
   for (int32 t = 1; t <= frames_per_sequence_; t++) {
@@ -241,6 +245,7 @@ BaseFloat DenominatorComputation::Forward() {
 }
 
 BaseFloat DenominatorComputation::ComputeTotLogLike() {
+  NVTX_RANGE(__func__);
   tot_prob_.Resize(num_sequences_);
   // View the last alpha-dash as a matrix of size num-hmm-states by num-sequences.
   CuSubMatrix<BaseFloat> last_alpha_dash(
@@ -281,6 +286,7 @@ BaseFloat DenominatorComputation::ComputeTotLogLike() {
 bool DenominatorComputation::Backward(
     BaseFloat deriv_weight,
     CuMatrixBase<BaseFloat> *nnet_output_deriv) {
+  NVTX_RANGE(__func__);
   BetaDashLastFrame();
   Beta(frames_per_sequence_);
   for (int32 t = frames_per_sequence_ - 1; t >= 0; t--) {
@@ -332,6 +338,7 @@ void DenominatorComputation::BetaDashLastFrame() {
 }
 
 void DenominatorComputation::BetaDashGeneralFrame(int32 t) {
+  NVTX_RANGE(__func__);
   KALDI_ASSERT(t >= 0 && t < frames_per_sequence_);
   int32 num_pdfs = exp_nnet_output_transposed_.NumRows();
   // t_wrapped gives us the time-index we use when indexing

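All seven changes in this file are NVTX_RANGE(__func__) annotations. The macro (see the NvtxTracer definition in src/cudamatrix/cu-common.cc below) creates a scoped object that pushes a named profiler range on entry and pops it when the function returns, so nested calls show up as nested ranges in tools such as Nsight. A toy, standalone illustration of that RAII push/pop pattern — plain prints instead of NVTX calls, not Kaldi code:

#include <cstdio>

// Toy stand-in for NvtxTracer: push on construction, pop on destruction,
// so one macro line at the top of a function covers every exit path.
struct ScopedRange {
  explicit ScopedRange(const char *name) : name_(name) {
    std::printf("push %s\n", name_);  // where nvtxRangePushEx would go
  }
  ~ScopedRange() { std::printf("pop  %s\n", name_); }  // nvtxRangePop
  const char *name_;
};
#define TOY_RANGE() ScopedRange toy_range_(__func__)

void AlphaGeneralFrame() { TOY_RANGE(); /* per-frame work */ }

void Forward() {
  TOY_RANGE();            // opens the "Forward" range
  for (int t = 0; t < 3; ++t)
    AlphaGeneralFrame();  // three nested "AlphaGeneralFrame" ranges
}                         // "Forward" pops here

int main() { Forward(); }
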
src/chain/chain-generic-numerator.cc

Lines changed: 60 additions & 15 deletions

@@ -23,7 +23,6 @@
 
 #include <iterator>
 #include <limits>
-#include <algorithm>
 
 namespace kaldi {
 namespace chain {
@@ -34,13 +33,16 @@ namespace chain {
 // for end-to-end training 'supervision's.
 
 GenericNumeratorComputation::GenericNumeratorComputation(
+    const GenericNumeratorComputationOptions &opts,
     const Supervision &supervision,
     const CuMatrixBase<BaseFloat> &nnet_output):
     supervision_(supervision),
-    nnet_output_(nnet_output) {
+    nnet_output_(nnet_output),
+    opts_(opts) {
   KALDI_ASSERT(supervision.num_sequences *
                supervision.frames_per_sequence == nnet_output.NumRows() &&
                supervision.label_dim == nnet_output.NumCols());
+  NVTX_RANGE(__func__);
 
   using std::vector;
   int num_sequences = supervision_.num_sequences;
@@ -119,6 +121,7 @@ GenericNumeratorComputation::GenericNumeratorComputation(
 
 void GenericNumeratorComputation::AlphaFirstFrame(int seq,
                                                   Matrix<BaseFloat> *alpha) {
+  NVTX_RANGE(__func__);
   const int32 num_frames = supervision_.frames_per_sequence,
       num_states = supervision_.e2e_fsts[seq].NumStates();
   alpha->Resize(num_frames + 1, num_states + 1, kSetZero);
@@ -133,6 +136,7 @@ void GenericNumeratorComputation::CopySpecificPdfsIndirect(
     const std::vector<MatrixIndexT> &indices,
     Matrix<BaseFloat> *out) {
   KALDI_ASSERT(nnet_output_stride_ == nnet_output_.Stride());
+  NVTX_RANGE(__func__);
   const int32 num_sequences = supervision_.num_sequences,
       frames_per_sequence = supervision_.frames_per_sequence;
 
@@ -156,6 +160,7 @@ void GenericNumeratorComputation::CopySpecificPdfsIndirect(
 BaseFloat GenericNumeratorComputation::AlphaRemainingFrames(int seq,
     const Matrix<BaseFloat> &probs,
     Matrix<BaseFloat> *alpha) {
+  NVTX_RANGE(__func__);
   // Define some variables to make things nicer
   const int32 num_sequences = supervision_.num_sequences,
       num_frames = supervision_.frames_per_sequence;
@@ -212,6 +217,7 @@ BaseFloat GenericNumeratorComputation::AlphaRemainingFrames(int seq,
 bool GenericNumeratorComputation::ForwardBackward(
     BaseFloat *total_loglike,
     CuMatrixBase<BaseFloat> *nnet_output_deriv) {
+  NVTX_RANGE(__func__);
   KALDI_ASSERT(total_loglike != NULL);
   KALDI_ASSERT(nnet_output_deriv != NULL);
   KALDI_ASSERT(nnet_output_deriv->NumCols() == nnet_output_.NumCols());
@@ -221,35 +227,71 @@ bool GenericNumeratorComputation::ForwardBackward(
   const int32 num_sequences = supervision_.num_sequences;
 
   bool ok = true;
-  Matrix<BaseFloat> alpha;
-  Matrix<BaseFloat> beta;
   Matrix<BaseFloat> probs;
-  Matrix<BaseFloat> derivs;
+  Matrix<BaseFloat> derivs;  // Don't need nthreads copies to avoid data
+                             // races since each sequence operates on a
+                             // distinct set of columns
 
   // We selectively copy only those pdfs we need
   CopySpecificPdfsIndirect(nnet_output_, index_to_pdf_, &probs);
 
   derivs.Resize(probs.NumRows(), probs.NumCols());
   derivs.Set(-std::numeric_limits<BaseFloat>::infinity());
 
-  for (int seq = 0; seq < num_sequences; ++seq) {
-    // Forward part
-    AlphaFirstFrame(seq, &alpha);
-    partial_loglike += AlphaRemainingFrames(seq, probs, &alpha);
-
-    // Backward part
-    BetaLastFrame(seq, alpha, &beta);
-    BetaRemainingFrames(seq, probs, alpha, &beta, &derivs);
-    if (GetVerboseLevel() >= 1)
-      ok = ok && CheckValues(seq, probs, alpha, beta, derivs);
+  // Set total number of workers to the available hardware concurrency
+  unsigned int nthreads = opts_.num_threads > 0 ? opts_.num_threads :
+                          std::thread::hardware_concurrency();
+  // Naive load balancing, each thread gets a chunk of the sequences to process
+  unsigned int num_sequences_per_thread =
+      (num_sequences + nthreads - 1) / nthreads;
+
+  // Allocate one alpha and beta matrix per thread to avoid contention
+  std::vector<Matrix<BaseFloat>> alpha(nthreads);
+  std::vector<Matrix<BaseFloat>> beta(nthreads);
+
+  // Per thread partial values and boolean
+  std::vector<BaseFloat> partial_loglike_mt(nthreads, static_cast<BaseFloat>(0));
+  std::vector<bool> ok_mt(nthreads, true);
+
+  // Lambda function for each thread's portion of the computation
+  auto thread_lambda = [&] (int thread, int num_sequences, int num_sequences_per_thread) {
+    int seq_st = thread * num_sequences_per_thread;
+    int seq_en = seq_st + num_sequences_per_thread;
+    seq_en = (seq_en <= num_sequences) ? seq_en : num_sequences;
+    for (int seq = seq_st; seq < seq_en; ++seq) {
+      // Forward part
+      AlphaFirstFrame(seq, &alpha[thread]);
+      partial_loglike_mt[thread] += AlphaRemainingFrames(seq, probs, &alpha[thread]);
+
+      // Backward part
+      BetaLastFrame(seq, alpha[thread], &beta[thread]);
+      BetaRemainingFrames(seq, probs, alpha[thread], &beta[thread], &derivs);
+      if (GetVerboseLevel() >= 1)
+        ok_mt[thread] = ok_mt[thread] && CheckValues(seq, probs, alpha[thread], beta[thread], derivs);
+    }
+    return;
+  };
+
+  std::vector<std::thread> workers(nthreads);
+  for (int thread = 0; thread < nthreads; ++thread)
+    // Launch all threads
+    workers[thread] = std::thread(thread_lambda, thread, num_sequences, num_sequences_per_thread);
+  for (int thread = 0; thread < nthreads; ++thread) {
+    // Join threads back in
+    workers[thread].join();
    // Reduce thread values to a single value
     partial_loglike += partial_loglike_mt[thread];
     ok = ok && ok_mt[thread];
   }
+
   // Transfer and add the derivatives to the values in the matrix
   AddSpecificPdfsIndirect(&derivs, index_to_pdf_, nnet_output_deriv);
   *total_loglike = partial_loglike;
   return ok;
 }
 
 BaseFloat GenericNumeratorComputation::ComputeObjf() {
+  NVTX_RANGE(__func__);
   BaseFloat partial_loglike = 0;
   const int32 num_sequences = supervision_.num_sequences;
 
@@ -275,6 +317,7 @@ BaseFloat GenericNumeratorComputation::GetTotalProb(
 void GenericNumeratorComputation::BetaLastFrame(int seq,
                                                 const Matrix<BaseFloat> &alpha,
                                                 Matrix<BaseFloat> *beta) {
+  NVTX_RANGE(__func__);
   // Sets up the beta quantity on the last frame (frame ==
   // frames_per_sequence_). Note that the betas we use here contain a
   // 1/(tot-prob) factor in order to simplify the backprop.
@@ -298,6 +341,7 @@ void GenericNumeratorComputation::BetaRemainingFrames(int seq,
     const Matrix<BaseFloat> &alpha,
     Matrix<BaseFloat> *beta,
     Matrix<BaseFloat> *derivs) {
+  NVTX_RANGE(__func__);
   const int32
       num_sequences = supervision_.num_sequences,
       num_frames = supervision_.frames_per_sequence,
@@ -340,6 +384,7 @@ void GenericNumeratorComputation::AddSpecificPdfsIndirect(
     Matrix<BaseFloat> *logprobs,
     const std::vector<MatrixIndexT> &indices,
     CuMatrixBase<BaseFloat> *output) {
+  NVTX_RANGE(__func__);
   const int32 num_sequences = supervision_.num_sequences,
       frames_per_sequence = supervision_.frames_per_sequence;

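The heart of the change: the per-sequence forward-backward loop is split across worker threads, each with its own alpha/beta matrices and its own partial log-likelihood, with a serial reduction after join. The shared derivs matrix needs no locking because each sequence writes a distinct set of columns. Below is a minimal standalone sketch of the same partitioning scheme with the Kaldi types stripped out; ProcessSequence and kNumSequences are illustrative stand-ins, not Kaldi code:

#include <algorithm>
#include <iostream>
#include <thread>
#include <vector>

double ProcessSequence(int seq) {
  // Stand-in for the per-sequence forward-backward; returns its log-like.
  return -0.5 * seq;
}

int main() {
  const int kNumSequences = 10;
  unsigned int nthreads = std::max(1u, std::thread::hardware_concurrency());
  // Ceiling division: each worker gets a contiguous chunk of sequences.
  unsigned int per_thread = (kNumSequences + nthreads - 1) / nthreads;

  // One accumulator per thread: no locks needed, since each worker only
  // writes its own slot (and, in the real code, its own alpha/beta matrices).
  std::vector<double> partial(nthreads, 0.0);

  auto worker = [&](unsigned int t) {
    int begin = t * per_thread;
    int end = std::min<int>(begin + per_thread, kNumSequences);
    for (int seq = begin; seq < end; ++seq)
      partial[t] += ProcessSequence(seq);
  };

  std::vector<std::thread> threads;
  for (unsigned int t = 0; t < nthreads; ++t)
    threads.emplace_back(worker, t);

  double total = 0.0;
  for (unsigned int t = 0; t < nthreads; ++t) {
    threads[t].join();
    total += partial[t];  // serial reduction after join
  }
  std::cout << "total log-like: " << total << std::endl;
  return 0;
}
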
src/chain/chain-generic-numerator.h

Lines changed: 21 additions & 1 deletion

@@ -25,6 +25,8 @@
 
 #include <vector>
 #include <map>
+#include <algorithm>
+#include <thread>
 
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
@@ -102,6 +104,20 @@ namespace chain {
 */
 
 
+struct GenericNumeratorComputationOptions {
+  unsigned int num_threads;
+  GenericNumeratorComputationOptions() :
+      num_threads(std::min(static_cast<unsigned int>(4),
+                           std::thread::hardware_concurrency())) { }
+  void Register(OptionsItf *opts) {
+    opts->Register("numerator-graph-threads", &num_threads, "Number of threads "
+                   "to use to parallelize the chain numerator graph computation. "
+                   "If 0, use available hardware concurrency.");
+  }
+
+};
+
+
 // This class is responsible for the forward-backward of the
 // end-to-end 'supervision' (numerator) FST. This kind of FST can
 // have self-loops.
@@ -112,7 +128,8 @@ namespace chain {
 class GenericNumeratorComputation {
  public:
   /// Initializes the object.
-  GenericNumeratorComputation(const Supervision &supervision,
+  GenericNumeratorComputation(const GenericNumeratorComputationOptions &opts,
+                              const Supervision &supervision,
                               const CuMatrixBase<BaseFloat> &nnet_output);
 
   // Does the forward-backward computation. Returns the total log-prob
@@ -198,6 +215,9 @@ class GenericNumeratorComputation {
   // an offset subtracted from the logprobs of transitions out of the first
   // state of each graph to help reduce numerical problems.
   Vector<BaseFloat> offsets_;
+
+  // Configuration options
+  const GenericNumeratorComputationOptions &opts_;
 };
 
 }  // namespace chain

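The default thread count is min(4, std::thread::hardware_concurrency()), and passing --numerator-graph-threads=0 defers to hardware concurrency inside ForwardBackward(). A minimal sketch of wiring the struct to a command line, assuming a Kaldi build; the program and usage string are made up for illustration:

#include "base/kaldi-common.h"
#include "util/parse-options.h"
#include "chain/chain-generic-numerator.h"

int main(int argc, char *argv[]) {
  // Hypothetical demo binary, not part of this commit.
  kaldi::ParseOptions po("Illustrative options demo.\nUsage: demo [options]");
  kaldi::chain::GenericNumeratorComputationOptions numerator_opts;
  numerator_opts.Register(&po);  // exposes --numerator-graph-threads
  po.Read(argc, argv);
  KALDI_LOG << "numerator threads: " << numerator_opts.num_threads;
  return 0;
}
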
src/chain/chain-training.cc

Lines changed: 4 additions & 1 deletion

@@ -94,6 +94,7 @@ void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts,
                                  BaseFloat *weight,
                                  CuMatrixBase<BaseFloat> *nnet_output_deriv,
                                  CuMatrix<BaseFloat> *xent_output_deriv) {
+  NVTX_RANGE(__func__);
   BaseFloat num_logprob_weighted, den_logprob_weighted;
   bool denominator_ok = true;
   bool numerator_ok = true;
@@ -136,7 +137,8 @@ void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts,
 
 
   {
-    GenericNumeratorComputation numerator(supervision, nnet_output);
+    GenericNumeratorComputation numerator(opts.numerator_opts,
+                                          supervision, nnet_output);
     // note: supervision.weight is included as a factor in the derivative from
     // the numerator object, as well as the returned logprob.
     if (xent_output_deriv) {
@@ -211,6 +213,7 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts,
                               BaseFloat *weight,
                               CuMatrixBase<BaseFloat> *nnet_output_deriv,
                               CuMatrix<BaseFloat> *xent_output_deriv) {
+  NVTX_RANGE(__func__);
   if (!supervision.e2e_fsts.empty()) {
     ComputeChainObjfAndDerivE2e(opts, den_graph, supervision,
                                 nnet_output, objf, l2_term,

src/chain/chain-training.h

Lines changed: 6 additions & 0 deletions

@@ -34,6 +34,7 @@
 #include "hmm/transition-model.h"
 #include "chain/chain-den-graph.h"
 #include "chain/chain-supervision.h"
+#include "chain/chain-generic-numerator.h"
 
 namespace kaldi {
 namespace chain {
@@ -93,7 +94,12 @@ struct ChainTrainingOptions {
                    "nonzero, the network is expected to have an output "
                    "named 'output-xent', which should have a softmax as "
                    "its final nonlinearity.");
+
+    numerator_opts.Register(opts);
   }
+
+  // Config for numerator graph object
+  GenericNumeratorComputationOptions numerator_opts;
 };
 

src/chainbin/nnet3-chain-train.cc

Lines changed: 3 additions & 1 deletion

@@ -22,7 +22,6 @@
 #include "nnet3/nnet-chain-training.h"
 #include "cudamatrix/cu-allocator.h"
 
-
 int main(int argc, char *argv[]) {
   try {
     using namespace kaldi;
@@ -53,6 +52,9 @@ int main(int argc, char *argv[]) {
         "yes|no|optional|wait, only has effect if compiled with CUDA");
 
     opts.Register(&po);
+#if HAVE_CUDA==1
+    CuDevice::RegisterDeviceOptions(&po);
+#endif
     RegisterCuAllocatorOptions(&po);
 
     po.Read(argc, argv);

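Because ChainTrainingOptions::Register() now forwards to numerator_opts.Register() (see chain-training.h above), the new flag surfaces on the chain training binaries without any per-binary plumbing. An illustrative invocation; the paths, filenames, and values here are hypothetical:

nnet3-chain-train --use-gpu=yes --numerator-graph-threads=8 \
  exp/chain/0.raw exp/chain/den.fst ark:egs.ark exp/chain/1.raw
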
src/cudamatrix/cu-common.cc

Lines changed: 21 additions & 1 deletion

@@ -31,10 +31,30 @@
 #include "cudamatrix/cu-common.h"
 #include "cudamatrix/cu-matrixdim.h"
 
-
 namespace kaldi {
 
 #if HAVE_CUDA == 1
+
+#ifdef USE_NVTX
+NvtxTracer::NvtxTracer(const char* name) {
+  const uint32_t colors[] = { 0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff };
+  const int num_colors = sizeof(colors)/sizeof(uint32_t);
+  int color_id = ((int)name[0])%num_colors;
+  nvtxEventAttributes_t eventAttrib = {0};
+  eventAttrib.version = NVTX_VERSION;
+  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+  eventAttrib.colorType = NVTX_COLOR_ARGB;
+  eventAttrib.color = colors[color_id];
+  eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
+  eventAttrib.message.ascii = name;
+  nvtxRangePushEx(&eventAttrib);
+  // nvtxRangePushA(name);
+}
+NvtxTracer::~NvtxTracer() {
+  nvtxRangePop();
+}
+#endif
+
 cublasOperation_t KaldiTransToCuTrans(MatrixTransposeType kaldi_trans) {
   cublasOperation_t cublas_trans;
 

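The header-side declarations that pair with this definition are not part of the hunks shown here. A plausible shape for that glue, sketched under the assumption that cu-common.h guards it on USE_NVTX the same way; this is not a verbatim copy of Kaldi's header:

// Assumed companion declarations for cu-common.h (illustrative): an RAII
// class that pushes an NVTX range in its constructor and pops it in its
// destructor, plus a macro that compiles to nothing when NVTX is disabled.
#ifdef USE_NVTX
#include <nvToolsExt.h>
namespace kaldi {
class NvtxTracer {
 public:
  explicit NvtxTracer(const char *name);  // calls nvtxRangePushEx(...)
  ~NvtxTracer();                          // calls nvtxRangePop()
};
}  // namespace kaldi
#define NVTX_RANGE(name) ::kaldi::NvtxTracer _nvtx_range_tracer(name)
#else
#define NVTX_RANGE(name)
#endif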