[src] Add binary for fbank feature extraction on GPU (#3921)

LeviBarnes · web-flow · commit 134b1e91858d · 2020-02-11T12:12:48.000+08:00
The new binary is cudafeatbin/compute-fbank-online-batched-cuda.

Testing revealed a bug. If the output feature array is allocated
with kStrideEqualNumCols, then output features will have zero-
padding where they shouldn't. To fix this, we write Mel energies
directly into the output feature array for fbank features.
(If use_log_fbank is specified, the log is applied inside the mel banks
computation kernel.)

To run:
cd cudafeatbin
./compute-fbank-online-batched-cuda --config=<path to your fbank.conf> \
 --batch-size=50 scp:<path to your wav.scp> \
 ark,scp:feats-batch.ark,feats-batch.scp

To compare with CPU features:
pushd ../featbin/;make;popd
../featbin/compute-fbank-feats --config=<path to your fbank.conf> \
 scp:<path to your wav.scp> ark,scp:feats-cpu.ark,feats-cpu.scp
../featbin/compare-feats ark:feats-batch.ark ark:feats-cpu.ark

You should see a line that says
`Features are considered similar since 0.999998 >= 0.99`
diff --git a/src/cudafeat/feature-online-batched-spectral-cuda.cc b/src/cudafeat/feature-online-batched-spectral-cuda.cc
@@ -176,17 +176,16 @@ void CudaOnlineBatchedSpectralFeatures::ComputeFinalFeaturesBatched(
                       cumfcc_opts_.use_power);
   CU_SAFE_CALL(cudaGetLastError());
 
-  // mel banks
   int num_bins = bin_size_;
-  cuda_mel_banks_compute(lanes, num_lanes, max_chunk_frames_, num_bins,
+  // mel banks plus optional dct transform
+  if (cumfcc_opts_.use_dct) {
+    // MFCC uses dct
+    cuda_mel_banks_compute(lanes, num_lanes, max_chunk_frames_, num_bins,
                          std::numeric_limits<float>::epsilon(), offsets_,
                          sizes_, vecs_, power_spectrum_.Data(),
                          power_spectrum_.Stride(), cu_mel_energies_.Data(),
                          cu_mel_energies_.Stride(), cumfcc_opts_.use_log_fbank);
-  CU_SAFE_CALL(cudaGetLastError());
-
-  // dct transform
-  if (cumfcc_opts_.use_dct) {
+    CU_SAFE_CALL(cudaGetLastError());
     if (cu_features->NumRows() > cu_mel_energies_.NumRows()) {
       CuSubMatrix<BaseFloat> cu_feats_sub(*cu_features, 0,
                                           cu_mel_energies_.NumRows(), 0,
@@ -202,12 +201,15 @@ void CudaOnlineBatchedSpectralFeatures::ComputeFinalFeaturesBatched(
         mfcc_opts.cepstral_lifter, mfcc_opts.use_energy, mfcc_opts.energy_floor,
         cu_signal_log_energy->Data(), cu_signal_log_energy->Stride(),
         cu_lifter_coeffs_.Data(), cu_features->Data(), cu_features->Stride());
-
+    CU_SAFE_CALL(cudaGetLastError());
   } else {
-    cudaMemcpyAsync(cu_features->Data(), cu_mel_energies_.Data(),
-                    sizeof(BaseFloat) * max_chunk_frames_ * num_lanes *
-                        cu_features->Stride(),
-                    cudaMemcpyDeviceToDevice, cudaStreamPerThread);
+    // fbank puts the result of mel_banks_compute directly into cu_features 
+    cuda_mel_banks_compute(lanes, num_lanes, max_chunk_frames_, num_bins,
+                         std::numeric_limits<float>::epsilon(), offsets_,
+                         sizes_, vecs_, power_spectrum_.Data(),
+                         power_spectrum_.Stride(), cu_features->Data(),
+                         cu_features->Stride(), cumfcc_opts_.use_log_fbank);
+    CU_SAFE_CALL(cudaGetLastError());
   }
   CU_SAFE_CALL(cudaGetLastError());
 }
diff --git a/src/cudafeatbin/Makefile b/src/cudafeatbin/Makefile
@@ -15,6 +15,7 @@ ifeq ($(CUDA), true)
 							compute-fbank-feats-cuda \
 							apply-batched-cmvn-online-cuda \
 							compute-mfcc-online-batched-cuda \
+							compute-fbank-online-batched-cuda \
 							compute-online-feats-batched-cuda 
 endif
 
diff --git a/src/cudafeatbin/compute-fbank-online-batched-cuda.cc b/src/cudafeatbin/compute-fbank-online-batched-cuda.cc