|
15 | 15 | // See the License for the specific language governing permissions and |
16 | 16 | // limitations under the License. |
17 | 17 |
|
18 | | -#ifndef KALDI_CUDADECODER_BATCHED_THREADED_CUDA_ONLINE_PIPELINE_H_ |
19 | | -#define KALDI_CUDADECODER_BATCHED_THREADED_CUDA_ONLINE_PIPELINE_H_ |
| 18 | +#ifndef KALDI_CUDADECODER_BATCHED_THREADED_NNET3_CUDA_ONLINE_PIPELINE_H_ |
| 19 | +#define KALDI_CUDADECODER_BATCHED_THREADED_NNET3_CUDA_ONLINE_PIPELINE_H_ |
20 | 20 |
|
21 | 21 | #if HAVE_CUDA |
22 | 22 |
|
@@ -66,24 +66,22 @@ struct BatchedThreadedNnet3CudaOnlinePipelineConfig { |
66 | 66 | use_gpu_feature_extraction(true) {} |
67 | 67 | void Register(OptionsItf *po) { |
68 | 68 | po->Register("max-batch-size", &max_batch_size, |
69 | | - "The maximum execution batch size. " |
70 | | - "Larger = Better throughput slower latency."); |
| 69 | + "The maximum execution batch size." |
| 70 | + " Larger = better throughput, but higher latency." |
71 | 71 | po->Register("num-channels", &num_channels, |
72 | | - "The number of parallel audio channels. This is the maximum " |
73 | | - "number of parallel audio channels supported by the pipeline" |
74 | | - ". This should be larger " |
75 | | - "than max_batch_size."); |
| 72 | + "The number of parallel audio channels. This is the maximum" |
| 73 | + " number of parallel audio channels supported by the pipeline." |
| 74 | + " This should be larger than max_batch_size."); |
76 | 75 | po->Register("cuda-worker-threads", &num_worker_threads, |
77 | | - "(optional) The total number of CPU threads launched to " |
78 | | - "process CPU tasks. -1 = use std::hardware_concurrency()"); |
| 76 | + "The total number of CPU threads launched to process CPU" |
| 77 | + " tasks. -1 = use std::thread::hardware_concurrency()." |
79 | 78 | po->Register("determinize-lattice", &determinize_lattice, |
80 | 79 | "Determinize the lattice before output."); |
81 | 80 | po->Register("cuda-decoder-copy-threads", &num_decoder_copy_threads, |
82 | | - "Advanced - Number of worker threads used in the " |
83 | | - "decoder for " |
84 | | - "the host to host copies."); |
| 81 | + "Advanced - Number of worker threads used in the" |
| 82 | + " decoder for the host-to-host copies." |
85 | 83 | po->Register("gpu-feature-extract", &use_gpu_feature_extraction, |
86 | | - "Use GPU feature extraction"); |
| 84 | + "Use GPU feature extraction."); |
87 | 85 |
|
88 | 86 | feature_opts.Register(po); |
89 | 87 | decoder_opts.Register(po); |
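
The Register() calls above define the pipeline's command-line surface. As a rough sketch of how a binary might expose these flags, assuming a CUDA-enabled Kaldi build and the usual ParseOptions flow (the binary body, include paths, and flag values here are illustrative, not taken from this PR):

// Sketch only: registering the pipeline config on a Kaldi command line.
#include "cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h"
#include "util/parse-options.h"

int main(int argc, char *argv[]) {
  using namespace kaldi;
  using namespace kaldi::cuda_decoder;

  const char *usage = "GPU online decoding pipeline (configuration sketch).";
  ParseOptions po(usage);

  // Exposes --max-batch-size, --num-channels, --cuda-worker-threads,
  // --determinize-lattice, --cuda-decoder-copy-threads,
  // --gpu-feature-extract, plus the nested feature and decoder options.
  BatchedThreadedNnet3CudaOnlinePipelineConfig config;
  config.Register(&po);

  po.Read(argc, argv);
  // e.g. ./binary --max-batch-size=200 --num-channels=600 \
  //               --cuda-worker-threads=-1 --gpu-feature-extract=true
  return 0;
}

Note that the example keeps num-channels larger than max-batch-size, as the help string above requires.
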
@@ -138,10 +136,9 @@ class BatchedThreadedNnet3CudaOnlinePipeline { |
138 | 136 | word_syms_(NULL) { |
139 | 137 | config_.compute_opts.CheckAndFixConfigs(am_nnet_->GetNnet().Modulus()); |
140 | 138 | config_.CheckAndFixConfigs(); |
141 | | - int num_worker_threads = config_.num_worker_threads; |
142 | | - thread_pool_.reset(new ThreadPoolLight(num_worker_threads)); |
143 | | - |
144 | 139 | Initialize(decode_fst); |
| 140 | + int num_worker_threads = config_.num_worker_threads; |
| 141 | + thread_pool_ = std::make_unique<ThreadPoolLight>(num_worker_threads); |
145 | 142 | } |
146 | 143 |
|
147 | 144 | ~BatchedThreadedNnet3CudaOnlinePipeline(); |
@@ -415,22 +412,35 @@ class BatchedThreadedNnet3CudaOnlinePipeline { |
415 | 412 | // Only used if feature extraction is run on the CPU |
416 | 413 | std::vector<std::unique_ptr<OnlineNnet2FeaturePipeline>> feature_pipelines_; |
417 | 414 |
|
418 | | - // HCLG graph : CudaFst object is a host object, but contains |
419 | | - // data stored in |
420 | | - // GPU memory |
| 415 | + // The ordering of cuda_fst_ w.r.t. thread_pool_ and the decoder is important: |
| 416 | + // the order of destruction is bottom-up, opposite to the order of construction. |
| 417 | + // We want the FST object, which is entirely passive and only frees the device |
| 418 | + // FST representation when destroyed, to outlive both the thread pool and the |
| 419 | + // decoder, which both may perform pending work during destruction. Since no |
| 420 | + // new work may be fed into this object while it is being destroyed, the |
| 421 | + // relative order of the latter two is unimportant, but just in case, the FST |
| 422 | + // must stay around until the other two are positively quiescent. |
| 423 | + |
| 424 | + // HCLG graph. CudaFst is a host object, but owns pointers to the data stored |
| 425 | + // in GPU memory. |
421 | 426 | std::unique_ptr<CudaFst> cuda_fst_; |
422 | | - std::unique_ptr<CudaDecoder> cuda_decoder_; |
423 | 427 |
|
| 428 | + // The thread pool receives data from the device and post-processes it. This |
| 429 | + // class's destructor blocks until the thread pool is drained of work items. |
424 | 430 | std::unique_ptr<ThreadPoolLight> thread_pool_; |
425 | 431 |
|
| 432 | + // The decoder owns thread(s) that reconstruct lattices transferred from the |
| 433 | + // device in a compacted form as arrays with offsets instead of pointers. |
| 434 | + std::unique_ptr<CudaDecoder> cuda_decoder_; |
| 435 | + |
426 | 436 | // Used for debugging |
427 | 437 | const fst::SymbolTable *word_syms_; |
428 | 438 | // Used when printing to stdout for debugging purposes |
429 | 439 | std::mutex stdout_m_; |
430 | 440 | }; |
431 | 441 |
|
432 | | -} // end namespace cuda_decoder |
433 | | -} // end namespace kaldi. |
| 442 | +} // namespace cuda_decoder |
| 443 | +} // namespace kaldi |
434 | 444 |
|
435 | 445 | #endif // HAVE_CUDA |
436 | | -#endif // KALDI_CUDADECODER_BATCHED_THREADED_CUDA_ONLINE_PIPELINE_H_ |
| 446 | +#endif // KALDI_CUDADECODER_BATCHED_THREADED_NNET3_CUDA_ONLINE_PIPELINE_H_ |
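
The reordering of cuda_fst_, thread_pool_, and cuda_decoder_ above relies on the C++ guarantee that non-static data members are destroyed in reverse declaration order. A minimal, self-contained sketch of that rule (the types and names below are illustrative stand-ins, not the real Kaldi classes):

#include <iostream>
#include <memory>

// Hypothetical stand-in that logs its lifetime.
struct Noisy {
  const char *name;
  explicit Noisy(const char *n) : name(n) { std::cout << "construct " << name << "\n"; }
  ~Noisy() { std::cout << "destroy " << name << "\n"; }
};

struct PipelineLike {
  // Same declaration order as in the header: FST first, then the thread
  // pool, then the decoder.
  std::unique_ptr<Noisy> fst = std::make_unique<Noisy>("fst");
  std::unique_ptr<Noisy> thread_pool = std::make_unique<Noisy>("thread_pool");
  std::unique_ptr<Noisy> decoder = std::make_unique<Noisy>("decoder");
};

int main() {
  PipelineLike p;
  // On scope exit this prints: destroy decoder, destroy thread_pool,
  // destroy fst. Members are destroyed in reverse declaration order, so
  // the passive FST outlives the two members that may still finish
  // pending work in their destructors.
}
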