1 change: 0 additions & 1 deletion cpp/include/tensorrt_llm/batch_manager/llmRequest.h

```diff
@@ -1909,7 +1909,6 @@ class GenericLlmRequest
     std::shared_ptr<std::vector<bool>> mSequenceFinalVec;

     std::optional<TensorPtr> mSkipCrossAttnBlocks{std::nullopt};
-    SizeType32 mNumVocabs;

     // Performance metrics.
     bool mReturnPerfMetrics{false};
```
9 changes: 0 additions & 9 deletions cpp/include/tensorrt_llm/executor/executor.h

```diff
@@ -711,12 +711,7 @@ class Request
     [[nodiscard]] std::optional<SizeType32> getLanguageAdapterUid() const;
     [[nodiscard]] std::optional<MillisecondsType> getAllottedTimeMs() const;
     [[nodiscard]] std::optional<std::vector<std::string>> getAdditionalOutputNames() const;
-<<<<<<< HEAD

-=======
-    [[nodiscard]] std::optional<SizeType32> getLanguageAdapterUid() const;
-    [[nodiscard]] SizeType32 getNumVocabs() const;
->>>>>>> Fixes to compilation
     void setStreaming(bool streaming);
     void setSamplingConfig(SamplingConfig const& config);
     void setOutputConfig(OutputConfig const& outputConfig);
@@ -748,12 +743,8 @@ class Request
     void setSkipCrossAttnBlocks(Tensor skipCrossAttnBlocks);
     void setGuidedDecodingParams(GuidedDecodingParams const& guidedDecodingParams);
     void setLanguageAdapterUid(SizeType32 languageAdapterUid);
-<<<<<<< HEAD
     void setAllottedTimeMs(MillisecondsType allottedTimeMs);

-=======
-    void setNumVocabs(SizeType32 numVocabs);
->>>>>>> Fixes to compilation
 private:
     friend class Serialization;
     class Impl;
```

```diff
@@ -2055,8 +2055,6 @@ runtime::CudaEvent TrtGptModelInflightBatching::updateDecoderBuffers(

     if (returnLogProbs)
     {
-        mDecoderBuffers[vocabId]->cumLogProbs = mDecoders[vocabId]->getDecoderState().getCumLogProbs();
-        mDecoderBuffers[vocabId]->logProbs = mDecoders[vocabId]->getDecoderState().getLogProbs();
         mCopyBufferManager.copy(
             *mDecoders[vocabId]->getDecoderState().getCumLogProbs(),
             *mDecoderBuffers[vocabId]->cumLogProbsHost
```
88 changes: 88 additions & 0 deletions examples/models/core/t5tts/README.md

# Build TRTLLM

This document describes how to run T5TTS in TRTLLM.
Build the Docker image and compile TRTLLM as usual:

```bash
make -C docker build IMAGE_NAME=t5tts
make -C docker run LOCAL_USER=1 IMAGE_NAME=t5tts CONTAINER_NAME=t5tts
# "90-real" targets H100 (compute capability 9.0)
python3 ./scripts/build_wheel.py --cuda_architectures "90-real" --benchmarks --trt_root /usr/local/tensorrt
pip install build/tensorrt_llm-0.20.0rc0-cp312-cp312-linux_x86_64.whl  # wheel name varies with the TRTLLM version and Python version
```
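
To confirm the wheel installed cleanly before building engines, a quick import check helps (purely a sanity check; the printed version will match whatever wheel you built):

```bash
# the freshly installed wheel should import and report its version
python3 -c "import tensorrt_llm; print(tensorrt_llm.__version__)"
```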

# Build Engine

Convert the checkpoint and build the engine:
```bash
# requires omegaconf: pip install omegaconf
# md5sum newmodels/t5tts.ckpt: fb177acdc447af56c8bbfa9d17c75f45
python examples/models/core/t5tts/convert_checkpoint.py \
--model_path newmodels/t5tts.ckpt --output_dir newmodels/t5tts_convert

trtllm-build --checkpoint_dir newmodels/t5tts_convert/encoder/ \
--output_dir newmodels/t5tts_engine/encoder \
--paged_kv_cache enable --moe_plugin disable --max_beam_width 1 \
--max_batch_size 256 --max_input_len 128 --gemm_plugin float16 \
--bert_attention_plugin float16 --gpt_attention_plugin float16 \
--remove_input_padding enable --use_paged_context_fmha enable

trtllm-build --checkpoint_dir newmodels/t5tts_convert/decoder \
--output_dir newmodels/t5tts_engine/decoder \
--moe_plugin disable \
--max_beam_width 1 \
--max_batch_size 64 \
--max_input_len 192 \
--max_seq_len 512 \
--max_encoder_input_len 512 \
--gemm_plugin float16 \
--bert_attention_plugin float16 \
--gpt_attention_plugin float16 \
--remove_input_padding enable \
--use_paged_context_fmha enable
```
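
Assuming the usual `trtllm-build` output layout (a serialized engine plus a `config.json` in each output directory), you can sanity-check both builds before moving on:

```bash
# list the build artifacts and skim the decoder's build-time configuration
ls newmodels/t5tts_engine/encoder newmodels/t5tts_engine/decoder
python3 -m json.tool newmodels/t5tts_engine/decoder/config.json | head -n 20
```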

# Toy inference

Finally, run the model on a dummy input:
```bash
python examples/models/core/t5tts/run.py
```

# Benchmark

`gptManagerBenchmark` is modified to run the benchmark with a context for the decoder.

```bash
# prepare dummy inputs for inference
# 128 - number of phonemes in an average sentence
# 160 - context length in frames, corresponds to 160 / 21.5 = 7.44 seconds
# 640 - total sequence length in frames, means 640 - 160 = 480 frames of audio generated,
# which corresponds to 480 / 21.5 = 22.33 seconds
# 768 - batch_size * 3, measure performance on 3 batches at max utilization
python examples/models/core/enc_dec/prepare_benchmark.py --output benchmark.json \
--samples 768 \
--max_input_id 98 \
--num_vocabs 8 \
--input_len 128 0 128 128 \
--context_len 160 0 160 160 \
--output_len 640 0 640 640

# run benchmark using generated dummy inputs
./cpp/build/benchmarks/gptManagerBenchmark \
--dataset benchmark.json \
--output_csv res.csv \
--max_batch_size 256 \
--concurrency 256 \
--streaming \
--num_vocabs 8 \
--enable_chunked_context \
--encoder_engine_dir newmodels/t5tts_engine/encoder \
    --decoder_engine_dir newmodels/t5tts_engine/decoder > /dev/null 2>&1

# print results from res.csv
python3 -c "import csv; f=open('res.csv'); r=csv.reader(f); h=next(r); v=next(r); [print(f'{h[i]:<50}: {v[i]}') for i in range(len(h))]"
```
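
The one-liner above simply pairs the CSV header row with the single row of metric values. An expanded, equivalent sketch, written as a heredoc so it can be pasted into the same shell:

```bash
python3 - <<'EOF'
import csv

# res.csv holds one header row and one row of metric values;
# print them as aligned "name: value" pairs, as the one-liner does
with open('res.csv') as f:
    rows = csv.reader(f)
    header = next(rows)
    values = next(rows)
for name, value in zip(header, values):
    print(f'{name:<50}: {value}')
EOF
```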

