1 change: 0 additions & 1 deletion cpp/include/tensorrt_llm/batch_manager/llmRequest.h

```diff
@@ -1909,7 +1909,6 @@ class GenericLlmRequest
     std::shared_ptr<std::vector<bool>> mSequenceFinalVec;

     std::optional<TensorPtr> mSkipCrossAttnBlocks{std::nullopt};
-    SizeType32 mNumVocabs;

     // Performance metrics.
     bool mReturnPerfMetrics{false};
```
9 changes: 0 additions & 9 deletions cpp/include/tensorrt_llm/executor/executor.h

```diff
@@ -711,12 +711,7 @@ class Request
     [[nodiscard]] std::optional<SizeType32> getLanguageAdapterUid() const;
     [[nodiscard]] std::optional<MillisecondsType> getAllottedTimeMs() const;
     [[nodiscard]] std::optional<std::vector<std::string>> getAdditionalOutputNames() const;
-<<<<<<< HEAD

-=======
-    [[nodiscard]] std::optional<SizeType32> getLanguageAdapterUid() const;
-    [[nodiscard]] SizeType32 getNumVocabs() const;
->>>>>>> Fixes to compilation
     void setStreaming(bool streaming);
     void setSamplingConfig(SamplingConfig const& config);
     void setOutputConfig(OutputConfig const& outputConfig);
@@ -748,12 +743,8 @@ class Request
     void setSkipCrossAttnBlocks(Tensor skipCrossAttnBlocks);
     void setGuidedDecodingParams(GuidedDecodingParams const& guidedDecodingParams);
     void setLanguageAdapterUid(SizeType32 languageAdapterUid);
-<<<<<<< HEAD
     void setAllottedTimeMs(MillisecondsType allottedTimeMs);

-=======
-    void setNumVocabs(SizeType32 numVocabs);
->>>>>>> Fixes to compilation
 private:
     friend class Serialization;
     class Impl;
```

```diff
@@ -2055,8 +2055,6 @@ runtime::CudaEvent TrtGptModelInflightBatching::updateDecoderBuffers(

     if (returnLogProbs)
     {
-        mDecoderBuffers[vocabId]->cumLogProbs = mDecoders[vocabId]->getDecoderState().getCumLogProbs();
-        mDecoderBuffers[vocabId]->logProbs = mDecoders[vocabId]->getDecoderState().getLogProbs();
         mCopyBufferManager.copy(
             *mDecoders[vocabId]->getDecoderState().getCumLogProbs(),
             *mDecoderBuffers[vocabId]->cumLogProbsHost
```
88 changes: 88 additions & 0 deletions examples/models/core/t5tts/README.md

# Build TRTLLM

This document describes how to run T5TTS in TRTLLM.
Build the Docker image and compile TRTLLM as usual:

```bash
make -C docker build IMAGE_NAME=t5tts
make -C docker run LOCAL_USER=1 IMAGE_NAME=t5tts CONTAINER_NAME=t5tts
# "90-real" targets H100 (compute capability 9.0)
python3 ./scripts/build_wheel.py --cuda_architectures "90-real" --benchmarks --trt_root /usr/local/tensorrt
pip install build/tensorrt_llm-0.20.0rc0-cp312-cp312-linux_x86_64.whl  # wheel name varies with the TRTLLM version and Python version
```
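
To confirm the wheel installed cleanly before building engines, a quick import check helps (purely a sanity check; the printed version will match whatever wheel you built):

```bash
# the freshly installed wheel should import and report its version
python3 -c "import tensorrt_llm; print(tensorrt_llm.__version__)"
```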

# Build Engine

Convert the checkpoint and build the engine:
```bash
# requires omegaconf: pip install omegaconf
# md5sum newmodels/t5tts.ckpt: fb177acdc447af56c8bbfa9d17c75f45
python examples/models/core/t5tts/convert_checkpoint.py \
--model_path newmodels/t5tts.ckpt --output_dir newmodels/t5tts_convert

trtllm-build --checkpoint_dir newmodels/t5tts_convert/encoder/ \
--output_dir newmodels/t5tts_engine/encoder \
--paged_kv_cache enable --moe_plugin disable --max_beam_width 1 \
--max_batch_size 256 --max_input_len 128 --gemm_plugin float16 \
--bert_attention_plugin float16 --gpt_attention_plugin float16 \
--remove_input_padding enable --use_paged_context_fmha enable

trtllm-build --checkpoint_dir newmodels/t5tts_convert/decoder \
--output_dir newmodels/t5tts_engine/decoder \
--moe_plugin disable \
--max_beam_width 1 \
--max_batch_size 64 \
--max_input_len 192 \
--max_seq_len 512 \
--max_encoder_input_len 512 \
--gemm_plugin float16 \
--bert_attention_plugin float16 \
--gpt_attention_plugin float16 \
--remove_input_padding enable \
--use_paged_context_fmha enable
```
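
Assuming the usual `trtllm-build` output layout (a serialized engine plus a `config.json` in each output directory), you can sanity-check both builds before moving on:

```bash
# list the build artifacts and skim the decoder's build-time configuration
ls newmodels/t5tts_engine/encoder newmodels/t5tts_engine/decoder
python3 -m json.tool newmodels/t5tts_engine/decoder/config.json | head -n 20
```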

# Toy inference

Finally, run the model on a dummy input:
```bash
python examples/models/core/t5tts/run.py
```

# Benchmark

`gptManagerBenchmark` is modified to run the benchmark with a context for the decoder.

```bash
# prepare dummy inputs for inference
# 128 - number of phonemes in an average sentence
# 160 - context length in frames, corresponds to 160 / 21.5 = 7.44 seconds
# 640 - total sequence length in frames, means 640 - 160 = 480 frames of audio generated,
# which corresponds to 480 / 21.5 = 22.33 seconds
# 768 - batch_size * 3, measure performance on 3 batches at max utilization
python examples/models/core/enc_dec/prepare_benchmark.py --output benchmark.json \
--samples 768 \
--max_input_id 98 \
--num_vocabs 8 \
--input_len 128 0 128 128 \
--context_len 160 0 160 160 \
--output_len 640 0 640 640

# run benchmark using generated dummy inputs
./cpp/build/benchmarks/gptManagerBenchmark \
--dataset benchmark.json \
--output_csv res.csv \
--max_batch_size 256 \
--concurrency 256 \
--streaming \
--num_vocabs 8 \
--enable_chunked_context \
--encoder_engine_dir newmodels/t5tts_engine/encoder \
    --decoder_engine_dir newmodels/t5tts_engine/decoder > /dev/null 2>&1

# print results from res.csv
python3 -c "import csv; f=open('res.csv'); r=csv.reader(f); h=next(r); v=next(r); [print(f'{h[i]:<50}: {v[i]}') for i in range(len(h))]"
```
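
The one-liner above simply pairs the CSV header row with the single row of metric values. An expanded, equivalent sketch, written as a heredoc so it can be pasted into the same shell:

```bash
python3 - <<'EOF'
import csv

# res.csv holds one header row and one row of metric values;
# print them as aligned "name: value" pairs, as the one-liner does
with open('res.csv') as f:
    rows = csv.reader(f)
    header = next(rows)
    values = next(rows)
for name, value in zip(header, values):
    print(f'{name:<50}: {value}')
EOF
```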

