Skip to content

Commit 201135e

Browse files
authored
TensorRT-LLM v0.13 Update (#2269)
1 parent 28fb9aa commit 201135e

File tree

592 files changed

+773759
-87300
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

592 files changed

+773759
-87300
lines changed

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,6 @@
1111
[submodule "3rdparty/NVTX"]
1212
path = 3rdparty/NVTX
1313
url = https://github.com/NVIDIA/NVTX.git
14+
[submodule "3rdparty/ucxx"]
15+
path = 3rdparty/ucxx
16+
url = https://github.com/GuanLuo/ucxx.git

3rdparty/ucxx

Submodule ucxx added at b991817

README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ TensorRT-LLM
77
[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
88
[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
99
[![cuda](https://img.shields.io/badge/cuda-12.5.1-green)](https://developer.nvidia.com/cuda-downloads)
10-
[![trt](https://img.shields.io/badge/TRT-10.3.0-green)](https://developer.nvidia.com/tensorrt)
11-
[![version](https://img.shields.io/badge/release-0.12.0-green)](./tensorrt_llm/version.py)
10+
[![trt](https://img.shields.io/badge/TRT-10.4.0-green)](https://developer.nvidia.com/tensorrt)
11+
[![version](https://img.shields.io/badge/release-0.13.0-green)](./tensorrt_llm/version.py)
1212
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)
1313

1414
[Architecture](./docs/source/architecture/overview.md)   |   [Results](./docs/source/performance/perf-overview.md)   |   [Examples](./examples/)   |   [Documentation](./docs/source/)
@@ -17,11 +17,11 @@ TensorRT-LLM
1717
<div align="left">
1818

1919
## Latest News
20+
* [2024/08/20] 🏎️SDXL with #TensorRT Model Optimizer ⏱️⚡ 🏁 cache diffusion 🏁 quantization aware training 🏁 QLoRA 🏁 #Python 3.12
21+
[➡️ link](https://developer.nvidia.com/blog/nvidia-tensorrt-model-optimizer-v0-15-boosts-inference-performance-and-expands-model-support/)
22+
2023
* [2024/08/13] 🐍 DIY Code Completion with #Mamba ⚡ #TensorRT #LLM for speed 🤖 NIM for ease ☁️ deploy anywhere
2124
[➡️ link](https://developer.nvidia.com/blog/revolutionizing-code-completion-with-codestral-mamba-the-next-gen-coding-llm/)
22-
<div align="center">
23-
<img src="docs/source/media/picture-08-13-2024.png" width="50%">
24-
<div align="left">
2525

2626
* [2024/08/06] 🗫 Multilingual Challenge Accepted 🗫
2727
🤖 #TensorRT #LLM boosts low-resource languages like Hebrew, Indonesian and Vietnamese ⚡[➡️ link](https://developer.nvidia.com/blog/accelerating-hebrew-llm-performance-with-nvidia-tensorrt-llm/?linkId=100000278659647)

benchmarks/cpp/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -267,10 +267,10 @@ for nloras in ${NUM_LORAS[@]}; do
267267
--input-mean 256 --input-stdev 16 --output-mean 128 --output-stdev 24
268268
done
269269

270-
# Generate random lora weights for 256 adapters
270+
# Generate random lora weights for 16 adapters
271271
python benchmarks/cpp/utils/generate_rand_loras.py ${CPP_LORA} ${EG_DIR}/loras 16
272272

273-
# perform benchmarking
273+
# Perform benchmarking
274274

275275
# First run inference without LoRAs
276276
mkdir -p ${EG_DIR}/log-base-lora

benchmarks/cpp/gptManagerBenchmark.cpp

Lines changed: 106 additions & 36 deletions
Large diffs are not rendered by default.

benchmarks/cpp/gptSessionBenchmark.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -427,7 +427,8 @@ int main(int argc, char* argv[])
427427

428428
options.add_options()("ctx_micro_batch_size", "Batch size for context phase.", cxxopts::value<int>());
429429
options.add_options()("gen_micro_batch_size", "Batch size for generation phase.", cxxopts::value<int>());
430-
options.add_options()("max_attention_window", "Max kv cache length per sequence.", cxxopts::value<int>());
430+
options.add_options()(
431+
"max_attention_window", "Max kv cache length per sequence.", cxxopts::value<std::vector<int>>());
431432
options.add_options()("max_tokens_in_paged_kvcache", "Max tokens in paged K-V Cache.", cxxopts::value<int>());
432433
options.add_options()("sink_token_len", "Sink token length in kv cache per sequence.", cxxopts::value<int>());
433434
options.add_options()(
@@ -535,7 +536,7 @@ int main(int argc, char* argv[])
535536
// Argument: Max KV Cache Length
536537
if (result.count("max_attention_window"))
537538
{
538-
sessionConfig.kvCacheConfig.maxAttentionWindow = result["max_attention_window"].as<int>();
539+
sessionConfig.kvCacheConfig.maxAttentionWindowVec = result["max_attention_window"].as<std::vector<int>>();
539540
}
540541
// Argument: Sink token length
541542
if (result.count("sink_token_len"))

benchmarks/python/all_reduce.py

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323

2424
import tensorrt_llm as tllm
2525
from tensorrt_llm import Mapping, Tensor
26-
from tensorrt_llm._ipc_utils import peer_access
2726
from tensorrt_llm._utils import OMPI_COMM_TYPE_HOST, mpi_comm
2827
from tensorrt_llm.functional import AllReduceStrategy, allreduce
2928
from tensorrt_llm.plugin.plugin import current_all_reduce_helper
@@ -106,18 +105,18 @@ def allreduce_benchmark(dtype: str,
106105
_, start = cuda.cuEventCreate(0)
107106
_, stop = cuda.cuEventCreate(0)
108107
runtimes = []
109-
with peer_access(mapping):
110-
tllm.mpi_barrier()
111-
112-
for _ in range(10):
113-
cuda.cuEventRecord(start, stream.cuda_stream)
114-
session.run(inputs=feed_dict,
115-
outputs={"output": output},
116-
stream=stream.cuda_stream)
117-
cuda.cuEventRecord(stop, stream.cuda_stream)
118-
torch.cuda.synchronize()
119-
_, ms = cuda.cuEventElapsedTime(start, stop)
120-
runtimes.append(ms)
108+
109+
tllm.mpi_barrier()
110+
111+
for _ in range(10):
112+
cuda.cuEventRecord(start, stream.cuda_stream)
113+
session.run(inputs=feed_dict,
114+
outputs={"output": output},
115+
stream=stream.cuda_stream)
116+
cuda.cuEventRecord(stop, stream.cuda_stream)
117+
torch.cuda.synchronize()
118+
_, ms = cuda.cuEventElapsedTime(start, stop)
119+
runtimes.append(ms)
121120

122121
median_ms = sorted(runtimes)[len(runtimes) // 2]
123122
assert torch.allclose(output, (input * world_size)**inner_loop)

benchmarks/python/check_accuracy_mlperf.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from transformers import AutoTokenizer, LlamaTokenizerFast
1010

1111
nltk.download("punkt", quiet=False)
12+
nltk.download('punkt_tab')
1213
import argparse
1314

1415

@@ -25,10 +26,9 @@ class Model(Enum):
2526
"tokens_per_sample": 294.45 * 0.9
2627
},
2728
Model.GPT_J: {
28-
"rouge1": 42.9435135,
29-
"rouge2": 20.1033765,
30-
"rougeL": 29.9581119,
31-
# "tokens_per_sample": ??
29+
"rouge1": 42.9865 * 0.99,
30+
"rouge2": 20.1235 * 0.99,
31+
"rougeL": 29.9881 * 0.99,
3232
}
3333
}
3434

@@ -138,7 +138,6 @@ def main():
138138
target_texts = get_reference_df(args.dataset)
139139
model = Model.Llama_v2_70B
140140
tokenizer = LlamaTokenizerFast.from_pretrained(args.base_model)
141-
relaxing_factor = 1.0
142141
elif args.dataset.lower().endswith(".json"):
143142
target_texts = get_reference_json(args.dataset)
144143
model = Model.GPT_J
@@ -147,7 +146,6 @@ def main():
147146
padding_side="left",
148147
use_fast=False)
149148
tokenizer.pad_token = tokenizer.eos_token
150-
relaxing_factor = 0.93
151149
else:
152150
raise RuntimeError(
153151
"Dataset expected to be pkl (open-orca) or json (cnn-dailymail)")
@@ -169,7 +167,7 @@ def main():
169167
print("Targets: ", targets)
170168

171169
for k, _ in targets.items():
172-
assert targets[k] * relaxing_factor <= achieved_scores[k]
170+
assert targets[k] <= achieved_scores[k]
173171

174172

175173
if __name__ == "__main__":

benchmarks/python/enc_dec_benchmark.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from tensorrt_llm.quantization import QuantMode
2626
from tensorrt_llm.runtime.session import TensorInfo
2727
from tensorrt_llm.runtime import ModelConfig
28+
from tensorrt_llm.models.modeling_utils import get_kv_cache_type_from_legacy
2829

2930

3031
class EncDecBenchmark(BaseBenchmark):
@@ -100,6 +101,9 @@ def read_config(component):
100101
dtype = pretrained_config["dtype"]
101102

102103
paged_kv_cache = plugin_config['paged_kv_cache']
104+
kv_cache_type = get_kv_cache_type_from_legacy(
105+
True, paged_kv_cache)
106+
103107
tokens_per_block = plugin_config['tokens_per_block']
104108

105109
gather_context_logits = builder_config.get(
@@ -120,7 +124,7 @@ def read_config(component):
120124
num_layers=num_layers,
121125
gpt_attention_plugin=use_gpt_attention_plugin,
122126
remove_input_padding=remove_input_padding,
123-
paged_kv_cache=paged_kv_cache,
127+
kv_cache_type=kv_cache_type,
124128
tokens_per_block=tokens_per_block,
125129
cross_attention=cross_attention,
126130
has_position_embedding=has_position_embedding,

benchmarks/python/gpt_benchmark.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import torch
2121

2222
import tensorrt_llm
23+
from tensorrt_llm.bindings import KVCacheType
2324
from tensorrt_llm.builder import Engine
2425
from tensorrt_llm.runtime import (ChatGLMGenerationSession, GenerationSession,
2526
SamplingConfig)
@@ -77,6 +78,13 @@ def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents,
7778
if hasattr(self, item):
7879
rnn_configs_kwargs[item] = getattr(self, item)
7980

81+
kv_cache_type = KVCacheType.CONTINUOUS
82+
if hasattr(self, 'kv_cache_type'):
83+
kv_cache_type = self.kv_cache_type
84+
else:
85+
if hasattr(self, 'paged_kv_cache'):
86+
kv_cache_type = KVCacheType.PAGED if self.paged_kv_cache == True else KVCacheType.CONTINUOUS
87+
8088
model_config = tensorrt_llm.runtime.ModelConfig(
8189
max_batch_size=self.max_batch_size,
8290
max_beam_width=self.num_beams,
@@ -86,8 +94,7 @@ def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents,
8694
num_kv_heads=ceil(self.num_kv_heads / self.world_size),
8795
hidden_size=self.hidden_size // self.world_size,
8896
gpt_attention_plugin=self.use_gpt_attention_plugin,
89-
paged_kv_cache=self.paged_kv_cache if hasattr(
90-
self, 'paged_kv_cache') else False,
97+
kv_cache_type=kv_cache_type,
9198
paged_state=self.paged_state
9299
if hasattr(self, 'paged_state') else False,
93100
dtype=self.dtype,

0 commit comments

Comments (0)