Skip to content

Commit 201135e

Browse files
authored
TensorRT-LLM v0.13 Update (#2269)
1 parent 28fb9aa commit 201135e

File tree

592 files changed

+773759
-87300
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

592 files changed

+773759
-87300
lines changed

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,6 @@
1111
[submodule "3rdparty/NVTX"]
1212
path = 3rdparty/NVTX
1313
url = https://github.com/NVIDIA/NVTX.git
14+
[submodule "3rdparty/ucxx"]
15+
path = 3rdparty/ucxx
16+
url = https://github.com/GuanLuo/ucxx.git

3rdparty/ucxx

Submodule ucxx added at b991817

README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ TensorRT-LLM
77
[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
88
[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
99
[![cuda](https://img.shields.io/badge/cuda-12.5.1-green)](https://developer.nvidia.com/cuda-downloads)
10-
[![trt](https://img.shields.io/badge/TRT-10.3.0-green)](https://developer.nvidia.com/tensorrt)
11-
[![version](https://img.shields.io/badge/release-0.12.0-green)](./tensorrt_llm/version.py)
10+
[![trt](https://img.shields.io/badge/TRT-10.4.0-green)](https://developer.nvidia.com/tensorrt)
11+
[![version](https://img.shields.io/badge/release-0.13.0-green)](./tensorrt_llm/version.py)
1212
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)
1313

1414
[Architecture](./docs/source/architecture/overview.md)   |   [Results](./docs/source/performance/perf-overview.md)   |   [Examples](./examples/)   |   [Documentation](./docs/source/)
@@ -17,11 +17,11 @@ TensorRT-LLM
1717
<div align="left">
1818

1919
## Latest News
20+
* [2024/08/20] 🏎️SDXL with #TensorRT Model Optimizer ⏱️⚡ 🏁 cache diffusion 🏁 quantization aware training 🏁 QLoRA 🏁 #Python 3.12
21+
[➡️ link](https://developer.nvidia.com/blog/nvidia-tensorrt-model-optimizer-v0-15-boosts-inference-performance-and-expands-model-support/)
22+
2023
* [2024/08/13] 🐍 DIY Code Completion with #Mamba ⚡ #TensorRT #LLM for speed 🤖 NIM for ease ☁️ deploy anywhere
2124
[➡️ link](https://developer.nvidia.com/blog/revolutionizing-code-completion-with-codestral-mamba-the-next-gen-coding-llm/)
22-
<div align="center">
23-
<img src="docs/source/media/picture-08-13-2024.png" width="50%">
24-
<div align="left">
2525

2626
* [2024/08/06] 🗫 Multilingual Challenge Accepted 🗫
2727
🤖 #TensorRT #LLM boosts low-resource languages like Hebrew, Indonesian and Vietnamese ⚡[➡️ link](https://developer.nvidia.com/blog/accelerating-hebrew-llm-performance-with-nvidia-tensorrt-llm/?linkId=100000278659647)

benchmarks/cpp/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -267,10 +267,10 @@ for nloras in ${NUM_LORAS[@]}; do
267267
--input-mean 256 --input-stdev 16 --output-mean 128 --output-stdev 24
268268
done
269269

270-
# Generate random lora weights for 256 adapters
270+
# Generate random lora weights for 16 adapters
271271
python benchmarks/cpp/utils/generate_rand_loras.py ${CPP_LORA} ${EG_DIR}/loras 16
272272

273-
# perform benchmarking
273+
# Perform benchmarking
274274

275275
# First run inference without LoRAs
276276
mkdir -p ${EG_DIR}/log-base-lora

benchmarks/cpp/gptManagerBenchmark.cpp

Lines changed: 106 additions & 36 deletions
Large diffs are not rendered by default.

benchmarks/cpp/gptSessionBenchmark.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -427,7 +427,8 @@ int main(int argc, char* argv[])
427427

428428
options.add_options()("ctx_micro_batch_size", "Batch size for context phase.", cxxopts::value<int>());
429429
options.add_options()("gen_micro_batch_size", "Batch size for generation phase.", cxxopts::value<int>());
430-
options.add_options()("max_attention_window", "Max kv cache length per sequence.", cxxopts::value<int>());
430+
options.add_options()(
431+
"max_attention_window", "Max kv cache length per sequence.", cxxopts::value<std::vector<int>>());
431432
options.add_options()("max_tokens_in_paged_kvcache", "Max tokens in paged K-V Cache.", cxxopts::value<int>());
432433
options.add_options()("sink_token_len", "Sink token length in kv cache per sequence.", cxxopts::value<int>());
433434
options.add_options()(
@@ -535,7 +536,7 @@ int main(int argc, char* argv[])
535536
// Argument: Max KV Cache Length
536537
if (result.count("max_attention_window"))
537538
{
538-
sessionConfig.kvCacheConfig.maxAttentionWindow = result["max_attention_window"].as<int>();
539+
sessionConfig.kvCacheConfig.maxAttentionWindowVec = result["max_attention_window"].as<std::vector<int>>();
539540
}
540541
// Argument: Sink token length
541542
if (result.count("sink_token_len"))

benchmarks/python/all_reduce.py

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323

2424
import tensorrt_llm as tllm
2525
from tensorrt_llm import Mapping, Tensor
26-
from tensorrt_llm._ipc_utils import peer_access
2726
from tensorrt_llm._utils import OMPI_COMM_TYPE_HOST, mpi_comm
2827
from tensorrt_llm.functional import AllReduceStrategy, allreduce
2928
from tensorrt_llm.plugin.plugin import current_all_reduce_helper
@@ -106,18 +105,18 @@ def allreduce_benchmark(dtype: str,
106105
_, start = cuda.cuEventCreate(0)
107106
_, stop = cuda.cuEventCreate(0)
108107
runtimes = []
109-
with peer_access(mapping):
110-
tllm.mpi_barrier()
111-
112-
for _ in range(10):
113-
cuda.cuEventRecord(start, stream.cuda_stream)
114-
session.run(inputs=feed_dict,
115-
outputs={"output": output},
116-
stream=stream.cuda_stream)
117-
cuda.cuEventRecord(stop, stream.cuda_stream)
118-
torch.cuda.synchronize()
119-
_, ms = cuda.cuEventElapsedTime(start, stop)
120-
runtimes.append(ms)
108+
109+
tllm.mpi_barrier()
110+
111+
for _ in range(10):
112+
cuda.cuEventRecord(start, stream.cuda_stream)
113+
session.run(inputs=feed_dict,
114+
outputs={"output": output},
115+
stream=stream.cuda_stream)
116+
cuda.cuEventRecord(stop, stream.cuda_stream)
117+
torch.cuda.synchronize()
118+
_, ms = cuda.cuEventElapsedTime(start, stop)
119+
runtimes.append(ms)
121120

122121
median_ms = sorted(runtimes)[len(runtimes) // 2]
123122
assert torch.allclose(output, (input * world_size)**inner_loop)

benchmarks/python/check_accuracy_mlperf.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from transformers import AutoTokenizer, LlamaTokenizerFast
1010

1111
nltk.download("punkt", quiet=False)
12+
nltk.download('punkt_tab')
1213
import argparse
1314

1415

@@ -25,10 +26,9 @@ class Model(Enum):
2526
"tokens_per_sample": 294.45 * 0.9
2627
},
2728
Model.GPT_J: {
28-
"rouge1": 42.9435135,
29-
"rouge2": 20.1033765,
30-
"rougeL": 29.9581119,
31-
# "tokens_per_sample": ??
29+
"rouge1": 42.9865 * 0.99,
30+
"rouge2": 20.1235 * 0.99,
31+
"rougeL": 29.9881 * 0.99,
3232
}
3333
}
3434

@@ -138,7 +138,6 @@ def main():
138138
target_texts = get_reference_df(args.dataset)
139139
model = Model.Llama_v2_70B
140140
tokenizer = LlamaTokenizerFast.from_pretrained(args.base_model)
141-
relaxing_factor = 1.0
142141
elif args.dataset.lower().endswith(".json"):
143142
target_texts = get_reference_json(args.dataset)
144143
model = Model.GPT_J
@@ -147,7 +146,6 @@ def main():
147146
padding_side="left",
148147
use_fast=False)
149148
tokenizer.pad_token = tokenizer.eos_token
150-
relaxing_factor = 0.93
151149
else:
152150
raise RuntimeError(
153151
"Dataset expected to be pkl (open-orca) or json (cnn-dailymail)")
@@ -169,7 +167,7 @@ def main():
169167
print("Targets: ", targets)
170168

171169
for k, _ in targets.items():
172-
assert targets[k] * relaxing_factor <= achieved_scores[k]
170+
assert targets[k] <= achieved_scores[k]
173171

174172

175173
if __name__ == "__main__":

benchmarks/python/enc_dec_benchmark.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from tensorrt_llm.quantization import QuantMode
2626
from tensorrt_llm.runtime.session import TensorInfo
2727
from tensorrt_llm.runtime import ModelConfig
28+
from tensorrt_llm.models.modeling_utils import get_kv_cache_type_from_legacy
2829

2930

3031
class EncDecBenchmark(BaseBenchmark):
@@ -100,6 +101,9 @@ def read_config(component):
100101
dtype = pretrained_config["dtype"]
101102

102103
paged_kv_cache = plugin_config['paged_kv_cache']
104+
kv_cache_type = get_kv_cache_type_from_legacy(
105+
True, paged_kv_cache)
106+
103107
tokens_per_block = plugin_config['tokens_per_block']
104108

105109
gather_context_logits = builder_config.get(
@@ -120,7 +124,7 @@ def read_config(component):
120124
num_layers=num_layers,
121125
gpt_attention_plugin=use_gpt_attention_plugin,
122126
remove_input_padding=remove_input_padding,
123-
paged_kv_cache=paged_kv_cache,
127+
kv_cache_type=kv_cache_type,
124128
tokens_per_block=tokens_per_block,
125129
cross_attention=cross_attention,
126130
has_position_embedding=has_position_embedding,

benchmarks/python/gpt_benchmark.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import torch
2121

2222
import tensorrt_llm
23+
from tensorrt_llm.bindings import KVCacheType
2324
from tensorrt_llm.builder import Engine
2425
from tensorrt_llm.runtime import (ChatGLMGenerationSession, GenerationSession,
2526
SamplingConfig)
@@ -77,6 +78,13 @@ def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents,
7778
if hasattr(self, item):
7879
rnn_configs_kwargs[item] = getattr(self, item)
7980

81+
kv_cache_type = KVCacheType.CONTINUOUS
82+
if hasattr(self, 'kv_cache_type'):
83+
kv_cache_type = self.kv_cache_type
84+
else:
85+
if hasattr(self, 'paged_kv_cache'):
86+
kv_cache_type = KVCacheType.PAGED if self.paged_kv_cache == True else KVCacheType.CONTINUOUS
87+
8088
model_config = tensorrt_llm.runtime.ModelConfig(
8189
max_batch_size=self.max_batch_size,
8290
max_beam_width=self.num_beams,
@@ -86,8 +94,7 @@ def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents,
8694
num_kv_heads=ceil(self.num_kv_heads / self.world_size),
8795
hidden_size=self.hidden_size // self.world_size,
8896
gpt_attention_plugin=self.use_gpt_attention_plugin,
89-
paged_kv_cache=self.paged_kv_cache if hasattr(
90-
self, 'paged_kv_cache') else False,
97+
kv_cache_type=kv_cache_type,
9198
paged_state=self.paged_state
9299
if hasattr(self, 'paged_state') else False,
93100
dtype=self.dtype,

0 commit comments

Comments (0)