
Commit e88da96

Update TensorRT-LLM (NVIDIA#2783)
1 parent 16d2467 commit e88da96


254 files changed: +71,866 / -29,540 lines


3rdparty/ucxx

Submodule ucxx updated 94 files

README.md

Lines changed: 4 additions & 4 deletions
@@ -5,10 +5,10 @@ TensorRT-LLM
 <h4> A TensorRT Toolbox for Optimized Large Language Model Inference</h4>

 [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
-[![python](https://img.shields.io/badge/python-3.12.3-green)](https://www.python.org/downloads/release/python-3123/)
-[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
-[![cuda](https://img.shields.io/badge/cuda-12.6.3-green)](https://developer.nvidia.com/cuda-downloads)
-[![trt](https://img.shields.io/badge/TRT-10.7.0-green)](https://developer.nvidia.com/tensorrt)
+[![python](https://img.shields.io/badge/python-3.12-green)](https://www.python.org/downloads/release/python-3123/)
+[![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
+[![cuda](https://img.shields.io/badge/cuda-12.8.0-green)](https://developer.nvidia.com/cuda-downloads)
+[![trt](https://img.shields.io/badge/TRT-10.8.0-green)](https://developer.nvidia.com/tensorrt)
 [![version](https://img.shields.io/badge/release-0.18.0.dev-green)](./tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)


benchmarks/python/all_reduce.py

Lines changed: 16 additions & 23 deletions
@@ -19,14 +19,15 @@
 import torch
 # isort: on
 from cuda import cuda, cudart
-from polygraphy.backend.trt import CreateConfig, EngineFromNetwork

 import tensorrt_llm as tllm
 from tensorrt_llm import Mapping, Tensor
 from tensorrt_llm._utils import OMPI_COMM_TYPE_HOST, mpi_comm
 from tensorrt_llm.functional import (AllReduceParams, AllReduceStrategy,
                                      allreduce)
-from tensorrt_llm.plugin.plugin import current_all_reduce_helper
+from tensorrt_llm.plugin.plugin import (current_all_reduce_helper,
+                                        init_all_reduce_helper)
+from tensorrt_llm.runtime import Session


 def allreduce_benchmark(dtype: str,
@@ -68,11 +69,13 @@ def allreduce_benchmark(dtype: str,
     ]:
         builder = tllm.Builder()
         net = builder.create_network()
+        net.plugin_config.set_nccl_plugin(dtype)
+        init_all_reduce_helper()
         _buffers, workspace = current_all_reduce_helper(
         ).allocate_workspace(mapping, size * dtype_size)

         with tllm.net_guard(net):
-            network = tllm.default_trtnet()
+            tllm.default_trtnet()

             x = Tensor(name='x',
                        shape=input.shape,
@@ -86,32 +89,20 @@ def allreduce_benchmark(dtype: str,
                 current,
                 mapping.tp_group,
                 all_reduce_params=AllReduceParams(strategy=strategy))
-            output = current.trt_tensor
-
-            network.mark_output(output)
-            output.name = 'output'
-            output.dtype = tllm.str_dtype_to_trt(dtype)
-
-        build_engine = EngineFromNetwork(
-            (builder.trt_builder, net.trt_network),
-            config=CreateConfig(
-                fp16=(dtype == 'float16'),
-                bf16=(dtype == 'bfloat16'),
-                precision_constraints='obey',
-            ))
-
-        output = torch.zeros_like(input)
-
-        stream = torch.cuda.current_stream()
+            current.mark_output('output', dtype)
         feed_dict = {'x': input, 'all_reduce_workspace': workspace}
+        builder_config = builder.create_builder_config(precision=dtype)
+        engine = builder.build_engine(net, builder_config)
+        assert engine is not None, "Failed to build engine"
+        session = Session.from_serialized_engine(engine)

-        session = tllm.runtime.Session.from_engine(build_engine())
         _, start = cuda.cuEventCreate(0)
         _, stop = cuda.cuEventCreate(0)
         runtimes = []

         tllm.mpi_barrier()
-
+        output = torch.empty(input.shape, dtype=torch_dtype, device='cuda')
+        stream = torch.cuda.current_stream()
         for _ in range(10):
             cuda.cuEventRecord(start, stream.cuda_stream)
             session.run(inputs=feed_dict,
@@ -123,7 +114,9 @@ def allreduce_benchmark(dtype: str,
             runtimes.append(ms)

         median_ms = sorted(runtimes)[len(runtimes) // 2]
-        assert torch.allclose(output, (input * world_size)**inner_loop)
+
+        allreduce_ref = (input * world_size)**inner_loop
+        assert torch.allclose(output, allreduce_ref)

         if mapping.rank == 0:
             print(
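
The reworked benchmark builds the engine directly through tllm.Builder and times each allreduce with CUDA events around session.run. For reference only, the same event-based timing pattern expressed in C++ with the CUDA runtime API could look roughly like the sketch below; launchWork is a hypothetical stand-in for whatever operation is being measured and is not part of this commit.

// Minimal sketch of CUDA-event timing, assuming the CUDA runtime API.
#include <cuda_runtime.h>
#include <algorithm>
#include <vector>

float medianKernelTimeMs(int iters, cudaStream_t stream)
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    std::vector<float> runtimes;
    for (int i = 0; i < iters; ++i)
    {
        cudaEventRecord(start, stream);
        // launchWork(stream);  // hypothetical: enqueue the operation being timed
        cudaEventRecord(stop, stream);
        cudaEventSynchronize(stop);

        float ms = 0.f;
        cudaEventElapsedTime(&ms, start, stop);
        runtimes.push_back(ms);
    }

    std::sort(runtimes.begin(), runtimes.end());
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return runtimes[runtimes.size() / 2]; // median, as in the benchmark above
}

As in the Python benchmark, the median of the recorded runtimes is reported rather than the mean, which makes the result less sensitive to warm-up outliers.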

cpp/CMakeLists.txt

Lines changed: 17 additions & 2 deletions
@@ -170,7 +170,16 @@ configure_file(
   ${CMAKE_CURRENT_SOURCE_DIR}/include/tensorrt_llm/executor/version.h)

 # Determine CUDA version before enabling the language extension
+# check_language(CUDA) clears CMAKE_CUDA_HOST_COMPILER if CMAKE_CUDA_COMPILER is
+# not set
+if(NOT CMAKE_CUDA_COMPILER AND CMAKE_CUDA_HOST_COMPILER)
+  set(CMAKE_CUDA_HOST_COMPILER_BACKUP ${CMAKE_CUDA_HOST_COMPILER})
+endif()
 check_language(CUDA)
+if(CMAKE_CUDA_HOST_COMPILER_BACKUP)
+  set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CUDA_HOST_COMPILER_BACKUP})
+  check_language(CUDA)
+endif()
 if(CMAKE_CUDA_COMPILER)
   message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}")
   if(NOT WIN32) # Linux
@@ -613,11 +622,17 @@ if(ENABLE_UCX)
     # that change in USE_CXX11_ABI will not be ignored.
     execute_process(
       COMMAND
+        ${CMAKE_COMMAND} -E env LIB_BUILD_DIR=${CMAKE_BINARY_DIR}/ucxx/build
         ${3RDPARTY_DIR}/ucxx/build.sh libucxx -n
         --cmake-args=\"-DBUILD_SHARED_LIBS=OFF
         -DCMAKE_CXX_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=${USE_CXX11_ABI}\"
-      OUTPUT_QUIET)
-    find_package(ucxx REQUIRED PATHS ${3RDPARTY_DIR}/ucxx/cpp/build
+      OUTPUT_VARIABLE UCXX_BUILD_OUTPUT
+      RESULT_VARIABLE UCXX_BUILD_RESULT)
+    if(UCXX_BUILD_RESULT)
+      message(${UCXX_BUILD_OUTPUT})
+      message(FATAL_ERROR "ucxx build failed")
+    endif()
+    find_package(ucxx REQUIRED PATHS ${CMAKE_BINARY_DIR}/ucxx/build
                  NO_DEFAULT_PATH)
   endif()
 endif()

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 4 additions & 1 deletion
@@ -453,7 +453,7 @@ class BlockManager
         SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream, bool onboardBlocks,
         CacheType cacheType = CacheType::kSELF,
         std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
-        std::shared_ptr<KVCacheEventManager> eventManager = nullptr);
+        std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false);

     ~BlockManager();

@@ -735,6 +735,9 @@ class BlockManager
     SizeType32 mMissedBlocks;
     std::set<KVCacheBlock::IdType> reusedBlockIds;

+    // Whether or not to maintain a hashmap of blocks.
+    bool mEnableHashKey;
+
 private:
     friend class KVCacheManager;
 };
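
The constructor gains an enableHashKey flag, stored in mEnableHashKey, which controls whether the manager additionally maintains a hashmap of blocks. The commit does not show the lookup code itself; a minimal, purely illustrative sketch of that kind of bookkeeping (all names here are hypothetical, not from the TensorRT-LLM sources) could look like:

// Hypothetical sketch: keep a hash-keyed index of blocks only when the flag is set.
#include <cstddef>
#include <memory>
#include <unordered_map>

struct Block
{
    std::size_t hash; // hash of the tokens the block caches (illustrative)
};
using BlockPtr = std::shared_ptr<Block>;

class BlockIndex
{
public:
    explicit BlockIndex(bool enableHashKey)
        : mEnableHashKey(enableHashKey)
    {
    }

    void onBlockStored(BlockPtr const& block)
    {
        if (mEnableHashKey) // the map is maintained only when the feature is on
        {
            mBlocksByHash.emplace(block->hash, block);
        }
    }

    BlockPtr find(std::size_t hash) const
    {
        auto it = mBlocksByHash.find(hash);
        if (it == mBlocksByHash.end())
        {
            return nullptr;
        }
        return it->second;
    }

private:
    bool mEnableHashKey;
    std::unordered_map<std::size_t, BlockPtr> mBlocksByHash;
};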

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 8 additions & 4 deletions
@@ -1536,7 +1536,7 @@ class GenericLlmRequest
     {
         TLLM_CHECK_WITH_INFO(
             isContextInitState() || isDisaggGenerationInitState() || isDisaggGenerationTransmissionComplete(),
-            "getContextChunkSize is only possible during the context phase.");
+            "getContextChunkSize is only possible during the context phase or generation init phase.");
         return mContextChunkSize;
     }

@@ -1545,7 +1545,9 @@ class GenericLlmRequest
     /// remaining length.
     void setContextChunkSize(SizeType32 size)
     {
-        TLLM_CHECK_WITH_INFO(isContextInitState(), "setContextChunkSize is only possible during the context phase.");
+        TLLM_CHECK_WITH_INFO(
+            isContextInitState() || isDisaggGenerationInitState() || isDisaggGenerationTransmissionComplete(),
+            "setContextChunkSize is only possible during the context phase or generation init phase.");
         TLLM_CHECK_WITH_INFO(size >= 0, "The chunk size of context (%d) can't be negative.", size);
         mContextChunkSize = std::min(size, getContextRemainingLength());
     }
@@ -1721,18 +1723,20 @@ class GenericLlmRequest

     void updatePerfMetrics(executor::IterationType iter)
     {
+        auto const currentTokenTime = std::chrono::steady_clock::now();
+
         if (!mPerfMetrics.firstIter)
         {
             mPerfMetrics.firstIter = iter;
-            mPerfMetrics.timingMetrics.firstTokenTime = std::chrono::steady_clock::now();
+            mPerfMetrics.timingMetrics.firstTokenTime = currentTokenTime;
         }

         mPerfMetrics.iter = iter;

         if (isFinished())
         {
             mPerfMetrics.lastIter = iter;
-            mPerfMetrics.timingMetrics.lastTokenTime = std::chrono::steady_clock::now();
+            mPerfMetrics.timingMetrics.lastTokenTime = currentTokenTime;
         }
     }

cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h

Lines changed: 8 additions & 0 deletions
@@ -59,6 +59,8 @@ class BasePeftCacheManager
     using RequestVector = std::vector<LlmRequestPtr>;
     using PeftTable = std::map<uint64_t, std::vector<runtime::LoraCache::TaskLayerModuleConfig>>;

+    virtual ~BasePeftCacheManager() = default;
+
     /**
      * \brief add PEFT weights from llmRequest if any. This will kickoff background copy tasks.
      * \param[in] llmRequest: the request
@@ -100,6 +102,8 @@ class PeftCacheManager : public BasePeftCacheManager
     PeftCacheManager(PeftCacheManagerConfig const& config, runtime::ModelConfig const& modelConfig,
         runtime::WorldConfig const& worldConfig, runtime::BufferManager const& bufferManager);

+    ~PeftCacheManager() override = default;
+
     void addRequestPeft(std::shared_ptr<LlmRequest> llmRequest, bool tryGpuCache = true) override;

     PeftTable ensureBatch(RequestVector const& contextRequests, RequestVector const& generationRequests,
@@ -166,6 +170,10 @@ class PeftCacheManager : public BasePeftCacheManager

 class NoOpPeftCacheManager : public BasePeftCacheManager
 {
+public:
+    ~NoOpPeftCacheManager() override = default;
+
+private:
     void addRequestPeft(std::shared_ptr<LlmRequest> llmRequest, bool tryGpuCache = true) override;

     PeftTable ensureBatch(RequestVector const& contextRequests, RequestVector const& generationRequests,
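
The added destructors make BasePeftCacheManager safely deletable through a base-class pointer; the override declarations in the concrete managers document that they participate in that virtual destruction chain. The general C++ rule the change follows (generic example, not repo-specific code):

// Deleting a derived object through a base pointer is undefined behaviour
// unless the base declares a virtual destructor.
#include <memory>

struct Base
{
    virtual ~Base() = default; // without this, ~Derived below would not run
};

struct Derived : Base
{
    ~Derived() override = default; // releases Derived-owned resources
};

int main()
{
    std::unique_ptr<Base> p = std::make_unique<Derived>(); // destroyed via Base*
    return 0;
}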

cpp/include/tensorrt_llm/common/cudaFp8Utils.h

Lines changed: 85 additions & 0 deletions
@@ -141,6 +141,78 @@ struct PackType
     using type = float;
 };

+template <typename T>
+struct PackType<T, 1>
+{
+    struct __CUDA_ALIGN__(std::alignment_of_v<T>) type
+    {
+        T array[1];
+    };
+};
+
+template <>
+struct PackType<float, 2>
+{
+    struct __CUDA_ALIGN__(8) type
+    {
+        float array[2];
+    };
+};
+
+template <>
+struct PackType<float, 4>
+{
+    struct __CUDA_ALIGN__(16) type
+    {
+        float array[4];
+    };
+};
+
+template <>
+struct PackType<float, 8>
+{
+    struct __CUDA_ALIGN__(32) type
+    {
+        float array[8];
+    };
+};
+
+template <>
+struct PackType<float, 16>
+{
+    struct __CUDA_ALIGN__(64) type
+    {
+        float array[16];
+    };
+};
+
+template <>
+struct PackType<half, 2>
+{
+    struct __CUDA_ALIGN__(4) type
+    {
+        half array[2];
+    };
+};
+
+template <>
+struct PackType<half, 4>
+{
+    struct __CUDA_ALIGN__(8) type
+    {
+        half array[4];
+    };
+};
+
+template <>
+struct PackType<half, 8>
+{
+    struct __CUDA_ALIGN__(16) type
+    {
+        half array[8];
+    };
+};
+
 #ifdef ENABLE_BF16
 template <>
 struct PackType<__nv_bfloat16, 2>
@@ -159,6 +231,12 @@ struct PackType<__nv_bfloat16, 8>
 {
     using type = __nv_bfloat168;
 };
+
+template <>
+struct PackType<__nv_bfloat16, 16>
+{
+    using type = __nv_bfloat1616;
+};
 #endif

 #ifdef ENABLE_FP8
@@ -179,6 +257,13 @@ struct PackType<__nv_fp8_e4m3, 8>
 {
     using type = __nv_fp8_8_e4m3;
 };
+
+template <>
+struct PackType<__nv_fp8_e4m3, 16>
+{
+    using type = __nv_fp8x16_e4m3;
+};
+
 #endif

 __inline__ __device__ void fp8x4_e4m3_to_bfloat2(__nv_bfloat162* out1, __nv_bfloat162* out2, __nv_fp8x4_e4m3 const* in)
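
The new PackType specializations extend the pack-of-N pattern to one-element packs and to 16-wide packs for float, half, __nv_bfloat16, and __nv_fp8_e4m3, so kernels can pick an aligned aggregate and move several elements per memory access. A generic sketch of how such a packed type is typically used (Packed and packedCopy are illustrative names, not the header's actual API):

// Illustrative CUDA sketch: an aligned pack lets one thread move N elements
// with a single wide, aligned load/store pair.
#include <cstddef>

template <typename T, int N>
struct alignas(sizeof(T) * N) Packed
{
    T array[N];
};

template <typename T, int N>
__global__ void packedCopy(T const* in, T* out, std::size_t numElems)
{
    using Pack = Packed<T, N>;
    std::size_t const idx = blockIdx.x * blockDim.x + threadIdx.x;
    std::size_t const numPacks = numElems / N;
    if (idx < numPacks)
    {
        // One aligned load and one aligned store move N elements at a time.
        Pack const p = reinterpret_cast<Pack const*>(in)[idx];
        reinterpret_cast<Pack*>(out)[idx] = p;
    }
}

A launch such as packedCopy<float, 4><<<grid, block>>>(in, out, n) would then move four floats per thread, assuming in and out are 16-byte aligned and n is a multiple of four.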
