
Commit e88da96

Update TensorRT-LLM (NVIDIA#2783)
1 parent 16d2467 commit e88da96


254 files changed: +71,866 / -29,540 lines


3rdparty/ucxx

Submodule ucxx updated 94 files

README.md

Lines changed: 4 additions & 4 deletions
@@ -5,10 +5,10 @@ TensorRT-LLM
 <h4> A TensorRT Toolbox for Optimized Large Language Model Inference</h4>

 [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
-[![python](https://img.shields.io/badge/python-3.12.3-green)](https://www.python.org/downloads/release/python-3123/)
-[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
-[![cuda](https://img.shields.io/badge/cuda-12.6.3-green)](https://developer.nvidia.com/cuda-downloads)
-[![trt](https://img.shields.io/badge/TRT-10.7.0-green)](https://developer.nvidia.com/tensorrt)
+[![python](https://img.shields.io/badge/python-3.12-green)](https://www.python.org/downloads/release/python-3123/)
+[![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
+[![cuda](https://img.shields.io/badge/cuda-12.8.0-green)](https://developer.nvidia.com/cuda-downloads)
+[![trt](https://img.shields.io/badge/TRT-10.8.0-green)](https://developer.nvidia.com/tensorrt)
 [![version](https://img.shields.io/badge/release-0.18.0.dev-green)](./tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)


benchmarks/python/all_reduce.py

Lines changed: 16 additions & 23 deletions
@@ -19,14 +19,15 @@
 import torch
 # isort: on
 from cuda import cuda, cudart
-from polygraphy.backend.trt import CreateConfig, EngineFromNetwork

 import tensorrt_llm as tllm
 from tensorrt_llm import Mapping, Tensor
 from tensorrt_llm._utils import OMPI_COMM_TYPE_HOST, mpi_comm
 from tensorrt_llm.functional import (AllReduceParams, AllReduceStrategy,
                                      allreduce)
-from tensorrt_llm.plugin.plugin import current_all_reduce_helper
+from tensorrt_llm.plugin.plugin import (current_all_reduce_helper,
+                                        init_all_reduce_helper)
+from tensorrt_llm.runtime import Session


 def allreduce_benchmark(dtype: str,
@@ -68,11 +69,13 @@ def allreduce_benchmark(dtype: str,
     ]:
         builder = tllm.Builder()
         net = builder.create_network()
+        net.plugin_config.set_nccl_plugin(dtype)
+        init_all_reduce_helper()
         _buffers, workspace = current_all_reduce_helper(
         ).allocate_workspace(mapping, size * dtype_size)

         with tllm.net_guard(net):
-            network = tllm.default_trtnet()
+            tllm.default_trtnet()

             x = Tensor(name='x',
                        shape=input.shape,
@@ -86,32 +89,20 @@ def allreduce_benchmark(dtype: str,
                 current,
                 mapping.tp_group,
                 all_reduce_params=AllReduceParams(strategy=strategy))
-            output = current.trt_tensor
-
-            network.mark_output(output)
-            output.name = 'output'
-            output.dtype = tllm.str_dtype_to_trt(dtype)
-
-        build_engine = EngineFromNetwork(
-            (builder.trt_builder, net.trt_network),
-            config=CreateConfig(
-                fp16=(dtype == 'float16'),
-                bf16=(dtype == 'bfloat16'),
-                precision_constraints='obey',
-            ))
-
-        output = torch.zeros_like(input)
-
-        stream = torch.cuda.current_stream()
+            current.mark_output('output', dtype)
         feed_dict = {'x': input, 'all_reduce_workspace': workspace}
+        builder_config = builder.create_builder_config(precision=dtype)
+        engine = builder.build_engine(net, builder_config)
+        assert engine is not None, "Failed to build engine"
+        session = Session.from_serialized_engine(engine)

-        session = tllm.runtime.Session.from_engine(build_engine())
         _, start = cuda.cuEventCreate(0)
         _, stop = cuda.cuEventCreate(0)
         runtimes = []

         tllm.mpi_barrier()
-
+        output = torch.empty(input.shape, dtype=torch_dtype, device='cuda')
+        stream = torch.cuda.current_stream()
         for _ in range(10):
             cuda.cuEventRecord(start, stream.cuda_stream)
             session.run(inputs=feed_dict,
@@ -123,7 +114,9 @@ def allreduce_benchmark(dtype: str,
             runtimes.append(ms)

         median_ms = sorted(runtimes)[len(runtimes) // 2]
-        assert torch.allclose(output, (input * world_size)**inner_loop)
+
+        allreduce_ref = (input * world_size)**inner_loop
+        assert torch.allclose(output, allreduce_ref)

         if mapping.rank == 0:
             print(
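
The reworked benchmark builds the engine directly through tllm.Builder and times each allreduce with CUDA events around session.run. For reference only, the same event-based timing pattern expressed in C++ with the CUDA runtime API could look roughly like the sketch below; launchWork is a hypothetical stand-in for whatever operation is being measured and is not part of this commit.

// Minimal sketch of CUDA-event timing, assuming the CUDA runtime API.
#include <cuda_runtime.h>
#include <algorithm>
#include <vector>

float medianKernelTimeMs(int iters, cudaStream_t stream)
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    std::vector<float> runtimes;
    for (int i = 0; i < iters; ++i)
    {
        cudaEventRecord(start, stream);
        // launchWork(stream);  // hypothetical: enqueue the operation being timed
        cudaEventRecord(stop, stream);
        cudaEventSynchronize(stop);

        float ms = 0.f;
        cudaEventElapsedTime(&ms, start, stop);
        runtimes.push_back(ms);
    }

    std::sort(runtimes.begin(), runtimes.end());
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return runtimes[runtimes.size() / 2]; // median, as in the benchmark above
}

As in the Python benchmark, the median of the recorded runtimes is reported rather than the mean, which makes the result less sensitive to warm-up outliers.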

cpp/CMakeLists.txt

Lines changed: 17 additions & 2 deletions
@@ -170,7 +170,16 @@ configure_file(
   ${CMAKE_CURRENT_SOURCE_DIR}/include/tensorrt_llm/executor/version.h)

 # Determine CUDA version before enabling the language extension
+# check_language(CUDA) clears CMAKE_CUDA_HOST_COMPILER if CMAKE_CUDA_COMPILER is
+# not set
+if(NOT CMAKE_CUDA_COMPILER AND CMAKE_CUDA_HOST_COMPILER)
+  set(CMAKE_CUDA_HOST_COMPILER_BACKUP ${CMAKE_CUDA_HOST_COMPILER})
+endif()
 check_language(CUDA)
+if(CMAKE_CUDA_HOST_COMPILER_BACKUP)
+  set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CUDA_HOST_COMPILER_BACKUP})
+  check_language(CUDA)
+endif()
 if(CMAKE_CUDA_COMPILER)
   message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}")
   if(NOT WIN32) # Linux
@@ -613,11 +622,17 @@ if(ENABLE_UCX)
     # that change in USE_CXX11_ABI will not be ignored.
     execute_process(
       COMMAND
+        ${CMAKE_COMMAND} -E env LIB_BUILD_DIR=${CMAKE_BINARY_DIR}/ucxx/build
         ${3RDPARTY_DIR}/ucxx/build.sh libucxx -n
         --cmake-args=\"-DBUILD_SHARED_LIBS=OFF
         -DCMAKE_CXX_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=${USE_CXX11_ABI}\"
-      OUTPUT_QUIET)
-    find_package(ucxx REQUIRED PATHS ${3RDPARTY_DIR}/ucxx/cpp/build
+      OUTPUT_VARIABLE UCXX_BUILD_OUTPUT
+      RESULT_VARIABLE UCXX_BUILD_RESULT)
+    if(UCXX_BUILD_RESULT)
+      message(${UCXX_BUILD_OUTPUT})
+      message(FATAL_ERROR "ucxx build failed")
+    endif()
+    find_package(ucxx REQUIRED PATHS ${CMAKE_BINARY_DIR}/ucxx/build
                  NO_DEFAULT_PATH)
   endif()
 endif()

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 4 additions & 1 deletion
@@ -453,7 +453,7 @@ class BlockManager
         SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream, bool onboardBlocks,
         CacheType cacheType = CacheType::kSELF,
         std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
-        std::shared_ptr<KVCacheEventManager> eventManager = nullptr);
+        std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false);

     ~BlockManager();

@@ -735,6 +735,9 @@ class BlockManager
     SizeType32 mMissedBlocks;
     std::set<KVCacheBlock::IdType> reusedBlockIds;

+    // Whether or not to maintain a hashmap of blocks.
+    bool mEnableHashKey;
+
 private:
     friend class KVCacheManager;
 };
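
The constructor gains an enableHashKey flag, stored in mEnableHashKey, which controls whether the manager additionally maintains a hashmap of blocks. The commit does not show the lookup code itself; a minimal, purely illustrative sketch of that kind of bookkeeping (all names here are hypothetical, not from the TensorRT-LLM sources) could look like:

// Hypothetical sketch: keep a hash-keyed index of blocks only when the flag is set.
#include <cstddef>
#include <memory>
#include <unordered_map>

struct Block
{
    std::size_t hash; // hash of the tokens the block caches (illustrative)
};
using BlockPtr = std::shared_ptr<Block>;

class BlockIndex
{
public:
    explicit BlockIndex(bool enableHashKey)
        : mEnableHashKey(enableHashKey)
    {
    }

    void onBlockStored(BlockPtr const& block)
    {
        if (mEnableHashKey) // the map is maintained only when the feature is on
        {
            mBlocksByHash.emplace(block->hash, block);
        }
    }

    BlockPtr find(std::size_t hash) const
    {
        auto it = mBlocksByHash.find(hash);
        if (it == mBlocksByHash.end())
        {
            return nullptr;
        }
        return it->second;
    }

private:
    bool mEnableHashKey;
    std::unordered_map<std::size_t, BlockPtr> mBlocksByHash;
};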

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 8 additions & 4 deletions
@@ -1536,7 +1536,7 @@ class GenericLlmRequest
     {
         TLLM_CHECK_WITH_INFO(
             isContextInitState() || isDisaggGenerationInitState() || isDisaggGenerationTransmissionComplete(),
-            "getContextChunkSize is only possible during the context phase.");
+            "getContextChunkSize is only possible during the context phase or generation init phase.");
         return mContextChunkSize;
     }

@@ -1545,7 +1545,9 @@ class GenericLlmRequest
     /// remaining length.
     void setContextChunkSize(SizeType32 size)
     {
-        TLLM_CHECK_WITH_INFO(isContextInitState(), "setContextChunkSize is only possible during the context phase.");
+        TLLM_CHECK_WITH_INFO(
+            isContextInitState() || isDisaggGenerationInitState() || isDisaggGenerationTransmissionComplete(),
+            "setContextChunkSize is only possible during the context phase or generation init phase.");
         TLLM_CHECK_WITH_INFO(size >= 0, "The chunk size of context (%d) can't be negative.", size);
         mContextChunkSize = std::min(size, getContextRemainingLength());
     }
@@ -1721,18 +1723,20 @@ class GenericLlmRequest

     void updatePerfMetrics(executor::IterationType iter)
     {
+        auto const currentTokenTime = std::chrono::steady_clock::now();
+
         if (!mPerfMetrics.firstIter)
         {
             mPerfMetrics.firstIter = iter;
-            mPerfMetrics.timingMetrics.firstTokenTime = std::chrono::steady_clock::now();
+            mPerfMetrics.timingMetrics.firstTokenTime = currentTokenTime;
         }

         mPerfMetrics.iter = iter;

         if (isFinished())
         {
             mPerfMetrics.lastIter = iter;
-            mPerfMetrics.timingMetrics.lastTokenTime = std::chrono::steady_clock::now();
+            mPerfMetrics.timingMetrics.lastTokenTime = currentTokenTime;
         }
     }

cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h

Lines changed: 8 additions & 0 deletions
@@ -59,6 +59,8 @@ class BasePeftCacheManager
     using RequestVector = std::vector<LlmRequestPtr>;
     using PeftTable = std::map<uint64_t, std::vector<runtime::LoraCache::TaskLayerModuleConfig>>;

+    virtual ~BasePeftCacheManager() = default;
+
     /**
      * \brief add PEFT weights from llmRequest if any. This will kickoff background copy tasks.
      * \param[in] llmRequest: the request
@@ -100,6 +102,8 @@ class PeftCacheManager : public BasePeftCacheManager
     PeftCacheManager(PeftCacheManagerConfig const& config, runtime::ModelConfig const& modelConfig,
         runtime::WorldConfig const& worldConfig, runtime::BufferManager const& bufferManager);

+    ~PeftCacheManager() override = default;
+
     void addRequestPeft(std::shared_ptr<LlmRequest> llmRequest, bool tryGpuCache = true) override;

     PeftTable ensureBatch(RequestVector const& contextRequests, RequestVector const& generationRequests,
@@ -166,6 +170,10 @@ class PeftCacheManager : public BasePeftCacheManager

 class NoOpPeftCacheManager : public BasePeftCacheManager
 {
+public:
+    ~NoOpPeftCacheManager() override = default;
+
+private:
     void addRequestPeft(std::shared_ptr<LlmRequest> llmRequest, bool tryGpuCache = true) override;

     PeftTable ensureBatch(RequestVector const& contextRequests, RequestVector const& generationRequests,
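
The added destructors make BasePeftCacheManager safely deletable through a base-class pointer; the override declarations in the concrete managers document that they participate in that virtual destruction chain. The general C++ rule the change follows (generic example, not repo-specific code):

// Deleting a derived object through a base pointer is undefined behaviour
// unless the base declares a virtual destructor.
#include <memory>

struct Base
{
    virtual ~Base() = default; // without this, ~Derived below would not run
};

struct Derived : Base
{
    ~Derived() override = default; // releases Derived-owned resources
};

int main()
{
    std::unique_ptr<Base> p = std::make_unique<Derived>(); // destroyed via Base*
    return 0;
}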

cpp/include/tensorrt_llm/common/cudaFp8Utils.h

Lines changed: 85 additions & 0 deletions
@@ -141,6 +141,78 @@ struct PackType
     using type = float;
 };

+template <typename T>
+struct PackType<T, 1>
+{
+    struct __CUDA_ALIGN__(std::alignment_of_v<T>) type
+    {
+        T array[1];
+    };
+};
+
+template <>
+struct PackType<float, 2>
+{
+    struct __CUDA_ALIGN__(8) type
+    {
+        float array[2];
+    };
+};
+
+template <>
+struct PackType<float, 4>
+{
+    struct __CUDA_ALIGN__(16) type
+    {
+        float array[4];
+    };
+};
+
+template <>
+struct PackType<float, 8>
+{
+    struct __CUDA_ALIGN__(32) type
+    {
+        float array[8];
+    };
+};
+
+template <>
+struct PackType<float, 16>
+{
+    struct __CUDA_ALIGN__(64) type
+    {
+        float array[16];
+    };
+};
+
+template <>
+struct PackType<half, 2>
+{
+    struct __CUDA_ALIGN__(4) type
+    {
+        half array[2];
+    };
+};
+
+template <>
+struct PackType<half, 4>
+{
+    struct __CUDA_ALIGN__(8) type
+    {
+        half array[4];
+    };
+};
+
+template <>
+struct PackType<half, 8>
+{
+    struct __CUDA_ALIGN__(16) type
+    {
+        half array[8];
+    };
+};
+
 #ifdef ENABLE_BF16
 template <>
 struct PackType<__nv_bfloat16, 2>
@@ -159,6 +231,12 @@ struct PackType<__nv_bfloat16, 8>
 {
     using type = __nv_bfloat168;
 };
+
+template <>
+struct PackType<__nv_bfloat16, 16>
+{
+    using type = __nv_bfloat1616;
+};
 #endif

 #ifdef ENABLE_FP8
@@ -179,6 +257,13 @@ struct PackType<__nv_fp8_e4m3, 8>
 {
     using type = __nv_fp8_8_e4m3;
 };
+
+template <>
+struct PackType<__nv_fp8_e4m3, 16>
+{
+    using type = __nv_fp8x16_e4m3;
+};
+
 #endif

 __inline__ __device__ void fp8x4_e4m3_to_bfloat2(__nv_bfloat162* out1, __nv_bfloat162* out2, __nv_fp8x4_e4m3 const* in)
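
The new PackType specializations extend the pack-of-N pattern to one-element packs and to 16-wide packs for float, half, __nv_bfloat16, and __nv_fp8_e4m3, so kernels can pick an aligned aggregate and move several elements per memory access. A generic sketch of how such a packed type is typically used (Packed and packedCopy are illustrative names, not the header's actual API):

// Illustrative CUDA sketch: an aligned pack lets one thread move N elements
// with a single wide, aligned load/store pair.
#include <cstddef>

template <typename T, int N>
struct alignas(sizeof(T) * N) Packed
{
    T array[N];
};

template <typename T, int N>
__global__ void packedCopy(T const* in, T* out, std::size_t numElems)
{
    using Pack = Packed<T, N>;
    std::size_t const idx = blockIdx.x * blockDim.x + threadIdx.x;
    std::size_t const numPacks = numElems / N;
    if (idx < numPacks)
    {
        // One aligned load and one aligned store move N elements at a time.
        Pack const p = reinterpret_cast<Pack const*>(in)[idx];
        reinterpret_cast<Pack*>(out)[idx] = p;
    }
}

A launch such as packedCopy<float, 4><<<grid, block>>>(in, out, n) would then move four floats per thread, assuming in and out are 16-byte aligned and n is a multiple of four.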
