Changes from all commits
24 commits
74ec464
[https://nvbugs/5569713][fix] Disable fp8 deep gemm for EXAONE-4.0-32…
JunyiXu-nv Oct 21, 2025
c3e181d
[https://nvbugs/5515753][ci] Add NCCL_DEBUG=INFO flag to collect more…
SimengLiu-nv Oct 22, 2025
aaec2f2
[https://nvbugs/5504095][fix] Unwaive test_user_specify_workspace cas…
nv-guomingz Oct 22, 2025
7a33269
[https://nvbugs/5546510][fix] Move torch.cuda.Stream out of torch com…
liji-nv Oct 22, 2025
ead3345
[https://nvbugs/5565549][fix] unwaive test_disaggregated_spec_dec_bat…
bo-nv Oct 22, 2025
3d55b5e
[https://nvbugs/5575829][fix] Unwaive gpt-oss test (#8576)
LinPoly Oct 22, 2025
8963c14
[https://nvbugs/5569754][fix] trtllm-llmapi-launch port conflict (#8…
Superjomn Oct 23, 2025
b5c9c43
[https://nvbugs/5582277][fix] rework DisaggPPTerminationHandler to fi…
reasonsolo Oct 23, 2025
a318a97
[https://nvbugs/5575902][fix] set max_batch_size=1 to stabilize accur…
reasonsolo Oct 23, 2025
d2ad5d3
[https://nvbugs/5587456][fix] Remove multimodal test cases using TRT …
jieli-matrix Oct 24, 2025
d8ad4bd
[None][test] Clean cache for certain easily hang cases (#8619)
crazydemo Oct 24, 2025
a5d39f5
[https://nvbugs/5597647][fix] Fix MNNVL Allreduce accuracy issue on H…
timlee0212 Oct 27, 2025
870da18
[https://nvbugs/5608489][fix] Fix output unpack issues for Llama3/4 N…
hyukn Oct 28, 2025
584ed86
[https://nvbugs/5572320][fix] Ported test_ad_trtllm_bench.py from mai…
MrGeva Oct 28, 2025
ddbb116
[https://nvbugs/5564465][fix] Overwrite only if default_max_tokens is…
LinPoly Oct 28, 2025
db697ce
[https://nvbugs/5578175][fix] Fix block range index (#8470)
chuangz0 Oct 28, 2025
5c42706
[https://nvbugs/5601203] [fix]Restrict fp8 blockscale moe case (#8583)
VALLIS-NERIA Oct 29, 2025
51493c1
[https://nvbugs/5606268][fix] Separate cuda graph workspace to preven…
JunyiXu-nv Oct 29, 2025
3ca32c4
[https://nvbugs/5575841] [test] Move test_moe.py to serial tests to i…
DomBrown Oct 30, 2025
3c2b4fe
[https://nvbugs/5488118][fix] Unwaive passed tests (#8758)
liji-nv Oct 31, 2025
0606630
[None][infra] Remove invaild waived tests which not in release branch…
ZhanruiSunCh Oct 31, 2025
5a41c16
[https://nvbugs/5325296][fix] Enable relaxed acceptance test on Black…
Barry-Delaney Oct 31, 2025
8a75dd5
[https://nvbugs/5444687][fix] Cherrypick online EPLB CI fix from main…
dongxuy04 Nov 3, 2025
6c1fdfd
[None][chore] Update linter rules for mass integration
mikeiovine Nov 4, 2025
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -1073,6 +1073,7 @@ common-files: &common_files |
tests/unittest/_torch/thop/parallel/test_weight_only_quant_gemm.py |
tests/unittest/_torch/thop/parallel/test_weight_only_quant_linear.py |
tests/unittest/_torch/thop/serial/test_moe_alltoall.py |
+ tests/unittest/_torch/thop/serial/test_moe.py |
tests/unittest/api_stability/api_stability_core.py |
tests/unittest/api_stability/test_llm_api.py |
tests/unittest/bindings/binding_test_utils.py |
10 changes: 8 additions & 2 deletions cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
@@ -2142,7 +2142,7 @@ SizeType32 KVCacheManager::getNeededBlocksOneStep(
return 0;
}

- auto const numCurrTokens = mSequences.at(req.mRequestId).getNumTokens();
+ auto const numCurrTokens = getSequence(req.mRequestId).getNumTokens();
auto const generatedTokens = numCurrTokens - req.getPromptLen();
auto const maxTokensToAddToKVCache = req.mMaxNewTokens - generatedTokens;
auto const tokensPerStep = req.getNumDraftTokens() + 1;
@@ -2406,7 +2406,13 @@ void KVCacheManager::addSequence(
void KVCacheManager::storeContextBlocks(LlmRequest const& llmRequest)
{
auto const requestId = llmRequest.mRequestId;
- if (mSequences.find(requestId) != mSequences.end())
+ bool found = false;
+ {
+ // protect the mSequences
+ std::scoped_lock lock(mSequencesMtx);
+ found = mSequences.find(requestId) != mSequences.end();
+ }
+ if (found)
{
auto& sequence = getSequence(requestId);
if (mEnableBlockReuse && !llmRequest.isDummyRequest())
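The storeContextBlocks hunk above takes the map lookup under mSequencesMtx and releases the lock before the follow-up work. A minimal Python sketch of that check-then-act pattern; the class and method names are illustrative, not the actual C++ API:

```python
import threading


class SequenceStoreSketch:
    """Hypothetical stand-in for the KVCacheManager's sequence map."""

    def __init__(self):
        self._sequences = {}                     # request_id -> sequence state
        self._sequences_mtx = threading.Lock()   # plays the role of mSequencesMtx

    def store_context_blocks(self, request_id):
        # Only the lookup happens under the lock; the follow-up work runs
        # outside it, mirroring the scoped_lock block in the diff.
        with self._sequences_mtx:
            found = request_id in self._sequences
        if found:
            self._store(request_id)

    def _store(self, request_id):
        # Placeholder for the block-reuse bookkeeping done in the real code.
        pass
```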
@@ -117,8 +117,8 @@ __device__ struct __attribute__((aligned(32))) LamportFlags
{
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000))
asm volatile("red.async.release.global.gpu.add.u32 [%0], %1;" ::"l"(offset_access_ptr), "r"(1) : "memory");
- #elif (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
- asm volatile("red.global.gpu.add.u32 [%0], %1;" ::"l"(offset_access_ptr), "r"(1) : "memory");
+ #elif (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700))
+ asm volatile("red.release.global.gpu.add.u32 [%0], %1;" ::"l"(offset_access_ptr), "r"(1) : "memory");
#else
atomicAdd(offset_access_ptr, 1);
#endif
13 changes: 13 additions & 0 deletions jenkins/L0_Test.groovy
@@ -2166,6 +2166,19 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
def noIsolateTests = false
def rerunFailed = false

+ echoNodeAndGpuInfo(pipeline, stageName)
+ sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'

+ def extraInternalEnv = ""
+ def pytestTestTimeout = "3600"

+ // TRT uses half of the host logic cores for engine building which is bad for multi-GPU machines.
+ extraInternalEnv = "__LUNOWUD=\"-thread_pool_size=${TESTER_CORES}\""
+ // CPP test execution is timing out easily, so we always override its internal timeout to the same value as pytest
+ extraInternalEnv += " CPP_TEST_TIMEOUT_OVERRIDDEN=${pytestTestTimeout}"
+ // Enable NCCL debug information for multi-GPU tests
+ extraInternalEnv += " NCCL_DEBUG=INFO"

def testDBList = renderTestDB(testList, llmSrc, stageName)

// Process shard test list and create separate files for regular and isolate tests
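The Groovy block above assembles per-run environment overrides (engine-build thread pool size, C++ test timeout, NCCL_DEBUG). A rough Python sketch of passing the same variables to a test invocation; the subprocess command and the pytest-timeout flag are assumptions, not part of the pipeline:

```python
import os
import subprocess


def run_tests_with_debug_env(test_target: str, tester_cores: int = 8):
    env = os.environ.copy()
    # Same variables the Jenkins stage exports via extraInternalEnv.
    env["__LUNOWUD"] = f"-thread_pool_size={tester_cores}"
    env["CPP_TEST_TIMEOUT_OVERRIDDEN"] = "3600"  # align C++ test timeout with pytest
    env["NCCL_DEBUG"] = "INFO"                   # verbose NCCL logs for multi-GPU runs
    # --timeout assumes the pytest-timeout plugin is installed.
    return subprocess.run(["pytest", test_target, "--timeout=3600"],
                          env=env, check=False)
```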
1 change: 1 addition & 0 deletions pyproject.toml
@@ -1113,6 +1113,7 @@ exclude = [
"tests/unittest/_torch/thop/parallel/test_weight_only_quant_gemm.py",
"tests/unittest/_torch/thop/parallel/test_weight_only_quant_linear.py",
"tests/unittest/_torch/thop/serial/test_moe_alltoall.py",
"tests/unittest/_torch/thop/serial/test_moe.py",
"tests/unittest/api_stability/api_stability_core.py",
"tests/unittest/api_stability/test_llm_api.py",
"tests/unittest/bindings/binding_test_utils.py",
14 changes: 12 additions & 2 deletions tensorrt_llm/_torch/attention_backend/trtllm.py
@@ -565,6 +565,7 @@ def is_nvfp4_output_kernel_available(
@dataclass(kw_only=True)
class TrtllmAttentionMetadata(AttentionMetadata):
workspace: Optional[torch.Tensor] = None
+ cuda_graph_workspace: Optional[torch.Tensor] = None

# TrtllmAttention needs to know the beam width to access to the cache indirection buffer,
# when beam search is enabled.
@@ -680,6 +681,14 @@ def _post_init_with_buffers(self, buffers) -> None:
device='cuda',
dtype=torch.int8,
)

+ if self.cuda_graph_workspace is None:
+ self.cuda_graph_workspace = torch.empty(
+ (0, ),
+ device='cuda',
+ dtype=torch.int8,
+ )

if self.kv_cache_manager is not None:
self.kv_cache_block_offsets = self.get_empty(
buffers,
@@ -1317,8 +1326,9 @@ def forward(
host_kv_cache_pool_pointers=metadata.host_kv_cache_pool_pointers,
host_kv_cache_pool_mapping=metadata.host_kv_cache_pool_mapping,
block_ids_per_seq=metadata.block_ids_per_seq,
- workspace=metadata.
- workspace, # re-enable it, if pass None to it, fp8 mla will encounter invalid cuda free issue.
+ # re-enable it, if pass None to it, fp8 mla will encounter invalid cuda free issue.
+ workspace=metadata.workspace
+ if not metadata.is_cuda_graph else metadata.cuda_graph_workspace,
cache_indirection=metadata.cache_indirection,
kv_scale_orig_quant=self.kv_scale_orig_quant,
kv_scale_quant_orig=self.kv_scale_quant_orig,
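The intent of the trtllm.py hunks is that CUDA-graph replays must not share the eager-path workspace, which can be freed or reallocated between runs. A small sketch of the selection logic with assumed attribute names (the real metadata class carries many more fields):

```python
from dataclasses import dataclass
from typing import Optional

import torch


@dataclass
class AttentionMetadataSketch:
    """Illustrative subset of TrtllmAttentionMetadata."""
    is_cuda_graph: bool = False
    workspace: Optional[torch.Tensor] = None
    cuda_graph_workspace: Optional[torch.Tensor] = None


def pick_workspace(md: AttentionMetadataSketch) -> torch.Tensor:
    # Lazily create empty placeholder buffers; the real code resizes them later.
    if md.workspace is None:
        md.workspace = torch.empty((0, ), device="cuda", dtype=torch.int8)
    if md.cuda_graph_workspace is None:
        md.cuda_graph_workspace = torch.empty((0, ), device="cuda", dtype=torch.int8)
    # Captured graphs replay fixed device pointers, so they get a dedicated
    # workspace instead of the one eager calls may swap out underneath them.
    return md.cuda_graph_workspace if md.is_cuda_graph else md.workspace
```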
12 changes: 4 additions & 8 deletions tensorrt_llm/_torch/compilation/backend.py
@@ -51,9 +51,7 @@ def __init__(
self.capture_num_tokens = sorted(capture_num_tokens or [])
self.piecewise_cuda_graph = enable_piecewise_cuda_graph
self.no_optimization = False
- # We only need to create aux streams.
- self.aux_streams = Backend.Streams(
- [torch.cuda.Stream() for _ in range(max_num_streams - 1)])
+ self.num_streams = max_num_streams
self.events = Backend.Events()
inductor_config.enable_auto_functionalized_v2 = False

@@ -109,10 +107,8 @@ def optimize(
# Do not apply multi-stream if enable piecewise cuda graph or inductor
# For piecewise cuda graph, we will apply the multi-stream optimization in piecewise_optimizer
# For inductor, we do not control the passes inside inductor.
- if len(
- self.aux_streams
- ) > 0 and not self.piecewise_cuda_graph and not self.enable_inductor:
- num_events = multi_stream_schedule(gm, len(self.aux_streams) + 1)
+ if self.num_streams > 1 and not self.piecewise_cuda_graph and not self.enable_inductor:
+ num_events = multi_stream_schedule(gm, self.num_streams)
self.generate_events(num_events)

gm.recompile()
@@ -125,7 +121,7 @@ def optimize(
self.input_num_tokens,
self.capture_num_tokens,
self._graph_pool_handle,
- len(self.aux_streams) + 1,
+ self.num_streams,
)
self.generate_events(num_events)
return gm
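After this change the compile backend tracks only a stream count and no longer constructs torch.cuda.Stream objects itself; stream creation moves to the model engine (see the last file in this diff). A minimal sketch of the new gating condition; the class below is illustrative, not the real Backend:

```python
class CompileBackendSketch:
    """Illustrative: multi-stream scheduling is keyed off a count, not a stream list."""

    def __init__(self, max_num_streams: int = 1,
                 piecewise_cuda_graph: bool = False,
                 enable_inductor: bool = False):
        self.num_streams = max_num_streams
        self.piecewise_cuda_graph = piecewise_cuda_graph
        self.enable_inductor = enable_inductor

    def should_apply_multi_stream(self) -> bool:
        # Skipped for piecewise CUDA graphs (handled in the piecewise optimizer)
        # and for inductor (its internal passes are not under our control).
        return (self.num_streams > 1
                and not self.piecewise_cuda_graph
                and not self.enable_inductor)
```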
4 changes: 2 additions & 2 deletions tensorrt_llm/_torch/distributed/communicator.py
@@ -405,8 +405,8 @@ def tp_broadcast(self, obj, root=0, chunk_size: int = 4 * 1024 * 1024):
def pp_allgather(self, obj):
return self.pp_comm.allgather(obj)

- def pp_gather(self, obj):
- return self.pp_comm.gather(obj)
+ def pp_gather(self, obj, root=0):
+ return self.pp_comm.gather(obj, root=root)

def pp_broadcast(self, obj, root=0):
return self.pp_comm.bcast(obj, root)
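pp_gather now forwards an explicit root rank to the underlying MPI gather. A minimal mpi4py sketch of that call shape (assumes mpi4py is available and the script runs under mpirun; COMM_WORLD stands in for the pipeline-parallel sub-communicator used in the real class):

```python
from mpi4py import MPI

comm = MPI.COMM_WORLD


def pp_gather(obj, root: int = 0):
    # Every rank contributes obj; only `root` receives the gathered list,
    # all other ranks get None back.
    return comm.gather(obj, root=root)


if __name__ == "__main__":
    gathered = pp_gather(comm.Get_rank(), root=0)
    if comm.Get_rank() == 0:
        print(gathered)  # e.g. [0, 1, 2, ...], one entry per rank
```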
14 changes: 13 additions & 1 deletion tensorrt_llm/_torch/models/modeling_exaone4.py
@@ -5,6 +5,7 @@

from tensorrt_llm._torch.modules.qk_norm_attention import QKNormRoPEAttention
from tensorrt_llm.functional import PositionEmbeddingType
+ from tensorrt_llm.quantization import QuantAlgo

from ..attention_backend import AttentionMetadata
from ..attention_backend.interface import (PositionalEmbeddingParams,
@@ -54,7 +55,8 @@ class Exaone4Attention(QKNormRoPEAttention):
def __init__(self,
model_config: ModelConfig[Exaone4Config],
layer_idx: Optional[int] = None,
- fuse_qk_norm_rope: bool = False):
+ fuse_qk_norm_rope: bool = False,
+ disable_deep_gemm: bool = False):
config = model_config.pretrained_config

self.attention_window_size = None
Expand Down Expand Up @@ -88,6 +90,7 @@ def __init__(self,
layer_idx=layer_idx,
dtype=config.torch_dtype,
config=model_config,
+ disable_deep_gemm=disable_deep_gemm,
)

def forward(
@@ -128,9 +131,17 @@ def __init__(
self.is_quanted = model_config.quant_config and model_config.quant_config.quant_mode.has_any_quant(
)

+ disable_deep_gemm = False
+ quant_config = getattr(model_config, "quant_config", None)
+ if quant_config is not None:
+ # EXAONE4 fp8 has an illegal memory access issue with deep_gemm.
+ disable_deep_gemm = getattr(quant_config, "quant_algo",
+ None) == QuantAlgo.FP8_BLOCK_SCALES

self.self_attn = Exaone4Attention(
model_config,
layer_idx=layer_idx,
+ disable_deep_gemm=disable_deep_gemm,
)

self.mlp = GatedMLP(
@@ -140,6 +151,7 @@ def __init__(
dtype=config.torch_dtype,
config=model_config,
layer_idx=layer_idx,
+ disable_deep_gemm=disable_deep_gemm,
)

self.post_attention_layernorm = RMSNorm(hidden_size=config.hidden_size,
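The EXAONE-4.0 change flips a disable_deep_gemm flag only for the FP8 block-scale quantization recipe and threads it into the attention and MLP modules. A condensed sketch of the detection step, using a stand-in enum instead of the real QuantAlgo:

```python
from enum import Enum


class QuantAlgoSketch(Enum):
    """Illustrative subset of tensorrt_llm.quantization.QuantAlgo."""
    FP8 = "FP8"
    FP8_BLOCK_SCALES = "FP8_BLOCK_SCALES"


def should_disable_deep_gemm(quant_config) -> bool:
    # Only the FP8 block-scale recipe trips the deep_gemm workaround;
    # a missing quant_config (or any other algo) leaves deep_gemm enabled.
    if quant_config is None:
        return False
    return getattr(quant_config, "quant_algo", None) is QuantAlgoSketch.FP8_BLOCK_SCALES
```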
4 changes: 2 additions & 2 deletions tensorrt_llm/_torch/models/modeling_llama.py
@@ -599,7 +599,7 @@ def forward(
))

# Unpack the allreduce output
- if self.next_attn is not None and self.is_nvfp4:
+ if self.post_feed_forward_fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4:
act_fp4, act_sf, residual = allreduce_output
hidden_states = Fp4QuantizedTensor(act_fp4, act_sf)
else:
@@ -790,7 +790,7 @@ def forward(
scale=scale,
eps=self.next_layer_layernorm.variance_epsilon,
))
- if self.next_attn is not None and self.is_nvfp4:
+ if self.post_mlp_fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4:
act_fp4, act_sf, residual = all_reduce_output
hidden_states = Fp4QuantizedTensor(act_fp4, act_sf)
else:
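Both Llama hunks switch the unpacking condition from "is the next layer NVFP4-capable" to "which fusion op was actually requested", so the tuple layout always matches what the fused allreduce produced. A schematic sketch with stand-in types:

```python
from enum import Enum, auto


class FusionOpSketch(Enum):
    """Illustrative stand-in for the AllReduceFusionOp values used above."""
    RESIDUAL_RMS_NORM = auto()
    RESIDUAL_RMS_NORM_QUANT_NVFP4 = auto()


def unpack_allreduce_output(fusion_op, output):
    # Keying on the fusion op that was passed to the kernel (rather than on
    # next_attn / is_nvfp4) keeps unpacking in sync with the producer.
    if fusion_op is FusionOpSketch.RESIDUAL_RMS_NORM_QUANT_NVFP4:
        act_fp4, act_sf, residual = output   # quantized activations + scale factors
        hidden_states = (act_fp4, act_sf)    # the real code wraps these in Fp4QuantizedTensor
    else:
        hidden_states, residual = output
    return hidden_states, residual
```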
8 changes: 6 additions & 2 deletions tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -270,6 +270,11 @@ def __init__(
use_ub = not use_ub_for_nccl and (
torch_compile_enable_userbuffers
and self._init_userbuffers(self.model.config.hidden_size))
+ self.backend_num_streams = Backend.Streams([
+ torch.cuda.Stream() for _ in
+ range(pytorch_backend_config.torch_compile_max_num_streams -
+ 1)
+ ])
self._torch_compile_backend = Backend(
torch_compile_inductor_enabled,
enable_userbuffers=use_ub,
@@ -2385,8 +2390,7 @@ def model_forward(self, **kwargs):
if self._torch_compile_backend is not None:
# Register aux streams and events to model extra attrs.
# The streams and events are list which could be updated during compilation.
attrs["aux_streams"] = weakref.ref(
self._torch_compile_backend.aux_streams)
attrs["aux_streams"] = weakref.ref(self.backend_num_streams)
attrs["events"] = weakref.ref(self._torch_compile_backend.events)
attrs["global_stream"] = torch.cuda.current_stream()

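Together with the backend.py change, the model engine now owns the auxiliary CUDA streams and hands the compile backend only a weak reference, which keeps torch.cuda.Stream construction outside the torch.compile region. A minimal sketch under assumed names (requires a CUDA device; Streams is shown as a plain list subclass so it can be weakly referenced):

```python
import weakref

import torch


class Streams(list):
    """list subclass: instances accept weak references, unlike plain lists."""


def build_aux_streams(max_num_streams: int) -> Streams:
    # One auxiliary stream per extra requested stream; the default stream is implicit.
    return Streams(torch.cuda.Stream() for _ in range(max_num_streams - 1))


# The engine keeps the strong reference so the streams outlive compilation;
# only a weak reference goes into the model's extra attrs.
aux_streams = build_aux_streams(max_num_streams=3)
extra_attrs = {
    "aux_streams": weakref.ref(aux_streams),
    "global_stream": torch.cuda.current_stream(),
}
```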