diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a75e55b79d3..b9ff023abea 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1073,6 +1073,7 @@ common-files: &common_files | tests/unittest/_torch/thop/parallel/test_weight_only_quant_gemm.py | tests/unittest/_torch/thop/parallel/test_weight_only_quant_linear.py | tests/unittest/_torch/thop/serial/test_moe_alltoall.py | + tests/unittest/_torch/thop/serial/test_moe.py | tests/unittest/api_stability/api_stability_core.py | tests/unittest/api_stability/test_llm_api.py | tests/unittest/bindings/binding_test_utils.py | diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp index b0b7b494fa6..a7b3cda7bff 100644 --- a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp +++ b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp @@ -2142,7 +2142,7 @@ SizeType32 KVCacheManager::getNeededBlocksOneStep( return 0; } - auto const numCurrTokens = mSequences.at(req.mRequestId).getNumTokens(); + auto const numCurrTokens = getSequence(req.mRequestId).getNumTokens(); auto const generatedTokens = numCurrTokens - req.getPromptLen(); auto const maxTokensToAddToKVCache = req.mMaxNewTokens - generatedTokens; auto const tokensPerStep = req.getNumDraftTokens() + 1; @@ -2406,7 +2406,13 @@ void KVCacheManager::addSequence( void KVCacheManager::storeContextBlocks(LlmRequest const& llmRequest) { auto const requestId = llmRequest.mRequestId; - if (mSequences.find(requestId) != mSequences.end()) + bool found = false; + { + // protect the mSequences + std::scoped_lock lock(mSequencesMtx); + found = mSequences.find(requestId) != mSequences.end(); + } + if (found) { auto& sequence = getSequence(requestId); if (mEnableBlockReuse && !llmRequest.isDummyRequest()) diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.cu index 1cb65e6910a..313754155d7 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.cu +++ b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.cu @@ -117,8 +117,8 @@ __device__ struct __attribute__((aligned(32))) LamportFlags { #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)) asm volatile("red.async.release.global.gpu.add.u32 [%0], %1;" ::"l"(offset_access_ptr), "r"(1) : "memory"); -#elif (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("red.global.gpu.add.u32 [%0], %1;" ::"l"(offset_access_ptr), "r"(1) : "memory"); +#elif (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)) + asm volatile("red.release.global.gpu.add.u32 [%0], %1;" ::"l"(offset_access_ptr), "r"(1) : "memory"); #else atomicAdd(offset_access_ptr, 1); #endif diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 20b9a2a6ab1..901ab606f22 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -2166,6 +2166,19 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO def noIsolateTests = false def rerunFailed = false + echoNodeAndGpuInfo(pipeline, stageName) + sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi' + + def extraInternalEnv = "" + def pytestTestTimeout = "3600" + + // TRT uses half of the host logic cores for engine building which is bad for multi-GPU machines. 
+ extraInternalEnv = "__LUNOWUD=\"-thread_pool_size=${TESTER_CORES}\"" + // CPP test execution is timing out easily, so we always override its internal timeout to the same value as pytest + extraInternalEnv += " CPP_TEST_TIMEOUT_OVERRIDDEN=${pytestTestTimeout}" + // Enable NCCL debug information for multi-GPU tests + extraInternalEnv += " NCCL_DEBUG=INFO" + def testDBList = renderTestDB(testList, llmSrc, stageName) // Process shard test list and create separate files for regular and isolate tests diff --git a/pyproject.toml b/pyproject.toml index 21e2921ad61..267bb14b64c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1113,6 +1113,7 @@ exclude = [ "tests/unittest/_torch/thop/parallel/test_weight_only_quant_gemm.py", "tests/unittest/_torch/thop/parallel/test_weight_only_quant_linear.py", "tests/unittest/_torch/thop/serial/test_moe_alltoall.py", + "tests/unittest/_torch/thop/serial/test_moe.py", "tests/unittest/api_stability/api_stability_core.py", "tests/unittest/api_stability/test_llm_api.py", "tests/unittest/bindings/binding_test_utils.py", diff --git a/tensorrt_llm/_torch/attention_backend/trtllm.py b/tensorrt_llm/_torch/attention_backend/trtllm.py index 8b5a30d1a0b..3fcf616bcdb 100644 --- a/tensorrt_llm/_torch/attention_backend/trtllm.py +++ b/tensorrt_llm/_torch/attention_backend/trtllm.py @@ -565,6 +565,7 @@ def is_nvfp4_output_kernel_available( @dataclass(kw_only=True) class TrtllmAttentionMetadata(AttentionMetadata): workspace: Optional[torch.Tensor] = None + cuda_graph_workspace: Optional[torch.Tensor] = None # TrtllmAttention needs to know the beam width to access to the cache indirection buffer, # when beam search is enabled. @@ -680,6 +681,14 @@ def _post_init_with_buffers(self, buffers) -> None: device='cuda', dtype=torch.int8, ) + + if self.cuda_graph_workspace is None: + self.cuda_graph_workspace = torch.empty( + (0, ), + device='cuda', + dtype=torch.int8, + ) + if self.kv_cache_manager is not None: self.kv_cache_block_offsets = self.get_empty( buffers, @@ -1317,8 +1326,9 @@ def forward( host_kv_cache_pool_pointers=metadata.host_kv_cache_pool_pointers, host_kv_cache_pool_mapping=metadata.host_kv_cache_pool_mapping, block_ids_per_seq=metadata.block_ids_per_seq, - workspace=metadata. - workspace, # re-enable it, if pass None to it, fp8 mla will encounter invalid cuda free issue. + # re-enable it, if pass None to it, fp8 mla will encounter invalid cuda free issue. + workspace=metadata.workspace + if not metadata.is_cuda_graph else metadata.cuda_graph_workspace, cache_indirection=metadata.cache_indirection, kv_scale_orig_quant=self.kv_scale_orig_quant, kv_scale_quant_orig=self.kv_scale_quant_orig, diff --git a/tensorrt_llm/_torch/compilation/backend.py b/tensorrt_llm/_torch/compilation/backend.py index 4ccb052498a..dc77dec6696 100644 --- a/tensorrt_llm/_torch/compilation/backend.py +++ b/tensorrt_llm/_torch/compilation/backend.py @@ -51,9 +51,7 @@ def __init__( self.capture_num_tokens = sorted(capture_num_tokens or []) self.piecewise_cuda_graph = enable_piecewise_cuda_graph self.no_optimization = False - # We only need to create aux streams. 
- self.aux_streams = Backend.Streams( - [torch.cuda.Stream() for _ in range(max_num_streams - 1)]) + self.num_streams = max_num_streams self.events = Backend.Events() inductor_config.enable_auto_functionalized_v2 = False @@ -109,10 +107,8 @@ def optimize( # Do not apply multi-stream if enable piecewise cuda graph or inductor # For piecewise cuda graph, we will apply the multi-stream optimization in piecewise_optimizer # For inductor, we do not control the passes inside inductor. - if len( - self.aux_streams - ) > 0 and not self.piecewise_cuda_graph and not self.enable_inductor: - num_events = multi_stream_schedule(gm, len(self.aux_streams) + 1) + if self.num_streams > 1 and not self.piecewise_cuda_graph and not self.enable_inductor: + num_events = multi_stream_schedule(gm, self.num_streams) self.generate_events(num_events) gm.recompile() @@ -125,7 +121,7 @@ def optimize( self.input_num_tokens, self.capture_num_tokens, self._graph_pool_handle, - len(self.aux_streams) + 1, + self.num_streams, ) self.generate_events(num_events) return gm diff --git a/tensorrt_llm/_torch/distributed/communicator.py b/tensorrt_llm/_torch/distributed/communicator.py index 07f2b4227f9..f4ca3cde291 100644 --- a/tensorrt_llm/_torch/distributed/communicator.py +++ b/tensorrt_llm/_torch/distributed/communicator.py @@ -405,8 +405,8 @@ def tp_broadcast(self, obj, root=0, chunk_size: int = 4 * 1024 * 1024): def pp_allgather(self, obj): return self.pp_comm.allgather(obj) - def pp_gather(self, obj): - return self.pp_comm.gather(obj) + def pp_gather(self, obj, root=0): + return self.pp_comm.gather(obj, root=root) def pp_broadcast(self, obj, root=0): return self.pp_comm.bcast(obj, root) diff --git a/tensorrt_llm/_torch/models/modeling_exaone4.py b/tensorrt_llm/_torch/models/modeling_exaone4.py index 86147787aa8..07951fc28a4 100644 --- a/tensorrt_llm/_torch/models/modeling_exaone4.py +++ b/tensorrt_llm/_torch/models/modeling_exaone4.py @@ -5,6 +5,7 @@ from tensorrt_llm._torch.modules.qk_norm_attention import QKNormRoPEAttention from tensorrt_llm.functional import PositionEmbeddingType +from tensorrt_llm.quantization import QuantAlgo from ..attention_backend import AttentionMetadata from ..attention_backend.interface import (PositionalEmbeddingParams, @@ -54,7 +55,8 @@ class Exaone4Attention(QKNormRoPEAttention): def __init__(self, model_config: ModelConfig[Exaone4Config], layer_idx: Optional[int] = None, - fuse_qk_norm_rope: bool = False): + fuse_qk_norm_rope: bool = False, + disable_deep_gemm: bool = False): config = model_config.pretrained_config self.attention_window_size = None @@ -88,6 +90,7 @@ def __init__(self, layer_idx=layer_idx, dtype=config.torch_dtype, config=model_config, + disable_deep_gemm=disable_deep_gemm, ) def forward( @@ -128,9 +131,17 @@ def __init__( self.is_quanted = model_config.quant_config and model_config.quant_config.quant_mode.has_any_quant( ) + disable_deep_gemm = False + quant_config = getattr(model_config, "quant_config", None) + if quant_config is not None: + # EXAONE4 fp8 has an illegal memory access issue with deep_gemm. 
+ disable_deep_gemm = getattr(quant_config, "quant_algo", + None) == QuantAlgo.FP8_BLOCK_SCALES + self.self_attn = Exaone4Attention( model_config, layer_idx=layer_idx, + disable_deep_gemm=disable_deep_gemm, ) self.mlp = GatedMLP( @@ -140,6 +151,7 @@ def __init__( dtype=config.torch_dtype, config=model_config, layer_idx=layer_idx, + disable_deep_gemm=disable_deep_gemm, ) self.post_attention_layernorm = RMSNorm(hidden_size=config.hidden_size, diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py index 38d487a7eae..2f5bf1eef57 100644 --- a/tensorrt_llm/_torch/models/modeling_llama.py +++ b/tensorrt_llm/_torch/models/modeling_llama.py @@ -599,7 +599,7 @@ def forward( )) # Unpack the allreduce output - if self.next_attn is not None and self.is_nvfp4: + if self.post_feed_forward_fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4: act_fp4, act_sf, residual = allreduce_output hidden_states = Fp4QuantizedTensor(act_fp4, act_sf) else: @@ -790,7 +790,7 @@ def forward( scale=scale, eps=self.next_layer_layernorm.variance_epsilon, )) - if self.next_attn is not None and self.is_nvfp4: + if self.post_mlp_fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4: act_fp4, act_sf, residual = all_reduce_output hidden_states = Fp4QuantizedTensor(act_fp4, act_sf) else: diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index e6da9fc216a..1d0e6899cdf 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -270,6 +270,11 @@ def __init__( use_ub = not use_ub_for_nccl and ( torch_compile_enable_userbuffers and self._init_userbuffers(self.model.config.hidden_size)) + self.backend_num_streams = Backend.Streams([ + torch.cuda.Stream() for _ in + range(pytorch_backend_config.torch_compile_max_num_streams - + 1) + ]) self._torch_compile_backend = Backend( torch_compile_inductor_enabled, enable_userbuffers=use_ub, @@ -2385,8 +2390,7 @@ def model_forward(self, **kwargs): if self._torch_compile_backend is not None: # Register aux streams and events to model extra attrs. # The streams and events are list which could be updated during compilation. 
- attrs["aux_streams"] = weakref.ref( - self._torch_compile_backend.aux_streams) + attrs["aux_streams"] = weakref.ref(self.backend_num_streams) attrs["events"] = weakref.ref(self._torch_compile_backend.events) attrs["global_stream"] = torch.cuda.current_stream() diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 0bcc090ff5b..96e795d1c1a 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -7,7 +7,7 @@ import time import traceback from contextlib import contextmanager -from typing import Dict, Iterable, List, Optional, Tuple, Union +from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union import torch @@ -267,7 +267,7 @@ def __init__(self, self._disagg_pp_termination_handler = None if self.dist.pp_size > 1 and self.enable_kv_cache_reuse and self.kv_cache_transceiver: self._disagg_pp_termination_handler = DisaggPPTerminationHandler( - self.num_micro_batches, self.dist) + self.dist, self._do_terminate_request) if self.dist.pp_size > 1: self.event_loop = self._executor_loop_pp @@ -763,9 +763,6 @@ def _executor_loop_cleanup(self): if h is not None: h.wait() - if self._disagg_pp_termination_handler is not None: - self._disagg_pp_termination_handler.cleanup() - with self.response_cv: self.is_shutdown = True self.response_cv.notify_all() @@ -981,10 +978,8 @@ def _executor_loop_pp(self): self._terminate_disagg_ctx_finished_requests() if self._disagg_pp_termination_handler is not None: - requests_to_terminate = self._disagg_pp_termination_handler.sync( - prev_microbatch_id) - for req in requests_to_terminate: - self._do_terminate_request(req) + self._disagg_pp_termination_handler.terminate_pending_requests( + ) # march forward in microbatch slots microbatch_id = (microbatch_id + 1) % self.num_micro_batches @@ -2385,94 +2380,73 @@ class DisaggPPTerminationHandler: resources to avoid a NCCL hang. """ - def __init__(self, num_micro_batches: int, dist): - self.dist = dist - # Request termination synchronization across PP ranks - # {request_id: {'ready_to_terminate': set{ranks}, 'terminated': {ranks}}} - self.pending_termination = {} - self.termination_handles = [None] * num_micro_batches - # Local map from request_id -> local LlmRequest awaiting consensus termination - self.local_termination = {} - - def terminate(self, request: LlmRequest) -> bool: - req_key = request.py_request_id - self.local_termination[req_key] = request - state = self.pending_termination.get(req_key, None) - if state is None: - state = {'ready_to_terminate': set(), 'terminated': set()} - self.pending_termination[req_key] = state - if self.dist.rank not in state['ready_to_terminate']: - state['ready_to_terminate'].add(self.dist.rank) - return False - - def sync(self, microbatch_id: int) -> List[LlmRequest]: - """Ring-communicate pending termination state and apply local terminations upon consensus. - - Each rank sends its current pending_termination snapshot to the next PP rank - and receives the previous rank's snapshot. After merging, apply any terminations - that have reached consensus (i.e., all PP ranks are ready). 
- """ - snapshot = { - req_id: { - 'ready_to_terminate': state.get('ready_to_terminate', set()), - 'terminated': state.get('terminated', set()), - } - for req_id, state in self.pending_termination.items() - } - - if self.termination_handles[microbatch_id] is not None: - self.termination_handles[microbatch_id].wait() + def __init__(self, dist, terminator_func: Callable[[LlmRequest], None]): + self._dist = dist + self._terminator_func = terminator_func + self._pending_termination = {} + self._terminating_iteration = 0 + self._send_handle = None + self._comm_tag = TERMINATION_COMM_TAG_BASE - term_tag = TERMINATION_COMM_TAG_BASE + microbatch_id - self.termination_handles[microbatch_id] = self.dist.isend_object( - snapshot, - dest=self.dist.next_pp_rank, - tag=term_tag, - ) - remote_state = self.dist.recv_object( - src=self.dist.prev_pp_rank, - tag=term_tag, - ) - logger.debug( - f"received remote state for microbatch {microbatch_id}, prev pp rank: {self.dist.prev_pp_rank} state {remote_state}" - ) + def terminate(self, request: LlmRequest): + self._pending_termination[request.py_request_id] = request - if remote_state: - for req_id, state in remote_state.items(): - local = self.pending_termination.get(req_id) - if local is None: - self.pending_termination[req_id] = { - 'ready_to_terminate': state.get('ready_to_terminate', - set()), - 'terminated': state.get('terminated', set()), - } - else: - for key in ('ready_to_terminate', 'terminated'): - for r in state.get(key, []): - if r not in local[key]: - local[key].add(r) + @nvtx_range("_disagg_pp_termination_handler_sync") + def terminate_pending_requests(self): + """ + Ring-style communicating to decide which requests to be terminated and avoid bubbles. + This ensures that one request is terminated from rank_0 to rank_(pp_size-1) in order. 
+ """ + terminate_req_ids = [] + term_state = None + if self._send_handle: + self._send_handle.wait() + + if not (self._dist.is_first_pp_rank + and self._terminating_iteration == 0): + term_state = self._dist.recv_object(src=self._dist.prev_pp_rank, + tag=self._comm_tag) + + ready_req_map = term_state["ready"] if term_state else { + } # {req_id: num_ranks} ranks vote in the ready dict + terminate_req_ids = term_state["term"] if term_state else [ + ] # request ids to be terminated in the current iteration + + reqs_to_terminate = { + req_id: self._pending_termination.pop(req_id, None) + for req_id in terminate_req_ids + if req_id in self._pending_termination + } - requests_to_terminate = [] - to_delete = [] - for req_id, state in self.pending_termination.items(): - ready = state.get('ready_to_terminate', set()) - done = state.get('terminated', set()) - # If all PP ranks are ready to terminate the request, we can free the resources - if len(ready) >= self.dist.pp_size and self.dist.rank not in done: - local_req = self.local_termination.get(req_id) - if local_req is not None: - requests_to_terminate.append(local_req) - done.add(self.dist.rank) - if len(done) >= self.dist.pp_size: - to_delete.append(req_id) - if req_id in self.local_termination: - self.local_termination.pop(req_id, None) - for req_id in to_delete: - self.pending_termination.pop(req_id, None) + if self._dist.is_first_pp_rank: + # rank0 proposes the requests to be terminated + ready_req_map = {req_id: 1 for req_id in self._pending_termination} + else: + # if a rank agrees to terminate a request, increase the vote count for the request id + for req_id in ready_req_map.keys(): + if req_id in self._pending_termination: + ready_req_map[req_id] += 1 + + if self._dist.is_last_pp_rank: + new_terminate_req_ids = [ + req_id for req_id, num_ranks in ready_req_map.items() + if num_ranks == self._dist.pp_size + ] + # by determining the terminate ids in the last rank, we can save the overhead of sending the ready dict back to rank0 + new_term_state = {"ready": {}, "term": new_terminate_req_ids} + else: + # other pp ranks pass the updated ready dict and terminate request ids to the next rank, and the + # terminate_req_ids will not change in a given iteration, so we can terminate the requests synchronously + new_term_state = {"ready": ready_req_map, "term": terminate_req_ids} - return requests_to_terminate + self._send_handle = self._dist.isend_object( + new_term_state, dest=self._dist.next_pp_rank, tag=self._comm_tag) - def cleanup(self): - for h in self.termination_handles: - if h is not None: - h.wait() + if reqs_to_terminate: + logger.debug( + f'rank {self._dist.pp_rank} terminates {list(reqs_to_terminate.keys())} in iter {self._terminating_iteration}' + ) + for req_id, req in reqs_to_terminate.items(): + if req: + self._terminator_func(req) + self._terminating_iteration += 1 diff --git a/tensorrt_llm/executor/base_worker.py b/tensorrt_llm/executor/base_worker.py index ff6d3402bb6..df455d96241 100644 --- a/tensorrt_llm/executor/base_worker.py +++ b/tensorrt_llm/executor/base_worker.py @@ -428,7 +428,7 @@ def _deduce_max_tokens(request: GenerationRequest, # default_max_tokens is the biggest available value if max_tokens is None: return default_max_tokens - elif max_tokens > default_max_tokens: + elif max_tokens > default_max_tokens and default_max_tokens > 0: logger.warning( f"User-specified `max_tokens` ({max_tokens}) is greater than deduced " f"`default_max_tokens` ({default_max_tokens}), using default_max_tokens instead." 
diff --git a/tensorrt_llm/llmapi/trtllm-llmapi-launch b/tensorrt_llm/llmapi/trtllm-llmapi-launch index d552289fc12..62b46be4c8c 100755 --- a/tensorrt_llm/llmapi/trtllm-llmapi-launch +++ b/tensorrt_llm/llmapi/trtllm-llmapi-launch @@ -24,17 +24,9 @@ function mpi_world_size { } function export_free_tcp_addr_for_spawn_proxy_process { - # find free port starting from 10012 - local free_port=$(python -c 'import socket; s=socket.socket(); -port = 10012 -while True: - try: - s.bind(("", port)) - break - except OSError: - port += 1 -print(port); s.close()') - export TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR="tcp://127.0.0.1:${free_port}" + # Generate unique IPC address without importing tensorrt_llm to avoid MPI initialization conflicts + local free_port=$(python3 -c "import uuid, tempfile, os; print(f'ipc://{os.path.join(tempfile.gettempdir(), \"rpc_test_\" + str(uuid.uuid4()))}')") + export TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR=$free_port log_stderr "TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR: $TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR" export TLLM_SPAWN_PROXY_PROCESS_IPC_HMAC_KEY=$(openssl rand -hex 32) @@ -44,9 +36,12 @@ print(port); s.close()') export tllm_mpi_size=$(mpi_world_size) log_stderr "tllm_mpi_size: $tllm_mpi_size" -export_free_tcp_addr_for_spawn_proxy_process if [ -z "$mpi_rank" ] || [ "$mpi_rank" -eq 0 ]; then + + # IPC only works on localhost and in MPI rank0 process + export_free_tcp_addr_for_spawn_proxy_process + log_stderr "Rank${mpi_rank} run ${task_with_command[@]} in background" # MPI doesn't allow spawn a process sharing the MPI environment in a MPI diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index 5a1f28fc9a2..e2422fe8db8 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -181,11 +181,18 @@ def launch_disaggregated_llm( gen_servers.append((env_gen, gen_server_args)) @contextlib.contextmanager - def multi_popen(server_configs): + def multi_popen(server_configs, server_name="", enable_redirect_log=False): processes = [] + log_files = [] try: - for env, args in server_configs: - proc = popen(args, env=env) + for i, (env, args) in enumerate(server_configs): + if enable_redirect_log: + f = open(f"output_{server_name}_{i}.log", "w+") + env["TLLM_LOG_LEVEL"] = "INFO" + proc = popen(args, env=env, stdout=f, stderr=f) + log_files.append(f) + else: + proc = popen(args, env=env) processes.append(proc) with contextlib.ExitStack() as stack: @@ -193,6 +200,8 @@ def multi_popen(server_configs): stack.enter_context(proc) for proc in processes ] yield opened_processes + for f in log_files: + f.close() except Exception as e: print( f"Failed to start disaggregated server processes in multi_popen: {e}" @@ -204,13 +213,19 @@ def multi_popen(server_configs): disaggregated_serving_config_path, "--server_start_timeout", str(server_waiting_timeout) ] - with (MyThreadPoolExecutor(max_workers=16) as - thread_pool, temp_dir, multi_popen(ctx_servers + gen_servers) as - worker_processes, popen(server_cmd) as server_process): + with ( + MyThreadPoolExecutor(max_workers=16) as thread_pool, + temp_dir, + multi_popen(ctx_servers, "ctx") as ctx_processes, + multi_popen(gen_servers, "gen") as gen_processes, + multi_popen([(os.environ, server_cmd)], "disagg") as + server_processes, + ): start_time = time.time() while time.time() - start_time < server_waiting_timeout: time.sleep(5) - for process in itertools.chain(worker_processes, [server_process]): + 
for process in itertools.chain(ctx_processes, gen_processes, + server_processes): if process.poll() is not None: raise Exception( f"process {process.pid} exited with code {process.returncode}" @@ -306,6 +321,7 @@ def run_parallel_test(model_name: str, kv_cache_config = { "free_gpu_memory_fraction": 0.5, + "enable_block_reuse": True } ctx_server_config = { "pipeline_parallel_size": ctx_pp, @@ -404,7 +420,6 @@ def test_auto_dtype(self, disable_overlap_scheduler, ctx_enable_block_reuse, task.evaluate(llm) @pytest.mark.skip_less_device(2) - @skip_pre_hopper def test_ngram(self): speculative_decoding_config = { "decoding_type": "NGram", @@ -1014,12 +1029,15 @@ def test_chunked_prefill(self): }, "enable_chunked_prefill": True, "max_num_tokens": 256, + "max_batch_size": + 1, # max_batch_size=1 will stabilize the accuracy test result at a cost of speed } gen_server_config = { "cuda_graph_config": None, "cache_transceiver_config": { "backend": "DEFAULT" - } + }, + "max_batch_size": 1, } disaggregated_server_config = { "hostname": "localhost", diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index ce08c2117b6..0a88305e59b 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -652,15 +652,15 @@ def test_nvfp4_tp4(self): @pytest.mark.skip_less_device(4) @skip_pre_blackwell - def test_fp8_tp2pp2(self): - model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP8" + def test_fp4_tp2pp2(self): + model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP4" kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5) with LLM(model_path, tensor_parallel_size=2, pipeline_parallel_size=2, max_batch_size=32, kv_cache_config=kv_cache_config) as llm: - assert llm.args.quant_config.quant_algo == QuantAlgo.FP8 + assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 sampling_params = SamplingParams( max_tokens=256, temperature=0.0, diff --git a/tests/integration/defs/conftest.py b/tests/integration/defs/conftest.py index 05be999be9a..4e8630dc873 100644 --- a/tests/integration/defs/conftest.py +++ b/tests/integration/defs/conftest.py @@ -2679,6 +2679,7 @@ def torch_empty_cache() -> None: Manually empty the torch CUDA cache before each test, to reduce risk of OOM errors. 
""" if torch.cuda.is_available(): + gc.collect() torch.cuda.empty_cache() gc.collect() diff --git a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py index f10aa6af24e..570e499fb74 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py +++ b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py @@ -419,8 +419,9 @@ def test_disaggregated_spec_dec_batch_slot_limit(model, spec_dec_model_path, max_batch_size=1)) kv_cache_configs = [ - KvCacheConfig(max_tokens=128, enable_block_reuse=False) - for _ in range(2) + KvCacheConfig(max_tokens=128, + enable_block_reuse=False, + free_gpu_memory_fraction=0.4) for _ in range(2) ] cache_transceiver_configs = [ CacheTransceiverConfig(backend="DEFAULT") for _ in range(2) diff --git a/tests/integration/defs/examples/serve/test_serve.py b/tests/integration/defs/examples/serve/test_serve.py index 1a8b07aa4af..c861d525a26 100755 --- a/tests/integration/defs/examples/serve/test_serve.py +++ b/tests/integration/defs/examples/serve/test_serve.py @@ -2,7 +2,7 @@ import time import requests -from defs.conftest import llm_models_root, skip_pre_hopper +from defs.conftest import llm_models_root, skip_no_hopper from defs.trt_test_alternative import popen, print_error, print_info from openai import OpenAI from requests.exceptions import RequestException @@ -92,9 +92,11 @@ def check_openai_chat_completion(http_port="8000", raise -@skip_pre_hopper +@skip_no_hopper def test_extra_llm_api_options(serve_test_root): test_configs_root = f"{serve_test_root}/test_configs" + + # moe backend = CUTLASS which only supports fp8 blockscale on Hopper config_file = f"{test_configs_root}/Qwen3-30B-A3B-FP8.yml" model_path = f"{llm_models_root()}/Qwen3/Qwen3-30B-A3B-FP8" diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index 81267671de8..80a894a1a91 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -31,10 +31,10 @@ from .common import (PluginOptions, convert_weights, get_mmlu_accuracy, prune_checkpoint, quantize_data, refit_model, venv_check_call) -from .conftest import (get_device_count, llm_models_root, skip_no_sm120, - skip_nvlink_inactive, skip_post_blackwell, skip_pre_ada, - skip_pre_blackwell, skip_pre_hopper, tests_path, - unittest_path) +from .conftest import (get_device_count, get_sm_version, llm_models_root, + skip_no_sm120, skip_nvlink_inactive, skip_post_blackwell, + skip_pre_ada, skip_pre_blackwell, skip_pre_hopper, + tests_path, unittest_path) sys.path.append(os.path.join(str(tests_path()), '/../examples/apps')) @@ -2195,7 +2195,6 @@ def test_ptp_quickstart_advanced_deepseek_r1_8gpus(llm_root, llm_venv, _check_mem_usage(running_log, [106.3, 0, 0, 0], 8) -@skip_post_blackwell @pytest.mark.skip_less_device_memory(110000) @pytest.mark.skip_less_device(8) @pytest.mark.parametrize("model_name,model_path", [ @@ -2206,6 +2205,7 @@ def test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus( llm_root, llm_venv, model_name, model_path): print(f"Testing {model_name}.") example_root = Path(os.path.join(llm_root, "examples", "llm-api")) + is_blackwell = get_sm_version() > 90 with tempfile.NamedTemporaryFile(mode='w+t', suffix=f".{model_name}.log", dir="./", @@ -2219,7 +2219,7 @@ def test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus( "--moe_ep_size=8", "--tp_size=8", "--use_cuda_graph", - f"--kv_cache_fraction={_MEM_FRACTION_95}", + 
f"--kv_cache_fraction={_MEM_FRACTION_50 if is_blackwell else _MEM_FRACTION_95}", "--max_batch_size=1", "--max_seq_len=3000", "--disable_kv_cache_reuse", @@ -2232,6 +2232,8 @@ def test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus( "--relaxed_delta=0.5", "--enable_attention_dp", "--use_one_model", + "--moe_backend", + "DEEPGEMM" if is_blackwell else "CUTLASS", ], stdout=running_log) _check_mem_usage(running_log, [85.6, 0, 0, 0], 8) @@ -2273,7 +2275,7 @@ def test_ptp_quickstart_advanced_deepseek_r1_w4afp8_8gpus( @pytest.mark.skip_less_device_memory(80000) @pytest.mark.parametrize("model_name,model_path,gpu_count", [ - ("Llama3.1-70B-BF16", "llama-3.1-model/Meta-Llama-3.1-70B", 2), + ("Llama3.1-70B-BF16", "llama-3.1-model/Meta-Llama-3.1-70B", 8), ("Mixtral-8x7B-BF16", "Mixtral-8x7B-v0.1", 8), pytest.param('Llama3.1-70B-FP8', 'llama-3.1-model/Llama-3.1-70B-Instruct-FP8', @@ -2304,7 +2306,7 @@ def test_ptp_quickstart_advanced_multi_gpus(llm_root, llm_venv, model_name, pytest.skip(f"Not enough GPUs for {model_name}") example_root = Path(os.path.join(llm_root, "examples", "llm-api")) mapping = { - "Llama3.1-70B-BF16": 91.0, + "Llama3.1-70B-BF16": 24.6, "Mixtral-8x7B-BF16": 16.5, "Llama3.1-70B-FP8": 58.5, "Llama3.1-405B-FP8": 63.2, diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index 094c7055ef9..e0a40f3ba39 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -417,7 +417,7 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True] accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 -accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp2pp2 +accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True] accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False] @@ -652,7 +652,7 @@ test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta test_e2e.py::test_ptp_quickstart_advanced[Llama3.2-11B-BF16-llama-3.2-models/Llama-3.2-11B-Vision] test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B] test_e2e.py::test_ptp_quickstart_advanced_ngram[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct] -test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-70B-BF16-llama-3.1-model/Meta-Llama-3.1-70B-2] +test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-70B-BF16-llama-3.1-model/Meta-Llama-3.1-70B-8] test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8-2] test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-405B-FP8-llama-3.1-model/Llama-3.1-405B-Instruct-FP8-8] test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Mixtral-8x7B-BF16-Mixtral-8x7B-v0.1-8] diff --git a/tests/integration/test_lists/qa/llm_function_core_sanity.txt b/tests/integration/test_lists/qa/llm_function_core_sanity.txt index 
7672e2d9815..28086f91129 100644 --- a/tests/integration/test_lists/qa/llm_function_core_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_core_sanity.txt @@ -129,7 +129,7 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized -accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp2pp2 +accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True] @@ -229,7 +229,7 @@ test_e2e.py::test_openai_consistent_chat test_e2e.py::test_openai_multi_chat_example test_e2e.py::test_ptp_quickstart test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-405B-FP8-llama-3.1-model/Llama-3.1-405B-Instruct-FP8-8] -test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-70B-BF16-llama-3.1-model/Meta-Llama-3.1-70B-2] +test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-70B-BF16-llama-3.1-model/Meta-Llama-3.1-70B-8] test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8-2] test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Mixtral-8x7B-BF16-Mixtral-8x7B-v0.1-8] test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Mixtral-8x7B-NVFP4-nvfp4-quantized/Mixtral-8x7B-Instruct-v0.1-8] diff --git a/tests/integration/test_lists/qa/llm_function_nim.txt b/tests/integration/test_lists/qa/llm_function_nim.txt index 1045f2e9b5d..518a3a20629 100644 --- a/tests/integration/test_lists/qa/llm_function_nim.txt +++ b/tests/integration/test_lists/qa/llm_function_nim.txt @@ -27,14 +27,6 @@ examples/test_llama.py::test_llm_llama_2gpu_fp4[llama-3.1-70b-instruct-fp4_plugi examples/test_mixtral.py::test_llm_mixtral_moe_plugin_fp8_lora_4gpus[Mixtral-8x7B-v0.1-chinese-mixtral-lora] examples/test_mixtral.py::test_llm_mixtral_moe_plugin_lora_4gpus[Mixtral-8x7B-v0.1-chinese-mixtral-lora] examples/test_mixtral.py::test_llm_mixtral_int4_awq_1gpu_summary[mixtral-8x7b-v0.1-AWQ] -examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[Phi-3.5-vision-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[Phi-4-multimodal-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[Qwen2-VL-7B-Instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:4] -examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:2-bfloat16-bs:1-cpp_e2e:False-nb:1] # Multimodal Executor Cpp E2E Tests examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False] @@ -67,8 +59,6 @@ 
examples/test_qwen.py::test_llm_hf_qwen_quantization_1gpu[qwen2_vl_7b_instruct-f examples/test_qwen.py::test_llm_hf_qwen_multi_lora_1gpu[qwen2_0.5b_instruct] examples/test_qwen.py::test_llm_hf_qwen_multi_lora_1gpu[qwen2.5_0.5b_instruct] examples/test_qwen.py::test_llm_hf_qwen_multi_lora_1gpu[qwen2.5_1.5b_instruct] -examples/test_qwenvl.py::test_llm_qwenvl_single_gpu_summary[qwen-vl-chat] -examples/test_qwen2audio.py::test_llm_qwen2audio_single_gpu[qwen2_audio_7b_instruct] examples/test_gpt.py::test_llm_gpt2_starcoder_1node_4gpus[starcoder2-disable_fmha-enable_gemm_plugin-enable_attention_plugin] examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int4-float16] examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int8-float16] @@ -437,7 +427,7 @@ test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta- test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8] test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B] test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B] -test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-70B-BF16-llama-3.1-model/Meta-Llama-3.1-70B-2] +test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-70B-BF16-llama-3.1-model/Meta-Llama-3.1-70B-8] test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8-2] test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-405B-FP8-llama-3.1-model/Llama-3.1-405B-Instruct-FP8-8] test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Mixtral-8x7B-BF16-Mixtral-8x7B-v0.1-8] diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml index 114717be909..7795f7925b7 100644 --- a/tests/integration/test_lists/test-db/l0_b200.yml +++ b/tests/integration/test_lists/test-db/l0_b200.yml @@ -76,7 +76,9 @@ l0_b200: - unittest/_torch/modeling -k "modeling_llama" - unittest/_torch/modeling -k "modeling_mixtral" - unittest/_torch/modeling -k "modeling_gpt_oss" + - unittest/_torch/modeling/test_modeling_exaone4.py::TestEXAONE4::test_llm_load_1_FP8 # ------------- AutoDeploy tests --------------- + - unittest/_torch/auto_deploy/unit/singlegpu -k "not test_trtllm_bench_backend_comparison" - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-1] - unittest/_torch/auto_deploy/unit/singlegpu - condition: diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index da24259218e..103f31b24da 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -57,7 +57,7 @@ l0_dgx_b200: - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 - - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp2pp2 + - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2 - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4 - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto] - condition: diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml index 
dc43383222c..5b47b2e6e6c 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ b/tests/integration/test_lists/test-db/l0_h100.yml @@ -32,6 +32,9 @@ l0_h100: - unittest/_torch/modeling -k "modeling_nemotron" - unittest/_torch/modeling -k "modeling_gemma3" - unittest/_torch/modeling -k "modeling_gpt_oss" + - unittest/disaggregated/test_disagg_utils.py + - unittest/disaggregated/test_router.py + - unittest/disaggregated/test_remoteDictionary.py - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_without_reuse - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_reuse diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index d3b11864737..6db4fb847df 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -274,7 +274,6 @@ accuracy/test_cli_flow.py::TestPhi4MiniInstruct::test_tp2 SKIP (https://nvbugs/5 accuracy/test_cli_flow.py::TestLongAlpaca7B::test_auto_dtype SKIP (https://nvbugs/5481075) accuracy/test_llm_api.py::TestPhi4MiniInstruct::test_fp8 SKIP (https://nvbugs/5465143, 5481206 WNF) accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype SKIP (https://nvbugs/5481090) -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram SKIP (https://nvbugs/5488118) test_e2e.py::test_trtllm_bench_iteration_log[TRT-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B] SKIP (https://nvbugs/5448523) cpp/test_unit_tests.py::test_unit_tests[kernels-80] SKIP (https://nvbugs/5504078) accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype SKIP (https://nvbugs/5520319) @@ -336,7 +335,6 @@ cpp/test_multi_gpu.py::test_cache_transceiver[2proc-ucx_kvcache-90] SKIP (https: full:L40S/accuracy/test_cli_flow.py::TestGpt2::test_variable_beam_width_search SKIP (https://nvbugs/5481075) full:L40S/accuracy/test_cli_flow.py::TestGpt2::test_weight_streaming_plugin SKIP (https://nvbugs/5568052) full:L40S/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp1pp2] SKIP (https://nvbugs/5596337) -full:L40S/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=2] SKIP (https://nvbugs/5596337) accuracy/test_llm_api.py::TestMixtral8x7BInstruct::test_awq_tp2 SKIP (https://nvbugs/5598847) test_e2e.py::test_ptp_quickstart_multimodal_multiturn[gemma-3-27b-it-gemma/gemma-3-27b-it] SKIP (https://nvbugs/5568836) unittest/executor/test_rpc.py SKIP (https://nvbugs/5596365) @@ -360,48 +358,3 @@ full:RTX/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-trtllm- full:RTX/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-trtllm-fp8] SKIP (https://nvbugs/5569719) full:RTX/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-trtllm-auto] SKIP (https://nvbugs/5569719) full:RTX/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-trtllm-fp8] SKIP (https://nvbugs/5569719) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto] SKIP (https://nvbugs/5596343) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto] SKIP (https://nvbugs/5596343) -examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp16] SKIP (https://nvbugs/5612313) -accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False] 
SKIP (https://nvbugs/5569696) -accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True] SKIP (https://nvbugs/5569696) -test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[DeepSeek-V3-671B-FP8-DeepSeek-V3-0324-8] SKIP (https://nvbugs/5613456) -test_e2e.py::test_trtllm_serve_multimodal_example SKIP (https://nvbugs/5596377) -unittest/llmapi/test_llm_multi_gpu_pytorch.py::test_llm_rpc_streaming_tp2 SKIP (https://nvbugs/5594753) -triton_server/test_triton.py::test_cpp_unit_tests[cpp-unit-tests] SKIP (https://nvbugs/5619359) -triton_server/test_triton_rcca.py::test_rcca_bug_4934893[Temperature:0.5-TOP_P:0.95-TOP_K:10-False-1---False-True-False-0-2048-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble] SKIP (https://nvbugs/5619369) -unittest/_torch/thop/parallel/test_fp8_rowwise_linear.py::test_fp8_rowwise_linear[dtype0] SKIP (https://nvbugs/5619396) -unittest/_torch/thop/parallel/test_fp8_rowwise_linear.py::test_fp8_rowwise_linear[dtype1] SKIP (https://nvbugs/5619396) -accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] SKIP (https://nvbugs/5582258) -accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] SKIP (https://nvbugs/5624367) -accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False] SKIP (https://nvbugs/5624367) -disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5587574) -triton_server/test_triton_llm.py::test_llava[False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization---1-1-1-False-tensorrt_llm_bls] SKIP (https://nvbugs/5434308) -accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] SKIP (https://nvbugs/5629910) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline] SKIP (https://nvbugs/5625962) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_mtp1] SKIP (https://nvbugs/5625962) -test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-image_audio] SKIP (https://nvbugs/5630274) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline] SKIP (https://nvbugs/5630345) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_mtp1] SKIP (https://nvbugs/5630345) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_fp8kv] SKIP (https://nvbugs/5630345) -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp2pp2-fp8kv=True-attn_backend=FLASHINFER-torch_compile=False] SKIP (https://nvbugs/5587393) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_fp8kv] SKIP (https://nvbugs/5629887) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5503479) -test_e2e.py::test_openai_chat_harmony SKIP (https://nvbugs/5596382) -test_e2e.py::test_ptp_quickstart_multimodal_multiturn[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8] SKIP (https://nvbugs/5568836) -disaggregated/test_auto_scaling.py::test_disagg_server_restart[etcd-round_robin] SKIP (https://nvbugs/5633340) 
-triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-TYPE_BF16-False-1---False-True-False-0-1-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--0.7-guaranteed_no_evict---1-1-1-False-ensemble] SKIP (https://nvbugs/5606136) -triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-TYPE_BF16-False-1---False-True-False-0-1-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--0.7-max_utilization---1-1-1-False-ensemble] SKIP (https://nvbugs/5606136) -triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-TYPE_BF16-False-1---False-True-False-0-1-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--0.7-guaranteed_no_evict---1-1-1-False-ensemble] SKIP (https://nvbugs/5606136) -triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-TYPE_BF16-False-1---False-True-False-0-1-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--0.7-max_utilization---1-1-1-False-ensemble] SKIP (https://nvbugs/5606136) -triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-TYPE_BF16-False-1---False-True-False-0-1-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--0.7-guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls] SKIP (https://nvbugs/5606136) -triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-TYPE_BF16-False-1---False-True-False-0-1-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--0.7-max_utilization---1-1-1-False-tensorrt_llm_bls] SKIP (https://nvbugs/5606136) -triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-TYPE_BF16-False-1---False-True-False-0-1-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--0.7-guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls] SKIP (https://nvbugs/5606136) -triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-TYPE_BF16-False-1---False-True-False-0-1-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--0.7-max_utilization---1-1-1-False-tensorrt_llm_bls] SKIP (https://nvbugs/5606136) -accuracy/test_cli_flow.py::TestMinitron4BBase::test_fp8 SKIP (https://nvbugs/5606233) -examples/test_gpt.py::test_llm_minitron_fp8_with_pseudo_loras[4b] SKIP (https://nvbugs/5606233) -unittest/_torch/auto_deploy/unit/singlegpu/compile/test_cuda_graph_batch_sizes.py::TestCudaGraphBatchSizes::test_forward_fallback_for_oversized_batch SKIP (https://nvbugs/5606166) -accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache] SKIP (https://nvbugs/5606266) -examples/test_llm_api_with_mpi.py::test_llm_api_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5606268) -disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[True-False-DeepSeek-V3-Lite-fp8/fp8] SKIP (https://nvbugs/5626197) -disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[True-True-DeepSeek-V3-Lite-fp8/fp8] SKIP (https://nvbugs/5628952) diff --git a/tests/unittest/_torch/modeling/test_modeling_exaone4.py b/tests/unittest/_torch/modeling/test_modeling_exaone4.py index 28a35323b6e..d4eec8cbc66 100644 --- a/tests/unittest/_torch/modeling/test_modeling_exaone4.py +++ b/tests/unittest/_torch/modeling/test_modeling_exaone4.py @@ -1,3 +1,6 @@ +import json +import os +import shutil import unittest from copy import deepcopy from dataclasses import dataclass @@ -51,8 +54,9 @@ class Exaone4Config(PretrainedConfig): "max_position_embeddings": 131072, "model_type": "exaone4", 
"num_attention_heads": 40, - "num_hidden_layers": - 4, #NOTE: For testing, we use 4 instead of 64(all layers) + # NOTE: For testing, we use 32 instead of 64(all layers) + # Increase from 4 to 32 to trigger the deep_gemm kernel issue + "num_hidden_layers": 32, "num_key_value_heads": 8, "pad_token_id": 0, "rms_norm_eps": 1e-05, @@ -74,6 +78,15 @@ class Exaone4Config(PretrainedConfig): "attn_implementation": "flash_attention_2" } +EXAONE4_FP8_QUANT_CONFIG = { + "quantization_config": { + "activation_scheme": "dynamic", + "modules_to_not_convert": None, + "quant_method": "fp8", + "weight_block_size": [128, 128] + }, +} + @dataclass(repr=False) class Scenario: @@ -390,3 +403,30 @@ def run_forward(input_ids, position_ids, attn_metadata): if graph_runner is not None: graph_runner.clear() kv_cache_manager.shutdown() + + @parameterized.expand([None, "FP8"]) + def test_llm_load(self, quant_algo): + + def dump_config_json(dst_dir, config): + if os.path.exists(dst_dir): + shutil.rmtree(dst_dir) + os.makedirs(dst_dir) + + dst_path = os.path.join(dst_dir, 'config.json') + with open(dst_path, 'w', encoding='utf-8') as f: + json.dump(config, f, indent=2, ensure_ascii=False) + + config_dict = deepcopy(EXAONE4_SINGLE_LAYER_CONFIG) + if quant_algo == "FP8": + if getSMVersion() < 89: + self.skipTest( + "This test is not supported in pre-Ada architecture") + + config_dict.update(EXAONE4_FP8_QUANT_CONFIG) + + tmp_model_dir = f"/tmp/exaone4_llm_load_test_model" + dump_config_json(tmp_model_dir, config_dict) + try: + tensorrt_llm.LLM(model=tmp_model_dir, load_format="dummy") + except Exception: + raise RuntimeError("Failed to load model.") diff --git a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py index 7ccbc50d7b6..53692c51921 100644 --- a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py +++ b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py @@ -183,7 +183,6 @@ def func(input, residual, norm_weight, eps, enable_fusion): ) def test_row_linear_residual_norm_fusion(seq_len, hidden_size, dtype, strategy, fusion): - torch.manual_seed(42) tensor_parallel_size = 2 diff --git a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py index a94e89c743f..5a38f0d0788 100644 --- a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py +++ b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py @@ -33,9 +33,6 @@ def test_deepseek_streaming(model_name, backend, quant, tp_size): is_fp8 = quant == "fp8" is_fp4 = quant == "fp4" - if tp_size == 4: - pytest.skip(f"https://nvbugs/5515753") - if torch.cuda.device_count() < tp_size: pytest.skip(f"Not enough GPUs available, need {tp_size} " f"but only have {torch.cuda.device_count()}") diff --git a/tests/unittest/_torch/thop/parallel/test_fp8_rowwise_linear.py b/tests/unittest/_torch/thop/parallel/test_fp8_rowwise_linear.py index 2f4ad7cb714..e1b36a94604 100644 --- a/tests/unittest/_torch/thop/parallel/test_fp8_rowwise_linear.py +++ b/tests/unittest/_torch/thop/parallel/test_fp8_rowwise_linear.py @@ -1,12 +1,11 @@ import pytest import torch -from utils.util import skip_pre_hopper from tensorrt_llm._torch.modules.linear import Linear from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig -@skip_pre_hopper +@skip_blackwell @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) def test_fp8_rowwise_linear(dtype): SEQ_LEN = 10 diff --git a/tests/unittest/_torch/thop/parallel/test_moe.py 
b/tests/unittest/_torch/thop/serial/test_moe.py similarity index 99% rename from tests/unittest/_torch/thop/parallel/test_moe.py rename to tests/unittest/_torch/thop/serial/test_moe.py index ddfa2eca65f..dde1927521a 100644 --- a/tests/unittest/_torch/thop/parallel/test_moe.py +++ b/tests/unittest/_torch/thop/serial/test_moe.py @@ -1062,6 +1062,7 @@ class TestMoeFp4: ) def test_autotune(self, num_tokens, hidden_size, intermediate_size, routing_info): + pytest.skip("https://nvbugs/5575841") self.run_moe_fp4_test(num_tokens, hidden_size, @@ -1148,6 +1149,7 @@ def test_autotune_fp8_fp4(self, num_tokens, hidden_size, intermediate_size, ids=["use_score_as_input", "use_topk_as_input"]) def test_no_autotune(self, num_tokens, hidden_size, intermediate_size, routing_info, use_topk_as_input): + pytest.skip("https://nvbugs/5575841") self.run_moe_fp4_test(num_tokens, hidden_size, @@ -1215,6 +1217,9 @@ def run_moe_fp4_test(self, num_tokens: int, hidden_size: int, if padding >= 256: pytest.skip("Routing kernel requires that padding be less than 256") + if intermediate_size == 384: + pytest.skip("https://nvbugs/5434352") + assert top_k <= num_experts assert top_k <= 10 assert num_experts % 4 == 0 diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index 2e9c59a9485..1f62a53ee9e 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -590,7 +590,6 @@ def llm_for_sampling_params(): llm.shutdown() -@pytest.mark.skip(reason="https://nvbugs/5504095") @pytest.mark.part0 def test_user_specify_workspace(): user_specified_ws_path = '/tmp/specified_workspace'
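
The DisaggPPTerminationHandler rework in tensorrt_llm/_torch/pyexecutor/py_executor.py above replaces per-microbatch snapshot merging with a ring pass: rank 0 proposes termination candidates, intermediate ranks add their votes, the last pipeline rank declares consensus, and the decision is applied as it travels around the ring on the next pass. A toy single-process model of that voting order, with hypothetical names and no real communication, intended only as an illustration (not the actual distributed implementation):

from typing import Dict, List, Set


def simulate_termination_ring(pending: List[Set[int]],
                              num_passes: int) -> List[List[int]]:
    """Toy model: pending[r] holds the request ids rank r wants to free.

    Each pass, rank 0 proposes its pending ids, later ranks add a vote for ids
    they also have pending, and the last rank marks ids with a full vote count
    as terminated; every rank applies that decision on the following pass.
    Returns the ids actually freed in each pass.
    """
    pp_size = len(pending)
    freed_per_pass: List[List[int]] = []
    term_ids: List[int] = []  # decision carried over from the previous pass
    for _ in range(num_passes):
        ready: Dict[int, int] = {}
        for rank in range(pp_size):
            for req_id in term_ids:
                pending[rank].discard(req_id)  # apply the agreed termination
            if rank == 0:
                ready = {req_id: 1 for req_id in pending[0]}
            else:
                for req_id in ready:
                    if req_id in pending[rank]:
                        ready[req_id] += 1
        freed_per_pass.append(sorted(term_ids))
        term_ids = [req_id for req_id, votes in ready.items()
                    if votes == pp_size]
    return freed_per_pass


# Request 7 is pending on all three ranks and reaches consensus; request 9 is
# pending only on rank 1 and is never freed. The agreed id is freed one pass
# after the last rank's decision.
assert simulate_termination_ring([{7}, {7, 9}, {7}], num_passes=3) == [[], [7], []]

In this simplified trace a request is freed only once every rank has voted for it, which mirrors why the handler can terminate requests synchronously without sending the ready map back to rank 0.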