diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu
index a72d8d95ee6..f32ce4c7d06 100644
--- a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu
+++ b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu
@@ -493,7 +493,12 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [
 #if !DISABLE_SYNC_FOR_PROFILING
         uint32_t expected_value = *ptrs.flag_val;
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+        // .acquire and .release qualifiers for fence instruction require sm_90 or higher.
         asm volatile("fence.release.sys;");
+#else
+        asm volatile("fence.acq_rel.sys;");
+#endif
 #pragma unroll 1 // No unroll as one iter is typically enough
         for (int target_rank = lane_id; target_rank < ep_size; target_rank += warpSize)
         {
@@ -525,7 +530,6 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [
                 flag_set = flag_value == expected_value;
             } while (!flag_set);
         }
-        // asm volatile("fence.acquire.sys;");
 #endif
     }
 }
@@ -1018,7 +1022,6 @@ __global__ void moeA2ACombineKernel(
     if (blockIdx.x == 0)
     {
-        // asm volatile("fence.release.sys;");
 #pragma unroll 1 // No unroll
         for (int peer_rank = lane_id; peer_rank < ep_size; peer_rank += warpSize)
         {
@@ -1050,7 +1053,12 @@ __global__ void moeA2ACombineKernel(
                 flag_set = flag_value == expected_value;
             } while (!flag_set);
         }
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+        // .acquire and .release qualifiers for fence instruction require sm_90 or higher.
         asm volatile("fence.acquire.sys;");
+#else
+        asm volatile("fence.acq_rel.sys;");
+#endif
     }
     __syncthreads();
 #endif
diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi
index 74e18b2cd2a..a6bf164d1ad 100644
--- a/docker/Dockerfile.multi
+++ b/docker/Dockerfile.multi
@@ -1,8 +1,8 @@
 # Multi-stage Dockerfile
 ARG BASE_IMAGE=nvcr.io/nvidia/pytorch
 ARG TRITON_IMAGE=nvcr.io/nvidia/tritonserver
-ARG BASE_TAG=25.10-py3
-ARG TRITON_BASE_TAG=25.10-py3
+ARG BASE_TAG=25.12-py3
+ARG TRITON_BASE_TAG=25.12-py3
 ARG DEVEL_IMAGE=devel
 FROM ${BASE_IMAGE}:${BASE_TAG} AS base
@@ -147,6 +147,7 @@ RUN --mount=type=cache,target=/root/.cache/pip --mount=type=bind,from=wheel,sour
     pip install /tmp/wheel/tensorrt_llm*.whl
 COPY README.md ./
+COPY --from=wheel /src/tensorrt_llm/build/tensorrt_llm*.whl ./
 COPY docs docs
 COPY cpp/include include
diff --git a/docker/Makefile b/docker/Makefile
index 519dbbda13d..67c0a36015c 100644
--- a/docker/Makefile
+++ b/docker/Makefile
@@ -202,17 +202,16 @@ jenkins-rockylinux8_%: PYTHON_VERSION_TAG_ID = $(if $(findstring 3.12,${PYTHON_V
 jenkins-rockylinux8_%: IMAGE_WITH_TAG = $(shell . ../jenkins/current_image_tags.properties && echo $$LLM_ROCKYLINUX8_${PYTHON_VERSION_TAG_ID}_DOCKER_IMAGE)
 jenkins-rockylinux8_%: STAGE = tritondevel
 jenkins-rockylinux8_%: BASE_IMAGE = nvcr.io/nvidia/cuda
-# [TODO] Update to NVIDIA CUDA 13.0.2 when it's available
-jenkins-rockylinux8_%: BASE_TAG = 13.0.1-devel-rockylinux8
+jenkins-rockylinux8_%: BASE_TAG = 13.1.0-devel-rockylinux8
 rockylinux8_%: STAGE = tritondevel
 rockylinux8_%: BASE_IMAGE = nvcr.io/nvidia/cuda
-rockylinux8_%: BASE_TAG = 13.0.1-devel-rockylinux8
+rockylinux8_%: BASE_TAG = 13.1.0-devel-rockylinux8
 # For x86_64 and aarch64
 ubuntu22_%: STAGE = tritondevel
 ubuntu22_%: BASE_IMAGE = nvcr.io/nvidia/cuda
-ubuntu22_%: BASE_TAG = 13.0.1-devel-ubuntu22.04
+ubuntu22_%: BASE_TAG = 13.1.0-devel-ubuntu22.04
 trtllm_%: STAGE = release
 trtllm_%: PUSH_TO_STAGING := 0
diff --git a/docker/common/install_cuda_toolkit.sh b/docker/common/install_cuda_toolkit.sh
index 0dc5cb305aa..555a3b348b8 100644
--- a/docker/common/install_cuda_toolkit.sh
+++ b/docker/common/install_cuda_toolkit.sh
@@ -5,7 +5,7 @@ set -ex
 # This script is used for reinstalling CUDA on Rocky Linux 8 with the run file.
 # CUDA version is usually aligned with the latest NGC CUDA image tag.
 # Only use when public CUDA image is not ready.
-CUDA_VER="13.0.2_580.95.05"
+CUDA_VER="13.1.0_590.44.01"
 CUDA_VER_SHORT="${CUDA_VER%_*}"
 NVCC_VERSION_OUTPUT=$(nvcc --version)
diff --git a/docker/common/install_polygraphy.sh b/docker/common/install_polygraphy.sh
index 315658d7a0d..da9df649546 100644
--- a/docker/common/install_polygraphy.sh
+++ b/docker/common/install_polygraphy.sh
@@ -5,7 +5,7 @@ set -ex
 if [ -n "${GITHUB_MIRROR}" ]; then
     export PIP_INDEX_URL="https://urm.nvidia.com/artifactory/api/pypi/pypi-remote/simple"
 fi
-pip3 install polygraphy==0.49.9
+pip3 install polygraphy==0.49.26
 # Clean up pip cache and temporary files
 pip3 cache purge
diff --git a/docker/common/install_pytorch.sh b/docker/common/install_pytorch.sh
index 069b26846c8..0cf13150199 100644
--- a/docker/common/install_pytorch.sh
+++ b/docker/common/install_pytorch.sh
@@ -5,7 +5,7 @@ set -ex
 # Use latest stable version from https://pypi.org/project/torch/#history
 # and closest to the version specified in
 # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10
-TORCH_VERSION="2.9.0"
+TORCH_VERSION="2.9.1"
 SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
 prepare_environment() {
diff --git a/docker/common/install_tensorrt.sh b/docker/common/install_tensorrt.sh
index 3887be6fa26..f6ae0f3872d 100644
--- a/docker/common/install_tensorrt.sh
+++ b/docker/common/install_tensorrt.sh
@@ -2,20 +2,20 @@ set -ex
-TRT_VER="10.13.3.9"
+TRT_VER="10.14.1.48"
 # Align with the pre-installed cuDNN / cuBLAS / NCCL versions from
-# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10
-CUDA_VER="13.0" # 13.0.2
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-12.html#rel-25-12
+CUDA_VER="13.1" # 13.1.0
 # Keep the installation for cuDNN if users want to install PyTorch with source codes.
 # PyTorch 2.x can compile with cuDNN v9.
-CUDNN_VER="9.14.0.64-1"
-NCCL_VER="2.27.7-1+cuda13.0"
-CUBLAS_VER="13.1.0.3-1"
+CUDNN_VER="9.17.0.29-1"
+NCCL_VER="2.28.9-1+cuda13.0"
+CUBLAS_VER="13.2.0.9-1"
 # Align with the pre-installed CUDA / NVCC / NVRTC versions from
 # https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html
-NVRTC_VER="13.0.88-1"
-CUDA_RUNTIME="13.0.96-1"
-CUDA_DRIVER_VERSION="580.95.05-1.el8"
+NVRTC_VER="13.1.80-1"
+CUDA_RUNTIME="13.1.80-1"
+CUDA_DRIVER_VERSION="590.44.01-1.el8"
 for i in "$@"; do
     case $i in
@@ -118,7 +118,12 @@ install_rockylinux_requirements() {
 install_tensorrt() {
     PY_VERSION=$(python3 -c 'import sys; print(".".join(map(str, sys.version_info[0:2])))')
     PARSED_PY_VERSION=$(echo "${PY_VERSION//./}")
+    TRT_CUDA_VERSION=${CUDA_VER}
+    # No cuda-13.1 version for TensorRT yet, use cuda-13.0 instead
+    if [ "$CUDA_VER" = "13.1" ]; then
+        TRT_CUDA_VERSION="13.0"
+    fi
     TRT_VER_SHORT=$(echo $TRT_VER | cut -d. -f1-3)
     if [ -z "$RELEASE_URL_TRT" ];then
diff --git a/jenkins/Build.groovy b/jenkins/Build.groovy
index 261c0a6d3a0..843e78f998c 100644
--- a/jenkins/Build.groovy
+++ b/jenkins/Build.groovy
@@ -83,19 +83,19 @@ def BUILD_CONFIGS = [
         (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake",
         (TARNAME) : "TensorRT-LLM-GH200.tar.gz",
         (WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
-        (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA
+        (BUILD_JOBS_FOR_CONFIG): "8", // TODO: Remove after fix the build OOM issue on SBSA
     ],
     (CONFIG_LINUX_AARCH64_PYBIND): [
         (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake",
         (TARNAME) : "pybind-TensorRT-LLM-GH200.tar.gz",
         (WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
-        (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA
+        (BUILD_JOBS_FOR_CONFIG): "8", // TODO: Remove after fix the build OOM issue on SBSA
     ],
     (CONFIG_LINUX_AARCH64_LLVM) : [
         (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CUDA_HOST_COMPILER=clang -DCMAKE_LINKER_TYPE=LLD",
         (TARNAME) : "llvm-TensorRT-LLM-GH200.tar.gz",
         (WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
-        (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA
+        (BUILD_JOBS_FOR_CONFIG): "8", // TODO: Remove after fix the build OOM issue on SBSA
     ],
 ]
diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy
index 3e81b22a099..324d4337a2f 100644
--- a/jenkins/L0_MergeRequest.groovy
+++ b/jenkins/L0_MergeRequest.groovy
@@ -1092,7 +1092,8 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
         }
         if (singleGpuTestFailed) {
-            if (env.JOB_NAME ==~ /.*PostMerge.*/) {
+            //if (env.JOB_NAME ==~ /.*PostMerge.*/) {
+            if (testFilter[(IS_POST_MERGE)] || env.JOB_NAME ==~ /.*PostMerge.*/) {
                 echo "In the official post-merge pipeline, x86_64 single-GPU test failed, whereas multi-GPU test is still kept running."
             } else {
                 stage("[Test-x86_64-Multi-GPU] Blocked") {
@@ -1196,7 +1197,8 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
         }
         if (singleGpuTestFailed) {
-            if (env.JOB_NAME ==~ /.*PostMerge.*/) {
+            // if (env.JOB_NAME ==~ /.*PostMerge.*/) {
+            if (testFilter[(IS_POST_MERGE)] || env.JOB_NAME ==~ /.*PostMerge.*/) {
                 echo "In the official post-merge pipeline, SBSA single-GPU test failed, whereas multi-GPU test is still kept running."
             } else {
                 stage("[Test-SBSA-Multi-GPU] Blocked") {
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index e39e4e012d9..69c354b51f3 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -39,7 +39,7 @@ LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = env.wheelDockerImagePy310
 LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = env.wheelDockerImagePy312
 // DLFW torch image
-DLFW_IMAGE = "urm.nvidia.com/docker/nvidia/pytorch:25.10-py3"
+DLFW_IMAGE = "urm.nvidia.com/docker/nvidia/pytorch:25.12-py3"
 //Ubuntu base image
 UBUNTU_22_04_IMAGE = "urm.nvidia.com/docker/ubuntu:22.04"
@@ -83,7 +83,7 @@ BUILD_CORES_REQUEST = "8"
 BUILD_CORES_LIMIT = "8"
 BUILD_MEMORY_REQUEST = "48Gi"
 BUILD_MEMORY_LIMIT = "96Gi"
-BUILD_JOBS = "4"
+BUILD_JOBS = "8"
 SLURM_CORES_REQUEST = "1"
 SLURM_CORES_LIMIT = "1"
@@ -316,6 +316,11 @@ def processShardTestList(llmSrc, testDBList, splitId, splits, perfMode=false) {
             foundRunningLine = true
             return false // Don't include the "Running" line itself
         }
+        // Stop collecting when we hit the warnings/errors summary separator
+        if (foundRunningLine && line.contains('======================')) {
+            foundRunningLine = false // Stop collecting
+            return false
+        }
         def hasDoubleColon = line.contains('::')
         def shouldInclude = foundRunningLine && hasDoubleColon
@@ -3329,7 +3334,7 @@ def launchTestJobs(pipeline, testFilter)
     // Python version and OS for sanity check
     x86SanityCheckConfigs = [
         "PY312-DLFW": [
-            LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE,
+            DLFW_IMAGE,
             "B200_PCIe",
             X86_64_TRIPLE,
             false,
@@ -3364,7 +3369,7 @@ def launchTestJobs(pipeline, testFilter)
             AARCH64_TRIPLE,
             false,
             "",
-            UBUNTU_24_04_IMAGE,
+            DLFW_IMAGE,
             true, // Extra PyTorch CUDA 13.0 install
         ],
         "PY312-DLFW": [
@@ -3464,7 +3469,7 @@ def launchTestJobs(pipeline, testFilter)
                 def platform = cpu_arch == X86_64_TRIPLE ? "x86_64" : "sbsa"
                 trtllm_utils.llmExecStepWithRetry(pipeline, script: "wget https://developer.download.nvidia.com/compute/cuda/repos/${ubuntu_version}/${platform}/cuda-keyring_1.1-1_all.deb")
                 trtllm_utils.llmExecStepWithRetry(pipeline, script: "dpkg -i cuda-keyring_1.1-1_all.deb")
-                trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y cuda-toolkit-13-0")
+                trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y cuda-toolkit-13-1")
             }
             // Extra PyTorch CUDA 13.0 install for all bare-metal environments (Default PyTorch is for CUDA 12.8)
             if (values[6]) {
@@ -3472,9 +3477,9 @@ def launchTestJobs(pipeline, testFilter)
                 // Use internal mirror instead of https://download.pytorch.org/whl/cu130 for better network stability.
                 // PyTorch CUDA 13.0 package and torchvision package can be installed as expected.
                 if (k8s_arch == "amd64") {
-                    trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.0+cu130 torchvision==0.24.0+cu130 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple")
+                    trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1+cu130 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple")
                 } else {
-                    trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.0+cu130 torchvision==0.24.0 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple")
+                    trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple")
                 }
             }
diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties
index 32643c10d3c..24e44e26fab 100644
--- a/jenkins/current_image_tags.properties
+++ b/jenkins/current_image_tags.properties
@@ -13,7 +13,7 @@
 # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
 IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm
-LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512241744-10055
-LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512241744-10055
-LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202512241744-10055
-LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202512241744-10055
+LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-x86_64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202601011103-9818
+LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-aarch64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202601011103-9818
+LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.14.1.48-skip-tritondevel-202601011103-9818
+LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.14.1.48-skip-tritondevel-202601011103-9818
diff --git a/requirements.txt b/requirements.txt
index a21b8ca2819..9c0b705829e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,13 +19,14 @@ pandas
 h5py==3.12.1
 StrEnum
 sentencepiece>=0.1.99
-tensorrt~=10.13.3
-# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10 uses 2.9.0a0.
-torch>=2.9.0a0,<=2.9.0
+tensorrt~=10.14.1
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-12.html#rel-25-12 uses 2.10.0a0.
+torch>=2.9.1,<=2.10.0a0
 torchvision
 nvidia-modelopt[torch]~=0.37.0
-# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10 uses 2.27.7
-nvidia-nccl-cu13==2.27.7
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-12.html#rel-25-12 uses 2.28.9
+# torch 2.9.1+cu130 depends on nvidia-nccl-cu13==2.27.7; platform_system == "Linux"
+nvidia-nccl-cu13>=2.27.7,<=2.28.9
 nvidia-cuda-nvrtc
 transformers==4.57.1
 prometheus_client
@@ -65,7 +66,7 @@ ninja
 etcd3 @ git+https://github.com/kragniz/python-etcd3.git@e58a899579ba416449c4e225b61f039457c8072a
 blake3
 soundfile
-triton==3.5.0
+triton==3.5.1
 tiktoken
 blobfile
 openai-harmony==0.0.4
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index a48b99d7edc..76d115326fb 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -495,6 +495,56 @@ examples/test_ray.py::test_ray_disaggregated_serving[tp2] SKIP (https://nvbugs/5
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] SKIP (https://nvbugs/5715568)
 unittest/executor/test_rpc_proxy.py SKIP (https://nvbugs/5605741)
 unittest/executor/test_rpc_worker.py SKIP (https://nvbugs/5605741)
+cpp/test_e2e.py::test_model[-redrafter-86] SKIP (https://nvbugs/5761642)
+unittest/_torch/auto_deploy/unit/singlegpu/models/test_hybrid_patches.py::test_bamba_patches[ibm-ai-platform/Bamba-9B-v2-True-False] SKIP (https://nvbugs/5761665)
+unittest/_torch/auto_deploy/unit/singlegpu/models/test_hybrid_patches.py::test_bamba_patches[ibm-ai-platform/Bamba-9B-v2-True] SKIP (https://nvbugs/5761665)
+unittest/_torch/auto_deploy/unit/singlegpu/models/test_hybrid_patches.py::test_bamba_patches[nvidia/NVIDIA-Nemotron-Nano-12B-v2-True-False] SKIP (https://nvbugs/5761665)
+unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_attention_matcher.py::test_match_repeat_kv[RepeatKVModel-8-4] SKIP (https://nvbugs/5761665)
+unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_attention_matcher.py::test_match_repeat_kv[RepeatKVModel-8-2] SKIP (https://nvbugs/5761665)
+unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_attention_matcher.py::test_match_repeat_kv[RepeatKVModel2-8-4] SKIP (https://nvbugs/5761665)
+unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_attention_matcher.py::test_match_repeat_kv[RepeatKVModel2-8-2] SKIP (https://nvbugs/5761665)
+unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_attention_matcher.py::test_match_repeat_kv[RepeatKVModel3-8-4] SKIP (https://nvbugs/5761665)
+unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_attention_matcher.py::test_match_repeat_kv[RepeatKVModel3-8-2] SKIP (https://nvbugs/5761665)
+unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_attention_matcher.py::test_match_repeat_kv[HFRepeatKVModel-8-4] SKIP (https://nvbugs/5761665)
+unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_attention_matcher.py::test_match_repeat_kv[HFRepeatKVModel-8-2] SKIP (https://nvbugs/5761665)
+unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_attention_matcher.py::test_counter_example SKIP (https://nvbugs/5761665)
+unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_attention_matcher_hf.py::test_match_llama_attention[eager-config1] SKIP (https://nvbugs/5761665)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scale_gemm[7-7168-2112] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scale_gemm[7-2048-7168] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scale_gemm[16-7168-2112] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scale_gemm[16-2048-7168] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scale_gemm[64-7168-2112] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scale_gemm[64-2048-7168] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scale_gemm[128-7168-2112] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scale_gemm[128-2048-7168] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scale_gemm[4096-7168-2112] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scale_gemm[4096-2048-7168] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scaling_moe_gemm[7168-4096-ms0] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scaling_moe_gemm[7168-4096-ms1] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scaling_moe_gemm[7168-4096-ms2] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scaling_moe_gemm[7168-4096-ms3] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scaling_moe_gemm[2048-7168-ms0] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scaling_moe_gemm[2048-7168-ms1] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scaling_moe_gemm[2048-7168-ms2] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scaling_moe_gemm[2048-7168-ms3] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scaling_bmm[7168-4096-1-1024] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scaling_bmm[7168-4096-2-512] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scaling_bmm[7168-4096-4-256] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scaling_bmm[2048-7168-1-1024] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scaling_bmm[2048-7168-2-512] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/deep_gemm_tests.py::test_fp8_block_scaling_bmm[2048-7168-4-256] SKIP (https://nvbugs/5761699)
+unittest/_torch/thop/parallel/test_fp8_block_scale_gemm.py::test_deep_gemm_in_subprocess[env2] SKIP (https://nvbugs/5766853)
+accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_auto_dtype SKIP (https://nvbugs/5766864)
+unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_dist_backend.py::test_dist_backend_all_gather[torch] SKIP (https://nvbugs/5766986)
+unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_dist_backend.py::test_dist_backend_all_gather[trtllm] SKIP (https://nvbugs/5766986)
+unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_tp_sharding.py::test_sharding[Linear-torch_dist_all_gather-False-False-2] SKIP (https://nvbugs/5766982)
+unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_tp_sharding.py::test_sharding[Linear-torch_dist_all_gather-False-True-2] SKIP (https://nvbugs/5766982)
+unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_tp_sharding.py::test_sharding[Linear-torch_dist_all_gather-True-False-2] SKIP (https://nvbugs/5766982)
+unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_tp_sharding.py::test_sharding[Linear-torch_dist_all_gather-True-True-2] SKIP (https://nvbugs/5766982)
+unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_tp_sharding.py::test_sharding_pattern_detection[Linear-torch_dist_all_gather-False-False-8] SKIP (https://nvbugs/5766974)
+unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_tp_sharding.py::test_sharding_pattern_detection[Linear-torch_dist_all_gather-False-True-8] SKIP (https://nvbugs/5766974)
+unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_tp_sharding.py::test_sharding_pattern_detection[Linear-torch_dist_all_gather-True-False-8] SKIP (https://nvbugs/5766974)
+unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_tp_sharding.py::test_sharding_pattern_detection[Linear-torch_dist_all_gather-True-True-8] SKIP (https://nvbugs/5766974)
 triton_server/test_triton.py::test_gpt_gather_logits[gpt-gather-logits] SKIP (https://nvbugs/5766960)
 stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-pytorch-stress-test] SKIP (https://nvbugs/5766952)
 accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_bf16_4gpu[tp4ep4_cudagraph_overlap] SKIP (https://nvbugs/5722618)