diff --git a/ATTRIBUTIONS-Python.md b/ATTRIBUTIONS-Python.md index 3cff1f73986..18a7068c9a0 100644 --- a/ATTRIBUTIONS-Python.md +++ b/ATTRIBUTIONS-Python.md @@ -62379,7 +62379,7 @@ Copyright 2018- The Hugging Face team. All rights reserved. - `Homepage`: https://github.com/huggingface/transformers -## triton (3.5.0) +## triton (3.5.1) ### Licenses License: `MIT License` @@ -62417,6 +62417,40 @@ License: `MIT License` - `Homepage`: https://github.com/triton-lang/triton/ +## triton-kernels (3.5.1) + +### Licenses +License: `MIT License` + + - `LICENSE` (from triton repository root): +``` +Copyright 2018-2020 Philippe Tillet +Copyright 2020-2022 OpenAI + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files +(the "Software"), to deal in the Software without restriction, +including without limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of the Software, +and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +``` + +### URLs + - `Source`: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels + + ## tritonclient (2.63.0) ### Licenses diff --git a/examples/models/core/gpt_oss/README.md b/examples/models/core/gpt_oss/README.md index 85cb21f6ebe..22761c10e99 100644 --- a/examples/models/core/gpt_oss/README.md +++ b/examples/models/core/gpt_oss/README.md @@ -107,33 +107,10 @@ Once again, the function call works successfully, this time using a different fu ## Using OpenAI Triton Kernels for MoE -OpenAI ships a set of Triton kernels optimized for its MoE models. TensorRT-LLM can leverage these kernels; enable them with the steps below: -1. **Build and install Triton** (tested with the commit below): +OpenAI ships a set of Triton kernels optimized for its MoE models. -```bash -git clone https://github.com/triton-lang/triton.git -cd triton -# Specific commit verified with TensorRT-LLM -git checkout f3067cd3bd0c29065fa4ecdb724b6f29cbabea5f -pip install -r python/requirements.txt # build-time dependencies -pip install wheel build -python3 setup.py bdist_wheel -pip install ./dist/*.whl -``` - -2. **Expose the Triton kernels to TensorRT-LLM** - The kernels are not packaged in the wheel, so set the environment variable `TRITON_ROOT` to your Triton clone: - -```bash -export TRITON_ROOT=/local/user/triton -# TensorRT-LLM expects the kernels at: -# $TRITON_ROOT/python/triton_kernels -``` - -3. **Select Triton as the MoE backend** - -• **trtllm-serve** (or other similar commands) — add this snippet to the YAML file passed via `--config`: +To use the Triton MoE backend with **trtllm-serve** (or other similar commands), add this snippet to the YAML file passed via `--config`: ```yaml moe_config: diff --git a/requirements.txt b/requirements.txt index 2e789cbc7f6..1062163e155 100644 --- a/requirements.txt +++ b/requirements.txt @@ -67,6 +67,8 @@ etcd3 @ git+https://github.com/kragniz/python-etcd3.git@e58a899579ba416449c4e225 blake3 soundfile triton==3.5.1 +# NOTE: the triton-kernels version should be aligned with the triton version above +triton-kernels @ git+https://github.com/triton-lang/triton.git@v3.5.1#subdirectory=python/triton_kernels tiktoken blobfile openai-harmony==0.0.4 diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/mxfp4_moe.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/mxfp4_moe.py index d47d44378d9..c0a55d6c235 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/mxfp4_moe.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/mxfp4_moe.py @@ -4,36 +4,15 @@ import torch import torch.nn.functional as F - -IS_TRITON_KERNELS_AVAILABLE = True -TRITON_KERNELS_UNAVAILABLE_REASON = "" - -try: - from triton_kernels.matmul_ogs import ( - FlexCtx, - FnSpecs, - FusedActivation, - PrecisionConfig, - matmul_ogs, - ) - from triton_kernels.numerics import InFlexData - from triton_kernels.routing import RoutingData, routing - from triton_kernels.swiglu import swiglu_fn - from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor - from triton_kernels.tensor_details import layout - from triton_kernels.tensor_details.layout import StridedLayout - - from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import TritonEPRouter - -except Exception as _e: - IS_TRITON_KERNELS_AVAILABLE = False - TRITON_KERNELS_UNAVAILABLE_REASON = f"{type(_e).__name__}: {_e}" - - FlexCtx = FnSpecs = FusedActivation = PrecisionConfig = matmul_ogs = None - InFlexData = RoutingData = routing = swiglu_fn = None - FP4 = convert_layout = wrap_torch_tensor = None - layout = StridedLayout = None - TritonEPRouter = None +from triton_kernels.matmul_ogs import FlexCtx, FnSpecs, FusedActivation, PrecisionConfig, matmul_ogs +from triton_kernels.numerics import InFlexData +from triton_kernels.routing import RoutingData, routing +from triton_kernels.swiglu import swiglu_fn +from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor +from triton_kernels.tensor_details import layout +from triton_kernels.tensor_details.layout import StridedLayout + +from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import TritonEPRouter # copied from transformers.integrations.mxfp4::swizzle_mxfp4 with minor modification diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/mxfp4_moe.py b/tensorrt_llm/_torch/auto_deploy/transform/library/mxfp4_moe.py index b60a9f875c6..4113732111a 100644 --- a/tensorrt_llm/_torch/auto_deploy/transform/library/mxfp4_moe.py +++ b/tensorrt_llm/_torch/auto_deploy/transform/library/mxfp4_moe.py @@ -9,7 +9,6 @@ register_ad_pattern, ) -from ...custom_ops.fused_moe.mxfp4_moe import IS_TRITON_KERNELS_AVAILABLE from ...utils.module import get_submodule_of_param from ...utils.node_utils import is_op from ..interface import BaseTransform, TransformInfo, TransformRegistry @@ -220,11 +219,7 @@ def _apply( shared_config, ) -> Tuple[GraphModule, TransformInfo]: qcfg = factory.get_quant_config() - if ( - not qcfg - or qcfg.get("quant_method", "") != self.algo_name - or not IS_TRITON_KERNELS_AVAILABLE - ): + if not qcfg or qcfg.get("quant_method", "") != self.algo_name: return gm, TransformInfo( skipped=True, num_matches=0, is_clean=True, has_valid_shapes=True ) diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py index 22d95411bc1..c256b1313c9 100755 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py @@ -1,32 +1,19 @@ from __future__ import annotations import os -import sys from typing import Dict, List, NamedTuple, Optional import torch import torch.nn as nn import triton import triton.language as tl - -IS_TRITON_KERNELS_AVAILABLE = False -# We expect to find triton_kernels under $TRITON_ROOT/python/triton_kernels -# Triton upstream commit f3067cd3bd0c29065fa4ecdb724b6f29cbabea5f has been verified. -triton_root = os.getenv('TRITON_ROOT') -if triton_root: - triton_root = os.path.abspath( - os.path.join(triton_root, 'python', 'triton_kernels')) - if os.path.exists(triton_root) and triton_root not in sys.path: - sys.path.insert(0, triton_root) - assert triton.__version__ >= "3.4.0", "Triton kernels are detected but the Triton wheel is too old" - import triton_kernels.swiglu - from triton_kernels.matmul_ogs import (FlexCtx, FnSpecs, FusedActivation, - PrecisionConfig, matmul_ogs) - from triton_kernels.numerics import InFlexData - from triton_kernels.numerics_details.mxfp import downcast_to_mxfp_torch - from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor - from triton_kernels.tensor_details import layout - IS_TRITON_KERNELS_AVAILABLE = True +import triton_kernels.swiglu +from triton_kernels.matmul_ogs import (FlexCtx, FnSpecs, FusedActivation, + PrecisionConfig, matmul_ogs) +from triton_kernels.numerics import InFlexData +from triton_kernels.numerics_details.mxfp import downcast_to_mxfp_torch +from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor +from triton_kernels.tensor_details import layout from ...model_config import ModelConfig from ..linear import TensorParallelMode, load_weight_shard @@ -214,11 +201,16 @@ def create_weights(self, module: torch.nn.Module): module.intermediate_size_per_partition, module.hidden_size, ) + # Bias shapes use the output dimension (last dim) of the transposed weight shapes + w3_w1_bias_shape = (w3_w1_weight_shape[0], w3_w1_weight_shape[2]) + w2_bias_shape = (w2_weight_shape[0], w2_weight_shape[2]) super().create_weights(module, weight_dtype, w3_w1_weight_shape, w2_weight_shape, - bias_dtype=torch.float32) + bias_dtype=torch.float32, + w3_w1_bias_shape=w3_w1_bias_shape, + w2_bias_shape=w2_bias_shape) self.setup_quant_scales(module) def setup_quant_scales(self, module: torch.nn.Module): @@ -404,12 +396,17 @@ def create_weights(self, module: torch.nn.Module): module.intermediate_size_per_partition, module.hidden_size, ) + # Bias shapes use the output dimension (last dim) of the transposed weight shapes + w3_w1_bias_shape = (w3_w1_weight_shape[0], w3_w1_weight_shape[2]) + w2_bias_shape = (w2_weight_shape[0], w2_weight_shape[2]) FusedMoEMethodBase.create_weights(self, module, weight_dtype, w3_w1_weight_shape, w2_weight_shape, - bias_dtype=torch.float32) + bias_dtype=torch.float32, + w3_w1_bias_shape=w3_w1_bias_shape, + w2_bias_shape=w2_bias_shape) fc31_dequant = nn.Parameter(torch.empty( module.expert_size_per_partition, dtype=torch.float32), @@ -1295,8 +1292,6 @@ def __init__( weight_loading_mode=weight_loading_mode, layer_idx=layer_idx, ) - if not IS_TRITON_KERNELS_AVAILABLE: - raise ImportError("Triton kernels are not available.") if torch.cuda.get_device_capability()[0] != 9 and self.ep_size > 1: raise NotImplementedError( "TritonFusedMoE is only supported on Hopper with EP size > 1.") diff --git a/tensorrt_llm/_torch/modules/triton_linear.py b/tensorrt_llm/_torch/modules/triton_linear.py index dfc3d584e64..f22f35bd0c8 100644 --- a/tensorrt_llm/_torch/modules/triton_linear.py +++ b/tensorrt_llm/_torch/modules/triton_linear.py @@ -4,20 +4,15 @@ import torch from torch.nn.parameter import Parameter +from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig, matmul_ogs +from triton_kernels.numerics import InFlexData from tensorrt_llm._torch.peft.lora.layer import LoraLayer from tensorrt_llm.mapping import Mapping from ...models.modeling_utils import QuantConfig -# Reuse the common Triton import setup -from .fused_moe.fused_moe_triton import (IS_TRITON_KERNELS_AVAILABLE, - maybe_update_stride, +from .fused_moe.fused_moe_triton import (maybe_update_stride, swizzle_weight_and_scale) - -if IS_TRITON_KERNELS_AVAILABLE: - from triton_kernels.matmul_ogs import (FlexCtx, PrecisionConfig, matmul_ogs) - from triton_kernels.numerics import InFlexData - from .linear import (Linear, LinearMethodBase, TensorParallelMode, WeightsLoadingConfig, copy_weight, load_weight_shard, load_weights_fused_gate_up_helper, @@ -383,9 +378,6 @@ def __init__( use_custom_cublas_mm: bool = False, lora: Optional[LoraLayer] = None, ): - if not IS_TRITON_KERNELS_AVAILABLE: - raise ImportError("Triton kernels are not available. " - "Please install the required dependencies.") assert not use_custom_cublas_mm, "TritonLinear does not support custom cublas mm." super().__init__( diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml index a4365a5ce9d..eafaeebdafd 100644 --- a/tests/integration/defs/accuracy/references/gsm8k.yaml +++ b/tests/integration/defs/accuracy/references/gsm8k.yaml @@ -234,10 +234,6 @@ microsoft/phi-4: accuracy: 90.64 mistralai/Codestral-22B-v0.1: - accuracy: 67.10 -GPT-OSS/BF16: - - accuracy: 90.3 - - kv_cache_quant_algo: FP8 - accuracy: 90.3 GPT-OSS/120B-MXFP4: - accuracy: 90.3 - spec_dec_algo: Eagle diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index fc4ba295209..19d1acedb0f 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -48,8 +48,6 @@ def patched_start_mpi_pool(self): from tensorrt_llm import LLM from tensorrt_llm._torch.model_config import MoeLoadBalancerConfig -from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import \ - IS_TRITON_KERNELS_AVAILABLE from tensorrt_llm.llmapi import (AutoDecodingConfig, CudaGraphConfig, DeepSeekSparseAttentionConfig, EagleDecodingConfig, KvCacheConfig, MoeConfig, @@ -3614,7 +3612,10 @@ def test_nvfp4( task = GSM8K(self.MODEL_NAME) task.evaluate(llm) - @pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRITON", "TRTLLM"]) + @pytest.mark.parametrize( + "moe_backend", + ["CUTLASS", + pytest.param("TRITON", marks=skip_no_hopper), "TRTLLM"]) @pytest.mark.parametrize( "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler", [ (1, 1, 1, False, True, True), @@ -3645,11 +3646,6 @@ def test_w4a8_mxfp4(self, moe_backend, tp_size, pp_size, ep_size, patch_mpi_pool_session_for_env(mocker, {"ENABLE_CONFIGURABLE_MOE": env_value}) - if moe_backend == "TRITON": - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("TRITON moe backend is not available.") - if get_sm_version() < 90: - pytest.skip("TRITON moe backend requires Hopper or newer.") if moe_backend in ["CUTLASS", "TRTLLM"] and get_sm_version() < 100: pytest.skip( "CUTLASS or TRTLLM moe backend requires Blackwell or newer.") @@ -4107,11 +4103,12 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): @pytest.mark.parametrize( "kv_cache_dtype", ["auto", pytest.param("fp8", marks=skip_pre_blackwell)]) - @pytest.mark.parametrize( - "moe_backend", - ["CUTLASS", - pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"], - ids=["cutlass", "trtllm", "triton"]) + @pytest.mark.parametrize("moe_backend", [ + "CUTLASS", + pytest.param("TRTLLM", marks=skip_pre_blackwell), + pytest.param("TRITON", marks=skip_no_hopper) + ], + ids=["cutlass", "trtllm", "triton"]) @pytest.mark.parametrize("cuda_graph,overlap_scheduler", [ (True, True), ]) @@ -4120,8 +4117,6 @@ def test_w4_1gpu(self, kv_cache_dtype, moe_backend, cuda_graph, mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192) mocker.patch.dict(GSM8K.EVALUATE_KWARGS, {"scores_filter": "exact_match,flexible-extract"}) - if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, @@ -4158,11 +4153,12 @@ def test_dummy_load_format(self): @pytest.mark.parametrize( "kv_cache_dtype", ["auto", pytest.param("fp8", marks=skip_pre_blackwell)]) - @pytest.mark.parametrize( - "moe_backend", - ["CUTLASS", - pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"], - ids=["cutlass", "trtllm", "triton"]) + @pytest.mark.parametrize("moe_backend", [ + "CUTLASS", + pytest.param("TRTLLM", marks=skip_pre_blackwell), + pytest.param("TRITON", marks=skip_no_hopper) + ], + ids=["cutlass", "trtllm", "triton"]) @pytest.mark.parametrize( "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler", [ (4, 1, 1, False, True, True), @@ -4191,10 +4187,6 @@ def test_w4_4gpus(self, kv_cache_dtype, moe_backend, tp_size, pp_size, patch_mpi_pool_session_for_env(mocker, {"ENABLE_CONFIGURABLE_MOE": env_value}) - if moe_backend == "TRITON": - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") - MAX_OUTPUT_LEN = 128179 MAX_INPUT_LEN = 32768 @@ -4252,11 +4244,12 @@ def test_w4_4gpus(self, kv_cache_dtype, moe_backend, tp_size, pp_size, extra_evaluator_kwargs=extra_evaluator_kwargs) @pytest.mark.skip_less_device(8) - @pytest.mark.parametrize( - "moe_backend", - ["CUTLASS", - pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"], - ids=["cutlass", "trtllm", "triton"]) + @pytest.mark.parametrize("moe_backend", [ + "CUTLASS", + pytest.param("TRTLLM", marks=skip_pre_blackwell), + pytest.param("TRITON", marks=skip_no_hopper) + ], + ids=["cutlass", "trtllm", "triton"]) @pytest.mark.parametrize( "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler", [ (8, 1, 1, False, True, True), @@ -4269,9 +4262,6 @@ def test_w4_8gpus(self, moe_backend, tp_size, pp_size, ep_size, mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192) mocker.patch.dict(GSM8K.EVALUATE_KWARGS, {"scores_filter": "exact_match,flexible-extract"}) - if moe_backend == "TRITON": - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, @@ -4293,6 +4283,7 @@ def test_w4_8gpus(self, moe_backend, tp_size, pp_size, ep_size, extra_evaluator_kwargs=self.extra_evaluator_kwargs) @pytest.mark.skip_less_device(4) + @skip_no_hopper @pytest.mark.parametrize( "kv_cache_dtype", ["auto", pytest.param("fp8", marks=skip_pre_blackwell)]) @@ -4307,14 +4298,8 @@ def test_w4a16(self, kv_cache_dtype, tp_size, pp_size, ep_size, mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192) mocker.patch.dict(GSM8K.EVALUATE_KWARGS, {"scores_filter": "exact_match,flexible-extract"}) - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") monkeypatch.setenv("OVERRIDE_QUANT_ALGO", "W4A16_MXFP4") - pytorch_config = dict( - disable_overlap_scheduler=not overlap_scheduler, - cuda_graph_config=CudaGraphConfig() if cuda_graph else None) - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5, dtype=kv_cache_dtype) @@ -4323,11 +4308,12 @@ def test_w4a16(self, kv_cache_dtype, tp_size, pp_size, ep_size, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, kv_cache_config=kv_cache_config, - **pytorch_config, + disable_overlap_scheduler=not overlap_scheduler, + cuda_graph_config=CudaGraphConfig() if cuda_graph else None, enable_attention_dp=attention_dp, moe_config=MoeConfig(backend="TRITON")) with llm: - model_name = "GPT-OSS/BF16" + model_name = "GPT-OSS/120B-MXFP4" task = GSM8K(model_name) task.evaluate(llm, extra_evaluator_kwargs=self.extra_evaluator_kwargs) @@ -4336,11 +4322,12 @@ def test_w4a16(self, kv_cache_dtype, tp_size, pp_size, ep_size, @pytest.mark.parametrize( "kv_cache_dtype", ["auto", pytest.param("fp8", marks=skip_pre_blackwell)]) - @pytest.mark.parametrize( - "moe_backend", - ["CUTLASS", - pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"], - ids=["cutlass", "trtllm", "triton"]) + @pytest.mark.parametrize("moe_backend", [ + "CUTLASS", + pytest.param("TRTLLM", marks=skip_pre_blackwell), + pytest.param("TRITON", marks=skip_no_hopper) + ], + ids=["cutlass", "trtllm", "triton"]) @pytest.mark.parametrize( "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler", [ (2, 1, 1, False, True, True), @@ -4351,10 +4338,6 @@ def test_w4a16(self, kv_cache_dtype, tp_size, pp_size, ep_size, def test_w4_2gpus(self, kv_cache_dtype, moe_backend, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler, mocker): - if moe_backend == "TRITON": - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") - pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, cuda_graph_config=CudaGraphConfig() if cuda_graph else None) @@ -4423,16 +4406,13 @@ def test_w4_2gpus_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, @pytest.mark.parametrize( "kv_cache_dtype", ["auto", pytest.param("fp8", marks=skip_pre_blackwell)]) - @pytest.mark.parametrize( - "moe_backend", - ["CUTLASS", - pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"], - ids=["cutlass", "trtllm", "triton"]) + @pytest.mark.parametrize("moe_backend", [ + "CUTLASS", + pytest.param("TRTLLM", marks=skip_pre_blackwell), + pytest.param("TRITON", marks=skip_no_hopper) + ], + ids=["cutlass", "trtllm", "triton"]) def test_w4_chunked_prefill(self, kv_cache_dtype, moe_backend, mocker): - if moe_backend == "TRITON": - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") - MAX_OUTPUT_LEN = 128179 MAX_INPUT_LEN = 32768 @@ -4492,17 +4472,14 @@ def test_w4_chunked_prefill(self, kv_cache_dtype, moe_backend, mocker): ids=["overlap_scheduler", "no_overlap_scheduler"]) @pytest.mark.parametrize("one_model", [True, False], ids=["one_model", "two_model"]) - @pytest.mark.parametrize( - "moe_backend", - ["CUTLASS", - pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"], - ids=["cutlass", "trtllm", "triton"]) + @pytest.mark.parametrize("moe_backend", [ + "CUTLASS", + pytest.param("TRTLLM", marks=skip_pre_blackwell), + pytest.param("TRITON", marks=skip_no_hopper) + ], + ids=["cutlass", "trtllm", "triton"]) def test_eagle3_4gpus(self, moe_backend, one_model, overlap_scheduler, mocker): - if moe_backend == "TRITON": - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") - if get_sm_version() == 90: pytest.skip( "https://nvbugs/5636916: Remaining Hopper Eagle Accuracy Issue for only TP=4" @@ -4684,17 +4661,14 @@ def test_eagle3_guided_decoding_4gpus(self, one_model, mocker): ids=["overlap_scheduler", "no_overlap_scheduler"]) @pytest.mark.parametrize("one_model", [True, False], ids=["one_model", "two_model"]) - @pytest.mark.parametrize( - "moe_backend", - ["CUTLASS", - pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"], - ids=["cutlass", "trtllm", "triton"]) + @pytest.mark.parametrize("moe_backend", [ + "CUTLASS", + pytest.param("TRTLLM", marks=skip_pre_blackwell), + pytest.param("TRITON", marks=skip_no_hopper) + ], + ids=["cutlass", "trtllm", "triton"]) def test_eagle3_2gpus(self, moe_backend, one_model, overlap_scheduler, mocker): - if moe_backend == "TRITON": - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") - MAX_OUTPUT_LEN = 128179 MAX_INPUT_LEN = 32768 diff --git a/tests/unittest/_torch/auto_deploy/unit/multigpu/custom_ops/test_mxfp4_moe_ep.py b/tests/unittest/_torch/auto_deploy/unit/multigpu/custom_ops/test_mxfp4_moe_ep.py index 211053a299c..12ef18218a7 100644 --- a/tests/unittest/_torch/auto_deploy/unit/multigpu/custom_ops/test_mxfp4_moe_ep.py +++ b/tests/unittest/_torch/auto_deploy/unit/multigpu/custom_ops/test_mxfp4_moe_ep.py @@ -4,10 +4,8 @@ import torch import torch.distributed as dist from _dist_test_utils import get_device_counts +from utils.util import skip_no_hopper -from tensorrt_llm._torch.auto_deploy.custom_ops.fused_moe.mxfp4_moe import ( - IS_TRITON_KERNELS_AVAILABLE, -) from tensorrt_llm._torch.auto_deploy.distributed.common import spawn_multiprocess_job @@ -109,10 +107,7 @@ def _rand_scales(shape): torch.testing.assert_close(part_out, ref_out, rtol=5e-2, atol=5e-2, equal_nan=True) -@pytest.mark.skipif( - not IS_TRITON_KERNELS_AVAILABLE, - reason="triton_kernels unavailable", -) +@skip_no_hopper @pytest.mark.parametrize("num_experts", [6, 8]) @pytest.mark.parametrize("topk", [4]) # must be <= num_experts @pytest.mark.parametrize("device_count", get_device_counts()) diff --git a/tests/unittest/_torch/modeling/test_modeling_gpt_oss.py b/tests/unittest/_torch/modeling/test_modeling_gpt_oss.py index 0e2b9745931..7d6bcc29e96 100644 --- a/tests/unittest/_torch/modeling/test_modeling_gpt_oss.py +++ b/tests/unittest/_torch/modeling/test_modeling_gpt_oss.py @@ -5,10 +5,9 @@ import pytest from transformers import AutoTokenizer from utils.llm_data import llm_models_root +from utils.util import skip_no_hopper from tensorrt_llm import LLM, SamplingParams -from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import \ - IS_TRITON_KERNELS_AVAILABLE from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig, MoeConfig configs = """ @@ -48,11 +47,10 @@ def dump_config_json(dst_dir): json.dump(json_configs, f, indent=2, ensure_ascii=False) -@pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRITON"]) +@pytest.mark.parametrize( + "moe_backend", + ["CUTLASS", pytest.param("TRITON", marks=skip_no_hopper)]) def test_gpt_oss_trtllmgen(moe_backend): - if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") - prompts = [ "How are you?", "Hello, my name is", diff --git a/tests/unittest/_torch/modules/test_fused_moe.py b/tests/unittest/_torch/modules/test_fused_moe.py index 773218a40c2..30e64a16f2f 100644 --- a/tests/unittest/_torch/modules/test_fused_moe.py +++ b/tests/unittest/_torch/modules/test_fused_moe.py @@ -19,9 +19,8 @@ from mpi4py.futures import MPIPoolExecutor from transformers.configuration_utils import PretrainedConfig from utils.util import (check_accuracy, skip_blackwell, skip_blackwell_geforce, - skip_neither_ada_nor_hopper_unittest, - skip_non_hopper_unittest, skip_pre_blackwell, - skip_pre_hopper) + skip_neither_ada_nor_hopper_unittest, skip_no_hopper, + skip_pre_blackwell, skip_pre_hopper) from tensorrt_llm._torch.autotuner import AutoTuner, autotune from tensorrt_llm._torch.model_config import ModelConfig @@ -41,8 +40,6 @@ from tensorrt_llm._torch.modules.fused_moe.quantization import \ NVFP4CutlassFusedMoEMethod # isort: on -from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import \ - IS_TRITON_KERNELS_AVAILABLE from tensorrt_llm._torch.modules.gated_mlp import GatedMLP from tensorrt_llm._utils import get_sm_version, mpi_rank from tensorrt_llm.mapping import Mapping @@ -92,8 +89,8 @@ def test_fused_moe(moe_backend, mapping=None): if moe_backend == "TRITON": - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") + if get_sm_version() != 90: + pytest.skip("TRITON moe backend is only supported on Hopper") if dtype != torch.bfloat16: pytest.skip("Unsupported for TritonFusedMoE") if routing_cls != RenormalizeMoeRoutingMethod: @@ -192,9 +189,9 @@ def test_fused_moe(moe_backend, # Evaluate outputs torch.cuda.synchronize() # There can be one off mismatch in the outputs due to different kernel implementations - # Here we check 99% of the outputs are within the tolerance + # Here we check most of the outputs are within the tolerance # The CutlassFusedMoE case fails as well without this change on H100 for bf16 - check_accuracy(output, ref_output, rtol=0.2, atol=0.2, percent=0.984) + check_accuracy(output, ref_output, rtol=0.2, atol=0.2, percent=0.975) m //= 2 @@ -514,7 +511,9 @@ def per_rank_test_fused_moe_alltoall(job_id, weights, x_list): @skip_pre_hopper -@pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRITON"]) +@pytest.mark.parametrize( + "moe_backend", + ["CUTLASS", pytest.param("TRITON", marks=skip_no_hopper)]) @pytest.mark.parametrize("routing_cls", [DefaultMoeRoutingMethod, RenormalizeMoeRoutingMethod]) @pytest.mark.parametrize("bias", [True, False]) @@ -522,8 +521,6 @@ def per_rank_test_fused_moe_alltoall(job_id, weights, x_list): def test_fused_moe_fp8(moe_backend, dtype, routing_cls, bias): if moe_backend == "TRITON": - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") if dtype != torch.bfloat16: pytest.skip("Unsupported for TritonFusedMoE") if routing_cls != RenormalizeMoeRoutingMethod: @@ -632,19 +629,30 @@ def test_fused_moe_fp8(moe_backend, dtype, routing_cls, bias): with torch.inference_mode(), autotune(): fused_moe.forward(x, router_logits) - # Explicitly capture context for kernel testing - with AutoTuner.get().capture() as all_tactics, torch.inference_mode(): - output = fused_moe.forward(x, router_logits) - - # Test all kernel tactics - for tactic in all_tactics: - with AutoTuner.get().replay(tactic), torch.inference_mode(): + # TRITON backend uses Triton kernels which don't register with AutoTuner + if moe_backend == "TRITON": + with torch.inference_mode(): + output = fused_moe.forward(x, router_logits) + check_accuracy(output, + ref_output, + rtol=0.04, + atol=0.1, + percent=0.99) + else: + # Explicitly capture context for kernel testing + with AutoTuner.get().capture() as all_tactics, torch.inference_mode( + ): output = fused_moe.forward(x, router_logits) - check_accuracy(output, - ref_output, - rtol=0.04, - atol=0.1, - percent=0.99) + + # Test all kernel tactics + for tactic in all_tactics: + with AutoTuner.get().replay(tactic), torch.inference_mode(): + output = fused_moe.forward(x, router_logits) + check_accuracy(output, + ref_output, + rtol=0.04, + atol=0.1, + percent=0.99) def set_tensor_value_2(x, num_row, num_cols): @@ -1174,7 +1182,7 @@ def test_fused_moe_fp8_blockwise_cute_dsl(dtype, return True -@skip_non_hopper_unittest +@skip_no_hopper @pytest.mark.parametrize( "dtype, num_experts, seq_len, hidden_size, RoutingMethodCls, WeightLoadingMode", product( @@ -1306,7 +1314,7 @@ def test_fused_moe_fp8_blockwise_cutlass(dtype, return True -@skip_non_hopper_unittest +@skip_no_hopper @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="needs 4 GPUs to run this test") @pytest.mark.parametrize("ep_size", [1, 2, 4]) @@ -2526,7 +2534,7 @@ def ref(): check_accuracy(output, ref_output, rtol=1e-2, atol=0.1, percent=0.99) -@skip_pre_hopper +@skip_no_hopper @pytest.mark.parametrize("experts", [8, 128]) @pytest.mark.parametrize( "hidden_size, intermediate_size", @@ -2542,12 +2550,8 @@ def ref(): @pytest.mark.parametrize("dynamic_quant", [True, False]) def test_fused_moe_triton_mxfp4(experts, hidden_size, intermediate_size, fp8_activation, bias, dynamic_quant): - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") - if torch.cuda.get_device_capability()[0] < 10 and fp8_activation: + if fp8_activation: pytest.skip("Latest Triton requires BF16 activation on Hopper") - if torch.cuda.get_device_capability()[0] >= 10 and not fp8_activation: - pytest.skip("Latest Triton requires FP8 activation on Blackwell") mapping = Mapping() mapping.rank = mpi_rank() diff --git a/tests/unittest/_torch/modules/test_triton_linear.py b/tests/unittest/_torch/modules/test_triton_linear.py index 2d5e87ae688..9b1229cf5d8 100644 --- a/tests/unittest/_torch/modules/test_triton_linear.py +++ b/tests/unittest/_torch/modules/test_triton_linear.py @@ -5,10 +5,8 @@ import pytest import torch from mpi4py import MPI -from utils.util import check_accuracy, skip_pre_hopper +from utils.util import check_accuracy, skip_no_hopper -from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import \ - IS_TRITON_KERNELS_AVAILABLE from tensorrt_llm._torch.modules.linear import Linear from tensorrt_llm._torch.modules.triton_linear import TritonLinear from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig @@ -21,11 +19,10 @@ ) -@pytest.mark.parametrize("linear_cls", [Linear, TritonLinear]) +@pytest.mark.parametrize( + "linear_cls", + [Linear, pytest.param(TritonLinear, marks=skip_no_hopper)]) def test_linear_unquantized(linear_cls): - if not IS_TRITON_KERNELS_AVAILABLE and linear_cls is TritonLinear: - pytest.skip("Triton kernels are not available") - torch.manual_seed(0) torch.cuda.manual_seed(0) num_tokens = 128 @@ -56,11 +53,10 @@ def test_linear_unquantized(linear_cls): check_accuracy(actual_c, reference_c, atol=0.01, rtol=0.01, percent=0.99) -@pytest.mark.parametrize("linear_cls", [Linear, TritonLinear]) +@pytest.mark.parametrize( + "linear_cls", + [Linear, pytest.param(TritonLinear, marks=skip_no_hopper)]) def test_linear_fp8qdq(linear_cls): - if not IS_TRITON_KERNELS_AVAILABLE and linear_cls is TritonLinear: - pytest.skip("Triton kernels are not available") - torch.manual_seed(0) torch.cuda.manual_seed(0) num_tokens = 128 @@ -100,18 +96,12 @@ def test_linear_fp8qdq(linear_cls): percent=0.99) -@skip_pre_hopper +@skip_no_hopper @pytest.mark.parametrize("activation_dtype", [torch.bfloat16, torch.float8_e4m3fn]) def test_linear_mxfp4(activation_dtype): - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") - if torch.cuda.get_device_capability( - )[0] < 10 and activation_dtype == torch.float8_e4m3fn: + if activation_dtype == torch.float8_e4m3fn: pytest.skip("Latest Triton requires BF16 activation on Hopper") - if torch.cuda.get_device_capability( - )[0] >= 10 and activation_dtype == torch.bfloat16: - pytest.skip("Latest Triton requires FP8 activation on Blackwell") dtype = torch.bfloat16 num_tokens = 128 diff --git a/tests/unittest/utils/util.py b/tests/unittest/utils/util.py index 2c731e9110b..828bf08ea55 100644 --- a/tests/unittest/utils/util.py +++ b/tests/unittest/utils/util.py @@ -106,6 +106,9 @@ def isSM100Family(): skip_pre_hopper = pytest.mark.skipif( getSMVersion() < 90, reason="This test is not supported in pre-Hopper architecture") +skip_no_hopper = pytest.mark.skipif( + getSMVersion() != 90, + reason="This test is only supported in Hopper architecture") skip_pre_blackwell = pytest.mark.skipif( getSMVersion() < 100, reason="This test is not supported in pre-Blackwell architecture")