diff --git a/ATTRIBUTIONS-Python.md b/ATTRIBUTIONS-Python.md
index 3cff1f73986..18a7068c9a0 100644
--- a/ATTRIBUTIONS-Python.md
+++ b/ATTRIBUTIONS-Python.md
@@ -62379,7 +62379,7 @@ Copyright 2018- The Hugging Face team. All rights reserved.
   - `Homepage`: https://github.com/huggingface/transformers
 
 
-## triton (3.5.0)
+## triton (3.5.1)
 
 ### Licenses
 License: `MIT License`
@@ -62417,6 +62417,40 @@ License: `MIT License`
   - `Homepage`: https://github.com/triton-lang/triton/
 
 
+## triton-kernels (3.5.1)
+
+### Licenses
+License: `MIT License`
+
+  - `LICENSE` (from triton repository root):
+```
+Copyright 2018-2020 Philippe Tillet
+Copyright 2020-2022 OpenAI
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files
+(the "Software"), to deal in the Software without restriction,
+including without limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of the Software,
+and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+```
+
+### URLs
+  - `Source`: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels
+
+
 ## tritonclient (2.63.0)
 
 ### Licenses
diff --git a/examples/models/core/gpt_oss/README.md b/examples/models/core/gpt_oss/README.md
index 85cb21f6ebe..22761c10e99 100644
--- a/examples/models/core/gpt_oss/README.md
+++ b/examples/models/core/gpt_oss/README.md
@@ -107,33 +107,10 @@ Once again, the function call works successfully, this time using a different fu
 
 ## Using OpenAI Triton Kernels for MoE
 
-OpenAI ships a set of Triton kernels optimized for its MoE models. TensorRT-LLM can leverage these kernels; enable them with the steps below:
 
-1. **Build and install Triton** (tested with the commit below):
+OpenAI ships a set of Triton kernels optimized for its MoE models.
 
-```bash
-git clone https://github.com/triton-lang/triton.git
-cd triton
-# Specific commit verified with TensorRT-LLM
-git checkout f3067cd3bd0c29065fa4ecdb724b6f29cbabea5f
-pip install -r python/requirements.txt          # build-time dependencies
-pip install wheel build
-python3 setup.py bdist_wheel
-pip install ./dist/*.whl
-```
-
-2. **Expose the Triton kernels to TensorRT-LLM**
-   The kernels are not packaged in the wheel, so set the environment variable `TRITON_ROOT` to your Triton clone:
-
-```bash
-export TRITON_ROOT=/local/user/triton
-# TensorRT-LLM expects the kernels at:
-#   $TRITON_ROOT/python/triton_kernels
-```
-
-3. **Select Triton as the MoE backend**
-
-• **trtllm-serve** (or other similar commands) — add this snippet to the YAML file passed via `--config`:
+To use the Triton MoE backend with **trtllm-serve** (or other similar commands), add this snippet to the YAML file passed via `--config`:
 
 ```yaml
 moe_config:
diff --git a/requirements.txt b/requirements.txt
index 2e789cbc7f6..1062163e155 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -67,6 +67,8 @@ etcd3 @ git+https://github.com/kragniz/python-etcd3.git@e58a899579ba416449c4e225
 blake3
 soundfile
 triton==3.5.1
+# NOTE: the triton-kernels version should be aligned with the triton version above
+triton-kernels @ git+https://github.com/triton-lang/triton.git@v3.5.1#subdirectory=python/triton_kernels
 tiktoken
 blobfile
 openai-harmony==0.0.4
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/mxfp4_moe.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/mxfp4_moe.py
index d47d44378d9..c0a55d6c235 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/mxfp4_moe.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/mxfp4_moe.py
@@ -4,36 +4,15 @@
 
 import torch
 import torch.nn.functional as F
-
-IS_TRITON_KERNELS_AVAILABLE = True
-TRITON_KERNELS_UNAVAILABLE_REASON = ""
-
-try:
-    from triton_kernels.matmul_ogs import (
-        FlexCtx,
-        FnSpecs,
-        FusedActivation,
-        PrecisionConfig,
-        matmul_ogs,
-    )
-    from triton_kernels.numerics import InFlexData
-    from triton_kernels.routing import RoutingData, routing
-    from triton_kernels.swiglu import swiglu_fn
-    from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor
-    from triton_kernels.tensor_details import layout
-    from triton_kernels.tensor_details.layout import StridedLayout
-
-    from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import TritonEPRouter
-
-except Exception as _e:
-    IS_TRITON_KERNELS_AVAILABLE = False
-    TRITON_KERNELS_UNAVAILABLE_REASON = f"{type(_e).__name__}: {_e}"
-
-    FlexCtx = FnSpecs = FusedActivation = PrecisionConfig = matmul_ogs = None
-    InFlexData = RoutingData = routing = swiglu_fn = None
-    FP4 = convert_layout = wrap_torch_tensor = None
-    layout = StridedLayout = None
-    TritonEPRouter = None
+from triton_kernels.matmul_ogs import FlexCtx, FnSpecs, FusedActivation, PrecisionConfig, matmul_ogs
+from triton_kernels.numerics import InFlexData
+from triton_kernels.routing import RoutingData, routing
+from triton_kernels.swiglu import swiglu_fn
+from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor
+from triton_kernels.tensor_details import layout
+from triton_kernels.tensor_details.layout import StridedLayout
+
+from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import TritonEPRouter
 
 
 # copied from transformers.integrations.mxfp4::swizzle_mxfp4 with minor modification
diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/mxfp4_moe.py b/tensorrt_llm/_torch/auto_deploy/transform/library/mxfp4_moe.py
index b60a9f875c6..4113732111a 100644
--- a/tensorrt_llm/_torch/auto_deploy/transform/library/mxfp4_moe.py
+++ b/tensorrt_llm/_torch/auto_deploy/transform/library/mxfp4_moe.py
@@ -9,7 +9,6 @@
     register_ad_pattern,
 )
 
-from ...custom_ops.fused_moe.mxfp4_moe import IS_TRITON_KERNELS_AVAILABLE
 from ...utils.module import get_submodule_of_param
 from ...utils.node_utils import is_op
 from ..interface import BaseTransform, TransformInfo, TransformRegistry
@@ -220,11 +219,7 @@ def _apply(
         shared_config,
     ) -> Tuple[GraphModule, TransformInfo]:
         qcfg = factory.get_quant_config()
-        if (
-            not qcfg
-            or qcfg.get("quant_method", "") != self.algo_name
-            or not IS_TRITON_KERNELS_AVAILABLE
-        ):
+        if not qcfg or qcfg.get("quant_method", "") != self.algo_name:
             return gm, TransformInfo(
                 skipped=True, num_matches=0, is_clean=True, has_valid_shapes=True
             )
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py
index 22d95411bc1..c256b1313c9 100755
--- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py
@@ -1,32 +1,19 @@
 from __future__ import annotations
 
 import os
-import sys
 from typing import Dict, List, NamedTuple, Optional
 
 import torch
 import torch.nn as nn
 import triton
 import triton.language as tl
-
-IS_TRITON_KERNELS_AVAILABLE = False
-# We expect to find triton_kernels under $TRITON_ROOT/python/triton_kernels
-# Triton upstream commit f3067cd3bd0c29065fa4ecdb724b6f29cbabea5f has been verified.
-triton_root = os.getenv('TRITON_ROOT')
-if triton_root:
-    triton_root = os.path.abspath(
-        os.path.join(triton_root, 'python', 'triton_kernels'))
-    if os.path.exists(triton_root) and triton_root not in sys.path:
-        sys.path.insert(0, triton_root)
-    assert triton.__version__ >= "3.4.0", "Triton kernels are detected but the Triton wheel is too old"
-    import triton_kernels.swiglu
-    from triton_kernels.matmul_ogs import (FlexCtx, FnSpecs, FusedActivation,
-                                           PrecisionConfig, matmul_ogs)
-    from triton_kernels.numerics import InFlexData
-    from triton_kernels.numerics_details.mxfp import downcast_to_mxfp_torch
-    from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor
-    from triton_kernels.tensor_details import layout
-    IS_TRITON_KERNELS_AVAILABLE = True
+import triton_kernels.swiglu
+from triton_kernels.matmul_ogs import (FlexCtx, FnSpecs, FusedActivation,
+                                       PrecisionConfig, matmul_ogs)
+from triton_kernels.numerics import InFlexData
+from triton_kernels.numerics_details.mxfp import downcast_to_mxfp_torch
+from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor
+from triton_kernels.tensor_details import layout
 
 from ...model_config import ModelConfig
 from ..linear import TensorParallelMode, load_weight_shard
@@ -214,11 +201,16 @@ def create_weights(self, module: torch.nn.Module):
             module.intermediate_size_per_partition,
             module.hidden_size,
         )
+        # Bias shapes use the output dimension (last dim) of the transposed weight shapes
+        w3_w1_bias_shape = (w3_w1_weight_shape[0], w3_w1_weight_shape[2])
+        w2_bias_shape = (w2_weight_shape[0], w2_weight_shape[2])
         super().create_weights(module,
                                weight_dtype,
                                w3_w1_weight_shape,
                                w2_weight_shape,
-                               bias_dtype=torch.float32)
+                               bias_dtype=torch.float32,
+                               w3_w1_bias_shape=w3_w1_bias_shape,
+                               w2_bias_shape=w2_bias_shape)
         self.setup_quant_scales(module)
 
     def setup_quant_scales(self, module: torch.nn.Module):
@@ -404,12 +396,17 @@ def create_weights(self, module: torch.nn.Module):
             module.intermediate_size_per_partition,
             module.hidden_size,
         )
+        # Bias shapes use the output dimension (last dim) of the transposed weight shapes
+        w3_w1_bias_shape = (w3_w1_weight_shape[0], w3_w1_weight_shape[2])
+        w2_bias_shape = (w2_weight_shape[0], w2_weight_shape[2])
         FusedMoEMethodBase.create_weights(self,
                                           module,
                                           weight_dtype,
                                           w3_w1_weight_shape,
                                           w2_weight_shape,
-                                          bias_dtype=torch.float32)
+                                          bias_dtype=torch.float32,
+                                          w3_w1_bias_shape=w3_w1_bias_shape,
+                                          w2_bias_shape=w2_bias_shape)
 
         fc31_dequant = nn.Parameter(torch.empty(
             module.expert_size_per_partition, dtype=torch.float32),
@@ -1295,8 +1292,6 @@ def __init__(
             weight_loading_mode=weight_loading_mode,
             layer_idx=layer_idx,
         )
-        if not IS_TRITON_KERNELS_AVAILABLE:
-            raise ImportError("Triton kernels are not available.")
         if torch.cuda.get_device_capability()[0] != 9 and self.ep_size > 1:
             raise NotImplementedError(
                 "TritonFusedMoE is only supported on Hopper with EP size > 1.")
diff --git a/tensorrt_llm/_torch/modules/triton_linear.py b/tensorrt_llm/_torch/modules/triton_linear.py
index dfc3d584e64..f22f35bd0c8 100644
--- a/tensorrt_llm/_torch/modules/triton_linear.py
+++ b/tensorrt_llm/_torch/modules/triton_linear.py
@@ -4,20 +4,15 @@
 
 import torch
 from torch.nn.parameter import Parameter
+from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig, matmul_ogs
+from triton_kernels.numerics import InFlexData
 
 from tensorrt_llm._torch.peft.lora.layer import LoraLayer
 from tensorrt_llm.mapping import Mapping
 
 from ...models.modeling_utils import QuantConfig
-# Reuse the common Triton import setup
-from .fused_moe.fused_moe_triton import (IS_TRITON_KERNELS_AVAILABLE,
-                                         maybe_update_stride,
+from .fused_moe.fused_moe_triton import (maybe_update_stride,
                                          swizzle_weight_and_scale)
-
-if IS_TRITON_KERNELS_AVAILABLE:
-    from triton_kernels.matmul_ogs import (FlexCtx, PrecisionConfig, matmul_ogs)
-    from triton_kernels.numerics import InFlexData
-
 from .linear import (Linear, LinearMethodBase, TensorParallelMode,
                      WeightsLoadingConfig, copy_weight, load_weight_shard,
                      load_weights_fused_gate_up_helper,
@@ -383,9 +378,6 @@ def __init__(
         use_custom_cublas_mm: bool = False,
         lora: Optional[LoraLayer] = None,
     ):
-        if not IS_TRITON_KERNELS_AVAILABLE:
-            raise ImportError("Triton kernels are not available. "
-                              "Please install the required dependencies.")
         assert not use_custom_cublas_mm, "TritonLinear does not support custom cublas mm."
 
         super().__init__(
diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
index a4365a5ce9d..eafaeebdafd 100644
--- a/tests/integration/defs/accuracy/references/gsm8k.yaml
+++ b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -234,10 +234,6 @@ microsoft/phi-4:
     accuracy: 90.64
 mistralai/Codestral-22B-v0.1:
   - accuracy: 67.10
-GPT-OSS/BF16:
-  - accuracy: 90.3
-  - kv_cache_quant_algo: FP8
-    accuracy: 90.3
 GPT-OSS/120B-MXFP4:
   - accuracy: 90.3
   - spec_dec_algo: Eagle
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index fc4ba295209..19d1acedb0f 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -48,8 +48,6 @@ def patched_start_mpi_pool(self):
 
 from tensorrt_llm import LLM
 from tensorrt_llm._torch.model_config import MoeLoadBalancerConfig
-from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import \
-    IS_TRITON_KERNELS_AVAILABLE
 from tensorrt_llm.llmapi import (AutoDecodingConfig, CudaGraphConfig,
                                  DeepSeekSparseAttentionConfig,
                                  EagleDecodingConfig, KvCacheConfig, MoeConfig,
@@ -3614,7 +3612,10 @@ def test_nvfp4(
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
-    @pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRITON", "TRTLLM"])
+    @pytest.mark.parametrize(
+        "moe_backend",
+        ["CUTLASS",
+         pytest.param("TRITON", marks=skip_no_hopper), "TRTLLM"])
     @pytest.mark.parametrize(
         "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler", [
             (1, 1, 1, False, True, True),
@@ -3645,11 +3646,6 @@ def test_w4a8_mxfp4(self, moe_backend, tp_size, pp_size, ep_size,
         patch_mpi_pool_session_for_env(mocker,
                                        {"ENABLE_CONFIGURABLE_MOE": env_value})
 
-        if moe_backend == "TRITON":
-            if not IS_TRITON_KERNELS_AVAILABLE:
-                pytest.skip("TRITON moe backend is not available.")
-            if get_sm_version() < 90:
-                pytest.skip("TRITON moe backend requires Hopper or newer.")
         if moe_backend in ["CUTLASS", "TRTLLM"] and get_sm_version() < 100:
             pytest.skip(
                 "CUTLASS or TRTLLM moe backend requires Blackwell or newer.")
@@ -4107,11 +4103,12 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
     @pytest.mark.parametrize(
         "kv_cache_dtype",
         ["auto", pytest.param("fp8", marks=skip_pre_blackwell)])
-    @pytest.mark.parametrize(
-        "moe_backend",
-        ["CUTLASS",
-         pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
-        ids=["cutlass", "trtllm", "triton"])
+    @pytest.mark.parametrize("moe_backend", [
+        "CUTLASS",
+        pytest.param("TRTLLM", marks=skip_pre_blackwell),
+        pytest.param("TRITON", marks=skip_no_hopper)
+    ],
+                             ids=["cutlass", "trtllm", "triton"])
     @pytest.mark.parametrize("cuda_graph,overlap_scheduler", [
         (True, True),
     ])
@@ -4120,8 +4117,6 @@ def test_w4_1gpu(self, kv_cache_dtype, moe_backend, cuda_graph,
         mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
         mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
                           {"scores_filter": "exact_match,flexible-extract"})
-        if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE:
-            pytest.skip("Triton kernels are not available")
 
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
@@ -4158,11 +4153,12 @@ def test_dummy_load_format(self):
     @pytest.mark.parametrize(
         "kv_cache_dtype",
         ["auto", pytest.param("fp8", marks=skip_pre_blackwell)])
-    @pytest.mark.parametrize(
-        "moe_backend",
-        ["CUTLASS",
-         pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
-        ids=["cutlass", "trtllm", "triton"])
+    @pytest.mark.parametrize("moe_backend", [
+        "CUTLASS",
+        pytest.param("TRTLLM", marks=skip_pre_blackwell),
+        pytest.param("TRITON", marks=skip_no_hopper)
+    ],
+                             ids=["cutlass", "trtllm", "triton"])
     @pytest.mark.parametrize(
         "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler", [
             (4, 1, 1, False, True, True),
@@ -4191,10 +4187,6 @@ def test_w4_4gpus(self, kv_cache_dtype, moe_backend, tp_size, pp_size,
         patch_mpi_pool_session_for_env(mocker,
                                        {"ENABLE_CONFIGURABLE_MOE": env_value})
 
-        if moe_backend == "TRITON":
-            if not IS_TRITON_KERNELS_AVAILABLE:
-                pytest.skip("Triton kernels are not available")
-
         MAX_OUTPUT_LEN = 128179
         MAX_INPUT_LEN = 32768
 
@@ -4252,11 +4244,12 @@ def test_w4_4gpus(self, kv_cache_dtype, moe_backend, tp_size, pp_size,
                           extra_evaluator_kwargs=extra_evaluator_kwargs)
 
     @pytest.mark.skip_less_device(8)
-    @pytest.mark.parametrize(
-        "moe_backend",
-        ["CUTLASS",
-         pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
-        ids=["cutlass", "trtllm", "triton"])
+    @pytest.mark.parametrize("moe_backend", [
+        "CUTLASS",
+        pytest.param("TRTLLM", marks=skip_pre_blackwell),
+        pytest.param("TRITON", marks=skip_no_hopper)
+    ],
+                             ids=["cutlass", "trtllm", "triton"])
     @pytest.mark.parametrize(
         "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler", [
             (8, 1, 1, False, True, True),
@@ -4269,9 +4262,6 @@ def test_w4_8gpus(self, moe_backend, tp_size, pp_size, ep_size,
         mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
         mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
                           {"scores_filter": "exact_match,flexible-extract"})
-        if moe_backend == "TRITON":
-            if not IS_TRITON_KERNELS_AVAILABLE:
-                pytest.skip("Triton kernels are not available")
 
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
@@ -4293,6 +4283,7 @@ def test_w4_8gpus(self, moe_backend, tp_size, pp_size, ep_size,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)
 
     @pytest.mark.skip_less_device(4)
+    @skip_no_hopper
     @pytest.mark.parametrize(
         "kv_cache_dtype",
         ["auto", pytest.param("fp8", marks=skip_pre_blackwell)])
@@ -4307,14 +4298,8 @@ def test_w4a16(self, kv_cache_dtype, tp_size, pp_size, ep_size,
         mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
         mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
                           {"scores_filter": "exact_match,flexible-extract"})
-        if not IS_TRITON_KERNELS_AVAILABLE:
-            pytest.skip("Triton kernels are not available")
         monkeypatch.setenv("OVERRIDE_QUANT_ALGO", "W4A16_MXFP4")
 
-        pytorch_config = dict(
-            disable_overlap_scheduler=not overlap_scheduler,
-            cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
-
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
                                         dtype=kv_cache_dtype)
 
@@ -4323,11 +4308,12 @@ def test_w4a16(self, kv_cache_dtype, tp_size, pp_size, ep_size,
                   pipeline_parallel_size=pp_size,
                   moe_expert_parallel_size=ep_size,
                   kv_cache_config=kv_cache_config,
-                  **pytorch_config,
+                  disable_overlap_scheduler=not overlap_scheduler,
+                  cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
                   enable_attention_dp=attention_dp,
                   moe_config=MoeConfig(backend="TRITON"))
         with llm:
-            model_name = "GPT-OSS/BF16"
+            model_name = "GPT-OSS/120B-MXFP4"
             task = GSM8K(model_name)
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)
@@ -4336,11 +4322,12 @@ def test_w4a16(self, kv_cache_dtype, tp_size, pp_size, ep_size,
     @pytest.mark.parametrize(
         "kv_cache_dtype",
         ["auto", pytest.param("fp8", marks=skip_pre_blackwell)])
-    @pytest.mark.parametrize(
-        "moe_backend",
-        ["CUTLASS",
-         pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
-        ids=["cutlass", "trtllm", "triton"])
+    @pytest.mark.parametrize("moe_backend", [
+        "CUTLASS",
+        pytest.param("TRTLLM", marks=skip_pre_blackwell),
+        pytest.param("TRITON", marks=skip_no_hopper)
+    ],
+                             ids=["cutlass", "trtllm", "triton"])
     @pytest.mark.parametrize(
         "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler", [
             (2, 1, 1, False, True, True),
@@ -4351,10 +4338,6 @@ def test_w4a16(self, kv_cache_dtype, tp_size, pp_size, ep_size,
     def test_w4_2gpus(self, kv_cache_dtype, moe_backend, tp_size, pp_size,
                       ep_size, attention_dp, cuda_graph, overlap_scheduler,
                       mocker):
-        if moe_backend == "TRITON":
-            if not IS_TRITON_KERNELS_AVAILABLE:
-                pytest.skip("Triton kernels are not available")
-
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
@@ -4423,16 +4406,13 @@ def test_w4_2gpus_nvfp4(self, tp_size, pp_size, ep_size, attention_dp,
     @pytest.mark.parametrize(
         "kv_cache_dtype",
         ["auto", pytest.param("fp8", marks=skip_pre_blackwell)])
-    @pytest.mark.parametrize(
-        "moe_backend",
-        ["CUTLASS",
-         pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
-        ids=["cutlass", "trtllm", "triton"])
+    @pytest.mark.parametrize("moe_backend", [
+        "CUTLASS",
+        pytest.param("TRTLLM", marks=skip_pre_blackwell),
+        pytest.param("TRITON", marks=skip_no_hopper)
+    ],
+                             ids=["cutlass", "trtllm", "triton"])
     def test_w4_chunked_prefill(self, kv_cache_dtype, moe_backend, mocker):
-        if moe_backend == "TRITON":
-            if not IS_TRITON_KERNELS_AVAILABLE:
-                pytest.skip("Triton kernels are not available")
-
         MAX_OUTPUT_LEN = 128179
         MAX_INPUT_LEN = 32768
 
@@ -4492,17 +4472,14 @@ def test_w4_chunked_prefill(self, kv_cache_dtype, moe_backend, mocker):
                              ids=["overlap_scheduler", "no_overlap_scheduler"])
     @pytest.mark.parametrize("one_model", [True, False],
                              ids=["one_model", "two_model"])
-    @pytest.mark.parametrize(
-        "moe_backend",
-        ["CUTLASS",
-         pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
-        ids=["cutlass", "trtllm", "triton"])
+    @pytest.mark.parametrize("moe_backend", [
+        "CUTLASS",
+        pytest.param("TRTLLM", marks=skip_pre_blackwell),
+        pytest.param("TRITON", marks=skip_no_hopper)
+    ],
+                             ids=["cutlass", "trtllm", "triton"])
     def test_eagle3_4gpus(self, moe_backend, one_model, overlap_scheduler,
                           mocker):
-        if moe_backend == "TRITON":
-            if not IS_TRITON_KERNELS_AVAILABLE:
-                pytest.skip("Triton kernels are not available")
-
         if get_sm_version() == 90:
             pytest.skip(
                 "https://nvbugs/5636916: Remaining Hopper Eagle Accuracy Issue for only TP=4"
@@ -4684,17 +4661,14 @@ def test_eagle3_guided_decoding_4gpus(self, one_model, mocker):
                              ids=["overlap_scheduler", "no_overlap_scheduler"])
     @pytest.mark.parametrize("one_model", [True, False],
                              ids=["one_model", "two_model"])
-    @pytest.mark.parametrize(
-        "moe_backend",
-        ["CUTLASS",
-         pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
-        ids=["cutlass", "trtllm", "triton"])
+    @pytest.mark.parametrize("moe_backend", [
+        "CUTLASS",
+        pytest.param("TRTLLM", marks=skip_pre_blackwell),
+        pytest.param("TRITON", marks=skip_no_hopper)
+    ],
+                             ids=["cutlass", "trtllm", "triton"])
     def test_eagle3_2gpus(self, moe_backend, one_model, overlap_scheduler,
                           mocker):
-        if moe_backend == "TRITON":
-            if not IS_TRITON_KERNELS_AVAILABLE:
-                pytest.skip("Triton kernels are not available")
-
         MAX_OUTPUT_LEN = 128179
         MAX_INPUT_LEN = 32768
 
diff --git a/tests/unittest/_torch/auto_deploy/unit/multigpu/custom_ops/test_mxfp4_moe_ep.py b/tests/unittest/_torch/auto_deploy/unit/multigpu/custom_ops/test_mxfp4_moe_ep.py
index 211053a299c..12ef18218a7 100644
--- a/tests/unittest/_torch/auto_deploy/unit/multigpu/custom_ops/test_mxfp4_moe_ep.py
+++ b/tests/unittest/_torch/auto_deploy/unit/multigpu/custom_ops/test_mxfp4_moe_ep.py
@@ -4,10 +4,8 @@
 import torch
 import torch.distributed as dist
 from _dist_test_utils import get_device_counts
+from utils.util import skip_no_hopper
 
-from tensorrt_llm._torch.auto_deploy.custom_ops.fused_moe.mxfp4_moe import (
-    IS_TRITON_KERNELS_AVAILABLE,
-)
 from tensorrt_llm._torch.auto_deploy.distributed.common import spawn_multiprocess_job
 
 
@@ -109,10 +107,7 @@ def _rand_scales(shape):
     torch.testing.assert_close(part_out, ref_out, rtol=5e-2, atol=5e-2, equal_nan=True)
 
 
-@pytest.mark.skipif(
-    not IS_TRITON_KERNELS_AVAILABLE,
-    reason="triton_kernels unavailable",
-)
+@skip_no_hopper
 @pytest.mark.parametrize("num_experts", [6, 8])
 @pytest.mark.parametrize("topk", [4])  # must be <= num_experts
 @pytest.mark.parametrize("device_count", get_device_counts())
diff --git a/tests/unittest/_torch/modeling/test_modeling_gpt_oss.py b/tests/unittest/_torch/modeling/test_modeling_gpt_oss.py
index 0e2b9745931..7d6bcc29e96 100644
--- a/tests/unittest/_torch/modeling/test_modeling_gpt_oss.py
+++ b/tests/unittest/_torch/modeling/test_modeling_gpt_oss.py
@@ -5,10 +5,9 @@
 import pytest
 from transformers import AutoTokenizer
 from utils.llm_data import llm_models_root
+from utils.util import skip_no_hopper
 
 from tensorrt_llm import LLM, SamplingParams
-from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import \
-    IS_TRITON_KERNELS_AVAILABLE
 from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig, MoeConfig
 
 configs = """
@@ -48,11 +47,10 @@ def dump_config_json(dst_dir):
         json.dump(json_configs, f, indent=2, ensure_ascii=False)
 
 
-@pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRITON"])
+@pytest.mark.parametrize(
+    "moe_backend",
+    ["CUTLASS", pytest.param("TRITON", marks=skip_no_hopper)])
 def test_gpt_oss_trtllmgen(moe_backend):
-    if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE:
-        pytest.skip("Triton kernels are not available")
-
     prompts = [
         "How are you?",
         "Hello, my name is",
diff --git a/tests/unittest/_torch/modules/test_fused_moe.py b/tests/unittest/_torch/modules/test_fused_moe.py
index 773218a40c2..30e64a16f2f 100644
--- a/tests/unittest/_torch/modules/test_fused_moe.py
+++ b/tests/unittest/_torch/modules/test_fused_moe.py
@@ -19,9 +19,8 @@
 from mpi4py.futures import MPIPoolExecutor
 from transformers.configuration_utils import PretrainedConfig
 from utils.util import (check_accuracy, skip_blackwell, skip_blackwell_geforce,
-                        skip_neither_ada_nor_hopper_unittest,
-                        skip_non_hopper_unittest, skip_pre_blackwell,
-                        skip_pre_hopper)
+                        skip_neither_ada_nor_hopper_unittest, skip_no_hopper,
+                        skip_pre_blackwell, skip_pre_hopper)
 
 from tensorrt_llm._torch.autotuner import AutoTuner, autotune
 from tensorrt_llm._torch.model_config import ModelConfig
@@ -41,8 +40,6 @@
 from tensorrt_llm._torch.modules.fused_moe.quantization import \
     NVFP4CutlassFusedMoEMethod
 # isort: on
-from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import \
-    IS_TRITON_KERNELS_AVAILABLE
 from tensorrt_llm._torch.modules.gated_mlp import GatedMLP
 from tensorrt_llm._utils import get_sm_version, mpi_rank
 from tensorrt_llm.mapping import Mapping
@@ -92,8 +89,8 @@ def test_fused_moe(moe_backend,
                    mapping=None):
 
     if moe_backend == "TRITON":
-        if not IS_TRITON_KERNELS_AVAILABLE:
-            pytest.skip("Triton kernels are not available")
+        if get_sm_version() != 90:
+            pytest.skip("TRITON moe backend is only supported on Hopper")
         if dtype != torch.bfloat16:
             pytest.skip("Unsupported for TritonFusedMoE")
         if routing_cls != RenormalizeMoeRoutingMethod:
@@ -192,9 +189,9 @@ def test_fused_moe(moe_backend,
         # Evaluate outputs
         torch.cuda.synchronize()
         # There can be one off mismatch in the outputs due to different kernel implementations
-        # Here we check 99% of the outputs are within the tolerance
+        # Here we check most of the outputs are within the tolerance
         # The CutlassFusedMoE case fails as well without this change on H100 for bf16
-        check_accuracy(output, ref_output, rtol=0.2, atol=0.2, percent=0.984)
+        check_accuracy(output, ref_output, rtol=0.2, atol=0.2, percent=0.975)
         m //= 2
 
 
@@ -514,7 +511,9 @@ def per_rank_test_fused_moe_alltoall(job_id, weights, x_list):
 
 
 @skip_pre_hopper
-@pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRITON"])
+@pytest.mark.parametrize(
+    "moe_backend",
+    ["CUTLASS", pytest.param("TRITON", marks=skip_no_hopper)])
 @pytest.mark.parametrize("routing_cls",
                          [DefaultMoeRoutingMethod, RenormalizeMoeRoutingMethod])
 @pytest.mark.parametrize("bias", [True, False])
@@ -522,8 +521,6 @@ def per_rank_test_fused_moe_alltoall(job_id, weights, x_list):
 def test_fused_moe_fp8(moe_backend, dtype, routing_cls, bias):
 
     if moe_backend == "TRITON":
-        if not IS_TRITON_KERNELS_AVAILABLE:
-            pytest.skip("Triton kernels are not available")
         if dtype != torch.bfloat16:
             pytest.skip("Unsupported for TritonFusedMoE")
         if routing_cls != RenormalizeMoeRoutingMethod:
@@ -632,19 +629,30 @@ def test_fused_moe_fp8(moe_backend, dtype, routing_cls, bias):
         with torch.inference_mode(), autotune():
             fused_moe.forward(x, router_logits)
 
-        # Explicitly capture context for kernel testing
-        with AutoTuner.get().capture() as all_tactics, torch.inference_mode():
-            output = fused_moe.forward(x, router_logits)
-
-        # Test all kernel tactics
-        for tactic in all_tactics:
-            with AutoTuner.get().replay(tactic), torch.inference_mode():
+        # TRITON backend uses Triton kernels which don't register with AutoTuner
+        if moe_backend == "TRITON":
+            with torch.inference_mode():
+                output = fused_moe.forward(x, router_logits)
+            check_accuracy(output,
+                           ref_output,
+                           rtol=0.04,
+                           atol=0.1,
+                           percent=0.99)
+        else:
+            # Explicitly capture context for kernel testing
+            with AutoTuner.get().capture() as all_tactics, torch.inference_mode(
+            ):
                 output = fused_moe.forward(x, router_logits)
-                check_accuracy(output,
-                               ref_output,
-                               rtol=0.04,
-                               atol=0.1,
-                               percent=0.99)
+
+            # Test all kernel tactics
+            for tactic in all_tactics:
+                with AutoTuner.get().replay(tactic), torch.inference_mode():
+                    output = fused_moe.forward(x, router_logits)
+                    check_accuracy(output,
+                                   ref_output,
+                                   rtol=0.04,
+                                   atol=0.1,
+                                   percent=0.99)
 
 
 def set_tensor_value_2(x, num_row, num_cols):
@@ -1174,7 +1182,7 @@ def test_fused_moe_fp8_blockwise_cute_dsl(dtype,
     return True
 
 
-@skip_non_hopper_unittest
+@skip_no_hopper
 @pytest.mark.parametrize(
     "dtype, num_experts, seq_len, hidden_size, RoutingMethodCls, WeightLoadingMode",
     product(
@@ -1306,7 +1314,7 @@ def test_fused_moe_fp8_blockwise_cutlass(dtype,
     return True
 
 
-@skip_non_hopper_unittest
+@skip_no_hopper
 @pytest.mark.skipif(torch.cuda.device_count() < 4,
                     reason="needs 4 GPUs to run this test")
 @pytest.mark.parametrize("ep_size", [1, 2, 4])
@@ -2526,7 +2534,7 @@ def ref():
         check_accuracy(output, ref_output, rtol=1e-2, atol=0.1, percent=0.99)
 
 
-@skip_pre_hopper
+@skip_no_hopper
 @pytest.mark.parametrize("experts", [8, 128])
 @pytest.mark.parametrize(
     "hidden_size, intermediate_size",
@@ -2542,12 +2550,8 @@ def ref():
 @pytest.mark.parametrize("dynamic_quant", [True, False])
 def test_fused_moe_triton_mxfp4(experts, hidden_size, intermediate_size,
                                 fp8_activation, bias, dynamic_quant):
-    if not IS_TRITON_KERNELS_AVAILABLE:
-        pytest.skip("Triton kernels are not available")
-    if torch.cuda.get_device_capability()[0] < 10 and fp8_activation:
+    if fp8_activation:
         pytest.skip("Latest Triton requires BF16 activation on Hopper")
-    if torch.cuda.get_device_capability()[0] >= 10 and not fp8_activation:
-        pytest.skip("Latest Triton requires FP8 activation on Blackwell")
 
     mapping = Mapping()
     mapping.rank = mpi_rank()
diff --git a/tests/unittest/_torch/modules/test_triton_linear.py b/tests/unittest/_torch/modules/test_triton_linear.py
index 2d5e87ae688..9b1229cf5d8 100644
--- a/tests/unittest/_torch/modules/test_triton_linear.py
+++ b/tests/unittest/_torch/modules/test_triton_linear.py
@@ -5,10 +5,8 @@
 import pytest
 import torch
 from mpi4py import MPI
-from utils.util import check_accuracy, skip_pre_hopper
+from utils.util import check_accuracy, skip_no_hopper
 
-from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import \
-    IS_TRITON_KERNELS_AVAILABLE
 from tensorrt_llm._torch.modules.linear import Linear
 from tensorrt_llm._torch.modules.triton_linear import TritonLinear
 from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig
@@ -21,11 +19,10 @@
 )
 
 
-@pytest.mark.parametrize("linear_cls", [Linear, TritonLinear])
+@pytest.mark.parametrize(
+    "linear_cls",
+    [Linear, pytest.param(TritonLinear, marks=skip_no_hopper)])
 def test_linear_unquantized(linear_cls):
-    if not IS_TRITON_KERNELS_AVAILABLE and linear_cls is TritonLinear:
-        pytest.skip("Triton kernels are not available")
-
     torch.manual_seed(0)
     torch.cuda.manual_seed(0)
     num_tokens = 128
@@ -56,11 +53,10 @@ def test_linear_unquantized(linear_cls):
     check_accuracy(actual_c, reference_c, atol=0.01, rtol=0.01, percent=0.99)
 
 
-@pytest.mark.parametrize("linear_cls", [Linear, TritonLinear])
+@pytest.mark.parametrize(
+    "linear_cls",
+    [Linear, pytest.param(TritonLinear, marks=skip_no_hopper)])
 def test_linear_fp8qdq(linear_cls):
-    if not IS_TRITON_KERNELS_AVAILABLE and linear_cls is TritonLinear:
-        pytest.skip("Triton kernels are not available")
-
     torch.manual_seed(0)
     torch.cuda.manual_seed(0)
     num_tokens = 128
@@ -100,18 +96,12 @@ def test_linear_fp8qdq(linear_cls):
                    percent=0.99)
 
 
-@skip_pre_hopper
+@skip_no_hopper
 @pytest.mark.parametrize("activation_dtype",
                          [torch.bfloat16, torch.float8_e4m3fn])
 def test_linear_mxfp4(activation_dtype):
-    if not IS_TRITON_KERNELS_AVAILABLE:
-        pytest.skip("Triton kernels are not available")
-    if torch.cuda.get_device_capability(
-    )[0] < 10 and activation_dtype == torch.float8_e4m3fn:
+    if activation_dtype == torch.float8_e4m3fn:
         pytest.skip("Latest Triton requires BF16 activation on Hopper")
-    if torch.cuda.get_device_capability(
-    )[0] >= 10 and activation_dtype == torch.bfloat16:
-        pytest.skip("Latest Triton requires FP8 activation on Blackwell")
 
     dtype = torch.bfloat16
     num_tokens = 128
diff --git a/tests/unittest/utils/util.py b/tests/unittest/utils/util.py
index 2c731e9110b..828bf08ea55 100644
--- a/tests/unittest/utils/util.py
+++ b/tests/unittest/utils/util.py
@@ -106,6 +106,9 @@ def isSM100Family():
 skip_pre_hopper = pytest.mark.skipif(
     getSMVersion() < 90,
     reason="This test is not supported in pre-Hopper architecture")
+skip_no_hopper = pytest.mark.skipif(
+    getSMVersion() != 90,
+    reason="This test is only supported in Hopper architecture")
 skip_pre_blackwell = pytest.mark.skipif(
     getSMVersion() < 100,
     reason="This test is not supported in pre-Blackwell architecture")