
Commit 3ffdba4

[None][feat] Include triton-kernels as a packaged dependency
Signed-off-by: Anish Shanbhag <[email protected]>
1 parent: bb6a397 · commit: 3ffdba4

12 files changed: +59 -155 lines

ATTRIBUTIONS-Python.md

Lines changed: 34 additions & 0 deletions
@@ -62417,6 +62417,40 @@ License: `MIT License`
 - `Homepage`: https://github.com/triton-lang/triton/
 
 
+## triton-kernels (1.0.0)
+
+### Licenses
+License: `MIT License`
+
+- `LICENSE` (from triton repository root):
+```
+Copyright 2018-2020 Philippe Tillet
+Copyright 2020-2022 OpenAI
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files
+(the "Software"), to deal in the Software without restriction,
+including without limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of the Software,
+and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+```
+
+### URLs
+- `Source`: https://github.com/triton-lang/triton/tree/f3067cd3bd0c29065fa4ecdb724b6f29cbabea5f/python/triton_kernels
+
+
 ## tritonclient (2.63.0)
 
 ### Licenses

examples/models/core/gpt_oss/README.md

Lines changed: 2 additions & 25 deletions
@@ -107,33 +107,10 @@ Once again, the function call works successfully, this time using a different fu
 
 ## Using OpenAI Triton Kernels for MoE
 
-OpenAI ships a set of Triton kernels optimized for its MoE models. TensorRT-LLM can leverage these kernels; enable them with the steps below:
 
-1. **Build and install Triton** (tested with the commit below):
+OpenAI ships a set of Triton kernels optimized for its MoE models.
 
-   ```bash
-   git clone https://github.com/triton-lang/triton.git
-   cd triton
-   # Specific commit verified with TensorRT-LLM
-   git checkout f3067cd3bd0c29065fa4ecdb724b6f29cbabea5f
-   pip install -r python/requirements.txt # build-time dependencies
-   pip install wheel build
-   python3 setup.py bdist_wheel
-   pip install ./dist/*.whl
-   ```
-
-2. **Expose the Triton kernels to TensorRT-LLM**
-   The kernels are not packaged in the wheel, so set the environment variable `TRITON_ROOT` to your Triton clone:
-
-   ```bash
-   export TRITON_ROOT=/local/user/triton
-   # TensorRT-LLM expects the kernels at:
-   # $TRITON_ROOT/python/triton_kernels
-   ```
-
-3. **Select Triton as the MoE backend**
-
-   **trtllm-serve** (or other similar commands) — add this snippet to the YAML file passed via `--config`:
+To use the Triton MoE backend with **trtllm-serve** (or other similar commands), add this snippet to the YAML file passed via `--config`:
 
 ```yaml
 moe_config:
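The hunk is truncated at the `moe_config:` key, so the backend selection itself is not visible above. A minimal sketch of what the referenced snippet presumably looks like (the `backend` key and the `TRITON` value are assumptions inferred from the `moe_backend == "TRITON"` checks in the tests further down, not something shown in this hunk):

```yaml
# Hypothetical completion of the truncated snippet; key names are assumed.
moe_config:
  backend: TRITON
```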

requirements.txt

Lines changed: 3 additions & 0 deletions
@@ -66,6 +66,9 @@ etcd3 @ git+https://github.com/kragniz/python-etcd3.git@e58a899579ba416449c4e225
 blake3
 soundfile
 triton==3.5.0
+# triton_kernels provides OpenAI's MoE kernels (matmul_ogs, routing, swiglu)
+# NOTE: the version below should be aligned with the triton version above
+triton-kernels @ git+https://github.com/triton-lang/[email protected]#subdirectory=python/triton_kernels
 tiktoken
 blobfile
 openai-harmony==0.0.4
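The new requirement installs `triton_kernels` from the `python/triton_kernels` subdirectory of the Triton repository at the `v3.5.0` tag, so it tracks the `triton==3.5.0` pin above. A minimal sanity-check sketch for the alignment the NOTE asks for; the version check mirrors the pin and is an assumption, not something this commit enforces:

```python
# Sanity check (hypothetical, not part of this commit): confirm the packaged
# triton_kernels resolves and the installed triton matches the pinned 3.5.x.
import triton
import triton_kernels  # noqa: F401  # fails fast if the wheel is missing

assert triton.__version__.startswith("3.5"), (
    f"triton {triton.__version__} does not match the pinned 3.5.x; "
    "triton-kernels is built from the same tag and should stay aligned")
print(f"triton {triton.__version__} and triton_kernels import OK")
```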

tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/mxfp4_moe.py

Lines changed: 9 additions & 30 deletions
@@ -4,36 +4,15 @@
 
 import torch
 import torch.nn.functional as F
-
-IS_TRITON_KERNELS_AVAILABLE = True
-TRITON_KERNELS_UNAVAILABLE_REASON = ""
-
-try:
-    from triton_kernels.matmul_ogs import (
-        FlexCtx,
-        FnSpecs,
-        FusedActivation,
-        PrecisionConfig,
-        matmul_ogs,
-    )
-    from triton_kernels.numerics import InFlexData
-    from triton_kernels.routing import RoutingData, routing
-    from triton_kernels.swiglu import swiglu_fn
-    from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor
-    from triton_kernels.tensor_details import layout
-    from triton_kernels.tensor_details.layout import StridedLayout
-
-    from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import TritonEPRouter
-
-except Exception as _e:
-    IS_TRITON_KERNELS_AVAILABLE = False
-    TRITON_KERNELS_UNAVAILABLE_REASON = f"{type(_e).__name__}: {_e}"
-
-    FlexCtx = FnSpecs = FusedActivation = PrecisionConfig = matmul_ogs = None
-    InFlexData = RoutingData = routing = swiglu_fn = None
-    FP4 = convert_layout = wrap_torch_tensor = None
-    layout = StridedLayout = None
-    TritonEPRouter = None
+from triton_kernels.matmul_ogs import FlexCtx, FnSpecs, FusedActivation, PrecisionConfig, matmul_ogs
+from triton_kernels.numerics import InFlexData
+from triton_kernels.routing import RoutingData, routing
+from triton_kernels.swiglu import swiglu_fn
+from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor
+from triton_kernels.tensor_details import layout
+from triton_kernels.tensor_details.layout import StridedLayout
+
+from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import TritonEPRouter
 
 
 # copied from transformers.integrations.mxfp4::swizzle_mxfp4 with minor modification
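With the try/except fallback removed, importing this module now raises immediately when `triton_kernels` is missing instead of recording the failure in `IS_TRITON_KERNELS_AVAILABLE`. If an environment still wants a cheap availability probe (for example, test collection on a machine without the wheel), a minimal sketch that avoids importing the package at all; this is hypothetical and not part of this commit:

```python
# Hypothetical availability probe; the packaged dependency makes it largely
# unnecessary, and find_spec() checks for the module without importing it.
import importlib.util

HAS_TRITON_KERNELS = importlib.util.find_spec("triton_kernels") is not None
```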

tensorrt_llm/_torch/auto_deploy/transform/library/mxfp4_moe.py

Lines changed: 1 addition & 6 deletions
@@ -9,7 +9,6 @@
     register_ad_pattern,
 )
 
-from ...custom_ops.fused_moe.mxfp4_moe import IS_TRITON_KERNELS_AVAILABLE
 from ...utils.module import get_submodule_of_param
 from ...utils.node_utils import is_op
 from ..interface import BaseTransform, TransformInfo, TransformRegistry
@@ -220,11 +219,7 @@ def _apply(
         shared_config,
     ) -> Tuple[GraphModule, TransformInfo]:
         qcfg = factory.get_quant_config()
-        if (
-            not qcfg
-            or qcfg.get("quant_method", "") != self.algo_name
-            or not IS_TRITON_KERNELS_AVAILABLE
-        ):
+        if not qcfg or qcfg.get("quant_method", "") != self.algo_name:
             return gm, TransformInfo(
                 skipped=True, num_matches=0, is_clean=True, has_valid_shapes=True
             )

tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py

Lines changed: 7 additions & 22 deletions
@@ -1,32 +1,19 @@
 from __future__ import annotations
 
 import os
-import sys
 from typing import Dict, List, NamedTuple, Optional
 
 import torch
 import torch.nn as nn
 import triton
 import triton.language as tl
-
-IS_TRITON_KERNELS_AVAILABLE = False
-# We expect to find triton_kernels under $TRITON_ROOT/python/triton_kernels
-# Triton upstream commit f3067cd3bd0c29065fa4ecdb724b6f29cbabea5f has been verified.
-triton_root = os.getenv('TRITON_ROOT')
-if triton_root:
-    triton_root = os.path.abspath(
-        os.path.join(triton_root, 'python', 'triton_kernels'))
-    if os.path.exists(triton_root) and triton_root not in sys.path:
-        sys.path.insert(0, triton_root)
-    assert triton.__version__ >= "3.4.0", "Triton kernels are detected but the Triton wheel is too old"
-    import triton_kernels.swiglu
-    from triton_kernels.matmul_ogs import (FlexCtx, FnSpecs, FusedActivation,
-                                           PrecisionConfig, matmul_ogs)
-    from triton_kernels.numerics import InFlexData
-    from triton_kernels.numerics_details.mxfp import downcast_to_mxfp_torch
-    from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor
-    from triton_kernels.tensor_details import layout
-    IS_TRITON_KERNELS_AVAILABLE = True
+import triton_kernels.swiglu
+from triton_kernels.matmul_ogs import (FlexCtx, FnSpecs, FusedActivation,
+                                       PrecisionConfig, matmul_ogs)
+from triton_kernels.numerics import InFlexData
+from triton_kernels.numerics_details.mxfp import downcast_to_mxfp_torch
+from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor
+from triton_kernels.tensor_details import layout
 
 from ...model_config import ModelConfig
 from ..linear import TensorParallelMode, load_weight_shard
@@ -1295,8 +1282,6 @@ def __init__(
             weight_loading_mode=weight_loading_mode,
             layer_idx=layer_idx,
         )
-        if not IS_TRITON_KERNELS_AVAILABLE:
-            raise ImportError("Triton kernels are not available.")
         if torch.cuda.get_device_capability()[0] != 9 and self.ep_size > 1:
             raise NotImplementedError(
                 "TritonFusedMoE is only supported on Hopper with EP size > 1.")

tensorrt_llm/_torch/modules/triton_linear.py

Lines changed: 3 additions & 11 deletions
@@ -4,20 +4,15 @@
 
 import torch
 from torch.nn.parameter import Parameter
+from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig, matmul_ogs
+from triton_kernels.numerics import InFlexData
 
 from tensorrt_llm._torch.peft.lora.layer import LoraLayer
 from tensorrt_llm.mapping import Mapping
 
 from ...models.modeling_utils import QuantConfig
-# Reuse the common Triton import setup
-from .fused_moe.fused_moe_triton import (IS_TRITON_KERNELS_AVAILABLE,
-                                         maybe_update_stride,
+from .fused_moe.fused_moe_triton import (maybe_update_stride,
                                          swizzle_weight_and_scale)
-
-if IS_TRITON_KERNELS_AVAILABLE:
-    from triton_kernels.matmul_ogs import (FlexCtx, PrecisionConfig, matmul_ogs)
-    from triton_kernels.numerics import InFlexData
-
 from .linear import (Linear, LinearMethodBase, TensorParallelMode,
                      WeightsLoadingConfig, copy_weight, load_weight_shard,
                      load_weights_fused_gate_up_helper,
@@ -383,9 +378,6 @@ def __init__(
         use_custom_cublas_mm: bool = False,
         lora: Optional[LoraLayer] = None,
     ):
-        if not IS_TRITON_KERNELS_AVAILABLE:
-            raise ImportError("Triton kernels are not available. "
-                              "Please install the required dependencies.")
         assert not use_custom_cublas_mm, "TritonLinear does not support custom cublas mm."
 
         super().__init__(

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 0 additions & 31 deletions
@@ -48,8 +48,6 @@ def patched_start_mpi_pool(self):
 
 from tensorrt_llm import LLM
 from tensorrt_llm._torch.model_config import MoeLoadBalancerConfig
-from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import \
-    IS_TRITON_KERNELS_AVAILABLE
 from tensorrt_llm.llmapi import (AutoDecodingConfig, CudaGraphConfig,
                                  DeepSeekSparseAttentionConfig,
                                  EagleDecodingConfig, KvCacheConfig, MoeConfig,
@@ -3603,8 +3601,6 @@ def test_w4a8_mxfp4(self, moe_backend, tp_size, pp_size, ep_size,
                                        {"ENABLE_CONFIGURABLE_MOE": env_value})
 
         if moe_backend == "TRITON":
-            if not IS_TRITON_KERNELS_AVAILABLE:
-                pytest.skip("TRITON moe backend is not available.")
             if get_sm_version() < 90:
                 pytest.skip("TRITON moe backend requires Hopper or newer.")
         if moe_backend in ["CUTLASS", "TRTLLM"] and get_sm_version() < 100:
@@ -4042,8 +4038,6 @@ def test_w4_1gpu(self, kv_cache_dtype, moe_backend, cuda_graph,
         mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
         mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
                           {"scores_filter": "exact_match,flexible-extract"})
-        if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE:
-            pytest.skip("Triton kernels are not available")
 
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
@@ -4113,10 +4107,6 @@ def test_w4_4gpus(self, kv_cache_dtype, moe_backend, tp_size, pp_size,
         patch_mpi_pool_session_for_env(mocker,
                                        {"ENABLE_CONFIGURABLE_MOE": env_value})
 
-        if moe_backend == "TRITON":
-            if not IS_TRITON_KERNELS_AVAILABLE:
-                pytest.skip("Triton kernels are not available")
-
         MAX_OUTPUT_LEN = 128179
         MAX_INPUT_LEN = 32768
 
@@ -4191,9 +4181,6 @@ def test_w4_8gpus(self, moe_backend, tp_size, pp_size, ep_size,
         mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
         mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
                           {"scores_filter": "exact_match,flexible-extract"})
-        if moe_backend == "TRITON":
-            if not IS_TRITON_KERNELS_AVAILABLE:
-                pytest.skip("Triton kernels are not available")
 
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
@@ -4229,8 +4216,6 @@ def test_w4a16(self, kv_cache_dtype, tp_size, pp_size, ep_size,
         mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
         mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
                           {"scores_filter": "exact_match,flexible-extract"})
-        if not IS_TRITON_KERNELS_AVAILABLE:
-            pytest.skip("Triton kernels are not available")
         monkeypatch.setenv("OVERRIDE_QUANT_ALGO", "W4A16_MXFP4")
 
         pytorch_config = dict(
@@ -4273,10 +4258,6 @@ def test_w4a16(self, kv_cache_dtype, tp_size, pp_size, ep_size,
     def test_w4_2gpus(self, kv_cache_dtype, moe_backend, tp_size, pp_size,
                       ep_size, attention_dp, cuda_graph, overlap_scheduler,
                       mocker):
-        if moe_backend == "TRITON":
-            if not IS_TRITON_KERNELS_AVAILABLE:
-                pytest.skip("Triton kernels are not available")
-
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
@@ -4351,10 +4332,6 @@ def test_w4_2gpus_nvfp4(self, tp_size, pp_size, ep_size, attention_dp,
         pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
        ids=["cutlass", "trtllm", "triton"])
     def test_w4_chunked_prefill(self, kv_cache_dtype, moe_backend, mocker):
-        if moe_backend == "TRITON":
-            if not IS_TRITON_KERNELS_AVAILABLE:
-                pytest.skip("Triton kernels are not available")
-
         MAX_OUTPUT_LEN = 128179
         MAX_INPUT_LEN = 32768
 
@@ -4421,10 +4398,6 @@ def test_w4_chunked_prefill(self, kv_cache_dtype, moe_backend, mocker):
        ids=["cutlass", "trtllm", "triton"])
     def test_eagle3_4gpus(self, moe_backend, one_model, overlap_scheduler,
                           mocker):
-        if moe_backend == "TRITON":
-            if not IS_TRITON_KERNELS_AVAILABLE:
-                pytest.skip("Triton kernels are not available")
-
         if get_sm_version() == 90:
             pytest.skip(
                 "https://nvbugs/5636916: Remaining Hopper Eagle Accuracy Issue for only TP=4"
@@ -4613,10 +4586,6 @@ def test_eagle3_guided_decoding_4gpus(self, one_model, mocker):
        ids=["cutlass", "trtllm", "triton"])
     def test_eagle3_2gpus(self, moe_backend, one_model, overlap_scheduler,
                           mocker):
-        if moe_backend == "TRITON":
-            if not IS_TRITON_KERNELS_AVAILABLE:
-                pytest.skip("Triton kernels are not available")
-
         MAX_OUTPUT_LEN = 128179
         MAX_INPUT_LEN = 32768

tests/unittest/_torch/auto_deploy/unit/multigpu/custom_ops/test_mxfp4_moe_ep.py

Lines changed: 0 additions & 7 deletions
@@ -5,9 +5,6 @@
 import torch.distributed as dist
 from _dist_test_utils import get_device_counts
 
-from tensorrt_llm._torch.auto_deploy.custom_ops.fused_moe.mxfp4_moe import (
-    IS_TRITON_KERNELS_AVAILABLE,
-)
 from tensorrt_llm._torch.auto_deploy.distributed.common import spawn_multiprocess_job
 
 
@@ -109,10 +106,6 @@ def _rand_scales(shape):
     torch.testing.assert_close(part_out, ref_out, rtol=5e-2, atol=5e-2, equal_nan=True)
 
 
-@pytest.mark.skipif(
-    not IS_TRITON_KERNELS_AVAILABLE,
-    reason="triton_kernels unavailable",
-)
 @pytest.mark.parametrize("num_experts", [6, 8])
 @pytest.mark.parametrize("topk", [4])  # must be <= num_experts
 @pytest.mark.parametrize("device_count", get_device_counts())

tests/unittest/_torch/modeling/test_modeling_gpt_oss.py

Lines changed: 0 additions & 5 deletions
@@ -7,8 +7,6 @@
 from utils.llm_data import llm_models_root
 
 from tensorrt_llm import LLM, SamplingParams
-from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import \
-    IS_TRITON_KERNELS_AVAILABLE
 from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig, MoeConfig
 
 configs = """
@@ -50,9 +48,6 @@ def dump_config_json(dst_dir):
 
 @pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRITON"])
 def test_gpt_oss_trtllmgen(moe_backend):
-    if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE:
-        pytest.skip("Triton kernels are not available")
-
     prompts = [
         "How are you?",
         "Hello, my name is",
