
Commit e235e69

Merge branch 'main' into lluo/release_windows_zip
2 parents 6eebf8e + 9682ea3

File tree

6 files changed: +160 −121 lines

.github/scripts/filter-matrix.py

Lines changed: 4 additions & 1 deletion
@@ -8,6 +8,7 @@
 
 # currently we don't support python 3.13t due to tensorrt does not support 3.13t
 disabled_python_versions: List[str] = ["3.13t", "3.14", "3.14t"]
+disabled_cuda_versions: List[str] = ["cu130"]
 
 # jetpack 6.2 only officially supports python 3.10 and cu126
 jetpack_python_versions: List[str] = ["3.10"]
@@ -36,7 +37,9 @@ def filter_matrix_item(
     if item["python_version"] in disabled_python_versions:
         # Skipping disabled Python version
         return False
-
+    if item["desired_cuda"] in disabled_cuda_versions:
+        # Skipping disabled CUDA version
+        return False
     if is_jetpack:
         if limit_pr_builds:
             # pr build,matrix passed from test-infra is cu128, python 3.9, change to cu126, python 3.10
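For context, a minimal sketch of how such a filter might be applied to a build matrix. The simplified `filter_matrix_item` signature and the sample entries below are illustrative, not the full script:

    from typing import Any, Dict, List

    disabled_python_versions: List[str] = ["3.13t", "3.14", "3.14t"]
    disabled_cuda_versions: List[str] = ["cu130"]

    def filter_matrix_item(item: Dict[str, Any]) -> bool:
        # Keep only matrix entries whose Python and CUDA versions are enabled
        if item["python_version"] in disabled_python_versions:
            return False
        if item["desired_cuda"] in disabled_cuda_versions:
            return False
        return True

    matrix = [
        {"python_version": "3.10", "desired_cuda": "cu126"},
        {"python_version": "3.10", "desired_cuda": "cu130"},
    ]
    print([m for m in matrix if filter_matrix_item(m)])  # only the cu126 entry survives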

docsrc/getting_started/jetpack.rst

Lines changed: 25 additions & 14 deletions
@@ -60,8 +60,22 @@ System Preparation
     sudo cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/include/* /usr/local/cuda/include/
     sudo cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/lib/* /usr/local/cuda/lib64/
 
-Building Torch-TensorRT
-***********************
+Installing Torch-TensorRT in JetPack
+************************************
+
+You can install the torch-tensorrt wheel directly from the JPL repo, which is built specifically for JetPack 6.2.
+
+.. code-block:: sh
+
+    # verify tensorrt 10.3 is already installed via the jetpack installation process
+    python -m pip list | grep tensorrt
+    # install the torch-tensorrt wheel from the JPL repo, built specifically for JetPack 6.2
+    python -m pip install torch==2.8.0 torch_tensorrt==2.8.0 torchvision==0.24.0 --extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu126
+
+
+Building Torch-TensorRT in JetPack
+**********************************
+
+You can also build the torch-tensorrt wheel from source yourself.
 
 Build Environment Setup
 =======================
@@ -92,25 +106,22 @@ Build Environment Setup
     # Can only install the torch and torchvision wheel from the JPL repo which is built specifically for JetPack 6.2
     python -m pip install torch==2.8.0 torchvision==0.23.0 --index-url=https://pypi.jetson-ai-lab.io/jp6/cu126
 
+4. **Build the Wheel**:
 
-Building the Wheel
-==================
+.. code-block:: sh
 
-.. code-block:: sh
-    python setup.py bdist_wheel --jetpack
+    python setup.py bdist_wheel --jetpack
 
-Installation
-============
+5. **Install the Wheel**:
 
-.. code-block:: sh
-    # you will be able to find the wheel in the dist directory, has platform name linux_tegra_aarch64
+.. code-block:: sh
+
+    # you will be able to find the wheel in the dist directory
     cd dist
-    python -m pip install torch_tensorrt-2.8.0.dev0+d8318d8fc-cp310-cp310-linux_tegra_aarch64.whl
+    python -m pip install torch_tensorrt-2.8.0.dev0+d8318d8fc-cp310-cp310-linux_aarch64.whl
 
-Post-Installation Verification
-==============================
+6. **Verify installation by importing in Python**:
 
-Verify installation by importing in Python:
 .. code-block:: python
 
     # verify whether the torch-tensorrt can be imported
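For reference, a minimal post-install check along the lines the doc describes; the printed versions are illustrative for JetPack 6.2:

    # verify torch and torch_tensorrt import cleanly and see the GPU
    import torch
    import torch_tensorrt

    print(torch.__version__)           # e.g. 2.8.0
    print(torch_tensorrt.__version__)  # e.g. 2.8.0
    print(torch.cuda.is_available())   # expected True on a Jetson device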

py/torch_tensorrt/_compile.py

Lines changed: 11 additions & 0 deletions
@@ -3,6 +3,7 @@
 import collections.abc
 import logging
 import platform
+import warnings
 from enum import Enum
 from typing import Any, Callable, List, Optional, Sequence, Set, Union
 
@@ -121,6 +122,11 @@ def _get_target_fe(module_type: _ModuleType, ir: str) -> _IRType:
             "Requested using the TS frontend but the TS frontend is not available in this build of Torch-TensorRT"
         )
     elif module_is_fxable and ir_targets_fx:
+        warnings.warn(
+            "FX frontend is deprecated. Please use the Dynamo frontend instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
        if ENABLED_FEATURES.fx_frontend:
            return _IRType.fx
        else:
@@ -237,6 +243,11 @@ def compile(
         )
         return compiled_ts_module
    elif target_ir == _IRType.fx:
+        warnings.warn(
+            "FX frontend is deprecated. Please use the Dynamo frontend instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
        if not ENABLED_FEATURES.fx_frontend:
            raise RuntimeError(
                "FX frontend is not enabled, cannot compile with target_ir=fx"

tests/py/dynamo/automatic_plugin/test_automatic_plugin.py

Lines changed: 49 additions & 43 deletions
@@ -12,54 +12,60 @@
 from ..conversion.harness import DispatchTestCase
 
 
+@triton.jit
+def elementwise_mul_kernel(X, Y, Z, BLOCK_SIZE: tl.constexpr):
+    # Program ID determines the block of data each thread will process
+    pid = tl.program_id(0)
+    # Compute the range of elements that this thread block will work on
+    block_start = pid * BLOCK_SIZE
+    # Range of indices this thread will handle
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    # Load elements from the X and Y tensors
+    x_vals = tl.load(X + offsets)
+    y_vals = tl.load(Y + offsets)
+    # Perform the element-wise multiplication
+    z_vals = x_vals * y_vals
+    # Store the result in Z
+    tl.store(Z + offsets, z_vals)
+
+
+@torch.library.custom_op("torchtrt_ex::elementwise_mul", mutates_args=())  # type: ignore[misc]
+def elementwise_mul(X: torch.Tensor, Y: torch.Tensor) -> torch.Tensor:
+    # Ensure the tensors are on the GPU
+    assert X.is_cuda and Y.is_cuda, "Tensors must be on CUDA device."
+    assert X.shape == Y.shape, "Tensors must have the same shape."
+
+    # Create output tensor
+    Z = torch.empty_like(X)
+
+    # Define block size
+    BLOCK_SIZE = 1024
+
+    # Grid of programs
+    grid = lambda meta: (X.numel() // meta["BLOCK_SIZE"],)
+
+    # Launch the kernel
+    elementwise_mul_kernel[grid](X, Y, Z, BLOCK_SIZE=BLOCK_SIZE)
+
+    return Z
+
+
+@torch.library.register_fake("torchtrt_ex::elementwise_mul")
+def elementwise_mul(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+    return x
+
+
+if not torch_tensorrt.ENABLED_FEATURES.tensorrt_rtx:
+    torch_tensorrt.dynamo.conversion.plugins.custom_op(
+        "torchtrt_ex::elementwise_mul", supports_dynamic_shapes=True
+    )
+
+
 @unittest.skipIf(
     torch_tensorrt.ENABLED_FEATURES.tensorrt_rtx,
     "TensorRT RTX does not support plugins",
 )
 class TestAutomaticPlugin(DispatchTestCase):
-    @triton.jit
-    def elementwise_mul_kernel(X, Y, Z, BLOCK_SIZE: tl.constexpr):
-        # Program ID determines the block of data each thread will process
-        pid = tl.program_id(0)
-        # Compute the range of elements that this thread block will work on
-        block_start = pid * BLOCK_SIZE
-        # Range of indices this thread will handle
-        offsets = block_start + tl.arange(0, BLOCK_SIZE)
-        # Load elements from the X and Y tensors
-        x_vals = tl.load(X + offsets)
-        y_vals = tl.load(Y + offsets)
-        # Perform the element-wise multiplication
-        z_vals = x_vals * y_vals
-        # Store the result in Z
-        tl.store(Z + offsets, z_vals)
-
-    @torch.library.custom_op("torchtrt_ex::elementwise_mul", mutates_args=())  # type: ignore[misc]
-    def elementwise_mul(X: torch.Tensor, Y: torch.Tensor) -> torch.Tensor:
-        # Ensure the tensors are on the GPU
-        assert X.is_cuda and Y.is_cuda, "Tensors must be on CUDA device."
-        assert X.shape == Y.shape, "Tensors must have the same shape."
-
-        # Create output tensor
-        Z = torch.empty_like(X)
-
-        # Define block size
-        BLOCK_SIZE = 1024
-
-        # Grid of programs
-        grid = lambda meta: (X.numel() // meta["BLOCK_SIZE"],)
-
-        # Launch the kernel
-        elementwise_mul_kernel[grid](X, Y, Z, BLOCK_SIZE=BLOCK_SIZE)
-
-        return Z
-
-    @torch.library.register_fake("torchtrt_ex::elementwise_mul")
-    def elementwise_mul(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
-        return x
-
-    torch_tensorrt.dynamo.conversion.plugins.custom_op(
-        "torchtrt_ex::elementwise_mul", supports_dynamic_shapes=True
-    )
 
     @parameterized.expand(
         [
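As a usage sketch: once the custom op is registered at module level, as the moved code above now does, it is callable through the `torch.ops` namespace. The tensors below are illustrative and a CUDA device is assumed:

    import torch

    # assumes the test module above has been imported, registering the op
    x = torch.randn(1024, device="cuda")
    y = torch.randn(1024, device="cuda")
    z = torch.ops.torchtrt_ex.elementwise_mul(x, y)
    assert torch.allclose(z, x * y)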

tests/py/dynamo/automatic_plugin/test_automatic_plugin_with_attrs.py

Lines changed: 51 additions & 45 deletions
@@ -1,3 +1,4 @@
+import unittest
 from typing import Tuple
 
 import torch
@@ -11,57 +12,62 @@
 from ..conversion.harness import DispatchTestCase
 
 
-@unittest.skipIf(
-    torch_tensorrt.ENABLED_FEATURES.tensorrt_rtx,
-    "TensorRT RTX does not support plugins",
-)
-class TestAutomaticPlugin(DispatchTestCase):
+@triton.jit
+def elementwise_scale_mul_kernel(X, Y, Z, a, b, BLOCK_SIZE: tl.constexpr):
+    pid = tl.program_id(0)
+    # Compute the range of elements that this thread block will work on
+    block_start = pid * BLOCK_SIZE
+    # Range of indices this thread will handle
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    # Load elements from the X and Y tensors
+    x_vals = tl.load(X + offsets)
+    y_vals = tl.load(Y + offsets)
+    # Perform the element-wise multiplication
+    z_vals = x_vals * y_vals * a + b
+    # Store the result in Z
+    tl.store(Z + offsets, z_vals)
+
+
+@torch.library.custom_op("torchtrt_ex::elementwise_scale_mul", mutates_args=())  # type: ignore[misc]
+def elementwise_scale_mul(
+    X: torch.Tensor, Y: torch.Tensor, b: float = 0.2, a: int = 2
+) -> torch.Tensor:
+    # Ensure the tensors are on the GPU
+    assert X.is_cuda and Y.is_cuda, "Tensors must be on CUDA device."
+    assert X.shape == Y.shape, "Tensors must have the same shape."
+
+    # Create output tensor
+    Z = torch.empty_like(X)
+
+    # Define block size
+    BLOCK_SIZE = 1024
+
+    # Grid of programs
+    grid = lambda meta: (X.numel() // meta["BLOCK_SIZE"],)
+
+    # Launch the kernel with parameters a and b
+    elementwise_scale_mul_kernel[grid](X, Y, Z, a, b, BLOCK_SIZE=BLOCK_SIZE)
 
-    @triton.jit
-    def elementwise_scale_mul_kernel(X, Y, Z, a, b, BLOCK_SIZE: tl.constexpr):
-        pid = tl.program_id(0)
-        # Compute the range of elements that this thread block will work on
-        block_start = pid * BLOCK_SIZE
-        # Range of indices this thread will handle
-        offsets = block_start + tl.arange(0, BLOCK_SIZE)
-        # Load elements from the X and Y tensors
-        x_vals = tl.load(X + offsets)
-        y_vals = tl.load(Y + offsets)
-        # Perform the element-wise multiplication
-        z_vals = x_vals * y_vals * a + b
-        # Store the result in Z
-        tl.store(Z + offsets, z_vals)
-
-    @torch.library.custom_op("torchtrt_ex::elementwise_scale_mul", mutates_args=())  # type: ignore[misc]
-    def elementwise_scale_mul(
-        X: torch.Tensor, Y: torch.Tensor, b: float = 0.2, a: int = 2
-    ) -> torch.Tensor:
-        # Ensure the tensors are on the GPU
-        assert X.is_cuda and Y.is_cuda, "Tensors must be on CUDA device."
-        assert X.shape == Y.shape, "Tensors must have the same shape."
-
-        # Create output tensor
-        Z = torch.empty_like(X)
-
-        # Define block size
-        BLOCK_SIZE = 1024
-
-        # Grid of programs
-        grid = lambda meta: (X.numel() // meta["BLOCK_SIZE"],)
-
-        # Launch the kernel with parameters a and b
-        elementwise_scale_mul_kernel[grid](X, Y, Z, a, b, BLOCK_SIZE=BLOCK_SIZE)
-
-        return Z
-
-    @torch.library.register_fake("torchtrt_ex::elementwise_scale_mul")
-    def _(x: torch.Tensor, y: torch.Tensor, b: float = 0.2, a: int = 2) -> torch.Tensor:
-        return x
+    return Z
 
+
+@torch.library.register_fake("torchtrt_ex::elementwise_scale_mul")
+def _(x: torch.Tensor, y: torch.Tensor, b: float = 0.2, a: int = 2) -> torch.Tensor:
+    return x
+
+
+if not torch_tensorrt.ENABLED_FEATURES.tensorrt_rtx:
     torch_tensorrt.dynamo.conversion.plugins.custom_op(
         "torchtrt_ex::elementwise_scale_mul", supports_dynamic_shapes=True
     )
 
+
+@unittest.skipIf(
+    torch_tensorrt.ENABLED_FEATURES.tensorrt_rtx,
+    "TensorRT RTX does not support plugins",
+)
+class TestAutomaticPlugin(DispatchTestCase):
+
     @parameterized.expand(
         [
             ((64, 64), torch.float),
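A brief sketch of what the `register_fake` stub enables: under fake-tensor tracing (used by Dynamo and torch.export), only the registered fake implementation runs, so output metadata can be inferred without launching the Triton kernel. `FakeTensorMode` is an internal torch API, used here purely for illustration:

    import torch
    from torch._subclasses.fake_tensor import FakeTensorMode

    # assumes the test module above has been imported, registering the op
    with FakeTensorMode():
        x = torch.empty(64, 64)
        y = torch.empty(64, 64)
        out = torch.ops.torchtrt_ex.elementwise_scale_mul(x, y)
        print(out.shape)  # torch.Size([64, 64]), inferred from the fake impl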

tests/py/dynamo/automatic_plugin/test_flashinfer_rmsnorm.py

Lines changed: 20 additions & 18 deletions
@@ -11,10 +11,26 @@
 
 from ..conversion.harness import DispatchTestCase
 
-# flashinfer has been impacted by torch upstream change: https://github.com/pytorch/pytorch/commit/660b0b8128181d11165176ea3f979fa899f24db1
-# got ImportError: cannot import name '_get_pybind11_abi_build_flags' from 'torch.utils.cpp_extension'
-# if importlib.util.find_spec("flashinfer"):
-#     import flashinfer
+if importlib.util.find_spec("flashinfer"):
+    import flashinfer
+
+
+@torch.library.custom_op("flashinfer::rmsnorm", mutates_args=())  # type: ignore[misc]
+def flashinfer_rmsnorm(
+    input: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6
+) -> torch.Tensor:
+    return flashinfer.norm.rmsnorm(input, weight)
+
+
+@torch.library.register_fake("flashinfer::rmsnorm")
+def _(input: torch.Tensor, weight: torch.Tensor, b: float = 1e-6) -> torch.Tensor:
+    return input
+
+
+if not torch_tensorrt.ENABLED_FEATURES.tensorrt_rtx:
+    torch_tensorrt.dynamo.conversion.plugins.custom_op(
+        "flashinfer::rmsnorm", supports_dynamic_shapes=True
+    )
 
 
 @unittest.skip("Not Available")
@@ -25,20 +41,6 @@
 )
 class TestAutomaticPlugin(DispatchTestCase):
 
-    @torch.library.custom_op("flashinfer::rmsnorm", mutates_args=())  # type: ignore[misc]
-    def flashinfer_rmsnorm(
-        input: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6
-    ) -> torch.Tensor:
-        return flashinfer.norm.rmsnorm(input, weight)
-
-    @torch.library.register_fake("flashinfer::rmsnorm")
-    def _(input: torch.Tensor, weight: torch.Tensor, b: float = 1e-6) -> torch.Tensor:
-        return input
-
-    torch_tensorrt.dynamo.conversion.plugins.custom_op(
-        "flashinfer::rmsnorm", supports_dynamic_shapes=True
-    )
-
     @parameterized.expand(
         [
             ((64, 64), (64,), torch.float16),
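For reference, standard RMSNorm over the last dimension can be written in plain PyTorch as below. This reference formula is the conventional definition, assumed here rather than taken from the flashinfer source:

    import torch

    def rmsnorm_reference(
        x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6
    ) -> torch.Tensor:
        # scale each row by the reciprocal root-mean-square over the last dim
        rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps)
        return x * rms * weight

    x = torch.randn(64, 64)
    w = torch.ones(64)
    print(rmsnorm_reference(x, w).shape)  # torch.Size([64, 64])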
