
Commit 949a5fd

Update base for Update on "introduce cuda stream into runtime backend"

This diff introduces CUDA streams into the ExecuTorch runtime backend. The changes include:

* Adding CUDA stream support to the `cuda_backend.cpp` file
* Including the `cuda_runtime.h` header in `cuda_backend.cpp`
* Adding a `void* cuda_stream` field to the `AOTInductorModelContainer` struct in `aoti_model_container.h` to store the CUDA stream
* Defining a new macro `ET_CHECK_OR_LOG` in `log.h` that checks a condition and logs an error message if the condition is false

Differential Revision: [D84128173](https://our.internmc.facebook.com/intern/diff/D84128173/)

[ghstack-poisoned]
2 parents 5e7f1b4 + 0142a1a commit 949a5fd
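The `log.h` hunk itself is hidden in this view. Purely as a hedged illustration of the behavior described above, a check-and-log macro of that shape might look like the sketch below; the name `ET_CHECK_OR_LOG_SKETCH` and its exact signature are assumptions, not the definition from this commit (`ET_LOG` is the existing ExecuTorch logging macro):

```cpp
#include <executorch/runtime/platform/log.h>

// Hypothetical sketch, based only on the commit message: evaluate a
// condition and, if it is false, log an error instead of aborting.
// The actual ET_CHECK_OR_LOG added in log.h may differ.
#define ET_CHECK_OR_LOG_SKETCH(cond, fmt, ...)          \
  do {                                                  \
    if (!(cond)) {                                      \
      ET_LOG(Error, "Check failed (" #cond "): " fmt,   \
             ##__VA_ARGS__);                            \
    }                                                   \
  } while (0)

// Usage sketch: log (but do not crash) when a CUDA stream is missing.
// ET_CHECK_OR_LOG_SKETCH(cuda_stream != nullptr, "no stream for container");
```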


51 files changed: +1543 -282 lines
Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-bd06b54e627fbfd354a2cffa4c80fb21883209a9
+44d8d54e38c0258357d4e92e1fefe21e845947a3

.github/workflows/cuda.yml

Lines changed: 84 additions & 0 deletions
@@ -86,3 +86,87 @@ jobs:
       PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
       export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
       PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda
+
+  test-voxtral-cuda-e2e:
+    name: test-voxtral-cuda-e2e
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    secrets: inherit
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      secrets-env: EXECUTORCH_HF_TOKEN
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Setup Huggingface"
+        pip install -U "huggingface_hub[cli]" accelerate
+        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        pip install mistral-common librosa
+        echo "::endgroup::"
+
+        echo "::group::Export Voxtral"
+        optimum-cli export executorch \
+          --model "mistralai/Voxtral-Mini-3B-2507" \
+          --task "multimodal-text-to-text" \
+          --recipe "cuda" \
+          --dtype bfloat16 \
+          --device cuda \
+          --max_seq_len 1024 \
+          --output_dir ./
+        echo "::endgroup::"
+
+        echo "::group::Build Voxtral Runner"
+        cmake -DCMAKE_BUILD_TYPE=Release \
+              -DEXECUTORCH_BUILD_CUDA=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+              -DEXECUTORCH_BUILD_TESTS=ON \
+              -Bcmake-out .
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
+        echo "::endgroup::"
+
+        echo "::group::Run Voxtral Runner"
+        # Capture output and allow exit code 139 if we have the expected printout
+        set +e
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        OUTPUT=$(cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd 2>&1)
+        EXIT_CODE=$?
+        set -e
+
+        echo "$OUTPUT"
+
+        # Check if the output contains "Run latency (ms):"
+        if echo "$OUTPUT" | grep -q "Run latency (ms):"; then
+          echo "Found expected output: 'Run latency (ms):'"
+          if [ $EXIT_CODE -eq 139 ]; then
+            echo "Exit code 139 (segfault) detected, but passing since we have the expected output"
+            exit 0
+          elif [ $EXIT_CODE -ne 0 ]; then
+            echo "Unexpected exit code: $EXIT_CODE"
+            exit $EXIT_CODE
+          else
+            echo "Command succeeded with exit code 0"
+            exit 0
+          fi
+        else
+          echo "Expected output 'Run latency (ms):' not found in output"
+          exit 1
+        fi
+        echo "::endgroup::"

CMakeLists.txt

Lines changed: 12 additions & 0 deletions
@@ -266,6 +266,18 @@ if(EXECUTORCH_BUILD_PTHREADPOOL)
   executorch_move_interface_include_directories_to_build_time_only(
     pthreadpool_interface
   )
+
+  if(APPLE)
+    # Use hidden visibility for pthreadpool on Apple platforms to avoid issues
+    # with pthreadpool symbols from libtorch_cpu taking precedence over the ones
+    # from the pthreadpool library statically linked in _portable_lib. The
+    # pthreadpool public APIs are marked as weak by default on some Apple
+    # platforms, so setting to hidden visibility works around this by not
+    # putting the symbol in the indirection table. See
+    # https://github.com/pytorch/executorch/issues/14321 for more details.
+    target_compile_options(pthreadpool PRIVATE -fvisibility=hidden)
+  endif()
+
   install(
     TARGETS pthreadpool pthreadpool_interface fxdiv
     EXPORT ExecuTorchTargets

backends/aoti/utils.h

Lines changed: 2 additions & 0 deletions
@@ -34,6 +34,8 @@ inline executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) {
   // Convert based on known PyTorch dtype codes (without CUDA-specific
   // dependency)
   switch (dtype) {
+    case 4: // PyTorch's int64 dtype code
+      return executorch::aten::ScalarType::Long;
     case 6: // PyTorch's float32 dtype code
       return executorch::aten::ScalarType::Float;
     case 15: // PyTorch's bfloat16 dtype code
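For context, the integer codes in this switch follow PyTorch's `c10::ScalarType` enum ordering, where `Long` is 4, `Float` is 6, and `BFloat16` is 15. A small illustrative snippet (not from this commit) restating that correspondence:

```cpp
// Illustrative only: the dtype codes matched in the switch above follow
// PyTorch's c10::ScalarType enum values.
enum class PyTorchDtypeCode : int32_t {
  kLong = 4,      // torch.int64   -> executorch::aten::ScalarType::Long
  kFloat = 6,     // torch.float32 -> executorch::aten::ScalarType::Float
  kBFloat16 = 15, // torch.bfloat16 -> executorch::aten::ScalarType::BFloat16
};
```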

backends/arm/_passes/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -91,6 +91,7 @@
     ReplaceScalarWithTensorArgPassTOSABI,
     ReplaceScalarWithTensorArgPassTOSAMI,
 )
+from .rewrite_matmul import RewriteMatmulPass  # noqa
 from .rewrite_upsample import RewriteUpsamplePass  # noqa
 from .scalars_to_attribute_pass import ScalarsToAttributePass  # noqa
 from .size_adjust_input_pass import SizeAdjustInputPass  # noqa

backends/arm/_passes/_debug_passes.py

Lines changed: 4 additions & 0 deletions
@@ -3,6 +3,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+from typing import Set, Type
+
 import torch
 from executorch.devtools.visualization.visualization_utils import visualize_graph
 from executorch.exir import ExportedProgram
@@ -14,6 +16,8 @@ class VisualizePass(ExportPass):
     This pass visualizes the graph at the point of insertion in the pass manager
     """
 
+    _passes_required_after: Set[Type[ExportPass]] = set()
+
     def __init__(self, exported_program: ExportedProgram) -> None:
         super().__init__()
         self.exported_program = exported_program

backends/arm/_passes/arm_pass_manager.py

Lines changed: 3 additions & 0 deletions
@@ -92,6 +92,7 @@
     ReplaceScalarWithTensorArgPassTOSABI,
     ReplaceScalarWithTensorArgPassTOSAMI,
     RetraceFoldedDtypesPass,
+    RewriteMatmulPass,
     RewriteUpsamplePass,
     ScalarsToAttributePass,
     SizeAdjustInputPass,
@@ -211,6 +212,7 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(RewriteUpsamplePass(exported_program))
         self.add_pass(AddBiasPass(exported_program))
 
+        self.add_pass(RewriteMatmulPass(exported_program))
         self.add_pass(FuseEqualPlaceholdersPass(exported_program))
         self.add_pass(ToTosaMemoryFormatPass(exported_program))
         self.add_pass(RemoveNoopPass())
@@ -297,6 +299,7 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(RewriteUpsamplePass(exported_program))
         self.add_pass(AddBiasPass(exported_program))
         self.add_pass(InsertTableOpsPass(exported_program))
+        self.add_pass(RewriteMatmulPass(exported_program))
         self.add_pass(FuseEqualPlaceholdersPass(exported_program))
         self.add_pass(ToTosaMemoryFormatPass(exported_program))
         self.add_pass(RemoveNoopPass())

backends/arm/_passes/convert_minmax_pass.py

Lines changed: 16 additions & 7 deletions
@@ -3,9 +3,10 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Set, Type
+from typing import cast, Set, Type
 
 import torch
+from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
 from executorch.backends.arm._passes.convert_squeezes_to_view import (
     ConvertSqueezesToViewPass,
 )
@@ -101,20 +102,28 @@ def call(self, graph_module: torch.fx.GraphModule):
             replace_node, op, squeeze_op = self.get_variables(node)
 
             # Unwrap args
-            if len(node.args) == 2:
+            if len(node.args) == 1:
+                # If dims is unspecified, min/max over all dims.
+                input_node = cast(torch.fx.Node, node.args[0])
+                input_shape = get_first_fake_tensor(input_node).shape
+                dims = range(len(input_shape))
+                keepdims = False
+            elif len(node.args) == 2:
                 input_node, dims = node.args
                 keepdims = False
            elif len(node.args) == 3:
                 input_node, dims, keepdims = node.args
             else:
-                raise RuntimeError(f"Unexpected arg size in {node.name}")
+                raise RuntimeError(
+                    f"Unexpected arg size {len(node.args)} in {node.name}"
+                )
 
             try:
-                iter(dims)
-            except:
-                dims = [dims]
+                iter(dims)  # type:ignore[assignment]
+            except Exception:
+                dims = [dims]  # type:ignore[assignment]
             else:
-                dims = list(dims)
+                dims = list(dims)  # type:ignore[assignment]
 
             # Unroll multi-dimensional reduction and keep-dims arg
             with graph_module.graph.inserting_before(node):

backends/arm/_passes/decompose_div_tensor_mode.py

Lines changed: 50 additions & 2 deletions
@@ -22,6 +22,8 @@
     "full": exir_ops.edge.aten.full.default,
     "lt": exir_ops.edge.aten.lt.Tensor,
     "where": exir_ops.edge.aten.where.self,
+    "mul": exir_ops.edge.aten.mul.Tensor,
+    "sub": exir_ops.edge.aten.sub.Tensor,
 }
 
 aten_unary = {
@@ -31,6 +33,8 @@
     "full": torch.ops.aten.full.default,
     "lt": torch.ops.aten.lt.Tensor,
     "where": torch.ops.aten.where.self,
+    "mul": torch.ops.aten.mul.Tensor,
+    "sub": torch.ops.aten.sub.Tensor,
 }
 
 
@@ -70,13 +74,57 @@ def call_operator(self, op, args, kwargs, meta):
             return q
 
         if rounding_mode == "floor":
-            return super().call_operator(opset["floor"], (q,), {}, meta)
+            q_raw = q
+
+            # trunc(q_raw) = where(q_raw < 0, ceil(q_raw), floor(q_raw))
+            q_floor = super().call_operator(opset["floor"], (q_raw,), {}, meta)
+            q_ceil = super().call_operator(opset["ceil"], (q_raw,), {}, meta)
+
+            # a zero tensor with the right shape
+            out_shape = (1,) * len(meta["val"].size())
+            zero = super().call_operator(
+                opset["full"],
+                args=(out_shape, 0.0),
+                kwargs={},
+                meta=meta,
+            )
+
+            is_neg = super().call_operator(opset["lt"], (q_raw, zero), {}, meta)
+            q_trunc = super().call_operator(
+                opset["where"], (is_neg, q_ceil, q_floor), {}, meta
+            )
+
+            # r = a - q_trunc * b (true remainder under truncation)
+            q_times_b = super().call_operator(opset["mul"], (q_trunc, b), {}, meta)
+            r = super().call_operator(opset["sub"], (a, q_times_b), {}, meta)
+
+            # Decide if we need to subtract 1:
+            # for b > 0, adjust if r < 0; for b < 0, adjust if r > 0.
+            b_pos = super().call_operator(opset["lt"], (zero, b), {}, meta)  # b > 0
+            r_lt0 = super().call_operator(opset["lt"], (r, zero), {}, meta)  # r < 0
+            r_gt0 = super().call_operator(opset["lt"], (zero, r), {}, meta)  # r > 0
+
+            adjust_if = super().call_operator(
+                opset["where"], (b_pos, r_lt0, r_gt0), {}, meta
+            )
+
+            one = super().call_operator(
+                opset["full"],
+                args=(out_shape, 1.0),
+                kwargs={},
+                meta=meta,
+            )
+            q_minus_1 = super().call_operator(opset["sub"], (q_trunc, one), {}, meta)
+
+            return super().call_operator(
+                opset["where"], (adjust_if, q_minus_1, q_trunc), {}, meta
+            )
 
         if rounding_mode == "trunc":
             zero = super().call_operator(
                 opset["full"],
                 args=((1,) * len(meta["val"].size()), 0.0),
-                kwargs={"dtype": torch.float32},
+                kwargs={},
                 meta=meta,
             )
             lt0 = self.call_operator(opset["lt"], (q, zero), {}, meta)
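The new floor branch implements floor division as truncation plus a conditional correction: compute `q_trunc = trunc(a / b)` and the remainder `r = a - q_trunc * b`, then subtract 1 from `q_trunc` exactly when `r` is nonzero with sign opposite to `b`. A minimal scalar sketch of the same identity in plain C++ (illustrative only, not ExecuTorch code) that can be checked against `std::floor`:

```cpp
#include <cassert>
#include <cmath>

// Scalar model of the decomposition above: floor(a / b) computed from
// trunc(a / b) plus a remainder-based correction.
double floor_div(double a, double b) {
  double q = a / b;
  // trunc(q) = (q < 0) ? ceil(q) : floor(q)
  double q_trunc = (q < 0.0) ? std::ceil(q) : std::floor(q);
  double r = a - q_trunc * b;  // true remainder under truncation
  // For b > 0 adjust when r < 0; for b < 0 adjust when r > 0.
  bool adjust = (b > 0.0) ? (r < 0.0) : (r > 0.0);
  return adjust ? q_trunc - 1.0 : q_trunc;
}

int main() {
  for (double a = -7.0; a <= 7.0; a += 1.0) {
    for (double b : {-3.0, -2.0, 2.0, 3.0}) {
      assert(floor_div(a, b) == std::floor(a / b));
    }
  }
  return 0;
}
```

The `where(b > 0, r < 0, r > 0)` select built by the pass plays the role of the `adjust` boolean here.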

backends/arm/_passes/decompose_meandim_pass.py

Lines changed: 2 additions & 0 deletions
@@ -94,6 +94,8 @@ def call_operator(self, op, args, kwargs, meta):
         input_shape = list(x.data.shape)
         output_shape = list(meta["val"].shape)
         dims_to_reduce = get_node_arg(args, 1)
+        if dims_to_reduce is None:
+            dims_to_reduce = range(len(input_shape))
         dims_to_reduce = [dim % len(input_shape) for dim in dims_to_reduce]
         dims_to_reduce = [dim for dim in dims_to_reduce if input_shape[dim] != 1]
