
Commit 122dd50

Merge branch 'main' into dev_index_put
2 parents e1dfa52 + 09c93d4 commit 122dd50

74 files changed, +2622 -314 lines changed

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-bd06b54e627fbfd354a2cffa4c80fb21883209a9
+44d8d54e38c0258357d4e92e1fefe21e845947a3

.github/workflows/cuda.yml

Lines changed: 84 additions & 0 deletions
@@ -86,3 +86,87 @@ jobs:
       PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
       export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
       PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda
+
+  test-voxtral-cuda-e2e:
+    name: test-voxtral-cuda-e2e
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    secrets: inherit
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      secrets-env: EXECUTORCH_HF_TOKEN
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Setup Huggingface"
+        pip install -U "huggingface_hub[cli]" accelerate
+        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        pip install mistral-common librosa
+        echo "::endgroup::"
+
+        echo "::group::Export Voxtral"
+        optimum-cli export executorch \
+            --model "mistralai/Voxtral-Mini-3B-2507" \
+            --task "multimodal-text-to-text" \
+            --recipe "cuda" \
+            --dtype bfloat16 \
+            --device cuda \
+            --max_seq_len 1024 \
+            --output_dir ./
+        echo "::endgroup::"
+
+        echo "::group::Build Voxtral Runner"
+        cmake -DCMAKE_BUILD_TYPE=Release \
+              -DEXECUTORCH_BUILD_CUDA=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+              -DEXECUTORCH_BUILD_TESTS=ON \
+              -Bcmake-out .
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
+        echo "::endgroup::"
+
+        echo "::group::Run Voxtral Runner"
+        # Capture output and allow exit code 139 if we have the expected printout
+        set +e
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        OUTPUT=$(cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd 2>&1)
+        EXIT_CODE=$?
+        set -e
+
+        echo "$OUTPUT"
+
+        # Check if the output contains "Run latency (ms):"
+        if echo "$OUTPUT" | grep -q "Run latency (ms):"; then
+          echo "Found expected output: 'Run latency (ms):'"
+          if [ $EXIT_CODE -eq 139 ]; then
+            echo "Exit code 139 (segfault) detected, but passing since we have the expected output"
+            exit 0
+          elif [ $EXIT_CODE -ne 0 ]; then
+            echo "Unexpected exit code: $EXIT_CODE"
+            exit $EXIT_CODE
+          else
+            echo "Command succeeded with exit code 0"
+            exit 0
+          fi
+        else
+          echo "Expected output 'Run latency (ms):' not found in output"
+          exit 1
+        fi
+        echo "::endgroup::"

CMakeLists.txt

Lines changed: 12 additions & 0 deletions
@@ -266,6 +266,18 @@ if(EXECUTORCH_BUILD_PTHREADPOOL)
   executorch_move_interface_include_directories_to_build_time_only(
     pthreadpool_interface
   )
+
+  if(APPLE)
+    # Use hidden visibility for pthreadpool on Apple platforms to avoid issues
+    # with pthreadpool symbols from libtorch_cpu taking precedence over the ones
+    # from the pthreadpool library statically linked in _portable_lib. The
+    # pthreadpool public APIs are marked as weak by default on some Apple
+    # platforms, so setting to hidden visibility works around this by not
+    # putting the symbol in the indirection table. See
+    # https://github.com/pytorch/executorch/issues/14321 for more details.
+    target_compile_options(pthreadpool PRIVATE -fvisibility=hidden)
+  endif()
+
   install(
     TARGETS pthreadpool pthreadpool_interface fxdiv
     EXPORT ExecuTorchTargets

backends/aoti/utils.h

Lines changed: 2 additions & 0 deletions
@@ -34,6 +34,8 @@ inline executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) {
   // Convert based on known PyTorch dtype codes (without CUDA-specific
   // dependency)
   switch (dtype) {
+    case 4: // PyTorch's int64 dtype code
+      return executorch::aten::ScalarType::Long;
     case 6: // PyTorch's float32 dtype code
       return executorch::aten::ScalarType::Float;
     case 15: // PyTorch's bfloat16 dtype code
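
For context, these case labels follow PyTorch's c10::ScalarType numeric codes (4 = int64, 6 = float32, 15 = bfloat16). A tiny Python mirror of the lookup, assuming only the codes this hunk handles are supported:

# Hypothetical helper, not part of the commit: maps the c10::ScalarType
# codes handled above to their ExecuTorch ScalarType names.
_DTYPE_CODE_TO_NAME = {
    4: "Long",       # int64
    6: "Float",      # float32
    15: "BFloat16",  # bfloat16
}

def dtype_code_to_name(code: int) -> str:
    if code not in _DTYPE_CODE_TO_NAME:
        raise ValueError(f"unsupported dtype code {code}")
    return _DTYPE_CODE_TO_NAME[code]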

backends/arm/_passes/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -91,6 +91,7 @@
     ReplaceScalarWithTensorArgPassTOSABI,
     ReplaceScalarWithTensorArgPassTOSAMI,
 )
+from .rewrite_matmul import RewriteMatmulPass  # noqa
 from .rewrite_upsample import RewriteUpsamplePass  # noqa
 from .scalars_to_attribute_pass import ScalarsToAttributePass  # noqa
 from .size_adjust_input_pass import SizeAdjustInputPass  # noqa

backends/arm/_passes/_debug_passes.py

Lines changed: 4 additions & 0 deletions
@@ -3,6 +3,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+from typing import Set, Type
+
 import torch
 from executorch.devtools.visualization.visualization_utils import visualize_graph
 from executorch.exir import ExportedProgram
@@ -14,6 +16,8 @@ class VisualizePass(ExportPass):
     This pass visualizes the graph at the point of insertion in the pass manager
     """

+    _passes_required_after: Set[Type[ExportPass]] = set()
+
     def __init__(self, exported_program: ExportedProgram) -> None:
         super().__init__()
         self.exported_program = exported_program

backends/arm/_passes/arm_pass_manager.py

Lines changed: 3 additions & 0 deletions
@@ -92,6 +92,7 @@
     ReplaceScalarWithTensorArgPassTOSABI,
     ReplaceScalarWithTensorArgPassTOSAMI,
     RetraceFoldedDtypesPass,
+    RewriteMatmulPass,
     RewriteUpsamplePass,
     ScalarsToAttributePass,
     SizeAdjustInputPass,
@@ -211,6 +212,7 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(RewriteUpsamplePass(exported_program))
         self.add_pass(AddBiasPass(exported_program))

+        self.add_pass(RewriteMatmulPass(exported_program))
         self.add_pass(FuseEqualPlaceholdersPass(exported_program))
         self.add_pass(ToTosaMemoryFormatPass(exported_program))
         self.add_pass(RemoveNoopPass())
@@ -297,6 +299,7 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(RewriteUpsamplePass(exported_program))
         self.add_pass(AddBiasPass(exported_program))
         self.add_pass(InsertTableOpsPass(exported_program))
+        self.add_pass(RewriteMatmulPass(exported_program))
         self.add_pass(FuseEqualPlaceholdersPass(exported_program))
         self.add_pass(ToTosaMemoryFormatPass(exported_program))
         self.add_pass(RemoveNoopPass())

backends/arm/_passes/convert_minmax_pass.py

Lines changed: 16 additions & 7 deletions
@@ -3,9 +3,10 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

-from typing import Set, Type
+from typing import cast, Set, Type

 import torch
+from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
 from executorch.backends.arm._passes.convert_squeezes_to_view import (
     ConvertSqueezesToViewPass,
 )
@@ -101,20 +102,28 @@ def call(self, graph_module: torch.fx.GraphModule):
             replace_node, op, squeeze_op = self.get_variables(node)

             # Unwrap args
-            if len(node.args) == 2:
+            if len(node.args) == 1:
+                # If dims is unspecified, min/max over all dims.
+                input_node = cast(torch.fx.Node, node.args[0])
+                input_shape = get_first_fake_tensor(input_node).shape
+                dims = range(len(input_shape))
+                keepdims = False
+            elif len(node.args) == 2:
                 input_node, dims = node.args
                 keepdims = False
             elif len(node.args) == 3:
                 input_node, dims, keepdims = node.args
             else:
-                raise RuntimeError(f"Unexpected arg size in {node.name}")
+                raise RuntimeError(
+                    f"Unexpected arg size {len(node.args)} in {node.name}"
+                )

             try:
-                iter(dims)
-            except:
-                dims = [dims]
+                iter(dims)  # type:ignore[assignment]
+            except Exception:
+                dims = [dims]  # type:ignore[assignment]
             else:
-                dims = list(dims)
+                dims = list(dims)  # type:ignore[assignment]

             # Unroll multi-dimensional reduction and keep-dims arg
             with graph_module.graph.inserting_before(node):
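
The new one-argument branch matches the aten semantics: min/max called without a dims argument reduces over every dimension. A standalone sketch of the same argument normalization (hypothetical helper, not the pass itself):

import torch

def normalize_reduction_args(x: torch.Tensor, *args):
    # (input,) -> all dims; (input, dims) -> keepdims False; (input, dims, keepdims).
    if len(args) == 0:
        dims, keepdims = range(x.dim()), False  # dims unspecified: reduce all
    elif len(args) == 1:
        dims, keepdims = args[0], False
    else:
        dims, keepdims = args
    try:
        dims = list(iter(dims))
    except TypeError:
        dims = [dims]  # a single int dim
    return dims, keepdims

x = torch.randn(2, 3, 4)
print(normalize_reduction_args(x))                # ([0, 1, 2], False)
print(normalize_reduction_args(x, 1))             # ([1], False)
print(normalize_reduction_args(x, (0, 2), True))  # ([0, 2], True)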

backends/arm/_passes/decompose_div_tensor_mode.py

Lines changed: 50 additions & 2 deletions
@@ -22,6 +22,8 @@
     "full": exir_ops.edge.aten.full.default,
     "lt": exir_ops.edge.aten.lt.Tensor,
     "where": exir_ops.edge.aten.where.self,
+    "mul": exir_ops.edge.aten.mul.Tensor,
+    "sub": exir_ops.edge.aten.sub.Tensor,
 }

 aten_unary = {
@@ -31,6 +33,8 @@
     "full": torch.ops.aten.full.default,
     "lt": torch.ops.aten.lt.Tensor,
     "where": torch.ops.aten.where.self,
+    "mul": torch.ops.aten.mul.Tensor,
+    "sub": torch.ops.aten.sub.Tensor,
 }


@@ -70,13 +74,57 @@ def call_operator(self, op, args, kwargs, meta):
             return q

         if rounding_mode == "floor":
-            return super().call_operator(opset["floor"], (q,), {}, meta)
+            q_raw = q
+
+            # trunc(q_raw) = where(q_raw < 0, ceil(q_raw), floor(q_raw))
+            q_floor = super().call_operator(opset["floor"], (q_raw,), {}, meta)
+            q_ceil = super().call_operator(opset["ceil"], (q_raw,), {}, meta)
+
+            # a zero tensor with the right shape
+            out_shape = (1,) * len(meta["val"].size())
+            zero = super().call_operator(
+                opset["full"],
+                args=(out_shape, 0.0),
+                kwargs={},
+                meta=meta,
+            )
+
+            is_neg = super().call_operator(opset["lt"], (q_raw, zero), {}, meta)
+            q_trunc = super().call_operator(
+                opset["where"], (is_neg, q_ceil, q_floor), {}, meta
+            )
+
+            # r = a - q_trunc * b (true remainder under truncation)
+            q_times_b = super().call_operator(opset["mul"], (q_trunc, b), {}, meta)
+            r = super().call_operator(opset["sub"], (a, q_times_b), {}, meta)
+
+            # Decide if we need to subtract 1:
+            # for b > 0, adjust if r < 0; for b < 0, adjust if r > 0.
+            b_pos = super().call_operator(opset["lt"], (zero, b), {}, meta)  # b > 0
+            r_lt0 = super().call_operator(opset["lt"], (r, zero), {}, meta)  # r < 0
+            r_gt0 = super().call_operator(opset["lt"], (zero, r), {}, meta)  # r > 0
+
+            adjust_if = super().call_operator(
+                opset["where"], (b_pos, r_lt0, r_gt0), {}, meta
+            )
+
+            one = super().call_operator(
+                opset["full"],
+                args=(out_shape, 1.0),
+                kwargs={},
+                meta=meta,
+            )
+            q_minus_1 = super().call_operator(opset["sub"], (q_trunc, one), {}, meta)
+
+            return super().call_operator(
+                opset["where"], (adjust_if, q_minus_1, q_trunc), {}, meta
+            )

         if rounding_mode == "trunc":
             zero = super().call_operator(
                 opset["full"],
                 args=((1,) * len(meta["val"].size()), 0.0),
-                kwargs={"dtype": torch.float32},
+                kwargs={},
                 meta=meta,
             )
             lt0 = self.call_operator(opset["lt"], (q, zero), {}, meta)
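
The rewritten floor branch encodes the identity floor(a/b) = trunc(a/b) - 1 exactly when the truncation remainder r = a - trunc(a/b)*b and the divisor b have opposite signs (otherwise floor and trunc agree). A short torch check of that identity:

import torch

def floor_div_via_trunc(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    q = a / b
    # trunc(q) = ceil(q) for negative q, floor(q) otherwise
    q_trunc = torch.where(q < 0, torch.ceil(q), torch.floor(q))
    r = a - q_trunc * b  # remainder under truncation
    # subtract 1 when the remainder and the divisor have opposite signs
    adjust = torch.where(b > 0, r < 0, r > 0)
    return torch.where(adjust, q_trunc - 1, q_trunc)

a = torch.tensor([7.0, -7.0, 7.0, -7.0])
b = torch.tensor([2.0, 2.0, -2.0, -2.0])
expected = torch.div(a, b, rounding_mode="floor")  # [3., -4., -4., 3.]
assert torch.equal(floor_div_via_trunc(a, b), expected)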

backends/arm/_passes/decompose_meandim_pass.py

Lines changed: 2 additions & 0 deletions
@@ -94,6 +94,8 @@ def call_operator(self, op, args, kwargs, meta):
         input_shape = list(x.data.shape)
         output_shape = list(meta["val"].shape)
         dims_to_reduce = get_node_arg(args, 1)
+        if dims_to_reduce is None:
+            dims_to_reduce = range(len(input_shape))
         dims_to_reduce = [dim % len(input_shape) for dim in dims_to_reduce]
         dims_to_reduce = [dim for dim in dims_to_reduce if input_shape[dim] != 1]
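
This matches aten.mean.dim, where a missing or None dims argument means a full reduction over every dimension. A quick check:

import torch

x = torch.randn(2, 3, 4)
# mean with no dims is the same as listing every dimension explicitly
assert torch.allclose(torch.mean(x), torch.mean(x, dim=[0, 1, 2]))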
