
Commit 949a5fd

Update base for Update on "introduce cuda stream into runtime backend"

This diff introduces CUDA streams into the ExecuTorch runtime backend. The changes include:

* Adding CUDA stream support to the `cuda_backend.cpp` file
* Including the `cuda_runtime.h` header in `cuda_backend.cpp`
* Adding a `void* cuda_stream` field to the `AOTInductorModelContainer` struct in `aoti_model_container.h` to store the CUDA stream
* Defining a new macro `ET_CHECK_OR_LOG` in `log.h` that checks a condition and logs an error message if the condition is false

Differential Revision: [D84128173](https://our.internmc.facebook.com/intern/diff/D84128173/)

[ghstack-poisoned]
2 parents 5e7f1b4 + 0142a1a commit 949a5fd
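The `log.h` hunk itself is hidden in this view. Purely as a hedged illustration of the behavior described above, a check-and-log macro of that shape might look like the sketch below; the name `ET_CHECK_OR_LOG_SKETCH` and its exact signature are assumptions, not the definition from this commit (`ET_LOG` is the existing ExecuTorch logging macro):

```cpp
#include <executorch/runtime/platform/log.h>

// Hypothetical sketch, based only on the commit message: evaluate a
// condition and, if it is false, log an error instead of aborting.
// The actual ET_CHECK_OR_LOG added in log.h may differ.
#define ET_CHECK_OR_LOG_SKETCH(cond, fmt, ...)          \
  do {                                                  \
    if (!(cond)) {                                      \
      ET_LOG(Error, "Check failed (" #cond "): " fmt,   \
             ##__VA_ARGS__);                            \
    }                                                   \
  } while (0)

// Usage sketch: log (but do not crash) when a CUDA stream is missing.
// ET_CHECK_OR_LOG_SKETCH(cuda_stream != nullptr, "no stream for container");
```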


51 files changed: +1543 -282 lines
Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-bd06b54e627fbfd354a2cffa4c80fb21883209a9
+44d8d54e38c0258357d4e92e1fefe21e845947a3

.github/workflows/cuda.yml

Lines changed: 84 additions & 0 deletions
@@ -86,3 +86,87 @@ jobs:
       PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
       export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
       PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda
+
+  test-voxtral-cuda-e2e:
+    name: test-voxtral-cuda-e2e
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    secrets: inherit
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      secrets-env: EXECUTORCH_HF_TOKEN
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Setup Huggingface"
+        pip install -U "huggingface_hub[cli]" accelerate
+        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        pip install mistral-common librosa
+        echo "::endgroup::"
+
+        echo "::group::Export Voxtral"
+        optimum-cli export executorch \
+          --model "mistralai/Voxtral-Mini-3B-2507" \
+          --task "multimodal-text-to-text" \
+          --recipe "cuda" \
+          --dtype bfloat16 \
+          --device cuda \
+          --max_seq_len 1024 \
+          --output_dir ./
+        echo "::endgroup::"
+
+        echo "::group::Build Voxtral Runner"
+        cmake -DCMAKE_BUILD_TYPE=Release \
+              -DEXECUTORCH_BUILD_CUDA=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+              -DEXECUTORCH_BUILD_TESTS=ON \
+              -Bcmake-out .
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
+        echo "::endgroup::"
+
+        echo "::group::Run Voxtral Runner"
+        # Capture output and allow exit code 139 if we have the expected printout
+        set +e
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        OUTPUT=$(cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd 2>&1)
+        EXIT_CODE=$?
+        set -e
+
+        echo "$OUTPUT"
+
+        # Check if the output contains "Run latency (ms):"
+        if echo "$OUTPUT" | grep -q "Run latency (ms):"; then
+          echo "Found expected output: 'Run latency (ms):'"
+          if [ $EXIT_CODE -eq 139 ]; then
+            echo "Exit code 139 (segfault) detected, but passing since we have the expected output"
+            exit 0
+          elif [ $EXIT_CODE -ne 0 ]; then
+            echo "Unexpected exit code: $EXIT_CODE"
+            exit $EXIT_CODE
+          else
+            echo "Command succeeded with exit code 0"
+            exit 0
+          fi
+        else
+          echo "Expected output 'Run latency (ms):' not found in output"
+          exit 1
+        fi
+        echo "::endgroup::"

CMakeLists.txt

Lines changed: 12 additions & 0 deletions
@@ -266,6 +266,18 @@ if(EXECUTORCH_BUILD_PTHREADPOOL)
   executorch_move_interface_include_directories_to_build_time_only(
     pthreadpool_interface
   )
+
+  if(APPLE)
+    # Use hidden visibility for pthreadpool on Apple platforms to avoid issues
+    # with pthreadpool symbols from libtorch_cpu taking precedence over the ones
+    # from the pthreadpool library statically linked in _portable_lib. The
+    # pthreadpool public APIs are marked as weak by default on some Apple
+    # platforms, so setting to hidden visibility works around this by not
+    # putting the symbol in the indirection table. See
+    # https://github.com/pytorch/executorch/issues/14321 for more details.
+    target_compile_options(pthreadpool PRIVATE -fvisibility=hidden)
+  endif()
+
   install(
     TARGETS pthreadpool pthreadpool_interface fxdiv
     EXPORT ExecuTorchTargets

backends/aoti/utils.h

Lines changed: 2 additions & 0 deletions
@@ -34,6 +34,8 @@ inline executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) {
   // Convert based on known PyTorch dtype codes (without CUDA-specific
   // dependency)
   switch (dtype) {
+    case 4: // PyTorch's int64 dtype code
+      return executorch::aten::ScalarType::Long;
     case 6: // PyTorch's float32 dtype code
       return executorch::aten::ScalarType::Float;
     case 15: // PyTorch's bfloat16 dtype code
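For context, the integer codes in this switch follow PyTorch's `c10::ScalarType` enum ordering, where `Long` is 4, `Float` is 6, and `BFloat16` is 15. A small illustrative snippet (not from this commit) restating that correspondence:

```cpp
// Illustrative only: the dtype codes matched in the switch above follow
// PyTorch's c10::ScalarType enum values.
enum class PyTorchDtypeCode : int32_t {
  kLong = 4,      // torch.int64   -> executorch::aten::ScalarType::Long
  kFloat = 6,     // torch.float32 -> executorch::aten::ScalarType::Float
  kBFloat16 = 15, // torch.bfloat16 -> executorch::aten::ScalarType::BFloat16
};
```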

backends/arm/_passes/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -91,6 +91,7 @@
     ReplaceScalarWithTensorArgPassTOSABI,
     ReplaceScalarWithTensorArgPassTOSAMI,
 )
+from .rewrite_matmul import RewriteMatmulPass  # noqa
 from .rewrite_upsample import RewriteUpsamplePass  # noqa
 from .scalars_to_attribute_pass import ScalarsToAttributePass  # noqa
 from .size_adjust_input_pass import SizeAdjustInputPass  # noqa

backends/arm/_passes/_debug_passes.py

Lines changed: 4 additions & 0 deletions
@@ -3,6 +3,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+from typing import Set, Type
+
 import torch
 from executorch.devtools.visualization.visualization_utils import visualize_graph
 from executorch.exir import ExportedProgram
@@ -14,6 +16,8 @@ class VisualizePass(ExportPass):
     This pass visualizes the graph at the point of insertion in the pass manager
     """
 
+    _passes_required_after: Set[Type[ExportPass]] = set()
+
     def __init__(self, exported_program: ExportedProgram) -> None:
         super().__init__()
         self.exported_program = exported_program

backends/arm/_passes/arm_pass_manager.py

Lines changed: 3 additions & 0 deletions
@@ -92,6 +92,7 @@
     ReplaceScalarWithTensorArgPassTOSABI,
     ReplaceScalarWithTensorArgPassTOSAMI,
     RetraceFoldedDtypesPass,
+    RewriteMatmulPass,
     RewriteUpsamplePass,
     ScalarsToAttributePass,
     SizeAdjustInputPass,
@@ -211,6 +212,7 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(RewriteUpsamplePass(exported_program))
         self.add_pass(AddBiasPass(exported_program))
 
+        self.add_pass(RewriteMatmulPass(exported_program))
         self.add_pass(FuseEqualPlaceholdersPass(exported_program))
         self.add_pass(ToTosaMemoryFormatPass(exported_program))
         self.add_pass(RemoveNoopPass())
@@ -297,6 +299,7 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(RewriteUpsamplePass(exported_program))
         self.add_pass(AddBiasPass(exported_program))
         self.add_pass(InsertTableOpsPass(exported_program))
+        self.add_pass(RewriteMatmulPass(exported_program))
         self.add_pass(FuseEqualPlaceholdersPass(exported_program))
         self.add_pass(ToTosaMemoryFormatPass(exported_program))
         self.add_pass(RemoveNoopPass())

backends/arm/_passes/convert_minmax_pass.py

Lines changed: 16 additions & 7 deletions
@@ -3,9 +3,10 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Set, Type
+from typing import cast, Set, Type
 
 import torch
+from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
 from executorch.backends.arm._passes.convert_squeezes_to_view import (
     ConvertSqueezesToViewPass,
 )
@@ -101,20 +102,28 @@ def call(self, graph_module: torch.fx.GraphModule):
             replace_node, op, squeeze_op = self.get_variables(node)
 
             # Unwrap args
-            if len(node.args) == 2:
+            if len(node.args) == 1:
+                # If dims is unspecified, min/max over all dims.
+                input_node = cast(torch.fx.Node, node.args[0])
+                input_shape = get_first_fake_tensor(input_node).shape
+                dims = range(len(input_shape))
+                keepdims = False
+            elif len(node.args) == 2:
                 input_node, dims = node.args
                 keepdims = False
            elif len(node.args) == 3:
                 input_node, dims, keepdims = node.args
             else:
-                raise RuntimeError(f"Unexpected arg size in {node.name}")
+                raise RuntimeError(
+                    f"Unexpected arg size {len(node.args)} in {node.name}"
+                )
 
             try:
-                iter(dims)
-            except:
-                dims = [dims]
+                iter(dims)  # type:ignore[assignment]
+            except Exception:
+                dims = [dims]  # type:ignore[assignment]
             else:
-                dims = list(dims)
+                dims = list(dims)  # type:ignore[assignment]
 
             # Unroll multi-dimensional reduction and keep-dims arg
             with graph_module.graph.inserting_before(node):

backends/arm/_passes/decompose_div_tensor_mode.py

Lines changed: 50 additions & 2 deletions
@@ -22,6 +22,8 @@
     "full": exir_ops.edge.aten.full.default,
     "lt": exir_ops.edge.aten.lt.Tensor,
     "where": exir_ops.edge.aten.where.self,
+    "mul": exir_ops.edge.aten.mul.Tensor,
+    "sub": exir_ops.edge.aten.sub.Tensor,
 }
 
 aten_unary = {
@@ -31,6 +33,8 @@
     "full": torch.ops.aten.full.default,
     "lt": torch.ops.aten.lt.Tensor,
     "where": torch.ops.aten.where.self,
+    "mul": torch.ops.aten.mul.Tensor,
+    "sub": torch.ops.aten.sub.Tensor,
 }
 
 
@@ -70,13 +74,57 @@ def call_operator(self, op, args, kwargs, meta):
             return q
 
         if rounding_mode == "floor":
-            return super().call_operator(opset["floor"], (q,), {}, meta)
+            q_raw = q
+
+            # trunc(q_raw) = where(q_raw < 0, ceil(q_raw), floor(q_raw))
+            q_floor = super().call_operator(opset["floor"], (q_raw,), {}, meta)
+            q_ceil = super().call_operator(opset["ceil"], (q_raw,), {}, meta)
+
+            # a zero tensor with the right shape
+            out_shape = (1,) * len(meta["val"].size())
+            zero = super().call_operator(
+                opset["full"],
+                args=(out_shape, 0.0),
+                kwargs={},
+                meta=meta,
+            )
+
+            is_neg = super().call_operator(opset["lt"], (q_raw, zero), {}, meta)
+            q_trunc = super().call_operator(
+                opset["where"], (is_neg, q_ceil, q_floor), {}, meta
+            )
+
+            # r = a - q_trunc * b (true remainder under truncation)
+            q_times_b = super().call_operator(opset["mul"], (q_trunc, b), {}, meta)
+            r = super().call_operator(opset["sub"], (a, q_times_b), {}, meta)
+
+            # Decide if we need to subtract 1:
+            # for b > 0, adjust if r < 0; for b < 0, adjust if r > 0.
+            b_pos = super().call_operator(opset["lt"], (zero, b), {}, meta)  # b > 0
+            r_lt0 = super().call_operator(opset["lt"], (r, zero), {}, meta)  # r < 0
+            r_gt0 = super().call_operator(opset["lt"], (zero, r), {}, meta)  # r > 0
+
+            adjust_if = super().call_operator(
+                opset["where"], (b_pos, r_lt0, r_gt0), {}, meta
+            )
+
+            one = super().call_operator(
+                opset["full"],
+                args=(out_shape, 1.0),
+                kwargs={},
+                meta=meta,
+            )
+            q_minus_1 = super().call_operator(opset["sub"], (q_trunc, one), {}, meta)
+
+            return super().call_operator(
+                opset["where"], (adjust_if, q_minus_1, q_trunc), {}, meta
+            )
 
         if rounding_mode == "trunc":
             zero = super().call_operator(
                 opset["full"],
                 args=((1,) * len(meta["val"].size()), 0.0),
-                kwargs={"dtype": torch.float32},
+                kwargs={},
                 meta=meta,
             )
             lt0 = self.call_operator(opset["lt"], (q, zero), {}, meta)
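The new floor branch implements floor division as truncation plus a conditional correction: compute `q_trunc = trunc(a / b)` and the remainder `r = a - q_trunc * b`, then subtract 1 from `q_trunc` exactly when `r` is nonzero with sign opposite to `b`. A minimal scalar sketch of the same identity in plain C++ (illustrative only, not ExecuTorch code) that can be checked against `std::floor`:

```cpp
#include <cassert>
#include <cmath>

// Scalar model of the decomposition above: floor(a / b) computed from
// trunc(a / b) plus a remainder-based correction.
double floor_div(double a, double b) {
  double q = a / b;
  // trunc(q) = (q < 0) ? ceil(q) : floor(q)
  double q_trunc = (q < 0.0) ? std::ceil(q) : std::floor(q);
  double r = a - q_trunc * b;  // true remainder under truncation
  // For b > 0 adjust when r < 0; for b < 0 adjust when r > 0.
  bool adjust = (b > 0.0) ? (r < 0.0) : (r > 0.0);
  return adjust ? q_trunc - 1.0 : q_trunc;
}

int main() {
  for (double a = -7.0; a <= 7.0; a += 1.0) {
    for (double b : {-3.0, -2.0, 2.0, 3.0}) {
      assert(floor_div(a, b) == std::floor(a / b));
    }
  }
  return 0;
}
```

The `where(b > 0, r < 0, r > 0)` select built by the pass plays the role of the `adjust` boolean here.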

backends/arm/_passes/decompose_meandim_pass.py

Lines changed: 2 additions & 0 deletions
@@ -94,6 +94,8 @@ def call_operator(self, op, args, kwargs, meta):
         input_shape = list(x.data.shape)
         output_shape = list(meta["val"].shape)
         dims_to_reduce = get_node_arg(args, 1)
+        if dims_to_reduce is None:
+            dims_to_reduce = range(len(input_shape))
         dims_to_reduce = [dim % len(input_shape) for dim in dims_to_reduce]
         dims_to_reduce = [dim for dim in dims_to_reduce if input_shape[dim] != 1]
