Commit d6b957d

Merge remote-tracking branch 'origin/main' into jni-layer-llama-1

2 parents: b34ce37 + ab44d06

17 files changed: +735 −48 lines

backends/cadence/aot/memory_constraints.py

Lines changed: 32 additions & 0 deletions

@@ -654,6 +654,37 @@ def compute_slice_and_select_loc_constraints(
         ]
 
 
+@register_cadence_pass(CadencePassAttribute(opt_level=0))
+class GenerateIdmaConstraints(PassBase):
+    """Generate constraints for idma ops."""
+
+    def __init__(self, constraint: MemConstraints) -> None:
+        self.constraint = constraint
+
+    def call(self, graph_module: torch.fx.GraphModule) -> Optional[PassResult]:
+        for node in graph_module.graph.find_nodes(
+            op="call_function", target=torch.ops.cadence.idma_wait.out
+        ):
+            # This is just an alias op.
+            self.constraint.add_relative_placement_constraint(node.args[0], node)
+
+        for node in graph_module.graph.find_nodes(
+            op="call_function", target=torch.ops.cadence.idma_load.out
+        ):
+            # TODO: set correct dtcm bank here.
+            mem_id = 1
+            self.constraint.add_absolute_placement_constraint(node, mem_id, None)
+
+        for node in graph_module.graph.find_nodes(
+            op="call_function", target=torch.ops.cadence.idma_store.out
+        ):
+            # TODO: set correct dtcm bank here.
+            mem_id = 1
+            self.constraint.add_absolute_placement_constraint(
+                node.args[0], mem_id, None
+            )
+
+
 # The class to generate all the constraints that will be passed on to the memory
 # planning algorithm.
 class GenerateMemConstraints:
@@ -671,6 +702,7 @@ def __call__(self, graph_module: torch.fx.GraphModule) -> PassResult:
         constraint_gen_passes: Sequence[ConstraintsGenPass] = cast(
             list[ConstraintsGenPass],
             [
+                GenerateIdmaConstraints,
                 GenerateMemoryViewConstraints,
                 GenerateSliceAndSelectNopConstraints,
                 GenerateCatNopConstraints,
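
Taken out of context, the pattern above is: walk the FX graph with find_nodes and record one placement decision per matching node. Below is a minimal runnable sketch of that pattern, not part of the commit: ToyConstraints is a hypothetical stand-in for MemConstraints, and a stock aten op replaces torch.ops.cadence.idma_load.out, which only exists inside the ExecuTorch Cadence backend.

from typing import Optional

import torch


class ToyConstraints:
    """Hypothetical stand-in for MemConstraints: records placements in a dict."""

    def __init__(self) -> None:
        self.absolute: dict[torch.fx.Node, int] = {}

    def add_absolute_placement_constraint(
        self, node: torch.fx.Node, mem_id: int, offset: Optional[int]
    ) -> None:
        self.absolute[node] = mem_id


def generate_toy_constraints(gm: torch.fx.GraphModule) -> ToyConstraints:
    constraints = ToyConstraints()
    # Same traversal as GenerateIdmaConstraints, with a stock op as the target.
    for node in gm.graph.find_nodes(
        op="call_function", target=torch.ops.aten.relu.default
    ):
        # Pin the node's output to memory bank 1 (DTCM in the real pass);
        # offset None lets the planner pick the address within the bank.
        constraints.add_absolute_placement_constraint(node, 1, None)
    return constraints


class M(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.ops.aten.relu.default(x + 1)


gm = torch.fx.symbolic_trace(M())
print(generate_toy_constraints(gm).absolute)  # {relu node: 1}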

backends/cadence/aot/memory_planning.py

Lines changed: 39 additions & 2 deletions

@@ -9,7 +9,7 @@
 import collections
 import itertools
 import logging
-from typing import Iterable, Optional, Sequence
+from typing import Callable, Iterable, Optional, Sequence, TypeAlias
 
 import torch
 from executorch.backends.cadence.aot.memory_constraints import MemConstraints
@@ -26,6 +26,8 @@
 
 from executorch.exir import ExecutorchProgramManager
 from executorch.exir.memory_planning import collect_specs_from_nodes, Verifier
+from executorch.exir.pass_base import PassBase
+from executorch.exir.pass_manager import PassManager
 from executorch.exir.passes import MemoryPlanningPass
 from executorch.exir.tensor import TensorSpec
 from tabulate import tabulate
@@ -359,6 +361,35 @@ def print_memory_planning_info(
     )
 
 
+class SimplifyIdmaOpsPass(PassBase):
+    """Replace idma_load and idma_store with idma_copy."""
+
+    def call(self, graph_module: torch.fx.GraphModule) -> Optional[PassResult]:
+        modified = False
+        for node in graph_module.graph.find_nodes(
+            op="call_function", target=torch.ops.cadence.idma_load.out
+        ):
+            modified = True
+            node.target = torch.ops.cadence.idma_copy.out
+            node.args = (node.args[0], *node.args[2:])
+
+        for node in graph_module.graph.find_nodes(
+            op="call_function", target=torch.ops.cadence.idma_store.out
+        ):
+            modified = True
+            node.target = torch.ops.cadence.idma_copy.out
+
+        graph_module.graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, modified)
+
+
+ConstraintGenPassType: TypeAlias = Callable[
+    [MemConstraints],
+    Callable[[torch.fx.GraphModule], Optional[PassResult]],
+]
+
+
 class CadenceMemoryPlanning:
     def __init__(
         self,
@@ -423,10 +454,16 @@ def run(
         # True.
         mem_planning = MemoryPlanningPass(
             self.algo,
-            allow_lifetime_and_storage_overlap=(self.opt_level >= 2),
+            # Always allow lifetime and storage overlap.
+            # At opt level 0, we need overlap for idma wait.
+            allow_lifetime_and_storage_overlap=True,
             alloc_graph_input=self.alloc_graph_input,
             alloc_graph_output=self.alloc_graph_output,
         )
        mem_planning.run(graph_module, graph_signature)
 
+        graph_module = PassManager(passes=[SimplifyIdmaOpsPass()])(
+            graph_module
+        ).graph_module
+
         return PassResult(graph_module, True)
backends/cadence/aot/ops_registrations.py

Lines changed: 6 additions & 0 deletions

@@ -304,7 +304,13 @@
 # Post memory planning, we check that outputs/inputs for the load/store are in
 # DTCM and replace idma_load/idma_store with idma_copy.
 lib.define("idma_load(Tensor src, int task_num=0, int channel=0) -> Tensor")
+lib.define(
+    "idma_load.out(Tensor src, int task_num=0, int channel=0, *, Tensor(a!) out) -> Tensor(a!)"
+)
 lib.define("idma_store(Tensor src, int task_num=0, int channel=0) -> Tensor")
+lib.define(
+    "idma_store.out(Tensor src, int task_num=0, int channel=0, *, Tensor(a!) out) -> Tensor(a!)"
+)
 
 # Non-blocking iDMA copy.
 lib.define("idma_copy(Tensor src, int task_num=0, int channel=0) -> Tensor")

examples/models/llava/export_llava.py

Lines changed: 3 additions & 3 deletions

@@ -226,11 +226,11 @@ def export_all(llava_model: LlavaModel):
         {
             "image_encoder": image_encoder_ep,
             "token_embedding": token_embedding_ep,
-            "text_model": text_model_ep,
+            "text_decoder": text_model_ep,
         },
         partitioner={
             "image_encoder": [XnnpackPartitioner()],
-            "text_model": [
+            "text_decoder": [
                 # First partition the DQLinear nodes, then partition the rest of the nodes,
                 # to avoid multiple DQLinear nodes in the same partition,
                 # to avoid holding multiple unpacked and packed weight buffers in memory,
@@ -254,7 +254,7 @@ def export_all(llava_model: LlavaModel):
         memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
         sym_shape_eval_pass={
             "image_encoder": ConstraintBasedSymShapeEvalPass(),
-            "text_model": ConstraintBasedSymShapeEvalPass(),
+            "text_decoder": ConstraintBasedSymShapeEvalPass(),
             "token_embedding": HintBasedSymShapeEvalPass(),
         },
     )

examples/models/llava/runner/llava_text_decoder_runner.h

Lines changed: 1 addition & 1 deletion

@@ -89,7 +89,7 @@ class ET_EXPERIMENTAL LlavaTextDecoderRunner
   }
 
   inline static const std::string kTokenEmbeddingMethod = "token_embedding";
-  inline static const std::string kTextModelMethod = "text_model";
+  inline static const std::string kTextModelMethod = "text_decoder";
 };
 
 } // namespace example

examples/models/llava/test/test_llava.py

Lines changed: 4 additions & 4 deletions

@@ -96,7 +96,7 @@ def test_llava_export(self):
             "token_embedding", (prompt_before_image,)
         )[0]
         llava_module.run_method(
-            "text_model",
+            "text_decoder",
             (torch.tensor([start_pos], dtype=torch.int64), pte_embeds_before_img),
         )
 
@@ -107,7 +107,7 @@ def test_llava_export(self):
         # pte prefill image
         pte_embeds_img = llava_module.run_method("image_encoder", (resized,))[0]
         llava_module.run_method(
-            "text_model",
+            "text_decoder",
             (
                 torch.tensor([start_pos], dtype=torch.int64),
                 pte_embeds_img,
@@ -122,7 +122,7 @@ def test_llava_export(self):
             "token_embedding", (prompt_after_image,)
         )[0]
         pte_prefill_after_img = llava_module.run_method(
-            "text_model",
+            "text_decoder",
             (torch.tensor([start_pos], dtype=torch.int64), pte_embeds_after_img),
         )[0]
 
@@ -139,7 +139,7 @@ def test_llava_export(self):
                 "token_embedding", (torch.tensor([[new_tokens[i]]], dtype=torch.int64),)
             )[0]
            logits = llava_module.run_method(
-                "text_model",
+                "text_decoder",
                (torch.tensor([start_pos + i], dtype=torch.int64), token_embeds),
            )[0]
            new_tokens.append(torch.argmax(logits).item())

examples/models/llava/test/test_pte.py

Lines changed: 4 additions & 4 deletions

@@ -47,7 +47,7 @@ def main():
         "token_embedding", (prompt_before_image,)
     )[0]
     pte_prefill_before_img = llava_module.run_method(
-        "text_model",
+        "text_decoder",
         (torch.tensor([start_pos], dtype=torch.int64), pte_embeds_before_img),
     )[0]
     print(pte_prefill_before_img)
@@ -60,7 +60,7 @@ def main():
     logging.warning("Image encoder finished")
     logging.warning("Image token prefill started")
     pte_prefill_img = llava_module.run_method(
-        "text_model",
+        "text_decoder",
         (
             torch.tensor([start_pos], dtype=torch.int64),
             pte_embeds_img,
@@ -77,7 +77,7 @@ def main():
         "token_embedding", (prompt_after_image,)
     )[0]
     pte_prefill_after_img = llava_module.run_method(
-        "text_model",
+        "text_decoder",
         (torch.tensor([start_pos], dtype=torch.int64), pte_embeds_after_img),
    )[0]
    logging.warning("Text token prefill finished")
@@ -91,7 +91,7 @@ def main():
            "token_embedding", (torch.tensor([[new_tokens[i]]], dtype=torch.int64),)
        )[0]
        logits = llava_module.run_method(
-            "text_model",
+            "text_decoder",
            (torch.tensor([start_pos + i], dtype=torch.int64), token_embeds),
        )[0]
        new_tokens.append(torch.argmax(logits[..., -1, :]).item())
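
These call sites also show why the rename has to land in three places at once: the method-dict key in export_all(), the kTextModelMethod constant in the C++ runner, and every run_method call in the Python tests all name the same exported method. A minimal sketch of the runtime lookup, with a placeholder .pte path:

import torch
from executorch.extension.pybindings.portable_lib import _load_for_executorch

# Placeholder path; test_pte.py takes this from its CLI arguments.
llava_module = _load_for_executorch("/path/to/llava.pte")

token_embeds = llava_module.run_method(
    "token_embedding", (torch.tensor([[42]], dtype=torch.int64),)
)[0]
logits = llava_module.run_method(
    "text_decoder",  # must match the key used at export time
    (torch.tensor([0], dtype=torch.int64), token_embeds),
)[0]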
Lines changed: 99 additions & 0 deletions

@@ -0,0 +1,99 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+#
+# Simple CMake build system for voxtral runner.
+#
+cmake_minimum_required(VERSION 3.24)
+project(voxtral)
+
+set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
+
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+
+if(CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$")
+  set(CMAKE_TOOLCHAIN_IOS ON)
+else()
+  set(CMAKE_TOOLCHAIN_IOS OFF)
+endif()
+
+# Let files say "include <executorch/path/to/header.h>"
+set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+
+# Need this for gflags for some reason
+set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
+find_package(gflags REQUIRED)
+
+# Find `executorch` libraries, same as for gflags
+list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
+find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
+executorch_target_link_options_shared_lib(executorch)
+
+set(LINK_LIBS executorch gflags)
+set(link_libraries ${LINK_LIBS})
+set(_srcs multimodal.cpp)
+
+list(
+  APPEND
+  link_libraries
+  optimized_native_cpu_ops_lib
+  quantized_ops_lib
+  custom_ops
+  cpublas
+  eigen_blas
+)
+executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
+executorch_target_link_options_shared_lib(quantized_ops_lib)
+executorch_target_link_options_shared_lib(custom_ops)
+
+# XNNPACK
+if(TARGET xnnpack_backend)
+  set(xnnpack_backend_libs xnnpack_backend XNNPACK xnnpack-microkernels-prod)
+  if(TARGET kleidiai)
+    list(APPEND xnnpack_backend_libs kleidiai)
+  endif()
+  list(APPEND link_libraries ${xnnpack_backend_libs})
+  executorch_target_link_options_shared_lib(xnnpack_backend)
+endif()
+
+# Add LLM runner and extension module
+if(NOT TARGET extension_llm_runner)
+  message(
+    FATAL_ERROR
+      "ExecuTorch must be installed with EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER enabled."
+  )
+endif()
+
+# Needed for cpuinfo where it uses android specific log lib
+if(ANDROID)
+  list(APPEND link_libraries log)
+endif()
+
+# Add the required ExecuTorch extensions for multimodal LLM runner
+list(
+  APPEND
+  link_libraries
+  extension_llm_runner
+  extension_module
+  extension_data_loader
+  extension_tensor
+  extension_flat_tensor
+)
+
+# Add tokenizers
+list(APPEND link_libraries tokenizers::tokenizers)
+
+add_executable(voxtral_runner ${_srcs})
+if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+  target_link_options_gc_sections(voxtral_runner)
+  if(NOT APPLE)
+    target_link_options(voxtral_runner PRIVATE "LINKER:-s")
+  endif()
+endif()
+
+target_include_directories(voxtral_runner PUBLIC ${_common_include_directories})
+target_link_libraries(voxtral_runner PUBLIC ${link_libraries})
+target_compile_options(voxtral_runner PUBLIC ${_common_compile_options})
