Only add the pass when exporting a vision model

jackzhxng · jackzhxng · commit 9e68531ac484 · 2024-12-17T21:41:04.000-08:00
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
@@ -23,6 +23,9 @@
 import torch
 
 from executorch.devtools.etrecord import generate_etrecord
+from executorch.exir.passes.cache_pos_init_mutable_pass import (
+    CachePosToInitializedMutableBufferPass,
+)
 
 from executorch.extension.llm.export.builder import DType, LLMEdgeManager
 
@@ -760,6 +763,9 @@ def _export_llama(args) -> LLMEdgeManager:  # noqa: C901
     for partitioner in partitioners:
         logging.info(f"--> {partitioner.__class__.__name__}")
 
+    additional_passes = []
+    if args.model in TORCHTUNE_DEFINED_MODELS:
+        additional_passes = [CachePosToInitializedMutableBufferPass()]
     if args.generate_etrecord:
         if not builder_exported_to_edge.edge_manager:
             raise ValueError("Unable to generate etrecord due to missing edge manager.")
@@ -774,7 +780,9 @@ def _export_llama(args) -> LLMEdgeManager:  # noqa: C901
             # pyre-fixme[16]: Module `backends` has no attribute `qualcomm`.
             canonicalize_program(builder.edge_manager.exported_program())
 
-        builder = builder.to_executorch()
+        builder = builder.to_executorch(
+            passes=additional_passes,
+        )
 
         # Generate ETRecord
         if edge_manager_copy:
@@ -792,7 +800,7 @@ def _export_llama(args) -> LLMEdgeManager:  # noqa: C901
             # pyre-fixme[16]: Module `backends` has no attribute `qualcomm`.
             canonicalize_program(builder.edge_manager.exported_program())
 
-        builder = builder.to_executorch()
+        builder = builder.to_executorch(passes=additional_passes)
 
     if args.profile_memory:
         generate_memory_trace(builder.export_program, "memory_profile.json")
diff --git a/examples/models/llama3_2_vision/runner/native.py b/examples/models/llama3_2_vision/runner/native.py
@@ -19,7 +19,6 @@
 )
 
 from executorch.extension.pybindings.portable_lib import (
-    _load_for_executorch,
     _load_for_executorch_from_buffer,
 )
 
@@ -50,7 +49,6 @@ def __init__(self, args):
         with open(args.pte, "rb") as f:
             self.model_bytes = f.read()
             self.model = _load_for_executorch_from_buffer(self.model_bytes)
-        # self.model = _load_for_executorch(args.pte)
         self.use_kv_cache = args.kv_cache
 
     def forward(
diff --git a/exir/emit/_emitter.py b/exir/emit/_emitter.py
@@ -1607,7 +1607,6 @@ def placeholder(
 
         if isinstance(target, str) and isinstance(spec, TensorSpec):
             fqn, is_mutable_buffer = self._find_fqn_for_placeholder(target, spec)
-            print(f"fqn: {fqn}, is_mutable_buffer: {is_mutable_buffer}")
 
             # If the placeholder has a constant_tag, it is external to the PTE file
             # and requires a fqn and location=TensorDataLocation.EXTERNAL
diff --git a/exir/passes/init_mutable_buffer_pass.py b/exir/passes/init_mutable_buffer_pass.py
diff --git a/exir/program/_program.py b/exir/program/_program.py
@@ -34,7 +34,6 @@
     OpReplacePass,
 )
 from executorch.exir.passes.external_constants_pass import external_constants_pass
-from executorch.exir.passes.init_mutable_buffer_pass import InitMutableBufferPass
 from executorch.exir.passes.insert_write_back_for_buffers_pass import (
     insert_write_back_for_buffers_pass,
 )
@@ -707,7 +706,6 @@ def edge_to_executorch_passes(
     passes: List[PassType] = [
         *config.passes,
         SpecPropPass(),
-        InitMutableBufferPass(),
         # ExecuTorch backend ops are unable to handle unbacked symints. So after
         # this pass, passes cannot be Interpreter-based, because it will fail if
         # there exists an unbacked symint operation.
diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
@@ -25,6 +25,7 @@
 from executorch.exir.backend.utils import format_delegated_graph
 from executorch.exir.capture._config import EdgeCompileConfig, ExecutorchBackendConfig
 
+from executorch.exir.pass_manager import PassType
 from executorch.exir.passes import MemoryPlanningPass
 from executorch.exir.passes.quant_fusion_pass import QuantFusionPass
 from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
@@ -395,26 +396,29 @@ def to_backend(self, partitioners: Optional[List[Partitioner]]) -> "LLMEdgeManag
 
         return self
 
-    def to_executorch(self) -> "LLMEdgeManager":
+    def to_executorch(self, passes: Optional[List[PassType]]) -> "LLMEdgeManager":
         """
         Lower the model to executorch and get an ExecutorchProgram.
         """
         assert self.edge_manager, "Need to run export_to_edge() first"
+        to_executorch_passes = [
+            # If there are Linear operations left in the graph, let's execute
+            # them with the optimized op_linear rather than materializing a
+            # transpose followed by a regular op_mm.
+            ConvertToLinearPass(),
+            QuantFusionPass(),
+        ]
+        if passes:
+            to_executorch_passes.extend(passes)
+
         self.export_program = self.edge_manager.to_executorch(
             ExecutorchBackendConfig(
                 extract_delegate_segments=True,
-                passes=[
-                    # If there are Linear operations left in the graph, let's execute
-                    # them with the optimized op_linear rather than materializing a
-                    # transpose followed by a regular op_mm.
-                    ConvertToLinearPass(),
-                    QuantFusionPass(),
-                ],
+                passes=to_executorch_passes,
                 memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
                 sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(),
             )
         )
-        print(self.export_program.dump_executorch_program(verbose=True))
         logging.info(
             "Required memory for activation in bytes: {}".format(
                 self.export_program._emitter_output.program.execution_plan[
diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp
@@ -11,7 +11,6 @@
 #include <cinttypes> // @donotremove
 #include <cstdint>
 #include <cstdio>
-#include <iostream>
 
 #include <executorch/extension/evalue_util/print_evalue.h>
 #include <executorch/runtime/backend/interface.h>
@@ -1181,10 +1180,6 @@ Error Method::execute_instruction() {
   if (err == Error::Ok) {
     step_state_.instr_idx = next_instr_idx;
   }
-
-  // TODO: Print an EValue.
-  std::cout << "(" << values_[1] << " ) Printing kv_cache k_cache: " << executorch::extension::evalue_edge_items(9216) << values_[2] << std::endl;
-  
   return err;
 }
 

Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,6 @@`
`19`	`19`	`)`
`20`	`20`
`21`	`21`	`from executorch.extension.pybindings.portable_lib import (`
`22`		`- _load_for_executorch,`
`23`	`22`	`_load_for_executorch_from_buffer,`
`24`	`23`	`)`
`25`	`24`
`@@ -50,7 +49,6 @@ def __init__(self, args):`
`50`	`49`	`with open(args.pte, "rb") as f:`
`51`	`50`	`self.model_bytes = f.read()`
`52`	`51`	`self.model = _load_for_executorch_from_buffer(self.model_bytes)`
`53`		`- # self.model = _load_for_executorch(args.pte)`
`54`	`52`	`self.use_kv_cache = args.kv_cache`
`55`	`53`
`56`	`54`	`def forward(`