diff --git a/backends/apple/coreml/compiler/coreml_preprocess.py b/backends/apple/coreml/compiler/coreml_preprocess.py
index c7828888ee5..d1c41c78937 100644
--- a/backends/apple/coreml/compiler/coreml_preprocess.py
+++ b/backends/apple/coreml/compiler/coreml_preprocess.py
@@ -446,7 +446,7 @@ def preprocess(
         config = cto.coreml.OptimizationConfig(
             global_config=op_linear_quantizer_config,
             # skip embedding
-            op_type_configs={"gather": None},
+            # op_type_configs={"gather": None},
         )
         mlmodel = cto.coreml.linear_quantize_weights(mlmodel, config=config)
 
diff --git a/examples/apple/coreml/executor_runner/main.mm b/examples/apple/coreml/executor_runner/main.mm
index 35608dd092a..6cf9a95c22a 100644
--- a/examples/apple/coreml/executor_runner/main.mm
+++ b/examples/apple/coreml/executor_runner/main.mm
@@ -18,6 +18,9 @@
 #import
 #import
+// #import
+// #import
+
 static inline id check_class(id obj, Class cls) {
     return [obj isKindOfClass:cls] ? obj : nil;
 }
@@ -393,6 +396,17 @@ int main(int argc, char * argv[]) {
 
         dump_etdump_gen(etdump_gen.get(), debug_buffer, args);
 
+        for (size_t i = 0; i < method->outputs_size(); i++) {
+            auto tensor = outputs[i].toTensor();
+            NSLog(@"tensor[%zu] sizes=", i);
+            for (size_t j = 0; j < tensor.sizes().size(); j++) {
+                NSLog(@" %d,", tensor.sizes()[j]);
+            }
+        }
+        // std::cout << executorch::extension::evalue_edge_items(100);
+        // for (int i = 0; i < method->outputs_size(); i++) {
+        //     std::cout << "Output " << i << ": " << outputs[i] << std::endl;
+        // }
         return EXIT_SUCCESS;
     }
 }
diff --git a/examples/apple/coreml/scripts/build_executor_runner.sh b/examples/apple/coreml/scripts/build_executor_runner.sh
index 9d20f289bf6..cf77af3f51c 100755
--- a/examples/apple/coreml/scripts/build_executor_runner.sh
+++ b/examples/apple/coreml/scripts/build_executor_runner.sh
@@ -29,7 +29,7 @@ rm -rf "$CMAKE_BUILD_DIR_PATH"
 # Build executorch
 echo "ExecuTorch: Building executorch"
 cmake "$EXECUTORCH_ROOT_PATH" -B"$CMAKE_BUILD_DIR_PATH" \
--DCMAKE_BUILD_TYPE=Release \
+-DCMAKE_BUILD_TYPE=Debug \
 -DCMAKE_TOOLCHAIN_FILE="$IOS_TOOLCHAIN_PATH" \
 -DPLATFORM=MAC_UNIVERSAL \
 -DDEPLOYMENT_TARGET=13.0 \
diff --git a/examples/apple/coreml/scripts/export.py b/examples/apple/coreml/scripts/export.py
index 53316ea2001..950548809ef 100644
--- a/examples/apple/coreml/scripts/export.py
+++ b/examples/apple/coreml/scripts/export.py
@@ -175,9 +175,12 @@ def main():
     edge_program_manager = exir.to_edge(exir_program_aten)
     edge_copy = copy.deepcopy(edge_program_manager)
     partitioner = CoreMLPartitioner(
-        skip_ops_for_coreml_delegation=None, compile_specs=compile_specs
+        skip_ops_for_coreml_delegation=["linear.bias", "linear.weight"],
+        compile_specs=compile_specs,
     )
+    breakpoint()
     delegated_program_manager = edge_program_manager.to_backend(partitioner)
+    breakpoint()
     exec_program = delegated_program_manager.to_executorch(
         config=exir.ExecutorchBackendConfig(extract_delegate_segments=True)
     )
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index f3822b6866d..7a0545d4ea4 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -711,6 +711,7 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901
         logging.info("Generated etrecord.bin")
     else:
         builder = builder_exported_to_edge.to_backend(partitioners)
+        breakpoint()
         if args.num_sharding > 0 and args.qnn:
             from executorch.backends.qualcomm.utils.utils import canonicalize_program
 
@@ -825,6 +826,8 @@ def _load_llama_model(
         output_prune_map_path=output_prune_map_path,
         args=args,
     )
+    model(*example_inputs)
+    breakpoint()
     if dtype_override:
         assert isinstance(
             dtype_override, DType
@@ -970,6 +973,7 @@ def _get_source_transforms( # noqa
         else:
             transforms.append(replace_sdpa_with_simple_sdpa)
             transforms.append(replace_kv_cache_with_coreml_kv_cache)
+            transforms.append(replace_causal_mask)
 
     if args.vulkan:
         transforms.append(replace_with_vulkan_rotary_emb)
diff --git a/examples/models/llama/llama_transformer.py b/examples/models/llama/llama_transformer.py
index 76e8730328b..c1784ee89fc 100644
--- a/examples/models/llama/llama_transformer.py
+++ b/examples/models/llama/llama_transformer.py
@@ -326,6 +326,8 @@ def forward(
         bsz, seqlen, _ = x.shape
 
         # QKV
+        # x.shape = [2048]
+        print("lfq shape: ", x.shape)
         q, k, v = self.wq(x), self.wk(x), self.wv(x)
         # We need view_copy elimination
         q = q.view(bsz, seqlen, self.n_local_heads, self.head_dim)
diff --git a/examples/models/toy_model/model.py b/examples/models/toy_model/model.py
index 9ebe42e6621..0ec828072e1 100644
--- a/examples/models/toy_model/model.py
+++ b/examples/models/toy_model/model.py
@@ -27,7 +27,7 @@ def get_example_inputs(self):
 class LinearModule(torch.nn.Module, EagerModelBase):
     def __init__(self):
         super().__init__()
-        self.linear = torch.nn.Linear(3, 3)
+        self.linear = torch.nn.Linear(768, 768)
 
     def forward(self, arg):
         return self.linear(arg)
@@ -36,7 +36,7 @@ def get_eager_model(self) -> torch.nn.Module:
         return self
 
     def get_example_inputs(self):
-        return (torch.randn(3, 3),)
+        return (torch.randn(1, 768),)
 
 
 class AddModule(torch.nn.Module, EagerModelBase):
diff --git a/exir/backend/utils.py b/exir/backend/utils.py
index fb5e16c6bd0..550324d308e 100644
--- a/exir/backend/utils.py
+++ b/exir/backend/utils.py
@@ -380,7 +380,8 @@ def tag_constant_data(edge_program: ExportedProgram) -> None:
                 )
             # tag the data node with the same tag as the last user
             if len(user_tags) > 0:
-                node.meta["delegation_tag"] = user_tags.pop()
+                breakpoint()
+                node.meta["delegation_tag"] = None # user_tags.pop()
 
 
 def tag_mutated_buffer(edge_program: ExportedProgram) -> None:
diff --git a/exir/program/_program.py b/exir/program/_program.py
index b136d6cead9..c9e1d84e88e 100644
--- a/exir/program/_program.py
+++ b/exir/program/_program.py
@@ -1279,6 +1279,7 @@ def to_backend(
             EdgeProgramManager: A copy of the calling EdgeProgramManager with the specified
             subgraphs lowered.
         """
+        breakpoint()
        new_edge_programs: Dict[str, ExportedProgram] = {}
         if isinstance(partitioner, dict):
             for name, program in self._edge_programs.items():
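
The delegation state that the breakpoint() calls in export.py and _program.py pause to inspect can also be dumped non-interactively. Below is a minimal sketch, not part of the patch: it reuses the 768x768 LinearModule dimensions from the toy-model hunk, and it assumes the CoreMLPartitioner import path and the executorch.devtools.backend_debug.get_delegation_info helper are available in this tree.

# Sketch only: print what the partitioner delegated instead of pausing at
# breakpoint(). Import paths and get_delegation_info are assumptions about
# this tree, not part of the patch above.
import torch

import executorch.exir as exir
from executorch.backends.apple.coreml.partition import CoreMLPartitioner
from executorch.devtools.backend_debug import get_delegation_info


class LinearModule(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # Same dims as the toy-model change in this diff.
        self.linear = torch.nn.Linear(768, 768)

    def forward(self, arg):
        return self.linear(arg)


exir_program_aten = torch.export.export(LinearModule().eval(), (torch.randn(1, 768),))
edge_program_manager = exir.to_edge(exir_program_aten)
delegated = edge_program_manager.to_backend(CoreMLPartitioner())
# Tabulates delegated vs. non-delegated ops, so the effect of
# skip_ops_for_coreml_delegation is visible without a debugger.
print(get_delegation_info(delegated.exported_program().graph_module).get_summary())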