
Commit faf1700 (2 parents: ce55ae8 + e0a7a1d)

Update

[ghstack-poisoned]

29 files changed: +370 −178 lines

backends/cadence/aot/memory_planning.py

Lines changed: 8 additions & 5 deletions
@@ -116,6 +116,9 @@ def plan_spec(
         Greedily place the spec in the first memory that can fit it.
         """
         for spec.mem_id in range(1, self.get_num_memories()):
+            if placement_constraints.is_mem_id_in_blocklist(spec, spec.mem_id):
+                # Skip placement for blocked memory id.
+                continue
             prev_offset, smallest_gap = 0, float("inf")
             for allocated_spec in state.allocated_buffers[spec.mem_id]:
                 if not Verifier.lifetime_overlap(spec, allocated_spec):
@@ -141,11 +144,11 @@ def plan_spec(
             )
             if spec.mem_offset is None:
                 spec.mem_offset = prev_offset
-                if not self.is_valid_placement(spec, placement_constraints):
-                    spec.mem_offset = None
-                    continue
-                else:
-                    spec.mem_offset = prev_offset
+
+            if not self.is_valid_placement(spec, placement_constraints):
+                # Skip placement for invalid memory id.
+                spec.mem_offset = None
+                continue
 
             state.place_spec(spec)
             # A data structure used for maintaining the tensor order
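
Note: a self-contained C++ sketch of the greedy first-fit strategy the docstring above describes, with the new blocklist check folded in. All names here are hypothetical stand-ins rather than the ExecuTorch API; lifetimes are simplified to integer intervals and `allocated[mem_id]` is assumed sorted by offset.

#include <algorithm>
#include <cstddef>
#include <optional>
#include <set>
#include <utility>
#include <vector>

struct Spec {
  size_t size;
  int start, end;  // lifetime interval of the buffer, inclusive
};

struct Placed {
  Spec spec;
  size_t offset;
};

static bool lifetime_overlap(const Spec& a, const Spec& b) {
  return a.start <= b.end && b.start <= a.end;
}

// Try each memory in order; skip blocklisted ids; within a memory, take the
// first offset that does not collide with any lifetime-overlapping buffer.
std::optional<std::pair<int, size_t>> plan_spec(
    const Spec& spec,
    const std::vector<size_t>& mem_sizes,               // capacity per mem_id
    const std::vector<std::vector<Placed>>& allocated,  // placements per mem_id
    const std::set<int>& blocklist) {
  for (int mem_id = 1; mem_id < static_cast<int>(mem_sizes.size()); ++mem_id) {
    if (blocklist.count(mem_id)) {
      continue;  // skip placement for blocked memory id
    }
    size_t offset = 0;
    for (const Placed& p : allocated[mem_id]) {
      if (!lifetime_overlap(spec, p.spec)) {
        continue;  // disjoint lifetimes may share addresses
      }
      if (p.offset >= offset + spec.size) {
        break;  // the gap before this buffer is large enough
      }
      offset = std::max(offset, p.offset + p.spec.size);
    }
    if (offset + spec.size <= mem_sizes[mem_id]) {
      return std::make_pair(mem_id, offset);
    }
  }
  return std::nullopt;  // nothing fits; the caller reports a MemoryError
}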

backends/cadence/aot/memory_planning_algo.py

Lines changed: 2 additions & 2 deletions
@@ -204,7 +204,7 @@ def _place_memory_id_pinned_specs(
                 for spec, c in spec_with_abs_constraint.items()
                 if c is not None and c.pinned_memory_id == mem_id and c.offset is None
             }
-            logging.error(f"Placing specs {mem_id_pinned_specs} for {mem_id=}")
+            logging.debug(f"Placing specs {mem_id_pinned_specs} for {mem_id=}")
 
             with self.block_memories_except(mem_id):
                 self.plan(
@@ -220,7 +220,7 @@ def _place_memory_id_pinned_specs(
             if constraint is None:
                 continue
 
-            logging.error(f"Placing spec {spec} with {constraint}")
+            logging.debug(f"Placing spec {spec} with {constraint}")
 
             if not state.is_placed(spec):
                 raise MemoryError(

backends/cadence/aot/tests/test_memory_passes.py

Lines changed: 1 addition & 1 deletion
@@ -1044,7 +1044,7 @@ class DummyMemIdBlockConstraintGen(PassBase):
     mul: blocks 1, 3
     """
 
-    def __init__(self, memory_constraints: MemoryConfig):
+    def __init__(self, memory_constraints: MemConstraints):
        self.memory_constraints = memory_constraints
 
    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:

backends/cadence/fusion_g3/operators/op_clamp.cpp

Lines changed: 3 additions & 2 deletions
@@ -45,6 +45,7 @@ bool is_out_of_bounds(CTYPE_VAL val) {
 }
 
 ET_NODISCARD bool check_bounds(
+    KernelRuntimeContext& ctx,
     const Scalar& val_scalar,
     const ScalarType& val_type,
     const ScalarType& out_type,
@@ -107,14 +108,14 @@ Tensor& clamp_out(
   if (has_min) {
     ET_KERNEL_CHECK(
         ctx,
-        check_bounds(min_opt.value(), min_type, out_type, "minimum"),
+        check_bounds(ctx, min_opt.value(), min_type, out_type, "minimum"),
         InvalidArgument,
         out);
   }
   if (has_max) {
     ET_KERNEL_CHECK(
         ctx,
-        check_bounds(max_opt.value(), max_type, out_type, "maximum"),
+        check_bounds(ctx, max_opt.value(), max_type, out_type, "maximum"),
         InvalidArgument,
         out);
   }
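
Note: passing the kernel runtime context into check_bounds gives the helper a failure channel of its own, presumably so failures can be reported through the context rather than only via the return value. A minimal hypothetical sketch of that pattern; the names below are illustrative only, not the ExecuTorch API.

#include <cstdio>

struct KernelContext {
  bool failed = false;
  void fail(const char* what) {
    failed = true;
    std::fprintf(stderr, "kernel error: %s\n", what);
  }
};

// The context is the first parameter, mirroring the diff above, so the helper
// can record where and why validation failed.
[[nodiscard]] bool check_bounds(
    KernelContext& ctx, double val, double lo, double hi, const char* which) {
  if (val < lo || val > hi) {
    ctx.fail(which);
    return false;
  }
  return true;
}

int main() {
  KernelContext ctx;
  if (!check_bounds(ctx, 300.0, -128.0, 127.0, "minimum")) {
    return 1;  // propagate the failure, as an ET_KERNEL_CHECK-style macro would
  }
  return 0;
}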

examples/models/llama/runner/static_attention_io_manager.h

Lines changed: 10 additions & 0 deletions
@@ -576,6 +576,10 @@ class StaticAttentionIOManager {
     }
   }
 
+  size_t input_pos() const {
+    return input_pos_;
+  }
+
   /**
    * Prefill helper. Run multiple inferences as needed depending on the length
    * of the prompt and method's input length. Returns the position in the output
@@ -586,6 +590,7 @@ class StaticAttentionIOManager {
       executorch::runtime::Span<TokenT> tokens,
       executorch::runtime::Span<TokenT> input_buffer,
       executorch::runtime::Method& method) {
+    ET_LOG(Info, "Prefilling at position %zu", input_pos_);
     size_t input_len = input_buffer.size();
     auto& masks = get_mask(input_buffer.size());
     for (auto& pair : masks) {
@@ -621,6 +626,7 @@ class StaticAttentionIOManager {
       executorch::runtime::Method& method,
       std::function<TokenT(executorch::runtime::Method&)>& sample,
       std::function<bool(TokenT)>& token_callback) {
+    ET_LOG(Info, "Decoding at position %zu", input_pos_);
     set_input(method, 0, input_buffer.data());
     auto& masks = get_mask(input_buffer.size());
     for (auto& pair : masks) {
@@ -661,6 +667,10 @@ class StaticAttentionIOManager {
       size_t window_size,
       size_t n_verifications,
       std::unordered_map<TokenT, SuffixCache<TokenT>> suffix_caches) {
+    ET_LOG(
+        Info,
+        "Decoding with lookahead and verification at position %zu",
+        input_pos_);
     set_input(method, 0, input_buffer.data());
     size_t input_len = input_buffer.size();
 
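Note: the prefill helper's doc comment describes chunked prefill: a prompt longer than the method's fixed input length is run in input_len-sized chunks while the position advances after each inference. A rough free-standing C++ sketch of that loop; the names are hypothetical, not the actual class above.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

using TokenT = int;

// Stand-in for one fixed-length inference; returns the tokens it consumed.
static size_t run_once(const TokenT* tokens, size_t n, size_t pos) {
  std::printf("inference at position %zu over %zu tokens\n", pos, n);
  return n;
}

// Returns the final position after prefilling the whole prompt.
size_t prefill(const std::vector<TokenT>& prompt, size_t input_len, size_t pos) {
  size_t consumed = 0;
  while (consumed < prompt.size()) {
    // Never feed more than the method's fixed input length at once.
    size_t chunk = std::min(input_len, prompt.size() - consumed);
    pos += run_once(prompt.data() + consumed, chunk, pos);
    consumed += chunk;
  }
  return pos;
}
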
exir/tests/test_remove_unused_parameters_pass.py

Lines changed: 1 addition & 1 deletion
@@ -196,7 +196,7 @@ def _test_pass_e2e(
 
         self.assertEqual(1, len(runtime_outputs))
         self.assertTrue(
-            torch.allclose(runtime_outputs[0], eager_outputs, atol=2e-6),
+            torch.allclose(runtime_outputs[0], eager_outputs, atol=1e-5),
             "Values out of tolerance.\n"
             + f" Strict: {strict}, ToEdge: {use_to_edge}, Delegate: {delegate}.\n"
             + f" Eager: {eager_outputs}.\n"
extension/apple/ExecuTorch/Exported/ExecuTorchTensor.mm

Lines changed: 21 additions & 3 deletions
@@ -265,9 +265,15 @@ - (NSString *)description {
   auto const count = _tensor->numel();
   os << "\n count: " << count << ",";
   os << "\n scalars: [";
+  // Create a minimal context for error handling in ET_SWITCH
+  struct {
+    [[noreturn]] void fail(torch::executor::Error /* error */) {
+      ET_CHECK_MSG(false, "Unsupported dtype in description");
+    }
+  } ctx;
   ET_SWITCH_REALHBBF16_TYPES(
       static_cast<ScalarType>(_tensor->scalar_type()),
-      nullptr,
+      ctx,
       "description",
       CTYPE,
       [&] {
@@ -488,9 +494,15 @@ - (instancetype)initWithScalars:(NSArray<NSNumber *> *)scalars
                  "Number of scalars does not match the shape");
   std::vector<uint8_t> data;
   data.resize(count * ExecuTorchSizeOfDataType(dataType));
+  // Create a minimal context for error handling in ET_SWITCH
+  struct {
+    [[noreturn]] void fail(torch::executor::Error /* error */) {
+      ET_CHECK_MSG(false, "Unsupported dtype in initWithScalars");
+    }
+  } ctx;
   for (NSUInteger index = 0; index < count; ++index) {
     ET_SWITCH_REALHBBF16_AND_UINT_TYPES(
-        static_cast<ScalarType>(dataType), nil, "initWithScalars", CTYPE, [&] {
+        static_cast<ScalarType>(dataType), ctx, "initWithScalars", CTYPE, [&] {
          reinterpret_cast<CTYPE *>(data.data())[index] = utils::toType<CTYPE>(scalars[index]);
        }
    );
@@ -801,8 +813,14 @@ + (instancetype)fullTensorWithShape:(NSArray<NSNumber *> *)shape
                            dataType:(ExecuTorchDataType)dataType
                      shapeDynamism:(ExecuTorchShapeDynamism)shapeDynamism {
   Scalar fillValue;
+  // Create a minimal context for error handling in ET_SWITCH
+  struct {
+    [[noreturn]] void fail(torch::executor::Error /* error */) {
+      ET_CHECK_MSG(false, "Unsupported dtype in fullTensor");
+    }
+  } ctx;
   ET_SWITCH_REALHBBF16_AND_UINT_TYPES(
-      static_cast<ScalarType>(dataType), nil, "fullTensor", CTYPE, [&] {
+      static_cast<ScalarType>(dataType), ctx, "fullTensor", CTYPE, [&] {
        fillValue = utils::toType<CTYPE>(scalar);
      }
  );
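
Note: the recurring ctx struct gives the ET_SWITCH macros an object whose fail(Error) method is invoked on an unsupported dtype, replacing the old nullptr/nil placeholder. A self-contained C++ sketch of a dispatcher that delegates failure handling to a caller-supplied context in the same way; the names are hypothetical and this is not the real ET_SWITCH implementation.

#include <cstdio>
#include <cstdlib>

enum class ScalarType { Float, Int, Unsupported };
enum class Error { InvalidType };

// Dispatch on dtype; on an unsupported type, delegate the failure to the
// caller-provided context instead of deciding the policy here.
template <typename Ctx, typename Fn>
void switch_real_types(ScalarType t, Ctx&& ctx, Fn&& fn) {
  switch (t) {
    case ScalarType::Float:
      fn(float{});
      break;
    case ScalarType::Int:
      fn(int{});
      break;
    default:
      ctx.fail(Error::InvalidType);  // caller decides: abort, log, or return
  }
}

int main() {
  // Minimal context mirroring the diff: failure is fatal here, but a kernel
  // runtime context could instead record the error and return to the caller.
  struct {
    [[noreturn]] void fail(Error /* error */) {
      std::fprintf(stderr, "Unsupported dtype\n");
      std::abort();
    }
  } ctx;

  switch_real_types(ScalarType::Float, ctx, [](auto v) {
    std::printf("dispatched on a value of size %zu\n", sizeof(v));
  });
  return 0;
}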

extension/llm/runner/text_decoder_runner.h

Lines changed: 9 additions & 1 deletion
@@ -68,12 +68,20 @@ class ET_EXPERIMENTAL TextDecoderRunner {
       const executorch::aten::Tensor& logits_tensor,
       const float temperature = 0.0f) {
     int32_t result = 0;
+
+    // Create a minimal context for error handling in ET_SWITCH
+    struct {
+      [[noreturn]] void fail(torch::executor::Error /* error */) {
+        ET_CHECK_MSG(false, "Unsupported dtype in logits_to_token");
+      }
+    } ctx;
+
     ET_SWITCH_THREE_TYPES(
         Float,
         Half,
         BFloat16,
         logits_tensor.scalar_type(),
-        unused,
+        ctx,
         "logits_to_token",
         CTYPE,
         [&]() {
extension/tensor/tensor_ptr.h

Lines changed: 9 additions & 1 deletion
@@ -111,7 +111,15 @@ inline TensorPtr make_tensor_ptr(
       runtime::canCast(deduced_type, type),
       "Cannot cast deduced type to specified type.");
   std::vector<uint8_t> casted_data(data.size() * runtime::elementSize(type));
-  ET_SWITCH_REALHBBF16_TYPES(type, nullptr, "make_tensor_ptr", CTYPE, [&] {
+
+  // Create a minimal context for error handling in ET_SWITCH
+  struct {
+    [[noreturn]] void fail(torch::executor::Error /* error */) {
+      ET_CHECK_MSG(false, "Unsupported dtype in make_tensor_ptr");
+    }
+  } ctx;
+
+  ET_SWITCH_REALHBBF16_TYPES(type, ctx, "make_tensor_ptr", CTYPE, [&] {
     std::transform(
         data.begin(),
         data.end(),
extension/tensor/tensor_ptr_maker.cpp

Lines changed: 16 additions & 2 deletions
@@ -89,7 +89,14 @@ TensorPtr random_strided(
       empty_strided(std::move(sizes), std::move(strides), type, dynamism);
   std::default_random_engine gen{std::random_device{}()};
 
-  ET_SWITCH_REALHBBF16_TYPES(type, nullptr, "random_strided", CTYPE, [&] {
+  // Create a minimal context for error handling in ET_SWITCH
+  struct {
+    [[noreturn]] void fail(torch::executor::Error /* error */) {
+      ET_CHECK_MSG(false, "Unsupported dtype in random_strided");
+    }
+  } ctx;
+
+  ET_SWITCH_REALHBBF16_TYPES(type, ctx, "random_strided", CTYPE, [&] {
    std::generate_n(tensor->mutable_data_ptr<CTYPE>(), tensor->numel(), [&]() {
      return static_cast<CTYPE>(distribution(gen));
    });
@@ -124,7 +131,14 @@ TensorPtr full_strided(
     executorch::aten::TensorShapeDynamism dynamism) {
   auto tensor =
       empty_strided(std::move(sizes), std::move(strides), type, dynamism);
-  ET_SWITCH_REALHBBF16_TYPES(type, nullptr, "full_strided", CTYPE, [&] {
+  // Create a minimal context for error handling in ET_SWITCH
+  struct {
+    [[noreturn]] void fail(torch::executor::Error /* error */) {
+      ET_CHECK_MSG(false, "Unsupported data type in full_strided");
+    }
+  } ctx;
+
+  ET_SWITCH_REALHBBF16_TYPES(type, ctx, "full_strided", CTYPE, [&] {
     CTYPE value;
     ET_EXTRACT_SCALAR(fill_value, value);
     std::fill(