Skip to content

Commit 08fe952

Browse files
committed
Update
1 parent baa41c6 commit 08fe952

File tree

4 files changed

+52
-26
lines changed

4 files changed

+52
-26
lines changed

backends/cuda/cuda_backend.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,8 @@ def preprocess(
151151
"aot_inductor.package_constants_in_so": False,
152152
# Store weight constants on disk in a binary blob
153153
"aot_inductor.package_constants_on_disk_format": "binary_blob",
154+
# Avoid issues like 'NoneType' object has no attribute 'reorder_iter_loops'
155+
"loop_ordering_after_fusion": False,
154156
# Enable maximum automatic tuning for optimal performance
155157
"max_autotune": True,
156158
# Use TRITON for GEMM (General Matrix Multiply) operations tuning only to avoid using operators in libtorch

backends/cuda/cuda_partitioner.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
PartitionResult,
1717
)
1818
from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer
19+
from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param
1920
from torch.export.exported_program import ExportedProgram
2021

2122

@@ -56,6 +57,18 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
5657
tag_constant_data(exported_program)
5758
tag_mutated_buffer(exported_program)
5859

60+
# Tag constant placeholders that have no users
61+
# tag_constant_data only tags constants that have users with delegation_tag
62+
# but we need to tag all constants for this partition
63+
for node in exported_program.graph.nodes:
64+
if node.op == "placeholder" and (
65+
is_param(exported_program, node)
66+
or is_buffer(exported_program, node)
67+
or is_lifted_tensor_constant(exported_program, node)
68+
):
69+
if "delegation_tag" not in node.meta:
70+
node.meta["delegation_tag"] = tag
71+
5972
return PartitionResult(
6073
tagged_exported_program=exported_program, partition_tags=partition_tags
6174
)

extension/llm/runner/wav_loader.h

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -168,18 +168,29 @@ inline std::vector<float> load_wav_audio_data(const std::string& fp) {
168168
size_t data_offset = header->dataOffset;
169169
size_t data_size = header->Subchunk2Size;
170170
int bits_per_sample = header->bitsPerSample;
171+
int audio_format = header->AudioFormat;
171172

172173
std::vector<float> audio_data;
173174

174175
if (bits_per_sample == 32) {
175176
size_t num_samples = data_size / 4;
176177
audio_data.resize(num_samples);
177-
const int32_t* input_buffer =
178-
reinterpret_cast<const int32_t*>(data + data_offset);
179178

180-
for (size_t i = 0; i < num_samples; ++i) {
181-
audio_data[i] = static_cast<float>(
182-
static_cast<double>(input_buffer[i]) * kOneOverIntMax);
179+
if (audio_format == 3) {
180+
// IEEE float format - read directly as floats
181+
const float* input_buffer =
182+
reinterpret_cast<const float*>(data + data_offset);
183+
for (size_t i = 0; i < num_samples; ++i) {
184+
audio_data[i] = input_buffer[i];
185+
}
186+
} else {
187+
// PCM integer format - normalize from int32
188+
const int32_t* input_buffer =
189+
reinterpret_cast<const int32_t*>(data + data_offset);
190+
for (size_t i = 0; i < num_samples; ++i) {
191+
audio_data[i] = static_cast<float>(
192+
static_cast<double>(input_buffer[i]) * kOneOverIntMax);
193+
}
183194
}
184195
} else if (bits_per_sample == 16) {
185196
size_t num_samples = data_size / 2;

extension/tensor/tensor_ptr.cpp

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -79,27 +79,27 @@ TensorPtr make_tensor_ptr(
7979
});
8080
}
8181
}
82-
std::vector<executorch::aten::StridesType> computed_strides(dim);
83-
84-
auto error = runtime::dim_order_to_stride(
85-
sizes.data(), dim_order.data(), dim, computed_strides.data());
86-
ET_CHECK_MSG(error == runtime::Error::Ok, "Failed to compute strides.");
87-
88-
if (!strides.empty()) {
89-
for (size_t i = 0; i < dim; i++) {
90-
ET_CHECK_MSG(
91-
strides[i] == computed_strides[i] || sizes[i] == 1,
92-
"invalid strides for dim %zu: %" ET_PRI_SIZES_AND_STRIDES
93-
"!= %" ET_PRI_SIZES_AND_STRIDES
94-
" while its size is %" ET_PRI_SIZES_AND_STRIDES " != 1",
95-
i,
96-
strides[i],
97-
computed_strides[i],
98-
sizes[i]);
99-
}
100-
}
101-
102-
strides = std::move(computed_strides);
82+
// std::vector<executorch::aten::StridesType> computed_strides(dim);
83+
84+
// auto error = runtime::dim_order_to_stride(
85+
// sizes.data(), dim_order.data(), dim, computed_strides.data());
86+
// ET_CHECK_MSG(error == runtime::Error::Ok, "Failed to compute strides.");
87+
88+
// if (!strides.empty()) {
89+
// for (size_t i = 0; i < dim; i++) {
90+
// ET_CHECK_MSG(
91+
// strides[i] == computed_strides[i] || sizes[i] == 1,
92+
// "invalid strides for dim %zu: %" ET_PRI_SIZES_AND_STRIDES
93+
// "!= %" ET_PRI_SIZES_AND_STRIDES
94+
// " while its size is %" ET_PRI_SIZES_AND_STRIDES " != 1",
95+
// i,
96+
// strides[i],
97+
// computed_strides[i],
98+
// sizes[i]);
99+
// }
100+
// }
101+
102+
// strides = std::move(computed_strides);
103103

104104
#ifndef USE_ATEN_LIB
105105
executorch::aten::TensorImpl tensor_impl(

0 commit comments

Comments (0)