
Commit 6b95ce7

Update
[ghstack-poisoned]
2 parents 35c56cc + 0856f59 commit 6b95ce7

File tree: 10 files changed, +160 -36 lines

.ci/scripts/unittest-buck2.sh

Lines changed: 4 additions & 3 deletions
@@ -11,9 +11,10 @@ set -eux
 # TODO: can't query //kernels/prim_ops because of non-buckified stuff in OSS.
 buck2 query "//backends/apple/... + //backends/example/... + \
 //backends/mediatek/... + //backends/transforms/... + \
-//backends/xnnpack/... + //configurations/... + //kernels/aten/... + \
-//kernels/optimized/... + //kernels/portable/... + //kernels/quantized/... + \
-//kernels/test/... + //runtime/... + //schema/... + //test/... + //util/..."
+//backends/xnnpack/... + //configurations/... + //extension/flat_tensor: + \
+//kernels/aten/... + //kernels/optimized/... + //kernels/portable/... + \
+//kernels/quantized/... + //kernels/test/... + //runtime/... + //schema/... \
++ //test/... + //util/..."
 
 # TODO: optimized ops are unbuildable because they now use ATen; put
 # them back after we can use PyTorch in OSS buck.

.github/workflows/add-unanswered-to-project.yml

Lines changed: 3 additions & 3 deletions
@@ -1,10 +1,10 @@
 name: Add Open External Contributor PRs and Issues to PyTorch Org Project 136
 
 on:
-  # schedule:
-  #   - cron: '0 * * * *'
   workflow_dispatch:
-
+  pull_request:
+    paths:
+      .github/workflows/add-unanswered-to-project.yml
 jobs:
   add_to_project:
     runs-on: ubuntu-latest

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -20,6 +20,7 @@ dist/
 ethos-u-scratch/
 executorch.egg-info
 pip-out/
+build-profiling/
 
 # Any exported models and profiling outputs
 *.bin
@@ -60,6 +61,7 @@ xcuserdata/
 /share/
 /version.py
 *.csv
+*_etdump
 
 # Android
 *.aar

CMakeLists.txt

Lines changed: 3 additions & 3 deletions
@@ -817,9 +817,9 @@ if(EXECUTORCH_BUILD_PYBIND)
     list(APPEND _dep_libs openvino_backend)
   endif()
 
-  if(EXECUTORCH_BUILD_QNN)
-    list(APPEND _dep_libs qnn_executorch_backend)
-  endif()
+  if(EXECUTORCH_BUILD_QNN)
+    list(APPEND _dep_libs qnn_executorch_backend)
+  endif()
 
   if(EXECUTORCH_BUILD_XNNPACK)
     # need to explicitly specify XNNPACK and xnnpack-microkernels-prod here

backends/cadence/hifi/operators/op_softmax.cpp

Lines changed: 32 additions & 4 deletions
@@ -72,7 +72,6 @@ Tensor& _softmax_out(
   if (optimized) {
     int* p_inp = (int*)in.const_data_ptr<float>();
    int* out_data = (int*)out.mutable_data_ptr<float>();
-
     int num_inp_dims = in.dim();
     int num_out_dims = num_inp_dims;
 
@@ -99,6 +98,37 @@ Tensor& _softmax_out(
 
     outer_stride = size;
 
+    WORD32 ret_val = 0;
+
+    // Check if the input is permuted. If not, then we don't need to transpose
+    bool is_permuted = false;
+    for (int i = 0; i < num_inp_dims; i++) {
+      if (p_permute_vec[i] != i) {
+        is_permuted = true;
+        break;
+      }
+    }
+
+    if (!is_permuted) {
+      const float* p_inpf = in.const_data_ptr<float>();
+      float* out_dataf = out.mutable_data_ptr<float>();
+
+      for (size_t outer_idx = 0; outer_idx < outer_size; ++outer_idx) {
+        size_t outer = outer_idx * outer_stride;
+        for (size_t inner_idx = 0; inner_idx < stride; ++inner_idx) {
+          size_t base = outer + inner_idx;
+
+          float* p_in_data = (float*)&p_inpf[base];
+          float* p_out_data = (float*)&out_dataf[base];
+
+          ret_val = xa_nn_vec_softmax_f32_f32(p_out_data, p_in_data, size);
+
+          ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
+        }
+      }
+      return out;
+    }
+
     int* p_out =
         (int*)kernels::allocate_temp_memory(ctx, out.numel() * sizeof(int));
 
@@ -109,7 +139,7 @@ Tensor& _softmax_out(
 
     ET_KERNEL_CHECK(ctx, p_out1 != nullptr, MemoryAllocationFailed, out);
 
-    WORD32 ret_val = xa_nn_transpose_32_32(
+    ret_val = xa_nn_transpose_32_32(
         p_out,
         p_out_shape,
         p_inp,
@@ -142,9 +172,7 @@ Tensor& _softmax_out(
         p_permute_vec,
         num_out_dims,
         num_inp_dims);
-
     ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
-
     return out;
   }
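The fast path added above rests on a simple observation: when p_permute_vec is the identity permutation, the softmax dimension is already the innermost one, so xa_nn_vec_softmax_f32_f32 can run directly on each contiguous slice and the transpose/transpose-back round trip (and its temporary buffers) can be skipped. A minimal NumPy sketch of that reasoning, using illustrative names rather than the kernel's actual API:

    import numpy as np

    def softmax_last_dim(x):
        # Numerically stable softmax over the innermost axis, standing in for
        # xa_nn_vec_softmax_f32_f32 applied to one contiguous slice at a time.
        e = np.exp(x - x.max(axis=-1, keepdims=True))
        return e / e.sum(axis=-1, keepdims=True)

    x = np.random.rand(2, 3, 4).astype(np.float32)
    perm = [0, 1, 2]  # identity permutation: the softmax dim is already innermost

    # General path: permute, softmax over the last axis, permute back.
    general = softmax_last_dim(x.transpose(perm)).transpose(np.argsort(perm))
    # Fast path taken by the kernel when perm is the identity: no transpose at all.
    fast = softmax_last_dim(x)

    assert np.allclose(general, fast)  # same result, so the transpose can be skipped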

devtools/scripts/generate_profiling_csv.py

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# Copyright 2024-25 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+
+from executorch.devtools import Inspector
+
+
+def generate_csv(etdump_path, output):
+    """
+    Generate a CSV file from ETDump profiling data.
+
+    Args:
+        etdump_path (str): Path to the ETDump file generated by executor_runner
+        output (str): Path for the output CSV file
+    """
+    inspector = Inspector(etdump_path)
+    df = inspector.to_dataframe()
+    df.to_csv(output)
+
+
+def main():
+    """
+    Main function to parse command line arguments and generate profiling CSV.
+
+    Usage:
+        python generate_profiling_csv.py --etdump_path="my_etdump" --output="profiling.csv"
+
+    Example:
+        python generate_profiling_csv.py --etdump_path="llama3_etdump" --output="op_profiling.csv"
+    """
+    parser = argparse.ArgumentParser(
+        description="Generate profiling CSV from a model's etdump"
+    )
+    parser.add_argument(
+        "--etdump_path",
+        type=str,
+        default="./model.etdump",
+        help="Path to the etdump file",
+        required=False,
+    )
+
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="./model_profiling.csv",
+        help="Path to the output CSV file",
+        required=False,
+    )
+
+    args = parser.parse_args()
+    print(f"Generating CSV from {args.etdump_path}")
+    generate_csv(args.etdump_path, args.output)
+    print(f"Saved CSV to {args.output}")
+
+
+if __name__ == "__main__":
+    main()
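A natural follow-up is to load the generated CSV with pandas and rank operators by latency. A minimal sketch, assuming illustrative column names ("event_name", "avg") rather than the exact schema Inspector.to_dataframe() emits; inspect df.columns on a real dump first:

    import pandas as pd

    df = pd.read_csv("op_profiling.csv")
    print(df.columns.tolist())  # confirm which name/timing columns are actually present

    # Hypothetical column names; adjust to whatever the printed columns show.
    if {"event_name", "avg"}.issubset(df.columns):
        slowest = df.sort_values("avg", ascending=False).head(10)
        print(slowest[["event_name", "avg"]])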

devtools/scripts/profile_model.sh

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# Copyright 2024-25 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+#!/bin/bash
+
+# ExecutorTorch Model Profiling Script
+#
+# This script automates the process of building executor_runner with profiling enabled,
+# running model inference with ETDump collection, and generating CSV profiling reports.
+#
+# Usage:
+#   ./devtools/scripts/profile_model.sh [model_path] [etdump_path]
+#
+# Arguments:
+#   model_path  - Path to the .pte model file (default: "my_model")
+#   etdump_path - Path for ETDump output file (default: "path_to_et_dump")
+#
+# Examples:
+#   ./devtools/scripts/profile_model.sh
+#   ./devtools/scripts/profile_model.sh llama3.pte llama3_etdump
+#
+# Note: This script must be run from the top-level executorch directory.
+
+set -e
+
+echo "Building executor_runner with profiling enabled..."
+
+cmake --preset profiling -B build-profiling -DCMAKE_BUILD_TYPE=Release
+cmake --build build-profiling --target executor_runner
+
+echo "Build completed successfully!"
+
+MODEL_PATH=${1:-"my_model"}
+ETDUMP_PATH=${2:-"path_to_et_dump"}
+
+echo "Running and profiling model: $MODEL_PATH"
+echo "ETDump output path: $ETDUMP_PATH"
+
+./build-profiling/executor_runner --model_path="$MODEL_PATH" --etdump_path="$ETDUMP_PATH"
+
+echo "Profiling run completed!"
+
+echo "Generating profiling CSV..."
+python devtools/scripts/generate_profiling_csv.py --etdump_path="$ETDUMP_PATH" --output="op_profiling.csv"
+
+echo "Profiling CSV generated: op_profiling.csv"
+echo "Profiling workflow completed successfully!"

extension/flat_tensor/targets.bzl

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
-load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime")
 
 def define_common_targets():
-    for aten_mode in [True, False]:
+    for aten_mode in get_aten_mode_options():
         aten_suffix = "_aten" if aten_mode else ""
         runtime.cxx_library(
             name = "flat_tensor_data_map" + aten_suffix,

shim_et/xplat/executorch/build/build_variables.bzl

Lines changed: 1 addition & 19 deletions
@@ -290,11 +290,6 @@ QUANTIZED_KERNELS_SRCS = [
     "kernels/quantized/cpu/op_quantize.cpp",
 ]
 
-PROGRAM_SCHEMA_SRCS = [
-    "schema/program.fbs",
-    "schema/scalar_type.fbs",
-]
-
 OPTIMIZED_CPUBLAS_SRCS = [
     "kernels/optimized/blas/BlasKernel.cpp",
     "kernels/optimized/blas/CPUBlas.cpp",
@@ -375,27 +370,14 @@ THREADPOOL_SRCS = [
 EXTENSION_THREADPOOL_SRCS = ["extension/threadpool/" + x for x in THREADPOOL_SRCS]
 
 EXTENSION_TRAINING_SRCS = [
-    "extension/data_loader/file_data_loader.cpp",
-    "extension/data_loader/mmap_data_loader.cpp",
-    "extension/flat_tensor/flat_tensor_data_map.cpp",
-    "extension/flat_tensor/serialize/flat_tensor_header.cpp",
-    "extension/module/module.cpp",
     "extension/training/module/training_module.cpp",
     "extension/training/optimizer/sgd.cpp",
 ]
 
 TRAIN_XOR_SRCS = [
-    "extension/data_loader/file_data_loader.cpp",
-    "extension/data_loader/mmap_data_loader.cpp",
-    "extension/flat_tensor/flat_tensor_data_map.cpp",
-    "extension/flat_tensor/serialize/flat_tensor_header.cpp",
+    # REVIEW: removing this breaks the build; where is it supposed to come from?
     "extension/flat_tensor/serialize/serialize.cpp",
-    "extension/module/module.cpp",
-    "extension/tensor/tensor_ptr.cpp",
-    "extension/tensor/tensor_ptr_maker.cpp",
     "extension/training/examples/XOR/train.cpp",
-    "extension/training/module/training_module.cpp",
-    "extension/training/optimizer/sgd.cpp",
 ]
 
 EXECUTOR_RUNNER_SRCS = [

tools/cmake/Codegen.cmake

Lines changed: 0 additions & 2 deletions
@@ -386,7 +386,6 @@ function(executorch_load_build_variables)
     KERNELS_UTIL_ALL_DEPS_SRCS
     OPTIMIZED_KERNELS_SRCS
     QUANTIZED_KERNELS_SRCS
-    PROGRAM_SCHEMA_SRCS
     OPTIMIZED_CPUBLAS_SRCS
     OPTIMIZED_NATIVE_CPU_OPS_SRCS
     TEST_BACKEND_COMPILER_LIB_SRCS
@@ -419,7 +418,6 @@ function(executorch_load_build_variables)
     _kernels_util_all_deps__srcs
     _optimized_kernels__srcs
     _quantized_kernels__srcs
-    _program_schema__srcs
     _optimized_cpublas__srcs
     _optimized_native_cpu_ops__srcs
     _test_backend_compiler_lib__srcs
