
Commit b922f24

add native script
1 parent 49e6d98 commit b922f24

File tree

9 files changed: +750 −85 lines changed

backends/qualcomm/builders/op_slice_copy.py

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ def define_node(
         if start < 0:
             start = start % input_tensor.shape[dim]

-        if len(node.args) > 3:
+        if len(node.args) > 3 and node.args[3] is not None:
             end = min(cast(int, node.args[3]), input_tensor.shape[dim])
             if end < 0:
                 end = end % input_tensor.shape[dim]
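
Note: the extra guard matters because `aten.slice_copy` declares `end` as an optional argument, so an exported graph can carry an explicit `None` in `args[3]`, which the old `cast`/`min` path could not handle. A minimal sketch of the resolution logic this change implies (our paraphrase with a hypothetical `resolve_end` helper, not the backend's exact code; the fallback to the full dim size is assumed from the surrounding context):

def resolve_end(args, shape, dim):
    # Assumed default: a missing or explicit-None end means "slice to the
    # end of this dim". The pre-fix code would pass None into min() here.
    end = shape[dim]
    if len(args) > 3 and args[3] is not None:
        end = min(int(args[3]), shape[dim])
        if end < 0:
            end = end % shape[dim]
    return end

assert resolve_end((None, 0, 1), (8,), 0) == 8        # end omitted
assert resolve_end((None, 0, 1, None), (8,), 0) == 8  # end is None
assert resolve_end((None, 0, 1, -2), (8,), 0) == 6    # negative end wraps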

backends/qualcomm/qnn_preprocess.py

Lines changed: 28 additions & 20 deletions
@@ -78,10 +78,7 @@ def _build_op_wrappers(
                 )
                 assert node.target == context_loader_target, err_msg
                 # if graph has context binary loader node, return directly
-                return PreprocessResult(
-                    processed_bytes=node.meta[OpContextLoader.meta_ctx_bin],
-                    debug_handle_map={},
-                )
+                return node.meta[OpContextLoader.meta_ctx_bin]
             except:
                 raise RuntimeError(err_msg)

@@ -161,30 +158,41 @@ def preprocess_multimethod(
                 generate_qnn_executorch_option(compile_spec)
             )
             qnn_manager.Init()
-            py_op_wrapper_list = []
+            py_op_wrapper_list, ctx_binary_list = [], []
             for j, programs in enumerate(edge_programs.values()):
                 logger.info(f"Processing Method({j}): ({i+1}/{num_sub_graphs})")
                 py_op_wrappers = QnnBackend._build_op_wrappers(
                     programs[i],
                     qnn_manager.IsTensorDump(),
                     option.op_package_options.op_package_infos,
                 )
-                py_op_wrapper_list.append(
-                    [py_op_wrapper.GetOpWrapper() for py_op_wrapper in py_op_wrappers]
-                )
+                if isinstance(py_op_wrappers, bytes):
+                    ctx_binary_list.append(py_op_wrappers)
+                else:
+                    py_op_wrapper_list.append(
+                        [py_op_wrapper.GetOpWrapper() for py_op_wrapper in py_op_wrappers]
+                    )

-            qnn_context_binary = qnn_manager.Compile(graph_name, py_op_wrapper_list)
-            assert (
-                len(qnn_context_binary) != 0
-            ), "Failed to generate Qnn context binary."
-            qnn_manager.Destroy()
-            # methods should share the same context binary for current partition
-            for key in edge_programs.keys():
-                all_processed_results[key].append(
-                    PreprocessResult(
-                        processed_bytes=bytes(qnn_context_binary),
-                        debug_handle_map={},
+            if len(py_op_wrapper_list) == len(edge_programs.values()):
+                qnn_context_binary = qnn_manager.Compile(graph_name, py_op_wrapper_list)
+                assert (
+                    len(qnn_context_binary) != 0
+                ), "Failed to generate Qnn context binary."
+                qnn_manager.Destroy()
+                # methods should share the same context binary for current partition
+                for key in edge_programs.keys():
+                    all_processed_results[key].append(
+                        PreprocessResult(
+                            processed_bytes=bytes(qnn_context_binary),
+                            debug_handle_map={},
+                        )
                     )
-                )
+            elif len(ctx_binary_list) == len(edge_programs.values()):
+                for i, key in enumerate(edge_programs.keys()):
+                    all_processed_results[key].append(
+                        PreprocessResult(processed_bytes=ctx_binary_list[i])
+                    )
+            else:
+                raise RuntimeError("Hybrid compilation is not supported")

        return all_processed_results
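
Note: `_build_op_wrappers` now has two possible return types: raw `bytes` when the graph is just a preloaded QNN context binary, and a list of op wrappers otherwise. `preprocess_multimethod` then requires the whole partition to be one kind or the other. A small sketch of that all-or-nothing dispatch (a paraphrase with a hypothetical `classify` helper, not the file's code):

def classify(results_per_method):
    # bytes -> prebuilt QNN context binary; anything else -> op wrappers
    # that still need qnn_manager.Compile(...).
    n = len(results_per_method)
    wrappers = [r for r in results_per_method if not isinstance(r, bytes)]
    binaries = [r for r in results_per_method if isinstance(r, bytes)]
    if len(wrappers) == n:
        return "compile"          # share one compiled binary across methods
    if len(binaries) == n:
        return "reuse_binaries"   # one PreprocessResult per prebuilt binary
    raise RuntimeError("Hybrid compilation is not supported")

assert classify([b"ctx_a", b"ctx_b"]) == "reuse_binaries"
assert classify([["op1"], ["op2"]]) == "compile"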

benchmark.py

Lines changed: 5 additions & 5 deletions
@@ -52,7 +52,7 @@ def get_build_dir(backend):
 
 runner = {
     "qnn": f"{get_build_dir(backend)}/examples/qualcomm/executor_runner/qnn_executor_runner",
-    "xnn": f"{get_build_dir(backend)}/backends/xnnpack/xnn_executor_runner",
+    "xnn": f"{get_build_dir(backend)}/executor_runner",
 }
 artifacts = {
     "qnn": [

@@ -110,8 +110,8 @@ def get_cmds(backend, pte_path, iteration):
         " ".join(
             [
                 f"cd {workspace} &&",
-                "chmod +x ./xnn_executor_runner &&",
-                f"./xnn_executor_runner {' '.join(cmd_args[backend])}",
+                "chmod +x ./executor_runner &&",
+                f"./executor_runner {' '.join(cmd_args[backend])}",
             ]
         )
     ),

@@ -134,9 +134,9 @@ def get_cmds(backend, pte_path, iteration):
         " ".join(
             [
                 f"cd {workspace} &&",
-                "chmod +x ./xnn_executor_runner &&",
+                "chmod +x ./executor_runner &&",
                 f"chmod +x {memory_script_file} &&",
-                f"./{memory_script_file} ./xnn_executor_runner {' '.join(cmd_args[backend])}",
+                f"./{memory_script_file} ./executor_runner {' '.join(cmd_args[backend])}",
             ]
         )
     ),

build_xnnpack.sh

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@ cmake \
     -DANDROID_ABI='arm64-v8a' \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_BUILD_XNNPACK=ON \
     -DEXECUTORCH_ENABLE_LOGGING=ON \

examples/portable/executor_runner/executor_runner.cpp

Lines changed: 133 additions & 42 deletions
@@ -52,6 +52,11 @@ DEFINE_string(
     "model.pte",
     "Model serialized in flatbuffer format.");
 DEFINE_uint32(num_executions, 1, "Number of times to run the model.");
+DEFINE_string(input_list_path, "input_list.txt", "Model input list path.");
+DEFINE_string(
+    output_folder_path,
+    "outputs",
+    "Executorch inference data output path.");
 #ifdef ET_EVENT_TRACER_ENABLED
 DEFINE_string(etdump_path, "model.etdump", "Write ETDump data to this path.");
 #endif // ET_EVENT_TRACER_ENABLED

@@ -271,57 +276,143 @@ int main(int argc, char** argv) {
   // because inputs whose space gets reused by memory planning (if
   // any such inputs exist) will not be preserved for the next
   // execution.
-
-  ET_CHECK_MSG(
+  std::ifstream input_list(FLAGS_input_list_path);
+  if (input_list.is_open()) {
+    size_t num_inputs = method->inputs_size();
+    ET_LOG(Info, "Number of inputs: %zu", num_inputs);
+
+    auto split = [](std::string s, std::string delimiter) {
+      size_t pos_start = 0, pos_end, delim_len = delimiter.length();
+      std::string token;
+      std::vector<std::string> res;
+
+      while ((pos_end = s.find(delimiter, pos_start)) != std::string::npos) {
+        token = s.substr(pos_start, pos_end - pos_start);
+        pos_start = pos_end + delim_len;
+        res.push_back(token);
+      }
+      res.push_back(s.substr(pos_start));
+      return res;
+    };
+
+    std::string file_path;
+    int inference_index = 0;
+    double elapsed_time = 0;
+    while (std::getline(input_list, file_path)) {
+      auto input_files = split(file_path, " ");
+      if (input_files.size() == 0) {
+        break;
+      }
+      ET_CHECK_MSG(
+          input_files.size() == num_inputs,
+          "Number of inputs (%zu) mismatch with input files (%zu)",
+          num_inputs,
+          input_files.size());
+
+      std::vector<std::vector<char>> input_buf(num_inputs);
+      for (int input_index = 0; input_index < num_inputs; ++input_index) {
+        MethodMeta method_meta = method->method_meta();
+        Result<executorch::runtime::TensorInfo> tensor_meta =
+            method_meta.input_tensor_meta(input_index);
+
+        std::ifstream fin(input_files[input_index], std::ios::binary);
+        fin.seekg(0, fin.end);
+        size_t file_size = fin.tellg();
+
+        input_buf[input_index].resize(file_size);
+        fin.seekg(0, fin.beg);
+        fin.read(
+            static_cast<char*>(input_buf[input_index].data()),
+            file_size);
+        fin.close();
+
+        ET_CHECK_MSG(
+            file_size == tensor_meta->nbytes(),
+            "Input(%d) size mismatch. file bytes: %zu, tensor bytes: %zu",
+            input_index,
+            file_size,
+            tensor_meta->nbytes());
+
+        auto impl = executorch::aten::TensorImpl(
+            tensor_meta->scalar_type(),
+            /*dim=*/tensor_meta->sizes().size(),
+            const_cast<executorch::aten::TensorImpl::SizesType*>(tensor_meta->sizes().data()),
+            input_buf[input_index].data(),
+            const_cast<executorch::aten::TensorImpl::DimOrderType*>(
+                tensor_meta->dim_order().data()));
+        Error ret = method->set_input(executorch::aten::Tensor(&impl), input_index);
+        ET_CHECK_MSG(
+            ret == Error::Ok, "Failed to set input tensor: %d", (int)ret);
+      }
+      Error status = method->execute();
+      std::vector<EValue> outputs(method->outputs_size());
+      status = method->get_outputs(outputs.data(), method->outputs_size());
+      ET_CHECK(status == Error::Ok);
+      for (size_t output_index = 0; output_index < method->outputs_size();
+           output_index++) {
+        auto output_tensor = outputs[output_index].toTensor();
+        size_t nbytes = output_tensor.nbytes();
+        auto output_file_name = FLAGS_output_folder_path + "/output_" +
+            std::to_string(inference_index) + "_" +
+            std::to_string(output_index) + ".raw";
+        std::ofstream fout(output_file_name.c_str(), std::ios::binary);
+        fout.write(output_tensor.const_data_ptr<char>(), nbytes);
+        fout.close();
+      }
+      ++inference_index;
+    }
+  } else {
+    ET_CHECK_MSG(
       inputs.ok(),
       "Could not prepare inputs: 0x%" PRIx32,
       (uint32_t)inputs.error());
-  ET_LOG(Debug, "Inputs prepared.");
-  auto before_exec = std::chrono::high_resolution_clock::now();
-  Error status = method->execute();
-  auto after_exec = std::chrono::high_resolution_clock::now();
-  double interval_1st_infs =
-      std::chrono::duration_cast<std::chrono::microseconds>(
-          after_exec - before_exec)
-          .count() /
-      1000.0;
-  ET_CHECK_MSG(
-      status == Error::Ok,
-      "Execution of method %s failed with status 0x%" PRIx32,
-      method_name,
-      (uint32_t)status);
-
-  // Run the model.
-  before_exec = std::chrono::high_resolution_clock::now();
-  for (uint32_t i = 0; i < FLAGS_num_executions; i++) {
-    status = method->execute();
+    ET_LOG(Debug, "Inputs prepared.");
+
+    auto before_exec = std::chrono::high_resolution_clock::now();
+    Error status = method->execute();
+    auto after_exec = std::chrono::high_resolution_clock::now();
+    double interval_1st_infs =
+        std::chrono::duration_cast<std::chrono::microseconds>(
+            after_exec - before_exec)
+            .count() /
+        1000.0;
     ET_CHECK_MSG(
         status == Error::Ok,
         "Execution of method %s failed with status 0x%" PRIx32,
         method_name,
         (uint32_t)status);
-  }
-  after_exec = std::chrono::high_resolution_clock::now();
-  double interval_infs = std::chrono::duration_cast<std::chrono::microseconds>(
-                             after_exec - before_exec)
-                             .count() /
-      1000.0 / FLAGS_num_executions;
-
-  if (FLAGS_dump_statistics) {
-    auto output_file_name = "statistics.txt";
-    std::ofstream fout(output_file_name);
-    fout << "load: " + std::to_string(interval_load)
-         << "\n1st: " + std::to_string(interval_1st_infs)
-         << "\navg: " + std::to_string(interval_infs) << std::endl;
-    fout.close();
-  }
-  ET_LOG(Info, "Model executed successfully.");

-  if (tracer.get_event_tracer()) {
-    // Dump ETDump data containing profiling/debugging data to file specified in
-    // command line flag.
-    status = tracer.write_etdump_to_file();
-    ET_CHECK_MSG(status == Error::Ok, "Failed to save ETDump file.");
+    // Run the model.
+    before_exec = std::chrono::high_resolution_clock::now();
+    for (uint32_t i = 0; i < FLAGS_num_executions; i++) {
+      status = method->execute();
+      ET_CHECK_MSG(
+          status == Error::Ok,
+          "Execution of method %s failed with status 0x%" PRIx32,
+          method_name,
+          (uint32_t)status);
+    }
+    after_exec = std::chrono::high_resolution_clock::now();
+    double interval_infs = std::chrono::duration_cast<std::chrono::microseconds>(
+                               after_exec - before_exec)
+                               .count() /
+        1000.0 / FLAGS_num_executions;
+
+    if (FLAGS_dump_statistics) {
+      auto output_file_name = "statistics.txt";
+      std::ofstream fout(output_file_name);
+      fout << "load: " + std::to_string(interval_load)
+           << "\n1st: " + std::to_string(interval_1st_infs)
+           << "\navg: " + std::to_string(interval_infs) << std::endl;
+      fout.close();
+    }
+    ET_LOG(Info, "Model executed successfully.");
+    if (tracer.get_event_tracer()) {
+      // Dump ETDump data containing profiling/debugging data to file specified in
+      // command line flag.
+      status = tracer.write_etdump_to_file();
+      ET_CHECK_MSG(status == Error::Ok, "Failed to save ETDump file.");
+    }
   }
 
   return 0;
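
Note: the runner now reads `--input_list_path` line by line; each line holds one space-separated `.raw` file path per model input, each file's byte size must equal the corresponding input tensor's `nbytes()`, and outputs are written to `--output_folder_path` as `output_<inference>_<output>.raw`. A sketch of how such inputs might be produced on the host (a hypothetical helper, not part of this commit; assumes contiguous CPU tensors and native-endian raw bytes, file names are illustrative):

import torch

# One tuple per inference; one tensor per model input.
samples = [(torch.randn(1, 3, 224, 224),)]

with open("input_list.txt", "w") as f:
    for i, sample in enumerate(samples):
        names = []
        for j, t in enumerate(sample):
            name = f"input_{i}_{j}.raw"
            # Raw bytes on disk; size must match the input's nbytes().
            t.contiguous().numpy().tofile(name)
            names.append(name)
        # One line per inference, space-separated paths in input order.
        f.write(" ".join(names) + "\n")

The runner would then be invoked with something like `./executor_runner --model_path model.pte --input_list_path input_list.txt --output_folder_path outputs`; note the output folder must already exist, since the `std::ofstream` in the loop does not create directories.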

examples/qualcomm/oss_scripts/albert.py

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ def main(args):
     data_size = 100
 
     model_name = "albert/albert-base-v2"
-    tokenizer = AutoTokenizer.from_pretrained(model_name, hidden_act="gelu")
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
 
     if args.ci:
         random_ids = torch.randint(low=0, high=100, size=(1, 100), dtype=torch.int32)
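
Note: the dropped kwarg appears to have been inert: `hidden_act` is a field of the ALBERT model config, not a tokenizer option, and `AutoTokenizer.from_pretrained` does not apply it to tokenization. If an activation override were ever wanted, it would land on the config instead, roughly (illustrative, not part of this commit):

from transformers import AlbertConfig

# Config kwargs override fields of the loaded config; this is where a
# "gelu" activation override would actually take effect.
config = AlbertConfig.from_pretrained("albert/albert-base-v2", hidden_act="gelu")
print(config.hidden_act)  # "gelu"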

examples/qualcomm/oss_scripts/dit.py

Lines changed: 25 additions & 10 deletions
@@ -30,28 +30,43 @@
 
 def get_rvlcdip_dataset(data_size):
     from datasets import load_dataset
+    from torch.utils.data import Dataset
 
-    dataset = load_dataset("nielsr/rvl_cdip_10_examples_per_class", split="train")
-    processor = AutoImageProcessor.from_pretrained(
-        "microsoft/dit-base-finetuned-rvlcdip"
-    )
+    def get_data_loader():
+        class DitDataset(Dataset):
+            def __init__(self, data_size) -> None:
+                self.data_size = data_size
+                self.dataset = self._get_dataset()
+                self.processor = AutoImageProcessor.from_pretrained("microsoft/dit-base-finetuned-rvlcdip")
+
+            def _get_dataset(self):
+                dataset = list(load_dataset("nielsr/rvl_cdip_10_examples_per_class", split="test"))
+                return dataset
+
+            def __getitem__(self, idx):
+                return (
+                    self.processor(images=self.dataset[idx]["image"].convert("RGB"), return_tensors="pt"),
+                    self.dataset[idx]["label"]
+                )
+
+            def __len__(self):
+                return len(self.dataset)
+
+        dataset = DitDataset(data_size)
+        torch.manual_seed(3407)
+        return torch.utils.data.DataLoader(dataset, batch_size=None, shuffle=True)
 
     # prepare input data
     inputs, targets, input_list = [], [], ""
-    for index, data in enumerate(dataset):
+    for index, (feature, target) in enumerate(get_data_loader()):
         if index >= data_size:
             break
-        feature, target = (
-            processor(images=data["image"].convert("RGB"), return_tensors="pt"),
-            data["label"],
-        )
         inputs.append((feature["pixel_values"],))
         targets.append(torch.tensor(target))
         input_list += f"input_{index}_0.raw\n"
 
     return inputs, targets, input_list
 
-
 def main(args):
     skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args)
