Qualcomm AI Engine Direct - context dump utility (#7931)

haowhsu-quic · web-flow · commit 1ec88a6140f3 · 2025-02-07T11:42:44.000-08:00
summary:
- utility for dumping compiled binaries (QNN context_binary / QCIR)
- test cases for StripProtocol and dump_context_from_pte (QNN context binary and QCIR payloads)
diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp
@@ -49,7 +49,8 @@ PYBIND11_MODULE(PyQnnManagerAdaptor, m) {
       .def("GetSpillFillBufferSize", &PyQnnManager::GetSpillFillBufferSize)
       .def(
           "MakeBinaryInfo",
-          py::overload_cast<const py::bytes&>(&PyQnnManager::MakeBinaryInfo));
+          py::overload_cast<const py::bytes&>(&PyQnnManager::MakeBinaryInfo))
+      .def("StripProtocol", &PyQnnManager::StripProtocol);
 }
 } // namespace qnn
 } // namespace backends
diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h
@@ -390,6 +390,41 @@ class PyQnnManager {
     return result;
   }
 
+  py::array_t<char> StripProtocol(const py::bytes& preprocessed_binary) {
+    py::buffer_info info(py::buffer(preprocessed_binary).request());
+    // Returns a copy of the payload located by the protocol deserializers.
+    void* buf_ptr = nullptr;
+    size_t buf_size = 0;
+    // first, check if it's a qnn context binary wrapped in the custom protocol
+    auto [status, signature, ctx_size, ctx_bin] =
+        QnnContextCustomProtocol().DeserializeContextCustomBuffer(info.ptr);
+
+    if (status == Error::Ok) {
+      buf_size = ctx_size;
+      buf_ptr = ctx_bin;
+    } else {
+      // otherwise check if it's qcir; return only the fbs part if matched
+      auto
+          [status, // shadows the outer status within this branch
+           qcir_fbs_size,
+           qcir_tensor_size,
+           qcir_fbs_ptr,
+           qcir_tensor_ptr] =
+              QnnQcirCustomProtocol().DeserializeQcirCustomBuffer(info.ptr);
+      if (status == Error::Ok) {
+        buf_size = qcir_fbs_size;
+        buf_ptr = qcir_fbs_ptr;
+      } else {
+        // no protocol matched, the format should be DLC: return an empty array
+        return py::array_t<char>(0);
+      }
+    }
+    auto result = py::array_t<char>(buf_size);
+    auto result_buffer = result.request();
+    std::memcpy(result_buffer.ptr, buf_ptr, buf_size);
+    return result;
+  }
+
  private:
   // Store the bytes object instead of a raw pointer so that this module will
   // keep the bytes alive.
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -20,6 +20,8 @@
     QuantDtype,
     TestQNN,
     to_backend,
+    validate_context_binary,
+    validate_qcir,
 )
 from executorch.backends.qualcomm.utils.constants import (
     QCOM_ANNOTATION,
@@ -30,10 +32,12 @@
 
 from executorch.backends.qualcomm.utils.utils import (
     capture_program,
+    dump_context_from_pte,
     from_context_binary,
     generate_htp_compiler_spec,
     generate_multi_graph_program,
     generate_qnn_executorch_compiler_spec,
+    PyQnnManagerAdaptor,
     skip_annotation,
     update_spill_fill_size,
 )
@@ -2041,6 +2045,81 @@ def test_qnn_backend_context_direct(self):
                 bundle_program["edge_program_manager"].to_executorch(),
             )
 
+    def test_qnn_backend_context_extraction(self):
+        from executorch.exir import EdgeCompileConfig, EdgeProgramManager
+        # Lower SimpleModel per compiler spec and validate what StripProtocol returns.
+        module = SimpleModel()  # noqa: F405
+        sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
+        backend_options = generate_htp_compiler_spec(use_fp16=True)
+        compiler_specs = [
+            self.compiler_specs,
+            generate_qnn_executorch_compiler_spec(
+                soc_model=self.chipset_table[TestQNN.model],
+                backend_options=backend_options,
+                online_prepare=True,
+            ),
+        ]
+        validators = [validate_context_binary, validate_qcir]
+        # paired by position: self.compiler_specs -> ctx binary, online_prepare -> qcir
+        for compiler_spec, validate in zip(compiler_specs, validators):
+            edge_prog_mgr = EdgeProgramManager(
+                edge_programs={
+                    "forward": capture_program(module, sample_input).exported_program
+                },
+                compile_config=EdgeCompileConfig(_use_edge_ops=False),
+            ).to_backend(QnnPartitioner(compiler_spec))
+            lowered_module = edge_prog_mgr.exported_program().graph_module._modules[
+                "lowered_module_0"
+            ]
+            qnn_mgr = PyQnnManagerAdaptor.QnnManager(
+                lowered_module.compile_specs[0].value
+            )
+            qnn_mgr.Init()
+            binary = qnn_mgr.StripProtocol(lowered_module.processed_bytes)
+            validate(binary)  # asserts the external tool emitted its JSON dump
+
+    def test_qnn_backend_dump_context_from_pte(self):
+        from executorch.exir import EdgeCompileConfig, EdgeProgramManager
+        # Write a .pte per compiler spec, dump its payload, and validate the dump.
+        module = SimpleModel()  # noqa: F405
+        sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
+        backend_options = generate_htp_compiler_spec(use_fp16=True)
+        compiler_specs = [
+            self.compiler_specs,
+            generate_qnn_executorch_compiler_spec(
+                soc_model=self.chipset_table[TestQNN.model],
+                backend_options=backend_options,
+                online_prepare=True,
+            ),
+        ]
+        validators = [validate_context_binary, validate_qcir]
+        # paired by position: self.compiler_specs -> ctx binary, online_prepare -> qcir
+        for compiler_spec, validate in zip(compiler_specs, validators):
+            edge_prog_mgr = (
+                EdgeProgramManager(
+                    edge_programs={
+                        "forward": capture_program(
+                            module, sample_input
+                        ).exported_program
+                    },
+                    compile_config=EdgeCompileConfig(_use_edge_ops=False),
+                )
+                .to_backend(QnnPartitioner(compiler_spec))
+                .to_executorch()
+            )
+
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                pte_path = f"{tmp_dir}/model.pte"
+                with open(pte_path, "wb") as f:
+                    edge_prog_mgr.write_to_file(f)
+                # dumped file name is "<plan name>_<delegate index>.bin"
+                dump_context_from_pte(pte_path)
+                binary_name = f"{tmp_dir}/forward_0.bin"
+                self.assertTrue(os.path.isfile(binary_name))
+                with open(binary_name, "rb") as f:
+                    stripped_binary = f.read()
+                    validate(stripped_binary)  # asserts the tool emitted its JSON dump
+
     def test_qnn_backend_draw_graph(self):
         golden_data = """digraph test {
             rankdir=TB
@@ -2433,7 +2512,7 @@ def test_qnn_backend_multi_graphs(self):
             for module, sample_input in zip(modules, sample_inputs)
         ]
         backend_options = generate_htp_compiler_spec(
-            use_fp16=True,
+            use_fp16=False,
         )
         compiler_specs = [
             generate_qnn_executorch_compiler_spec(
@@ -2532,6 +2611,83 @@ def test_qnn_backend_context_direct(self):
                 bundle_program["edge_program_manager"].to_executorch(),
             )
 
+    def test_qnn_backend_context_extraction(self):
+        from executorch.exir import EdgeCompileConfig, EdgeProgramManager
+        # Lower the QDQ SimpleModel per spec and validate what StripProtocol returns.
+        module = SimpleModel()  # noqa: F405
+        sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
+        module = self.get_qdq_module(module, sample_input)
+        backend_options = generate_htp_compiler_spec(use_fp16=False)
+        compiler_specs = [
+            self.compiler_specs,
+            generate_qnn_executorch_compiler_spec(
+                soc_model=self.chipset_table[TestQNN.model],
+                backend_options=backend_options,
+                online_prepare=True,
+            ),
+        ]
+        validators = [validate_context_binary, validate_qcir]
+        # paired by position: self.compiler_specs -> ctx binary, online_prepare -> qcir
+        for compiler_spec, validate in zip(compiler_specs, validators):
+            edge_prog_mgr = EdgeProgramManager(
+                edge_programs={
+                    "forward": capture_program(module, sample_input).exported_program
+                },
+                compile_config=EdgeCompileConfig(_use_edge_ops=False),
+            ).to_backend(QnnPartitioner(compiler_spec))
+            lowered_module = edge_prog_mgr.exported_program().graph_module._modules[
+                "lowered_module_0"
+            ]
+            qnn_mgr = PyQnnManagerAdaptor.QnnManager(
+                lowered_module.compile_specs[0].value
+            )
+            qnn_mgr.Init()
+            binary = qnn_mgr.StripProtocol(lowered_module.processed_bytes)
+            validate(binary)  # asserts the external tool emitted its JSON dump
+
+    def test_qnn_backend_dump_context_from_pte(self):
+        from executorch.exir import EdgeCompileConfig, EdgeProgramManager
+
+        module = SimpleModel()  # noqa: F405
+        sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
+        module = self.get_qdq_module(module, sample_input)
+        backend_options = generate_htp_compiler_spec(use_fp16=True)
+        compiler_specs = [
+            self.compiler_specs,
+            generate_qnn_executorch_compiler_spec(
+                soc_model=self.chipset_table[TestQNN.model],
+                backend_options=backend_options,
+                online_prepare=True,
+            ),
+        ]
+        validators = [validate_context_binary, validate_qcir]
+
+        for compiler_spec, validate in zip(compiler_specs, validators):
+            edge_prog_mgr = (
+                EdgeProgramManager(
+                    edge_programs={
+                        "forward": capture_program(
+                            module, sample_input
+                        ).exported_program
+                    },
+                    compile_config=EdgeCompileConfig(_use_edge_ops=False),
+                )
+                .to_backend(QnnPartitioner(compiler_spec))
+                .to_executorch()
+            )
+
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                pte_path = f"{tmp_dir}/model.pte"
+                with open(pte_path, "wb") as f:
+                    edge_prog_mgr.write_to_file(f)
+
+                dump_context_from_pte(pte_path)
+                binary_name = f"{tmp_dir}/forward_0.bin"
+                self.assertTrue(os.path.isfile(binary_name))
+                with open(binary_name, "rb") as f:
+                    stripped_binary = f.read()
+                    validate(stripped_binary)
+
     def test_qnn_backend_draw_graph(self):
         golden_data = """digraph test {
             rankdir=TB
diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py
@@ -108,6 +108,57 @@ def generate_context_binary(
     assert os.path.isfile(f"{artifact_dir}/model_ctx.bin"), print(result.stderr)
 
 
+def validate_context_binary(ctx_bin: bytes):
+    qnn_sdk = os.environ.get("QNN_SDK_ROOT", None)
+    assert qnn_sdk, "QNN_SDK_ROOT was not found in environment variable"
+
+    # flow of qnn tools
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        with open(f"{tmp_dir}/ctx.bin", "wb") as binary_file:
+            binary_file.write(ctx_bin)
+
+        target = "x86_64-linux-clang"
+        cmds = [
+            # qnn-context-binary-utility
+            f"{qnn_sdk}/bin/{target}/qnn-context-binary-utility",
+            "--context_binary",
+            f"{tmp_dir}/ctx.bin",
+            "--json_file",
+            f"{tmp_dir}/ctx.json",
+        ]
+        result = subprocess.run(
+            " ".join(cmds),
+            shell=True,
+            executable="/bin/bash",
+            capture_output=True,
+        )
+        assert os.path.isfile(f"{tmp_dir}/ctx.json"), print(result.stderr)
+
+
+def validate_qcir(qcir: bytes):
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        with open(f"{tmp_dir}/qcir.bin", "wb") as binary_file:
+            binary_file.write(qcir)
+
+        cmds = [
+            "flatc",
+            "-o",
+            tmp_dir,
+            "--raw-binary",
+            "-t",
+            f"{os.path.dirname(__file__)}/../aot/ir/qcir.fbs",
+            "--",
+            f"{tmp_dir}/qcir.bin",
+        ]
+        result = subprocess.run(
+            " ".join(cmds),
+            shell=True,
+            executable="/bin/bash",
+            capture_output=True,
+        )
+        assert os.path.isfile(f"{tmp_dir}/qcir.json"), print(result.stderr)
+
+
 class TestQNN(unittest.TestCase):
     rtol: float = 0
     atol: float = 0
diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py
@@ -218,6 +218,44 @@ def replace_linear(module: torch.nn.Module):
     return replace_linear(module)
 
 
+def dump_context_from_pte(pte_path):
+    """
+    Dump compiled binaries under the same directory of pte_path.
+    For partitioned graph, there will be multiple files with names f"{graph_name}_{index}".
+    Where 'graph_name' comes from the compiler_specs and 'index' represents the execution order.
+
+    Args:
+        pte_path (str): The path of generated pte.
+    """
+    import os
+
+    from executorch.exir._serialize._program import deserialize_pte_binary
+
+    with open(pte_path, "rb") as f:
+        program_data = f.read()
+
+    program = deserialize_pte_binary(program_data)
+
+    ctx_path = os.path.dirname(pte_path)
+    dummy_compiler_specs = generate_qnn_executorch_compiler_spec(
+        soc_model=QcomChipset.SM8650,
+        backend_options=generate_htp_compiler_spec(use_fp16=False),
+    )
+    qnn_mgr = PyQnnManagerAdaptor.QnnManager(
+        generate_qnn_executorch_option(dummy_compiler_specs)
+    )
+    qnn_mgr.Init()
+    for execution_plan in program.execution_plan:
+        for i, delegate in enumerate(execution_plan.delegates):
+            if delegate.id == "QnnBackend":
+                processed_bytes = program.backend_delegate_data[
+                    delegate.processed.index
+                ].data
+                binary = qnn_mgr.StripProtocol(processed_bytes)
+                with open(f"{ctx_path}/{execution_plan.name}_{i}.bin", "wb") as f:
+                    f.write(binary)
+
+
 def update_spill_fill_size(
     exported_program: ExportedProgram | List[LoweredBackendModule],
 ):