@@ -354,6 +354,95 @@ The model, inputs, and output location are passed to `qnn_executorch_runner` by

Please refer to `$EXECUTORCH_ROOT/examples/qualcomm/scripts/` and `$EXECUTORCH_ROOT/examples/qualcomm/oss_scripts/` for the list of supported models.

## How to Support a Custom Model in HTP Backend

### Step-by-Step Implementation Guide

Please refer to [the simple example](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/scripts/export_example.py) and the [more complicated examples](https://github.com/pytorch/executorch/tree/main/examples/qualcomm/scripts) for reference.

#### Step 1: Prepare Your Model
```python
import torch

# Initialize your custom model
model = YourModelClass().eval()  # Your custom PyTorch model

# Create example inputs (adjust shape as needed)
example_inputs = (torch.randn(1, 3, 224, 224),)  # Example input tensor
```
#### Step 2 (Optional): Quantize Your Model
Choose between two quantization approaches, post-training quantization (PTQ) or quantization-aware training (QAT):
```python
from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer
from torch.ao.quantization.quantize_pt2e import prepare_pt2e, prepare_qat_pt2e, convert_pt2e

quantizer = QnnQuantizer()
m = torch.export.export(model, example_inputs, strict=True).module()

# PTQ (Post-Training Quantization)
if quantization_type == "ptq":
    prepared_model = prepare_pt2e(m, quantizer)
    # Calibration loop would go here
    prepared_model(*example_inputs)

# QAT (Quantization-Aware Training)
elif quantization_type == "qat":
    prepared_model = prepare_qat_pt2e(m, quantizer)
    # Training loop would go here
    for _ in range(training_steps):
        prepared_model(*example_inputs)

# Convert to quantized model
quantized_model = convert_pt2e(prepared_model)
```
#### Step 3: Configure Compile Specs
During this step, you will need to specify the target SoC, data type, and other QNN compile spec options.
```python
from executorch.backends.qualcomm.compiler import (
    generate_qnn_executorch_compiler_spec,
    generate_htp_compiler_spec,
)
from executorch.backends.qualcomm.utils.utils import QcomChipset

# HTP Compiler Configuration
backend_options = generate_htp_compiler_spec(
    use_fp16=not quantized,  # False for quantized models
)

# QNN Compiler Spec
compile_spec = generate_qnn_executorch_compiler_spec(
    soc_model=QcomChipset.SM8650,  # Your target SoC
    backend_options=backend_options,
    saver=False,  # Set True to save QNN artifacts
)
```
#### Step 4: Lower and Export the Model
```python
from executorch.backends.qualcomm.partition.qnn_partitioner import (
    to_edge_transform_and_lower_to_qnn,
)
from executorch.exir import ExecutorchBackendConfig

# Lower to QNN backend
delegated_program = to_edge_transform_and_lower_to_qnn(
    quantized_model if quantized else model,
    example_inputs,
    compile_spec
)

# Export to ExecuTorch format
executorch_program = delegated_program.to_executorch(
    config=ExecutorchBackendConfig(extract_delegate_segments=False)
)

# Save the compiled model
model_name = "custom_model_qnn.pte"
with open(model_name, "wb") as f:
    f.write(executorch_program.buffer)
print(f"Model successfully exported to {model_name}")
```
## What is coming?

- Improve the performance for llama3-8B-Instruct and support batch prefill.