
Commit 41d05fe

Qualcomm AI Engine Direct - BC CI Fix and Custom Annotation Fix (#13212)
### Summary
- Fix the BC CI to use the right tokenizer; the stories260K run accidentally used the stories110M tokenizer at runtime.
- Fix custom annotation for the case where a conv node carries a bias node.

### Test plan

1 parent: b914e66

File tree

5 files changed (+21 / -11 lines)


.ci/scripts/test_model.sh

Lines changed: 0 additions & 1 deletion
@@ -201,7 +201,6 @@ test_model_with_qnn() {
     EXPORT_SCRIPT=bert
   elif [[ "${MODEL_NAME}" == "conv_former" ]]; then
     EXPORT_SCRIPT=conv_former
-    EXTRA_FLAGS="--dataset imagenet-mini/val"
   elif [[ "${MODEL_NAME}" == "cvt" ]]; then
     EXPORT_SCRIPT=cvt
   elif [[ "${MODEL_NAME}" == "distilbert" ]]; then

.ci/scripts/test_qnn_static_llama.sh

Lines changed: 3 additions & 3 deletions
@@ -33,12 +33,12 @@ echo "Creating tokenizer.bin"
 $PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin

 set +e
-# Compile only as weight sharing is not applicable on x86
-$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --llama_artifacts . --compile_only
+# Compile only as weight sharing is not applicable on x86.
+$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir ./stories_110m_pte_size --llama_artifacts . --compile_only
 exit_code1=$?

 # Checks accuracy with weight sharing disabled since x86 does not support weight sharing.
-$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --llama_artifacts . --enable_x86_64
+$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./stories_110m_accuracy --llama_artifacts . --enable_x86_64
 exit_code2=$?

 # Check BC

backends/qualcomm/bc/test_qnn_static_llama_bc.sh

Lines changed: 8 additions & 5 deletions
@@ -13,15 +13,18 @@ fi
 which "${PYTHON_EXECUTABLE}"


-llama_artifacts="."
+llama_artifacts="260k_stories"
 PTE_ARTIFACT="examples/qualcomm/oss_scripts/llama/artifacts"

+mkdir ${llama_artifacts}
 # Download stories260K.pt and tokenizer from Github
-curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.pt" --output stories260K.pt
-curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.model" --output tokenizer.model
+curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.pt" --output ${llama_artifacts}/stories260K.pt
+curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.model" --output ${llama_artifacts}/tokenizer.model
+
+$PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t ${llama_artifacts}/tokenizer.model -o ${llama_artifacts}/tokenizer.bin
 # Create params.json file
-touch params.json
-echo '{"dim": 64, "n_layers": 5, "n_heads": 8, "n_kv_heads": 4, "vocab_size": 512, "multiple_of": 4, "max_seq_len": 512}' > params.json
+touch ${llama_artifacts}/params.json
+echo '{"dim": 64, "n_layers": 5, "n_heads": 8, "n_kv_heads": 4, "vocab_size": 512, "multiple_of": 4, "max_seq_len": 512}' > ${llama_artifacts}/params.json

 # Checks e2e accuracy
 expected=$($PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_260k --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --llama_artifacts $llama_artifacts --enable_x86_64 | grep "Model CI result:")
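For reference, a minimal Python sketch of the artifact setup this script now performs. The directory name, downloaded files, and params.json contents are taken from the diff above; expressing it in Python rather than shell is purely illustrative:

```python
# Illustrative restatement of the BC script's artifact setup.
# The 260k_stories layout and params.json values come from the diff above.
import json
from pathlib import Path

llama_artifacts = Path("260k_stories")
llama_artifacts.mkdir(exist_ok=True)  # the script uses plain `mkdir`

# stories260K.pt and tok512.model are downloaded into this directory, and
# tokenizer.model is converted to tokenizer.bin there, so the 260K test can
# no longer pick up a stray 110M tokenizer from the working directory.
params = {
    "dim": 64,
    "n_layers": 5,
    "n_heads": 8,
    "n_kv_heads": 4,
    "vocab_size": 512,
    "multiple_of": 4,
    "max_seq_len": 512,
}
(llama_artifacts / "params.json").write_text(json.dumps(params))
```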

backends/qualcomm/quantizer/custom_annotation.py

Lines changed: 4 additions & 0 deletions
@@ -216,6 +216,10 @@ def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None
     weight = node.args[1]
     input_qspec_map[weight] = quantization_config.weight

+    if len(node.args) > 2 and isinstance(node.args[2], Node):
+        bias = node.args[2]
+        input_qspec_map[bias] = quantization_config.bias(node)
+
     node.meta[Q_ANNOTATION_KEY] = QuantizationAnnotation(
         input_qspec_map=input_qspec_map,
         output_qspec=quantization_config.output_activation,
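To see the fix in context, here is a minimal, self-contained sketch of the annotation routine after this change. The bias branch is taken verbatim from the diff; the surrounding pieces (the input-activation mapping, import paths, the `Q_ANNOTATION_KEY` value, and `_annotated=True`) are reconstructed from context and should be treated as assumptions:

```python
# Sketch of annotate_conv2d after the fix. Assumptions: the import path and
# Q_ANNOTATION_KEY value; quantization_config.bias is a callable that derives
# a per-node bias spec, as `quantization_config.bias(node)` in the diff implies.
from torch.ao.quantization.quantizer import QuantizationAnnotation
from torch.fx import Node

Q_ANNOTATION_KEY = "quantization_annotation"  # assumed key name


def annotate_conv2d(node: Node, quantization_config) -> None:
    input_qspec_map = {}
    input_act = node.args[0]
    input_qspec_map[input_act] = quantization_config.input_activation

    weight = node.args[1]
    input_qspec_map[weight] = quantization_config.weight

    # The fix: conv may carry an optional bias as its third argument. When
    # the bias is a graph Node, it also needs a quantization spec, derived
    # per node (e.g., from the input and weight observers).
    if len(node.args) > 2 and isinstance(node.args[2], Node):
        bias = node.args[2]
        input_qspec_map[bias] = quantization_config.bias(node)

    node.meta[Q_ANNOTATION_KEY] = QuantizationAnnotation(
        input_qspec_map=input_qspec_map,
        output_qspec=quantization_config.output_activation,
        _annotated=True,
    )
```

Before this branch existed, a conv whose bias arrived as a graph Node was annotated without any bias spec, which is the custom-annotation bug the summary refers to.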

examples/qualcomm/oss_scripts/conv_former.py

Lines changed: 6 additions & 2 deletions
@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.

 import json
+import logging
 import os
 import sys
 from multiprocessing.connection import Client
@@ -44,8 +45,11 @@ def main(args):
     )

     data_num = 100
-    if args.compile_only:
+    if args.ci:
         inputs = [(torch.rand(1, 3, 224, 224),)]
+        logging.warning(
+            "This option is for CI to verify the export flow. It uses random input and will result in poor accuracy."
+        )
     else:
         inputs, targets, input_list = get_imagenet_dataset(
             dataset_path=f"{args.dataset}",
@@ -132,7 +136,7 @@ def main(args):
             "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)"
         ),
         type=str,
-        required=True,
+        required=False,
     )

     args = parser.parse_args()
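A condensed sketch of the changed control flow: `--dataset` is no longer required, and the new `--ci` path feeds a random input while warning that accuracy will be poor. Only the pieces shown in the diff are taken from the commit; the parser setup here is abbreviated and the `--ci` action type is an assumption:

```python
# Condensed view of the changed flow in conv_former.py; argparse setup is
# abbreviated, and only the --ci/--dataset handling follows the diff.
import argparse
import logging

import torch

parser = argparse.ArgumentParser()
parser.add_argument("--ci", action="store_true")  # assumed action type
parser.add_argument("--dataset", type=str, required=False)  # was required=True
args = parser.parse_args()

if args.ci:
    # CI-only path: a random input just verifies the export flow.
    inputs = [(torch.rand(1, 3, 224, 224),)]
    logging.warning(
        "This option is for CI to verify the export flow. "
        "It uses random input and will result in poor accuracy."
    )
else:
    # Real evaluation still needs the imagenet-mini dataset path.
    assert args.dataset is not None, "--dataset is required outside of CI"
```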
