
Commit 41d05fe

Qualcomm AI Engine Direct - BC CI Fix and Custom Annotation Fix (#13212)
### Summary
- Fix the BC CI to use the right tokenizer; the stories260K run accidentally used the stories110M tokenizer at runtime.
- Fix custom annotation for the case where a conv node carries a bias node.

### Test plan

1 parent: b914e66

File tree

5 files changed (+21 / -11 lines)


.ci/scripts/test_model.sh

Lines changed: 0 additions & 1 deletion
@@ -201,7 +201,6 @@ test_model_with_qnn() {
     EXPORT_SCRIPT=bert
   elif [[ "${MODEL_NAME}" == "conv_former" ]]; then
     EXPORT_SCRIPT=conv_former
-    EXTRA_FLAGS="--dataset imagenet-mini/val"
   elif [[ "${MODEL_NAME}" == "cvt" ]]; then
     EXPORT_SCRIPT=cvt
   elif [[ "${MODEL_NAME}" == "distilbert" ]]; then

.ci/scripts/test_qnn_static_llama.sh

Lines changed: 3 additions & 3 deletions
@@ -33,12 +33,12 @@ echo "Creating tokenizer.bin"
 $PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin

 set +e
-# Compile only as weight sharing is not applicable on x86
-$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --llama_artifacts . --compile_only
+# Compile only as weight sharing is not applicable on x86.
+$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir ./stories_110m_pte_size --llama_artifacts . --compile_only
 exit_code1=$?

 # Checks accuracy with weight sharing disabled since x86 does not support weight sharing.
-$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --llama_artifacts . --enable_x86_64
+$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./stories_110m_accuracy --llama_artifacts . --enable_x86_64
 exit_code2=$?

 # Check BC

backends/qualcomm/bc/test_qnn_static_llama_bc.sh

Lines changed: 8 additions & 5 deletions
@@ -13,15 +13,18 @@ fi
 which "${PYTHON_EXECUTABLE}"


-llama_artifacts="."
+llama_artifacts="260k_stories"
 PTE_ARTIFACT="examples/qualcomm/oss_scripts/llama/artifacts"

+mkdir ${llama_artifacts}
 # Download stories260K.pt and tokenizer from Github
-curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.pt" --output stories260K.pt
-curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.model" --output tokenizer.model
+curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.pt" --output ${llama_artifacts}/stories260K.pt
+curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.model" --output ${llama_artifacts}/tokenizer.model
+
+$PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t ${llama_artifacts}/tokenizer.model -o ${llama_artifacts}/tokenizer.bin
 # Create params.json file
-touch params.json
-echo '{"dim": 64, "n_layers": 5, "n_heads": 8, "n_kv_heads": 4, "vocab_size": 512, "multiple_of": 4, "max_seq_len": 512}' > params.json
+touch ${llama_artifacts}/params.json
+echo '{"dim": 64, "n_layers": 5, "n_heads": 8, "n_kv_heads": 4, "vocab_size": 512, "multiple_of": 4, "max_seq_len": 512}' > ${llama_artifacts}/params.json

 # Checks e2e accuracy
 expected=$($PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_260k --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --llama_artifacts $llama_artifacts --enable_x86_64 | grep "Model CI result:")
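For reference, a minimal Python sketch of the artifact setup this script now performs. The directory name, downloaded files, and params.json contents are taken from the diff above; expressing it in Python rather than shell is purely illustrative:

```python
# Illustrative restatement of the BC script's artifact setup.
# The 260k_stories layout and params.json values come from the diff above.
import json
from pathlib import Path

llama_artifacts = Path("260k_stories")
llama_artifacts.mkdir(exist_ok=True)  # the script uses plain `mkdir`

# stories260K.pt and tok512.model are downloaded into this directory, and
# tokenizer.model is converted to tokenizer.bin there, so the 260K test can
# no longer pick up a stray 110M tokenizer from the working directory.
params = {
    "dim": 64,
    "n_layers": 5,
    "n_heads": 8,
    "n_kv_heads": 4,
    "vocab_size": 512,
    "multiple_of": 4,
    "max_seq_len": 512,
}
(llama_artifacts / "params.json").write_text(json.dumps(params))
```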

backends/qualcomm/quantizer/custom_annotation.py

Lines changed: 4 additions & 0 deletions
@@ -216,6 +216,10 @@ def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None
     weight = node.args[1]
     input_qspec_map[weight] = quantization_config.weight

+    if len(node.args) > 2 and isinstance(node.args[2], Node):
+        bias = node.args[2]
+        input_qspec_map[bias] = quantization_config.bias(node)
+
     node.meta[Q_ANNOTATION_KEY] = QuantizationAnnotation(
         input_qspec_map=input_qspec_map,
         output_qspec=quantization_config.output_activation,
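To see the fix in context, here is a minimal, self-contained sketch of the annotation routine after this change. The bias branch is taken verbatim from the diff; the surrounding pieces (the input-activation mapping, import paths, the `Q_ANNOTATION_KEY` value, and `_annotated=True`) are reconstructed from context and should be treated as assumptions:

```python
# Sketch of annotate_conv2d after the fix. Assumptions: the import path and
# Q_ANNOTATION_KEY value; quantization_config.bias is a callable that derives
# a per-node bias spec, as `quantization_config.bias(node)` in the diff implies.
from torch.ao.quantization.quantizer import QuantizationAnnotation
from torch.fx import Node

Q_ANNOTATION_KEY = "quantization_annotation"  # assumed key name


def annotate_conv2d(node: Node, quantization_config) -> None:
    input_qspec_map = {}
    input_act = node.args[0]
    input_qspec_map[input_act] = quantization_config.input_activation

    weight = node.args[1]
    input_qspec_map[weight] = quantization_config.weight

    # The fix: conv may carry an optional bias as its third argument. When
    # the bias is a graph Node, it also needs a quantization spec, derived
    # per node (e.g., from the input and weight observers).
    if len(node.args) > 2 and isinstance(node.args[2], Node):
        bias = node.args[2]
        input_qspec_map[bias] = quantization_config.bias(node)

    node.meta[Q_ANNOTATION_KEY] = QuantizationAnnotation(
        input_qspec_map=input_qspec_map,
        output_qspec=quantization_config.output_activation,
        _annotated=True,
    )
```

Before this branch existed, a conv whose bias arrived as a graph Node was annotated without any bias spec, which is the custom-annotation bug the summary refers to.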

examples/qualcomm/oss_scripts/conv_former.py

Lines changed: 6 additions & 2 deletions
@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.

 import json
+import logging
 import os
 import sys
 from multiprocessing.connection import Client
@@ -44,8 +45,11 @@ def main(args):
     )

     data_num = 100
-    if args.compile_only:
+    if args.ci:
         inputs = [(torch.rand(1, 3, 224, 224),)]
+        logging.warning(
+            "This option is for CI to verify the export flow. It uses random input and will result in poor accuracy."
+        )
     else:
         inputs, targets, input_list = get_imagenet_dataset(
             dataset_path=f"{args.dataset}",
@@ -132,7 +136,7 @@ def main(args):
             "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)"
         ),
         type=str,
-        required=True,
+        required=False,
     )

     args = parser.parse_args()
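A condensed sketch of the changed control flow: `--dataset` is no longer required, and the new `--ci` path feeds a random input while warning that accuracy will be poor. Only the pieces shown in the diff are taken from the commit; the parser setup here is abbreviated and the `--ci` action type is an assumption:

```python
# Condensed view of the changed flow in conv_former.py; argparse setup is
# abbreviated, and only the --ci/--dataset handling follows the diff.
import argparse
import logging

import torch

parser = argparse.ArgumentParser()
parser.add_argument("--ci", action="store_true")  # assumed action type
parser.add_argument("--dataset", type=str, required=False)  # was required=True
args = parser.parse_args()

if args.ci:
    # CI-only path: a random input just verifies the export flow.
    inputs = [(torch.rand(1, 3, 224, 224),)]
    logging.warning(
        "This option is for CI to verify the export flow. "
        "It uses random input and will result in poor accuracy."
    )
else:
    # Real evaluation still needs the imagenet-mini dataset path.
    assert args.dataset is not None, "--dataset is required outside of CI"
```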
