
Commit 46f019a

fix llama BC checking
1 parent 371c80a · commit 46f019a

File tree

3 files changed: +10 lines, −10 lines


.ci/scripts/test_qnn_static_llama.sh

Lines changed: 3 additions & 3 deletions
@@ -33,12 +33,12 @@ echo "Creating tokenizer.bin"
 $PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin

 set +e
-# Compile only as weight sharing is not applicable on x86
-$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --llama_artifacts . --compile_only
+# Compile only as weight sharing is not applicable on x86.
+$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir ./stories_110m_pte_size --llama_artifacts . --compile_only
 exit_code1=$?

 # Checks accuracy with weight sharing disabled since x86 does not support weight sharing.
-$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --llama_artifacts . --enable_x86_64
+$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./stories_110m_accuracy --llama_artifacts . --enable_x86_64
 exit_code2=$?

 # Check BC
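
Because the script runs under set +e, a failing invocation does not abort the job; each run's status is captured instead, and the two runs now write to separate artifact directories so their outputs cannot collide. The step that consumes exit_code1 and exit_code2 is outside this hunk; a minimal sketch of what such an aggregation typically looks like, assumed rather than taken from this commit:

# Sketch (assumed, not from this commit): fail the CI step if either the
# compile-only run or the x86 accuracy run exited nonzero.
if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ]; then
    echo "Error: static llama CI test failed"
    exit 1
fi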

backends/qualcomm/bc/test_qnn_static_llama_bc.sh

Lines changed: 7 additions & 5 deletions
@@ -13,15 +13,17 @@ fi
 which "${PYTHON_EXECUTABLE}"


-llama_artifacts="."
+llama_artifacts="260k_stories"
 PTE_ARTIFACT="examples/qualcomm/oss_scripts/llama/artifacts"

+mkdir ${llama_artifacts}
 # Download stories260K.pt and tokenizer from Github
-curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.pt" --output stories260K.pt
-curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.model" --output tokenizer.model
+curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.pt" --output ${llama_artifacts}/stories260K.pt
+curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.model" --output ${llama_artifacts}/tokenizer.model
+$PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t ${llama_artifacts}/tokenizer.model -o ${llama_artifacts}/tokenizer.bin
 # Create params.json file
-touch params.json
-echo '{"dim": 64, "n_layers": 5, "n_heads": 8, "n_kv_heads": 4, "vocab_size": 512, "multiple_of": 4, "max_seq_len": 512}' > params.json
+touch ${llama_artifacts}/params.json
+echo '{"dim": 64, "n_layers": 5, "n_heads": 8, "n_kv_heads": 4, "vocab_size": 512, "multiple_of": 4, "max_seq_len": 512}' > ${llama_artifacts}/params.json

 # Checks e2e accuracy
 expected=$($PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_260k --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --llama_artifacts $llama_artifacts --enable_x86_64 | grep "Model CI result:")
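
With this change, every input the stories-260K run needs (checkpoint, raw and converted tokenizer, and params.json) is staged under a dedicated 260k_stories directory and passed to the test via --llama_artifacts, instead of being scattered in the repo root. A hypothetical pre-flight guard, illustrative only and not part of the commit, could verify the staging before the harness runs:

# Hypothetical guard (not in this commit): confirm all staged artifacts
# exist before invoking test_qnn_delegate.py.
for f in stories260K.pt tokenizer.model tokenizer.bin params.json; do
    if [ ! -f "${llama_artifacts}/${f}" ]; then
        echo "Missing artifact: ${llama_artifacts}/${f}"
        exit 1
    fi
done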

examples/qualcomm/oss_scripts/llama/llama.py

Lines changed: 0 additions & 2 deletions
@@ -362,7 +362,6 @@ def compile(args, pte_filename, tokenizer):
     kv_config.use_kv_cache = True
     kv_config.enable_masked_softmax = args.enable_masked_softmax
     kv_config.enable_r3 = args.r3
-    kv_config.base_model_name_or_path = args.decoder_model

     prefill_config = copy.copy(kv_config)
     prefill_config.use_kv_cache = (
@@ -566,7 +565,6 @@ def permute(w, heads):
         llama_instance_list[i] = SingleLlama(
             llama_instance_list[i].eval(), pte_filename
         )
-
         if args.embedding_quantize:
             llama_instance_list[i].passes_job[I64toI32][
                 QCOM_PASS_ARGS_KWARGS_DEFAULTS_KEY
