Skip to content

Add Llama3.1 1B HTP to benchmark #444

Add Llama3.1 1B HTP to benchmark

Add Llama3.1 1B HTP to benchmark #444

Workflow file for this run

name: android-perf
on:
schedule:
- cron: 0 0 * * *
pull_request:
paths:
- .github/workflows/android-perf.yml
- extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2
push:
branches:
- main
paths:
- .github/workflows/android-perf.yml
- extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2
# Note: GitHub has an upper limit of 10 inputs
workflow_dispatch:
inputs:
models:
description: Models to be benchmarked
required: false
type: string
default: stories110M
devices:
description: Target devices to run benchmark
required: false
type: string
default: samsung_galaxy_s22
benchmark_configs:
description: The list of configs used the benchmark
required: false
type: string
workflow_call:
inputs:
models:
description: Models to be benchmarked
required: false
type: string
default: stories110M
devices:
description: Target devices to run benchmark
required: false
type: string
default: samsung_galaxy_s22
benchmark_configs:
description: The list of configs used the benchmark
required: false
type: string
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
jobs:
set-parameters:
runs-on: ubuntu-22.04
outputs:
benchmark_configs: ${{ steps.set-parameters.outputs.benchmark_configs }}
steps:
- uses: actions/checkout@v3
with:
submodules: 'false'
- uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Set parameters
id: set-parameters
shell: bash
env:
# Separate default values from the workflow dispatch. To ensure defaults are accessible
# during scheduled runs and to provide flexibility for different defaults between
# on-demand and periodic benchmarking.
CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' || 'llama' }}
CRON_DEFAULT_DEVICES: samsung_galaxy_s22
run: |
set -eux
MODELS="${{ inputs.models }}"
if [ -z "$MODELS" ]; then
MODELS="$CRON_DEFAULT_MODELS"
fi
DEVICES="${{ inputs.devices }}"
if [ -z "$DEVICES" ]; then
DEVICES="$CRON_DEFAULT_DEVICES"
fi
PYTHONPATH="${PWD}" python .ci/scripts/gather_benchmark_configs.py \
--os "android" \
--models $MODELS \
--devices $DEVICES
prepare-test-specs:
runs-on: linux.2xlarge
needs: set-parameters
strategy:
matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
fail-fast: false
steps:
- uses: actions/checkout@v3
- name: Prepare the spec
shell: bash
working-directory: extension/benchmark/android/benchmark
run: |
set -eux
# The model will be exported in the next step to this S3 path
MODEL_PATH="https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/model.zip"
# We could write a script to properly use jinja here, but there is only one variable,
# so let's just sed it
sed -i -e 's,{{ model_path }},'"${MODEL_PATH}"',g' android-llm-device-farm-test-spec.yml.j2
cp android-llm-device-farm-test-spec.yml.j2 android-llm-device-farm-test-spec.yml
# Just print the test spec for debugging
cat android-llm-device-farm-test-spec.yml
- name: Upload the spec
uses: seemethere/upload-artifact-s3@v5
with:
s3-bucket: gha-artifacts
s3-prefix: |
${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}
retention-days: 1
if-no-files-found: error
path: extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml
export-models:
name: export-models
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
needs: set-parameters
secrets: inherit
strategy:
matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
fail-fast: false
with:
runner: linux.2xlarge.memory
docker-image: executorch-ubuntu-22.04-qnn-sdk
submodules: 'true'
timeout: 120
upload-artifact: android-models
upload-artifact-to-s3: true
secrets-env: EXECUTORCH_HF_TOKEN
script: |
# The generic Linux job chooses to use base env, not the one setup by the image
echo "::group::Setting up dev environment"
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"
if [[ ${{ matrix.config }} == *"qnn"* ]]; then
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
fi
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
# Install requirements for export_llama
PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
pip install -U "huggingface_hub[cli]"
huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
pip install accelerate sentencepiece
pip list
ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.config }}
echo "::endgroup::"
echo "::group::Exporting ${{ matrix.config }} model: ${{ matrix.model }}"
BUILD_MODE="cmake"
if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then
# HuggingFace model. Assume the pattern is always like "<org>/<repo>"
HF_MODEL_REPO=${{ matrix.model }}
OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}"
if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
# Llama models on Hugging Face
if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then
# SpinQuant
# Download prequantized chceckpoint from Hugging Face
DOWNLOADED_PATH=$(
bash .ci/scripts/download_hf_hub.sh \
--model_id "${HF_MODEL_REPO}" \
--files "tokenizer.model" "params.json" "consolidated.00.pth"
)
# Export using ExecuTorch's model definition
python -m examples.models.llama.export_llama \
--model "llama3_2" \
--checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
--params "${DOWNLOADED_PATH}/params.json" \
--use_sdpa_with_kv_cache \
-X \
--xnnpack-extended-ops \
--preq_mode 8da4w_output_8da8w \
--preq_group_size 32 \
--max_seq_length 2048 \
--output_name "${OUT_ET_MODEL_NAME}.pte" \
-kv \
-d fp32 \
--preq_embedding_quantize 8,0 \
--use_spin_quant native \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
# QAT + LoRA
# Download prequantized chceckpoint from Hugging Face
DOWNLOADED_PATH=$(
bash .ci/scripts/download_hf_hub.sh \
--model_id "${HF_MODEL_REPO}" \
--files "tokenizer.model" "params.json" "consolidated.00.pth"
)
# Export using ExecuTorch's model definition
python -m examples.models.llama.export_llama \
--model "llama3_2" \
--checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
--params "${DOWNLOADED_PATH}/params.json" \
-qat \
-lora 16 \
--preq_mode 8da4w_output_8da8w \
--preq_group_size 32 \
--preq_embedding_quantize 8,0 \
--use_sdpa_with_kv_cache \
-kv \
-X \
--xnnpack-extended-ops \
-d fp32 \
--max_seq_length 2048 \
--output_name "${OUT_ET_MODEL_NAME}.pte" \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
# Original BF16 version, without any quantization
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
python -m examples.models.llama.export_llama \
--model "llama3_2" \
--checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
--params "${DOWNLOADED_PATH}/params.json" \
-kv \
--use_sdpa_with_kv_cache \
-X \
-d bf16 \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
--output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728
echo "QNN_SDK_ROOT=${QNN_SDK_ROOT}"
export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}"
export PYTHONPATH=$(pwd)/..
echo "PYTHONPATH=${PYTHONPATH}"
python -m examples.qualcomm.oss_scripts.llama3_2.llama \
--checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
--params "${DOWNLOADED_PATH}/params.json" \
--tokenizer_model "${DOWNLOADED_PATH}/tokenizer.model" \
--compile_only \
--ptq 16a4w \
-m SM8650 \
--model_size 1B \
--model_mode kv \
-b "cmake-out" \
--prompt "Once"
elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728
echo "QNN_SDK_ROOT=${QNN_SDK_ROOT}"
export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}"
export PYTHONPATH=$(pwd)/..
echo "PYTHONPATH=${PYTHONPATH}"
python -m examples.qualcomm.oss_scripts.llama3_2.llama \
--checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
--params "${DOWNLOADED_PATH}/params.json" \
--tokenizer_model "${DOWNLOADED_PATH}/tokenizer.model" \
--compile_only \
--ptq 16a4w \
-m SM8650 \
--model_size 1B \
--model_mode kv \
-b "cmake-out" \
--prompt "Once"
OUT_ET_MODEL_NAME="llama3_2_qnn" # Qualcomm hard-coded it in their script
find . -name "${OUT_ET_MODEL_NAME}.pte" -not -path "./${OUT_ET_MODEL_NAME}.pte" -exec mv {} ./ \;
ls -lh "${OUT_ET_MODEL_NAME}.pte"
else
# By default, test with the Hugging Face model and the xnnpack recipe
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
fi
else
echo "Unsupported model ${{ matrix.model }}"
exit 1
fi
zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
ls -lh model.zip
mkdir -p "${ARTIFACTS_DIR_NAME}"
mv model.zip "${ARTIFACTS_DIR_NAME}"
elif [[ ${{ matrix.model }} == "llama" ]]; then
# Install requirements for export_llama
PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
# Test llama2
if [[ ${{ matrix.config }} == *"xnnpack"* ]]; then
DELEGATE_CONFIG="xnnpack+custom+qe"
elif [[ ${{ matrix.config }} == *"qnn"* ]]; then
DELEGATE_CONFIG="qnn"
else
echo "Unsupported delegate ${{ matrix.config }}"
exit 1
fi
DTYPE="fp32"
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh \
-model "${{ matrix.model }}" \
-build_tool "${BUILD_MODE}" \
-dtype "${DTYPE}" \
-mode "${DELEGATE_CONFIG}" \
-upload "${ARTIFACTS_DIR_NAME}"
else
PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh \
"${{ matrix.model }}" \
"${BUILD_MODE}" \
"${{ matrix.config }}" \
"${ARTIFACTS_DIR_NAME}"
fi
echo "::endgroup::"
build-benchmark-app:
name: build-benchmark-app
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
needs: set-parameters
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-clang12-android
submodules: 'true'
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
timeout: 90
upload-artifact: android-apps
upload-artifact-to-s3: true
script: |
set -eux
# The generic Linux job chooses to use base env, not the one setup by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake
export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
export ANDROID_ABIS="arm64-v8a"
PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME}
# Let's see how expensive this job is, we might want to tone it down by running it periodically
benchmark-on-device:
if: always()
permissions:
id-token: write
contents: read
uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main
needs:
- set-parameters
- prepare-test-specs
- build-benchmark-app
- export-models
strategy:
matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
fail-fast: false
with:
# Due to scheduling a job may be pushed beyond the default 60m threshold
timeout: 120
device-type: android
runner: linux.2xlarge
test-infra-ref: ''
# This is the ARN of ExecuTorch project on AWS
project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6
device-pool-arn: ${{ matrix.device_arn }}
android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug.apk
android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug-androidTest.apk
test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/android-llm-device-farm-test-spec.yml
upload-benchmark-results:
needs:
- benchmark-on-device
if: always()
runs-on: linux.2xlarge
environment: upload-benchmark-results
permissions:
id-token: write
contents: read
steps:
- uses: actions/checkout@v3
with:
submodules: false
- name: Authenticate with AWS
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
# The max duration enforced by the server side
role-duration-seconds: 18000
aws-region: us-east-1
- name: Setup conda
uses: pytorch/test-infra/.github/actions/setup-miniconda@main
with:
python-version: '3.10'
- name: Download the list of artifacts from S3
env:
ARTIFACTS_S3_DIR: s3://gha-artifacts/device_farm/${{ github.run_id }}/${{ github.run_attempt }}/artifacts/
shell: bash
run: |
set -eux
${CONDA_RUN} python -mpip install awscli==1.32.18
mkdir -p artifacts
pushd artifacts
${CONDA_RUN} aws s3 sync "${ARTIFACTS_S3_DIR}" .
popd
ls -lah artifacts
- name: Extract the benchmark results JSON
shell: bash
run: |
set -eux
mkdir -p benchmark-results
for ARTIFACTS_BY_JOB in artifacts/*.json; do
[ -f "${ARTIFACTS_BY_JOB}" ] || break
echo "${ARTIFACTS_BY_JOB}"
${CONDA_RUN} python .github/scripts/extract_benchmark_results.py \
--artifacts "${ARTIFACTS_BY_JOB}" \
--output-dir benchmark-results \
--repo ${{ github.repository }} \
--head-branch ${{ github.head_ref || github.ref_name }} \
--workflow-name "${{ github.workflow }}" \
--workflow-run-id ${{ github.run_id }} \
--workflow-run-attempt ${{ github.run_attempt }}
done
for SCHEMA in v2 v3; do
for BENCHMARK_RESULTS in benchmark-results/"${SCHEMA}"/*.json; do
cat "${BENCHMARK_RESULTS}"
echo
done
done
# TODO (huydhn): Remove v2 schema once the benchmark dashboard finishes the migration
- name: Upload the benchmark results (v2)
uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
with:
benchmark-results-dir: benchmark-results/v2
dry-run: false
schema-version: v2
- name: Upload the benchmark results (v3)
uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
with:
benchmark-results-dir: benchmark-results/v3
dry-run: false
schema-version: v3
github-token: ${{ secrets.GITHUB_TOKEN }}