diff --git a/.ci/scripts/gather_benchmark_configs.py b/.ci/scripts/gather_benchmark_configs.py index 75cce6a67c2..ae7b78ecbb7 100755 --- a/.ci/scripts/gather_benchmark_configs.py +++ b/.ci/scripts/gather_benchmark_configs.py @@ -32,7 +32,8 @@ BENCHMARK_CONFIGS = { "xplat": [ "xnnpack_q8", - "hf_xnnpack_fp32", + "hf_xnnpack_custom_spda_kv_cache_8da4w", + "et_xnnpack_custom_spda_kv_cache_8da4w", "llama3_fb16", "llama3_spinquant", "llama3_qlora", @@ -129,8 +130,9 @@ def generate_compatible_configs(model_name: str, target_os=None) -> List[str]: """ configs = [] if is_valid_huggingface_model_id(model_name): + configs.append("hf_xnnpack_custom_spda_kv_cache_8da4w") if model_name.startswith("meta-llama/"): - # LLaMA models + # etLLM recipes for Llama repo_name = model_name.split("meta-llama/")[1] if "qlora" in repo_name.lower(): configs.append("llama3_qlora") @@ -138,6 +140,7 @@ def generate_compatible_configs(model_name: str, target_os=None) -> List[str]: configs.append("llama3_spinquant") else: configs.append("llama3_fb16") + configs.append("et_xnnpack_custom_spda_kv_cache_8da4w") configs.extend( [ config @@ -145,9 +148,8 @@ def generate_compatible_configs(model_name: str, target_os=None) -> List[str]: if config.startswith("llama") ] ) - else: - # Non-LLaMA models - configs.append("hf_xnnpack_fp32") + if model_name.startswith("Qwen/Qwen3"): + configs.append("et_xnnpack_custom_spda_kv_cache_8da4w") elif model_name in MODEL_NAME_TO_MODEL: # ExecuTorch in-tree non-GenAI models configs.append("xnnpack_q8") diff --git a/.github/workflows/android-perf-private-device-experiment.yml b/.github/workflows/android-perf-private-device-experiment.yml index c8a14c4ff05..1a707a29224 100644 --- a/.github/workflows/android-perf-private-device-experiment.yml +++ b/.github/workflows/android-perf-private-device-experiment.yml @@ -18,7 +18,7 @@ on: description: Models to be benchmarked required: false type: string - default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8 + default: google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf devices: description: Target devices to run benchmark required: false @@ -34,7 +34,7 @@ on: description: Models to be benchmarked required: false type: string - default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8 + default: google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf devices: description: Target devices to run benchmark required: false @@ -57,6 +57,6 @@ jobs: id-token: write contents: read with: - models: ${{ inputs.models || 'mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' }} + models: ${{ inputs.models || 'Qwen/Qwen3-0.6B' }} devices: samsung_galaxy_s22_private benchmark_configs: ${{ inputs.benchmark_configs }} diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index e2f85e05d3a..34744268ff5 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -70,7 +70,7 @@ jobs: # Separate default values from the workflow dispatch. To ensure defaults are accessible # during scheduled runs and to provide flexibility for different defaults between # on-demand and periodic benchmarking. 
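For reference, a minimal Python sketch (illustrative only, not part of the patch) of the per-model config selection that the gather_benchmark_configs.py hunk above introduces; the target_os filtering of the llama3_* configs is omitted for brevity, and the helper name below is an assumption:
# Illustrative sketch only; the real logic lives in generate_compatible_configs().
from typing import List

def select_hf_llm_configs(model_name: str) -> List[str]:
    configs: List[str] = []
    # Every valid Hugging Face repo id now gets the optimum-executorch recipe.
    configs.append("hf_xnnpack_custom_spda_kv_cache_8da4w")
    if model_name.startswith("meta-llama/"):
        # etLLM recipes for Llama
        repo_name = model_name.split("meta-llama/")[1].lower()
        if "qlora" in repo_name:
            configs.append("llama3_qlora")
        elif "spinquant" in repo_name:
            configs.append("llama3_spinquant")
        else:
            configs.append("llama3_fb16")
            configs.append("et_xnnpack_custom_spda_kv_cache_8da4w")
    if model_name.startswith("Qwen/Qwen3"):
        configs.append("et_xnnpack_custom_spda_kv_cache_8da4w")
    return configs

# e.g. select_hf_llm_configs("Qwen/Qwen3-0.6B")
#   -> ["hf_xnnpack_custom_spda_kv_cache_8da4w", "et_xnnpack_custom_spda_kv_cache_8da4w"]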
- CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' || 'llama' }} + CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf' || 'llama' }} CRON_DEFAULT_DEVICES: samsung_galaxy_s22 run: | set -eux @@ -201,8 +201,8 @@ jobs: HF_MODEL_REPO=${{ matrix.model }} OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}" + # Convert HF checkpoint to ET via etLLM path if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then - # Llama models on Hugging Face if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then # SpinQuant # Download prequantized chceckpoint from Hugging Face @@ -272,6 +272,21 @@ jobs: --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ --output_name="${OUT_ET_MODEL_NAME}.pte" ls -lh "${OUT_ET_MODEL_NAME}.pte" + elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then + DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") + python -m examples.models.llama.export_llama \ + --model llama3_2 \ + --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ + --params "${DOWNLOADED_PATH}/params.json" \ + -kv \ + --use_sdpa_with_kv_cache \ + -d fp32 \ + -X \ + --xnnpack-extended-ops \ + -qmode 8da4w -G 32 -E 8,0 \ + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ + --output_name="${OUT_ET_MODEL_NAME}.pte" + ls -lh "${OUT_ET_MODEL_NAME}.pte" elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/ @@ -292,21 +307,75 @@ jobs: OUT_ET_MODEL_NAME="llama3_2_qnn" # Qualcomm hard-coded it in their script find . -name "${OUT_ET_MODEL_NAME}.pte" -not -path "./${OUT_ET_MODEL_NAME}.pte" -exec mv {} ./ \; ls -lh "${OUT_ET_MODEL_NAME}.pte" - else - # By default, test with the Hugging Face model and the xnnpack recipe - DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model") - python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME" - ls -lh "${OUT_ET_MODEL_NAME}.pte" fi - else - echo "Unsupported model ${{ matrix.model }}" - exit 1 + elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then + if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then + DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." 
--files "tokenizer.json") + python -m examples.models.llama.export_llama \ + --model qwen3-0_6b \ + --params examples/models/qwen3/0_6b_config.json \ + -kv \ + --use_sdpa_with_kv_cache \ + -d fp32 \ + -X \ + --xnnpack-extended-ops \ + -qmode 8da4w \ + -G 32 \ + -E 8,0 \ + --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \ + --output_name="${OUT_ET_MODEL_NAME}.pte" + ls -lh "${OUT_ET_MODEL_NAME}.pte" + fi + fi + + if [[ ${{ matrix.config }} == "hf_xnnpack_custom_spda_kv_cache_8da4w" ]]; then + DOWNLOADED_PATH=$( + bash .ci/scripts/download_hf_hub.sh \ + --model_id "${HF_MODEL_REPO}" \ + --files "tokenizer.json" + ) + echo "tokenizer.json is downloaded to $DOWNLOADED_PATH" + + # Install optimum-executorch + git clone https://github.com/huggingface/optimum-executorch + pushd optimum-executorch + # There is no release yet, for CI stability, always test from the same commit on main + git checkout 1c653dc49812fc431a22312c7295d97005d22e12 + python install_dev.py + pip list + + ARGS=( + "--model" "${HF_MODEL_REPO}" + "--task" "text-generation" + "--recipe" "xnnpack" + "--use_custom_sdpa" + "--qlinear" + "--qembedding" + "--output_dir" ".." + ) + + # Add conditional arguments based on model + case "${HF_MODEL_REPO}" in + *"google/gemma-3-1b-it"*) + echo "--use_custom_kv_cache can not be used for HybridCache" + ;; + *) + ARGS+=("--use_custom_kv_cache") + ;; + esac + + optimum-cli export executorch "${ARGS[@]}" + popd + + mv model.pte ${OUT_ET_MODEL_NAME}.pte + ls -lh "${OUT_ET_MODEL_NAME}.pte" fi - zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model" + zip -j model.zip ${OUT_ET_MODEL_NAME}.pte ${DOWNLOADED_PATH}/tokenizer.* ls -lh model.zip - mkdir -p "${ARTIFACTS_DIR_NAME}" - mv model.zip "${ARTIFACTS_DIR_NAME}" + mkdir -p ${ARTIFACTS_DIR_NAME} + mv model.zip ${ARTIFACTS_DIR_NAME} + ls -lh ${ARTIFACTS_DIR_NAME} elif [[ ${{ matrix.model }} == "llama" ]]; then # Install requirements for export_llama PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh diff --git a/.github/workflows/apple-perf-private-device-experiment.yml b/.github/workflows/apple-perf-private-device-experiment.yml index 4bb6e8f15ec..42c395519ac 100644 --- a/.github/workflows/apple-perf-private-device-experiment.yml +++ b/.github/workflows/apple-perf-private-device-experiment.yml @@ -18,7 +18,7 @@ on: description: Models to be benchmarked required: false type: string - default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8 + default: google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf devices: description: Target devices to run benchmark required: false @@ -34,7 +34,7 @@ on: description: Models to be benchmarked required: false type: string - default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8 + default: Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf devices: description: Target devices to run benchmark required: false @@ -57,6 +57,6 @@ jobs: id-token: write contents: read with: - models: ${{ inputs.models || 'mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' }} + models: ${{ inputs.models || 'Qwen/Qwen3-0.6B' }} devices: apple_iphone_15_private benchmark_configs: ${{ inputs.benchmark_configs }} diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index 846dc576f43..e2f2cc2fcc3 
100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -70,7 +70,7 @@ jobs: # Separate default values from the workflow dispatch. To ensure defaults are accessible # during scheduled runs and to provide flexibility for different defaults between # on-demand and periodic benchmarking. - CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' || 'llama' }} + CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'llama' }} CRON_DEFAULT_DEVICES: apple_iphone_15 run: | set -eux @@ -207,7 +207,10 @@ jobs: HF_MODEL_REPO=${{ matrix.model }} OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}" + # Convert HF checkpoint to ET via etLLM path if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then + # The benchmark app relies on the _llm suffix to determine whether the model is an LLM or not + OUT_ET_MODEL_NAME=${OUT_ET_MODEL_NAME}_llm # Llama models on Hugging Face if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then # SpinQuant @@ -278,6 +281,21 @@ --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ --output_name="${OUT_ET_MODEL_NAME}.pte" ls -lh "${OUT_ET_MODEL_NAME}.pte" + elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then + DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") + ${CONDA_RUN} python -m examples.models.llama.export_llama \ + --model llama3_2 \ + --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ + --params "${DOWNLOADED_PATH}/params.json" \ + -kv \ + --use_sdpa_with_kv_cache \ + -d fp32 \ + -X \ + --xnnpack-extended-ops \ + -qmode 8da4w -G 32 -E 8,0 \ + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ + --output_name="${OUT_ET_MODEL_NAME}.pte" + ls -lh "${OUT_ET_MODEL_NAME}.pte" elif [[ ${{ matrix.config }} == "llama3_coreml_ane" ]]; then # ANE DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") @@ -293,18 +311,74 @@ --coreml-compute-units cpu_and_ne \ --output_name="${OUT_ET_MODEL_NAME}.pte" ls -lh "${OUT_ET_MODEL_NAME}.pte" - else - # By default, test with the Hugging Face model and the xnnpack recipe - DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model") - ${CONDA_RUN} python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME" - ls -lh "${OUT_ET_MODEL_NAME}.pte" fi - else - echo "Unsupported model ${{ matrix.model }}" - exit 1 + elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then + OUT_ET_MODEL_NAME=${OUT_ET_MODEL_NAME}_llm + if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then + DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." 
--files "tokenizer.json") + ${CONDA_RUN} python -m examples.models.llama.export_llama \ + --model qwen3-0_6b \ + --params examples/models/qwen3/0_6b_config.json \ + -kv \ + --use_sdpa_with_kv_cache \ + -d fp32 \ + -X \ + --xnnpack-extended-ops \ + -qmode 8da4w \ + -G 32 \ + -E 8,0 \ + --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \ + --output_name="${OUT_ET_MODEL_NAME}.pte" + ls -lh "${OUT_ET_MODEL_NAME}.pte" + fi + fi + + if [[ ${{ matrix.config }} == "hf_xnnpack_custom_spda_kv_cache_8da4w" ]]; then + DOWNLOADED_PATH=$( + bash .ci/scripts/download_hf_hub.sh \ + --model_id "${HF_MODEL_REPO}" \ + --files "tokenizer.json" + ) + echo "tokenizer.json is downloaded to $DOWNLOADED_PATH" + + # Install optimum-executorch + git clone https://github.com/huggingface/optimum-executorch + pushd optimum-executorch + # There is no release yet, for CI stability, always test from the same commit on main + git checkout 1c653dc49812fc431a22312c7295d97005d22e12 + ${CONDA_RUN} python install_dev.py + pip list + + ARGS=( + "--model" "${HF_MODEL_REPO}" + "--task" "text-generation" + "--recipe" "xnnpack" + "--use_custom_sdpa" + "--qlinear" + "--qembedding" + "--output_dir" ".." + ) + + # Add conditional arguments based on model + case "${HF_MODEL_REPO}" in + *"google/gemma-3-1b-it"*) + echo "--use_custom_kv_cache can not be used for HybridCache" + ;; + *) + ARGS+=("--use_custom_kv_cache") + ;; + esac + + ${CONDA_RUN} optimum-cli export executorch "${ARGS[@]}" + popd + + # The benchmark app replies on the _llm suffix to determine whether the model is a LLM or not + OUT_ET_MODEL_NAME=${OUT_ET_MODEL_NAME}_llm + mv model.pte ${OUT_ET_MODEL_NAME}.pte + ls -lh "${OUT_ET_MODEL_NAME}.pte" fi - zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model" + zip -j model.zip ${OUT_ET_MODEL_NAME}.pte ${DOWNLOADED_PATH}/tokenizer.* ls -lh model.zip mkdir -p "${ARTIFACTS_DIR_NAME}" mv model.zip "${ARTIFACTS_DIR_NAME}" diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 3f3fc3918fb..153c0ce2687 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -571,34 +571,40 @@ jobs: git clone https://github.com/huggingface/optimum-executorch pushd optimum-executorch # There is no release yet, for CI stability, always test from the same commit on main - git checkout da80c9e35b3db5c7eea8731b7d660482fb4870a8 + git checkout 1c653dc49812fc431a22312c7295d97005d22e12 pip install .[tests] + pip install transformers==4.52.4 popd - - if [ "${{ matrix.hf_model_id }}" == "google/gemma-3-1b-it" ]; then - # Fixes for gemma-3 is not available in the released version - git clone https://github.com/huggingface/transformers.git - pushd transformers - git checkout a57274466f7f72efaa2662d1738cdaf28ae8071f - pip install -e . 
- popd - fi pip list echo "::endgroup::" echo "::group::Export to ExecuTorch" # Pass matrix variable as environment variable export MODEL_ID="${{ matrix.hf_model_id }}" - export OUTPUT_DIR="$(pwd)/${MODEL_ID}_custom_sdpa_8da4w" + export OUTPUT_DIR="$(pwd)/${MODEL_ID}_custom_sdpa_kv_cache_8da4w" pushd optimum-executorch - optimum-cli export executorch \ - --model ${MODEL_ID} \ - --task text-generation \ - --recipe xnnpack \ - --use_custom_sdpa \ - --output_dir ${OUTPUT_DIR} \ - --qlinear + ARGS=( + "--model" "${MODEL_ID}" + "--task" "text-generation" + "--recipe" "xnnpack" + "--use_custom_sdpa" + "--qlinear" + "--qembedding" + "--output_dir" "${OUTPUT_DIR}" + ) + + # Add conditional arguments based on model + case "${MODEL_ID}" in + *"google/gemma-3-1b-it"*) + echo "--use_custom_kv_cache can not be used for HybridCache" + ;; + *) + ARGS+=("--use_custom_kv_cache") + ;; + esac + + optimum-cli export executorch "${ARGS[@]}" ls -FlAGhp ${OUTPUT_DIR} popd @@ -701,18 +707,18 @@ jobs: timeout: 90 script: | set -eux - + # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" - + # Build and install Executorch PYTHON_EXECUTABLE=python \ CMAKE_ARGS="-DEXECUTORCH_BUILD_NXP_NEUTRON=ON" \ .ci/scripts/setup-linux.sh --build-tool "cmake" - + # Install test requirements pip install -r backends/nxp/requirements-tests.txt - + # Run pytest PYTHON_EXECUTABLE=python bash backends/nxp/run_unittests.sh diff --git a/extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2 b/extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2 index af726b36c82..7f2ba17abae 100644 --- a/extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2 +++ b/extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2 @@ -20,6 +20,7 @@ phases: - adb -s $DEVICEFARM_DEVICE_UDID push *.bin /sdcard > /dev/null && echo OK - adb -s $DEVICEFARM_DEVICE_UDID push *.model /sdcard > /dev/null && echo OK - adb -s $DEVICEFARM_DEVICE_UDID push *.pte /sdcard > /dev/null && echo OK + - adb -s $DEVICEFARM_DEVICE_UDID push *.json /sdcard > /dev/null && echo OK # Prepare the model and the tokenizer - adb -s $DEVICEFARM_DEVICE_UDID shell "ls -la /sdcard/" @@ -27,9 +28,11 @@ phases: - adb -s $DEVICEFARM_DEVICE_UDID shell "mv /sdcard/*.bin /data/local/tmp/minibench/" - adb -s $DEVICEFARM_DEVICE_UDID shell "mv /sdcard/*.model /data/local/tmp/minibench/" - adb -s $DEVICEFARM_DEVICE_UDID shell "mv /sdcard/*.pte /data/local/tmp/minibench/" + - adb -s $DEVICEFARM_DEVICE_UDID shell "mv /sdcard/*.json /data/local/tmp/minibench/" - adb -s $DEVICEFARM_DEVICE_UDID shell "chmod 664 /data/local/tmp/minibench/*.bin" - adb -s $DEVICEFARM_DEVICE_UDID shell "chmod 664 /data/local/tmp/minibench/*.model" - adb -s $DEVICEFARM_DEVICE_UDID shell "chmod 664 /data/local/tmp/minibench/*.pte" + - adb -s $DEVICEFARM_DEVICE_UDID shell "chmod 664 /data/local/tmp/minibench/*.json" - adb -s $DEVICEFARM_DEVICE_UDID shell "ls -la /data/local/tmp/minibench/" - adb -s $DEVICEFARM_DEVICE_UDID shell "run-as org.pytorch.minibench rm -rf files" @@ -105,6 +108,13 @@ phases: echo "*.model tokenizer files found in /data/local/tmp/minibench/" fi + JSON_FOUND="$(adb -s $DEVICEFARM_DEVICE_UDID shell find /data/local/tmp/minibench/ -name '*.json')" + if [ -z "$JSON_FOUND" ]; then + echo "No *.json tokenizer files found in /data/local/tmp/minibench/" + else + echo "*.json tokenizer files found in 
/data/local/tmp/minibench/" + fi + - echo "Collect device state before running" - | adb -s $DEVICEFARM_DEVICE_UDID shell 'cat /sys/devices/system/cpu/cpu*/cpufreq/stats/time_in_state /sys/devices/system/cpu/cpu*/cpufreq/stats/trans_table' > $DEVICEFARM_LOG_DIR/state_before.txt @@ -128,11 +138,16 @@ phases: --es "model_dir" "/data/local/tmp/minibench" \ --es "tokenizer_path" "/data/local/tmp/minibench/tokenizer.model" \ --ei "num_iter" 5 --ei "num_warm_up_iter" 2 + elif [ -n "$JSON_FOUND" ]; then + adb -s $DEVICEFARM_DEVICE_UDID shell am start -W -n org.pytorch.minibench/.BenchmarkActivity \ + --es "model_dir" "/data/local/tmp/minibench" \ + --es "tokenizer_path" "/data/local/tmp/minibench/tokenizer.json" \ + --ei "num_iter" 5 --ei "num_warm_up_iter" 2 else adb -s $DEVICEFARM_DEVICE_UDID shell am start -W -n org.pytorch.minibench/.BenchmarkActivity \ --es "model_dir" "/data/local/tmp/minibench" fi - + - echo "Collect device state after running" - | adb -s $DEVICEFARM_DEVICE_UDID shell 'cat /sys/devices/system/cpu/cpu*/cpufreq/stats/time_in_state /sys/devices/system/cpu/cpu*/cpufreq/stats/trans_table' > $DEVICEFARM_LOG_DIR/state_after.txt diff --git a/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj b/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj index f6fe811b4ab..47a7af09dbd 100644 --- a/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj +++ b/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj @@ -33,6 +33,7 @@ 30AA4B642DC0766800B1BE50 /* std_regex.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 30AA4B5E2DC0766800B1BE50 /* std_regex.cpp */; }; 30AA4B652DC0766800B1BE50 /* pre_tokenizer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 30AA4B5B2DC0766800B1BE50 /* pre_tokenizer.cpp */; }; 30AA4B662DC0766800B1BE50 /* re2_regex.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 30AA4B5C2DC0766800B1BE50 /* re2_regex.cpp */; }; + 3C6ABD332DFA27DE0015DE55 /* regex_lookahead.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3C6ABD322DFA27DE0015DE55 /* regex_lookahead.cpp */; }; F22E9E1A2DF2CBB900EC5425 /* text_llm_runner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F22E9E192DF2CBB900EC5425 /* text_llm_runner.cpp */; }; F292B01D2D88AF3500BE6839 /* bpe_tokenizer_base.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F292B0162D88AF3500BE6839 /* bpe_tokenizer_base.cpp */; }; F292B0202D88AF3500BE6839 /* llama2c_tokenizer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F292B0172D88AF3500BE6839 /* llama2c_tokenizer.cpp */; }; @@ -95,6 +96,7 @@ 30AA4B5D2DC0766800B1BE50 /* regex.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = regex.cpp; path = src/regex.cpp; sourceTree = ""; }; 30AA4B5E2DC0766800B1BE50 /* std_regex.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = std_regex.cpp; path = src/std_regex.cpp; sourceTree = ""; }; 30AA4B5F2DC0766800B1BE50 /* token_decoder.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = token_decoder.cpp; path = src/token_decoder.cpp; sourceTree = ""; }; + 3C6ABD322DFA27DE0015DE55 /* regex_lookahead.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = regex_lookahead.cpp; path = src/regex_lookahead.cpp; sourceTree = ""; }; F22E9E182DF2CBB900EC5425 /* text_llm_runner.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = text_llm_runner.h; sourceTree = ""; }; F22E9E192DF2CBB900EC5425 /* text_llm_runner.cpp */ = {isa = PBXFileReference; 
lastKnownFileType = sourcecode.cpp.cpp; path = text_llm_runner.cpp; sourceTree = ""; }; F292B0162D88AF3500BE6839 /* bpe_tokenizer_base.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = bpe_tokenizer_base.cpp; path = src/bpe_tokenizer_base.cpp; sourceTree = ""; }; @@ -181,6 +183,7 @@ 032A74022CAFBB7800932D36 /* tokenizers */ = { isa = PBXGroup; children = ( + 3C6ABD322DFA27DE0015DE55 /* regex_lookahead.cpp */, 30AA4B592DC0766800B1BE50 /* hf_tokenizer.cpp */, 30AA4B5A2DC0766800B1BE50 /* pcre2_regex.cpp */, 30AA4B5B2DC0766800B1BE50 /* pre_tokenizer.cpp */, @@ -397,7 +400,7 @@ ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; - shellScript = "set -e\n\nif ! command -v cmake &> /dev/null\nthen\n echo \"Cmake not found, please install Cmake. \\n1. Download Cmake.app from https://cmake.org/download with version > 3.19. \\n2. Install it to Applications/ folder and run sudo /Applications/CMake.app/Contents/bin/cmake-gui --install to install CMake commandline tools.\"\n exit 1\nfi\n\nCMAKE_DIR=\"$TEMP_DIR/cmake\"\nrm -rf \"$CMAKE_DIR\"\n\nPLATFORM=\"SIMULATORARM64\"\nDEPLOYMENT_TARGET=\"17.0\"\n\nif [[ \"$PLATFORM_NAME\" == *\"iphoneos\"* ]]; then\n PLATFORM=\"OS64\"\nelif [[ \"$PLATFORM_NAME\" == *\"macos\"* ]]; then\n PLATFORM=\"MAC_ARM64\"\n DEPLOYMENT_TARGET=\"12.0\"\nfi\n\ncmake_build() {\n local src_dir=$1\n local target=$2\n shift 2\n local extra_args=(\"$@\")\n local build_dir=\"$CMAKE_DIR/build/$(basename \"$src_dir\")\"\n\n mkdir -p \"$build_dir\" && cd \"$build_dir\"\n\n if [[ \"$PLATFORM\" == \"MAC_ARM64\" ]]; then\n extra_args+=(-DCMAKE_INSTALL_BUNDLEDIR=\"${CMAKE_DIR}/bin\")\n extra_args+=(-DCMAKE_MACOSX_BUNDLE=OFF)\n fi\n cmake -G Xcode \\\n -DCMAKE_BUILD_TYPE=\"Release\" \\\n -DCMAKE_CXX_STANDARD=17 \\\n -DCMAKE_TOOLCHAIN_FILE=\"$SRCROOT/../../../../third-party/ios-cmake/ios.toolchain.cmake\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD=\"c++17\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY=\"libc++\" \\\n -DPLATFORM=\"$PLATFORM\" \\\n -DDEPLOYMENT_TARGET=\"$DEPLOYMENT_TARGET\" \\\n -DCMAKE_INSTALL_PREFIX=\"$CMAKE_DIR\" \\\n \"${extra_args[@]}\" \\\n \"$src_dir\"\n cmake --build . --config \"Release\" --target \"$target\"\n if [[ \"$target\" == \"install\" ]]; then\n cmake --install . --prefix \"$CMAKE_DIR\"\n fi\n}\n\ncmake_build \"$SRCROOT/../../../llm/tokenizers/third-party/abseil-cpp\" \"install\" \\\n -DABSL_PROPAGATE_CXX_STD=ON\n\ncmake_build \"$SRCROOT/../../../llm/tokenizers/third-party/re2\" \"install\"\n\ncmake_build \"$SRCROOT/../../../llm/tokenizers/third-party/pcre2\" \"install\" \\\n -DPCRE2_BUILD_PCRE2_8=ON \\\n -DPCRE2_BUILD_PCRE2_16=OFF \\\n -DPCRE2_BUILD_PCRE2_32=OFF \\\n -DPCRE2_BUILD_TESTS=OFF \\\n -DPCRE2_BUILD_PCRE2GREP=OFF \\\n -DPCRE2_BUILD_PCRE2TEST=OFF \\\n -DPCRE2_BUILD_PCRE2GPERF=OFF \\\n -DPCRE2_BUILD_DOCS=OFF \\\n -DPCRE2_BUILD_LIBPCRE2_PDB=OFF\n \ncmake_build \"$SRCROOT/../../../llm/tokenizers/third-party/sentencepiece\" \"sentencepiece-static\" \\\n -DSPM_ENABLE_SHARED=OFF\n \ncmake_build \"$SRCROOT/../../../llm/tokenizers/third-party/llama.cpp-unicode\" \"install\"\n \n# Include the single header for json.\nmkdir -p \"$CMAKE_DIR/include/nlohmann\"\ncp \"$SRCROOT/../../../llm/tokenizers/third-party/json/single_include/nlohmann/json.hpp\" \"$CMAKE_DIR/include/nlohmann/json.hpp\"\n\necho \"$(find $CMAKE_DIR/lib -name \"*.a\" | sed -E 's|^.*/lib([^/]+)\\.a|-l\\1|g' | tr '\\n' ' ')\" > \"$CMAKE_DIR/linker_flags\"\n"; + shellScript = "set -e\n\nif ! 
command -v cmake &> /dev/null\nthen\n echo \"Cmake not found, please install Cmake. \\n1. Download Cmake.app from https://cmake.org/download with version > 3.19. \\n2. Install it to Applications/ folder and run sudo /Applications/CMake.app/Contents/bin/cmake-gui --install to install CMake commandline tools.\"\n exit 1\nfi\n\nCMAKE_DIR=\"$TEMP_DIR/cmake\"\nrm -rf \"$CMAKE_DIR\"\n\nPLATFORM=\"SIMULATORARM64\"\nDEPLOYMENT_TARGET=\"17.0\"\n\nif [[ \"$PLATFORM_NAME\" == *\"iphoneos\"* ]]; then\n PLATFORM=\"OS64\"\nelif [[ \"$PLATFORM_NAME\" == *\"macos\"* ]]; then\n PLATFORM=\"MAC_ARM64\"\n DEPLOYMENT_TARGET=\"12.0\"\nfi\n\ncmake_build() {\n local src_dir=$1\n local target=$2\n shift 2\n local extra_args=(\"$@\")\n local build_dir=\"$CMAKE_DIR/build/$(basename \"$src_dir\")\"\n\n mkdir -p \"$build_dir\" && cd \"$build_dir\"\n\n if [[ \"$PLATFORM\" == \"MAC_ARM64\" ]]; then\n extra_args+=(-DCMAKE_INSTALL_BUNDLEDIR=\"${CMAKE_DIR}/bin\")\n extra_args+=(-DCMAKE_MACOSX_BUNDLE=OFF)\n fi\n cmake -G Xcode \\\n -DCMAKE_BUILD_TYPE=\"Release\" \\\n -DCMAKE_CXX_STANDARD=17 \\\n -DCMAKE_TOOLCHAIN_FILE=\"$SRCROOT/../../../../third-party/ios-cmake/ios.toolchain.cmake\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD=\"c++17\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY=\"libc++\" \\\n -DPLATFORM=\"$PLATFORM\" \\\n -DDEPLOYMENT_TARGET=\"$DEPLOYMENT_TARGET\" \\\n -DCMAKE_INSTALL_PREFIX=\"$CMAKE_DIR\" \\\n \"${extra_args[@]}\" \\\n \"$src_dir\"\n cmake --build . --config \"Release\" --target \"$target\"\n if [[ \"$target\" == \"install\" ]]; then\n cmake --install . --prefix \"$CMAKE_DIR\"\n fi\n}\n\ncmake_build \"$SRCROOT/../../../llm/tokenizers/third-party/abseil-cpp\" \"install\" \\\n -DABSL_PROPAGATE_CXX_STD=ON\n\ncmake_build \"$SRCROOT/../../../llm/tokenizers/third-party/re2\" \"install\"\n\ncmake_build \"$SRCROOT/../../../llm/tokenizers/third-party/pcre2\" \"install\" \\\n -DPCRE2_BUILD_PCRE2_8=ON \\\n -DPCRE2_BUILD_PCRE2_16=OFF \\\n -DPCRE2_BUILD_PCRE2_32=OFF \\\n -DPCRE2_BUILD_TESTS=OFF \\\n -DPCRE2_BUILD_PCRE2GREP=OFF \\\n -DPCRE2_BUILD_PCRE2TEST=OFF \\\n -DPCRE2_BUILD_PCRE2GPERF=OFF \\\n -DPCRE2_BUILD_DOCS=OFF \\\n -DPCRE2_BUILD_LIBPCRE2_PDB=OFF \\\n -DSUPPORT_REGEX_LOOKAHEAD=ON\n \ncmake_build \"$SRCROOT/../../../llm/tokenizers/third-party/sentencepiece\" \"sentencepiece-static\" \\\n -DSPM_ENABLE_SHARED=OFF\n \ncmake_build \"$SRCROOT/../../../llm/tokenizers/third-party/llama.cpp-unicode\" \"install\"\n \n# Include the single header for json.\nmkdir -p \"$CMAKE_DIR/include/nlohmann\"\ncp \"$SRCROOT/../../../llm/tokenizers/third-party/json/single_include/nlohmann/json.hpp\" \"$CMAKE_DIR/include/nlohmann/json.hpp\"\n\necho \"$(find $CMAKE_DIR/lib -name \"*.a\" | sed -E 's|^.*/lib([^/]+)\\.a|-l\\1|g' | tr '\\n' ' ')\" > \"$CMAKE_DIR/linker_flags\"\n"; }; /* End PBXShellScriptBuildPhase section */ @@ -434,6 +437,7 @@ 30AA4B652DC0766800B1BE50 /* pre_tokenizer.cpp in Sources */, 30AA4B662DC0766800B1BE50 /* re2_regex.cpp in Sources */, 032A73CA2CAFBA8600932D36 /* LLaMATests.mm in Sources */, + 3C6ABD332DFA27DE0015DE55 /* regex_lookahead.cpp in Sources */, 032A74262CAFC34800932D36 /* llama_tiktoken.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; diff --git a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm index c56f054ae3b..66f2e025749 100644 --- a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm +++ b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm @@ -60,7 +60,7 @@ 
@implementation LLaMATests + (NSDictionary *)predicates { return @{ @"model" : ^BOOL(NSString *filename){ - return [filename hasSuffix:@".pte"] && [filename.lowercaseString containsString:@"llama"]; + return [filename hasSuffix:@".pte"] && [filename.lowercaseString containsString:@"llm"]; }, @"tokenizer" : ^BOOL(NSString *filename) { return [filename isEqual:@"tokenizer.bin"] || [filename isEqual:@"tokenizer.model"] || [filename isEqual:@"tokenizer.json"];
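To tie the pieces together, a minimal Python sketch (illustrative only, not part of the patch; the helper names below are assumptions) of how the _llm suffix appended in apple-perf.yml is expected to satisfy the updated filename predicate in LLaMATests.mm:
# Illustrative sketch only; mirrors the shell naming pipeline in apple-perf.yml
# and the Objective-C model predicate in LLaMATests.mm.
def out_et_model_name(hf_model_repo: str, config: str, is_llm: bool) -> str:
    # Mirrors: echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]'
    base = hf_model_repo.split("/")[1].replace("_", "-").lower()
    name = f"{base}_{config}"
    # The benchmark app relies on the _llm suffix to pick the LLM benchmark path.
    return f"{name}_llm" if is_llm else name

def matches_llm_model_predicate(filename: str) -> bool:
    # Mirrors the updated predicate: a .pte file whose name contains "llm".
    return filename.endswith(".pte") and "llm" in filename.lower()

name = out_et_model_name("Qwen/Qwen3-0.6B", "et_xnnpack_custom_spda_kv_cache_8da4w", is_llm=True)
assert name == "qwen3-0.6b_et_xnnpack_custom_spda_kv_cache_8da4w_llm"
assert matches_llm_model_predicate(name + ".pte")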