diff --git a/.ci/scripts/build_llama_android.sh b/.ci/scripts/build_llama_android.sh index 7d3370ee561..0afe51f0b0c 100644 --- a/.ci/scripts/build_llama_android.sh +++ b/.ci/scripts/build_llama_android.sh @@ -48,9 +48,9 @@ build_llama_runner() { -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -Bcmake-android-out/examples/models/llama2 examples/models/llama2 + -Bcmake-android-out/examples/models/llama examples/models/llama - cmake --build cmake-android-out/examples/models/llama2 -j4 --config Release + cmake --build cmake-android-out/examples/models/llama -j4 --config Release } install_flatc_from_source install_executorch_and_backend_lib diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index 94fd5d486b8..ed2a9c2558b 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -125,7 +125,7 @@ cmake_install_executorch_libraries() { cmake_build_llama_runner() { echo "Building llama runner" - dir="examples/models/llama2" + dir="examples/models/llama" retry cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Debug \ @@ -206,7 +206,7 @@ if [[ "${QNN}" == "ON" ]]; then EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape" fi # Add dynamically linked library location -$PYTHON_EXECUTABLE -m examples.models.llama2.export_llama ${EXPORT_ARGS} +$PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS} # Create tokenizer.bin. echo "Creating tokenizer.bin" @@ -219,7 +219,7 @@ echo "Running ${EXPORTED_MODEL_NAME} in portable mode" if [[ "${BUILD_TOOL}" == "buck2" ]]; then # Run model. # shellcheck source=/dev/null - $BUCK run examples/models/llama2:main -- ${RUNTIME_ARGS} > result.txt + $BUCK run examples/models/llama:main -- ${RUNTIME_ARGS} > result.txt elif [[ "${BUILD_TOOL}" == "cmake" ]]; then cmake_install_executorch_libraries cmake_build_llama_runner @@ -227,7 +227,7 @@ elif [[ "${BUILD_TOOL}" == "cmake" ]]; then NOW=$(date +"%H:%M:%S") echo "Starting to run llama runner at ${NOW}" # shellcheck source=/dev/null - cmake-out/examples/models/llama2/llama_main ${RUNTIME_ARGS} > result.txt + cmake-out/examples/models/llama/llama_main ${RUNTIME_ARGS} > result.txt NOW=$(date +"%H:%M:%S") echo "Finished at ${NOW}" else diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh index 3f95fbc0b6c..4e37d0ebaa3 100755 --- a/.ci/scripts/test_model.sh +++ b/.ci/scripts/test_model.sh @@ -75,9 +75,9 @@ run_portable_executor_runner() { test_model() { if [[ "${MODEL_NAME}" == "llama2" ]]; then # Install requirements for export_llama - bash examples/models/llama2/install_requirements.sh - # Test export_llama script: python3 -m examples.models.llama2.export_llama - "${PYTHON_EXECUTABLE}" -m examples.models.llama2.export_llama -c examples/models/llama2/params/demo_rand_params.pth -p examples/models/llama2/params/demo_config.json + bash examples/models/llama/install_requirements.sh + # Test export_llama script: python3 -m examples.models.llama.export_llama + "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama -c examples/models/llama/params/demo_rand_params.pth -p examples/models/llama/params/demo_config.json run_portable_executor_runner rm "./${MODEL_NAME}.pte" fi diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index 35d3568c418..c5f244a9349 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -160,7 +160,7 @@ jobs: if [[ ${{ matrix.model }} =~ ^stories* ]]; then # Install requirements 
for export_llama - PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh + PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh # Test llama2 if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then DELEGATE_CONFIG="xnnpack+custom+qe" diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index 7de73a23ff4..7de308b1a63 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -162,7 +162,7 @@ jobs: if [[ ${{ matrix.model }} =~ ^stories* ]]; then # Install requirements for export_llama PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - bash examples/models/llama2/install_requirements.sh + bash examples/models/llama/install_requirements.sh # Test llama2 if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 3bea2984184..bb66ba54c32 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -123,7 +123,7 @@ jobs: # Setup executorch PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh buck2 # Install requirements for export_llama - PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh + PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh # Test llama2 PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}" @@ -222,7 +222,7 @@ jobs: bash install_requirements.sh --pybind xnnpack # install Llava requirements - bash examples/models/llama2/install_requirements.sh + bash examples/models/llama/install_requirements.sh bash examples/models/llava/install_requirements.sh # run python unittest @@ -417,7 +417,7 @@ jobs: # Setup executorch PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh buck2 # Install requirements for export_llama - PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh + PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh # Test llama2 PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}" diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 42f29aea9d7..2d4bb8184b8 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -257,7 +257,7 @@ jobs: fi # Install requirements for export_llama - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama/install_requirements.sh # Test llama2 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh stories110M cmake "${DTYPE}" "${MODE}" @@ -281,7 +281,7 @@ jobs: # GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" # # install Llava requirements - # ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh + # ${CONDA_RUN} bash examples/models/llama/install_requirements.sh # ${CONDA_RUN} bash examples/models/llava/install_requirements.sh # # run python unittest @@ -387,7 +387,7 @@ jobs: cmake --build cmake-out -j9 --target install --config Release echo "Build llama runner" - dir="examples/models/llama2" + dir="examples/models/llama" cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ @@ -439,5 +439,5 @@ jobs: python -m extension.export_util.export_hf_model -hfm=${{ matrix.hf_model_repo }} -o ${ET_MODEL_NAME} - cmake-out/examples/models/llama2/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} 
--prompt="My name is" + cmake-out/examples/models/llama/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is" echo "::endgroup::" diff --git a/README.md b/README.md index e9ab0773a11..b27845e9f55 100644 --- a/README.md +++ b/README.md @@ -22,10 +22,10 @@ please visit our documentation website [for the latest release](https://pytorch. Check out the [Getting Started](https://pytorch.org/executorch/stable/getting-started-setup.html#quick-setup-colab-jupyter-notebook-prototype) page for a quick spin. -Check out the examples of [Llama](./examples/models/llama2/README.md), [Llava](./examples/models/llava/README.md) and [other models](./examples/README.md) running on edge devices using ExecuTorch. +Check out the examples of [Llama](./examples/models/llama/README.md), [Llava](./examples/models/llava/README.md) and [other models](./examples/README.md) running on edge devices using ExecuTorch. -**[UPDATE - 09/25]** We have added support for running [Llama 3.2 1B/3B](./examples/models/llama2/README.md) models via ExecuTorch. +**[UPDATE - 09/25]** We have added support for running [Llama 3.2 1B/3B](./examples/models/llama/README.md) models via ExecuTorch. ## Feedback diff --git a/backends/qualcomm/runtime/targets.bzl b/backends/qualcomm/runtime/targets.bzl index 24ebcd98086..f7a3e220dee 100644 --- a/backends/qualcomm/runtime/targets.bzl +++ b/backends/qualcomm/runtime/targets.bzl @@ -53,7 +53,7 @@ def define_common_targets(): exclude = ["Logging.h"], ), define_static_target = True, - link_whole = True, # needed for executorch/examples/models/llama2:main to register QnnBackend + link_whole = True, # needed for executorch/examples/models/llama:main to register QnnBackend platforms = [ANDROID], visibility = ["@EXECUTORCH_CLIENTS"], resources = { diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 01b1014e4cf..1594ad58dbb 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -37,10 +37,7 @@ skip_annotation, ) -from executorch.examples.models.llama2.llama_transformer import ( - ModelArgs, - MOEFeedForward, -) +from executorch.examples.models.llama.llama_transformer import ModelArgs, MOEFeedForward from executorch.examples.qualcomm.utils import setup_common_args_and_variables @@ -51,7 +48,7 @@ from executorch.examples.models.inception_v3 import InceptionV3Model from executorch.examples.models.inception_v4 import InceptionV4Model -# from executorch.examples.models.llama2 import Llama2Model +# from executorch.examples.models.llama import Llama2Model from executorch.examples.models.mobilebert import MobileBertModelExample from executorch.examples.models.mobilenet_v2 import MV2Model from executorch.examples.models.mobilenet_v3 import MV3Model diff --git a/backends/vulkan/docs/android_demo.md b/backends/vulkan/docs/android_demo.md index de6c400f113..2a4faacc0c8 100644 --- a/backends/vulkan/docs/android_demo.md +++ b/backends/vulkan/docs/android_demo.md @@ -57,7 +57,7 @@ partially lower the Llama model to Vulkan. ```shell # The files will usually be downloaded to ~/.llama -python -m examples.models.llama2.export_llama \ +python -m examples.models.llama.export_llama \ --disable_dynamic_shape --vulkan -kv --use_sdpa_with_kv_cache -d fp32 \ -c ~/.llama/checkpoints/Llama3.2-1B/consolidated.00.pth \ -p ~/.llama/checkpoints/Llama3.2-1B/params.json \ @@ -95,23 +95,23 @@ binary using the Android NDK toolchain. 
cmake --build cmake-android-out -j16 --target install) # Build LLaMA Runner library -(rm -rf cmake-android-out/examples/models/llama2 && \ - cmake examples/models/llama2 \ +(rm -rf cmake-android-out/examples/models/llama && \ + cmake examples/models/llama \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_ABI=$ANDROID_ABI \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DPYTHON_EXECUTABLE=python \ - -Bcmake-android-out/examples/models/llama2 && \ - cmake --build cmake-android-out/examples/models/llama2 -j16) + -Bcmake-android-out/examples/models/llama && \ + cmake --build cmake-android-out/examples/models/llama -j16) ``` Finally, push and run the llama runner binary on your Android device. Note that your device must have sufficient GPU memory to execute the model. ```shell -adb push cmake-android-out/examples/models/llama2/llama_main /data/local/tmp/llama_main +adb push cmake-android-out/examples/models/llama/llama_main /data/local/tmp/llama_main adb shell /data/local/tmp/llama_main \ --model_path=/data/local/tmp/vulkan_llama2.pte \ diff --git a/backends/xnnpack/test/TARGETS b/backends/xnnpack/test/TARGETS index 11209e41bac..b2db8060e1a 100644 --- a/backends/xnnpack/test/TARGETS +++ b/backends/xnnpack/test/TARGETS @@ -58,7 +58,7 @@ runtime.python_test( "fbsource//third-party/pypi/torchsr:torchsr", # @manual "fbsource//third-party/pypi/transformers:transformers", # @manual "//executorch/backends/xnnpack/test/tester:tester", - "//executorch/examples/models/llama2:llama2_model", + "//executorch/examples/models/llama:llama2_model", "//pytorch/audio/src:torchaudio_core", "//pytorch/vision:torchvision", # @manual ], diff --git a/backends/xnnpack/test/models/llama2_et_example.py b/backends/xnnpack/test/models/llama2_et_example.py index 6948321d532..f1dce43c3c9 100644 --- a/backends/xnnpack/test/models/llama2_et_example.py +++ b/backends/xnnpack/test/models/llama2_et_example.py @@ -9,7 +9,7 @@ import torch from executorch.backends.xnnpack.test.tester import Tester -from executorch.examples.models.llama2.model import Llama2Model +from executorch.examples.models.llama.model import Llama2Model class TestLlama2ETExample(unittest.TestCase): diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index 47bcf0ce4bc..9d4e595da3d 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -383,7 +383,7 @@ deps = [ [targets.llama_runner] buck_targets = [ - "//examples/models/llama2/runner:runner", + "//examples/models/llama/runner:runner", ] filters = [ ".cpp$", diff --git a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md index ac95fb21bd8..0157668d7fe 100644 --- a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md @@ -6,7 +6,7 @@ This tutorial demonstrates how to export Llama 3 8B Instruct for Qualcomm AI Eng - Set up your ExecuTorch repo and environment if you haven’t done so by following [the Setting up ExecuTorch](../getting-started-setup.md) to set up the repo and dev environment. - Read [the Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend page](../build-run-qualcomm-ai-engine-direct-backend.md) to understand how to export and run a model with Qualcomm AI Engine Direct Backend on Qualcomm device. 
-- Follow [the README for executorch llama](https://github.com/pytorch/executorch/tree/main/examples/models/llama2) to know how to run a llama model on mobile via ExecuTorch. +- Follow [the README for executorch llama](https://github.com/pytorch/executorch/tree/main/examples/models/llama) to know how to run a llama model on mobile via ExecuTorch. - A Qualcomm device with 16GB RAM - We are continuing to optimize our memory usage to ensure compatibility with lower memory devices. - The version of [Qualcomm AI Engine Direct SDK](https://developer.qualcomm.com/software/qualcomm-ai-engine-direct-sdk) is 2.26.0 or above. @@ -39,7 +39,7 @@ To export Llama 3 8B instruct with the Qualcomm AI Engine Direct Backend, ensure ```bash # Please note that calibration_data must include the prompt template for special tokens. -python -m examples.models.llama2.export_llama -t +python -m examples.models.llama.export_llama -t llama3/Meta-Llama-3-8B-Instruct/tokenizer.model -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" ``` @@ -76,9 +76,9 @@ llama3/Meta-Llama-3-8B-Instruct/tokenizer.model -p -c ${DEVICE_DIR} adb push ${DEVICE_DIR} adb push cmake-android-out/lib/libqnn_executorch_backend.so ${DEVICE_DIR} -adb push cmake-out-android/examples/models/llama2/llama_main ${DEVICE_DIR} +adb push cmake-out-android/examples/models/llama/llama_main ${DEVICE_DIR} ``` **3.4 Run model** diff --git a/docs/source/llm/llama.md b/docs/source/llm/llama.md index 2d266ba7ae7..fd0e436b94d 100644 --- a/docs/source/llm/llama.md +++ b/docs/source/llm/llama.md @@ -1,5 +1,5 @@ # Llama on ExecuTorch See -[Llama readme](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/README.md) +[Llama readme](https://github.com/pytorch/executorch/blob/main/examples/models/llama/README.md) for detailed information about running Llama on ExecuTorch. diff --git a/examples/README.md b/examples/README.md index 2c1093296cb..17999b15423 100644 --- a/examples/README.md +++ b/examples/README.md @@ -39,7 +39,7 @@ For specific details related to models and backend, you can explore the various ### Llama Models -[This page](./models/llama2/README.md) demonstrates how to run Llama 3.2 (1B, 3B), Llama 3.1 (8B), Llama 3 (8B), and Llama 2 7B models on mobile via ExecuTorch. We use XNNPACK, QNNPACK, MediaTek, and MPS to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on Android and iOS mobile phones. +[This page](./models/llama/README.md) demonstrates how to run Llama 3.2 (1B, 3B), Llama 3.1 (8B), Llama 3 (8B), and Llama 2 7B models on mobile via ExecuTorch. We use XNNPACK, QNNPACK, MediaTek, and MPS to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on Android and iOS mobile phones. 
### Llava1.5 7B diff --git a/examples/apple/mps/executor_runner/mps_executor_runner.mm b/examples/apple/mps/executor_runner/mps_executor_runner.mm index e3d0e2978b6..237645ec3f8 100644 --- a/examples/apple/mps/executor_runner/mps_executor_runner.mm +++ b/examples/apple/mps/executor_runner/mps_executor_runner.mm @@ -372,7 +372,7 @@ HierarchicalAllocator planned_memory( strstr(model_path, "emformer_transcribe") || strstr(model_path, "emformer_join") || strstr(model_path, "edsr") || - strstr(model_path, "llama2") || + strstr(model_path, "llama") || strstr(model_path, "ic3") || strstr(model_path, "ic4")) { atol = 1e-04; diff --git a/examples/cadence/models/babyllama.py b/examples/cadence/models/babyllama.py index 603eb5f3d9e..58a3035723f 100644 --- a/examples/cadence/models/babyllama.py +++ b/examples/cadence/models/babyllama.py @@ -14,7 +14,7 @@ from executorch.backends.cadence.aot.export_example import export_model -from executorch.examples.models.llama2.llama_transformer import ModelArgs, Transformer +from executorch.examples.models.llama.llama_transformer import ModelArgs, Transformer FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md index 54bf956176e..8308da6d840 100644 --- a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md @@ -74,7 +74,7 @@ cmake --build cmake-out -j16 --target install --config Release ### Setup Llama Runner Next we need to build and compile the Llama runner. This is similar to the requirements for running Llama with XNNPACK. ``` -sh examples/models/llama2/install_requirements.sh +sh examples/models/llama/install_requirements.sh cmake -DPYTHON_EXECUTABLE=python \ -DCMAKE_INSTALL_PREFIX=cmake-out \ @@ -84,9 +84,9 @@ cmake -DPYTHON_EXECUTABLE=python \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_QNN=ON \ - -Bcmake-out/examples/models/llama2 \ - examples/models/llama2 -cmake --build cmake-out/examples/models/llama2 -j16 --config Release + -Bcmake-out/examples/models/llama \ + examples/models/llama +cmake --build cmake-out/examples/models/llama -j16 --config Release ``` ## Export Llama Model @@ -101,12 +101,12 @@ We support PTQ by default. The entire export may take ~20 minutes (Llama 3.1 8B) Examples: ``` # 4 bits weight only quantize -python -m examples.models.llama2.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_16a4w -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” +python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_16a4w -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” ``` If the model is really big, it may require model sharding because the Qualcomm DSP is a 32bit system and has a 4GB size limit . For example for Llama 3 8B models, we need to shard the model into 4, but ExecuTorch still packages it into one PTE file. 
Here is an example: ``` # 8 bits quantization with 4 shards -python -m examples.models.llama2.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_8a8w -d fp32 --num_sharding 4 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” +python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_8a8w -d fp32 --num_sharding 4 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” ``` Note: if you encountered issues below ``` @@ -158,7 +158,7 @@ To export Llama 3 8B instruct with the Qualcomm AI Engine Direct Backend, ensure * 8B models might need 16GB RAM on the device to run. ``` # Please note that calibration_data must include the prompt template for special tokens. -python -m examples.models.llama2.export_llama -t -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" +python -m examples.models.llama.export_llama -t -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" ``` ## Pushing Model and Tokenizer diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md index 9a8b86b8a50..527922c7723 100644 --- a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md @@ -66,12 +66,12 @@ In this demo app, we support text-only inference with up-to-date Llama models an We have supported BFloat16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B models. * You can request and download model weights for Llama through Meta official [website](https://llama.meta.com/). * For chat use-cases, download the instruct models instead of pretrained. -* Run `examples/models/llama2/install_requirements.sh` to install dependencies. +* Run `examples/models/llama/install_requirements.sh` to install dependencies. * The 1B model in BFloat16 format can run on mobile devices with 8GB RAM. The 3B model will require 12GB+ RAM. * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama2.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2.pte" +python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2.pte" ``` * Rename tokenizer for Llama 3.2 with command: `mv tokenizer.model tokenizer.bin`. 
We are updating the demo app to support tokenizer in original format directly. @@ -88,19 +88,19 @@ To safeguard your application, you can use our Llama Guard models for prompt cla * We prepared this model using the following command ``` -python -m examples.models.llama2.export_llama --checkpoint --params -d fp32 -kv --use_sdpa_with_kv_cache --quantization_mode 8da4w --group_size 256 --xnnpack --max_seq_length 8193 --embedding-quantize 4,32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_prune_map --output_name="llama_guard_3_1b_pruned_xnnpack.pte" +python -m examples.models.llama.export_llama --checkpoint --params -d fp32 -kv --use_sdpa_with_kv_cache --quantization_mode 8da4w --group_size 256 --xnnpack --max_seq_length 8193 --embedding-quantize 4,32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_prune_map --output_name="llama_guard_3_1b_pruned_xnnpack.pte" ``` ### For Llama 3.1 and Llama 2 models * You can download original model weights for Llama through Meta official [website](https://llama.meta.com/). * For Llama 2 models, Edit params.json file. Replace "vocab_size": -1 with "vocab_size": 32000. This is a short-term workaround -* Run `examples/models/llama2/install_requirements.sh` to install dependencies. +* Run `examples/models/llama/install_requirements.sh` to install dependencies. * The Llama 3.1 and Llama 2 models (8B and 7B) can run on devices with 12GB+ RAM. * Export Llama model and generate .pte file ``` -python -m examples.models.llama2.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama.pte" +python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama.pte" ``` You may wonder what the ‘--metadata’ flag is doing. This flag helps export the model with proper special tokens added that the runner can detect EOS tokens easily. 
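For intuition, the `--metadata` flag takes exactly the JSON string shown in the command above; a minimal illustrative sketch (not the actual runner code) of how those ids are consumed:

```python
import json

# The same JSON string passed to --metadata in the export command above.
metadata = json.loads('{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}')

bos_id = metadata["get_bos_id"]         # token id prepended to the prompt
eos_ids = set(metadata["get_eos_ids"])  # token ids treated as end-of-sequence

def should_stop(token_id: int) -> bool:
    # Illustrative: generation stops once an EOS token id is produced.
    return token_id in eos_ids
```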
diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj index c947e20af94..7b93af46e4e 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj @@ -99,8 +99,8 @@ 035A5E942BB4B523001E0553 /* LLaMA.entitlements */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.entitlements; path = LLaMA.entitlements; sourceTree = ""; }; 036CAF9D2BB1444500D6C2D5 /* LLaMA.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = LLaMA.app; sourceTree = BUILT_PRODUCTS_DIR; }; 03729ED52BB1F8DE00152F2E /* LLaMARunner.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = LLaMARunner.framework; sourceTree = BUILT_PRODUCTS_DIR; }; - 03729F072BB203B300152F2E /* runner.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = runner.cpp; path = ../../../examples/models/llama2/runner/runner.cpp; sourceTree = ""; }; - 03729F082BB203B300152F2E /* runner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = runner.h; path = ../../../examples/models/llama2/runner/runner.h; sourceTree = ""; }; + 03729F072BB203B300152F2E /* runner.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = runner.cpp; path = ../../../examples/models/llama/runner/runner.cpp; sourceTree = ""; }; + 03729F082BB203B300152F2E /* runner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = runner.h; path = ../../../examples/models/llama/runner/runner.h; sourceTree = ""; }; 03729F092BB203B300152F2E /* util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = util.h; sourceTree = ""; }; 03729F102BB2042B00152F2E /* sampler.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sampler.h; sourceTree = ""; }; 03729F112BB2042B00152F2E /* sampler.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sampler.cpp; sourceTree = ""; }; @@ -275,7 +275,7 @@ 03729F152BB2043600152F2E /* tokenizer.h */, ); name = tokenizer; - path = ../../../../../models/llama2/tokenizer; + path = ../../../../../models/llama/tokenizer; sourceTree = ""; }; 03729F0F2BB203E100152F2E /* sampler */ = { diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm b/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm index e6a4722ddbf..e03bc7aabc4 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm @@ -9,7 +9,7 @@ #import "LLaMARunner.h" #import -#import +#import #import using executorch::extension::llm::Image; diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md index 20ee73b821f..33dcc93a2b9 100644 --- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md +++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md @@ -42,12 +42,12 @@ In this demo app, we support text-only inference with Llama 3.1, Llama 3, and Ll Install the required packages to export the model ``` -sh 
examples/models/llama2/install_requirements.sh +sh examples/models/llama/install_requirements.sh ``` Export the model ``` -python -m examples.models.llama2.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" --params "${MODEL_DIR}/params.json" -kv --use_sdpa_with_kv_cache --mps -d fp32 --disable_dynamic_shape -qmode 8da4w -G 32 +python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" --params "${MODEL_DIR}/params.json" -kv --use_sdpa_with_kv_cache --mps -d fp32 --disable_dynamic_shape -qmode 8da4w -G 32 ``` ## Pushing Model and Tokenizer diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md index c3e254d77a2..faec4e3a3ac 100644 --- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md +++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md @@ -43,19 +43,19 @@ In this demo app, we support text-only inference with up-to-date Llama models. Install the required packages to export the model ``` -sh examples/models/llama2/install_requirements.sh +sh examples/models/llama/install_requirements.sh ``` ### For Llama 3.2 1B and 3B models We have supported BFloat16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B models. * You can download original model weights for Llama through Meta official [website](https://llama.meta.com/). * For chat use-cases, download the instruct models instead of pretrained. -* Run “examples/models/llama2/install_requirements.sh” to install dependencies. +* Run “examples/models/llama/install_requirements.sh” to install dependencies. * The 1B model in BFloat16 format can run on mobile devices with 8GB RAM (iPhone 15 Pro and later). The 3B model will require 12GB+ RAM and hence will not fit on 8GB RAM phones. * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama2.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2.pte" +python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2.pte" ``` For more detail using Llama 3.2 lightweight models including prompt template, please go to our official [website](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2#-llama-3.2-lightweight-models-(1b/3b)-). 
@@ -64,7 +64,7 @@ For more detail using Llama 3.2 lightweight models including prompt template, pl Export the model ``` -python -m examples.models.llama2.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" +python -m examples.models.llama.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" ``` ### For LLaVA model diff --git a/examples/mediatek/CMakeLists.txt b/examples/mediatek/CMakeLists.txt index 1d411f07ca7..61906870e1d 100644 --- a/examples/mediatek/CMakeLists.txt +++ b/examples/mediatek/CMakeLists.txt @@ -149,7 +149,7 @@ if(${ANDROID}) PRIVATE ${LLAMA2_TOKENIZER_DIR}/tiktoken.cpp ${LLAMA2_TOKENIZER_DIR}/bpe_tokenizer.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../models/llama2/tokenizer/llama_tiktoken.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../models/llama/tokenizer/llama_tiktoken.cpp ) # Include directory for neuron headers diff --git a/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp index 2ebacec2c56..4fba0e20a81 100644 --- a/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp @@ -67,7 +67,7 @@ #include "llama_runner/Utils.h" #include "llama_runner/llm_helper/include/llm_types.h" -#include +#include #include #include diff --git a/examples/models/checkpoint.py b/examples/models/checkpoint.py index 592ebab1450..ee3fb560429 100644 --- a/examples/models/checkpoint.py +++ b/examples/models/checkpoint.py @@ -15,15 +15,15 @@ def get_default_model_resource_dir(model_file_path: str) -> Path: Get the default path to resouce files (which contain files such as the checkpoint and param files), either: 1. Uses the path from pkg_resources, only works with buck2 - 2. Uses default path located in examples/models/llama2/params + 2. Uses default path located in examples/models/llama/params Expected to be called from with a `model.py` file located in a `executorch/examples/models/` directory. Args: model_file_path: The file path to the eager model definition. - For example, `executorch/examples/models/llama2/model.py`, - where `executorch/examples/models/llama2` contains all + For example, `executorch/examples/models/llama/model.py`, + where `executorch/examples/models/llama` contains all the llama2-related files. Returns: @@ -35,7 +35,7 @@ def get_default_model_resource_dir(model_file_path: str) -> Path: # 1st way: If we can import this path, we are running with buck2 and all resources can be accessed with pkg_resources. # pyre-ignore - from executorch.examples.models.llama2 import params # noqa + from executorch.examples.models.llama import params # noqa # Get the model name from the cwd, assuming that this module is called from a path such as # examples/models//model.py. 
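As a sketch of how this helper is meant to be used (the call site and file names below are illustrative, based on the demo params files referenced by the CI scripts above):

```python
from pathlib import Path

from executorch.examples.models.checkpoint import get_default_model_resource_dir

# Illustrative: a model.py under executorch/examples/models/llama/ would pass its own path.
resource_dir: Path = get_default_model_resource_dir(
    "executorch/examples/models/llama/model.py"
)

# e.g. the demo params shipped with the example (same files used by .ci/scripts/test_model.sh).
demo_params = resource_dir / "demo_config.json"
demo_checkpoint = resource_dir / "demo_rand_params.pth"
```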
diff --git a/examples/models/llama2/Android3_2_1B_bf16.gif b/examples/models/llama/Android3_2_1B_bf16.gif similarity index 100% rename from examples/models/llama2/Android3_2_1B_bf16.gif rename to examples/models/llama/Android3_2_1B_bf16.gif diff --git a/examples/models/llama2/Android3_2_3B_SpinQuant.gif b/examples/models/llama/Android3_2_3B_SpinQuant.gif similarity index 100% rename from examples/models/llama2/Android3_2_3B_SpinQuant.gif rename to examples/models/llama/Android3_2_3B_SpinQuant.gif diff --git a/examples/models/llama2/CMakeLists.txt b/examples/models/llama/CMakeLists.txt similarity index 100% rename from examples/models/llama2/CMakeLists.txt rename to examples/models/llama/CMakeLists.txt diff --git a/examples/models/llama2/LICENSE b/examples/models/llama/LICENSE similarity index 100% rename from examples/models/llama2/LICENSE rename to examples/models/llama/LICENSE diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md new file mode 100644 index 00000000000..29d468543a6 --- /dev/null +++ b/examples/models/llama/README.md @@ -0,0 +1,497 @@ +# Summary +This example demonstrates how to run a [llama models](https://www.llama.com/) on mobile via ExecuTorch. We use XNNPACK to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on a phone. + +Here are supported models: + +- Llama 3.2 1B and 3B +- Llama 3.1 8B +- Llama 3 8B +- Llama 2 7B + +Pretrained models are not included in this repo. Users are suggested to download them [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). + +# What is Llama? +Llama is a collection of large language models that use publicly available data for training. These models are based on the transformer architecture, which allows it to process input sequences of arbitrary length and generate output sequences of variable length. One of the key features of Llama models is its ability to generate coherent and contextually relevant text. This is achieved through the use of attention mechanisms, which allow the model to focus on different parts of the input sequence as it generates output. Additionally, Llama models use a technique called “masked language modeling” to pre-train the model on a large corpus of text, which helps it learn to predict missing words in a sentence. + +Llama models have shown to perform well on a variety of natural language processing tasks, including language translation, question answering, and text summarization and are also capable of generating human-like text, making Llama models a useful tool for creative writing and other applications where natural language generation is important. + +Overall, Llama models are powerful and versatile language models that can be used for a wide range of natural language processing tasks. The model’s ability to generate coherent and contextually relevant text makes it particularly useful for applications such as chatbots, virtual assistants, and language translation. + +Please note that the models are subject to the [Llama 2 Acceptable Use Policy](https://github.com/facebookresearch/llama/blob/main/USE_POLICY.md), [Llama 3 Acceptable Use Policy](https://github.com/meta-llama/llama3/blob/main/USE_POLICY.md) and [Responsible Use Guide](https://ai.meta.com/static-resource/responsible-use-guide/). + + +# Results + +Since Llama 2 7B or Llama 3 8B model needs at least 4-bit quantization to fit even within some of the highend phones, results presented here correspond to 4-bit groupwise post-training quantized model. 
+
+For Llama 3.2 1B/3B, we validated the models by running them unquantized, in their original bf16 datatype, on both Android and iOS phones. The 3B version requires a high-end phone with more RAM to fit the model.
+
+Additionally, the 1B/3B models are sensitive to accuracy loss when regular PTQ quantization is applied, so we employed 4-bit quantization using [SpinQuant](https://github.com/facebookresearch/SpinQuant/tree/main) to achieve a good balance between accuracy, performance and memory.
+
+ Llama3.1 8B, 4bit quantized on Android phone
+
+ Llama3.2 1B, unquantized, bf16 on Android phone.
+ +## Quantization: +We employed 4-bit groupwise per token dynamic quantization of all the linear layers of the model. Dynamic quantization refers to quantizating activations dynamically, such that quantization parameters for activations are calculated, from min/max range, at runtime. Here we quantized activations with 8bits (signed integer). Furthermore, weights are statically quantized. In our case weights were per-channel groupwise quantized with 4bit signed integer. For more information refer to this [page](https://github.com/pytorch/ao). + +We evaluated WikiText perplexity using [LM Eval](https://github.com/EleutherAI/lm-evaluation-harness). Please note that LM Eval reports perplexity normalized by word count instead of token count. You may see different perplexity for WikiText from other sources if they implement it differntly. More details could be found [here](https://github.com/EleutherAI/lm-evaluation-harness/issues/2301). + +Below are the results for two different groupsizes, with max_seq_length 2048, and limit 1000. + +|Model | Baseline (FP32) | Groupwise 4-bit (128) | Groupwise 4-bit (256) +|--------|-----------------| ---------------------- | --------------- +|Llama 2 7B | 9.2 | 10.2 | 10.7 +|Llama 3 8B | 7.9 | 9.4 | 9.7 + +Note that groupsize less than 128 was not enabled, since such models were still too large. This is because our current efforts have focused on enabling FP32 and support for FP16 is under way. What this implies for model size is that 1) embedding table is in FP32 and 2) quantized weights scales are FP32. + +### SpinQuant for Llama 3.2 1B/3B models (Optional) + +To improve accuracy, we can use [SpinQuant](https://github.com/facebookresearch/SpinQuant/tree/main), a post-training quantization (PTQ) technique that generates new quantized weights. In the standard PTQ process, quantization may lead to a decrease in accuracy when there are outliers. The SpinQuant method takes the original weights and produces optimized quantized weights with minimal outliers, resulting in higher accuracy. This can be achieved without any finetuning of the weights and only requires 100 iterations on a single A100 node. + +SpinQuant can generate quantized weights that are [compatible with ExecuTorch](https://github.com/facebookresearch/SpinQuant/tree/main?tab=readme-ov-file#3-export-to-executorch), specifically, it can be integrated with the existing optimized XNNPACK kernels (e.g., group-wise 4bit weight and 8bit dynamic activation). This allows developers to benefit from the higher accuracy of SpinQuant while also taking advantage of the strong performance of ExecuTorch acceleration. We enabled SpinQuant for Llama3.2 1B/3B models on ExecuTorch. + +

+ Running Llama3.2 3B on Android phone.
+
+ 4bit quantization using SpinQuant
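For intuition, here is a minimal sketch of the arithmetic behind the 8-bit dynamic-activation / 4-bit groupwise-weight scheme described in the Quantization section above. This is illustrative only; the actual export flow relies on the torchao/XNNPACK kernels rather than this code.

```python
import torch

def quantize_weight_groupwise_4bit(w: torch.Tensor, group_size: int = 32):
    """Symmetric 4-bit groupwise weight quantization (sketch). w: (out_features, in_features)."""
    out_f, in_f = w.shape
    groups = w.reshape(out_f, in_f // group_size, group_size)
    scale = groups.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 7.0  # int4 range is [-8, 7]
    q = torch.clamp(torch.round(groups / scale), -8, 7)
    return q.reshape(out_f, in_f), scale

def quantize_activation_dynamic_8bit(x: torch.Tensor):
    """Per-token dynamic 8-bit activation quantization; qparams come from the runtime min/max."""
    x_min = x.amin(dim=-1, keepdim=True)
    x_max = x.amax(dim=-1, keepdim=True)
    scale = (x_max - x_min).clamp(min=1e-8) / 255.0
    zero_point = torch.round(-128 - x_min / scale)
    q = torch.clamp(torch.round(x / scale) + zero_point, -128, 127)
    return q, scale, zero_point
```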

+ +## Enablement + +For Llama 3 8B and Llama3.1 8B, we have verified so far on iPhone 15 Pro, iPhone 15 Pro Max, Samsung Galaxy S24+ and OnePlus 12 (with 16GB RAM). + +We have verified running Llama 2 7B [mobile applications](#step-6-build-mobile-apps) efficiently on select devices including the iPhone 15 Pro, iPhone 15 Pro Max, Samsung Galaxy S22 and S24, and OnePlus 12. + +## Performance + +### Llama 3.2 1B and 3B +Llama 3.2 1B and 3B performance was measured on the OnePlus 12 device. The performance measurement is expressed in terms of tokens per second using an [adb binary-based approach](#step-5-run-benchmark-on) for generating 128 tokens. + +|Model | bf16 | 4bit(*) via SpinQuant +|--------| ---------------------- | --------------- +|1B | 19.4 tokens/second | 53.41 tokens/second | +|3B | 7.76 tokens/second | 22.98 tokens/second | + +(*) With SpinQuant, we currently quantize 4-bit groupwise (with groupsize 32) weight, 8bit dynamic activation of all the linear layers of the model, except embedding and output layers. The embedding and output layers are quantized as 8-bit per-channel weight and 8-bit dynamic activation. + +### Llama3 8B and Llama3.1 8B +Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus 12 devices. The performance measurement is expressed in terms of tokens per second using an [adb binary-based approach](#step-5-run-benchmark-on). + +Note that since Llama3's vocabulary size is 4x that of Llama2, we had to quantize embedding lookup table as well. For these results embedding lookup table was groupwise quantized with 4-bits and group size of 32. + +|Device | Groupwise 4-bit (128) | Groupwise 4-bit (256) +|--------| ---------------------- | --------------- +|Galaxy S22 | 7.85 tokens/second | 8.4 tokens/second | +|Galaxy S24 | 10.91 tokens/second | 11.21 tokens/second | +|OnePlus 12 | 10.85 tokens/second | 11.02 tokens/second | + +### Llama2 7B +Llama 2 7B performance was measured on the Samsung Galaxy S22, S24, and OnePlus 12 devices. The performance measurement is expressed in terms of tokens per second using an [adb binary-based approach](#step-5-run-benchmark-on). + +|Device | Groupwise 4-bit (128) | Groupwise 4-bit (256) +|--------| ---------------------- | --------------- +|Galaxy S22 | 8.15 tokens/second | 8.3 tokens/second | +|Galaxy S24 | 10.66 tokens/second | 11.26 tokens/second | +|OnePlus 12 | 11.55 tokens/second | 11.6 tokens/second | + +# Instructions + +## Tested on + +- MacOS M1/M2, Linux. +- For Llama 2 7B, your device may require at least 32GB RAM. If this is a constraint for you, please try the smaller stories model. + +## Step 1: Setup +> :warning: **double check your python environment**: make sure `conda activate ` is run before all the bash and python scripts. + +1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. For installation run `./install_requirements.sh --pybind xnnpack` +2. Run `examples/models/llama/install_requirements.sh` to install a few dependencies. + + +## Step 2: Prepare model + +### Option A: Download and export Llama3.2 1B/3B model. + +1. Download `consolidated.00.pth`, `params.json` and `tokenizer.model` from [Llama website](https://www.llama.com/llama-downloads/) or [Hugging Face](https://huggingface.co/meta-llama/Llama-3.2-1B). For chat use-cases, download the instruct models. + +2. Export model and generate `.pte` file. Use original bfloat16 version, without any quantization. 
+ +``` +# Set these paths to point to the downloaded files +LLAMA_CHECKPOINT=path/to/checkpoint.pth +LLAMA_PARAMS=path/to/params.json + +python -m examples.models.llama.export_llama \ + --checkpoint "${LLAMA_CHECKPOINT:?}" \ + --params "${LLAMA_PARAMS:?}" \ + -kv \ + --use_sdpa_with_kv_cache \ + -X \ + -d bf16 \ + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ + --output_name="llama3_2.pte" +``` + +Optionally, we can apply SpinQuant to quantize the model without sacrifacing too much accuracy loss. + +To use SpinQuant, follow its [instruction](https://github.com/facebookresearch/SpinQuant/tree/main?tab=readme-ov-file#3-export-to-executorch) for exporting checkpoint to ExecuTorch and then export the SpinQuant checkpoint. + +``` +# Set these paths to point to the exported files +LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/checkpoint.pth +LLAMA_PARAMS=path/to/params.json + +python -m examples.models.llama.export_llama \ + --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \ + --params "${LLAMA_PARAMS:?}" \ + --use_sdpa_with_kv_cache \ + -X \ + --preq_mode 8da4w_output_8da8w \ + --preq_group_size 32 \ + --max_seq_length 2048 \ + --output_name "llama3_2.pte" \ + -kv \ + -d fp32 \ + --preq_embedding_quantize 8,0 \ + --use_spin_quant native \ + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' +``` + +### Option B: Download and export Llama 3 8B instruct model + +You can export and run the original Llama 3 8B instruct model. + +1. Llama 3 pretrained parameters can be downloaded from [Meta's official Llama 3 repository](https://github.com/meta-llama/llama3/). + +2. Export model and generate `.pte` file + ``` + python -m examples.models.llama.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" + ``` + + Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `--embedding-quantize 4,32` as shown above to further reduce the model size. + +### Option C: Download and export stories110M model + +If you want to deploy and run a smaller model for educational purposes. From `executorch` root: + +1. Download `stories110M.pt` and `tokenizer.model` from Github. + ``` + wget "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt" + wget "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" + ``` +2. Create params file. + ``` + echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json + ``` +3. Export model and generate `.pte` file. + ``` + python -m examples.models.llama.export_llama -c stories110M.pt -p params.json -X -kv + ``` + +### Option D: Download and export Llama 2 7B model + +You can export and run the original Llama 2 7B model. + +1. Llama 2 pretrained parameters can be downloaded from [Meta's official website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) or from [Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b). + +2. Edit `params.json` file. Replace `"vocab_size": -1` with `"vocab_size": 32000`. This is a short-term workaround. + +3. Export model and generate `.pte` file: + ``` + python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 + ``` +4. Create tokenizer.bin. 
+ ``` + python -m extension.llm.tokenizer.tokenizer -t -o tokenizer.bin + ``` + +### Option E: Download models from Hugging Face and convert from safetensor format to state dict + + +You can also download above models from [Hugging Face](https://huggingface.co/). Since ExecuTorch starts from a PyTorch model, a script like below can be used to convert the Hugging Face safetensors format to PyTorch's state dict. It leverages the utils provided by [TorchTune](https://github.com/pytorch/torchtune). + + +```Python +from torchtune.utils import FullModelHFCheckpointer +from torchtune.models import convert_weights +import torch + +# Convert from safetensors to TorchTune. Suppose the model has been downloaded from Hugging Face +checkpointer = FullModelHFCheckpointer( + checkpoint_dir='/home/.cache/huggingface/hub/models/snapshots/hash-number', + checkpoint_files=['model-00001-of-00002.safetensors', 'model-00002-of-00002.safetensors'], + output_dir='/the/destination/dir' , + model_type='LLAMA3' # or other types that TorchTune supports +) + +print("loading checkpoint") +sd = checkpointer.load_checkpoint() + +# Convert from TorchTune to Meta (PyTorch native) +sd = convert_weights.tune_to_meta(sd['model']) + +print("saving checkpoint") +torch.save(sd, "/the/destination/dir/checkpoint.pth") +``` + +## (Optional) Finetuning + +If you want to finetune your model based on a specific dataset, PyTorch provides [TorchTune](https://github.com/pytorch/torchtune) - a native-Pytorch library for easily authoring, fine-tuning and experimenting with LLMs. + +Once you have [TorchTune installed](https://github.com/pytorch/torchtune?tab=readme-ov-file#get-started) you can finetune Llama2 7B model using LoRA on a single GPU, using the following command. This will produce a checkpoint where the LoRA weights are merged with the base model and so the output checkpoint will be in the same format as the original Llama2 model. + +``` +tune run lora_finetune_single_device \ +--config llama2/7B_lora_single_device \ +checkpointer.checkpoint_dir= \ +tokenizer.path=/tokenizer.model +``` + +To run full finetuning with Llama2 7B on a single device, you can use the following command. + +``` +tune run full_finetune_single_device \ +--config llama2/7B_full_single_device \ +checkpointer.checkpoint_dir= \ +tokenizer.path=/tokenizer.model +``` + +## Step 3: Evaluate model accuracy + +> Forewarning: Model evaluation without a GPU may take a long time, especially on larger models. + +We use [LM Eval](https://github.com/EleutherAI/lm-evaluation-harness) to evaluate model accuracy. + +For base models, use the following example command to calculate its perplexity based on WikiText. +``` +python -m examples.models.llama.eval_llama \ + -c \ + -p \ + -t \ + -kv \ + -d \ + --max_seq_len \ + --limit +``` + +For instruct models, use the following example command to calculate its MMLU score. +``` +python -m examples.models.llama.eval_llama \ + -c \ + -p \ + -t \ + -kv \ + -d \ + --tasks mmlu \ + --num_fewshot 5 \ + --max_seq_len +``` + +## Step 4: Run on your computer to validate + +1. Build executorch with optimized CPU performance as follows. Build options available [here](https://github.com/pytorch/executorch/blob/main/CMakeLists.txt#L59). 
+ ``` + cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DEXECUTORCH_ENABLE_LOGGING=1 \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -Bcmake-out . + + cmake --build cmake-out -j16 --target install --config Release + ``` +Note for Mac users: There's a known linking issue with Xcode 15.1. Refer to the session of Common Issues and Mitigations below for solutions. + +2. Build llama runner. + ``` + cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -Bcmake-out/examples/models/llama \ + examples/models/llama + + cmake --build cmake-out/examples/models/llama -j16 --config Release + ``` + +3. Run model. Run options available [here](https://github.com/pytorch/executorch/blob/main/examples/models/llama/main.cpp#L18-L40). + ``` + cmake-out/examples/models/llama/llama_main --model_path= --tokenizer_path= --prompt= + ``` + +For Llama2 models, pass the converted `tokenizer.bin` file instead of `tokenizer.model`. + +To build for CoreML backend and validate on Mac, replace `-DEXECUTORCH_BUILD_XNNPACK=ON` with `-DEXECUTORCH_BUILD_COREML=ON` + +## Step 5: Run benchmark on Android phone + +**1. Build llama runner binary for Android** + +*Pre-requisite*: Android NDK (tested with r27b) which can be downloaded from [here](https://developer.android.com/ndk/downloads). Note that the mac binary can be unpackaged and you can locate NDK folder from it. + +**1.1 Set Android NDK** +``` +export ANDROID_NDK= +``` +**1.2 Build executorch and associated libraries for android.** +``` +cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + -DCMAKE_INSTALL_PREFIX=cmake-out-android \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_ENABLE_LOGGING=1 \ + -DPYTHON_EXECUTABLE=python \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -Bcmake-out-android . + +cmake --build cmake-out-android -j16 --target install --config Release +``` + +**1.2 Build llama runner for android** +``` +cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + -DCMAKE_INSTALL_PREFIX=cmake-out-android \ + -DCMAKE_BUILD_TYPE=Release \ + -DPYTHON_EXECUTABLE=python \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -Bcmake-out-android/examples/models/llama \ + examples/models/llama + +cmake --build cmake-out-android/examples/models/llama -j16 --config Release +``` + +**2. 
Run on Android via adb shell** + +*Pre-requisite*: Make sure you enable USB debugging via developer options on your phone + +**2.1 Connect your android phone** + +**2.2 Upload model, tokenizer and llama runner binary to phone** +``` +adb shell mkdir -p /data/local/tmp/llama +adb push /data/local/tmp/llama/ +adb push /data/local/tmp/llama/ +adb push cmake-out-android/examples/models/llama/llama_main /data/local/tmp/llama/ +``` + +**2.3 Run model** +``` +adb shell "cd /data/local/tmp/llama && ./llama_main --model_path --tokenizer_path --prompt \"Once upon a time\" --seq_len 120" +``` +## Step 6: Build Mobile apps + +### iOS + +Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-ios.html) to for full instructions on building the iOS LLAMA Demo App. Rename `tokenizer.model` file to `tokenizer.bin` because the demo app looks for the tokenizer file with .bin extension. + +### Android +Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-android.html) to for full instructions on building the Android LLAMA Demo App. + +## Optional: Smaller models delegated to other backends +Currently we supported lowering the stories model to other backends, including, CoreML, MPS and QNN. Please refer to the instruction +for each backend ([CoreML](https://pytorch.org/executorch/main/build-run-coreml.html), [MPS](https://pytorch.org/executorch/main/build-run-mps.html), [QNN](https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html)) before trying to lower them. After the backend library is installed, the script to export a lowered model is + +- Lower to CoreML: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json ` +- MPS: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json ` +- QNN: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --qnn -c stories110M.pt -p params.json ` + +The iOS LLAMA app supports the CoreML and MPS model and the Android LLAMA app supports the QNN model. On Android, it also allow to cross compiler the llama runner binary, push to the device and run. + +For CoreML, there are 2 additional optional arguments: +* `--coreml-ios`: Specify the minimum iOS version to deploy (and turn on available optimizations). E.g. `--coreml-ios 18` will turn on [in-place KV cache](https://developer.apple.com/documentation/coreml/mlstate?language=objc) and [fused scaled dot product attention kernel](https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS18.transformers.scaled_dot_product_attention) (the resulting model will then need at least iOS 18 to run, though) +* `--coreml-quantize`: Use [quantization tailored for CoreML](https://apple.github.io/coremltools/docs-guides/source/opt-quantization-overview.html). E.g. `--coreml-quantize b4w` will perform per-block 4-bit weight-only quantization in a way tailored for CoreML + +# What is coming next? +## Quantization +- Enabling FP16 model to leverage smaller groupsize for 4-bit quantization. +- Enabling GPTQ for 4-bit groupwise quantization +- Enabling custom quantization +- Lower bit quantization +## Models +- Enabling more generative AI models and architectures. +- Enable support for mult-modal models like LlaVa. 
+## Performance
+- Performance improvements via techniques such as speculative decoding
+- Enabling Llama 2 7B and other architectures via Vulkan
+- Enabling performant execution of widely used quantization schemes
+
+
+# Notes
+This example tries to reuse the original Python model code, with minimal modifications to make it compatible with current ExecuTorch:
+1. Since ExecuTorch does not support the complex Tensor data type, customized functions are used to compute rotary embeddings with real numbers. Please see [GitHub issue: Support complex data type in ExecuTorch](https://github.com/pytorch/executorch/issues/886).
+2. No CUDA. ExecuTorch is focused on edge use cases where CUDA is not available on most devices.
+3. No dependencies on fairscale. ColumnParallelLinear, ParallelEmbedding and training are neither needed nor supported in ExecuTorch.
+
+
+# Common Issues and Mitigations:
+- To clean your build:
+```
+git clean -xfd
+pip uninstall executorch
+./install_requirements.sh --pybind xnnpack
+
+rm -rf cmake-out
+```
+- If you encounter `pthread`-related issues at link time, add `pthread` to `target_link_libraries` in `CMakeLists.txt` (a minimal sketch is shown at the end of this section).
+- On Mac, if there is a linking error in Step 4 with an error message like
+```
+0 0x100823648 __assert_rtn + 72
+1 0x10074bc5c ld::Fixup::applyFixup(ld::Atom const*, ld::LayoutLinkedImage const&, unsigned char*) const + 8268
+2 0x1007de7d8 ___ZN2ld16LayoutExecutable27writeContentWithoutLinkEditENSt3__14spanIhLm18446744073709551615EEEy_block_invoke + 332
+3 0x188cca428 _dispatch_client_callout2 + 20
+4 0x188cde850 _dispatch_apply_invoke3 + 336
+5 0x188cca3e8 _dispatch_client_callout + 20
+6 0x188ccbc68 _dispatch_once_callout + 32
+7 0x188cdeeec _dispatch_apply_invoke_and_wait + 372
+8 0x188cdde9c _dispatch_apply_with_attr_f + 1212
+9 0x188cde08c dispatch_apply + 96
+10 0x1007de9e4 void mapReduce(std::__1::span, unsigned long, void (unsigned long, mach_o::Error&, std::__1::span) block_pointer, void (std::__1::span) block_pointer) + 336
+11 0x1007de594 ld::LayoutExecutable::writeContentWithoutLinkEdit(std::__1::span, unsigned long long) + 1180
+12 0x1007e4020 ld::LayoutExecutable::writeToFile(char const*) + 15248
+13 0x1007962e8 main + 9424
+ld: Assertion failed: (extras.otherInstrOffset != 0 && "Kind::arm64_adrp_ldr missing extra info"), function applyFixup, file Fixup.cpp, line 793.
+clang: error: linker command failed with exit code 1 (use -v to see invocation)
+```
+This is a known issue with Xcode version 15.1.
+Mitigation: update to the most recent Xcode version, then clean and rebuild.
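+
+A minimal sketch of the `pthread` mitigation above, assuming the runner target in your `CMakeLists.txt` is named `llama_main` (adjust the target name to match your build):
+```
+# Sketch only: explicitly link the pthread library into the runner target
+# when the linker cannot resolve pthread symbols at link time.
+target_link_libraries(llama_main PRIVATE pthread)
+```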
diff --git a/examples/models/llama2/TARGETS b/examples/models/llama/TARGETS similarity index 89% rename from examples/models/llama2/TARGETS rename to examples/models/llama/TARGETS index 6d806cbc0d6..751c61da977 100644 --- a/examples/models/llama2/TARGETS +++ b/examples/models/llama/TARGETS @@ -16,7 +16,7 @@ runtime.python_library( "rope.py", ], _is_external_target = True, - base_module = "executorch.examples.models.llama2", + base_module = "executorch.examples.models.llama", visibility = [ "//executorch/...", "@EXECUTORCH_CLIENTS", @@ -34,9 +34,9 @@ runtime.python_library( "model.py", ], _is_external_target = True, - base_module = "executorch.examples.models.llama2", + base_module = "executorch.examples.models.llama", resources = { - "//executorch/examples/models/llama2/params:params": "params", + "//executorch/examples/models/llama/params:params": "params", }, visibility = [ "//bento/...", @@ -46,14 +46,14 @@ runtime.python_library( deps = [ "//caffe2:torch", "//executorch/examples/models:model_base", - "//executorch/examples/models/llama2:llama_transformer", + "//executorch/examples/models/llama:llama_transformer", "//executorch/examples/models:checkpoint", ], ) runtime.python_binary( name = "export_llama", - main_function = "executorch.examples.models.llama2.export_llama.main", + main_function = "executorch.examples.models.llama.export_llama.main", # visibility = ["//executorch/examples/..."], preload_deps = [ "//executorch/extension/llm/custom_ops:model_sharding_py", @@ -93,7 +93,7 @@ runtime.python_library( "source_transformation/spin_quant.py", ], _is_external_target = True, - base_module = "executorch.examples.models.llama2", + base_module = "executorch.examples.models.llama", visibility = [ "//bento/...", "//bento_kernels/...", @@ -124,7 +124,7 @@ runtime.python_library( runtime.python_binary( name = "eval_llama", - main_function = "executorch.examples.models.llama2.eval_llama.main", + main_function = "executorch.examples.models.llama.eval_llama.main", preload_deps = [ "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", "//executorch/kernels/quantized:aot_lib", @@ -143,7 +143,7 @@ runtime.python_library( "evaluate/eager_eval.py", ], _is_external_target = True, - base_module = "executorch.examples.models.llama2", + base_module = "executorch.examples.models.llama", visibility = [ "//bento/...", "//bento_kernels/...", @@ -154,7 +154,7 @@ runtime.python_library( "fbsource//third-party/pypi/lm-eval:lm-eval", "fbsource//third-party/pypi/tiktoken:tiktoken", ":export_library", - "//executorch/examples/models/llama2/tokenizer:tiktoken_py", + "//executorch/examples/models/llama/tokenizer:tiktoken_py", "//executorch/extension/llm/export:export_lib", "//executorch/extension/llm/tokenizer:tokenizer_py_lib", "//executorch/extension/pybindings:portable_lib", @@ -196,7 +196,7 @@ runtime.python_test( deps = [ ":quantized_kv_cache", "//caffe2:torch", - "//executorch/examples/models/llama2:llama_transformer", + "//executorch/examples/models/llama:llama_transformer", ], ) @@ -212,6 +212,6 @@ runtime.python_test( ":quantized_kv_cache", ":sdpa", "//caffe2:torch", - "//executorch/examples/models/llama2:llama_transformer", + "//executorch/examples/models/llama:llama_transformer", ], ) diff --git a/examples/models/llama2/__init__.py b/examples/models/llama/__init__.py similarity index 100% rename from examples/models/llama2/__init__.py rename to examples/models/llama/__init__.py diff --git a/examples/models/llama2/eval_llama.py b/examples/models/llama/eval_llama.py similarity index 100% rename from 
examples/models/llama2/eval_llama.py rename to examples/models/llama/eval_llama.py diff --git a/examples/models/llama2/eval_llama_lib.py b/examples/models/llama/eval_llama_lib.py similarity index 98% rename from examples/models/llama2/eval_llama_lib.py rename to examples/models/llama/eval_llama_lib.py index 95b3ff0fb7c..e95e6998d9b 100644 --- a/examples/models/llama2/eval_llama_lib.py +++ b/examples/models/llama/eval_llama_lib.py @@ -10,10 +10,10 @@ from typing import Optional, Union import torch -from executorch.examples.models.llama2.export_llama_lib import ( +from executorch.examples.models.llama.export_llama_lib import ( get_quantizer_and_quant_params, ) -from executorch.examples.models.llama2.tokenizer.tiktoken import Tokenizer as Tiktoken +from executorch.examples.models.llama.tokenizer.tiktoken import Tokenizer as Tiktoken from executorch.extension.llm.export.builder import LLMEdgeManager from executorch.extension.llm.tokenizer.tokenizer import ( diff --git a/examples/models/llama2/evaluate/__init__.py b/examples/models/llama/evaluate/__init__.py similarity index 100% rename from examples/models/llama2/evaluate/__init__.py rename to examples/models/llama/evaluate/__init__.py diff --git a/examples/models/llama2/evaluate/eager_eval.py b/examples/models/llama/evaluate/eager_eval.py similarity index 96% rename from examples/models/llama2/evaluate/eager_eval.py rename to examples/models/llama/evaluate/eager_eval.py index 784112e052b..3d0a9a0d70e 100644 --- a/examples/models/llama2/evaluate/eager_eval.py +++ b/examples/models/llama/evaluate/eager_eval.py @@ -8,7 +8,7 @@ from typing import Optional, Union import torch -from executorch.examples.models.llama2.tokenizer.tiktoken import Tokenizer as Tiktoken +from executorch.examples.models.llama.tokenizer.tiktoken import Tokenizer as Tiktoken from executorch.extension.llm.tokenizer.tokenizer import ( Tokenizer as SentencePieceTokenizer, ) diff --git a/examples/models/llama2/experimental/README.md b/examples/models/llama/experimental/README.md similarity index 100% rename from examples/models/llama2/experimental/README.md rename to examples/models/llama/experimental/README.md diff --git a/examples/models/llama2/experimental/TARGETS b/examples/models/llama/experimental/TARGETS similarity index 100% rename from examples/models/llama2/experimental/TARGETS rename to examples/models/llama/experimental/TARGETS diff --git a/examples/models/llama2/experimental/generate.py b/examples/models/llama/experimental/generate.py similarity index 98% rename from examples/models/llama2/experimental/generate.py rename to examples/models/llama/experimental/generate.py index bc974d7351f..d09772c3099 100644 --- a/examples/models/llama2/experimental/generate.py +++ b/examples/models/llama/experimental/generate.py @@ -11,7 +11,7 @@ import torch -from executorch.examples.models.llama2.experimental.load_gguf_q4_0 import load_gguf_q4_0 +from executorch.examples.models.llama.experimental.load_gguf_q4_0 import load_gguf_q4_0 from sentencepiece import SentencePieceProcessor diff --git a/examples/models/llama2/experimental/load_gguf_q4_0.py b/examples/models/llama/experimental/load_gguf_q4_0.py similarity index 98% rename from examples/models/llama2/experimental/load_gguf_q4_0.py rename to examples/models/llama/experimental/load_gguf_q4_0.py index 4583978394f..39b81ea64a2 100644 --- a/examples/models/llama2/experimental/load_gguf_q4_0.py +++ b/examples/models/llama/experimental/load_gguf_q4_0.py @@ -14,7 +14,7 @@ from typing import Callable, Dict, Mapping import torch -from 
executorch.examples.models.llama2.experimental.subclass import ( +from executorch.examples.models.llama.experimental.subclass import ( _unpack_two_uint8, GGMLInt4LinearWeight, to_float, diff --git a/examples/models/llama2/experimental/subclass.py b/examples/models/llama/experimental/subclass.py similarity index 100% rename from examples/models/llama2/experimental/subclass.py rename to examples/models/llama/experimental/subclass.py diff --git a/examples/models/llama2/experimental/targets.bzl b/examples/models/llama/experimental/targets.bzl similarity index 100% rename from examples/models/llama2/experimental/targets.bzl rename to examples/models/llama/experimental/targets.bzl diff --git a/examples/models/llama2/experimental/test_subclass.py b/examples/models/llama/experimental/test_subclass.py similarity index 100% rename from examples/models/llama2/experimental/test_subclass.py rename to examples/models/llama/experimental/test_subclass.py diff --git a/examples/models/llama2/export_llama.py b/examples/models/llama/export_llama.py similarity index 100% rename from examples/models/llama2/export_llama.py rename to examples/models/llama/export_llama.py diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama/export_llama_lib.py similarity index 99% rename from examples/models/llama2/export_llama_lib.py rename to examples/models/llama/export_llama_lib.py index 0a3c7620cb6..940bcaecbc7 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -24,7 +24,7 @@ from executorch.devtools.etrecord import generate_etrecord -from executorch.examples.models.llama2.llama_transformer import ModelArgs +from executorch.examples.models.llama.llama_transformer import ModelArgs from executorch.extension.llm.export.builder import DType, LLMEdgeManager @@ -783,8 +783,8 @@ def _load_llama_model( f"Loading model with checkpoint={checkpoint}, params={params_path}, use_kv_cache={use_kv_cache}, weight_type={weight_type}" ) model, example_inputs, example_kwarg_inputs, _ = EagerModelFactory.create_model( - "llama2", - "Llama2Model", + module_name="llama", + model_class_name="Llama2Model", checkpoint=checkpoint, checkpoint_dir=checkpoint_dir, params=params_path, diff --git a/examples/models/llama2/fairseq2.py b/examples/models/llama/fairseq2.py similarity index 100% rename from examples/models/llama2/fairseq2.py rename to examples/models/llama/fairseq2.py diff --git a/examples/models/llama2/install_requirement_helper.py b/examples/models/llama/install_requirement_helper.py similarity index 100% rename from examples/models/llama2/install_requirement_helper.py rename to examples/models/llama/install_requirement_helper.py diff --git a/examples/models/llama2/install_requirements.sh b/examples/models/llama/install_requirements.sh similarity index 92% rename from examples/models/llama2/install_requirements.sh rename to examples/models/llama/install_requirements.sh index 99783ff29c4..470e804c2d4 100755 --- a/examples/models/llama2/install_requirements.sh +++ b/examples/models/llama/install_requirements.sh @@ -19,4 +19,4 @@ pip install lm_eval==0.4.2 pip install tiktoken blobfile # Call the install helper for further setup -python examples/models/llama2/install_requirement_helper.py +python examples/models/llama/install_requirement_helper.py diff --git a/examples/models/llama2/llama_test.py b/examples/models/llama/llama_test.py similarity index 100% rename from examples/models/llama2/llama_test.py rename to examples/models/llama/llama_test.py diff --git 
a/examples/models/llama2/llama_transformer.py b/examples/models/llama/llama_transformer.py similarity index 99% rename from examples/models/llama2/llama_transformer.py rename to examples/models/llama/llama_transformer.py index 8e17013ae3d..4d39d131d1d 100644 --- a/examples/models/llama2/llama_transformer.py +++ b/examples/models/llama/llama_transformer.py @@ -14,7 +14,7 @@ import torch import torch.nn.functional as F -from executorch.examples.models.llama2.rope import ( +from executorch.examples.models.llama.rope import ( apply_rotary_emb, hf_apply_rotary_emb, hf_precompute_freqs_cis, diff --git a/examples/models/llama2/llama_via_xnnpack.gif b/examples/models/llama/llama_via_xnnpack.gif similarity index 100% rename from examples/models/llama2/llama_via_xnnpack.gif rename to examples/models/llama/llama_via_xnnpack.gif diff --git a/examples/models/llama2/main.cpp b/examples/models/llama/main.cpp similarity index 97% rename from examples/models/llama2/main.cpp rename to examples/models/llama/main.cpp index 339b2abfdb4..5fe0ce93cf6 100644 --- a/examples/models/llama2/main.cpp +++ b/examples/models/llama/main.cpp @@ -8,7 +8,7 @@ #include -#include +#include #if defined(ET_USE_THREADPOOL) #include diff --git a/examples/models/llama2/model.py b/examples/models/llama/model.py similarity index 99% rename from examples/models/llama2/model.py rename to examples/models/llama/model.py index 23f1c1b4898..ad997de64cd 100644 --- a/examples/models/llama2/model.py +++ b/examples/models/llama/model.py @@ -16,7 +16,7 @@ get_default_model_resource_dir, ) -from executorch.examples.models.llama2.llama_transformer import ModelArgs, Transformer +from executorch.examples.models.llama.llama_transformer import ModelArgs, Transformer try: from .fairseq2 import convert_to_llama_checkpoint diff --git a/examples/models/llama2/params/TARGETS b/examples/models/llama/params/TARGETS similarity index 100% rename from examples/models/llama2/params/TARGETS rename to examples/models/llama/params/TARGETS diff --git a/examples/models/llama2/params/demo_config.json b/examples/models/llama/params/demo_config.json similarity index 100% rename from examples/models/llama2/params/demo_config.json rename to examples/models/llama/params/demo_config.json diff --git a/examples/models/llama2/params/demo_rand_params.pth b/examples/models/llama/params/demo_rand_params.pth similarity index 100% rename from examples/models/llama2/params/demo_rand_params.pth rename to examples/models/llama/params/demo_rand_params.pth diff --git a/examples/models/llama2/rope.py b/examples/models/llama/rope.py similarity index 100% rename from examples/models/llama2/rope.py rename to examples/models/llama/rope.py diff --git a/examples/models/llama2/runner/CMakeLists.txt b/examples/models/llama/runner/CMakeLists.txt similarity index 100% rename from examples/models/llama2/runner/CMakeLists.txt rename to examples/models/llama/runner/CMakeLists.txt diff --git a/examples/models/llama2/runner/TARGETS b/examples/models/llama/runner/TARGETS similarity index 100% rename from examples/models/llama2/runner/TARGETS rename to examples/models/llama/runner/TARGETS diff --git a/examples/models/llama2/runner/eager.py b/examples/models/llama/runner/eager.py similarity index 97% rename from examples/models/llama2/runner/eager.py rename to examples/models/llama/runner/eager.py index 7f324b4cbc8..42d11bdedfa 100644 --- a/examples/models/llama2/runner/eager.py +++ b/examples/models/llama/runner/eager.py @@ -10,7 +10,7 @@ import torch -from examples.models.llama2.llama_transformer 
import ModelArgs +from examples.models.llama.llama_transformer import ModelArgs from executorch.examples.models.llama2.export_llama_lib import ( _prepare_for_llama_export, build_args_parser as _build_args_parser, diff --git a/examples/models/llama2/runner/generation.py b/examples/models/llama/runner/generation.py similarity index 96% rename from examples/models/llama2/runner/generation.py rename to examples/models/llama/runner/generation.py index 6d643b3857d..885249f9b9a 100644 --- a/examples/models/llama2/runner/generation.py +++ b/examples/models/llama/runner/generation.py @@ -9,8 +9,8 @@ import torch -from executorch.examples.models.llama2.llama_transformer import ModelArgs -from executorch.examples.models.llama2.tokenizer.tiktoken import Tokenizer +from executorch.examples.models.llama.llama_transformer import ModelArgs +from executorch.examples.models.llama.tokenizer.tiktoken import Tokenizer class CompletionPrediction(TypedDict, total=False): diff --git a/examples/models/llama2/runner/native.py b/examples/models/llama/runner/native.py similarity index 97% rename from examples/models/llama2/runner/native.py rename to examples/models/llama/runner/native.py index b0d6c20e961..90e7fc46dd0 100644 --- a/examples/models/llama2/runner/native.py +++ b/examples/models/llama/runner/native.py @@ -10,7 +10,7 @@ import torch -from examples.models.llama2.llama_transformer import ModelArgs +from examples.models.llama.llama_transformer import ModelArgs from executorch.extension.pybindings.portable_lib import _load_for_executorch # Load custom ops and quantized ops. diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama/runner/runner.cpp similarity index 98% rename from examples/models/llama2/runner/runner.cpp rename to examples/models/llama/runner/runner.cpp index a2ae053dd17..42a1a632dc6 100644 --- a/examples/models/llama2/runner/runner.cpp +++ b/examples/models/llama/runner/runner.cpp @@ -9,13 +9,13 @@ // A simple llama2 runner that includes preprocessing and post processing logic. // The module takes in a string as input and emits a string as output. -#include +#include #include #include -#include +#include #include namespace example { diff --git a/examples/models/llama2/runner/runner.h b/examples/models/llama/runner/runner.h similarity index 100% rename from examples/models/llama2/runner/runner.h rename to examples/models/llama/runner/runner.h diff --git a/examples/models/llama2/runner/targets.bzl b/examples/models/llama/runner/targets.bzl similarity index 97% rename from examples/models/llama2/runner/targets.bzl rename to examples/models/llama/runner/targets.bzl index 96d47ffce21..de12dc4d106 100644 --- a/examples/models/llama2/runner/targets.bzl +++ b/examples/models/llama/runner/targets.bzl @@ -40,7 +40,7 @@ def define_common_targets(): "//executorch/kernels/quantized:generated_lib" + aten_suffix, "//executorch/runtime/core/exec_aten:lib" + aten_suffix, "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, - "//executorch/examples/models/llama2/tokenizer:tiktoken", + "//executorch/examples/models/llama/tokenizer:tiktoken", "//executorch/extension/llm/tokenizer:bpe_tokenizer", ] + (_get_operator_lib(aten)) + ([ # Vulkan API currently cannot build on some platforms (e.g. 
Apple, FBCODE) diff --git a/examples/models/llama2/source_transformation/__init__.py b/examples/models/llama/source_transformation/__init__.py similarity index 100% rename from examples/models/llama2/source_transformation/__init__.py rename to examples/models/llama/source_transformation/__init__.py diff --git a/examples/models/llama2/source_transformation/apply_spin_quant_r1_r2.py b/examples/models/llama/source_transformation/apply_spin_quant_r1_r2.py similarity index 100% rename from examples/models/llama2/source_transformation/apply_spin_quant_r1_r2.py rename to examples/models/llama/source_transformation/apply_spin_quant_r1_r2.py diff --git a/examples/models/llama2/source_transformation/lora.py b/examples/models/llama/source_transformation/lora.py similarity index 100% rename from examples/models/llama2/source_transformation/lora.py rename to examples/models/llama/source_transformation/lora.py diff --git a/examples/models/llama2/source_transformation/pre_quantization.py b/examples/models/llama/source_transformation/pre_quantization.py similarity index 100% rename from examples/models/llama2/source_transformation/pre_quantization.py rename to examples/models/llama/source_transformation/pre_quantization.py diff --git a/examples/models/llama2/source_transformation/prune_output.py b/examples/models/llama/source_transformation/prune_output.py similarity index 100% rename from examples/models/llama2/source_transformation/prune_output.py rename to examples/models/llama/source_transformation/prune_output.py diff --git a/examples/models/llama2/source_transformation/quantize.py b/examples/models/llama/source_transformation/quantize.py similarity index 100% rename from examples/models/llama2/source_transformation/quantize.py rename to examples/models/llama/source_transformation/quantize.py diff --git a/examples/models/llama2/source_transformation/quantized_kv_cache.py b/examples/models/llama/source_transformation/quantized_kv_cache.py similarity index 99% rename from examples/models/llama2/source_transformation/quantized_kv_cache.py rename to examples/models/llama/source_transformation/quantized_kv_cache.py index 8eec7846d3c..99772569755 100644 --- a/examples/models/llama2/source_transformation/quantized_kv_cache.py +++ b/examples/models/llama/source_transformation/quantized_kv_cache.py @@ -9,7 +9,7 @@ import torch import torch.nn as nn -from executorch.examples.models.llama2.llama_transformer import KVCache +from executorch.examples.models.llama.llama_transformer import KVCache from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib # noqa: F401 diff --git a/examples/models/llama2/source_transformation/rms_norm.py b/examples/models/llama/source_transformation/rms_norm.py similarity index 90% rename from examples/models/llama2/source_transformation/rms_norm.py rename to examples/models/llama/source_transformation/rms_norm.py index ff7e8b67457..3d94f73b631 100644 --- a/examples/models/llama2/source_transformation/rms_norm.py +++ b/examples/models/llama/source_transformation/rms_norm.py @@ -5,7 +5,7 @@ # LICENSE file in the root directory of this source tree. 
import torch -from executorch.examples.models.llama2.llama_transformer import RMSNorm +from executorch.examples.models.llama.llama_transformer import RMSNorm def replace_rms_norm_with_native_rms_norm(module: torch.nn.Module): diff --git a/examples/models/llama2/source_transformation/rope.py b/examples/models/llama/source_transformation/rope.py similarity index 100% rename from examples/models/llama2/source_transformation/rope.py rename to examples/models/llama/source_transformation/rope.py diff --git a/examples/models/llama2/source_transformation/sdpa.py b/examples/models/llama/source_transformation/sdpa.py similarity index 98% rename from examples/models/llama2/source_transformation/sdpa.py rename to examples/models/llama/source_transformation/sdpa.py index bda6966fa16..f8362648f32 100644 --- a/examples/models/llama2/source_transformation/sdpa.py +++ b/examples/models/llama/source_transformation/sdpa.py @@ -13,8 +13,8 @@ import torch -from executorch.examples.models.llama2.llama_transformer import KVCache, SDPA -from executorch.examples.models.llama2.source_transformation.quantized_kv_cache import ( +from executorch.examples.models.llama.llama_transformer import KVCache, SDPA +from executorch.examples.models.llama.source_transformation.quantized_kv_cache import ( QuantizedKVCache, ) diff --git a/examples/models/llama2/source_transformation/spin_quant.py b/examples/models/llama/source_transformation/spin_quant.py similarity index 97% rename from examples/models/llama2/source_transformation/spin_quant.py rename to examples/models/llama/source_transformation/spin_quant.py index f544e9e1f6e..e07b78dc6ee 100644 --- a/examples/models/llama2/source_transformation/spin_quant.py +++ b/examples/models/llama/source_transformation/spin_quant.py @@ -14,7 +14,7 @@ import torch.nn.functional as F -from executorch.examples.models.llama2.llama_transformer import FeedForward +from executorch.examples.models.llama.llama_transformer import FeedForward from torch import nn diff --git a/examples/models/llama2/source_transformation/test_quantized_kv_cache.py b/examples/models/llama/source_transformation/test_quantized_kv_cache.py similarity index 96% rename from examples/models/llama2/source_transformation/test_quantized_kv_cache.py rename to examples/models/llama/source_transformation/test_quantized_kv_cache.py index 5fa5d1958de..2f38f96552c 100644 --- a/examples/models/llama2/source_transformation/test_quantized_kv_cache.py +++ b/examples/models/llama/source_transformation/test_quantized_kv_cache.py @@ -8,9 +8,9 @@ import torch -from executorch.examples.models.llama2.llama_transformer import KVCache +from executorch.examples.models.llama.llama_transformer import KVCache -from executorch.examples.models.llama2.source_transformation.quantized_kv_cache import ( +from executorch.examples.models.llama.source_transformation.quantized_kv_cache import ( QuantizedCacheType, QuantizedKVCache, ) diff --git a/examples/models/llama2/source_transformation/test_sdpa_with_quantized_kv_cache.py b/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py similarity index 92% rename from examples/models/llama2/source_transformation/test_sdpa_with_quantized_kv_cache.py rename to examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py index 4755d45499d..65c6678ab25 100644 --- a/examples/models/llama2/source_transformation/test_sdpa_with_quantized_kv_cache.py +++ b/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py @@ -8,14 +8,14 @@ import torch -from 
executorch.examples.models.llama2.llama_transformer import KVCache +from executorch.examples.models.llama.llama_transformer import KVCache -from executorch.examples.models.llama2.source_transformation.quantized_kv_cache import ( +from executorch.examples.models.llama.source_transformation.quantized_kv_cache import ( QuantizedCacheType, QuantizedKVCache, ) -from executorch.examples.models.llama2.source_transformation.sdpa import SDPACustom +from executorch.examples.models.llama.source_transformation.sdpa import SDPACustom class SDPAWithQuantizedKVCacheTest(unittest.TestCase): diff --git a/examples/models/llama2/targets.bzl b/examples/models/llama/targets.bzl similarity index 91% rename from examples/models/llama2/targets.bzl rename to examples/models/llama/targets.bzl index 57e84256a49..40b26d69808 100644 --- a/examples/models/llama2/targets.bzl +++ b/examples/models/llama/targets.bzl @@ -15,7 +15,7 @@ def define_common_targets(): "-DUSE_ATEN_LIB", ] if aten else [], deps = [ - "//executorch/examples/models/llama2/runner:runner" + aten_suffix, + "//executorch/examples/models/llama/runner:runner" + aten_suffix, "//executorch/extension/evalue_util:print_evalue", "//executorch/extension/threadpool:threadpool", "//executorch/extension/threadpool:cpuinfo_utils", diff --git a/examples/models/llama2/test_llama_runner.sh b/examples/models/llama/test_llama_runner.sh similarity index 88% rename from examples/models/llama2/test_llama_runner.sh rename to examples/models/llama/test_llama_runner.sh index d0c44518abf..c55719f382f 100644 --- a/examples/models/llama2/test_llama_runner.sh +++ b/examples/models/llama/test_llama_runner.sh @@ -5,7 +5,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# Test Llama runner in examples/models/llama2/main.cpp +# Test Llama runner in examples/models/llama/main.cpp # 1. Export a llama-like model # 2. Build llama runner binary # 3. 
Run model with the llama runner binary with prompt diff --git a/examples/models/llama2/tests/TARGETS b/examples/models/llama/tests/TARGETS similarity index 64% rename from examples/models/llama2/tests/TARGETS rename to examples/models/llama/tests/TARGETS index 2e4dcf7d1f6..8cea9aeff4b 100644 --- a/examples/models/llama2/tests/TARGETS +++ b/examples/models/llama/tests/TARGETS @@ -9,8 +9,8 @@ python_unittest( ], deps = [ "//caffe2:torch", - "//executorch/examples/models/llama2:export_library", - "//executorch/examples/models/llama2:llama_transformer", + "//executorch/examples/models/llama:export_library", + "//executorch/examples/models/llama:llama_transformer", ], ) @@ -21,8 +21,8 @@ python_unittest( ], deps = [ "//caffe2:torch", - "//executorch/examples/models/llama2:export_library", - "//executorch/examples/models/llama2:llama_transformer", + "//executorch/examples/models/llama:export_library", + "//executorch/examples/models/llama:llama_transformer", "//pytorch/ao:torchao", ], ) diff --git a/examples/models/llama2/tests/test_pre_quantization_transforms.py b/examples/models/llama/tests/test_pre_quantization_transforms.py similarity index 95% rename from examples/models/llama2/tests/test_pre_quantization_transforms.py rename to examples/models/llama/tests/test_pre_quantization_transforms.py index 59cec2e72ab..dc7c640dba9 100644 --- a/examples/models/llama2/tests/test_pre_quantization_transforms.py +++ b/examples/models/llama/tests/test_pre_quantization_transforms.py @@ -7,14 +7,14 @@ import unittest import torch -from executorch.examples.models.llama2.llama_transformer import ModelArgs, Transformer -from executorch.examples.models.llama2.source_transformation.pre_quantization import ( +from executorch.examples.models.llama.llama_transformer import ModelArgs, Transformer +from executorch.examples.models.llama.source_transformation.pre_quantization import ( sanitize_checkpoint_from_pre_quantization, transform_embedding_for_pre_quantization, transform_linear_for_pre_quantization, transform_output_linear_for_pre_quantization, ) -from executorch.examples.models.llama2.source_transformation.quantize import ( +from executorch.examples.models.llama.source_transformation.quantize import ( dynamically_quantize_per_channel, ) from torchao.quantization.utils import group_quantize_tensor_symmetric diff --git a/examples/models/llama2/tests/test_simple_sdpa.py b/examples/models/llama/tests/test_simple_sdpa.py similarity index 92% rename from examples/models/llama2/tests/test_simple_sdpa.py rename to examples/models/llama/tests/test_simple_sdpa.py index 264ed3dde30..6e0c3919602 100644 --- a/examples/models/llama2/tests/test_simple_sdpa.py +++ b/examples/models/llama/tests/test_simple_sdpa.py @@ -8,8 +8,8 @@ import unittest import torch -from executorch.examples.models.llama2.llama_transformer import KVCache, SDPA -from executorch.examples.models.llama2.source_transformation.sdpa import SDPASimple +from executorch.examples.models.llama.llama_transformer import KVCache, SDPA +from executorch.examples.models.llama.source_transformation.sdpa import SDPASimple class SDPATest(unittest.TestCase): diff --git a/examples/models/llama2/tokenizer/TARGETS b/examples/models/llama/tokenizer/TARGETS similarity index 100% rename from examples/models/llama2/tokenizer/TARGETS rename to examples/models/llama/tokenizer/TARGETS diff --git a/examples/models/llama2/tokenizer/llama_tiktoken.cpp b/examples/models/llama/tokenizer/llama_tiktoken.cpp similarity index 97% rename from examples/models/llama2/tokenizer/llama_tiktoken.cpp 
rename to examples/models/llama/tokenizer/llama_tiktoken.cpp index 5ce9d7f14cc..74eacc1b5f0 100644 --- a/examples/models/llama2/tokenizer/llama_tiktoken.cpp +++ b/examples/models/llama/tokenizer/llama_tiktoken.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include namespace example { diff --git a/examples/models/llama2/tokenizer/llama_tiktoken.h b/examples/models/llama/tokenizer/llama_tiktoken.h similarity index 100% rename from examples/models/llama2/tokenizer/llama_tiktoken.h rename to examples/models/llama/tokenizer/llama_tiktoken.h diff --git a/examples/models/llama2/tokenizer/targets.bzl b/examples/models/llama/tokenizer/targets.bzl similarity index 100% rename from examples/models/llama2/tokenizer/targets.bzl rename to examples/models/llama/tokenizer/targets.bzl diff --git a/examples/models/llama2/tokenizer/test/CMakeLists.txt b/examples/models/llama/tokenizer/test/CMakeLists.txt similarity index 100% rename from examples/models/llama2/tokenizer/test/CMakeLists.txt rename to examples/models/llama/tokenizer/test/CMakeLists.txt diff --git a/examples/models/llama2/tokenizer/test/TARGETS b/examples/models/llama/tokenizer/test/TARGETS similarity index 100% rename from examples/models/llama2/tokenizer/test/TARGETS rename to examples/models/llama/tokenizer/test/TARGETS diff --git a/examples/models/llama2/tokenizer/test/resources/test_tiktoken_tokenizer.model b/examples/models/llama/tokenizer/test/resources/test_tiktoken_tokenizer.model similarity index 100% rename from examples/models/llama2/tokenizer/test/resources/test_tiktoken_tokenizer.model rename to examples/models/llama/tokenizer/test/resources/test_tiktoken_tokenizer.model diff --git a/examples/models/llama2/tokenizer/test/targets.bzl b/examples/models/llama/tokenizer/test/targets.bzl similarity index 90% rename from examples/models/llama2/tokenizer/test/targets.bzl rename to examples/models/llama/tokenizer/test/targets.bzl index 842a5fc3968..bd07e9e88c7 100644 --- a/examples/models/llama2/tokenizer/test/targets.bzl +++ b/examples/models/llama/tokenizer/test/targets.bzl @@ -12,7 +12,7 @@ def define_common_targets(): "test_tiktoken.cpp", ], deps = [ - "//executorch/examples/models/llama2/tokenizer:tiktoken", + "//executorch/examples/models/llama/tokenizer:tiktoken", ], env = { "RESOURCES_PATH": "$(location :resources)/resources", diff --git a/examples/models/llama2/tokenizer/test/test_tiktoken.cpp b/examples/models/llama/tokenizer/test/test_tiktoken.cpp similarity index 97% rename from examples/models/llama2/tokenizer/test/test_tiktoken.cpp rename to examples/models/llama/tokenizer/test/test_tiktoken.cpp index 5bd6515b676..b9309f99212 100644 --- a/examples/models/llama2/tokenizer/test/test_tiktoken.cpp +++ b/examples/models/llama/tokenizer/test/test_tiktoken.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include #include diff --git a/examples/models/llama2/tokenizer/tiktoken.py b/examples/models/llama/tokenizer/tiktoken.py similarity index 100% rename from examples/models/llama2/tokenizer/tiktoken.py rename to examples/models/llama/tokenizer/tiktoken.py diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md index 2260c8f8253..8876c5c4e41 100644 --- a/examples/models/llama2/README.md +++ b/examples/models/llama2/README.md @@ -1,497 +1,2 @@ # Summary -This example demonstrates how to run a [llama models](https://www.llama.com/) on mobile via ExecuTorch. 
We use XNNPACK to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on a phone. - -Here are supported models: - -- Llama 3.2 1B and 3B -- Llama 3.1 8B -- Llama 3 8B -- Llama 2 7B - -Pretrained models are not included in this repo. Users are suggested to download them [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). - -# What is Llama? -Llama is a collection of large language models that use publicly available data for training. These models are based on the transformer architecture, which allows it to process input sequences of arbitrary length and generate output sequences of variable length. One of the key features of Llama models is its ability to generate coherent and contextually relevant text. This is achieved through the use of attention mechanisms, which allow the model to focus on different parts of the input sequence as it generates output. Additionally, Llama models use a technique called “masked language modeling” to pre-train the model on a large corpus of text, which helps it learn to predict missing words in a sentence. - -Llama models have shown to perform well on a variety of natural language processing tasks, including language translation, question answering, and text summarization and are also capable of generating human-like text, making Llama models a useful tool for creative writing and other applications where natural language generation is important. - -Overall, Llama models are powerful and versatile language models that can be used for a wide range of natural language processing tasks. The model’s ability to generate coherent and contextually relevant text makes it particularly useful for applications such as chatbots, virtual assistants, and language translation. - -Please note that the models are subject to the [Llama 2 Acceptable Use Policy](https://github.com/facebookresearch/llama/blob/main/USE_POLICY.md), [Llama 3 Acceptable Use Policy](https://github.com/meta-llama/llama3/blob/main/USE_POLICY.md) and [Responsible Use Guide](https://ai.meta.com/static-resource/responsible-use-guide/). - - -# Results - -Since Llama 2 7B or Llama 3 8B model needs at least 4-bit quantization to fit even within some of the highend phones, results presented here correspond to 4-bit groupwise post-training quantized model. - -For Llama 3.2 1B/3B, we validated the models by running them in their original bf16 datatype and unquantized on both Android and iOS phones. The 3B version required high-end phones with larger RAMs to fit the model. - -Additionally, 1B/3B models are sensitive to accuracy loss when regular PTQ quantization is applied, so we employed 4bit quantization using [SpinQuant](https://github.com/facebookresearch/SpinQuant/tree/main) to achieve a good balance between accuracy, performance and memory. - - - - - - -
- -
- - Llama3.1 8B, 4bit quantized on Android phone - -
-
- Llama3.2 1B, unquantized, bf16 on Android phone. -
- -## Quantization: -We employed 4-bit groupwise per token dynamic quantization of all the linear layers of the model. Dynamic quantization refers to quantizating activations dynamically, such that quantization parameters for activations are calculated, from min/max range, at runtime. Here we quantized activations with 8bits (signed integer). Furthermore, weights are statically quantized. In our case weights were per-channel groupwise quantized with 4bit signed integer. For more information refer to this [page](https://github.com/pytorch/ao). - -We evaluated WikiText perplexity using [LM Eval](https://github.com/EleutherAI/lm-evaluation-harness). Please note that LM Eval reports perplexity normalized by word count instead of token count. You may see different perplexity for WikiText from other sources if they implement it differntly. More details could be found [here](https://github.com/EleutherAI/lm-evaluation-harness/issues/2301). - -Below are the results for two different groupsizes, with max_seq_length 2048, and limit 1000. - -|Model | Baseline (FP32) | Groupwise 4-bit (128) | Groupwise 4-bit (256) -|--------|-----------------| ---------------------- | --------------- -|Llama 2 7B | 9.2 | 10.2 | 10.7 -|Llama 3 8B | 7.9 | 9.4 | 9.7 - -Note that groupsize less than 128 was not enabled, since such models were still too large. This is because our current efforts have focused on enabling FP32 and support for FP16 is under way. What this implies for model size is that 1) embedding table is in FP32 and 2) quantized weights scales are FP32. - -### SpinQuant for Llama 3.2 1B/3B models (Optional) - -To improve accuracy, we can use [SpinQuant](https://github.com/facebookresearch/SpinQuant/tree/main), a post-training quantization (PTQ) technique that generates new quantized weights. In the standard PTQ process, quantization may lead to a decrease in accuracy when there are outliers. The SpinQuant method takes the original weights and produces optimized quantized weights with minimal outliers, resulting in higher accuracy. This can be achieved without any finetuning of the weights and only requires 100 iterations on a single A100 node. - -SpinQuant can generate quantized weights that are [compatible with ExecuTorch](https://github.com/facebookresearch/SpinQuant/tree/main?tab=readme-ov-file#3-export-to-executorch), specifically, it can be integrated with the existing optimized XNNPACK kernels (e.g., group-wise 4bit weight and 8bit dynamic activation). This allows developers to benefit from the higher accuracy of SpinQuant while also taking advantage of the strong performance of ExecuTorch acceleration. We enabled SpinQuant for Llama3.2 1B/3B models on ExecuTorch. - -

- -
- - Running Llama3.2 3B on Android phone. - -
- - 4bit quantization using SpinQuant - -

- -## Enablement - -For Llama 3 8B and Llama3.1 8B, we have verified so far on iPhone 15 Pro, iPhone 15 Pro Max, Samsung Galaxy S24+ and OnePlus 12 (with 16GB RAM). - -We have verified running Llama 2 7B [mobile applications](#step-6-build-mobile-apps) efficiently on select devices including the iPhone 15 Pro, iPhone 15 Pro Max, Samsung Galaxy S22 and S24, and OnePlus 12. - -## Performance - -### Llama 3.2 1B and 3B -Llama 3.2 1B and 3B performance was measured on the OnePlus 12 device. The performance measurement is expressed in terms of tokens per second using an [adb binary-based approach](#step-5-run-benchmark-on) for generating 128 tokens. - -|Model | bf16 | 4bit(*) via SpinQuant -|--------| ---------------------- | --------------- -|1B | 19.4 tokens/second | 53.41 tokens/second | -|3B | 7.76 tokens/second | 22.98 tokens/second | - -(*) With SpinQuant, we currently quantize 4-bit groupwise (with groupsize 32) weight, 8bit dynamic activation of all the linear layers of the model, except embedding and output layers. The embedding and output layers are quantized as 8-bit per-channel weight and 8-bit dynamic activation. - -### Llama3 8B and Llama3.1 8B -Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus 12 devices. The performance measurement is expressed in terms of tokens per second using an [adb binary-based approach](#step-5-run-benchmark-on). - -Note that since Llama3's vocabulary size is 4x that of Llama2, we had to quantize embedding lookup table as well. For these results embedding lookup table was groupwise quantized with 4-bits and group size of 32. - -|Device | Groupwise 4-bit (128) | Groupwise 4-bit (256) -|--------| ---------------------- | --------------- -|Galaxy S22 | 7.85 tokens/second | 8.4 tokens/second | -|Galaxy S24 | 10.91 tokens/second | 11.21 tokens/second | -|OnePlus 12 | 10.85 tokens/second | 11.02 tokens/second | - -### Llama2 7B -Llama 2 7B performance was measured on the Samsung Galaxy S22, S24, and OnePlus 12 devices. The performance measurement is expressed in terms of tokens per second using an [adb binary-based approach](#step-5-run-benchmark-on). - -|Device | Groupwise 4-bit (128) | Groupwise 4-bit (256) -|--------| ---------------------- | --------------- -|Galaxy S22 | 8.15 tokens/second | 8.3 tokens/second | -|Galaxy S24 | 10.66 tokens/second | 11.26 tokens/second | -|OnePlus 12 | 11.55 tokens/second | 11.6 tokens/second | - -# Instructions - -## Tested on - -- MacOS M1/M2, Linux. -- For Llama 2 7B, your device may require at least 32GB RAM. If this is a constraint for you, please try the smaller stories model. - -## Step 1: Setup -> :warning: **double check your python environment**: make sure `conda activate ` is run before all the bash and python scripts. - -1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. For installation run `./install_requirements.sh --pybind xnnpack` -2. Run `examples/models/llama2/install_requirements.sh` to install a few dependencies. - - -## Step 2: Prepare model - -### Option A: Download and export Llama3.2 1B/3B model. - -1. Download `consolidated.00.pth`, `params.json` and `tokenizer.model` from [Llama website](https://www.llama.com/llama-downloads/) or [Hugging Face](https://huggingface.co/meta-llama/Llama-3.2-1B). For chat use-cases, download the instruct models. - -2. Export model and generate `.pte` file. Use original bfloat16 version, without any quantization. 
- -``` -# Set these paths to point to the downloaded files -LLAMA_CHECKPOINT=path/to/checkpoint.pth -LLAMA_PARAMS=path/to/params.json - -python -m examples.models.llama2.export_llama \ - --checkpoint "${LLAMA_CHECKPOINT:?}" \ - --params "${LLAMA_PARAMS:?}" \ - -kv \ - --use_sdpa_with_kv_cache \ - -X \ - -d bf16 \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ - --output_name="llama3_2.pte" -``` - -Optionally, we can apply SpinQuant to quantize the model without sacrifacing too much accuracy loss. - -To use SpinQuant, follow its [instruction](https://github.com/facebookresearch/SpinQuant/tree/main?tab=readme-ov-file#3-export-to-executorch) for exporting checkpoint to ExecuTorch and then export the SpinQuant checkpoint. - -``` -# Set these paths to point to the exported files -LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/checkpoint.pth -LLAMA_PARAMS=path/to/params.json - -python -m examples.models.llama2.export_llama \ - --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \ - --params "${LLAMA_PARAMS:?}" \ - --use_sdpa_with_kv_cache \ - -X \ - --preq_mode 8da4w_output_8da8w \ - --preq_group_size 32 \ - --max_seq_length 2048 \ - --output_name "llama3_2.pte" \ - -kv \ - -d fp32 \ - --preq_embedding_quantize 8,0 \ - --use_spin_quant native \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' -``` - -### Option B: Download and export Llama 3 8B instruct model - -You can export and run the original Llama 3 8B instruct model. - -1. Llama 3 pretrained parameters can be downloaded from [Meta's official Llama 3 repository](https://github.com/meta-llama/llama3/). - -2. Export model and generate `.pte` file - ``` - python -m examples.models.llama2.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" - ``` - - Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `--embedding-quantize 4,32` as shown above to further reduce the model size. - -### Option C: Download and export stories110M model - -If you want to deploy and run a smaller model for educational purposes. From `executorch` root: - -1. Download `stories110M.pt` and `tokenizer.model` from Github. - ``` - wget "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt" - wget "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" - ``` -2. Create params file. - ``` - echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json - ``` -3. Export model and generate `.pte` file. - ``` - python -m examples.models.llama2.export_llama -c stories110M.pt -p params.json -X -kv - ``` - -### Option D: Download and export Llama 2 7B model - -You can export and run the original Llama 2 7B model. - -1. Llama 2 pretrained parameters can be downloaded from [Meta's official website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) or from [Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b). - -2. Edit `params.json` file. Replace `"vocab_size": -1` with `"vocab_size": 32000`. This is a short-term workaround. - -3. Export model and generate `.pte` file: - ``` - python -m examples.models.llama2.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 - ``` -4. Create tokenizer.bin. 
- ``` - python -m extension.llm.tokenizer.tokenizer -t -o tokenizer.bin - ``` - -### Option E: Download models from Hugging Face and convert from safetensor format to state dict - - -You can also download above models from [Hugging Face](https://huggingface.co/). Since ExecuTorch starts from a PyTorch model, a script like below can be used to convert the Hugging Face safetensors format to PyTorch's state dict. It leverages the utils provided by [TorchTune](https://github.com/pytorch/torchtune). - - -```Python -from torchtune.utils import FullModelHFCheckpointer -from torchtune.models import convert_weights -import torch - -# Convert from safetensors to TorchTune. Suppose the model has been downloaded from Hugging Face -checkpointer = FullModelHFCheckpointer( - checkpoint_dir='/home/.cache/huggingface/hub/models/snapshots/hash-number', - checkpoint_files=['model-00001-of-00002.safetensors', 'model-00002-of-00002.safetensors'], - output_dir='/the/destination/dir' , - model_type='LLAMA3' # or other types that TorchTune supports -) - -print("loading checkpoint") -sd = checkpointer.load_checkpoint() - -# Convert from TorchTune to Meta (PyTorch native) -sd = convert_weights.tune_to_meta(sd['model']) - -print("saving checkpoint") -torch.save(sd, "/the/destination/dir/checkpoint.pth") -``` - -## (Optional) Finetuning - -If you want to finetune your model based on a specific dataset, PyTorch provides [TorchTune](https://github.com/pytorch/torchtune) - a native-Pytorch library for easily authoring, fine-tuning and experimenting with LLMs. - -Once you have [TorchTune installed](https://github.com/pytorch/torchtune?tab=readme-ov-file#get-started) you can finetune Llama2 7B model using LoRA on a single GPU, using the following command. This will produce a checkpoint where the LoRA weights are merged with the base model and so the output checkpoint will be in the same format as the original Llama2 model. - -``` -tune run lora_finetune_single_device \ ---config llama2/7B_lora_single_device \ -checkpointer.checkpoint_dir= \ -tokenizer.path=/tokenizer.model -``` - -To run full finetuning with Llama2 7B on a single device, you can use the following command. - -``` -tune run full_finetune_single_device \ ---config llama2/7B_full_single_device \ -checkpointer.checkpoint_dir= \ -tokenizer.path=/tokenizer.model -``` - -## Step 3: Evaluate model accuracy - -> Forewarning: Model evaluation without a GPU may take a long time, especially on larger models. - -We use [LM Eval](https://github.com/EleutherAI/lm-evaluation-harness) to evaluate model accuracy. - -For base models, use the following example command to calculate its perplexity based on WikiText. -``` -python -m examples.models.llama2.eval_llama \ - -c \ - -p \ - -t \ - -kv \ - -d \ - --max_seq_len \ - --limit -``` - -For instruct models, use the following example command to calculate its MMLU score. -``` -python -m examples.models.llama2.eval_llama \ - -c \ - -p \ - -t \ - -kv \ - -d \ - --tasks mmlu \ - --num_fewshot 5 \ - --max_seq_len -``` - -## Step 4: Run on your computer to validate - -1. Build executorch with optimized CPU performance as follows. Build options available [here](https://github.com/pytorch/executorch/blob/main/CMakeLists.txt#L59). 
- ``` - cmake -DPYTHON_EXECUTABLE=python \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DEXECUTORCH_ENABLE_LOGGING=1 \ - -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -Bcmake-out . - - cmake --build cmake-out -j16 --target install --config Release - ``` -Note for Mac users: There's a known linking issue with Xcode 15.1. Refer to the session of Common Issues and Mitigations below for solutions. - -2. Build llama runner. - ``` - cmake -DPYTHON_EXECUTABLE=python \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -Bcmake-out/examples/models/llama2 \ - examples/models/llama2 - - cmake --build cmake-out/examples/models/llama2 -j16 --config Release - ``` - -3. Run model. Run options available [here](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/main.cpp#L18-L40). - ``` - cmake-out/examples/models/llama2/llama_main --model_path= --tokenizer_path= --prompt= - ``` - -For Llama2 models, pass the converted `tokenizer.bin` file instead of `tokenizer.model`. - -To build for CoreML backend and validate on Mac, replace `-DEXECUTORCH_BUILD_XNNPACK=ON` with `-DEXECUTORCH_BUILD_COREML=ON` - -## Step 5: Run benchmark on Android phone - -**1. Build llama runner binary for Android** - -*Pre-requisite*: Android NDK (tested with r27b) which can be downloaded from [here](https://developer.android.com/ndk/downloads). Note that the mac binary can be unpackaged and you can locate NDK folder from it. - -**1.1 Set Android NDK** -``` -export ANDROID_NDK= -``` -**1.2 Build executorch and associated libraries for android.** -``` -cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI=arm64-v8a \ - -DANDROID_PLATFORM=android-23 \ - -DCMAKE_INSTALL_PREFIX=cmake-out-android \ - -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_ENABLE_LOGGING=1 \ - -DPYTHON_EXECUTABLE=python \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -Bcmake-out-android . - -cmake --build cmake-out-android -j16 --target install --config Release -``` - -**1.2 Build llama runner for android** -``` -cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI=arm64-v8a \ - -DANDROID_PLATFORM=android-23 \ - -DCMAKE_INSTALL_PREFIX=cmake-out-android \ - -DCMAKE_BUILD_TYPE=Release \ - -DPYTHON_EXECUTABLE=python \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -Bcmake-out-android/examples/models/llama2 \ - examples/models/llama2 - -cmake --build cmake-out-android/examples/models/llama2 -j16 --config Release -``` - -**2. 
Run on Android via adb shell** - -*Pre-requisite*: Make sure you enable USB debugging via developer options on your phone - -**2.1 Connect your android phone** - -**2.2 Upload model, tokenizer and llama runner binary to phone** -``` -adb shell mkdir -p /data/local/tmp/llama -adb push /data/local/tmp/llama/ -adb push /data/local/tmp/llama/ -adb push cmake-out-android/examples/models/llama2/llama_main /data/local/tmp/llama/ -``` - -**2.3 Run model** -``` -adb shell "cd /data/local/tmp/llama && ./llama_main --model_path --tokenizer_path --prompt \"Once upon a time\" --seq_len 120" -``` -## Step 6: Build Mobile apps - -### iOS - -Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-ios.html) to for full instructions on building the iOS LLAMA Demo App. Rename `tokenizer.model` file to `tokenizer.bin` because the demo app looks for the tokenizer file with .bin extension. - -### Android -Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-android.html) to for full instructions on building the Android LLAMA Demo App. - -## Optional: Smaller models delegated to other backends -Currently we supported lowering the stories model to other backends, including, CoreML, MPS and QNN. Please refer to the instruction -for each backend ([CoreML](https://pytorch.org/executorch/main/build-run-coreml.html), [MPS](https://pytorch.org/executorch/main/build-run-mps.html), [QNN](https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html)) before trying to lower them. After the backend library is installed, the script to export a lowered model is - -- Lower to CoreML: `python -m examples.models.llama2.export_llama -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json ` -- MPS: `python -m examples.models.llama2.export_llama -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json ` -- QNN: `python -m examples.models.llama2.export_llama -kv --disable_dynamic_shape --qnn -c stories110M.pt -p params.json ` - -The iOS LLAMA app supports the CoreML and MPS model and the Android LLAMA app supports the QNN model. On Android, it also allow to cross compiler the llama runner binary, push to the device and run. - -For CoreML, there are 2 additional optional arguments: -* `--coreml-ios`: Specify the minimum iOS version to deploy (and turn on available optimizations). E.g. `--coreml-ios 18` will turn on [in-place KV cache](https://developer.apple.com/documentation/coreml/mlstate?language=objc) and [fused scaled dot product attention kernel](https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS18.transformers.scaled_dot_product_attention) (the resulting model will then need at least iOS 18 to run, though) -* `--coreml-quantize`: Use [quantization tailored for CoreML](https://apple.github.io/coremltools/docs-guides/source/opt-quantization-overview.html). E.g. `--coreml-quantize b4w` will perform per-block 4-bit weight-only quantization in a way tailored for CoreML - -# What is coming next? -## Quantization -- Enabling FP16 model to leverage smaller groupsize for 4-bit quantization. -- Enabling GPTQ for 4-bit groupwise quantization -- Enabling custom quantization -- Lower bit quantization -## Models -- Enabling more generative AI models and architectures. -- Enable support for mult-modal models like LlaVa. 
-
-# What is coming next?
-## Quantization
-- Enabling FP16 model to leverage smaller groupsize for 4-bit quantization.
-- Enabling GPTQ for 4-bit groupwise quantization
-- Enabling custom quantization
-- Lower bit quantization
-## Models
-- Enabling more generative AI models and architectures.
-- Enabling support for multi-modal models like LLaVA.
-## Performance
-- Performance improvement via techniques such as speculative decoding
-- Enabling Llama2 7b and other architectures via Vulkan
-- Enabling performant execution of widely used quantization schemes.
-
-
-# Notes
-This example tries to reuse the Python code, with minimal modifications to make it compatible with current ExecuTorch:
-1. Since ExecuTorch does not support the complex Tensor data type, we use customized functions to implement rotary embedding with real numbers. Please see [GitHub issue: Support complex data type in ExecuTorch](https://github.com/pytorch/executorch/issues/886).
-2. No CUDA. ExecuTorch is focused on edge use cases, where CUDA is not available on most edge devices.
-3. No dependencies on fairscale. ColumnParallelLinear, ParallelEmbedding and training are neither needed nor supported in ExecuTorch.
-
-
-# Common Issues and Mitigations:
-- To clean your build:
-```
-git clean -xfd
-pip uninstall executorch
-./install_requirements.sh --pybind xnnpack
-
-rm -rf cmake-out
-```
-- If you encounter `pthread`-related issues at link time, add `pthread` to `target_link_libraries` in `CMakeLists.txt`.
-- On Mac, if there is a linking error in Step 4 with an error message like
-```
-0 0x100823648 __assert_rtn + 72
-1 0x10074bc5c ld::Fixup::applyFixup(ld::Atom const*, ld::LayoutLinkedImage const&, unsigned char*) const + 8268
-2 0x1007de7d8 ___ZN2ld16LayoutExecutable27writeContentWithoutLinkEditENSt3__14spanIhLm18446744073709551615EEEy_block_invoke + 332
-3 0x188cca428 _dispatch_client_callout2 + 20
-4 0x188cde850 _dispatch_apply_invoke3 + 336
-5 0x188cca3e8 _dispatch_client_callout + 20
-6 0x188ccbc68 _dispatch_once_callout + 32
-7 0x188cdeeec _dispatch_apply_invoke_and_wait + 372
-8 0x188cdde9c _dispatch_apply_with_attr_f + 1212
-9 0x188cde08c dispatch_apply + 96
-10 0x1007de9e4 void mapReduce(std::__1::span, unsigned long, void (unsigned long, mach_o::Error&, std::__1::span) block_pointer, void (std::__1::span) block_pointer) + 336
-11 0x1007de594 ld::LayoutExecutable::writeContentWithoutLinkEdit(std::__1::span, unsigned long long) + 1180
-12 0x1007e4020 ld::LayoutExecutable::writeToFile(char const*) + 15248
-13 0x1007962e8 main + 9424
-ld: Assertion failed: (extras.otherInstrOffset != 0 && "Kind::arm64_adrp_ldr missing extra info"), function applyFixup, file Fixup.cpp, line 793.
-clang: error: linker command failed with exit code 1 (use -v to see invocation)
-```
-It's a known issue for Xcode version 15.1.
-Mitigation: update to the most recent Xcode version, then clean and rebuild.
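If you hit the assertion above, one quick way to confirm which toolchain is active and then retry from a clean state is sketched below; it only combines `xcodebuild -version` with the clean-build steps already listed under Common Issues.

```bash
# Check the active Xcode version (the linker assertion is known to occur with Xcode 15.1).
xcodebuild -version

# Clean and rebuild, mirroring the "To clean your build" steps above.
git clean -xfd
pip uninstall executorch
./install_requirements.sh --pybind xnnpack
rm -rf cmake-out
```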
+For Llama2, please see the [Llama README page](../llama/README.md) for details.
diff --git a/examples/models/llama3/README.md b/examples/models/llama3/README.md
index 5ea3e6b9e1e..1056f3d93f5 100644
--- a/examples/models/llama3/README.md
+++ b/examples/models/llama3/README.md
@@ -1,2 +1,2 @@
 # Summary
-For Llama3, use the same example code, minus tokenizer, as Llama2. Please see the [Llama2 README page](../llama2/README.md) for details.
+For Llama3, use the same example code, minus tokenizer, as Llama2. Please see the [Llama README page](../llama/README.md) for details.
diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py
index cc475f1b196..bdb30db735b 100644
--- a/examples/models/llava/export_llava.py
+++ b/examples/models/llava/export_llava.py
@@ -12,15 +12,15 @@
     ConfigPrecisionType,
 )
 from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
-from executorch.examples.models.llama2.export_llama_lib import (
+from executorch.examples.models.llama.export_llama_lib import (
     build_args_parser,
     get_quantizer_and_quant_params,
 )
-from executorch.examples.models.llama2.source_transformation.quantize import (
+from executorch.examples.models.llama.source_transformation.quantize import (
     EmbeddingQuantHandler,
     get_quant_weight_transform,
 )
-from executorch.examples.models.llama2.source_transformation.sdpa import (
+from executorch.examples.models.llama.source_transformation.sdpa import (
     replace_sdpa_with_custom_op,
 )
 from executorch.examples.models.llava.image_util import serialize_image
diff --git a/examples/models/llava/install_requirements.sh b/examples/models/llava/install_requirements.sh
index facf3032b98..4dcdeea83bf 100755
--- a/examples/models/llava/install_requirements.sh
+++ b/examples/models/llava/install_requirements.sh
@@ -12,4 +12,4 @@ pip install transformers accelerate sentencepiece tiktoken
 
 # Run llama2/install requirements for torchao deps
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-bash "$SCRIPT_DIR"/../llama2/install_requirements.sh
+bash "$SCRIPT_DIR"/../llama/install_requirements.sh
diff --git a/examples/models/llava/model.py b/examples/models/llava/model.py
index 8dcf286727b..a24249d9534 100644
--- a/examples/models/llava/model.py
+++ b/examples/models/llava/model.py
@@ -12,9 +12,9 @@
 import requests
 import torch
 
-from executorch.examples.models.llama2.llama_transformer import ModelArgs, Transformer
+from executorch.examples.models.llama.llama_transformer import ModelArgs, Transformer
 
-from executorch.examples.models.llama2.source_transformation.sdpa import (
+from executorch.examples.models.llama.source_transformation.sdpa import (
     replace_sdpa_with_custom_op,
 )
 from executorch.examples.models.llava.image_util import prepare_image
diff --git a/examples/qualcomm/oss_scripts/llama2/model/static_llama.py b/examples/qualcomm/oss_scripts/llama2/model/static_llama.py
index 85f018e71f5..ca9afb6fa9e 100644
--- a/examples/qualcomm/oss_scripts/llama2/model/static_llama.py
+++ b/examples/qualcomm/oss_scripts/llama2/model/static_llama.py
@@ -9,7 +9,7 @@
 import torch
 import torch.nn as nn
 
-from executorch.examples.models.llama2.llama_transformer import (
+from executorch.examples.models.llama.llama_transformer import (
     FeedForward,
     ModelArgs,
     precompute_freqs_cis,
diff --git a/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt b/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt
index 947b3ef975c..4c493eb5a5c 100644
--- a/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt
+++ b/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt
@@ -78,7 +78,7 @@ list(
 list(
   APPEND
   _qaihub_llama3_8b_runner__srcs
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../../models/llama2/tokenizer/llama_tiktoken.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../../models/llama/tokenizer/llama_tiktoken.cpp
 )
 
 # build qaihub llama3 8b runner
diff --git a/examples/qualcomm/qaihub_scripts/llama/README.md b/examples/qualcomm/qaihub_scripts/llama/README.md
index d49ca4cc946..b0a3ca46457 100644
--- a/examples/qualcomm/qaihub_scripts/llama/README.md
+++ b/examples/qualcomm/qaihub_scripts/llama/README.md
@@ -21,7 +21,7 @@ Note that the pre-compiled context binaries could not be further fine-tuned for o
 ```bash
 # tokenizer.model: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/main/tokenizer.model
 # tokenizer.bin:
-python -m examples.models.llama2.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
+python -m examples.models.llama.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
 ```
 
 #### Step3: Run default examples
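For context, an end-to-end way to produce `tokenizer.bin` might look like the sketch below. The `huggingface-cli download` step is an assumption: it requires that you have been granted access to the gated `meta-llama/Llama-2-7b-chat-hf` repository and are logged in via `huggingface-cli login`.

```bash
# Fetch tokenizer.model (assumes approved access to the gated meta-llama repo and a prior login).
huggingface-cli download meta-llama/Llama-2-7b-chat-hf tokenizer.model --local-dir .

# Convert it to the binary format expected by the runner, as in the hunk above.
python -m examples.models.llama.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
```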
diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp b/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp
index 721c16209c2..959f6810ae5 100644
--- a/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp
+++ b/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp
@@ -10,7 +10,7 @@
 // logic. The module takes in a string as input and emits a string as output.
 
 #if defined(QAIHUB_LLAMA3_RUNNER)
-#include
+#include
 #else
 #include
 #endif
diff --git a/examples/xnnpack/targets.bzl b/examples/xnnpack/targets.bzl
index 35df8999b47..ce9575e8cca 100644
--- a/examples/xnnpack/targets.bzl
+++ b/examples/xnnpack/targets.bzl
@@ -40,7 +40,7 @@ def define_common_targets():
        name = "aot_compiler",
        main_module = "executorch.examples.xnnpack.aot_compiler",
        resources = {
-            "//executorch/examples/models/llama2/params:params": "params",
+            "//executorch/examples/models/llama/params:params": "params",
        },
        deps = [
            ":xnnpack_aot_lib",
diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt
index cc080716111..0ee8b042a22 100644
--- a/extension/android/CMakeLists.txt
+++ b/extension/android/CMakeLists.txt
@@ -152,8 +152,8 @@ if(EXECUTORCH_BUILD_LLAMA_JNI)
   )
 
   add_subdirectory(
-    ${EXECUTORCH_ROOT}/examples/models/llama2/runner
-    ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llama2/runner
+    ${EXECUTORCH_ROOT}/examples/models/llama/runner
+    ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llama/runner
   )
 
 endif()
diff --git a/extension/android/jni/BUCK b/extension/android/jni/BUCK
index a400138a049..6f269739c06 100644
--- a/extension/android/jni/BUCK
+++ b/extension/android/jni/BUCK
@@ -92,7 +92,7 @@ fb_android_cxx_library(
        "//fbandroid/native/fb:fb",
        "//third-party/glog:glog",
        "//xplat/executorch/backends/xnnpack:xnnpack_backend_static",
-        "//xplat/executorch/examples/models/llama2/runner:runner_static",
+        "//xplat/executorch/examples/models/llama/runner:runner_static",
        "//xplat/executorch/examples/models/llava/runner:runner_static",
        "//xplat/executorch/extension/module:module_static",
        "//xplat/executorch/extension/runner_util:inputs_static",
diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
index 367ed3a966c..6ffc88d8103 100644
--- a/extension/android/jni/jni_layer_llama.cpp
+++ b/extension/android/jni/jni_layer_llama.cpp
@@ -15,7 +15,7 @@
 #include
 #include
 
-#include
+#include
 #include
 #include
 #include
diff --git a/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj b/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj
index fe25a173843..c43b701e885 100644
--- a/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj
+++ b/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj
@@ -67,10 +67,10 @@
 		032A73FD2CAFBB7800932D36 /* tiktoken.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = tiktoken.h; sourceTree = ""; };
 		032A73FE2CAFBB7800932D36 /* tiktoken.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = tiktoken.cpp; sourceTree = ""; };
 		032A73FF2CAFBB7800932D36 /* tokenizer.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = tokenizer.h; sourceTree = ""; };
-		032A74212CAFC1B300932D36 /* runner.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = runner.h; path = ../../../../examples/models/llama2/runner/runner.h; sourceTree = SOURCE_ROOT; };
-		032A74222CAFC1B300932D36 /* runner.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = runner.cpp; path = ../../../../examples/models/llama2/runner/runner.cpp; sourceTree = SOURCE_ROOT; };
-		032A74242CAFC34800932D36 /* llama_tiktoken.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = llama_tiktoken.h; path = ../../../../examples/models/llama2/tokenizer/llama_tiktoken.h; sourceTree = SOURCE_ROOT; };
-		032A74252CAFC34800932D36 /* llama_tiktoken.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = llama_tiktoken.cpp; path = ../../../../examples/models/llama2/tokenizer/llama_tiktoken.cpp; sourceTree = SOURCE_ROOT; };
+		032A74212CAFC1B300932D36 /* runner.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = runner.h; path = ../../../../examples/models/llama/runner/runner.h; sourceTree = SOURCE_ROOT; };
+		032A74222CAFC1B300932D36 /* runner.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = runner.cpp; path = ../../../../examples/models/llama/runner/runner.cpp; sourceTree = SOURCE_ROOT; };
+		032A74242CAFC34800932D36 /* llama_tiktoken.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = llama_tiktoken.h; path = ../../../../examples/models/llama/tokenizer/llama_tiktoken.h; sourceTree = SOURCE_ROOT; };
+		032A74252CAFC34800932D36 /* llama_tiktoken.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = llama_tiktoken.cpp; path = ../../../../examples/models/llama/tokenizer/llama_tiktoken.cpp; sourceTree = SOURCE_ROOT; };
 		037C96A02C8A570B00B3DF38 /* Tests.xctestplan */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = Tests.xctestplan; sourceTree = ""; };
 		03B0118B2CAC567900054791 /* DynamicTestCase.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = DynamicTestCase.h; sourceTree = ""; };
 		03B0118C2CAC567900054791 /* DynamicTestCase.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = DynamicTestCase.m; sourceTree = ""; };
diff --git a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm
index f3558308c82..c03ad145175 100644
--- a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm
+++ b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm
@@ -8,7 +8,7 @@
 #import "ResourceTestCase.h"
 
-#import
+#import
 
 using namespace ::executorch::extension;
 using namespace ::executorch::runtime;
 
diff --git a/extension/gguf_util/converters/llama_converter.py b/extension/gguf_util/converters/llama_converter.py
index 463e5a0fcfe..63839adc5c8 100644
--- a/extension/gguf_util/converters/llama_converter.py
+++ b/extension/gguf_util/converters/llama_converter.py
@@ -9,7 +9,7 @@
 import torch
 import torch.nn as nn
 
-from executorch.examples.models.llama2.llama_transformer import (
+from executorch.examples.models.llama.llama_transformer import (
     ModelArgs as LlamaModelArgs,
     Transformer as LlamaTransformer,
 )
diff --git a/extension/llm/README.md b/extension/llm/README.md
index 7f4baed7d31..ad504966824 100644
--- a/extension/llm/README.md
+++ b/extension/llm/README.md
@@ -18,7 +18,7 @@ Commonly used methods in this class include:
 - _to_executorch_: get the executorch graph with optional optimization passes.
 - _save_to_pte_: finally, the lowered and optimized graph can be saved into a .pte file for the runtime.
 
-Some usage of LLMEdgeManager can be found in executorch/examples/models/llama2, and executorch/examples/models/llava.
+Some usage of LLMEdgeManager can be found in executorch/examples/models/llama, and executorch/examples/models/llava.
 
 When the .pte file is exported and saved, we can load and run it in a runner (see below).
 
@@ -44,6 +44,6 @@ Contains custom op, such as:
 ## runner
 It hosts the library components used in a C++ llm runner. Currently, it hosts _stats.h_ on runtime status like token numbers and latency.
 
-With the components above, an actual runner can be built for a model or a series of models. An example is in //executorch/examples/models/llama2/runner, where a C++ runner code is built to run Llama 2, 3, 3.1 and other models using the same architecture.
+With the components above, an actual runner can be built for a model or a series of models. An example is in //executorch/examples/models/llama/runner, where C++ runner code is built to run Llama 2, 3, 3.1 and other models using the same architecture.
 
 Usages can also be found in the [torchchat repo](https://github.com/pytorch/torchchat/tree/main/runner).
diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
index d2a413fc793..bd12c374b51 100644
--- a/extension/llm/export/builder.py
+++ b/extension/llm/export/builder.py
@@ -216,15 +216,15 @@ def pt2e_calibrate(
         ):
             logging.info("Run calibration...")
             try:
-                from executorch.examples.models.llama2.eval_llama_lib import (
+                from executorch.examples.models.llama.eval_llama_lib import (
                     GraphModuleEvalWrapper,
                 )
-                from executorch.examples.models.llama2.evaluate import (  # pyre-ignore[21]
+                from executorch.examples.models.llama.evaluate import (  # pyre-ignore[21]
                     evaluate_model,
                 )
             except ImportError:
                 raise ImportError(
-                    "Please install the llm eval dependency via examples/models/llama2/install_requirements.sh"
+                    "Please install the llm eval dependency via examples/models/llama/install_requirements.sh"
                 )
 
         tokenizer = get_tokenizer(tokenizer_path)
diff --git a/extension/llm/tokenizer/targets.bzl b/extension/llm/tokenizer/targets.bzl
index fa6cc915c4b..3549083eda4 100644
--- a/extension/llm/tokenizer/targets.bzl
+++ b/extension/llm/tokenizer/targets.bzl
@@ -23,7 +23,7 @@ def define_common_targets():
         ],
         _is_external_target = True,
         deps = [
-            "//executorch/examples/models/llama2/tokenizer:tiktoken_py",
+            "//executorch/examples/models/llama/tokenizer:tiktoken_py",
         ],
         external_deps = [
             "sentencepiece-py",
diff --git a/extension/llm/tokenizer/utils.py b/extension/llm/tokenizer/utils.py
index 763febdf478..126a1203274 100644
--- a/extension/llm/tokenizer/utils.py
+++ b/extension/llm/tokenizer/utils.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from executorch.examples.models.llama2.tokenizer.tiktoken import Tokenizer as Tiktoken
+from executorch.examples.models.llama.tokenizer.tiktoken import Tokenizer as Tiktoken
 from executorch.extension.llm.tokenizer.tokenizer import (
     Tokenizer as SentencePieceTokenizer,
 )
diff --git a/pytest.ini b/pytest.ini
index 166890bd251..1ca39f0a508 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -15,7 +15,7 @@ addopts =
     examples/models/test
     devtools/
     # examples
-    examples/models/llama2/tests
+    examples/models/llama/tests
     # examples/models/llava/test TODO: enable this
     # exir
     exir/_serialize/test
@@ -45,7 +45,7 @@ addopts =
     --ignore=backends/xnnpack/test/ops/linear.py
     --ignore=backends/xnnpack/test/models/llama2_et_example.py
     # T200992559: Add torchao to ET as core dependency
-    --ignore=examples/models/llama2/tests/test_pre_quantization_transforms.py
+    --ignore=examples/models/llama/tests/test_pre_quantization_transforms.py
     --ignore=exir/backend/test/demos
     --ignore=exir/backend/test/test_backends.py
     --ignore=exir/backend/test/test_backends_lifted.py
diff --git a/test/run_oss_cpp_tests.sh b/test/run_oss_cpp_tests.sh
index e771fd4b12e..3b786bb49ce 100755
--- a/test/run_oss_cpp_tests.sh
+++ b/test/run_oss_cpp_tests.sh
@@ -90,8 +90,8 @@ build_and_run_test() {
     -Bcmake-out/"${test_dir}"
   cmake --build cmake-out/"${test_dir}" -j9
 
-  if [[ "$test_dir" =~ .*examples/models/llama2/tokenizer.* ]]; then
-    RESOURCES_PATH=$(realpath examples/models/llama2/tokenizer/test/resources)
+  if [[ "$test_dir" =~ .*examples/models/llama/tokenizer.* ]]; then
+    RESOURCES_PATH=$(realpath examples/models/llama/tokenizer/test/resources)
   elif [[ "$test_dir" =~ .*extension/llm/tokenizer.* ]]; then
     RESOURCES_PATH=$(realpath extension/llm/tokenizer/test/resources)
   else