4 changes: 2 additions & 2 deletions .ci/scripts/build_llama_android.sh
@@ -48,9 +48,9 @@ build_llama_runner() {
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
- -Bcmake-android-out/examples/models/llama2 examples/models/llama2
+ -Bcmake-android-out/examples/models/llama examples/models/llama

- cmake --build cmake-android-out/examples/models/llama2 -j4 --config Release
+ cmake --build cmake-android-out/examples/models/llama -j4 --config Release
}
install_flatc_from_source
install_executorch_and_backend_lib
8 changes: 4 additions & 4 deletions .ci/scripts/test_llama.sh
@@ -125,7 +125,7 @@ cmake_install_executorch_libraries() {

cmake_build_llama_runner() {
echo "Building llama runner"
dir="examples/models/llama2"
dir="examples/models/llama"
retry cmake \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DCMAKE_BUILD_TYPE=Debug \
@@ -206,7 +206,7 @@ if [[ "${QNN}" == "ON" ]]; then
EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape"
fi
# Add dynamically linked library location
- $PYTHON_EXECUTABLE -m examples.models.llama2.export_llama ${EXPORT_ARGS}
+ $PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS}

# Create tokenizer.bin.
echo "Creating tokenizer.bin"
@@ -219,15 +219,15 @@ echo "Running ${EXPORTED_MODEL_NAME} in portable mode"
if [[ "${BUILD_TOOL}" == "buck2" ]]; then
# Run model.
# shellcheck source=/dev/null
- $BUCK run examples/models/llama2:main -- ${RUNTIME_ARGS} > result.txt
+ $BUCK run examples/models/llama:main -- ${RUNTIME_ARGS} > result.txt
elif [[ "${BUILD_TOOL}" == "cmake" ]]; then
cmake_install_executorch_libraries
cmake_build_llama_runner
# Run llama runner
NOW=$(date +"%H:%M:%S")
echo "Starting to run llama runner at ${NOW}"
# shellcheck source=/dev/null
- cmake-out/examples/models/llama2/llama_main ${RUNTIME_ARGS} > result.txt
+ cmake-out/examples/models/llama/llama_main ${RUNTIME_ARGS} > result.txt
NOW=$(date +"%H:%M:%S")
echo "Finished at ${NOW}"
else
6 changes: 3 additions & 3 deletions .ci/scripts/test_model.sh
@@ -75,9 +75,9 @@ run_portable_executor_runner() {
test_model() {
if [[ "${MODEL_NAME}" == "llama2" ]]; then
# Install requirements for export_llama
- bash examples/models/llama2/install_requirements.sh
- # Test export_llama script: python3 -m examples.models.llama2.export_llama
- "${PYTHON_EXECUTABLE}" -m examples.models.llama2.export_llama -c examples/models/llama2/params/demo_rand_params.pth -p examples/models/llama2/params/demo_config.json
+ bash examples/models/llama/install_requirements.sh
+ # Test export_llama script: python3 -m examples.models.llama.export_llama
+ "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama -c examples/models/llama/params/demo_rand_params.pth -p examples/models/llama/params/demo_config.json
run_portable_executor_runner
rm "./${MODEL_NAME}.pte"
fi
2 changes: 1 addition & 1 deletion .github/workflows/android-perf.yml
@@ -160,7 +160,7 @@ jobs:
if [[ ${{ matrix.model }} =~ ^stories* ]]; then
# Install requirements for export_llama
- PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh
+ PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
# Test llama2
if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then
DELEGATE_CONFIG="xnnpack+custom+qe"
2 changes: 1 addition & 1 deletion .github/workflows/apple-perf.yml
@@ -162,7 +162,7 @@ jobs:
if [[ ${{ matrix.model }} =~ ^stories* ]]; then
# Install requirements for export_llama
PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
- bash examples/models/llama2/install_requirements.sh
+ bash examples/models/llama/install_requirements.sh

# Test llama2
if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then
6 changes: 3 additions & 3 deletions .github/workflows/pull.yml
@@ -117,7 +117,7 @@ jobs:
# Setup executorch
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh buck2
# Install requirements for export_llama
- PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh
+ PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
# Test llama2
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}"

@@ -216,7 +216,7 @@ jobs:
bash install_requirements.sh --pybind xnnpack

# install Llava requirements
- bash examples/models/llama2/install_requirements.sh
+ bash examples/models/llama/install_requirements.sh
bash examples/models/llava/install_requirements.sh

# run python unittest
@@ -411,7 +411,7 @@ jobs:
# Setup executorch
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh buck2
# Install requirements for export_llama
- PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh
+ PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
# Test llama2
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}"

8 changes: 4 additions & 4 deletions .github/workflows/trunk.yml
@@ -255,7 +255,7 @@ jobs:
fi

# Install requirements for export_llama
- PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh
+ PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama/install_requirements.sh
# Test llama2
PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh stories110M cmake "${DTYPE}" "${MODE}"

@@ -279,7 +279,7 @@
# GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}"

# # install Llava requirements
- # ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh
+ # ${CONDA_RUN} bash examples/models/llama/install_requirements.sh
# ${CONDA_RUN} bash examples/models/llava/install_requirements.sh

# # run python unittest
@@ -385,7 +385,7 @@ jobs:
cmake --build cmake-out -j9 --target install --config Release

echo "Build llama runner"
dir="examples/models/llama2"
dir="examples/models/llama"
cmake \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DCMAKE_BUILD_TYPE=Release \
@@ -437,5 +437,5 @@ jobs:

python -m extension.export_util.export_hf_model -hfm=${{ matrix.hf_model_repo }} -o ${ET_MODEL_NAME}

- cmake-out/examples/models/llama2/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is"
+ cmake-out/examples/models/llama/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is"
echo "::endgroup::"
4 changes: 2 additions & 2 deletions README.md
@@ -22,10 +22,10 @@ please visit our documentation website [for the latest release](https://pytorch.

Check out the [Getting Started](https://pytorch.org/executorch/stable/getting-started-setup.html#quick-setup-colab-jupyter-notebook-prototype) page for a quick spin.

- Check out the examples of [Llama](./examples/models/llama2/README.md), [Llava](./examples/models/llava/README.md) and [other models](./examples/README.md) running on edge devices using ExecuTorch.
+ Check out the examples of [Llama](./examples/models/llama/README.md), [Llava](./examples/models/llava/README.md) and [other models](./examples/README.md) running on edge devices using ExecuTorch.


- **[UPDATE - 09/25]** We have added support for running [Llama 3.2 1B/3B](./examples/models/llama2/README.md) models via ExecuTorch.
+ **[UPDATE - 09/25]** We have added support for running [Llama 3.2 1B/3B](./examples/models/llama/README.md) models via ExecuTorch.

## Feedback

2 changes: 1 addition & 1 deletion backends/qualcomm/runtime/targets.bzl
@@ -53,7 +53,7 @@ def define_common_targets():
exclude = ["Logging.h"],
),
define_static_target = True,
- link_whole = True, # needed for executorch/examples/models/llama2:main to register QnnBackend
+ link_whole = True, # needed for executorch/examples/models/llama:main to register QnnBackend
platforms = [ANDROID],
visibility = ["@EXECUTORCH_CLIENTS"],
resources = {
4 changes: 2 additions & 2 deletions backends/qualcomm/tests/test_qnn_delegate.py
@@ -37,7 +37,7 @@
skip_annotation,
)

- from executorch.examples.models.llama2.llama_transformer import (
+ from executorch.examples.models.llama.llama_transformer import (
ModelArgs,
MOEFeedForward,
)
@@ -51,7 +51,7 @@
from executorch.examples.models.inception_v3 import InceptionV3Model
from executorch.examples.models.inception_v4 import InceptionV4Model

- # from executorch.examples.models.llama2 import Llama2Model
+ # from executorch.examples.models.llama import Llama2Model
from executorch.examples.models.mobilebert import MobileBertModelExample
from executorch.examples.models.mobilenet_v2 import MV2Model
from executorch.examples.models.mobilenet_v3 import MV3Model
12 changes: 6 additions & 6 deletions backends/vulkan/docs/android_demo.md
@@ -57,7 +57,7 @@ partially lower the Llama model to Vulkan.

```shell
# The files will usually be downloaded to ~/.llama
- python -m examples.models.llama2.export_llama \
+ python -m examples.models.llama.export_llama \
--disable_dynamic_shape --vulkan -kv --use_sdpa_with_kv_cache -d fp32 \
-c ~/.llama/checkpoints/Llama3.2-1B/consolidated.00.pth \
-p ~/.llama/checkpoints/Llama3.2-1B/params.json \
@@ -95,23 +95,23 @@ binary using the Android NDK toolchain.
cmake --build cmake-android-out -j16 --target install)

# Build LLaMA Runner library
- (rm -rf cmake-android-out/examples/models/llama2 && \
- cmake examples/models/llama2 \
+ (rm -rf cmake-android-out/examples/models/llama && \
+ cmake examples/models/llama \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=$ANDROID_ABI \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-DCMAKE_INSTALL_PREFIX=cmake-android-out \
-DPYTHON_EXECUTABLE=python \
- -Bcmake-android-out/examples/models/llama2 && \
- cmake --build cmake-android-out/examples/models/llama2 -j16)
+ -Bcmake-android-out/examples/models/llama && \
+ cmake --build cmake-android-out/examples/models/llama -j16)
```

Finally, push and run the llama runner binary on your Android device. Note that
your device must have sufficient GPU memory to execute the model.

```shell
- adb push cmake-android-out/examples/models/llama2/llama_main /data/local/tmp/llama_main
+ adb push cmake-android-out/examples/models/llama/llama_main /data/local/tmp/llama_main

adb shell /data/local/tmp/llama_main \
--model_path=/data/local/tmp/vulkan_llama2.pte \
2 changes: 1 addition & 1 deletion backends/xnnpack/test/TARGETS
@@ -58,7 +58,7 @@ runtime.python_test(
"fbsource//third-party/pypi/torchsr:torchsr", # @manual
"fbsource//third-party/pypi/transformers:transformers", # @manual
"//executorch/backends/xnnpack/test/tester:tester",
"//executorch/examples/models/llama2:llama2_model",
"//executorch/examples/models/llama:llama2_model",
"//pytorch/audio/src:torchaudio_core",
"//pytorch/vision:torchvision", # @manual
],
2 changes: 1 addition & 1 deletion backends/xnnpack/test/models/llama2_et_example.py
@@ -9,7 +9,7 @@
import torch

from executorch.backends.xnnpack.test.tester import Tester
- from executorch.examples.models.llama2.model import Llama2Model
+ from executorch.examples.models.llama.model import Llama2Model


class TestLlama2ETExample(unittest.TestCase):
2 changes: 1 addition & 1 deletion build/cmake_deps.toml
@@ -383,7 +383,7 @@ deps = [

[targets.llama_runner]
buck_targets = [
"//examples/models/llama2/runner:runner",
"//examples/models/llama/runner:runner",
]
filters = [
".cpp$",
@@ -6,7 +6,7 @@ This tutorial demonstrates how to export Llama 3 8B Instruct for Qualcomm AI Eng

- Set up your ExecuTorch repo and environment if you haven’t done so by following [the Setting up ExecuTorch](../getting-started-setup.md) to set up the repo and dev environment.
- Read [the Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend page](../build-run-qualcomm-ai-engine-direct-backend.md) to understand how to export and run a model with Qualcomm AI Engine Direct Backend on Qualcomm device.
- - Follow [the README for executorch llama](https://github.com/pytorch/executorch/tree/main/examples/models/llama2) to know how to run a llama model on mobile via ExecuTorch.
+ - Follow [the README for executorch llama](https://github.com/pytorch/executorch/tree/main/examples/models/llama) to know how to run a llama model on mobile via ExecuTorch.
- A Qualcomm device with 16GB RAM
- We are continuing to optimize our memory usage to ensure compatibility with lower memory devices.
- The version of [Qualcomm AI Engine Direct SDK](https://developer.qualcomm.com/software/qualcomm-ai-engine-direct-sdk) is 2.26.0 or above.
@@ -39,7 +39,7 @@ To export Llama 3 8B instruct with the Qualcomm AI Engine Direct Backend, ensure

```bash
# Please note that calibration_data must include the prompt template for special tokens.
- python -m examples.models.llama2.export_llama -t <path_to_tokenizer.model>
+ python -m examples.models.llama.export_llama -t <path_to_tokenizer.model>
llama3/Meta-Llama-3-8B-Instruct/tokenizer.model -p <path_to_params.json> -c <path_to_checkpoint_for_Meta-Llama-3-8B-Instruct> --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path <path_to_optimized_matrix> --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
```

@@ -76,9 +76,9 @@ llama3/Meta-Llama-3-8B-Instruct/tokenizer.model -p <path_to_params.json> -c <pat
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
- -Bcmake-android-out/examples/models/llama2 examples/models/llama2
+ -Bcmake-android-out/examples/models/llama examples/models/llama

- cmake --build cmake-android-out/examples/models/llama2 -j16 --config Release
+ cmake --build cmake-android-out/examples/models/llama -j16 --config Release
```
3. Run on Android via adb shell
*Pre-requisite*: Make sure you enable USB debugging via developer options on your phone
@@ -105,7 +105,7 @@ adb push ${QNN_SDK_ROOT}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${DEVICE_D
adb push <model.pte> ${DEVICE_DIR}
adb push <tokenizer.model> ${DEVICE_DIR}
adb push cmake-android-out/lib/libqnn_executorch_backend.so ${DEVICE_DIR}
- adb push cmake-out-android/examples/models/llama2/llama_main ${DEVICE_DIR}
+ adb push cmake-out-android/examples/models/llama/llama_main ${DEVICE_DIR}
```

**3.4 Run model**
2 changes: 1 addition & 1 deletion docs/source/llm/llama.md
@@ -1,5 +1,5 @@
# Llama on ExecuTorch

See
- [Llama readme](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/README.md)
+ [Llama readme](https://github.com/pytorch/executorch/blob/main/examples/models/llama/README.md)
for detailed information about running Llama on ExecuTorch.
2 changes: 1 addition & 1 deletion examples/README.md
@@ -39,7 +39,7 @@ For specific details related to models and backend, you can explore the various

### Llama Models

- [This page](./models/llama2/README.md) demonstrates how to run Llama 3.2 (1B, 3B), Llama 3.1 (8B), Llama 3 (8B), and Llama 2 7B models on mobile via ExecuTorch. We use XNNPACK, QNNPACK, MediaTek, and MPS to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on Android and iOS mobile phones.
+ [This page](./models/llama/README.md) demonstrates how to run Llama 3.2 (1B, 3B), Llama 3.1 (8B), Llama 3 (8B), and Llama 2 7B models on mobile via ExecuTorch. We use XNNPACK, QNNPACK, MediaTek, and MPS to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on Android and iOS mobile phones.

### Llava1.5 7B

@@ -372,7 +372,7 @@ HierarchicalAllocator planned_memory(
strstr(model_path, "emformer_transcribe") ||
strstr(model_path, "emformer_join") ||
strstr(model_path, "edsr") ||
strstr(model_path, "llama2") ||
strstr(model_path, "llama") ||
strstr(model_path, "ic3") ||
strstr(model_path, "ic4")) {
atol = 1e-04;
2 changes: 1 addition & 1 deletion examples/cadence/models/babyllama.py
@@ -14,7 +14,7 @@

from executorch.backends.cadence.aot.export_example import export_model

- from executorch.examples.models.llama2.llama_transformer import ModelArgs, Transformer
+ from executorch.examples.models.llama.llama_transformer import ModelArgs, Transformer


FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"