4 changes: 2 additions & 2 deletions .ci/scripts/build_llama_android.sh
@@ -48,9 +48,9 @@ build_llama_runner() {
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
- -Bcmake-android-out/examples/models/llama2 examples/models/llama2
+ -Bcmake-android-out/examples/models/llama examples/models/llama

- cmake --build cmake-android-out/examples/models/llama2 -j4 --config Release
+ cmake --build cmake-android-out/examples/models/llama -j4 --config Release
}
install_flatc_from_source
install_executorch_and_backend_lib
8 changes: 4 additions & 4 deletions .ci/scripts/test_llama.sh
@@ -125,7 +125,7 @@ cmake_install_executorch_libraries() {

cmake_build_llama_runner() {
echo "Building llama runner"
dir="examples/models/llama2"
dir="examples/models/llama"
retry cmake \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DCMAKE_BUILD_TYPE=Debug \
@@ -206,7 +206,7 @@ if [[ "${QNN}" == "ON" ]]; then
EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape"
fi
# Add dynamically linked library location
- $PYTHON_EXECUTABLE -m examples.models.llama2.export_llama ${EXPORT_ARGS}
+ $PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS}

# Create tokenizer.bin.
echo "Creating tokenizer.bin"
@@ -219,15 +219,15 @@ echo "Running ${EXPORTED_MODEL_NAME} in portable mode"
if [[ "${BUILD_TOOL}" == "buck2" ]]; then
# Run model.
# shellcheck source=/dev/null
- $BUCK run examples/models/llama2:main -- ${RUNTIME_ARGS} > result.txt
+ $BUCK run examples/models/llama:main -- ${RUNTIME_ARGS} > result.txt
elif [[ "${BUILD_TOOL}" == "cmake" ]]; then
cmake_install_executorch_libraries
cmake_build_llama_runner
# Run llama runner
NOW=$(date +"%H:%M:%S")
echo "Starting to run llama runner at ${NOW}"
# shellcheck source=/dev/null
- cmake-out/examples/models/llama2/llama_main ${RUNTIME_ARGS} > result.txt
+ cmake-out/examples/models/llama/llama_main ${RUNTIME_ARGS} > result.txt
NOW=$(date +"%H:%M:%S")
echo "Finished at ${NOW}"
else
6 changes: 3 additions & 3 deletions .ci/scripts/test_model.sh
@@ -75,9 +75,9 @@ run_portable_executor_runner() {
test_model() {
if [[ "${MODEL_NAME}" == "llama2" ]]; then
# Install requirements for export_llama
- bash examples/models/llama2/install_requirements.sh
- # Test export_llama script: python3 -m examples.models.llama2.export_llama
- "${PYTHON_EXECUTABLE}" -m examples.models.llama2.export_llama -c examples/models/llama2/params/demo_rand_params.pth -p examples/models/llama2/params/demo_config.json
+ bash examples/models/llama/install_requirements.sh
+ # Test export_llama script: python3 -m examples.models.llama.export_llama
+ "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama -c examples/models/llama/params/demo_rand_params.pth -p examples/models/llama/params/demo_config.json
run_portable_executor_runner
rm "./${MODEL_NAME}.pte"
fi
2 changes: 1 addition & 1 deletion .github/workflows/android-perf.yml
@@ -160,7 +160,7 @@ jobs:
if [[ ${{ matrix.model }} =~ ^stories* ]]; then
# Install requirements for export_llama
- PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh
+ PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
# Test llama2
if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then
DELEGATE_CONFIG="xnnpack+custom+qe"
2 changes: 1 addition & 1 deletion .github/workflows/apple-perf.yml
@@ -162,7 +162,7 @@ jobs:
if [[ ${{ matrix.model }} =~ ^stories* ]]; then
# Install requirements for export_llama
PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
- bash examples/models/llama2/install_requirements.sh
+ bash examples/models/llama/install_requirements.sh

# Test llama2
if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then
6 changes: 3 additions & 3 deletions .github/workflows/pull.yml
@@ -117,7 +117,7 @@ jobs:
# Setup executorch
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh buck2
# Install requirements for export_llama
- PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh
+ PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
# Test llama2
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}"

@@ -216,7 +216,7 @@ jobs:
bash install_requirements.sh --pybind xnnpack

# install Llava requirements
- bash examples/models/llama2/install_requirements.sh
+ bash examples/models/llama/install_requirements.sh
bash examples/models/llava/install_requirements.sh

# run python unittest
@@ -411,7 +411,7 @@ jobs:
# Setup executorch
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh buck2
# Install requirements for export_llama
- PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh
+ PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
# Test llama2
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}"

8 changes: 4 additions & 4 deletions .github/workflows/trunk.yml
@@ -255,7 +255,7 @@ jobs:
fi

# Install requirements for export_llama
- PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh
+ PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama/install_requirements.sh
# Test llama2
PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh stories110M cmake "${DTYPE}" "${MODE}"

@@ -279,7 +279,7 @@
# GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}"

# # install Llava requirements
- # ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh
+ # ${CONDA_RUN} bash examples/models/llama/install_requirements.sh
# ${CONDA_RUN} bash examples/models/llava/install_requirements.sh

# # run python unittest
@@ -385,7 +385,7 @@ jobs:
cmake --build cmake-out -j9 --target install --config Release

echo "Build llama runner"
dir="examples/models/llama2"
dir="examples/models/llama"
cmake \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DCMAKE_BUILD_TYPE=Release \
@@ -437,5 +437,5 @@ jobs:

python -m extension.export_util.export_hf_model -hfm=${{ matrix.hf_model_repo }} -o ${ET_MODEL_NAME}

- cmake-out/examples/models/llama2/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is"
+ cmake-out/examples/models/llama/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is"
echo "::endgroup::"
4 changes: 2 additions & 2 deletions README.md
@@ -22,10 +22,10 @@ please visit our documentation website [for the latest release](https://pytorch.

Check out the [Getting Started](https://pytorch.org/executorch/stable/getting-started-setup.html#quick-setup-colab-jupyter-notebook-prototype) page for a quick spin.

- Check out the examples of [Llama](./examples/models/llama2/README.md), [Llava](./examples/models/llava/README.md) and [other models](./examples/README.md) running on edge devices using ExecuTorch.
+ Check out the examples of [Llama](./examples/models/llama/README.md), [Llava](./examples/models/llava/README.md) and [other models](./examples/README.md) running on edge devices using ExecuTorch.


- **[UPDATE - 09/25]** We have added support for running [Llama 3.2 1B/3B](./examples/models/llama2/README.md) models via ExecuTorch.
+ **[UPDATE - 09/25]** We have added support for running [Llama 3.2 1B/3B](./examples/models/llama/README.md) models via ExecuTorch.

## Feedback

2 changes: 1 addition & 1 deletion backends/qualcomm/runtime/targets.bzl
@@ -53,7 +53,7 @@ def define_common_targets():
exclude = ["Logging.h"],
),
define_static_target = True,
- link_whole = True, # needed for executorch/examples/models/llama2:main to register QnnBackend
+ link_whole = True, # needed for executorch/examples/models/llama:main to register QnnBackend
platforms = [ANDROID],
visibility = ["@EXECUTORCH_CLIENTS"],
resources = {
4 changes: 2 additions & 2 deletions backends/qualcomm/tests/test_qnn_delegate.py
@@ -37,7 +37,7 @@
skip_annotation,
)

- from executorch.examples.models.llama2.llama_transformer import (
+ from executorch.examples.models.llama.llama_transformer import (
ModelArgs,
MOEFeedForward,
)
@@ -51,7 +51,7 @@
from executorch.examples.models.inception_v3 import InceptionV3Model
from executorch.examples.models.inception_v4 import InceptionV4Model

- # from executorch.examples.models.llama2 import Llama2Model
+ # from executorch.examples.models.llama import Llama2Model
from executorch.examples.models.mobilebert import MobileBertModelExample
from executorch.examples.models.mobilenet_v2 import MV2Model
from executorch.examples.models.mobilenet_v3 import MV3Model
12 changes: 6 additions & 6 deletions backends/vulkan/docs/android_demo.md
@@ -57,7 +57,7 @@ partially lower the Llama model to Vulkan.

```shell
# The files will usually be downloaded to ~/.llama
- python -m examples.models.llama2.export_llama \
+ python -m examples.models.llama.export_llama \
--disable_dynamic_shape --vulkan -kv --use_sdpa_with_kv_cache -d fp32 \
-c ~/.llama/checkpoints/Llama3.2-1B/consolidated.00.pth \
-p ~/.llama/checkpoints/Llama3.2-1B/params.json \
@@ -95,23 +95,23 @@ binary using the Android NDK toolchain.
cmake --build cmake-android-out -j16 --target install)

# Build LLaMA Runner library
- (rm -rf cmake-android-out/examples/models/llama2 && \
- cmake examples/models/llama2 \
+ (rm -rf cmake-android-out/examples/models/llama && \
+ cmake examples/models/llama \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=$ANDROID_ABI \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-DCMAKE_INSTALL_PREFIX=cmake-android-out \
-DPYTHON_EXECUTABLE=python \
- -Bcmake-android-out/examples/models/llama2 && \
- cmake --build cmake-android-out/examples/models/llama2 -j16)
+ -Bcmake-android-out/examples/models/llama && \
+ cmake --build cmake-android-out/examples/models/llama -j16)
```

Finally, push and run the llama runner binary on your Android device. Note that
your device must have sufficient GPU memory to execute the model.

```shell
- adb push cmake-android-out/examples/models/llama2/llama_main /data/local/tmp/llama_main
+ adb push cmake-android-out/examples/models/llama/llama_main /data/local/tmp/llama_main

adb shell /data/local/tmp/llama_main \
--model_path=/data/local/tmp/vulkan_llama2.pte \
2 changes: 1 addition & 1 deletion backends/xnnpack/test/TARGETS
@@ -58,7 +58,7 @@ runtime.python_test(
"fbsource//third-party/pypi/torchsr:torchsr", # @manual
"fbsource//third-party/pypi/transformers:transformers", # @manual
"//executorch/backends/xnnpack/test/tester:tester",
"//executorch/examples/models/llama2:llama2_model",
"//executorch/examples/models/llama:llama2_model",
"//pytorch/audio/src:torchaudio_core",
"//pytorch/vision:torchvision", # @manual
],
2 changes: 1 addition & 1 deletion backends/xnnpack/test/models/llama2_et_example.py
@@ -9,7 +9,7 @@
import torch

from executorch.backends.xnnpack.test.tester import Tester
- from executorch.examples.models.llama2.model import Llama2Model
+ from executorch.examples.models.llama.model import Llama2Model


class TestLlama2ETExample(unittest.TestCase):
2 changes: 1 addition & 1 deletion build/cmake_deps.toml
@@ -383,7 +383,7 @@ deps = [

[targets.llama_runner]
buck_targets = [
"//examples/models/llama2/runner:runner",
"//examples/models/llama/runner:runner",
]
filters = [
".cpp$",
@@ -6,7 +6,7 @@ This tutorial demonstrates how to export Llama 3 8B Instruct for Qualcomm AI Eng

- Set up your ExecuTorch repo and environment if you haven’t done so by following [the Setting up ExecuTorch](../getting-started-setup.md) to set up the repo and dev environment.
- Read [the Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend page](../build-run-qualcomm-ai-engine-direct-backend.md) to understand how to export and run a model with Qualcomm AI Engine Direct Backend on Qualcomm device.
- - Follow [the README for executorch llama](https://github.com/pytorch/executorch/tree/main/examples/models/llama2) to know how to run a llama model on mobile via ExecuTorch.
+ - Follow [the README for executorch llama](https://github.com/pytorch/executorch/tree/main/examples/models/llama) to know how to run a llama model on mobile via ExecuTorch.
- A Qualcomm device with 16GB RAM
- We are continuing to optimize our memory usage to ensure compatibility with lower memory devices.
- The version of [Qualcomm AI Engine Direct SDK](https://developer.qualcomm.com/software/qualcomm-ai-engine-direct-sdk) is 2.26.0 or above.
@@ -39,7 +39,7 @@ To export Llama 3 8B instruct with the Qualcomm AI Engine Direct Backend, ensure

```bash
# Please note that calibration_data must include the prompt template for special tokens.
- python -m examples.models.llama2.export_llama -t <path_to_tokenizer.model>
+ python -m examples.models.llama.export_llama -t <path_to_tokenizer.model>
llama3/Meta-Llama-3-8B-Instruct/tokenizer.model -p <path_to_params.json> -c <path_to_checkpoint_for_Meta-Llama-3-8B-Instruct> --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path <path_to_optimized_matrix> --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
```

@@ -76,9 +76,9 @@ llama3/Meta-Llama-3-8B-Instruct/tokenizer.model -p <path_to_params.json> -c <pat
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
- -Bcmake-android-out/examples/models/llama2 examples/models/llama2
+ -Bcmake-android-out/examples/models/llama examples/models/llama

- cmake --build cmake-android-out/examples/models/llama2 -j16 --config Release
+ cmake --build cmake-android-out/examples/models/llama -j16 --config Release
```
3. Run on Android via adb shell
*Pre-requisite*: Make sure you enable USB debugging via developer options on your phone
@@ -105,7 +105,7 @@ adb push ${QNN_SDK_ROOT}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${DEVICE_D
adb push <model.pte> ${DEVICE_DIR}
adb push <tokenizer.model> ${DEVICE_DIR}
adb push cmake-android-out/lib/libqnn_executorch_backend.so ${DEVICE_DIR}
- adb push cmake-out-android/examples/models/llama2/llama_main ${DEVICE_DIR}
+ adb push cmake-out-android/examples/models/llama/llama_main ${DEVICE_DIR}
```

**3.4 Run model**
2 changes: 1 addition & 1 deletion docs/source/llm/llama.md
@@ -1,5 +1,5 @@
# Llama on ExecuTorch

See
- [Llama readme](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/README.md)
+ [Llama readme](https://github.com/pytorch/executorch/blob/main/examples/models/llama/README.md)
for detailed information about running Llama on ExecuTorch.
2 changes: 1 addition & 1 deletion examples/README.md
@@ -39,7 +39,7 @@ For specific details related to models and backend, you can explore the various

### Llama Models

- [This page](./models/llama2/README.md) demonstrates how to run Llama 3.2 (1B, 3B), Llama 3.1 (8B), Llama 3 (8B), and Llama 2 7B models on mobile via ExecuTorch. We use XNNPACK, QNNPACK, MediaTek, and MPS to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on Android and iOS mobile phones.
+ [This page](./models/llama/README.md) demonstrates how to run Llama 3.2 (1B, 3B), Llama 3.1 (8B), Llama 3 (8B), and Llama 2 7B models on mobile via ExecuTorch. We use XNNPACK, QNNPACK, MediaTek, and MPS to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on Android and iOS mobile phones.

### Llava1.5 7B

@@ -372,7 +372,7 @@ HierarchicalAllocator planned_memory(
strstr(model_path, "emformer_transcribe") ||
strstr(model_path, "emformer_join") ||
strstr(model_path, "edsr") ||
strstr(model_path, "llama2") ||
strstr(model_path, "llama") ||
strstr(model_path, "ic3") ||
strstr(model_path, "ic4")) {
atol = 1e-04;
2 changes: 1 addition & 1 deletion examples/cadence/models/babyllama.py
@@ -14,7 +14,7 @@

from executorch.backends.cadence.aot.export_example import export_model

- from executorch.examples.models.llama2.llama_transformer import ModelArgs, Transformer
+ from executorch.examples.models.llama.llama_transformer import ModelArgs, Transformer


FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"