#!/bin/bash
# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

set -exu

source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"

export EXECUTORCH_ROOT="$(dirname "${BASH_SOURCE[0]}")/../.."

if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
  PYTHON_EXECUTABLE=python3
fi

which "${PYTHON_EXECUTABLE}"
# Update tokenizers submodule
pushd "$EXECUTORCH_ROOT/extension/llm/tokenizers"
echo "Update tokenizers submodule"
git submodule update --init
popd

# Install ET with CMake
cmake -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" \
    -DCMAKE_INSTALL_PREFIX=cmake-out \
    -DEXECUTORCH_ENABLE_LOGGING=1 \
    -DCMAKE_BUILD_TYPE=Release \
    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
    -DEXECUTORCH_BUILD_XNNPACK=OFF \
    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
    -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
    -Bcmake-out .
cmake --build cmake-out -j16 --target install --config Release
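
# Optional sanity check, a sketch that assumes the usual CMake install layout:
# with CMAKE_INSTALL_PREFIX=cmake-out, installed libraries land under cmake-out/lib.
test -d cmake-out/lib || { echo "ExecuTorch install did not produce cmake-out/lib" >&2; exit 1; }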

# Install llama runner with torchao
# (sysconfig replaces the deprecated distutils lookup for the site-packages path)
cmake -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" \
    -DCMAKE_PREFIX_PATH="$("${PYTHON_EXECUTABLE}" -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])')" \
    -DCMAKE_BUILD_TYPE=Release \
    -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
    -DEXECUTORCH_BUILD_XNNPACK=OFF \
    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
    -DEXECUTORCH_BUILD_TORCHAO=ON \
    -Bcmake-out/examples/models/llama \
    examples/models/llama
cmake --build cmake-out/examples/models/llama -j16 --config Release
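
# Sanity check: the runner binary invoked in the test run below should now exist.
test -x cmake-out/examples/models/llama/llama_main || { echo "llama_main was not built" >&2; exit 1; }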

# Download stories110M model artifacts
download_stories_model_artifacts

echo "Creating tokenizer.bin"
"${PYTHON_EXECUTABLE}" -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
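
# Sanity check: confirm the binary tokenizer was actually written.
test -s tokenizer.bin || { echo "tokenizer.bin was not created" >&2; exit 1; }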

# Export model
LLAMA_CHECKPOINT=stories110M.pt
LLAMA_PARAMS=params.json
MODEL_OUT=model.pte
TOKENIZER=tokenizer.bin

# Set low-bit quantization parameters
QLINEAR_BITWIDTH=3 # Can be 1-8
QLINEAR_GROUP_SIZE=128 # Must be a multiple of 16
QEMBEDDING_BITWIDTH=4 # Can be 1-8
QEMBEDDING_GROUP_SIZE=32 # Must be a multiple of 16
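
# Guard the constraints noted above (bitwidths in 1-8, group sizes multiples of 16)
# so a bad edit fails here rather than deep inside the export.
if (( QLINEAR_BITWIDTH < 1 || QLINEAR_BITWIDTH > 8 || QEMBEDDING_BITWIDTH < 1 || QEMBEDDING_BITWIDTH > 8 )); then
  echo "Bitwidths must be in 1-8" >&2; exit 1
fi
if (( QLINEAR_GROUP_SIZE % 16 != 0 || QEMBEDDING_GROUP_SIZE % 16 != 0 )); then
  echo "Group sizes must be multiples of 16" >&2; exit 1
fi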

"${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama \
    --checkpoint "${LLAMA_CHECKPOINT:?}" \
    --params "${LLAMA_PARAMS:?}" \
    -kv \
    --use_sdpa_with_kv_cache \
    --output_name="${MODEL_OUT}" \
    -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \
    --group_size "${QLINEAR_GROUP_SIZE}" \
    -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
    --disable_dynamic_shape \
    -d fp32
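
# Sanity check: the export should have produced a non-empty .pte program.
test -s "${MODEL_OUT}" || { echo "${MODEL_OUT} was not exported" >&2; exit 1; }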

# Test run
./cmake-out/examples/models/llama/llama_main --model_path="${MODEL_OUT}" --tokenizer_path="${TOKENIZER}" --prompt="Once upon a time,"
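
# Optional stricter check, left commented out as a sketch: it assumes llama_main
# echoes the prompt followed by the generated text on stdout, which may not hold.
# RESULT=$(./cmake-out/examples/models/llama/llama_main --model_path="${MODEL_OUT}" --tokenizer_path="${TOKENIZER}" --prompt="Once upon a time,")
# [[ "${RESULT}" == "Once upon a time,"* ]] || { echo "Unexpected runner output: ${RESULT}" >&2; exit 1; }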