#!/bin/bash
# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

 | 8 | +set -exu  | 
 | 9 | + | 
 | 10 | +source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"  | 
 | 11 | + | 
 | 12 | +export EXECUTORCH_ROOT="$(dirname "${BASH_SOURCE[0]}")/../.."  | 
 | 13 | + | 
 | 14 | +if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then  | 
 | 15 | +  PYTHON_EXECUTABLE=python3  | 
 | 16 | +fi  | 
 | 17 | + | 
 | 18 | +which "${PYTHON_EXECUTABLE}"  | 
 | 19 | + | 
 | 20 | +# Update tokenizers submodule  | 
 | 21 | +pushd $EXECUTORCH_ROOT/extension/llm/tokenizers  | 
 | 22 | +echo "Update tokenizers submodule"  | 
 | 23 | +git submodule update --init  | 
 | 24 | +popd  | 
 | 25 | + | 
 | 26 | +# Install ET with CMake  | 
 | 27 | +cmake -DPYTHON_EXECUTABLE=python \  | 
 | 28 | +    -DCMAKE_INSTALL_PREFIX=cmake-out \  | 
 | 29 | +    -DEXECUTORCH_ENABLE_LOGGING=1 \  | 
 | 30 | +    -DCMAKE_BUILD_TYPE=Release \  | 
 | 31 | +    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \  | 
 | 32 | +    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \  | 
 | 33 | +    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \  | 
 | 34 | +    -DEXECUTORCH_BUILD_XNNPACK=OFF \  | 
 | 35 | +    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \  | 
 | 36 | +    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \  | 
 | 37 | +    -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \  | 
 | 38 | +    -Bcmake-out .  | 
 | 39 | +cmake --build cmake-out -j16 --target install --config Release  | 
 | 40 | + | 
 | 41 | +# Install llama runner with torchao  | 
 | 42 | +cmake -DPYTHON_EXECUTABLE=python \  | 
 | 43 | +    -DCMAKE_PREFIX_PATH=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') \  | 
 | 44 | +    -DCMAKE_BUILD_TYPE=Release \  | 
 | 45 | +    -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \  | 
 | 46 | +    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \  | 
 | 47 | +    -DEXECUTORCH_BUILD_XNNPACK=OFF \  | 
 | 48 | +    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \  | 
 | 49 | +    -DEXECUTORCH_BUILD_TORCHAO=ON \  | 
 | 50 | +    -Bcmake-out/examples/models/llama \  | 
 | 51 | +    examples/models/llama  | 
 | 52 | +cmake --build cmake-out/examples/models/llama -j16 --config Release  | 
 | 53 | + | 
 | 54 | +# Download stories llama110m artifacts  | 
 | 55 | +download_stories_model_artifacts  | 
 | 56 | + | 
 | 57 | +echo "Creating tokenizer.bin"  | 
 | 58 | +$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin  | 
 | 59 | + | 
 | 60 | +# Export model  | 
 | 61 | +LLAMA_CHECKPOINT=stories110M.pt  | 
 | 62 | +LLAMA_PARAMS=params.json  | 
 | 63 | +MODEL_OUT=model.pte  | 
 | 64 | +TOKENIZER=tokenizer.bin  | 
 | 65 | + | 
 | 66 | +# Set low-bit quantization parameters  | 
 | 67 | +QLINEAR_BITWIDTH=3 # Can be 1-8  | 
 | 68 | +QLINEAR_GROUP_SIZE=128 # Must be multiple of 16  | 
 | 69 | +QEMBEDDING_BITWIDTH=4 # Can be 1-8  | 
 | 70 | +QEMBEDDING_GROUP_SIZE=32 # Must be multiple of 16  | 
 | 71 | + | 
 | 72 | +${PYTHON_EXECUTABLE} -m examples.models.llama.export_llama \  | 
 | 73 | +    --checkpoint "${LLAMA_CHECKPOINT:?}" \  | 
 | 74 | +    --params "${LLAMA_PARAMS:?}" \  | 
 | 75 | +    -kv \  | 
 | 76 | +    --use_sdpa_with_kv_cache \  | 
 | 77 | +    --output_name=${MODEL_OUT} \  | 
 | 78 | +    -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \  | 
 | 79 | +    --group_size ${QLINEAR_GROUP_SIZE} \  | 
 | 80 | +    -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \  | 
 | 81 | +    --disable_dynamic_shape \  | 
 | 82 | +    -d fp32  | 
 | 83 | + | 
 | 84 | +# Test run  | 
 | 85 | +./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path=$TOKENIZER --prompt="Once upon a time,"  | 