27 changes: 5 additions & 22 deletions .ci/scripts/test_phi_3_mini.sh
@@ -22,31 +22,14 @@ NPROC=8
 if hash nproc &> /dev/null; then NPROC=$(nproc); fi
 
 cmake_install_executorch_libraries() {
-  cmake -DPYTHON_EXECUTABLE=python \
-    -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
-    -DEXECUTORCH_ENABLE_LOGGING=1 \
-    -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-    -DEXECUTORCH_BUILD_XNNPACK=ON \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-    -B${BUILD_DIR} .
-
-  cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE}
+  rm -rf cmake-out
+  cmake --preset llm -DCMAKE_INSTALL_PREFIX=cmake-out -DCMAKE_BUILD_TYPE=${BUILD_TYPE}
+  cmake --build cmake-out -j16 --target install --config ${BUILD_TYPE}
 }
 
 cmake_build_phi_3_mini() {
-  cmake -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
-    -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
+  cmake -DCMAKE_PREFIX_PATH=${BUILD_DIR} \
     -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-    -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_XNNPACK=ON \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -B${BUILD_DIR}/${MODEL_DIR} \
     ${MODEL_DIR}
 
@@ -81,7 +64,7 @@ run_and_verify() {
   ${BUILD_DIR}/${MODEL_DIR}/phi_3_mini_runner \
     --model_path=phi-3-mini.pte \
     --tokenizer_path=tokenizer.bin \
-    --seq_len=128 \
+    --seq_len=60 \
     --temperature=0 \
     --prompt="<|system|>
 You are a helpful assistant.<|end|>
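Net effect: the script no longer enumerates `EXECUTORCH_BUILD_*` flags by hand; the library build is delegated to the `llm` CMake preset, and the runner build discovers the installed package through `CMAKE_PREFIX_PATH`. A minimal sketch of the equivalent manual flow (assuming it is run from the root of an executorch checkout, with `Release` standing in for the script's `${BUILD_TYPE}`):

```
# Build and install the ExecuTorch libraries via the LLM preset.
rm -rf cmake-out
cmake --preset llm -DCMAKE_INSTALL_PREFIX=cmake-out -DCMAKE_BUILD_TYPE=Release
cmake --build cmake-out -j16 --target install --config Release

# Configure the example against the install tree and build the runner.
cmake -DCMAKE_PREFIX_PATH=cmake-out \
  -DCMAKE_BUILD_TYPE=Release \
  -Bcmake-out/examples/models/phi-3-mini \
  examples/models/phi-3-mini
cmake --build cmake-out/examples/models/phi-3-mini -j16 --config Release
```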
2 changes: 1 addition & 1 deletion .github/workflows/pull.yml
@@ -603,7 +603,7 @@ jobs:
           bash examples/models/phi-3-mini/install_requirements.sh
 
           # run e2e (export, tokenizer and runner)
-          PYTHON_EXECUTABLE=python bash .ci/scripts/test_phi_3_mini.sh
+          PYTHON_EXECUTABLE=python bash .ci/scripts/test_phi_3_mini.sh Release
 
   test-eval_llama-wikitext-linux:
     name: test-eval_llama-wikitext-linux
43 changes: 20 additions & 23 deletions examples/models/phi-3-mini/CMakeLists.txt
@@ -13,43 +13,40 @@
 # It should also be cmake-lint clean.
 #
 
-cmake_minimum_required(VERSION 3.19)
+cmake_minimum_required(VERSION 3.24)
+cmake_policy(SET CMP0144 NEW)
 project(phi_3_mini_runner)
 
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
 set(CMAKE_BUILD_TYPE Release)
 
-# Set options for executorch build.
-option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON)
-option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON)
-option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR "" ON)
-option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON)
-option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON)
-option(EXECUTORCH_BUILD_XNNPACK "" ON)
+set(EXECUTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../..")
+set(_common_include_directories
+  ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
+)
+set(executorch_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../lib/cmake/ExecuTorch)
+find_package(executorch CONFIG REQUIRED)
+
+target_link_options_shared_lib(executorch)
 
+set(BUILD_TESTING OFF)
 add_subdirectory(
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../.. ${CMAKE_BINARY_DIR}/../../..
+  ${EXECUTORCH_ROOT}/extension/llm/runner
+  ${CMAKE_BINARY_DIR}/../../../extension/llm/runner
 )
 
 if(NOT TARGET gflags)
   add_subdirectory(
     ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/gflags
     ${CMAKE_BINARY_DIR}/gflags
   )
 endif()
 
-add_executable(
-  phi_3_mini_runner
-  main.cpp runner.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/sampler/sampler.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/tokenizers/src/llama2c_tokenizer.cpp
-)
-target_include_directories(
-  phi_3_mini_runner
-  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/gflags/src
-         ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/tokenizers/include
-)
+add_executable(phi_3_mini_runner main.cpp)
+
+target_link_directories(phi_3_mini_runner PUBLIC ${_common_include_directories})
+
 target_link_libraries(
-  phi_3_mini_runner PRIVATE executorch extension_module_static extension_tensor
-                            optimized_native_cpu_ops_lib xnnpack_backend gflags
+  phi_3_mini_runner PUBLIC executorch optimized_native_cpu_ops_lib
+                           xnnpack_backend gflags extension_llm_runner
 )
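The rewrite above switches the example from compiling the whole ExecuTorch tree with `add_subdirectory` to consuming a pre-built install through `find_package`. A minimal sketch of that consumer pattern (the `my_runner` project name is hypothetical; assumes an install tree such as the `cmake-out` produced by the steps in the README below):

```
cmake_minimum_required(VERSION 3.24)
project(my_runner)

set(CMAKE_CXX_STANDARD 17)

# Resolved from the install tree, e.g. -DCMAKE_PREFIX_PATH=/path/to/cmake-out
find_package(executorch CONFIG REQUIRED)

add_executable(my_runner main.cpp)

# extension_llm_runner supplies the TextLLMRunner API used by the new main.cpp.
target_link_libraries(my_runner PUBLIC executorch extension_llm_runner)
```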
42 changes: 13 additions & 29 deletions examples/models/phi-3-mini/README.md
@@ -4,9 +4,9 @@ This example demonstrates how to run a [Phi-3-mini](https://huggingface.co/micro
 # Instructions
 ## Step 1: Setup
 1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. For installation run `./install_executorch.sh`
-2. Currently, we support transformers v4.44.2. Install transformers with the following command:
+2. Currently, we support transformers v4.53.1. Install transformers with the following command:
 ```
-pip uninstall -y transformers ; pip install transformers==4.44.2
+pip uninstall -y transformers ; pip install transformers==4.53.1
 ```
 ## Step 2: Prepare and run the model
 1. Download the `tokenizer.model` from HuggingFace and create `tokenizer.bin`.
@@ -17,41 +17,25 @@ python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokeniz
 ```
 2. Export the model. This step will take a few minutes to finish.
 ```
-python -m examples.models.phi-3-mini.export_phi-3-mini -c "4k" -s 128 -o phi-3-mini.pte
+python -m executorch.examples.models.phi-3-mini.export_phi-3-mini -c "4k" -s 128 -o phi-3-mini.pte
 ```
 3. Build and run the model.
-- Build executorch with optimized CPU performance as follows. Build options available [here](https://github.com/pytorch/executorch/blob/main/CMakeLists.txt#L59).
-```
-cmake -DPYTHON_EXECUTABLE=python \
-  -DCMAKE_INSTALL_PREFIX=cmake-out \
-  -DEXECUTORCH_ENABLE_LOGGING=1 \
-  -DCMAKE_BUILD_TYPE=Release \
-  -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-  -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-  -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-  -DEXECUTORCH_BUILD_XNNPACK=ON \
-  -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-  -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-  -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-  -Bcmake-out .
-
-cmake --build cmake-out -j16 --target install --config Release
+- Build executorch with LLM preset:
+```
+cmake --preset llm -DCMAKE_INSTALL_PREFIX=cmake-out
+
+cmake --build cmake-out -j16 --target install --config Release
 ```
 - Build Phi-3-mini runner.
 ```
-cmake -DPYTHON_EXECUTABLE=python \
-  -DCMAKE_INSTALL_PREFIX=cmake-out \
-  -DCMAKE_BUILD_TYPE=Release \
-  -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-  -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-  -DEXECUTORCH_BUILD_XNNPACK=ON \
-  -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-  -Bcmake-out/examples/models/phi-3-mini \
-  examples/models/phi-3-mini
+cmake -DCMAKE_PREFIX_PATH=cmake-out \
+  -DCMAKE_BUILD_TYPE=Release \
+  -Bcmake-out/examples/models/phi-3-mini \
+  examples/models/phi-3-mini
 
 cmake --build cmake-out/examples/models/phi-3-mini -j16 --config Release
 ```
-- Run model. Options available [here](https://github.com/pytorch/executorch/blob/main/examples/models/phi-3-mini/main.cpp#L13-L30)
+- Run model. Options available [here](https://github.com/pytorch/executorch/blob/main/examples/models/phi-3-mini/main.cpp#L16-L33)
 ```
 cmake-out/examples/models/phi-3-mini/phi_3_mini_runner \
   --model_path=phi-3-mini.pte \
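The run command is cut off in this view; for reference, a plausible full invocation mirrors the flags the CI script passes to the same binary (the chat turns after the system message are illustrative, not taken from this diff):

```
cmake-out/examples/models/phi-3-mini/phi_3_mini_runner \
  --model_path=phi-3-mini.pte \
  --tokenizer_path=tokenizer.bin \
  --seq_len=128 \
  --temperature=0 \
  --prompt="<|system|>
You are a helpful assistant.<|end|>
<|user|>
Tell me a joke.<|end|>
<|assistant|>"
```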
114 changes: 81 additions & 33 deletions examples/models/phi-3-mini/export_phi-3-mini.py
@@ -19,13 +19,42 @@
     XNNPACKQuantizer,
 )
 from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config
-from executorch.exir import to_edge
+from executorch.exir import to_edge_transform_and_lower
+from executorch.exir.capture._config import ExecutorchBackendConfig
+from executorch.exir.passes import MemoryPlanningPass
+from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
 from torch.export import export_for_training
+from torch.nn.attention import SDPBackend
 from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
 
 from transformers import Phi3ForCausalLM
+from transformers.cache_utils import StaticCacheConfig
 
-from .phi_3_mini import Phi3Mini
+from transformers.integrations.executorch import TorchExportableModuleForDecoderOnlyLM
+
+
+def _prepare_export_inputs(max_seq_len: int, sliding_window: int):
+    """
+    Prepare example inputs and configurations for export.
+
+    Returns:
+        example_input_ids (torch.Tensor): Example input IDs tensor.
+        example_cache_position (torch.Tensor): Example cache position tensor.
+        dynamic_shapes (dict or None): Dynamic shape specifications for export.
+        strict (bool): Whether to use strict export mode.
+    """
+    # Prepare inputs with dynamic shapes
+    seq_length = 3  # Sequence length > 1 to avoid specialization issues
+    example_input_ids = torch.zeros((1, seq_length), dtype=torch.long)
+    example_cache_position = torch.arange(seq_length, dtype=torch.long)
+    max_dim = min(max_seq_len, sliding_window) - 1
+    seq_len_dim = torch.export.Dim("seq_length_dim", max=max_dim)
+    dynamic_shapes = {
+        "input_ids": {1: seq_len_dim},
+        "cache_position": {0: seq_len_dim},
+    }
+
+    return example_input_ids, example_cache_position, dynamic_shapes
+
+
@@ -40,51 +40,69 @@ def export(args) -> None:
f"Invalid context length {args.context_length}. Should be either 4k or 128k"
)

with torch.no_grad():
model = Phi3Mini(
# pyre-ignore: Undefined attribute [16]: Module `transformers` has no attribute `Phi3ForCausalLM`
model=Phi3ForCausalLM.from_pretrained(model_name),
with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
model = Phi3ForCausalLM.from_pretrained(model_name)
model.generation_config.cache_implementation = "static"
model.generation_config.cache_config = StaticCacheConfig(
batch_size=1, max_cache_len=model.config.max_position_embeddings
)

exportable_module = TorchExportableModuleForDecoderOnlyLM(
model,
max_batch_size=1,
max_seq_len=args.seq_len,
max_cache_len=model.config.max_position_embeddings,
)
example_inputs = (
torch.tensor(
[[1048, 263, 931, 746]], dtype=torch.long, requires_grad=False
),
input_ids, cache_position, dynamic_shapes = _prepare_export_inputs(
model.config.max_position_embeddings, model.config.sliding_window
)
example_inputs = (input_ids, cache_position)
exported_program = exportable_module.export(
input_ids, cache_position, dynamic_shapes, strict=False
)
# Apply RemoveTransposes pass to remove
# any back-to-back transpose ops that are not needed
# e.g. output of update_cache is transposed and
# input to custom_sdpa is transposed.
from executorch.extension.llm.export.export_passes import (
RemoveRedundantTransposes,
)
dynamic_shapes = {
"input_ids": {
1: torch.export.Dim("sequence_length", min=1, max=args.seq_len)
}
}

mutated_gm = RemoveRedundantTransposes()(exported_program.module())[0]

xnnpack_quant_config = get_symmetric_quantization_config(
is_per_channel=True, is_dynamic=True
)
xnnpack_quantizer = XNNPACKQuantizer()
xnnpack_quantizer.set_global(xnnpack_quant_config)

model = export_for_training(
model, example_inputs, dynamic_shapes=dynamic_shapes, strict=True
).module()
model = prepare_pt2e(model, xnnpack_quantizer) # pyre-fixme[6]
model(*example_inputs)
model = convert_pt2e(model)
DuplicateDynamicQuantChainPass()(model)
# TODO(lunwenh): update it to use export once
# https://github.com/pytorch/pytorch/issues/128394 is resolved.
model = torch.export._trace._export(
model,
example_inputs,
dynamic_shapes=dynamic_shapes,
strict=False,
pre_dispatch=False,
gm = prepare_pt2e(mutated_gm, xnnpack_quantizer) # pyre-fixme[6]
gm(*example_inputs)
gm = convert_pt2e(gm)
DuplicateDynamicQuantChainPass()(gm)
exported_program = export_for_training(
gm, example_inputs, dynamic_shapes=dynamic_shapes, strict=False
)

edge_config = get_xnnpack_edge_compile_config()
edge_manager = to_edge(model, compile_config=edge_config)
edge_manager = to_edge_transform_and_lower(
exported_program,
partitioner=[XnnpackPartitioner()],
compile_config=edge_config,
constant_methods={
"get_eos_ids": [32000],
"use_kv_cache": True,
"enable_dynamic_shape": True,
"get_max_seq_len": model.config.max_position_embeddings - 1,
},
)
edge_manager = edge_manager.to_backend(XnnpackPartitioner())
et_program = edge_manager.to_executorch()
et_program = edge_manager.to_executorch(
ExecutorchBackendConfig(
extract_delegate_segments=True,
do_quant_fusion_and_const_prop=True,
memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(),
)
)

with open(args.output_name, "wb") as file:
file.write(et_program.buffer)
Expand Down
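The dynamic-shape setup in `_prepare_export_inputs` is the core of the new export path: a single `torch.export.Dim` is shared between the sequence axis of `input_ids` and the length of `cache_position`, so the exported graph accepts prompts of varying length up to the cache bound. A self-contained sketch of the same mechanism on a toy module (the `TinyLM` model and its sizes are illustrative, not part of the PR):

```
import torch


class TinyLM(torch.nn.Module):
    """Stand-in for the decoder: takes token ids plus cache positions."""

    def __init__(self, vocab_size: int = 32, hidden: int = 8, max_pos: int = 128):
        super().__init__()
        self.embed = torch.nn.Embedding(vocab_size, hidden)
        self.pos = torch.nn.Embedding(max_pos, hidden)
        self.proj = torch.nn.Linear(hidden, vocab_size)

    def forward(self, input_ids: torch.Tensor, cache_position: torch.Tensor):
        # A real decoder indexes its static KV cache with cache_position;
        # the toy model just adds a positional embedding.
        return self.proj(self.embed(input_ids) + self.pos(cache_position))


# Sequence length > 1 so export does not specialize the dimension.
seq_length = 3
input_ids = torch.zeros((1, seq_length), dtype=torch.long)
cache_position = torch.arange(seq_length, dtype=torch.long)

# One shared Dim ties input_ids' axis 1 to cache_position's axis 0,
# exactly as _prepare_export_inputs does above.
seq_len_dim = torch.export.Dim("seq_length_dim", max=127)
dynamic_shapes = {
    "input_ids": {1: seq_len_dim},
    "cache_position": {0: seq_len_dim},
}

ep = torch.export.export(
    TinyLM(),
    (input_ids, cache_position),
    dynamic_shapes=dynamic_shapes,
    strict=False,
)
print(ep)  # the sequence dimension appears as a symbolic size
```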
2 changes: 0 additions & 2 deletions examples/models/phi-3-mini/install_requirements.sh
@@ -7,8 +7,6 @@
 
 set -x
 
-pip install transformers==4.44.2
-
 pip install sentencepiece
 
 pip list
16 changes: 13 additions & 3 deletions examples/models/phi-3-mini/main.cpp
@@ -6,9 +6,12 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <executorch/extension/llm/runner/text_llm_runner.h>
 #include <gflags/gflags.h>
+#include <pytorch/tokenizers/llama2c_tokenizer.h>
 #include <iostream>
 
-#include <executorch/examples/models/phi-3-mini/runner.h>
+using executorch::extension::llm::TextLLMRunner;
 
 DEFINE_string(
     model_path,
@@ -42,9 +45,16 @@ int main(int32_t argc, char** argv) {
 
   int32_t seq_len = FLAGS_seq_len;
 
-  example::Runner runner(model_path, tokenizer_path, temperature);
+  std::unique_ptr<tokenizers::Tokenizer> tokenizer =
+      std::make_unique<tokenizers::Llama2cTokenizer>();
+  tokenizer->load(tokenizer_path);
 
-  runner.generate(prompt, seq_len);
+  auto runner = executorch::extension::llm::create_text_llm_runner(
+      model_path, std::move(tokenizer));
+
+  runner->generate(
+      prompt,
+      {.seq_len = seq_len, .temperature = static_cast<float>(temperature)});
 
   return 0;
 }
2 changes: 1 addition & 1 deletion examples/models/phi-3-mini/phi_3_mini.py
@@ -30,7 +30,7 @@ def __init__(self, model: Phi3ForCausalLM, max_batch_size: int, max_seq_len: int
     def forward(
         self,
         # pyre-fixme[9]: input_ids has type `LongTensor`; used as `None`.
-        input_ids: torch.LongTensor = None,
+        input_ids: torch.LongTensor,
     ) -> torch.FloatTensor:
         # pyre-fixme[16]: `Phi3ForCausalLM` has no attribute `forward`.
         return self.model.forward(