25 changes: 4 additions & 21 deletions .ci/scripts/test_phi_3_mini.sh
@@ -22,31 +22,14 @@ NPROC=8
 if hash nproc &> /dev/null; then NPROC=$(nproc); fi

 cmake_install_executorch_libraries() {
-  cmake -DPYTHON_EXECUTABLE=python \
-    -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
-    -DEXECUTORCH_ENABLE_LOGGING=1 \
-    -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-    -DEXECUTORCH_BUILD_XNNPACK=ON \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-    -B${BUILD_DIR} .
-
-  cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE}
+  cmake --preset llm -DCMAKE_INSTALL_PREFIX=cmake-out -DCMAKE_BUILD_TYPE=${BUILD_TYPE}
+
+  cmake --build cmake-out -j16 --target install --config ${BUILD_TYPE}
 }

 cmake_build_phi_3_mini() {
-  cmake -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
-    -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
+  cmake -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
     -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-    -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_XNNPACK=ON \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -B${BUILD_DIR}/${MODEL_DIR} \
     ${MODEL_DIR}
32 changes: 11 additions & 21 deletions examples/models/phi-3-mini/CMakeLists.txt
@@ -20,17 +20,14 @@ set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
 set(CMAKE_BUILD_TYPE Release)

-# Set options for executorch build.
-option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON)
-option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON)
-option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR "" ON)
-option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON)
-option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON)
-option(EXECUTORCH_BUILD_XNNPACK "" ON)
-
-add_subdirectory(
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../.. ${CMAKE_BINARY_DIR}/../../..
-)
+set(EXECUTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../..")
+find_package(executorch CONFIG REQUIRED)

 target_link_options_shared_lib(executorch)

+set(BUILD_TESTING OFF)
+add_subdirectory(${EXECUTORCH_ROOT}/extension/llm/runner ${CMAKE_BINARY_DIR}/../../../extension/llm/runner)
+
 if(NOT TARGET gflags)
   add_subdirectory(
     ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/gflags
@@ -40,16 +40,9 @@ endif()

 add_executable(
   phi_3_mini_runner
-  main.cpp runner.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/sampler/sampler.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/tokenizers/src/llama2c_tokenizer.cpp
-)
-target_include_directories(
-  phi_3_mini_runner
-  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/gflags/src
-         ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/tokenizers/include
+  main.cpp
 )

 target_link_libraries(
-  phi_3_mini_runner PRIVATE executorch extension_module_static extension_tensor
-                            optimized_native_cpu_ops_lib xnnpack_backend gflags
+  phi_3_mini_runner PUBLIC executorch optimized_native_cpu_ops_lib xnnpack_backend gflags extension_llm_runner
 )
32 changes: 8 additions & 24 deletions examples/models/phi-3-mini/README.md
@@ -21,33 +21,17 @@ python -m examples.models.phi-3-mini.export_phi-3-mini -c "4k" -s 128 -o phi-3-m
 ```
 3. Build and run the model.
 - Build executorch with optimized CPU performance as follows. Build options available [here](https://github.com/pytorch/executorch/blob/main/CMakeLists.txt#L59).
 ```
-cmake -DPYTHON_EXECUTABLE=python \
-    -DCMAKE_INSTALL_PREFIX=cmake-out \
-    -DEXECUTORCH_ENABLE_LOGGING=1 \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-    -DEXECUTORCH_BUILD_XNNPACK=ON \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-    -Bcmake-out .
+cmake --preset llm -DCMAKE_INSTALL_PREFIX=cmake-out

 cmake --build cmake-out -j16 --target install --config Release
 ```
 - Build Phi-3-mini runner.
 ```
-cmake -DPYTHON_EXECUTABLE=python \
-    -DCMAKE_INSTALL_PREFIX=cmake-out \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_XNNPACK=ON \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -Bcmake-out/examples/models/phi-3-mini \
-    examples/models/phi-3-mini
+cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
+    -DCMAKE_BUILD_TYPE=Release \
+    -Bcmake-out/examples/models/phi-3-mini \
+    examples/models/phi-3-mini

 cmake --build cmake-out/examples/models/phi-3-mini -j16 --config Release
 ```
114 changes: 81 additions & 33 deletions examples/models/phi-3-mini/export_phi-3-mini.py
@@ -19,13 +19,42 @@
     XNNPACKQuantizer,
 )
 from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config
-from executorch.exir import to_edge
+from executorch.exir import to_edge_transform_and_lower
+from executorch.exir.capture._config import ExecutorchBackendConfig
+from executorch.exir.passes import MemoryPlanningPass
+from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
 from torch.export import export_for_training
+from torch.nn.attention import SDPBackend
 from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e

 from transformers import Phi3ForCausalLM
+from transformers.cache_utils import StaticCacheConfig

-from .phi_3_mini import Phi3Mini
+from transformers.integrations.executorch import TorchExportableModuleForDecoderOnlyLM
+
+
+def _prepare_export_inputs(max_seq_len: int, sliding_window: int):
+    """
+    Prepare example inputs and dynamic-shape specifications for export.
+
+    Returns:
+        example_input_ids (torch.Tensor): Example input IDs tensor.
+        example_cache_position (torch.Tensor): Example cache position tensor.
+        dynamic_shapes (dict): Dynamic shape specifications for export.
+    """
+    # Prepare inputs with dynamic shapes
+    seq_length = 3  # Sequence length > 1 to avoid specialization issues
+    example_input_ids = torch.zeros((1, seq_length), dtype=torch.long)
+    example_cache_position = torch.arange(seq_length, dtype=torch.long)
+    max_dim = min(max_seq_len, sliding_window) - 1
+    seq_len_dim = torch.export.Dim("seq_length_dim", max=max_dim)
+    dynamic_shapes = {
+        "input_ids": {1: seq_len_dim},
+        "cache_position": {0: seq_len_dim},
+    }
+
+    return example_input_ids, example_cache_position, dynamic_shapes


 def export(args) -> None:
@@ -40,51 +69,70 @@ def export(args) -> None:
f"Invalid context length {args.context_length}. Should be either 4k or 128k"
)

with torch.no_grad():
model = Phi3Mini(
# pyre-ignore: Undefined attribute [16]: Module `transformers` has no attribute `Phi3ForCausalLM`
model=Phi3ForCausalLM.from_pretrained(model_name),
with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
model = Phi3ForCausalLM.from_pretrained(model_name)
model.generation_config.cache_implementation = "static"
model.generation_config.cache_config = StaticCacheConfig(
batch_size=1, max_cache_len=model.config.max_position_embeddings
)

exportable_module = TorchExportableModuleForDecoderOnlyLM(
model,
max_batch_size=1,
max_seq_len=args.seq_len,
max_cache_len=model.config.max_position_embeddings,
)
example_inputs = (
torch.tensor(
[[1048, 263, 931, 746]], dtype=torch.long, requires_grad=False
),
input_ids, cache_position, dynamic_shapes = _prepare_export_inputs(
model.config.max_position_embeddings, model.config.sliding_window
)
example_inputs = (input_ids, cache_position)
exported_program = exportable_module.export(
input_ids, cache_position, dynamic_shapes, strict=False
)
# Apply RemoveTransposes pass to remove
# any back-to-back transpose ops that are not needed
# e.g. output of update_cache is transposed and
# input to custom_sdpa is transposed.
from executorch.extension.llm.export.export_passes import (
RemoveRedundantTransposes,
)
dynamic_shapes = {
"input_ids": {
1: torch.export.Dim("sequence_length", min=1, max=args.seq_len)
}
}

mutated_gm = RemoveRedundantTransposes()(exported_program.module())[0]

xnnpack_quant_config = get_symmetric_quantization_config(
is_per_channel=True, is_dynamic=True
)
xnnpack_quantizer = XNNPACKQuantizer()
xnnpack_quantizer.set_global(xnnpack_quant_config)

model = export_for_training(
model, example_inputs, dynamic_shapes=dynamic_shapes, strict=True
).module()
model = prepare_pt2e(model, xnnpack_quantizer) # pyre-fixme[6]
model(*example_inputs)
model = convert_pt2e(model)
DuplicateDynamicQuantChainPass()(model)
# TODO(lunwenh): update it to use export once
# https://github.com/pytorch/pytorch/issues/128394 is resolved.
model = torch.export._trace._export(
model,
example_inputs,
dynamic_shapes=dynamic_shapes,
strict=False,
pre_dispatch=False,
gm = prepare_pt2e(mutated_gm, xnnpack_quantizer) # pyre-fixme[6]
gm(*example_inputs)
gm = convert_pt2e(gm)
DuplicateDynamicQuantChainPass()(gm)
exported_program = export_for_training(
gm, example_inputs, dynamic_shapes=dynamic_shapes, strict=False
)

edge_config = get_xnnpack_edge_compile_config()
edge_manager = to_edge(model, compile_config=edge_config)
edge_manager = to_edge_transform_and_lower(
exported_program,
partitioner=[XnnpackPartitioner()],
compile_config=edge_config,
constant_methods={
"get_eos_ids": [32000],
"use_kv_cache": True,
"enable_dynamic_shape": True,
"get_max_seq_len": model.config.max_position_embeddings - 1,
},
)
edge_manager = edge_manager.to_backend(XnnpackPartitioner())
et_program = edge_manager.to_executorch()
et_program = edge_manager.to_executorch(
ExecutorchBackendConfig(
extract_delegate_segments=True,
do_quant_fusion_and_const_prop=True,
memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(),
)
)

with open(args.output_name, "wb") as file:
file.write(et_program.buffer)
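For readers following the new export path, here is a minimal, self-contained sketch of the same pt2e quantize-then-lower flow on a toy module. The import paths follow the ones used in this diff; the toy model, example shapes, output file name, and the omission of dynamic shapes and the transpose-removal pass are illustrative assumptions, not part of the PR.

```python
import torch
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import (
    get_symmetric_quantization_config,
    XNNPACKQuantizer,
)
from executorch.exir import to_edge_transform_and_lower
from torch.export import export_for_training
from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e


class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(16, 16)

    def forward(self, x):
        return self.linear(x)


example_inputs = (torch.randn(1, 16),)

# 1. Capture the graph, then annotate it for dynamic int8 quantization.
gm = export_for_training(TinyModel(), example_inputs, strict=False).module()
quantizer = XNNPACKQuantizer()
quantizer.set_global(
    get_symmetric_quantization_config(is_per_channel=True, is_dynamic=True)
)
gm = prepare_pt2e(gm, quantizer)

# 2. Run one calibration pass (enough for dynamic quantization), then convert
#    the observers into actual quantize/dequantize ops.
gm(*example_inputs)
gm = convert_pt2e(gm)

# 3. Re-export the quantized graph and lower it to the XNNPACK backend.
exported = export_for_training(gm, example_inputs, strict=False)
et_program = to_edge_transform_and_lower(
    exported, partitioner=[XnnpackPartitioner()]
).to_executorch()

with open("tiny_model_xnnpack.pte", "wb") as f:
    f.write(et_program.buffer)
```

The real export above additionally threads `dynamic_shapes` through both export calls and registers `constant_methods` (such as `get_eos_ids` and `get_max_seq_len`) that the runner can read back as metadata from the `.pte` file.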
19 changes: 15 additions & 4 deletions examples/models/phi-3-mini/main.cpp
@@ -6,9 +6,12 @@
  * LICENSE file in the root directory of this source tree.
  */

+#include <executorch/extension/llm/runner/text_llm_runner.h>
 #include <gflags/gflags.h>
+#include <pytorch/tokenizers/llama2c_tokenizer.h>
+#include <iostream>

-#include <executorch/examples/models/phi-3-mini/runner.h>
+using executorch::extension::llm::TextLLMRunner;

 DEFINE_string(
     model_path,
@@ -42,9 +45,17 @@

   int32_t seq_len = FLAGS_seq_len;

-  example::Runner runner(model_path, tokenizer_path, temperature);
-
-  runner.generate(prompt, seq_len);
+  std::unique_ptr<tokenizers::Tokenizer> tokenizer =
+      std::make_unique<tokenizers::Llama2cTokenizer>();
+  tokenizer->load(tokenizer_path);
+  std::cout << "Tokenizer loaded, eos_id = " << tokenizer->eos_tok()
+            << std::endl;
+  auto runner = executorch::extension::llm::create_text_llm_runner(
+      model_path, std::move(tokenizer));
+
+  runner->generate(
+      prompt,
+      {.seq_len = seq_len, .temperature = static_cast<float>(temperature)});

   return 0;
 }
4 changes: 3 additions & 1 deletion examples/models/phi-3-mini/phi_3_mini.py
@@ -30,11 +30,13 @@ def __init__(self, model: Phi3ForCausalLM, max_batch_size: int, max_seq_len: int
     def forward(
         self,
         # pyre-fixme[9]: input_ids has type `LongTensor`; used as `None`.
-        input_ids: torch.LongTensor = None,
+        input_ids: torch.LongTensor,
+        cache_positions: torch.Tensor,
     ) -> torch.FloatTensor:
         # pyre-fixme[16]: `Phi3ForCausalLM` has no attribute `forward`.
         return self.model.forward(
             input_ids=input_ids,
+            cache_positions=cache_positions,
             use_cache=True,
             return_dict=True,
             past_key_values=self.cache,
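For context on the new `cache_positions` argument: positions index into the static KV cache, so a prefill pass supplies positions `0..n-1` for the whole prompt and each decode step supplies the single next position. Below is a runnable toy sketch of that calling pattern; the stand-in model, names, and shapes are illustrative only, not Phi-3-mini.

```python
import torch


class ToyLM(torch.nn.Module):
    """Stand-in with the same (input_ids, cache_positions) interface."""

    def __init__(self, vocab_size: int = 100, dim: int = 16):
        super().__init__()
        self.embed = torch.nn.Embedding(vocab_size, dim)
        self.head = torch.nn.Linear(dim, vocab_size)

    def forward(self, input_ids: torch.LongTensor, cache_positions: torch.Tensor):
        # A real model would write K/V entries at cache_positions; here the
        # positions only offset the embeddings, to keep the sketch self-contained.
        h = self.embed(input_ids) + cache_positions.to(torch.float32).unsqueeze(-1)
        return self.head(h)


model = ToyLM()
prompt = torch.tensor([[11, 22, 33]], dtype=torch.long)

# Prefill: all prompt tokens at cache positions 0..2.
logits = model(prompt, torch.arange(prompt.shape[1]))
next_tok = logits[:, -1].argmax(dim=-1, keepdim=True)

# Decode: one token at a time, each at the next cache position.
for pos in range(prompt.shape[1], prompt.shape[1] + 3):
    logits = model(next_tok, torch.tensor([pos]))
    next_tok = logits[:, -1].argmax(dim=-1, keepdim=True)
```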