     XNNPACKQuantizer,
 )
 from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config
-from executorch.exir import to_edge
+from executorch.exir import to_edge_transform_and_lower
+from executorch.exir.capture._config import ExecutorchBackendConfig
+from executorch.exir.passes import MemoryPlanningPass
+from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
 from torch.export import export_for_training
+from torch.nn.attention import SDPBackend
 from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e

 from transformers import Phi3ForCausalLM
+from transformers.cache_utils import StaticCacheConfig

-from .phi_3_mini import Phi3Mini
+from transformers.integrations.executorch import TorchExportableModuleForDecoderOnlyLM
+
+
+def _prepare_export_inputs(max_seq_len: int, sliding_window: int):
+    """
+    Prepare example inputs and configurations for export.
+
+    Returns:
+        example_input_ids (torch.Tensor): Example input IDs tensor.
+        example_cache_position (torch.Tensor): Example cache position tensor.
+        dynamic_shapes (dict or None): Dynamic shape specifications for export.
+    """
+    # Prepare inputs with dynamic shapes
+    seq_length = 3  # Sequence length > 1 to avoid specialization issues
+    example_input_ids = torch.zeros((1, seq_length), dtype=torch.long)
+    example_cache_position = torch.arange(seq_length, dtype=torch.long)
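+    # Cap the dynamic sequence dimension below both the max context length and the sliding-window size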
+    max_dim = min(max_seq_len, sliding_window) - 1
+    seq_len_dim = torch.export.Dim("seq_length_dim", max=max_dim)
+    dynamic_shapes = {
+        "input_ids": {1: seq_len_dim},
+        "cache_position": {0: seq_len_dim},
+    }
+
+    return example_input_ids, example_cache_position, dynamic_shapes
 
 
 def export(args) -> None:
@@ -40,51 +69,70 @@ def export(args) -> None:
4069 f"Invalid context length { args .context_length } . Should be either 4k or 128k"
4170 )
4271
43- with torch .no_grad ():
44- model = Phi3Mini (
45- # pyre-ignore: Undefined attribute [16]: Module `transformers` has no attribute `Phi3ForCausalLM`
46- model = Phi3ForCausalLM .from_pretrained (model_name ),
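+    # Force the math SDPA backend so attention is traced with decomposed, export-friendly ops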
+    with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
+        model = Phi3ForCausalLM.from_pretrained(model_name)
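+        # Configure generation to use a static (fixed-size) KV cache sized to the model's full context window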
+        model.generation_config.cache_implementation = "static"
+        model.generation_config.cache_config = StaticCacheConfig(
+            batch_size=1, max_cache_len=model.config.max_position_embeddings
+        )
+
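+        # Wrap the model in Hugging Face's export-friendly decoder wrapper, which manages the static KV cache during export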
+        exportable_module = TorchExportableModuleForDecoderOnlyLM(
+            model,
             max_batch_size=1,
-            max_seq_len=args.seq_len,
+            max_cache_len=model.config.max_position_embeddings,
         )
-        example_inputs = (
-            torch.tensor(
-                [[1048, 263, 931, 746]], dtype=torch.long, requires_grad=False
-            ),
+        input_ids, cache_position, dynamic_shapes = _prepare_export_inputs(
+            model.config.max_position_embeddings, model.config.sliding_window
+        )
+        example_inputs = (input_ids, cache_position)
+        exported_program = exportable_module.export(
+            input_ids, cache_position, dynamic_shapes, strict=False
+        )
+        # Apply the RemoveRedundantTransposes pass to remove back-to-back
+        # transpose ops that are not needed, e.g. the output of update_cache
+        # is transposed and the input to custom_sdpa is transposed.
+        from executorch.extension.llm.export.export_passes import (
+            RemoveRedundantTransposes,
         )
-        dynamic_shapes = {
-            "input_ids": {
-                1: torch.export.Dim("sequence_length", min=1, max=args.seq_len)
-            }
-        }
+
+        mutated_gm = RemoveRedundantTransposes()(exported_program.module())[0]
 
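+        # Set up dynamic, per-channel symmetric quantization for the XNNPACK backend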
         xnnpack_quant_config = get_symmetric_quantization_config(
             is_per_channel=True, is_dynamic=True
         )
         xnnpack_quantizer = XNNPACKQuantizer()
         xnnpack_quantizer.set_global(xnnpack_quant_config)

-        model = export_for_training(
-            model, example_inputs, dynamic_shapes=dynamic_shapes, strict=True
-        ).module()
-        model = prepare_pt2e(model, xnnpack_quantizer)  # pyre-fixme[6]
-        model(*example_inputs)
-        model = convert_pt2e(model)
-        DuplicateDynamicQuantChainPass()(model)
-        # TODO(lunwenh): update it to use export once
-        # https://github.com/pytorch/pytorch/issues/128394 is resolved.
-        model = torch.export._trace._export(
-            model,
-            example_inputs,
-            dynamic_shapes=dynamic_shapes,
-            strict=False,
-            pre_dispatch=False,
+        gm = prepare_pt2e(mutated_gm, xnnpack_quantizer)  # pyre-fixme[6]
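+        # Run an example forward pass to calibrate the prepared graph before conversion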
+        gm(*example_inputs)
+        gm = convert_pt2e(gm)
+        DuplicateDynamicQuantChainPass()(gm)
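+        # Re-export the quantized GraphModule so the edge lowering below receives an ExportedProgram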
+        exported_program = export_for_training(
+            gm, example_inputs, dynamic_shapes=dynamic_shapes, strict=False
         )

         edge_config = get_xnnpack_edge_compile_config()
-        edge_manager = to_edge(model, compile_config=edge_config)
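+        # Lower to the edge dialect and delegate supported subgraphs to XNNPACK;
+        # constant_methods expose runner metadata such as the EOS id and max sequence length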
+        edge_manager = to_edge_transform_and_lower(
+            exported_program,
+            partitioner=[XnnpackPartitioner()],
+            compile_config=edge_config,
+            constant_methods={
+                "get_eos_ids": [32000],
+                "use_kv_cache": True,
+                "enable_dynamic_shape": True,
+                "get_max_seq_len": model.config.max_position_embeddings - 1,
+            },
+        )
         edge_manager = edge_manager.to_backend(XnnpackPartitioner())
-        et_program = edge_manager.to_executorch()
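+        # extract_delegate_segments stores XNNPACK payloads as separate segments in the .pte,
+        # alloc_graph_input=False leaves input buffers to the caller, and the constraint-based
+        # shape pass sizes dynamic tensors from the export-time upper bounds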
+        et_program = edge_manager.to_executorch(
+            ExecutorchBackendConfig(
+                extract_delegate_segments=True,
+                do_quant_fusion_and_const_prop=True,
+                memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
+                sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(),
+            )
+        )

     with open(args.output_name, "wb") as file:
         file.write(et_program.buffer)