diff --git a/.ci/docker/ci_commit_pins/optimum-executorch.txt b/.ci/docker/ci_commit_pins/optimum-executorch.txt index 30b9427824f..4cf99a4f78e 100644 --- a/.ci/docker/ci_commit_pins/optimum-executorch.txt +++ b/.ci/docker/ci_commit_pins/optimum-executorch.txt @@ -1 +1 @@ -828ae02053a6e0e20a2dfd6e737ba10c6f4dee6b +bd06b54e627fbfd354a2cffa4c80fb21883209a9 diff --git a/.ci/scripts/test_huggingface_optimum_model.py b/.ci/scripts/test_huggingface_optimum_model.py index 05b25299522..e5d815cfc00 100644 --- a/.ci/scripts/test_huggingface_optimum_model.py +++ b/.ci/scripts/test_huggingface_optimum_model.py @@ -43,7 +43,9 @@ def cli_export(command, model_dir): def check_causal_lm_output_quality( - model_id: str, generated_tokens: List[int], max_perplexity_threshold: float = 100.0 + model_id: str, + generated_tokens: List[int], + max_perplexity_threshold: float = 100.0, ): """ Evaluates the quality of text generated by a causal language model by calculating its perplexity. @@ -58,12 +60,24 @@ def check_causal_lm_output_quality( """ logging.info(f"Starting perplexity check with model '{model_id}' ...") # Load model - model = AutoModelForCausalLM.from_pretrained( - model_id, - low_cpu_mem_usage=True, - use_cache=False, - torch_dtype=torch.bfloat16, - ) + cls_name = AutoModelForCausalLM + if "llava" in model_id: + from transformers import LlavaForConditionalGeneration + + cls_name = LlavaForConditionalGeneration + try: + model = cls_name.from_pretrained( + model_id, + low_cpu_mem_usage=True, + use_cache=False, + torch_dtype=torch.bfloat16, + ) + except TypeError: + model = cls_name.from_pretrained( + model_id, + low_cpu_mem_usage=True, + torch_dtype=torch.bfloat16, + ) with torch.no_grad(): outputs = model(input_ids=generated_tokens, labels=generated_tokens) @@ -156,6 +170,86 @@ def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only assert check_causal_lm_output_quality(model_id, generated_tokens) is True +def test_llm_with_image_modality( + model_id, model_dir, recipe, *, quantize=True, run_only=False +): + command = [ + "optimum-cli", + "export", + "executorch", + "--model", + model_id, + "--task", + "multimodal-text-to-text", + "--recipe", + recipe, + "--output_dir", + model_dir, + "--use_custom_sdpa", + "--use_custom_kv_cache", + "--qlinear", + "8da4w", + "--qembedding", + "8w", + ] + if not run_only: + cli_export(command, model_dir) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer.save_pretrained(model_dir) + + # input + processor = AutoProcessor.from_pretrained(model_id) + image_url = "https://llava-vl.github.io/static/images/view.jpg" + conversation = [ + { + "role": "system", + "content": [ + { + "type": "text", + "text": "A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.", + } + ], + }, + { + "role": "user", + "content": [ + {"type": "image", "url": image_url}, + { + "type": "text", + "text": "What are the things I should be cautious about when I visit here?", + }, + ], + }, + ] + inputs = processor.apply_chat_template( + conversation, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + ) + + from executorch.extension.llm.runner import GenerationConfig, MultimodalRunner + + runner = MultimodalRunner(f"{model_dir}/model.pte", f"{model_dir}/tokenizer.model") + generated_text = runner.generate_text_hf( + inputs, + GenerationConfig(max_new_tokens=128, temperature=0, echo=False), + processor.image_token_id, + ) + print(f"\nGenerated text:\n\t{generated_text}") + # Free memory before loading eager for quality check + del runner + gc.collect() + assert ( + check_causal_lm_output_quality( + model_id, tokenizer.encode(generated_text, return_tensors="pt") + ) + is True + ) + + def test_fill_mask(model_id, model_dir, recipe, *, quantize=True, run_only=False): command = [ "optimum-cli", @@ -353,6 +447,9 @@ def test_vit(model_id, model_dir, recipe, *, quantize=False, run_only=False): required=False, help="When provided, write the pte file to this directory. Otherwise, a temporary directory is created for the test.", ) + parser.add_argument( + "--run_only", action="store_true", help="Skip export and only run the test" + ) args = parser.parse_args() _text_generation_mapping = { @@ -384,8 +481,16 @@ def test_vit(model_id, model_dir, recipe, *, quantize=False, run_only=False): "vit": ("google/vit-base-patch16-224", test_vit), } + _multimodal_model_mapping = { + "gemma3-4b": ("google/gemma-3-4b-it", test_llm_with_image_modality), + "llava": ("llava-hf/llava-1.5-7b-hf", test_llm_with_image_modality), + } + model_to_model_id_and_test_function = ( - _text_generation_mapping | _mask_fill_mapping | _misc_model_mapping + _text_generation_mapping + | _mask_fill_mapping + | _misc_model_mapping + | _multimodal_model_mapping ) if args.model not in model_to_model_id_and_test_function: @@ -400,4 +505,5 @@ def test_vit(model_id, model_dir, recipe, *, quantize=False, run_only=False): model_dir=tmp_dir if args.model_dir is None else args.model_dir, recipe=args.recipe, quantize=args.quantize, + run_only=args.run_only, ) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index f372be0e46f..4215db1e2ca 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -286,15 +286,20 @@ jobs: # Test selective build PYTHON_EXECUTABLE=python bash examples/selective_build/test_selective_build.sh "${BUILD_TOOL}" - test-llava-runner-linux: - name: test-llava-runner-linux + test-multimodal-linux: + if: ${{ !github.event.pull_request.head.repo.fork }} + name: test-multimodal-linux uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: id-token: write contents: read + secrets: inherit strategy: fail-fast: false + matrix: + model: ["gemma3-4b"] # llava gives segfault so not covering. 
with: + secrets-env: EXECUTORCH_HF_TOKEN runner: linux.24xlarge docker-image: ci-image:executorch-ubuntu-22.04-clang12 submodules: 'recursive' @@ -305,17 +310,20 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" + echo "::group::Setup ExecuTorch" PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake" - - # install Llava requirements - bash examples/models/llama/install_requirements.sh - bash examples/models/llava/install_requirements.sh - - # run python unittest - python -m unittest examples.models.llava.test.test_llava - - # run e2e (export, tokenizer and runner) - PYTHON_EXECUTABLE=python bash .ci/scripts/test_llava.sh + echo "::endgroup::" + + echo "::group::Setup Huggingface" + pip install -U "huggingface_hub[cli]" accelerate + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) + pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + echo "::endgroup::" + + echo "::group::Test ${{ matrix.model }}" + python .ci/scripts/test_huggingface_optimum_model.py --model ${{ matrix.model }} --quantize --recipe xnnpack + echo "::endgroup::" test-moshi-linux: name: test-moshi-linux diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 629c84847f6..362df17dc9b 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -616,34 +616,45 @@ jobs: bash .ci/scripts/test_torchao_huggingface_checkpoints.sh ${{ matrix.model }} ${{ matrix.test_with_runner && '--test_with_runner' || '' }} - # # TODO(jackzhxng): Runner consistently runs out of memory before test finishes. Try to find a more powerful runner. - # test-llava-runner-macos: - # name: test-llava-runner-macos - # uses: pytorch/test-infra/.github/workflows/macos_job.yml@main - # strategy: - # fail-fast: false - # with: - # runner: macos-14-xlarge - # python-version: '3.11' - # submodules: 'recursive' - # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # timeout: 900 - # script: | - # BUILD_TOOL=cmake - - # bash .ci/scripts/setup-conda.sh - # # Setup MacOS dependencies as there is no Docker support on MacOS atm - # GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}" - - # # install Llava requirements - # ${CONDA_RUN} bash examples/models/llama/install_requirements.sh - # ${CONDA_RUN} bash examples/models/llava/install_requirements.sh - - # # run python unittest - # ${CONDA_RUN} python -m unittest examples.models.llava.test.test_llava - - # # run e2e (export, tokenizer and runner) - # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llava.sh + test-multimodal-macos: + if: ${{ !github.event.pull_request.head.repo.fork }} + name: test-multimodal-macos + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + permissions: + id-token: write + contents: read + secrets: inherit + strategy: + fail-fast: false + matrix: + model: ["gemma3-4b"] # llava gives segfault so not covering. 
+ with: + secrets-env: EXECUTORCH_HF_TOKEN + runner: macos-15-xlarge + python-version: '3.11' + submodules: 'recursive' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 90 + script: | + echo "::group::Set up ExecuTorch" + bash .ci/scripts/setup-conda.sh + eval "$(conda shell.bash hook)" + + # Install requirements + ${CONDA_RUN} python install_executorch.py + echo "::endgroup::" + + echo "::group::Set up Huggingface" + ${CONDA_RUN} pip install -U "huggingface_hub[cli]" accelerate + ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) + ${CONDA_RUN} pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + ${CONDA_RUN} pip list + echo "::endgroup::" + + echo "::group::Test ${{ matrix.model }}" + ${CONDA_RUN} python .ci/scripts/test_huggingface_optimum_model.py --model ${{ matrix.model }} --quantize --recipe xnnpack + echo "::endgroup::" test-qnn-model: name: test-qnn-model diff --git a/CMakeLists.txt b/CMakeLists.txt index e419a45a879..0fbd77aeec7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -650,15 +650,6 @@ if(EXECUTORCH_BUILD_EXTENSION_LLM) list(APPEND _executorch_extensions tokenizers) endif() -if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner) - list(APPEND _executorch_extensions extension_llm_runner) -endif() - -if(EXECUTORCH_BUILD_EXTENSION_LLM_APPLE) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/apple) -endif() - if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util) install( @@ -904,6 +895,15 @@ if(EXECUTORCH_BUILD_EXTENSION_TRAINING) list(APPEND _executorch_extensions extension_training) endif() +if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner) + list(APPEND _executorch_extensions extension_llm_runner) +endif() + +if(EXECUTORCH_BUILD_EXTENSION_LLM_APPLE) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/apple) +endif() + if(EXECUTORCH_BUILD_KERNELS_LLM) # TODO: move all custom kernels to ${CMAKE_CURRENT_SOURCE_DIR}/kernels/custom add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/custom_ops) diff --git a/examples/models/llava/install_requirements.sh b/examples/models/llava/install_requirements.sh index 4dcdeea83bf..9dfccf11600 100755 --- a/examples/models/llava/install_requirements.sh +++ b/examples/models/llava/install_requirements.sh @@ -7,9 +7,4 @@ set -x -pip install transformers accelerate sentencepiece tiktoken - -# Run llama2/install requirements for torchao deps -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) - -bash "$SCRIPT_DIR"/../llama/install_requirements.sh +pip install git+https://github.com/huggingface/optimum-executorch.git@d4d3046738ca31b5542506aaa76a28d540600227 diff --git a/examples/models/llava/main.cpp b/examples/models/llava/main.cpp index 3946a629ade..635fd7888d2 100644 --- a/examples/models/llava/main.cpp +++ b/examples/models/llava/main.cpp @@ -131,8 +131,7 @@ int32_t main(int32_t argc, char** argv) { #endif // Load tokenizer std::unique_ptr<::tokenizers::Tokenizer> tokenizer = - std::make_unique(); - tokenizer->load(tokenizer_path); + ::executorch::extension::llm::load_tokenizer(tokenizer_path); if (tokenizer == nullptr) { ET_LOG(Error, "Failed to load tokenizer from: %s", tokenizer_path); return 1; diff --git 
a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt index cf8983db1fb..8d280b4eaf9 100644 --- a/extension/llm/runner/CMakeLists.txt +++ b/extension/llm/runner/CMakeLists.txt @@ -79,3 +79,43 @@ install( if(BUILD_TESTING) add_subdirectory(test) endif() + +# Python bindings for MultimodalRunner +if(EXECUTORCH_BUILD_PYBIND) + # Create the Python extension module for LLM runners + pybind11_add_module( + _llm_runner SHARED ${CMAKE_CURRENT_SOURCE_DIR}/pybindings.cpp + ) + + find_package_torch() + find_library( + TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib" + ) + # Link with the extension_llm_runner library and its dependencies + target_link_libraries( + _llm_runner PRIVATE extension_llm_runner tokenizers::tokenizers + portable_lib ${TORCH_PYTHON_LIBRARY} ${TORCH_LIBRARIES} + ) + + # Set properties for the Python extension + set_target_properties( + _llm_runner + PROPERTIES POSITION_INDEPENDENT_CODE ON + CXX_VISIBILITY_PRESET "hidden" + INTERPROCEDURAL_OPTIMIZATION TRUE + ) + if(APPLE) + set(RPATH "@loader_path/../../pybindings") + else() + set(RPATH "$ORIGIN/../../pybindings") + endif() + set_target_properties(_llm_runner PROPERTIES INSTALL_RPATH ${RPATH}) + # Add include directories + target_include_directories( + _llm_runner PRIVATE ${_common_include_directories} ${TORCH_INCLUDE_DIRS} + ) + + install(TARGETS _llm_runner + LIBRARY DESTINATION executorch/extension/llm/runner + ) +endif() diff --git a/extension/llm/runner/README.md b/extension/llm/runner/README.md index ab8ec8964dd..0bede23a228 100644 --- a/extension/llm/runner/README.md +++ b/extension/llm/runner/README.md @@ -164,6 +164,301 @@ int main() { } ``` +## Python API + +The LLM Runner framework provides Python bindings for easy integration with Python applications. The Python API mirrors the C++ interface while providing Pythonic convenience features like torch tensor support and Hugging Face compatibility. 
+ +### Installation + +Build the Python bindings as part of the ExecuTorch build: + +```bash +# Build from source with Python bindings enabled: +# In executorch root directory +bash install_executorch.sh +``` + +### Quick Start Examples + +#### Basic Multimodal Generation + +```python +from executorch.extension.llm.runner import ( + GenerationConfig, MultimodalRunner, + make_text_input, make_image_input, make_audio_input +) +import torch + +# Create a multimodal runner +runner = MultimodalRunner( + model_path="/path/to/model.pte", + tokenizer_path="/path/to/tokenizer.bin" +) + +# Create multimodal inputs +inputs = [] +inputs.append(make_text_input("What do you see in this image?")) + +# Add image from torch tensor (supports both CHW and HWC formats) +image_tensor = torch.randint(0, 255, (3, 224, 224), dtype=torch.uint8) # CHW format +inputs.append(make_image_input(image_tensor)) + +# Configure generation +config = GenerationConfig( + max_new_tokens=100, + temperature=0.7, + echo=False +) + +# Generate with streaming output +def token_callback(token: str): + print(token, end='', flush=True) + +def stats_callback(stats): + print(f"\n[Stats] Generated {stats.num_generated_tokens} tokens") + inference_time = stats.inference_end_ms - stats.inference_start_ms + if inference_time > 0: + tokens_per_sec = stats.num_generated_tokens * 1000 / inference_time + print(f"[Stats] Speed: {tokens_per_sec:.1f} tokens/sec") + +runner.generate(inputs, config, token_callback, stats_callback) +``` + +#### Working with Different Input Types + +```python +from executorch.extension.llm.runner import ( + MultimodalRunner, GenerationConfig, + make_text_input, make_token_input, make_image_input, + make_audio_input, make_raw_audio_input +) +import torch + +runner = MultimodalRunner("model.pte", "tokenizer.bin") + +# 1. Text input +text_input = make_text_input("Analyze this multimodal content:") + +# 2. Pre-tokenized input (useful for chat templates) +token_ids = [1, 15043, 445, 2420] # Example token IDs +token_input = make_token_input(token_ids) + +# 3. Image input from torch tensor +# Supports multiple formats: (H,W,C), (C,H,W), (1,H,W,C), (1,C,H,W) +image_hwc = torch.randint(0, 255, (224, 224, 3), dtype=torch.uint8) # HWC +image_input = make_image_input(image_hwc) + +# Float tensors also supported for normalized images +image_float = torch.rand(3, 224, 224, dtype=torch.float32) # CHW, normalized +image_input_float = make_image_input(image_float) + +# 4. Preprocessed audio input (e.g., mel spectrograms) +audio_features = torch.rand(1, 80, 100, dtype=torch.float32) # (batch, n_bins, n_frames) +audio_input = make_audio_input(audio_features) + +# 5. 
Raw audio input (for models with built-in audio processing) +raw_audio = torch.randint(0, 255, (1, 1, 16000), dtype=torch.uint8) # (batch, channels, samples) +raw_audio_input = make_raw_audio_input(raw_audio) + +# Combine inputs and generate +inputs = [text_input, image_input, audio_input] +config = GenerationConfig(max_new_tokens=50, temperature=0.8) +response = runner.generate_text(inputs, config) +print(f"Response: {response}") +``` + +#### Hugging Face Integration + +```python +from executorch.extension.llm.runner import MultimodalRunner, GenerationConfig +from transformers import AutoProcessor +from PIL import Image +import torch + +# Load HF processor for your model +processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") + +# Create runner +runner = MultimodalRunner("llava_model.pte", "tokenizer.bin") + +# Process inputs with HF processor +image = Image.open("photo.jpg") +conversation = [ + {"role": "user", "content": [ + {"type": "text", "text": "What's in this image?"}, + {"type": "image"} + ]} +] + +# Apply chat template and process +prompt = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True) +inputs_hf = processor(prompt, image, return_tensors="pt") + +# Generate using HF inputs directly +config = GenerationConfig(max_new_tokens=100, temperature=0.7) +runner.generate_hf( + inputs_hf, + config, + image_token_id=processor.tokenizer.convert_tokens_to_ids(""), + token_callback=lambda token: print(token, end='', flush=True) +) +``` + +#### Chat Session with State Management + +```python +from executorch.extension.llm.runner import MultimodalRunner, GenerationConfig, make_text_input + +class ChatSession: + def __init__(self, model_path: str, tokenizer_path: str): + self.runner = MultimodalRunner(model_path, tokenizer_path) + self.config = GenerationConfig(max_new_tokens=150, temperature=0.7, echo=False) + + def send_message(self, message: str) -> str: + """Send a message and get response""" + inputs = [make_text_input(message)] + response = self.runner.generate_text(inputs, self.config) + return response + + def send_multimodal(self, text: str, image_tensor: torch.Tensor) -> str: + """Send text + image and get response""" + inputs = [ + make_text_input(text), + make_image_input(image_tensor) + ] + response = self.runner.generate_text(inputs, self.config) + return response + + def reset_conversation(self): + """Reset the conversation state""" + self.runner.reset() + +# Usage +chat = ChatSession("model.pte", "tokenizer.bin") +print(chat.send_message("Hello! 
How are you?")) + +# Continue conversation (KV cache maintains context) +print(chat.send_message("What's the weather like?")) + +# Reset when starting new conversation +chat.reset_conversation() +``` + +### Python API Classes + +#### GenerationConfig +```python +from executorch.extension.llm.runner import GenerationConfig + +# Create with defaults +config = GenerationConfig() + +# Or specify parameters +config = GenerationConfig( + max_new_tokens=100, # Maximum tokens to generate (-1 = auto) + temperature=0.8, # Sampling temperature (0.0 = deterministic) + echo=True, # Echo input prompt in output + seq_len=2048, # Maximum sequence length (-1 = auto) + num_bos=0, # Number of BOS tokens + num_eos=0 # Number of EOS tokens +) + +# Modify after creation +config.temperature = 0.5 +config.max_new_tokens = 50 +``` + +#### MultimodalInput Types +```python +from executorch.extension.llm.runner import ( + MultimodalInput, make_text_input, make_token_input, + make_image_input, make_audio_input +) + +# Text input +text_input = make_text_input("Hello, world!") +print(text_input.is_text()) # True +print(text_input.get_text()) # "Hello, world!" + +# Token input (pre-tokenized) +token_input = make_token_input([1, 2, 3, 4]) +print(token_input.is_tokens()) # True +print(token_input.get_tokens()) # [1, 2, 3, 4] + +# Image input from torch tensor +import torch +image_tensor = torch.randint(0, 255, (224, 224, 3), dtype=torch.uint8) +image_input = make_image_input(image_tensor) +print(image_input.is_image()) # True +image = image_input.get_image() +print(f"Image: {image.width}x{image.height}x{image.channels}") + +# Check input types safely +if text_input.is_text(): + text = text_input.get_text() +elif text_input.is_image(): + image = text_input.get_image() +``` + +#### Stats and Performance Monitoring +```python +def detailed_stats_callback(stats): + """Comprehensive stats monitoring""" + print(f"\n=== Generation Statistics ===") + print(f"Prompt tokens: {stats.num_prompt_tokens}") + print(f"Generated tokens: {stats.num_generated_tokens}") + + # Timing breakdown + model_load_time = stats.model_load_end_ms - stats.model_load_start_ms + if model_load_time > 0: + print(f"Model load time: {model_load_time}ms") + + inference_time = stats.inference_end_ms - stats.inference_start_ms + if inference_time > 0: + print(f"Total inference time: {inference_time}ms") + + # Calculate throughput + tokens_per_sec = stats.num_generated_tokens * 1000 / inference_time + print(f"Generation speed: {tokens_per_sec:.1f} tokens/sec") + + # Time to first token + if stats.first_token_ms > stats.inference_start_ms: + ttft = stats.first_token_ms - stats.inference_start_ms + print(f"Time to first token: {ttft}ms") + + # Export to JSON for logging + json_stats = stats.to_json_string() + print(f"JSON stats: {json_stats}") + +# Use in generation +runner.generate(inputs, config, token_callback, detailed_stats_callback) +``` + +### Error Handling + +```python +from executorch.extension.llm.runner import MultimodalRunner, GenerationConfig +import torch + +try: + runner = MultimodalRunner("model.pte", "tokenizer.bin") + + # Invalid image tensor will raise RuntimeError + invalid_image = torch.rand(2, 224, 224, 3) # Wrong number of dimensions + inputs = [make_image_input(invalid_image)] + + config = GenerationConfig(max_new_tokens=50) + runner.generate_text(inputs, config) + +except RuntimeError as e: + print(f"Generation failed: {e}") + +except FileNotFoundError as e: + print(f"Model or tokenizer file not found: {e}") +``` + +For more C++ API 
documentation and implementation details, see the [Core Components](#core-components) section below. + ## Core Components ### Component Architecture diff --git a/extension/llm/runner/__init__.py b/extension/llm/runner/__init__.py new file mode 100644 index 00000000000..f62d62d3429 --- /dev/null +++ b/extension/llm/runner/__init__.py @@ -0,0 +1,235 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Python bindings for ExecuTorch MultimodalRunner. + +This module provides a Python interface to the ExecuTorch multimodal LLM runner, +enabling processing of mixed inputs (text, images, audio) and text generation. +""" + +try: + # Import shared components from the compiled C++ extension + from executorch.extension.llm.runner._llm_runner import ( # noqa: F401 + GenerationConfig, + Image, + make_audio_input, + make_image_input, + make_raw_audio_input, + make_text_input, + make_token_input, + MultimodalInput, + MultimodalRunner, + Stats, + ) +except ImportError: + raise RuntimeError( + "LLM runner is not installed. Please build ExecuTorch from source with EXECUTORCH_BUILD_PYBIND=ON" + ) + + +import logging +from typing import Callable, List, Optional, Union + +import torch +from transformers.feature_extraction_utils import BatchFeature + + +def _find_image_token_runs( + input_ids: torch.Tensor, image_token_id: Optional[int] +) -> List[tuple[int, int, int]]: + """Return contiguous runs (start, end, length) of image_token_id in input_ids. + + input_ids must be a 1D torch.Tensor. If image_token_id is None, returns an empty list. + """ + if image_token_id is None: + return [] + + ids_list = input_ids.tolist() + runs: List[tuple[int, int, int]] = [] + i = 0 + L = len(ids_list) + while i < L: + if ids_list[i] == image_token_id: + j = i + while j < L and ids_list[j] == image_token_id: + j += 1 + runs.append((i, j - 1, j - i)) + i = j + else: + i += 1 + + return runs + + +def _hf_to_multimodal_inputs( # noqa: C901 + inputs: BatchFeature, image_token_id: Optional[int] = None +) -> List[MultimodalInput]: + """Convert a HuggingFace AutoProcessor dict to ExecuTorch MultimodalInputs. + Currently only support 1 image inside the input. + + Args: + - inputs: A BatchFeature containing the input data. + - image_token_id: The token ID for the image, if present. + + `inputs` expected keys: + - 'input_ids': torch.Tensor of shape (L,) or (1, L) + - Optional 'pixel_values': torch.Tensor; if present, must also provide + 'image_token_id' (or alias 'image_token_index') and there must be + exactly one image token occurrence in input_ids. + + Raises: + RuntimeError: missing keys, invalid shapes/dtypes, or unsupported cases. 
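+
+    Example (illustrative only; the token IDs below are hypothetical and assume
+    image_token_id == 32000):
+        input_ids = [1, 32000, 32000, 9047] with pixel_values of shape (1, 3, 336, 336)
+        -> [make_token_input([1]),
+            make_image_input(pixel_values),
+            make_token_input([9047])]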
+ """ + if "input_ids" not in inputs: + raise RuntimeError("HF inputs dict must contain 'input_ids' (torch.Tensor)") + + input_ids = inputs["input_ids"] + if not isinstance(input_ids, torch.Tensor): + raise RuntimeError("'input_ids' must be a torch.Tensor") + + if input_ids.dim() == 2: + if input_ids.size(0) != 1: + raise RuntimeError( + "Expected 'input_ids' with batch size 1 when 2D (shape (1, L))" + ) + input_ids = input_ids.squeeze(0) + if input_ids.dim() != 1: + raise RuntimeError("'input_ids' must be 1D (L) or 2D with batch size 1") + + has_pixel_values = "pixel_values" in inputs + + # If pixel_values in dict, require image_token_id + if has_pixel_values and image_token_id is None: + raise RuntimeError("'pixel_values' provided but missing 'image_token_id'") + + # If there are image token ids but no pixel_values, it's an error + if ( + image_token_id is not None + and (input_ids == image_token_id).any().item() + and not has_pixel_values + ): + raise RuntimeError( + "Found image token(s) in input_ids but 'pixel_values' not provided" + ) + + # No images: return a single tokens input + if not has_pixel_values: + return [make_token_input(input_ids.to(torch.long).tolist())] + + # Determine number of images from pixel_values shape + pv = inputs["pixel_values"] + if not isinstance(pv, torch.Tensor): + raise RuntimeError( + "'pixel_values' must be a torch.Tensor, run with `return_tensors='pt'` in HF processor" + ) + if pv.dim() == 4: + num_images = int(pv.size(0)) + elif pv.dim() == 3: + num_images = 1 + else: + raise RuntimeError( + f"'pixel_values' must be 3D (C,H,W) or 4D (N,C,H,W)/(N,H,W,C), got shape {pv.shape}" + ) + + # Only support batch size 1 for now: + if num_images != 1: + raise RuntimeError("Only 1 image is supported for now") + # Find contiguous runs of image_token_id in input_ids + runs = _find_image_token_runs(input_ids, image_token_id) + + if len(runs) == 0: + raise RuntimeError( + "'pixel_values' provided but no occurrence of 'image_token_id' in input_ids" + ) + + # Support only one image/run for now; enforce exact match + if num_images != 1 or len(runs) != 1: + raise RuntimeError( + f"Mismatch between images and image token runs: images={num_images}, runs={len(runs)} (only batch=1 and a single contiguous run are supported)" + ) + + first, last, _ = runs[0] + + combined: List[MultimodalInput] = [] + if first > 0: + combined.append(make_token_input(input_ids[:first].to(torch.long).tolist())) + + # Use C++ checked creator for images (handles 3D/4D, CHW/HWC, uint8/float32) + combined.append(make_image_input(inputs["pixel_values"])) + + if (last + 1) < input_ids.numel(): + combined.append(make_token_input(input_ids[last + 1 :].to(torch.long).tolist())) + + return combined + + +def generate_hf( + runner: MultimodalRunner, + inputs: Union[BatchFeature, List[MultimodalInput]], + config: GenerationConfig, + image_token_id: Optional[int] = None, + token_callback: Optional[Callable[[str], None]] = None, + stats_callback: Optional[Callable[[Stats], None]] = None, +) -> None: + """Generate using an BatchFeature by converting to multimodal inputs internally, or using a list of MultimodalInput.""" + if isinstance(inputs, BatchFeature): + logging.info( + "Input is a BatchFeature, assuming it's coming from HF AutoProcessor.apply_chat_template(). Converting to multimodal inputs." 
+ ) + converted = _hf_to_multimodal_inputs(inputs, image_token_id=image_token_id) + elif isinstance(inputs, list) and all( + isinstance(i, MultimodalInput) for i in inputs + ): + converted = inputs + else: + raise RuntimeError( + "inputs must be either a BatchFeature (from HF AutoProcessor) or a list of MultimodalInput" + ) + + runner.generate(converted, config, token_callback, stats_callback) + + +def generate_text_hf( + runner: MultimodalRunner, + inputs: Union[BatchFeature, List[MultimodalInput]], + config: GenerationConfig, + image_token_id: Optional[int] = None, +) -> str: + """Generate using an BatchFeature by converting to multimodal inputs internally, or using a list of MultimodalInput.""" + if isinstance(inputs, BatchFeature): + logging.info( + "Input is a BatchFeature, assuming it's coming from HF AutoProcessor.apply_chat_template(). Converting to multimodal inputs." + ) + converted = _hf_to_multimodal_inputs(inputs, image_token_id=image_token_id) + elif isinstance(inputs, list) and all( + isinstance(i, MultimodalInput) for i in inputs + ): + converted = inputs + else: + raise RuntimeError( + "inputs must be either a BatchFeature (from HF AutoProcessor) or a list of MultimodalInput" + ) + + return runner.generate_text(converted, config) + + +setattr(MultimodalRunner, "generate_hf", generate_hf) # noqa B010 +setattr(MultimodalRunner, "generate_text_hf", generate_text_hf) # noqa B010 + + +__all__ = [ + "GenerationConfig", + "Image", + "make_audio_input", + "make_image_input", + "make_raw_audio_input", + "make_text_input", + "make_token_input", + "MultimodalInput", + "MultimodalRunner", + "Stats", +] diff --git a/extension/llm/runner/_llm_runner.pyi b/extension/llm/runner/_llm_runner.pyi new file mode 100644 index 00000000000..295601b092c --- /dev/null +++ b/extension/llm/runner/_llm_runner.pyi @@ -0,0 +1,523 @@ +""" +Type stubs for _llm_runner module. + +This file provides type annotations for the ExecuTorch LLM Runner Python bindings. +""" + +from typing import Callable, List, Optional, overload + +import torch + +class GenerationConfig: + """Configuration for text generation.""" + + echo: bool + """Whether to echo the input prompt in the output.""" + + max_new_tokens: int + """Maximum number of new tokens to generate (-1 for auto).""" + + warming: bool + """Whether this is a warmup run (affects perf benchmarking).""" + + seq_len: int + """Maximum number of total tokens (-1 for auto).""" + + temperature: float + """Temperature for sampling (higher = more random).""" + + num_bos: int + """Number of BOS tokens to add to the prompt.""" + + num_eos: int + """Number of EOS tokens to add to the prompt.""" + + def __init__( + self, + *, + echo: bool = True, + max_new_tokens: int = -1, + warming: bool = False, + seq_len: int = -1, + temperature: float = 0.8, + num_bos: int = 0, + num_eos: int = 0, + ) -> None: + """Initialize GenerationConfig with optional keyword arguments for all fields.""" + ... + + def resolve_max_new_tokens( + self, max_context_len: int, num_prompt_tokens: int + ) -> int: + """ + Resolve the maximum number of new tokens to generate based on constraints. + + Args: + max_context_len: The maximum context length supported by the model + num_prompt_tokens: The number of tokens in the input prompt + + Returns: + The resolved maximum number of new tokens to generate + """ + ... + + def __repr__(self) -> str: ... 
+ +class Stats: + """Statistics for LLM generation performance.""" + + SCALING_FACTOR_UNITS_PER_SECOND: int + """Scaling factor for timestamps (1000 for milliseconds).""" + + model_load_start_ms: int + """Start time of model loading in milliseconds.""" + + model_load_end_ms: int + """End time of model loading in milliseconds.""" + + inference_start_ms: int + """Start time of inference in milliseconds.""" + + token_encode_end_ms: int + """End time of tokenizer encoding in milliseconds.""" + + model_execution_start_ms: int + """Start time of model execution in milliseconds.""" + + model_execution_end_ms: int + """End time of model execution in milliseconds.""" + + prompt_eval_end_ms: int + """End time of prompt evaluation in milliseconds.""" + + first_token_ms: int + """Timestamp when the first generated token is emitted.""" + + inference_end_ms: int + """End time of inference/generation in milliseconds.""" + + aggregate_sampling_time_ms: int + """Total time spent in sampling across all tokens.""" + + num_prompt_tokens: int + """Number of tokens in the input prompt.""" + + num_generated_tokens: int + """Number of tokens generated.""" + + def on_sampling_begin(self) -> None: + """Mark the beginning of a sampling operation.""" + ... + + def on_sampling_end(self) -> None: + """Mark the end of a sampling operation.""" + ... + + def reset(self, all_stats: bool = False) -> None: + """ + Reset statistics. + + Args: + all_stats: If True, reset all stats including model load times. + If False, preserve model load times. + """ + ... + + def to_json_string(self) -> str: + """Convert stats to JSON string representation.""" + ... + + def __repr__(self) -> str: ... + +class Image: + """Container for image data.""" + + @overload + def __init__(self) -> None: + """Initialize an empty Image.""" + ... + + @overload + def __init__(self, data: List[int], width: int, height: int, channels: int) -> None: + """Initialize an Image with uint8 data.""" + ... + + @overload + def __init__( + self, data: List[float], width: int, height: int, channels: int + ) -> None: + """Initialize an Image with float data.""" + ... + + def is_uint8(self) -> bool: + """Check if image data is uint8 format.""" + ... + + def is_float(self) -> bool: + """Check if image data is float format.""" + ... + + @property + def width(self) -> int: + """Image width in pixels.""" + ... + + @property + def height(self) -> int: + """Image height in pixels.""" + ... + + @property + def channels(self) -> int: + """Number of color channels (3 for RGB, 4 for RGBA).""" + ... + + @property + def uint8_data(self) -> List[int]: + """Raw image data as uint8 values.""" + ... + + @property + def float_data(self) -> List[float]: + """Raw image data as float values.""" + ... + + def __repr__(self) -> str: ... + +class Audio: + """Container for preprocessed audio data.""" + + data: List[int] + """Raw audio data as a list of uint8 values.""" + + batch_size: int + """Batch size of the audio data.""" + + n_bins: int + """Number of frequency bins (for spectrograms).""" + + n_frames: int + """Number of time frames.""" + + @overload + def __init__(self) -> None: + """Initialize an empty Audio.""" + ... + + @overload + def __init__( + self, data: List[int], batch_size: int, n_bins: int, n_frames: int + ) -> None: + """Initialize Audio with preprocessed data.""" + ... + + def __repr__(self) -> str: ... 
+ +class RawAudio: + """Container for raw audio data.""" + + data: List[int] + """Raw audio data as a list of uint8 values.""" + + batch_size: int + """Batch size of the audio data.""" + + n_channels: int + """Number of audio channels (1 for mono, 2 for stereo).""" + + n_samples: int + """Number of audio samples.""" + + @overload + def __init__(self) -> None: + """Initialize an empty RawAudio.""" + ... + + @overload + def __init__( + self, data: List[int], batch_size: int, n_channels: int, n_samples: int + ) -> None: + """Initialize RawAudio with raw data.""" + ... + + def __repr__(self) -> str: ... + +class MultimodalInput: + """Container for multimodal input data (text, image, audio, etc.).""" + + @overload + def __init__(self, text: str) -> None: + """ + Create a MultimodalInput with text. + + Args: + text: The input text string + """ + ... + + @overload + def __init__(self, image: Image) -> None: + """ + Create a MultimodalInput with an image. + + Args: + image: The input image + """ + ... + + @overload + def __init__(self, audio: Audio) -> None: + """ + Create a MultimodalInput with preprocessed audio. + + Args: + audio: The input audio data + """ + ... + + @overload + def __init__(self, raw_audio: RawAudio) -> None: + """ + Create a MultimodalInput with raw audio. + + Args: + raw_audio: The input raw audio data + """ + ... + + def is_text(self) -> bool: + """Check if this input contains text.""" + ... + + def is_image(self) -> bool: + """Check if this input contains an image.""" + ... + + def is_audio(self) -> bool: + """Check if this input contains preprocessed audio.""" + ... + + def is_raw_audio(self) -> bool: + """Check if this input contains raw audio.""" + ... + + def get_text(self) -> Optional[str]: + """ + Get the text content if this is a text input. + + Returns: + The text string if this is a text input, None otherwise + """ + ... + + def get_image(self) -> Optional[Image]: + """ + Get the image content if this is an image input. + + Returns: + The Image object if this is an image input, None otherwise + """ + ... + + def get_audio(self) -> Optional[Audio]: + """ + Get the audio content if this is an audio input. + + Returns: + The Audio object if this is an audio input, None otherwise + """ + ... + + def get_raw_audio(self) -> Optional[RawAudio]: + """ + Get the raw audio content if this is a raw audio input. + + Returns: + The RawAudio object if this is a raw audio input, None otherwise + """ + ... + + def __repr__(self) -> str: ... + +class MultimodalRunner: + """Runner for multimodal language models.""" + + def __init__( + self, model_path: str, tokenizer_path: str, data_path: Optional[str] = None + ) -> None: + """ + Initialize a MultimodalRunner. + + Args: + model_path: Path to the model file (.pte) + tokenizer_path: Path to the tokenizer file + data_path: Optional path to additional data file + Raises: + RuntimeError: If initialization fails + """ + ... + + def generate( + self, + inputs: List[MultimodalInput], + config: GenerationConfig, + token_callback: Optional[Callable[[str], None]] = None, + stats_callback: Optional[Callable[[Stats], None]] = None, + ) -> None: + """ + Generate text from multimodal inputs. + + Args: + inputs: List of multimodal inputs (text, images, etc.) + config: Generation configuration + token_callback: Optional callback called for each generated token + stats_callback: Optional callback called with generation statistics + + Raises: + RuntimeError: If generation fails + """ + ... 
+ + def generate_hf( + self, + inputs: dict, + config: GenerationConfig, + token_callback: Optional[Callable[[str], None]] = None, + stats_callback: Optional[Callable[[Stats], None]] = None, + image_token_id: Optional[int] = None, + ) -> None: + """ + Generate text directly from a HuggingFace processor dict. + + Expects at least 'input_ids' (torch.Tensor). If 'pixel_values' is provided, + an 'image_token_id' (or 'image_token_index') must also be present to locate + the image position(s) in input_ids. + + Args: + inputs: HF processor outputs (e.g., from AutoProcessor.apply_chat_template) + config: Generation configuration + token_callback: Optional per-token callback + stats_callback: Optional stats callback + image_token_id: Optional image token ID (or index) + + Raises: + RuntimeError: If required keys are missing, shapes are invalid, or generation fails + """ + ... + + def prefill(self, inputs: List[MultimodalInput]) -> None: + """ + Prefill multimodal inputs (e.g., to rebuild KV cache from chat history) + without generating tokens. + + Args: + inputs: List of multimodal inputs to prefill + + Raises: + RuntimeError: If prefill fails + """ + ... + + def generate_text( + self, inputs: List[MultimodalInput], config: GenerationConfig + ) -> str: + """ + Generate text and return the complete result as a string. + + Args: + inputs: List of multimodal inputs (text, images, etc.) + config: Generation configuration + + Returns: + The generated text as a string + + Raises: + RuntimeError: If generation fails + """ + ... + + def generate_text_hf( + self, inputs: dict, config: GenerationConfig, image_token_id + ) -> str: + """ + Generate text directly from a HuggingFace processor dict and return as string. + + See generate_hf(inputs: dict, ...) for expected keys and constraints. + """ + ... + + def stop(self) -> None: + """Stop the current generation process.""" + ... + + def reset(self) -> None: + """Reset the runner state and KV cache.""" + ... + + def get_vocab_size(self) -> int: + """ + Get the vocabulary size of the model. + + Returns: + The vocabulary size, or -1 if not available + """ + ... + + def __repr__(self) -> str: ... + +def make_text_input(text: str) -> MultimodalInput: + """ + Create a text input for multimodal processing. + + Args: + text: The input text string + + Returns: + A MultimodalInput containing the text + """ + ... + +def make_image_input(image_tensor: torch.Tensor) -> MultimodalInput: + """ + Create an image input from a torch tensor. + + Args: + image_tensor: Torch tensor with shape (H, W, C), (1, H, W, C), (C, H, W), or (1, C, H, W) + + Returns: + A MultimodalInput containing the image + + Raises: + RuntimeError: If the tensor has invalid dimensions or number of channels + """ + ... + +def make_audio_input(audio_tensor: torch.Tensor) -> MultimodalInput: + """ + Create a preprocessed audio input from a torch tensor. + + Args: + audio_tensor: Torch tensor with shape (batch_size, n_bins, n_frames) + + Returns: + A MultimodalInput containing the preprocessed audio + + Raises: + RuntimeError: If the tensor has invalid dimensions or dtype + """ + ... + +def make_raw_audio_input(audio_tensor: torch.Tensor) -> MultimodalInput: + """ + Create a raw audio input from a torch tensor. + + Args: + audio_tensor: Torch tensor with shape (batch_size, n_channels, n_samples) + + Returns: + A MultimodalInput containing the raw audio + + Raises: + RuntimeError: If the tensor has invalid dimensions or dtype + """ + ... 
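Not part of the patch above: the stubs document `prefill()` only in prose, so the following is a minimal illustrative sketch of rebuilding chat history through `prefill()` and then continuing generation. The model/tokenizer paths, image tensor, and chat text are placeholders.

```python
# Illustrative sketch only; paths, image tensor, and chat text are placeholders.
import torch

from executorch.extension.llm.runner import (
    GenerationConfig,
    MultimodalRunner,
    make_image_input,
    make_text_input,
)

runner = MultimodalRunner("model.pte", "tokenizer.bin")

# Replay earlier turns into the KV cache without generating any tokens.
history = [
    make_text_input("USER: Describe this image.\n"),
    make_image_input(torch.randint(0, 255, (3, 224, 224), dtype=torch.uint8)),
    make_text_input("ASSISTANT: A wooden pier over a calm lake.\n"),
]
runner.prefill(history)

# Continue the conversation from the prefilled context.
config = GenerationConfig(max_new_tokens=64, temperature=0.7, echo=False)
reply = runner.generate_text(
    [make_text_input("USER: Is it safe to swim there?\nASSISTANT: ")], config
)
print(reply)
```

This mirrors the `ChatSession` example in the runner README, but rebuilds the KV cache explicitly rather than relying on back-to-back `generate_text` calls.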
diff --git a/extension/llm/runner/multimodal_input.h b/extension/llm/runner/multimodal_input.h index 737821f51e9..01f73e3314c 100644 --- a/extension/llm/runner/multimodal_input.h +++ b/extension/llm/runner/multimodal_input.h @@ -31,7 +31,7 @@ class ET_EXPERIMENTAL MultimodalInput { /// Type of multimodal input data enum class Type { TEXT, ///< Text string input - TOKENS, ///< Pre-tokenized input (vector of token IDs) + TOKENS, ///< Tokenizer encoded input (vector of token IDs) IMAGE, ///< Processed image input AUDIO, ///< Processed audio input RAW_AUDIO, ///< Raw unprocessed audio input (straight from audio file) diff --git a/extension/llm/runner/pybindings.cpp b/extension/llm/runner/pybindings.cpp new file mode 100644 index 00000000000..bcc6aba0f8e --- /dev/null +++ b/extension/llm/runner/pybindings.cpp @@ -0,0 +1,647 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace py = pybind11; +using namespace executorch::extension::llm; +using namespace executorch::extension; +using namespace executorch::runtime; + +// Helper macro for error handling +#define THROW_IF_ERROR(error, message, ...) \ + ({ \ + if ((error) != Error::Ok) { \ + char msg_buf[256]; \ + snprintf(msg_buf, sizeof(msg_buf), message, ##__VA_ARGS__); \ + throw std::runtime_error(msg_buf); \ + } \ + }) + +// Python wrapper class for MultimodalRunner +class PyMultimodalRunner { + public: + // Constructor that takes a tokenizer path + PyMultimodalRunner( + const std::string& model_path, + const std::string& tokenizer_path, + std::optional data_path = std::nullopt) { + // Load tokenizer using the helper function + auto tokenizer = + load_tokenizer(tokenizer_path, nullptr, std::nullopt, 0, 0); + if (!tokenizer) { + throw std::runtime_error( + "Failed to load tokenizer from: " + tokenizer_path); + } + + // Create multimodal runner using the helper function + runner_ = + create_multimodal_runner(model_path, std::move(tokenizer), data_path); + if (!runner_) { + throw std::runtime_error( + "Failed to create multimodal runner with model: " + model_path); + } + } + + void generate( + const std::vector& inputs, + const GenerationConfig& config, + py::object token_callback = py::none(), + py::object stats_callback = py::none()) { + if (!runner_) { + throw std::runtime_error("Runner not initialized"); + } + + // Convert Python callbacks to C++ std::function + std::function cpp_token_callback = nullptr; + if (!token_callback.is_none()) { + cpp_token_callback = [token_callback](const std::string& token) { + py::gil_scoped_acquire acquire; + token_callback(token); + }; + } + + std::function cpp_stats_callback = nullptr; + if (!stats_callback.is_none()) { + cpp_stats_callback = [stats_callback](const Stats& stats) { + py::gil_scoped_acquire acquire; + stats_callback(stats); + }; + } + + // Release GIL during generation + { + py::gil_scoped_release release; + Error error = runner_->generate( + inputs, config, cpp_token_callback, cpp_stats_callback); + THROW_IF_ERROR(error, "Generation failed"); + } + } + + std::string generate_text( + const std::vector& inputs, + const GenerationConfig& config) { + if (!runner_) { + throw std::runtime_error("Runner not initialized"); + } + + 
std::string generated_text; + auto cpp_token_callback = [&generated_text](const std::string& token) { + generated_text += token; + }; + Error error = + runner_->generate(inputs, config, cpp_token_callback, nullptr); + THROW_IF_ERROR(error, "Generation failed"); + + return generated_text; + } + + void stop() { + if (runner_) { + runner_->stop(); + } + } + + void reset() { + if (runner_) { + runner_->reset(); + } + } + + void prefill(std::vector inputs) { + if (!runner_) { + throw std::runtime_error("Runner not initialized"); + } + { + py::gil_scoped_release release; + Error error = runner_->prefill(inputs); + THROW_IF_ERROR(error, "Prefill failed"); + } + } + + // Note: Since the runner owns the tokenizer and metadata after creation, + // we cannot directly access them. This is a limitation of the current design. + // For now, we'll return a placeholder value. + int32_t get_vocab_size() const { + // TODO: Consider exposing metadata through the MultimodalRunner interface + return -1; // Indicate that vocab size is not available + } + + private: + std::unique_ptr runner_; +}; + +PYBIND11_MODULE(_llm_runner, m) { + m.doc() = "Python bindings for ExecuTorch LLM Runners"; + + // Initialize ExecuTorch runtime + runtime_init(); + + // Bind GenerationConfig + py::class_(m, "GenerationConfig") + // Constructor with keyword arguments for all fields (all optional via + // defaults) + .def( + py::init([](bool echo, + int32_t max_new_tokens, + bool warming, + int32_t seq_len, + float temperature, + int32_t num_bos, + int32_t num_eos) { + GenerationConfig cfg; + cfg.echo = echo; + cfg.max_new_tokens = max_new_tokens; + cfg.warming = warming; + cfg.seq_len = seq_len; + cfg.temperature = temperature; + cfg.num_bos = num_bos; + cfg.num_eos = num_eos; + return cfg; + }), + py::arg("echo") = true, + py::arg("max_new_tokens") = -1, + py::arg("warming") = false, + py::arg("seq_len") = -1, + py::arg("temperature") = 0.8f, + py::arg("num_bos") = 0, + py::arg("num_eos") = 0) + .def_readwrite("echo", &GenerationConfig::echo) + .def_readwrite("max_new_tokens", &GenerationConfig::max_new_tokens) + .def_readwrite("warming", &GenerationConfig::warming) + .def_readwrite("seq_len", &GenerationConfig::seq_len) + .def_readwrite("temperature", &GenerationConfig::temperature) + .def_readwrite("num_bos", &GenerationConfig::num_bos) + .def_readwrite("num_eos", &GenerationConfig::num_eos) + .def( + "resolve_max_new_tokens", + &GenerationConfig::resolve_max_new_tokens, + py::arg("max_context_len"), + py::arg("num_prompt_tokens"), + "Resolve the maximum number of new tokens to generate based on constraints") + .def("__repr__", [](const GenerationConfig& config) { + return ""; + }); + + // Bind Stats + py::class_(m, "Stats") + .def_readonly( + "SCALING_FACTOR_UNITS_PER_SECOND", + &Stats::SCALING_FACTOR_UNITS_PER_SECOND) + .def_readonly("model_load_start_ms", &Stats::model_load_start_ms) + .def_readonly("model_load_end_ms", &Stats::model_load_end_ms) + .def_readonly("inference_start_ms", &Stats::inference_start_ms) + .def_readonly("token_encode_end_ms", &Stats::token_encode_end_ms) + .def_readonly( + "model_execution_start_ms", &Stats::model_execution_start_ms) + .def_readonly("model_execution_end_ms", &Stats::model_execution_end_ms) + .def_readonly("prompt_eval_end_ms", &Stats::prompt_eval_end_ms) + .def_readonly("first_token_ms", &Stats::first_token_ms) + .def_readonly("inference_end_ms", &Stats::inference_end_ms) + .def_readonly( + "aggregate_sampling_time_ms", &Stats::aggregate_sampling_time_ms) + 
.def_readonly("num_prompt_tokens", &Stats::num_prompt_tokens) + .def_readonly("num_generated_tokens", &Stats::num_generated_tokens) + .def("on_sampling_begin", &Stats::on_sampling_begin) + .def("on_sampling_end", &Stats::on_sampling_end) + .def( + "reset", + &Stats::reset, + py::arg("all_stats") = false, + "Reset stats, optionally including model load times") + .def( + "to_json_string", + [](const Stats& stats) { return stats_to_json_string(stats); }, + "Convert stats to JSON string representation") + .def("__repr__", [](const Stats& stats) { + double tokens_per_second = 0.0; + if (stats.inference_end_ms > stats.inference_start_ms) { + tokens_per_second = static_cast(stats.num_generated_tokens) * + stats.SCALING_FACTOR_UNITS_PER_SECOND / + (stats.inference_end_ms - stats.inference_start_ms); + } + return ""; + }); + + // Bind Image class + py::class_(m, "Image") + .def( + py::init&&, int32_t, int32_t, int32_t>(), + py::arg("data"), + py::arg("width"), + py::arg("height"), + py::arg("channels")) + .def( + py::init&&, int32_t, int32_t, int32_t>(), + py::arg("data"), + py::arg("width"), + py::arg("height"), + py::arg("channels")) + .def("is_uint8", &Image::is_uint8) + .def("is_float", &Image::is_float) + .def_property_readonly("width", &Image::width) + .def_property_readonly("height", &Image::height) + .def_property_readonly("channels", &Image::channels) + .def_property_readonly( + "uint8_data", + static_cast& (Image::*)() const&>( + &Image::get_uint8_data)) + .def_property_readonly( + "float_data", + static_cast& (Image::*)() const&>( + &Image::get_float_data)) + .def("__repr__", [](const Image& img) { + std::string dtype = "unknown"; + if (img.is_uint8()) { + dtype = "uint8"; + } else if (img.is_float()) { + dtype = "float32"; + } + return ""; + }); + + // Bind Audio class + py::class_