From 190ac509f1b7efb1c87175b2c6d5a302d73bca41 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Fri, 12 Sep 2025 14:22:47 -0700 Subject: [PATCH 01/40] Update --- extension/llm/runner/CMakeLists.txt | 43 +++ .../llm/runner/README_PYTHON_BINDINGS.md | 249 ++++++++++++ extension/llm/runner/__init__.py | 340 ++++++++++++++++ extension/llm/runner/pybindings.cpp | 362 ++++++++++++++++++ extension/llm/runner/utils.py | 302 +++++++++++++++ setup.py | 5 + 6 files changed, 1301 insertions(+) create mode 100644 extension/llm/runner/README_PYTHON_BINDINGS.md create mode 100644 extension/llm/runner/__init__.py create mode 100644 extension/llm/runner/pybindings.cpp create mode 100644 extension/llm/runner/utils.py diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt index cf8983db1fb..d86fc53ae75 100644 --- a/extension/llm/runner/CMakeLists.txt +++ b/extension/llm/runner/CMakeLists.txt @@ -79,3 +79,46 @@ install( if(BUILD_TESTING) add_subdirectory(test) endif() + +# Python bindings for MultimodalRunner +if(EXECUTORCH_BUILD_PYBIND) + # Find pybind11 + find_package(pybind11 REQUIRED) + + # Create the Python extension module for LLM runners + pybind11_add_module( + _llm_runner + ${CMAKE_CURRENT_SOURCE_DIR}/pybindings.cpp + ) + + # Link with the extension_llm_runner library and its dependencies + target_link_libraries( + _llm_runner + PRIVATE + extension_llm_runner + executorch_core + extension_module + extension_tensor + tokenizers::tokenizers + ) + + # Set properties for the Python extension + set_target_properties( + _llm_runner + PROPERTIES + POSITION_INDEPENDENT_CODE ON + CXX_VISIBILITY_PRESET "hidden" + INTERPROCEDURAL_OPTIMIZATION TRUE + PREFIX "${PYTHON_MODULE_PREFIX}" + SUFFIX "${PYTHON_MODULE_SUFFIX}" + ) + + # Add include directories + target_include_directories( + _llm_runner + PRIVATE + ${_common_include_directories} + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/../sampler + ) +endif() diff --git a/extension/llm/runner/README_PYTHON_BINDINGS.md b/extension/llm/runner/README_PYTHON_BINDINGS.md new file mode 100644 index 00000000000..105b05f4f1e --- /dev/null +++ b/extension/llm/runner/README_PYTHON_BINDINGS.md @@ -0,0 +1,249 @@ +# Python Bindings for MultimodalRunner + +## Overview + +This project provides Python bindings for the ExecuTorch MultimodalRunner, enabling Python developers to easily use the multimodal LLM runner for processing mixed inputs (text, images, audio) and generating text outputs. + +## Architecture + +The MultimodalRunner is designed for Large Language Models that can process multimodal inputs and generate text outputs. It supports models like: +- LLaVA (vision-language models) +- CLIP-based models +- Speech-to-text models +- Other multimodal transformers + +### Key Components + +1. **MultimodalRunner** - Main runner class for multimodal inference +2. **MultimodalInput** - Handles different input modalities (text, image, audio) +3. **GenerationConfig** - Configuration for text generation parameters +4. **Stats** - Performance monitoring and statistics +5. 
**Tokenizer** - Text tokenization and decoding
+
+## Project Structure
+
+```
+extension/llm/runner/
+├── pybindings.cpp              # Python bindings implementation (NEW)
+├── __init__.py                 # Package initialization and high-level Python API (NEW)
+├── utils.py                    # Utility functions (NEW)
+├── README_PYTHON_BINDINGS.md   # This document (NEW)
+├── CMakeLists.txt              # Existing - update to include Python bindings
+└── test/
+    ├── test_multimodal_runner.py   # Unit tests for Python bindings (NEW)
+    ├── test_generation.py          # Generation tests (NEW)
+    └── [existing test files]       # Existing C++ tests remain here
+```
+
+Note: We'll reuse the root-level `setup.py` and update the existing `CMakeLists.txt` rather than creating new ones.
+
+## Action Items
+
+### 1. Core Implementation Tasks
+
+#### High Priority
+- [x] ~~**Create Python bindings file** (`pybindings.cpp`)~~
+  - [x] ~~Bind MultimodalRunner class~~
+  - [x] ~~Bind MultimodalInput and helper functions~~
+  - [x] ~~Bind GenerationConfig struct~~
+  - [x] ~~Bind Stats class for performance monitoring~~
+  - [x] ~~Implement error handling and exception translation~~
+
+#### Medium Priority
+- [x] ~~**Update existing CMakeLists.txt** in `extension/llm/runner/`~~
+  - [x] ~~Add Python bindings target when EXECUTORCH_BUILD_PYBIND is enabled~~
+  - [x] ~~Configure pybind11 integration~~
+  - [x] ~~Link with extension_llm_runner library~~
+  - [x] ~~Handle tokenizers dependency~~
+  - [x] ~~Set up proper include paths~~
+
+- [x] ~~**Update root-level setup.py**~~
+  - [x] ~~Add multimodal_runner to the extensions list~~
+  - [x] ~~Ensure proper build configuration~~
+  - [x] ~~Handle platform-specific configurations~~
+
+#### Low Priority
+- [x] ~~**Create Python wrapper files** in `extension/llm/runner/`~~
+  - [x] ~~`__init__.py` - Package initialization and high-level Python API~~
+  - [x] ~~`utils.py` - Utility functions for input preprocessing~~
+
+### 2. Build System Integration
+
+- [ ] **Integrate with main CMake build**
+  - [ ] Add Python bindings compilation when EXECUTORCH_BUILD_PYBIND is enabled
+  - [ ] Update extension/llm/runner/CMakeLists.txt to build pybindings.cpp
+  - [ ] Ensure proper dependency resolution
+
+- [ ] **Handle dependencies**
+  - [ ] Link against existing tokenizers Python bindings
+  - [ ] Ensure Module and other dependencies are available
+  - [ ] Handle pybind11 version requirements
+
+### 3. Input/Output Handling
+
+- [ ] **Implement MultimodalInput Python bindings**
+  - [ ] Support for text inputs
+  - [ ] Support for image inputs (numpy arrays, PIL Images)
+  - [ ] Support for audio inputs (if applicable)
+  - [ ] Mixed input ordering support
+
+- [ ] **Implement callbacks**
+  - [ ] Token generation callback
+  - [ ] Statistics callback
+  - [ ] Progress reporting
+
+### 4. Testing and Documentation
+
+- [ ] **Create comprehensive tests** (a minimal sketch follows this section)
+  - [ ] Unit tests for bindings
+  - [ ] Integration tests with sample models
+  - [ ] Performance benchmarks
+  - [ ] Memory leak tests
+
+- [ ] **Write documentation**
+  - [ ] API documentation with examples
+  - [ ] Installation guide
+  - [ ] Usage tutorials
+  - [ ] Model compatibility guide
+
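+For the unit-test item above, here is a minimal sketch of what a binding-level
+test could look like. `TEST_MODEL` and `TEST_TOKENIZER` are hypothetical
+placeholder paths, not artifacts shipped with this patch:
+
+```python
+from pathlib import Path
+
+import pytest
+
+from executorch.extension.llm.runner import (
+    GenerationConfig,
+    MultimodalRunner,
+    make_text_input,
+)
+
+# Hypothetical artifact paths; point these at a real exported model to run.
+TEST_MODEL = Path("model.pte")
+TEST_TOKENIZER = Path("tokenizer.bin")
+
+
+def test_generation_config_fields():
+    # GenerationConfig is bound with writable fields rather than kwargs.
+    config = GenerationConfig()
+    config.max_new_tokens = 8
+    config.temperature = 0.0
+    assert config.max_new_tokens == 8
+
+
+@pytest.mark.skipif(not TEST_MODEL.exists(), reason="no test model available")
+def test_text_only_generation():
+    config = GenerationConfig()
+    config.max_new_tokens = 4
+    runner = MultimodalRunner(TEST_MODEL, TEST_TOKENIZER)
+    text = runner.generate_text([make_text_input("Hello")], config)
+    assert isinstance(text, str)
+```
+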
+### 5. Example Scripts
+
+- [ ] **Create example scripts**
+  - [ ] Basic text generation
+  - [ ] Image + text (vision-language) example
+  - [ ] Batch processing example
+  - [ ] Streaming generation example
+
+## Installation Instructions
+
+### Prerequisites
+
+- Python >= 3.8
+- CMake >= 3.18
+- C++17 compatible compiler
+- PyTorch (for tensor operations)
+- pybind11 >= 2.6.0
+
+### Building from Source
+
+```bash
+# Clone the repository
+git clone https://github.com/pytorch/executorch.git
+cd executorch
+
+# Install dependencies
+pip install -r requirements.txt
+
+# Build with Python bindings enabled
+CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON" pip install .
+
+# Or for development
+CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON" pip install -e . --config-settings editable_mode=compat
+```
+
+### Running Tests
+
+```bash
+# Run the multimodal runner Python tests
+python -m pytest extension/llm/runner/test/test_multimodal_runner.py -v
+```
+
+## Usage Example
+
+```python
+import numpy as np
+
+from executorch.extension.llm.runner import (
+    GenerationConfig,
+    MultimodalRunner,
+    make_image_input,
+    make_text_input,
+)
+
+# Initialize the runner
+runner = MultimodalRunner(
+    model_path="path/to/model.pte",
+    tokenizer_path="path/to/tokenizer.bin"
+)
+
+# Create multimodal inputs
+image_array = (np.random.rand(224, 224, 3) * 255).astype(np.uint8)  # Example image
+inputs = [
+    make_text_input("Describe this image:"),
+    make_image_input(image_array)  # numpy array or PIL Image
+]
+
+# Configure generation. The binding exposes a default constructor plus
+# writable attributes rather than keyword arguments.
+config = GenerationConfig()
+config.max_new_tokens = 100
+config.temperature = 0.7
+config.top_p = 0.9
+
+# Generate text with callbacks
+def on_token(token):
+    print(token, end='', flush=True)
+
+def on_stats(stats):
+    print(f"\nTokens/sec: {stats.get_tokens_per_second():.2f}")
+
+runner.generate(inputs, config, token_callback=on_token, stats_callback=on_stats)
+
+# Or simpler usage without callbacks
+response = runner.generate_text(inputs, config)
+print(response)
+```
+
+## Technical Considerations
+
+### Memory Management
+- Python bindings should properly handle memory ownership
+- Use shared_ptr/unique_ptr appropriately
+- Implement proper cleanup in destructors
+
+### Threading and GIL
+- Release the GIL during long-running operations
+- Ensure thread safety for callbacks
+- Handle Python exceptions in C++ code
+
+### Performance
+- Minimize data copying between Python and C++
+- Use move semantics where possible
+- Consider zero-copy tensor operations
+
+## Dependencies
+
+### Required
+- executorch core libraries
+- extension_llm_runner
+- tokenizers library
+- pybind11
+
+### Optional
+- numpy (for array handling)
+- PIL/Pillow (for image processing)
+- torch (for tensor operations)
+
+## Contributing
+
+Please follow the ExecuTorch contribution guidelines. Key points:
+- Code should be formatted with clang-format
+- Python code should follow PEP 8
+- Add comprehensive tests for new features
+- Update documentation as needed
+
+## License
+
+This project is licensed under the BSD-style license found in the LICENSE file in the root directory of the ExecuTorch repository.
+
+## Next Steps
+
+1. **Review and approve this plan** with the team
+2. **Start with core bindings** implementation
+3. **Test with existing models** (LLaVA, etc.)
+4. **Gather feedback** from early users
+5. **Iterate and improve** based on usage patterns
+
+## Questions for Discussion
+
+1. Should we support async generation?
+2. What level of integration with PyTorch tensors is needed?
+3. Should we provide pre-built wheels or source-only distribution?
+4. 
How should we handle model loading and caching? +5. What additional utilities would be helpful for users? \ No newline at end of file diff --git a/extension/llm/runner/__init__.py b/extension/llm/runner/__init__.py new file mode 100644 index 00000000000..d41130b0ef4 --- /dev/null +++ b/extension/llm/runner/__init__.py @@ -0,0 +1,340 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Python bindings for ExecuTorch MultimodalRunner. + +This module provides a Python interface to the ExecuTorch multimodal LLM runner, +enabling processing of mixed inputs (text, images, audio) and text generation. +""" + +from typing import List, Union, Optional, Callable, Any +import numpy as np +from pathlib import Path + +try: + from PIL import Image as PILImage + HAS_PIL = True +except ImportError: + HAS_PIL = False + +try: + # Import shared components from the compiled C++ extension + from ._llm_runner import ( + GenerationConfig, + Stats, + Image, + MultimodalInput, + make_text_input, + make_image_input, + MultimodalRunner as _MultimodalRunnerCpp, + ) + + # Define the high-level Python wrapper for MultimodalRunner + class MultimodalRunner: + """ + High-level Python wrapper for the ExecuTorch MultimodalRunner. + + This class provides a convenient interface for running multimodal language models + that can process text, images, and other modalities to generate text output. + + Args: + model_path: Path to the ExecuTorch model file (.pte) + tokenizer_path: Path to the tokenizer file + temperature: Default temperature for text generation (default: 0.8) + device: Device to run on (currently only 'cpu' is supported) + + Example: + >>> runner = MultimodalRunner("model.pte", "tokenizer.bin") + >>> inputs = [ + ... runner.create_text_input("Describe this image:"), + ... runner.create_image_input("image.jpg") + ... ] + >>> response = runner.generate_text(inputs, max_new_tokens=100) + >>> print(response) + """ + + def __init__( + self, + model_path: Union[str, Path], + tokenizer_path: Union[str, Path], + temperature: float = 0.8, + device: str = "cpu" + ): + """Initialize the MultimodalRunner.""" + if device != "cpu": + raise ValueError(f"Currently only 'cpu' device is supported, got '{device}'") + + # Convert paths to strings + model_path = str(Path(model_path).resolve()) + tokenizer_path = str(Path(tokenizer_path).resolve()) + + # Validate paths exist + if not Path(model_path).exists(): + raise FileNotFoundError(f"Model file not found: {model_path}") + if not Path(tokenizer_path).exists(): + raise FileNotFoundError(f"Tokenizer file not found: {tokenizer_path}") + + # Initialize the C++ runner + self._runner = _MultimodalRunnerCpp(model_path, tokenizer_path, temperature) + self._model_path = model_path + self._tokenizer_path = tokenizer_path + self._default_temperature = temperature + + def create_text_input(self, text: str): + """ + Create a text input for multimodal processing. + + Args: + text: The input text string + + Returns: + A MultimodalInput object containing the text + """ + return make_text_input(text) + + def create_image_input( + self, + image: Union[str, Path, np.ndarray, 'PILImage.Image'] + ): + """ + Create an image input for multimodal processing. 
+ + Args: + image: Can be: + - Path to an image file (str or Path) + - NumPy array with shape (H, W, C) where C is 3 (RGB) or 4 (RGBA) + - PIL Image object + + Returns: + A MultimodalInput object containing the image + + Raises: + ValueError: If the image format is not supported + FileNotFoundError: If the image file doesn't exist + """ + if isinstance(image, (str, Path)): + # Load image from file + image_path = Path(image) + if not image_path.exists(): + raise FileNotFoundError(f"Image file not found: {image_path}") + + if HAS_PIL: + pil_image = PILImage.open(image_path) + # Convert to RGB if necessary + if pil_image.mode != 'RGB': + pil_image = pil_image.convert('RGB') + image = np.array(pil_image, dtype=np.uint8) + else: + # Try to use cv2 if available + try: + import cv2 + image = cv2.imread(str(image_path)) + if image is None: + raise ValueError(f"Failed to load image: {image_path}") + # Convert BGR to RGB + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + except ImportError: + raise ImportError( + "Either PIL or OpenCV is required to load images from files. " + "Install with: pip install pillow or pip install opencv-python" + ) + + elif HAS_PIL and isinstance(image, PILImage.Image): + # Convert PIL Image to numpy array + if image.mode != 'RGB': + image = image.convert('RGB') + image = np.array(image, dtype=np.uint8) + + elif isinstance(image, np.ndarray): + # Validate numpy array + if image.ndim != 3: + raise ValueError(f"Image array must be 3-dimensional (H, W, C), got shape {image.shape}") + if image.shape[2] not in [3, 4]: + raise ValueError(f"Image must have 3 (RGB) or 4 (RGBA) channels, got {image.shape[2]}") + if image.dtype != np.uint8: + # Convert to uint8 if necessary + if image.max() <= 1.0: + # Assume normalized [0, 1] range + image = (image * 255).astype(np.uint8) + else: + image = image.astype(np.uint8) + else: + raise ValueError(f"Unsupported image type: {type(image)}") + + return make_image_input(image) + + def generate( + self, + inputs: List[Any], + config: Optional[GenerationConfig] = None, + token_callback: Optional[Callable[[str], None]] = None, + stats_callback: Optional[Callable[[Any], None]] = None + ): + """ + Generate text from multimodal inputs with streaming callbacks. + + Args: + inputs: List of multimodal inputs (text, images, etc.) + config: Generation configuration (uses defaults if None) + token_callback: Function called for each generated token + stats_callback: Function called with generation statistics + """ + if config is None: + config = GenerationConfig() + config.temperature = self._default_temperature + + self._runner.generate(inputs, config, token_callback, stats_callback) + + def generate_text( + self, + inputs: List[Any], + config: Optional[GenerationConfig] = None, + max_new_tokens: Optional[int] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + **kwargs + ) -> str: + """ + Generate text from multimodal inputs and return the complete result. + + Args: + inputs: List of multimodal inputs (text, images, etc.) 
+ config: Generation configuration (overrides other parameters if provided) + max_new_tokens: Maximum number of tokens to generate + temperature: Sampling temperature (0.0 to 1.0) + top_p: Top-p sampling parameter + **kwargs: Additional generation parameters + + Returns: + The generated text as a string + """ + if config is None: + config = GenerationConfig() + config.temperature = temperature or self._default_temperature + if max_new_tokens is not None: + config.max_new_tokens = max_new_tokens + if top_p is not None: + config.top_p = top_p + + # Set any additional parameters + for key, value in kwargs.items(): + if hasattr(config, key): + setattr(config, key, value) + + return self._runner.generate_text(inputs, config) + + def stop(self): + """Stop the current generation process.""" + self._runner.stop() + + @property + def vocab_size(self) -> int: + """Get the vocabulary size of the model.""" + return self._runner.get_vocab_size() + + @property + def model_path(self) -> str: + """Get the path to the loaded model.""" + return self._model_path + + @property + def tokenizer_path(self) -> str: + """Get the path to the loaded tokenizer.""" + return self._tokenizer_path + + def __repr__(self) -> str: + return ( + f"MultimodalRunner(model='{Path(self._model_path).name}', " + f"tokenizer='{Path(self._tokenizer_path).name}', " + f"vocab_size={self.vocab_size})" + ) + + def __enter__(self): + """Context manager entry.""" + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit - ensures cleanup.""" + self.stop() + return False + +except ImportError as e: + import warnings + warnings.warn( + f"Failed to import _llm_runner extension: {e}\n" + "Please ensure the extension is built with EXECUTORCH_BUILD_PYBIND=ON", + ImportWarning + ) + # Provide placeholder classes if the extension is not available + class GenerationConfig: + def __init__(self, *args, **kwargs): + raise RuntimeError( + "LLM Runner extension not built. " + "Please rebuild with EXECUTORCH_BUILD_PYBIND=ON" + ) + + class Stats: + def __init__(self, *args, **kwargs): + raise RuntimeError( + "LLM Runner extension not built. " + "Please rebuild with EXECUTORCH_BUILD_PYBIND=ON" + ) + + class MultimodalRunner: + def __init__(self, *args, **kwargs): + raise RuntimeError( + "LLM Runner extension not built. " + "Please rebuild with EXECUTORCH_BUILD_PYBIND=ON" + ) + + class Image: + def __init__(self, *args, **kwargs): + raise RuntimeError( + "LLM Runner extension not built. " + "Please rebuild with EXECUTORCH_BUILD_PYBIND=ON" + ) + + class MultimodalInput: + def __init__(self, *args, **kwargs): + raise RuntimeError( + "LLM Runner extension not built. " + "Please rebuild with EXECUTORCH_BUILD_PYBIND=ON" + ) + + def make_text_input(text): + raise RuntimeError( + "LLM Runner extension not built. " + "Please rebuild with EXECUTORCH_BUILD_PYBIND=ON" + ) + + def make_image_input(image): + raise RuntimeError( + "LLM Runner extension not built. 
" + "Please rebuild with EXECUTORCH_BUILD_PYBIND=ON" + ) + +# Import utility functions +from .utils import ( + load_image_from_file, + preprocess_image, + create_generation_config, +) + +__all__ = [ + "MultimodalRunner", + "GenerationConfig", + "Stats", + "Image", + "MultimodalInput", + "make_text_input", + "make_image_input", + "load_image_from_file", + "preprocess_image", + "create_generation_config", +] + +__version__ = "0.1.0" \ No newline at end of file diff --git a/extension/llm/runner/pybindings.cpp b/extension/llm/runner/pybindings.cpp new file mode 100644 index 00000000000..567f6322f71 --- /dev/null +++ b/extension/llm/runner/pybindings.cpp @@ -0,0 +1,362 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace py = pybind11; +using namespace executorch::extension::llm; +using namespace executorch::extension; +using namespace executorch::runtime; + +// Helper macro for error handling +#define THROW_IF_ERROR(error, message, ...) \ + ({ \ + if ((error) != Error::Ok) { \ + char msg_buf[256]; \ + snprintf(msg_buf, sizeof(msg_buf), message, ##__VA_ARGS__); \ + throw std::runtime_error(msg_buf); \ + } \ + }) + +// Python wrapper class for MultimodalRunner +class PyMultimodalRunner { + public: + PyMultimodalRunner( + const std::string& model_path, + const std::string& tokenizer_path, + float temperature = 0.8f) { + // Load tokenizer + tokenizer_ = get_tokenizer(tokenizer_path.c_str()); + if (!tokenizer_) { + throw std::runtime_error("Failed to load tokenizer from: " + tokenizer_path); + } + + // Load module + module_ = std::make_unique(model_path, Module::LoadMode::MmapUseMlockIgnoreErrors); + Error error = module_->load_method("forward"); + THROW_IF_ERROR(error, "Failed to load model from: %s", model_path.c_str()); + + // Get model type from metadata + const auto method_names = module_->method_names(); + ET_CHECK_MSG(!method_names.empty(), "No methods found in model"); + + // Get metadata + auto method_meta = module_->method_meta("forward"); + if (method_meta.ok()) { + for (const auto& [key, value] : method_meta.get()) { + metadata_[key] = std::stoi(value); + } + } + + // Set up sampler + int32_t vocab_size = get_vocab_size(); + sampler_ = std::make_unique( + vocab_size, + temperature, + 0.9f, // top_p + 0LL // seed + ); + + // Create components + stats_ = std::make_unique(metadata_); + + // Create text decoder runner + text_decoder_runner_ = std::make_unique( + module_.get(), + metadata_ + ); + + // Create multimodal prefiller + multimodal_prefiller_ = std::make_unique( + module_.get(), + metadata_ + ); + + // Create IO manager + io_manager_ = std::make_unique( + module_.get(), + tokenizer_.get(), + text_decoder_runner_.get(), + multimodal_prefiller_.get(), + sampler_.get(), + stats_.get(), + metadata_ + ); + + // Create text token generator + text_token_generator_ = std::make_unique( + tokenizer_.get(), + sampler_.get(), + text_decoder_runner_.get(), + false, // echo + stats_.get(), + false // warming + ); + + // Finally create the runner + runner_ = std::make_unique( + metadata_, + std::move(tokenizer_), + std::move(module_), + std::move(text_decoder_runner_), + std::move(multimodal_prefiller_), + std::move(io_manager_), + 
+  void generate(
+      const std::vector<MultimodalInput>& inputs,
+      const GenerationConfig& config,
+      py::object token_callback = py::none(),
+      py::object stats_callback = py::none()) {
+    // Convert Python callbacks to C++ std::function
+    std::function<void(const std::string&)> cpp_token_callback = nullptr;
+    if (!token_callback.is_none()) {
+      cpp_token_callback = [token_callback](const std::string& token) {
+        py::gil_scoped_acquire acquire;
+        token_callback(token);
+      };
+    }
+
+    std::function<void(const Stats&)> cpp_stats_callback = nullptr;
+    if (!stats_callback.is_none()) {
+      cpp_stats_callback = [stats_callback](const Stats& stats) {
+        py::gil_scoped_acquire acquire;
+        stats_callback(stats);
+      };
+    }
+
+    // Release GIL during generation
+    {
+      py::gil_scoped_release release;
+      Error error = runner_->generate(
+          inputs, config, cpp_token_callback, cpp_stats_callback);
+      THROW_IF_ERROR(error, "Generation failed");
+    }
+  }
+
+  std::string generate_text(
+      const std::vector<MultimodalInput>& inputs,
+      const GenerationConfig& config) {
+    std::string result;
+
+    // Accumulate tokens into a single string instead of streaming them out.
+    std::function<void(const std::string&)> token_callback =
+        [&result](const std::string& token) {
+          result += token;
+        };
+
+    std::function<void(const Stats&)> stats_callback = nullptr;
+
+    {
+      py::gil_scoped_release release;
+      Error error = runner_->generate(
+          inputs, config, token_callback, stats_callback);
+      THROW_IF_ERROR(error, "Generation failed");
+    }
+
+    return result;
+  }
+
+  void stop() {
+    runner_->stop();
+  }
+
+  int32_t get_vocab_size() const {
+    auto it = metadata_.find("vocab_size");
+    if (it != metadata_.end()) {
+      return static_cast<int32_t>(it->second);
+    }
+    // tokenizer_ is moved into the runner at the end of the constructor,
+    // so only fall back to it while it is still owned by this wrapper.
+    if (tokenizer_) {
+      return tokenizer_->vocab_size();
+    }
+    return -1;
+  }
+
+ private:
+  std::unique_ptr<MultimodalRunner> runner_;
+  std::unique_ptr<::tokenizers::Tokenizer> tokenizer_;
+  std::unique_ptr<Module> module_;
+  std::unique_ptr<TextDecoderRunner> text_decoder_runner_;
+  std::unique_ptr<MultimodalPrefiller> multimodal_prefiller_;
+  std::unique_ptr<IOManager> io_manager_;
+  std::unique_ptr<TextTokenGenerator> text_token_generator_;
+  std::unique_ptr<Stats> stats_;
+  std::unique_ptr<Sampler> sampler_;
+  std::unordered_map<std::string, int64_t> metadata_;
+};
+
+// Helper functions for creating MultimodalInput
+MultimodalInput make_text_input(const std::string& text) {
+  return MultimodalInput::text(text);
+}
+
+MultimodalInput make_image_input(py::array_t<uint8_t> image_array) {
+  // Get image dimensions
+  py::buffer_info buf = image_array.request();
+
+  if (buf.ndim != 3) {
+    throw std::runtime_error("Image array must be 3-dimensional (H, W, C)");
+  }
+
+  size_t height = buf.shape[0];
+  size_t width = buf.shape[1];
+  size_t channels = buf.shape[2];
+
+  if (channels != 3 && channels != 4) {
+    throw std::runtime_error("Image must have 3 (RGB) or 4 (RGBA) channels");
+  }
+
+  // Create Image object from numpy array
+  uint8_t* data = static_cast<uint8_t*>(buf.ptr);
+  std::vector<uint8_t> image_data(data, data + height * width * channels);
+
+  Image image(std::move(image_data), height, width, channels);
+  return MultimodalInput::image(std::move(image));
+}
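+
+// Note: make_image_input copies the numpy buffer into a std::vector, so the
+// caller's array can be freed immediately; a zero-copy path is a possible
+// future optimization (see "Performance" in README_PYTHON_BINDINGS.md).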
.def_readwrite("frequency_penalty", &GenerationConfig::frequency_penalty) + .def_readwrite("warming", &GenerationConfig::warming) + .def_readwrite("echo", &GenerationConfig::echo) + .def_readwrite("seed", &GenerationConfig::seed) + .def("__repr__", [](const GenerationConfig& config) { + return ""; + }); + + // Bind Stats + py::class_(m, "Stats") + .def_readonly("model_load_start_ms", &Stats::model_load_start_ms) + .def_readonly("model_load_end_ms", &Stats::model_load_end_ms) + .def_readonly("inference_start_ms", &Stats::inference_start_ms) + .def_readonly("inference_end_ms", &Stats::inference_end_ms) + .def_readonly("prompt_eval_start_ms", &Stats::prompt_eval_start_ms) + .def_readonly("prompt_eval_end_ms", &Stats::prompt_eval_end_ms) + .def_readonly("first_token_ms", &Stats::first_token_ms) + .def_readonly("aggregate_sampling_time_ms", &Stats::aggregate_sampling_time_ms) + .def_readonly("num_prompt_tokens", &Stats::num_prompt_tokens) + .def_readonly("num_generated_tokens", &Stats::num_generated_tokens) + .def("get_model_load_time_ms", &Stats::get_model_load_time_ms) + .def("get_inference_time_ms", &Stats::get_inference_time_ms) + .def("get_prompt_eval_time_ms", &Stats::get_prompt_eval_time_ms) + .def("get_eval_time_ms", &Stats::get_eval_time_ms) + .def("get_sampling_time_ms", &Stats::get_sampling_time_ms) + .def("get_tokens_per_second", &Stats::get_tokens_per_second) + .def("__repr__", [](const Stats& stats) { + return ""; + }); + + // Bind Image class + py::class_(m, "Image") + .def(py::init, size_t, size_t, size_t>(), + py::arg("data"), py::arg("height"), py::arg("width"), py::arg("channels")) + .def_property_readonly("height", [](const Image& img) { return img.height_; }) + .def_property_readonly("width", [](const Image& img) { return img.width_; }) + .def_property_readonly("channels", [](const Image& img) { return img.channels_; }) + .def("__repr__", [](const Image& img) { + return ""; + }); + + // Bind MultimodalInput + py::class_(m, "MultimodalInput") + .def_static("text", &MultimodalInput::text, + "Create a text input", py::arg("text")) + .def_static("image", &MultimodalInput::image, + "Create an image input", py::arg("image")) + .def("is_text", &MultimodalInput::is_text) + .def("is_image", &MultimodalInput::is_image) + .def("get_text", [](const MultimodalInput& input) -> py::object { + if (input.is_text()) { + return py::cast(input.get_text()); + } + return py::none(); + }) + .def("__repr__", [](const MultimodalInput& input) { + if (input.is_text()) { + return " 50 ? "..." 
: "") + "\">"; + } else if (input.is_image()) { + return ""; + } + return ""; + }); + + // Bind helper functions + m.def("make_text_input", &make_text_input, + "Create a text input for multimodal processing", + py::arg("text")); + + m.def("make_image_input", &make_image_input, + "Create an image input from a numpy array (H, W, C)", + py::arg("image_array")); + + // Bind PyMultimodalRunner + py::class_(m, "MultimodalRunner") + .def(py::init(), + py::arg("model_path"), + py::arg("tokenizer_path"), + py::arg("temperature") = 0.8f, + "Initialize a MultimodalRunner with model and tokenizer paths") + .def("generate", &PyMultimodalRunner::generate, + py::arg("inputs"), + py::arg("config"), + py::arg("token_callback") = py::none(), + py::arg("stats_callback") = py::none(), + "Generate text from multimodal inputs with optional callbacks") + .def("generate_text", &PyMultimodalRunner::generate_text, + py::arg("inputs"), + py::arg("config"), + "Generate text and return the complete result as a string") + .def("stop", &PyMultimodalRunner::stop, + "Stop the current generation") + .def("get_vocab_size", &PyMultimodalRunner::get_vocab_size, + "Get the vocabulary size of the model") + .def("__repr__", [](const PyMultimodalRunner& runner) { + return ""; + }); +} \ No newline at end of file diff --git a/extension/llm/runner/utils.py b/extension/llm/runner/utils.py new file mode 100644 index 00000000000..35a3db11a3d --- /dev/null +++ b/extension/llm/runner/utils.py @@ -0,0 +1,302 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Utility functions for the MultimodalRunner Python bindings. + +This module provides helper functions for common tasks like image preprocessing, +configuration creation, and data conversion. +""" + +from typing import Union, Tuple, Optional, Dict, Any +import numpy as np +from pathlib import Path + +try: + from PIL import Image as PILImage + HAS_PIL = True +except ImportError: + HAS_PIL = False + +from ._llm_runner import GenerationConfig + + +def load_image_from_file( + image_path: Union[str, Path], + target_size: Optional[Tuple[int, int]] = None, + mode: str = 'RGB' +) -> np.ndarray: + """ + Load an image from file and optionally resize it. 
+ + Args: + image_path: Path to the image file + target_size: Optional (width, height) tuple to resize the image + mode: Image mode ('RGB', 'RGBA', 'L' for grayscale) + + Returns: + NumPy array with shape (H, W, C) for color or (H, W) for grayscale + + Raises: + FileNotFoundError: If the image file doesn't exist + ImportError: If neither PIL nor OpenCV is available + ValueError: If the image cannot be loaded + """ + image_path = Path(image_path) + if not image_path.exists(): + raise FileNotFoundError(f"Image file not found: {image_path}") + + if HAS_PIL: + # Use PIL/Pillow + image = PILImage.open(image_path) + + # Convert to requested mode + if image.mode != mode: + image = image.convert(mode) + + # Resize if requested + if target_size is not None: + image = image.resize(target_size, PILImage.Resampling.LANCZOS) + + # Convert to numpy array + return np.array(image, dtype=np.uint8) + else: + # Try OpenCV + try: + import cv2 + + # Read image + if mode == 'L': + image = cv2.imread(str(image_path), cv2.IMREAD_GRAYSCALE) + else: + image = cv2.imread(str(image_path), cv2.IMREAD_COLOR) + + if image is None: + raise ValueError(f"Failed to load image: {image_path}") + + # Convert BGR to RGB if needed + if mode == 'RGB' and len(image.shape) == 3: + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + elif mode == 'RGBA' and len(image.shape) == 3: + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGBA) + + # Resize if requested + if target_size is not None: + image = cv2.resize(image, target_size, interpolation=cv2.INTER_LANCZOS4) + + return image.astype(np.uint8) + + except ImportError: + raise ImportError( + "Either PIL or OpenCV is required to load images from files. " + "Install with: pip install pillow or pip install opencv-python" + ) + + +def preprocess_image( + image: np.ndarray, + target_size: Optional[Tuple[int, int]] = None, + normalize: bool = False, + mean: Optional[Tuple[float, float, float]] = None, + std: Optional[Tuple[float, float, float]] = None +) -> np.ndarray: + """ + Preprocess an image array for model input. 
+ + Args: + image: Input image as numpy array (H, W, C) + target_size: Optional (width, height) tuple to resize the image + normalize: Whether to normalize pixel values to [0, 1] + mean: Mean values for normalization (per channel) + std: Standard deviation values for normalization (per channel) + + Returns: + Preprocessed image array + + Raises: + ValueError: If image dimensions are invalid + """ + if image.ndim != 3: + raise ValueError(f"Image must be 3-dimensional (H, W, C), got shape {image.shape}") + + # Resize if needed + if target_size is not None: + if HAS_PIL: + # Use PIL for resizing + pil_image = PILImage.fromarray(image) + pil_image = pil_image.resize(target_size, PILImage.Resampling.LANCZOS) + image = np.array(pil_image) + else: + # Try OpenCV + try: + import cv2 + image = cv2.resize(image, target_size, interpolation=cv2.INTER_LANCZOS4) + except ImportError: + # Simple nearest neighbor resize as fallback + from scipy import ndimage + factors = (target_size[1] / image.shape[0], target_size[0] / image.shape[1], 1) + image = ndimage.zoom(image, factors, order=1) + + # Convert to float for normalization + if normalize or mean is not None or std is not None: + image = image.astype(np.float32) + + if normalize: + image = image / 255.0 + + if mean is not None: + mean_arr = np.array(mean).reshape(1, 1, -1) + image = image - mean_arr + + if std is not None: + std_arr = np.array(std).reshape(1, 1, -1) + image = image / std_arr + + return image + + +def create_generation_config( + max_new_tokens: int = 1000, + temperature: float = 0.8, + top_p: float = 0.95, + top_k: int = 40, + repetition_penalty: float = 1.0, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, + echo: bool = False, + seed: Optional[int] = None, + **kwargs +) -> GenerationConfig: + """ + Create a GenerationConfig with sensible defaults. + + Args: + max_new_tokens: Maximum number of tokens to generate (default: 1000) + temperature: Sampling temperature, higher = more random (default: 0.8) + top_p: Nucleus sampling parameter (default: 0.95) + top_k: Top-k sampling parameter (default: 40) + repetition_penalty: Penalty for repeating tokens (default: 1.0) + presence_penalty: Penalty for using tokens that appear in the prompt (default: 0.0) + frequency_penalty: Penalty based on token frequency (default: 0.0) + echo: Whether to echo the input prompt (default: False) + seed: Random seed for reproducibility (default: None) + **kwargs: Additional parameters to set on the config + + Returns: + A configured GenerationConfig object + + Example: + >>> config = create_generation_config( + ... max_new_tokens=100, + ... temperature=0.7, + ... top_p=0.9 + ... ) + """ + config = GenerationConfig() + + # Set all parameters + config.max_new_tokens = max_new_tokens + config.temperature = temperature + config.top_p = top_p + config.top_k = top_k + config.repetition_penalty = repetition_penalty + config.presence_penalty = presence_penalty + config.frequency_penalty = frequency_penalty + config.echo = echo + + if seed is not None: + config.seed = seed + + # Set any additional parameters + for key, value in kwargs.items(): + if hasattr(config, key): + setattr(config, key, value) + else: + raise ValueError(f"GenerationConfig has no parameter '{key}'") + + return config + + +def batch_generate( + runner: 'MultimodalRunner', + batch_inputs: list, + config: Optional[GenerationConfig] = None, + show_progress: bool = True +) -> list: + """ + Generate text for multiple input batches. 
+ + Args: + runner: The MultimodalRunner instance + batch_inputs: List of input lists, each containing multimodal inputs + config: Generation configuration (shared for all batches) + show_progress: Whether to show a progress bar + + Returns: + List of generated text strings + + Example: + >>> batch_inputs = [ + ... [make_text_input("Question 1")], + ... [make_text_input("Question 2")], + ... ] + >>> results = batch_generate(runner, batch_inputs) + """ + results = [] + + if show_progress: + try: + from tqdm import tqdm + batch_inputs = tqdm(batch_inputs, desc="Generating") + except ImportError: + pass + + for inputs in batch_inputs: + result = runner.generate_text(inputs, config) + results.append(result) + + return results + + +def estimate_tokens(text: str, chars_per_token: float = 4.0) -> int: + """ + Estimate the number of tokens in a text string. + + This is a rough approximation and actual token count may vary + depending on the tokenizer used. + + Args: + text: Input text string + chars_per_token: Average characters per token (default: 4.0) + + Returns: + Estimated number of tokens + """ + return max(1, int(len(text) / chars_per_token)) + + +def format_stats(stats: Any) -> str: + """ + Format generation statistics for display. + + Args: + stats: Stats object from the runner + + Returns: + Formatted string with statistics + """ + lines = [ + "Generation Statistics:", + f" Model load time: {stats.get_model_load_time_ms():.2f} ms", + f" Prompt eval time: {stats.get_prompt_eval_time_ms():.2f} ms", + f" Generation time: {stats.get_eval_time_ms():.2f} ms", + f" Sampling time: {stats.get_sampling_time_ms():.2f} ms", + f" Total inference time: {stats.get_inference_time_ms():.2f} ms", + f" Prompt tokens: {stats.num_prompt_tokens}", + f" Generated tokens: {stats.num_generated_tokens}", + f" Tokens per second: {stats.get_tokens_per_second():.2f}", + ] + return "\n".join(lines) \ No newline at end of file diff --git a/setup.py b/setup.py index def9b996be0..a35e0c96a9c 100644 --- a/setup.py +++ b/setup.py @@ -884,6 +884,11 @@ def run(self): # noqa C901 modpath="executorch.codegen.tools.selective_build", dependent_cmake_flags=["EXECUTORCH_BUILD_PYBIND"], ), + BuiltExtension( + src="extension/llm/runner/_llm_runner.*", + modpath="executorch.extension.llm.runner._llm_runner", + dependent_cmake_flags=["EXECUTORCH_BUILD_PYBIND"], + ), BuiltExtension( src="executorchcoreml.*", src_dir="backends/apple/coreml", From 693c759ebf19f21d64f1b64afa4f05862ee44867 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Fri, 12 Sep 2025 18:07:00 -0700 Subject: [PATCH 02/40] Make it work --- extension/llm/runner/CMakeLists.txt | 40 +- extension/llm/runner/__init__.py | 535 +++++++++++------------ extension/llm/runner/_llm_runner.pyi | 294 +++++++++++++ extension/llm/runner/llm_runner_helper.h | 17 + extension/llm/runner/pybindings.cpp | 396 ++++++++--------- extension/llm/runner/test_pybindings.py | 413 +++++++++++++++++ extension/llm/runner/utils.py | 141 +++--- setup.py | 1 + tools/cmake/preset/pybind.cmake | 2 + 9 files changed, 1216 insertions(+), 623 deletions(-) create mode 100644 extension/llm/runner/_llm_runner.pyi create mode 100644 extension/llm/runner/test_pybindings.py diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt index d86fc53ae75..fedb7a91162 100644 --- a/extension/llm/runner/CMakeLists.txt +++ b/extension/llm/runner/CMakeLists.txt @@ -82,43 +82,29 @@ endif() # Python bindings for MultimodalRunner if(EXECUTORCH_BUILD_PYBIND) - # Find pybind11 - find_package(pybind11 
REQUIRED) - # Create the Python extension module for LLM runners pybind11_add_module( - _llm_runner - ${CMAKE_CURRENT_SOURCE_DIR}/pybindings.cpp + _llm_runner SHARED ${CMAKE_CURRENT_SOURCE_DIR}/pybindings.cpp ) - + # Link with the extension_llm_runner library and its dependencies target_link_libraries( - _llm_runner - PRIVATE - extension_llm_runner - executorch_core - extension_module - extension_tensor - tokenizers::tokenizers + _llm_runner PRIVATE extension_llm_runner executorch_core extension_module + extension_tensor tokenizers::tokenizers ) - + # Set properties for the Python extension set_target_properties( _llm_runner - PROPERTIES - POSITION_INDEPENDENT_CODE ON - CXX_VISIBILITY_PRESET "hidden" - INTERPROCEDURAL_OPTIMIZATION TRUE - PREFIX "${PYTHON_MODULE_PREFIX}" - SUFFIX "${PYTHON_MODULE_SUFFIX}" + PROPERTIES POSITION_INDEPENDENT_CODE ON + CXX_VISIBILITY_PRESET "hidden" + INTERPROCEDURAL_OPTIMIZATION TRUE ) - + # Add include directories - target_include_directories( - _llm_runner - PRIVATE - ${_common_include_directories} - ${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/../sampler + target_include_directories(_llm_runner PRIVATE ${_common_include_directories}) + + install(TARGETS _llm_runner + LIBRARY DESTINATION executorch/extension/llm/runner ) endif() diff --git a/extension/llm/runner/__init__.py b/extension/llm/runner/__init__.py index d41130b0ef4..466c2101ab8 100644 --- a/extension/llm/runner/__init__.py +++ b/extension/llm/runner/__init__.py @@ -11,12 +11,14 @@ enabling processing of mixed inputs (text, images, audio) and text generation. """ -from typing import List, Union, Optional, Callable, Any -import numpy as np from pathlib import Path +from typing import Any, Callable, List, Optional, Union + +import numpy as np try: from PIL import Image as PILImage + HAS_PIL = True except ImportError: HAS_PIL = False @@ -25,311 +27,262 @@ # Import shared components from the compiled C++ extension from ._llm_runner import ( GenerationConfig, - Stats, Image, - MultimodalInput, - make_text_input, make_image_input, + make_text_input, + MultimodalInput, MultimodalRunner as _MultimodalRunnerCpp, + Stats, + ) +except ImportError: + raise RuntimeError( + "LLM runner is not installed. Please build ExecuTorch from source with EXECUTORCH_BUILD_PYBIND=ON" ) - - # Define the high-level Python wrapper for MultimodalRunner - class MultimodalRunner: + + +# Define the high-level Python wrapper for MultimodalRunner +class MultimodalRunner: + """ + High-level Python wrapper for the ExecuTorch MultimodalRunner. + + This class provides a convenient interface for running multimodal language models + that can process text, images, and other modalities to generate text output. + + Args: + model_path: Path to the ExecuTorch model file (.pte) + tokenizer_path: Path to the tokenizer file + temperature: Default temperature for text generation (default: 0.8) + device: Device to run on (currently only 'cpu' is supported) + + Example: + >>> runner = MultimodalRunner("model.pte", "tokenizer.bin") + >>> inputs = [ + ... runner.create_text_input("Describe this image:"), + ... runner.create_image_input("image.jpg") + ... 
] + >>> response = runner.generate_text(inputs, max_new_tokens=100) + >>> print(response) + """ + + def __init__( + self, + model_path: Union[str, Path], + tokenizer_path: Union[str, Path], + temperature: float = 0.8, + device: str = "cpu", + ): + """Initialize the MultimodalRunner.""" + if device != "cpu": + raise ValueError( + f"Currently only 'cpu' device is supported, got '{device}'" + ) + + # Convert paths to strings + model_path = str(Path(model_path).resolve()) + tokenizer_path = str(Path(tokenizer_path).resolve()) + + # Validate paths exist + if not Path(model_path).exists(): + raise FileNotFoundError(f"Model file not found: {model_path}") + if not Path(tokenizer_path).exists(): + raise FileNotFoundError(f"Tokenizer file not found: {tokenizer_path}") + + # Initialize the C++ runner + self._runner = _MultimodalRunnerCpp(model_path, tokenizer_path, temperature) + self._model_path = model_path + self._tokenizer_path = tokenizer_path + self._default_temperature = temperature + + def create_text_input(self, text: str): """ - High-level Python wrapper for the ExecuTorch MultimodalRunner. - - This class provides a convenient interface for running multimodal language models - that can process text, images, and other modalities to generate text output. - + Create a text input for multimodal processing. + Args: - model_path: Path to the ExecuTorch model file (.pte) - tokenizer_path: Path to the tokenizer file - temperature: Default temperature for text generation (default: 0.8) - device: Device to run on (currently only 'cpu' is supported) - - Example: - >>> runner = MultimodalRunner("model.pte", "tokenizer.bin") - >>> inputs = [ - ... runner.create_text_input("Describe this image:"), - ... runner.create_image_input("image.jpg") - ... ] - >>> response = runner.generate_text(inputs, max_new_tokens=100) - >>> print(response) + text: The input text string + + Returns: + A MultimodalInput object containing the text """ - - def __init__( - self, - model_path: Union[str, Path], - tokenizer_path: Union[str, Path], - temperature: float = 0.8, - device: str = "cpu" - ): - """Initialize the MultimodalRunner.""" - if device != "cpu": - raise ValueError(f"Currently only 'cpu' device is supported, got '{device}'") - - # Convert paths to strings - model_path = str(Path(model_path).resolve()) - tokenizer_path = str(Path(tokenizer_path).resolve()) - - # Validate paths exist - if not Path(model_path).exists(): - raise FileNotFoundError(f"Model file not found: {model_path}") - if not Path(tokenizer_path).exists(): - raise FileNotFoundError(f"Tokenizer file not found: {tokenizer_path}") - - # Initialize the C++ runner - self._runner = _MultimodalRunnerCpp(model_path, tokenizer_path, temperature) - self._model_path = model_path - self._tokenizer_path = tokenizer_path - self._default_temperature = temperature - - def create_text_input(self, text: str): - """ - Create a text input for multimodal processing. - - Args: - text: The input text string - - Returns: - A MultimodalInput object containing the text - """ - return make_text_input(text) - - def create_image_input( - self, - image: Union[str, Path, np.ndarray, 'PILImage.Image'] - ): - """ - Create an image input for multimodal processing. 
- - Args: - image: Can be: - - Path to an image file (str or Path) - - NumPy array with shape (H, W, C) where C is 3 (RGB) or 4 (RGBA) - - PIL Image object - - Returns: - A MultimodalInput object containing the image - - Raises: - ValueError: If the image format is not supported - FileNotFoundError: If the image file doesn't exist - """ - if isinstance(image, (str, Path)): - # Load image from file - image_path = Path(image) - if not image_path.exists(): - raise FileNotFoundError(f"Image file not found: {image_path}") - - if HAS_PIL: - pil_image = PILImage.open(image_path) - # Convert to RGB if necessary - if pil_image.mode != 'RGB': - pil_image = pil_image.convert('RGB') - image = np.array(pil_image, dtype=np.uint8) - else: - # Try to use cv2 if available - try: - import cv2 - image = cv2.imread(str(image_path)) - if image is None: - raise ValueError(f"Failed to load image: {image_path}") - # Convert BGR to RGB - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - except ImportError: - raise ImportError( - "Either PIL or OpenCV is required to load images from files. " - "Install with: pip install pillow or pip install opencv-python" - ) - - elif HAS_PIL and isinstance(image, PILImage.Image): - # Convert PIL Image to numpy array - if image.mode != 'RGB': - image = image.convert('RGB') - image = np.array(image, dtype=np.uint8) - - elif isinstance(image, np.ndarray): - # Validate numpy array - if image.ndim != 3: - raise ValueError(f"Image array must be 3-dimensional (H, W, C), got shape {image.shape}") - if image.shape[2] not in [3, 4]: - raise ValueError(f"Image must have 3 (RGB) or 4 (RGBA) channels, got {image.shape[2]}") - if image.dtype != np.uint8: - # Convert to uint8 if necessary - if image.max() <= 1.0: - # Assume normalized [0, 1] range - image = (image * 255).astype(np.uint8) - else: - image = image.astype(np.uint8) + return make_text_input(text) + + def create_image_input(self, image: Union[str, Path, np.ndarray, "PILImage.Image"]): + """ + Create an image input for multimodal processing. + + Args: + image: Can be: + - Path to an image file (str or Path) + - NumPy array with shape (H, W, C) where C is 3 (RGB) or 4 (RGBA) + - PIL Image object + + Returns: + A MultimodalInput object containing the image + + Raises: + ValueError: If the image format is not supported + FileNotFoundError: If the image file doesn't exist + """ + if isinstance(image, (str, Path)): + # Load image from file + image_path = Path(image) + if not image_path.exists(): + raise FileNotFoundError(f"Image file not found: {image_path}") + + if HAS_PIL: + pil_image = PILImage.open(image_path) + # Convert to RGB if necessary + if pil_image.mode != "RGB": + pil_image = pil_image.convert("RGB") + image = np.array(pil_image, dtype=np.uint8) else: - raise ValueError(f"Unsupported image type: {type(image)}") - - return make_image_input(image) - - def generate( - self, - inputs: List[Any], - config: Optional[GenerationConfig] = None, - token_callback: Optional[Callable[[str], None]] = None, - stats_callback: Optional[Callable[[Any], None]] = None - ): - """ - Generate text from multimodal inputs with streaming callbacks. - - Args: - inputs: List of multimodal inputs (text, images, etc.) 
- config: Generation configuration (uses defaults if None) - token_callback: Function called for each generated token - stats_callback: Function called with generation statistics - """ - if config is None: - config = GenerationConfig() - config.temperature = self._default_temperature - - self._runner.generate(inputs, config, token_callback, stats_callback) - - def generate_text( - self, - inputs: List[Any], - config: Optional[GenerationConfig] = None, - max_new_tokens: Optional[int] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - **kwargs - ) -> str: - """ - Generate text from multimodal inputs and return the complete result. - - Args: - inputs: List of multimodal inputs (text, images, etc.) - config: Generation configuration (overrides other parameters if provided) - max_new_tokens: Maximum number of tokens to generate - temperature: Sampling temperature (0.0 to 1.0) - top_p: Top-p sampling parameter - **kwargs: Additional generation parameters - - Returns: - The generated text as a string - """ - if config is None: - config = GenerationConfig() - config.temperature = temperature or self._default_temperature - if max_new_tokens is not None: - config.max_new_tokens = max_new_tokens - if top_p is not None: - config.top_p = top_p - - # Set any additional parameters - for key, value in kwargs.items(): - if hasattr(config, key): - setattr(config, key, value) - - return self._runner.generate_text(inputs, config) - - def stop(self): - """Stop the current generation process.""" - self._runner.stop() - - @property - def vocab_size(self) -> int: - """Get the vocabulary size of the model.""" - return self._runner.get_vocab_size() - - @property - def model_path(self) -> str: - """Get the path to the loaded model.""" - return self._model_path - - @property - def tokenizer_path(self) -> str: - """Get the path to the loaded tokenizer.""" - return self._tokenizer_path - - def __repr__(self) -> str: - return ( - f"MultimodalRunner(model='{Path(self._model_path).name}', " - f"tokenizer='{Path(self._tokenizer_path).name}', " - f"vocab_size={self.vocab_size})" - ) - - def __enter__(self): - """Context manager entry.""" - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - """Context manager exit - ensures cleanup.""" - self.stop() - return False - -except ImportError as e: - import warnings - warnings.warn( - f"Failed to import _llm_runner extension: {e}\n" - "Please ensure the extension is built with EXECUTORCH_BUILD_PYBIND=ON", - ImportWarning - ) - # Provide placeholder classes if the extension is not available - class GenerationConfig: - def __init__(self, *args, **kwargs): - raise RuntimeError( - "LLM Runner extension not built. " - "Please rebuild with EXECUTORCH_BUILD_PYBIND=ON" - ) - - class Stats: - def __init__(self, *args, **kwargs): - raise RuntimeError( - "LLM Runner extension not built. " - "Please rebuild with EXECUTORCH_BUILD_PYBIND=ON" - ) - - class MultimodalRunner: - def __init__(self, *args, **kwargs): - raise RuntimeError( - "LLM Runner extension not built. " - "Please rebuild with EXECUTORCH_BUILD_PYBIND=ON" - ) - - class Image: - def __init__(self, *args, **kwargs): - raise RuntimeError( - "LLM Runner extension not built. " - "Please rebuild with EXECUTORCH_BUILD_PYBIND=ON" - ) - - class MultimodalInput: - def __init__(self, *args, **kwargs): - raise RuntimeError( - "LLM Runner extension not built. 
" - "Please rebuild with EXECUTORCH_BUILD_PYBIND=ON" - ) - - def make_text_input(text): - raise RuntimeError( - "LLM Runner extension not built. " - "Please rebuild with EXECUTORCH_BUILD_PYBIND=ON" - ) - - def make_image_input(image): - raise RuntimeError( - "LLM Runner extension not built. " - "Please rebuild with EXECUTORCH_BUILD_PYBIND=ON" + # Try to use cv2 if available + try: + import cv2 + + image = cv2.imread(str(image_path)) + if image is None: + raise ValueError(f"Failed to load image: {image_path}") + # Convert BGR to RGB + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + except ImportError: + raise ImportError( + "Either PIL or OpenCV is required to load images from files. " + "Install with: pip install pillow or pip install opencv-python" + ) + + elif HAS_PIL and isinstance(image, PILImage.Image): + # Convert PIL Image to numpy array + if image.mode != "RGB": + image = image.convert("RGB") + image = np.array(image, dtype=np.uint8) + + elif isinstance(image, np.ndarray): + # Validate numpy array + if image.ndim != 3: + raise ValueError( + f"Image array must be 3-dimensional (H, W, C), got shape {image.shape}" + ) + if image.shape[2] not in [3, 4]: + raise ValueError( + f"Image must have 3 (RGB) or 4 (RGBA) channels, got {image.shape[2]}" + ) + if image.dtype != np.uint8: + # Convert to uint8 if necessary + if image.max() <= 1.0: + # Assume normalized [0, 1] range + image = (image * 255).astype(np.uint8) + else: + image = image.astype(np.uint8) + else: + raise ValueError(f"Unsupported image type: {type(image)}") + + return make_image_input(image) + + def generate( + self, + inputs: List[Any], + config: Optional[GenerationConfig] = None, + token_callback: Optional[Callable[[str], None]] = None, + stats_callback: Optional[Callable[[Any], None]] = None, + ): + """ + Generate text from multimodal inputs with streaming callbacks. + + Args: + inputs: List of multimodal inputs (text, images, etc.) + config: Generation configuration (uses defaults if None) + token_callback: Function called for each generated token + stats_callback: Function called with generation statistics + """ + if config is None: + config = GenerationConfig() + config.temperature = self._default_temperature + + self._runner.generate(inputs, config, token_callback, stats_callback) + + def generate_text( + self, + inputs: List[Any], + config: Optional[GenerationConfig] = None, + max_new_tokens: Optional[int] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + **kwargs, + ) -> str: + """ + Generate text from multimodal inputs and return the complete result. + + Args: + inputs: List of multimodal inputs (text, images, etc.) 
+ config: Generation configuration (overrides other parameters if provided) + max_new_tokens: Maximum number of tokens to generate + temperature: Sampling temperature (0.0 to 1.0) + top_p: Top-p sampling parameter + **kwargs: Additional generation parameters + + Returns: + The generated text as a string + """ + if config is None: + config = GenerationConfig() + config.temperature = temperature or self._default_temperature + if max_new_tokens is not None: + config.max_new_tokens = max_new_tokens + if top_p is not None: + config.top_p = top_p + + # Set any additional parameters + for key, value in kwargs.items(): + if hasattr(config, key): + setattr(config, key, value) + + return self._runner.generate_text(inputs, config) + + def stop(self): + """Stop the current generation process.""" + self._runner.stop() + + @property + def vocab_size(self) -> int: + """Get the vocabulary size of the model.""" + return self._runner.get_vocab_size() + + @property + def model_path(self) -> str: + """Get the path to the loaded model.""" + return self._model_path + + @property + def tokenizer_path(self) -> str: + """Get the path to the loaded tokenizer.""" + return self._tokenizer_path + + def __repr__(self) -> str: + return ( + f"MultimodalRunner(model='{Path(self._model_path).name}', " + f"tokenizer='{Path(self._tokenizer_path).name}', " + f"vocab_size={self.vocab_size})" ) + def __enter__(self): + """Context manager entry.""" + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit - ensures cleanup.""" + self.stop() + return False + + # Import utility functions -from .utils import ( - load_image_from_file, - preprocess_image, - create_generation_config, -) +from .utils import create_generation_config, load_image_from_file, preprocess_image __all__ = [ "MultimodalRunner", "GenerationConfig", "Stats", "Image", - "MultimodalInput", + "MultimodalInput", "make_text_input", "make_image_input", "load_image_from_file", @@ -337,4 +290,4 @@ def make_image_input(image): "create_generation_config", ] -__version__ = "0.1.0" \ No newline at end of file +__version__ = "0.1.0" diff --git a/extension/llm/runner/_llm_runner.pyi b/extension/llm/runner/_llm_runner.pyi new file mode 100644 index 00000000000..97d84b08a0e --- /dev/null +++ b/extension/llm/runner/_llm_runner.pyi @@ -0,0 +1,294 @@ +""" +Type stubs for _llm_runner module. + +This file provides type annotations for the ExecuTorch LLM Runner Python bindings. +""" + +from typing import List, Optional, Callable, Union +import numpy as np +from numpy.typing import NDArray + +class GenerationConfig: + """Configuration for text generation.""" + + echo: bool + """Whether to echo the input prompt in the output.""" + + max_new_tokens: int + """Maximum number of new tokens to generate (-1 for auto).""" + + warming: bool + """Whether this is a warmup run (affects perf benchmarking).""" + + seq_len: int + """Maximum number of total tokens (-1 for auto).""" + + temperature: float + """Temperature for sampling (higher = more random).""" + + num_bos: int + """Number of BOS tokens to add to the prompt.""" + + num_eos: int + """Number of EOS tokens to add to the prompt.""" + + def __init__(self) -> None: + """Initialize GenerationConfig with default values.""" + ... + + def resolve_max_new_tokens(self, max_context_len: int, num_prompt_tokens: int) -> int: + """ + Resolve the maximum number of new tokens to generate based on constraints. 
+ + Args: + max_context_len: The maximum context length supported by the model + num_prompt_tokens: The number of tokens in the input prompt + + Returns: + The resolved maximum number of new tokens to generate + """ + ... + + def __repr__(self) -> str: ... + + +class Stats: + """Statistics for LLM generation performance.""" + + SCALING_FACTOR_UNITS_PER_SECOND: int + """Scaling factor for timestamps (1000 for milliseconds).""" + + model_load_start_ms: int + """Start time of model loading in milliseconds.""" + + model_load_end_ms: int + """End time of model loading in milliseconds.""" + + inference_start_ms: int + """Start time of inference in milliseconds.""" + + token_encode_end_ms: int + """End time of tokenizer encoding in milliseconds.""" + + model_execution_start_ms: int + """Start time of model execution in milliseconds.""" + + model_execution_end_ms: int + """End time of model execution in milliseconds.""" + + prompt_eval_end_ms: int + """End time of prompt evaluation in milliseconds.""" + + first_token_ms: int + """Timestamp when the first generated token is emitted.""" + + inference_end_ms: int + """End time of inference/generation in milliseconds.""" + + aggregate_sampling_time_ms: int + """Total time spent in sampling across all tokens.""" + + num_prompt_tokens: int + """Number of tokens in the input prompt.""" + + num_generated_tokens: int + """Number of tokens generated.""" + + def on_sampling_begin(self) -> None: + """Mark the beginning of a sampling operation.""" + ... + + def on_sampling_end(self) -> None: + """Mark the end of a sampling operation.""" + ... + + def reset(self, all_stats: bool = False) -> None: + """ + Reset statistics. + + Args: + all_stats: If True, reset all stats including model load times. + If False, preserve model load times. + """ + ... + + def to_json_string(self) -> str: + """Convert stats to JSON string representation.""" + ... + + def __repr__(self) -> str: ... + + +class Image: + """Container for image data.""" + + data: List[int] + """Raw image data as a list of uint8 values.""" + + width: int + """Image width in pixels.""" + + height: int + """Image height in pixels.""" + + channels: int + """Number of color channels (3 for RGB, 4 for RGBA).""" + + def __init__(self) -> None: + """Initialize an empty Image.""" + ... + + def __repr__(self) -> str: ... + + +class MultimodalInput: + """Container for multimodal input data (text, image, etc.).""" + + def __init__(self, text: str) -> None: + """ + Create a MultimodalInput with text. + + Args: + text: The input text string + """ + ... + + def __init__(self, image: Image) -> None: + """ + Create a MultimodalInput with an image. + + Args: + image: The input image + """ + ... + + def is_text(self) -> bool: + """Check if this input contains text.""" + ... + + def is_image(self) -> bool: + """Check if this input contains an image.""" + ... + + def get_text(self) -> Optional[str]: + """ + Get the text content if this is a text input. + + Returns: + The text string if this is a text input, None otherwise + """ + ... + + def __repr__(self) -> str: ... + + +class MultimodalRunner: + """Runner for multimodal language models.""" + + def __init__( + self, + model_path: str, + tokenizer_path: str, + data_path: Optional[str] = None + ) -> None: + """ + Initialize a MultimodalRunner. + + Args: + model_path: Path to the model file (.pte) + tokenizer_path: Path to the tokenizer file + data_path: Optional path to additional data file + + Raises: + RuntimeError: If initialization fails + """ + ... 
+ + def generate( + self, + inputs: List[MultimodalInput], + config: GenerationConfig, + token_callback: Optional[Callable[[str], None]] = None, + stats_callback: Optional[Callable[[Stats], None]] = None + ) -> None: + """ + Generate text from multimodal inputs. + + Args: + inputs: List of multimodal inputs (text, images, etc.) + config: Generation configuration + token_callback: Optional callback called for each generated token + stats_callback: Optional callback called with generation statistics + + Raises: + RuntimeError: If generation fails + """ + ... + + def generate_text( + self, + inputs: List[MultimodalInput], + config: GenerationConfig + ) -> str: + """ + Generate text and return the complete result as a string. + + Args: + inputs: List of multimodal inputs (text, images, etc.) + config: Generation configuration + + Returns: + The generated text as a string + + Raises: + RuntimeError: If generation fails + """ + ... + + def stop(self) -> None: + """Stop the current generation process.""" + ... + + def reset(self) -> None: + """Reset the runner state and KV cache.""" + ... + + def get_vocab_size(self) -> int: + """ + Get the vocabulary size of the model. + + Returns: + The vocabulary size, or -1 if not available + """ + ... + + def __repr__(self) -> str: ... + + +def make_text_input(text: str) -> MultimodalInput: + """ + Create a text input for multimodal processing. + + Args: + text: The input text string + + Returns: + A MultimodalInput containing the text + """ + ... + + +def make_image_input(image_array: NDArray[np.uint8]) -> MultimodalInput: + """ + Create an image input from a numpy array. + + Args: + image_array: Numpy array with shape (H, W, C) where C is 3 (RGB) or 4 (RGBA) + + Returns: + A MultimodalInput containing the image + + Raises: + RuntimeError: If the array has invalid dimensions or number of channels + """ + ... \ No newline at end of file diff --git a/extension/llm/runner/llm_runner_helper.h b/extension/llm/runner/llm_runner_helper.h index 191ea3ab090..76f129774cf 100644 --- a/extension/llm/runner/llm_runner_helper.h +++ b/extension/llm/runner/llm_runner_helper.h @@ -121,4 +121,21 @@ ET_EXPERIMENTAL std::unique_ptr create_multimodal_runner( std::unique_ptr<::tokenizers::Tokenizer> tokenizer, std::optional data_path = std::nullopt); +/** + * @brief Creates a MultimodalRunner instance with a shared tokenizer + * + * This overload allows using a tokenizer that is shared/owned by Python or + * other code. The tokenizer must remain valid for the lifetime of the runner. 
+ *
+ * @param model_path Path to the model file
+ * @param tokenizer Shared pointer to an initialized tokenizer instance
+ * @param data_path Optional path to additional .ptd required by the model
+ * @return std::unique_ptr<MultimodalRunner> Initialized MultimodalRunner
+ * instance, or nullptr on failure
+ */
+ET_EXPERIMENTAL std::unique_ptr<MultimodalRunner> create_multimodal_runner(
+    const std::string& model_path,
+    std::shared_ptr<::tokenizers::Tokenizer> tokenizer,
+    std::optional<std::string> data_path = std::nullopt);
+
 } // namespace executorch::extension::llm
diff --git a/extension/llm/runner/pybindings.cpp b/extension/llm/runner/pybindings.cpp
index 567f6322f71..77d1e95c88f 100644
--- a/extension/llm/runner/pybindings.cpp
+++ b/extension/llm/runner/pybindings.cpp
@@ -11,9 +11,9 @@
 #include 
 #include 
-#include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -43,89 +43,26 @@ using namespace executorch::runtime;
 // Python wrapper class for MultimodalRunner
 class PyMultimodalRunner {
  public:
+  // Constructor that takes a tokenizer path
   PyMultimodalRunner(
       const std::string& model_path,
       const std::string& tokenizer_path,
-      float temperature = 0.8f) {
-    // Load tokenizer
-    tokenizer_ = get_tokenizer(tokenizer_path.c_str());
-    if (!tokenizer_) {
-      throw std::runtime_error("Failed to load tokenizer from: " + tokenizer_path);
+      std::optional<std::string> data_path = std::nullopt) {
+    // Load tokenizer using the helper function
+    auto tokenizer =
+        load_tokenizer(tokenizer_path, nullptr, std::nullopt, 0, 0);
+    if (!tokenizer) {
+      throw std::runtime_error(
+          "Failed to load tokenizer from: " + tokenizer_path);
     }
 
-    // Load module
-    module_ = std::make_unique<Module>(model_path, Module::LoadMode::MmapUseMlockIgnoreErrors);
-    Error error = module_->load_method("forward");
-    THROW_IF_ERROR(error, "Failed to load model from: %s", model_path.c_str());
-
-    // Get model type from metadata
-    const auto method_names = module_->method_names();
-    ET_CHECK_MSG(!method_names.empty(), "No methods found in model");
-
-    // Get metadata
-    auto method_meta = module_->method_meta("forward");
-    if (method_meta.ok()) {
-      for (const auto& [key, value] : method_meta.get()) {
-        metadata_[key] = std::stoi(value);
-      }
+    // Create multimodal runner using the helper function
+    runner_ =
+        create_multimodal_runner(model_path, std::move(tokenizer), data_path);
+    if (!runner_) {
+      throw std::runtime_error(
+          "Failed to create multimodal runner with model: " + model_path);
     }
-
-    // Set up sampler
-    int32_t vocab_size = get_vocab_size();
-    sampler_ = std::make_unique<Sampler>(
-        vocab_size,
-        temperature,
-        0.9f, // top_p
-        0LL // seed
-    );
-
-    // Create components
-    stats_ = std::make_unique<Stats>(metadata_);
-
-    // Create text decoder runner
-    text_decoder_runner_ = std::make_unique<TextDecoderRunner>(
-        module_.get(),
-        metadata_
-    );
-
-    // Create multimodal prefiller
-    multimodal_prefiller_ = std::make_unique<MultimodalPrefiller>(
-        module_.get(),
-        metadata_
-    );
-
-    // Create IO manager
-    io_manager_ = std::make_unique<IOManager>(
-        module_.get(),
-        tokenizer_.get(),
-        text_decoder_runner_.get(),
-        multimodal_prefiller_.get(),
-        sampler_.get(),
-        stats_.get(),
-        metadata_
-    );
-
-    // Create text token generator
-    text_token_generator_ = std::make_unique<TextTokenGenerator>(
-        tokenizer_.get(),
-        sampler_.get(),
-        text_decoder_runner_.get(),
-        false, // echo
-        stats_.get(),
-        false // warming
-    );
-
-    // Finally create the runner
-    runner_ = std::make_unique<MultimodalRunner>(
-        metadata_,
-        std::move(tokenizer_),
-        std::move(module_),
-        std::move(text_decoder_runner_),
-        std::move(multimodal_prefiller_),
-        std::move(io_manager_),
-        std::move(text_token_generator_),
-        std::move(stats_)
-    );
   }
 
   void generate(
@@ -133,7 +70,10 @@ class PyMultimodalRunner {
       const GenerationConfig& config,
       py::object token_callback = py::none(),
       py::object stats_callback = py::none()) {
-
+    if (!runner_) {
+      throw std::runtime_error("Runner not initialized");
+    }
+
     // Convert Python callbacks to C++ std::function
     std::function<void(const std::string&)> cpp_token_callback = nullptr;
     if (!token_callback.is_none()) {
@@ -160,83 +100,30 @@
     }
   }
 
-  std::string generate_text(
-      const std::vector<MultimodalInput>& inputs,
-      const GenerationConfig& config) {
-    std::string result;
-
-    std::function<void(const std::string&)> token_callback =
-        [&result](const std::string& token) {
-          result += token;
-        };
-
-    std::function<void(const Stats&)> stats_callback = nullptr;
-
-    {
-      py::gil_scoped_release release;
-      Error error = runner_->generate(
-          inputs, config, token_callback, stats_callback);
-      THROW_IF_ERROR(error, "Generation failed");
+  void stop() {
+    if (runner_) {
+      runner_->stop();
     }
-
-    return result;
   }
 
-  void stop() {
-    runner_->stop();
+  void reset() {
+    if (runner_) {
+      runner_->reset();
+    }
   }
 
+  // Note: Since the runner owns the tokenizer and metadata after creation,
+  // we cannot directly access them. This is a limitation of the current design.
+  // For now, we'll return a placeholder value.
   int32_t get_vocab_size() const {
-    auto it = metadata_.find("vocab_size");
-    if (it != metadata_.end()) {
-      return static_cast<int32_t>(it->second);
-    }
-    // Default vocab size if not in metadata
-    return tokenizer_->vocab_size();
+    // TODO: Consider exposing metadata through the MultimodalRunner interface
+    return -1; // Indicate that vocab size is not available
   }
 
 private:
  std::unique_ptr<MultimodalRunner> runner_;
-  std::unique_ptr<::tokenizers::Tokenizer> tokenizer_;
-  std::unique_ptr<Module> module_;
-  std::unique_ptr<TextDecoderRunner> text_decoder_runner_;
-  std::unique_ptr<MultimodalPrefiller> multimodal_prefiller_;
-  std::unique_ptr<IOManager> io_manager_;
-  std::unique_ptr<TextTokenGenerator> text_token_generator_;
-  std::unique_ptr<Stats> stats_;
-  std::unique_ptr<Sampler> sampler_;
-  std::unordered_map<std::string, int64_t> metadata_;
 };
 
-// Helper functions for creating MultimodalInput
-MultimodalInput make_text_input(const std::string& text) {
-  return MultimodalInput::text(text);
-}
-
-MultimodalInput make_image_input(py::array_t<uint8_t> image_array) {
-  // Get image dimensions
-  py::buffer_info buf = image_array.request();
-
-  if (buf.ndim != 3) {
-    throw std::runtime_error("Image array must be 3-dimensional (H, W, C)");
-  }
-
-  size_t height = buf.shape[0];
-  size_t width = buf.shape[1];
-  size_t channels = buf.shape[2];
-
-  if (channels != 3 && channels != 4) {
-    throw std::runtime_error("Image must have 3 (RGB) or 4 (RGBA) channels");
-  }
-
-  // Create Image object from numpy array
-  uint8_t* data = static_cast<uint8_t*>(buf.ptr);
-  std::vector<uint8_t> image_data(data, data + height * width * channels);
-
-  Image image(std::move(image_data), height, width, channels);
-  return MultimodalInput::image(std::move(image));
-}
-
 PYBIND11_MODULE(_llm_runner, m) {
   m.doc() = "Python bindings for ExecuTorch LLM Runners";
 
@@ -246,117 +133,188 @@
   // Bind GenerationConfig
   py::class_<GenerationConfig>(m, "GenerationConfig")
       .def(py::init<>())
+      .def_readwrite("echo", &GenerationConfig::echo)
       .def_readwrite("max_new_tokens", &GenerationConfig::max_new_tokens)
-      .def_readwrite("temperature", &GenerationConfig::temperature)
-      .def_readwrite("top_p", &GenerationConfig::top_p)
-      .def_readwrite("top_k", &GenerationConfig::top_k)
-      .def_readwrite("repetition_penalty", &GenerationConfig::repetition_penalty)
-      .def_readwrite("presence_penalty", &GenerationConfig::presence_penalty)
-      .def_readwrite("frequency_penalty", &GenerationConfig::frequency_penalty)
       .def_readwrite("warming", &GenerationConfig::warming)
-      .def_readwrite("echo", &GenerationConfig::echo)
-      .def_readwrite("seed", &GenerationConfig::seed)
+      .def_readwrite("seq_len", &GenerationConfig::seq_len)
+      .def_readwrite("temperature", &GenerationConfig::temperature)
+      .def_readwrite("num_bos", &GenerationConfig::num_bos)
+      .def_readwrite("num_eos", &GenerationConfig::num_eos)
+      .def(
+          "resolve_max_new_tokens",
+          &GenerationConfig::resolve_max_new_tokens,
+          py::arg("max_context_len"),
+          py::arg("num_prompt_tokens"),
+          "Resolve the maximum number of new tokens to generate based on constraints")
       .def("__repr__", [](const GenerationConfig& config) {
-        return "";
+        return "<GenerationConfig(echo=" +
+            std::string(config.echo ? "True" : "False") +
+            ", max_new_tokens=" + std::to_string(config.max_new_tokens) +
+            ", warming=" + std::string(config.warming ? "True" : "False") +
+            ", seq_len=" + std::to_string(config.seq_len) +
+            ", temperature=" + std::to_string(config.temperature) + ")>";
      });
 
   // Bind Stats
   py::class_<Stats>(m, "Stats")
+      .def_readonly(
+          "SCALING_FACTOR_UNITS_PER_SECOND",
+          &Stats::SCALING_FACTOR_UNITS_PER_SECOND)
       .def_readonly("model_load_start_ms", &Stats::model_load_start_ms)
       .def_readonly("model_load_end_ms", &Stats::model_load_end_ms)
       .def_readonly("inference_start_ms", &Stats::inference_start_ms)
-      .def_readonly("inference_end_ms", &Stats::inference_end_ms)
-      .def_readonly("prompt_eval_start_ms", &Stats::prompt_eval_start_ms)
+      .def_readonly("token_encode_end_ms", &Stats::token_encode_end_ms)
+      .def_readonly(
+          "model_execution_start_ms", &Stats::model_execution_start_ms)
+      .def_readonly("model_execution_end_ms", &Stats::model_execution_end_ms)
       .def_readonly("prompt_eval_end_ms", &Stats::prompt_eval_end_ms)
       .def_readonly("first_token_ms", &Stats::first_token_ms)
-      .def_readonly("aggregate_sampling_time_ms", &Stats::aggregate_sampling_time_ms)
+      .def_readonly("inference_end_ms", &Stats::inference_end_ms)
+      .def_readonly(
+          "aggregate_sampling_time_ms", &Stats::aggregate_sampling_time_ms)
       .def_readonly("num_prompt_tokens", &Stats::num_prompt_tokens)
       .def_readonly("num_generated_tokens", &Stats::num_generated_tokens)
-      .def("get_model_load_time_ms", &Stats::get_model_load_time_ms)
-      .def("get_inference_time_ms", &Stats::get_inference_time_ms)
-      .def("get_prompt_eval_time_ms", &Stats::get_prompt_eval_time_ms)
-      .def("get_eval_time_ms", &Stats::get_eval_time_ms)
-      .def("get_sampling_time_ms", &Stats::get_sampling_time_ms)
-      .def("get_tokens_per_second", &Stats::get_tokens_per_second)
+      .def("on_sampling_begin", &Stats::on_sampling_begin)
+      .def("on_sampling_end", &Stats::on_sampling_end)
+      .def(
+          "reset",
+          &Stats::reset,
+          py::arg("all_stats") = false,
+          "Reset stats, optionally including model load times")
+      .def(
+          "to_json_string",
+          [](const Stats& stats) { return stats_to_json_string(stats); },
+          "Convert stats to JSON string representation")
       .def("__repr__", [](const Stats& stats) {
-        return "";
+        double tokens_per_second = 0.0;
+        if (stats.inference_end_ms > stats.inference_start_ms) {
+          tokens_per_second = static_cast<double>(stats.num_generated_tokens) *
+              stats.SCALING_FACTOR_UNITS_PER_SECOND /
+              (stats.inference_end_ms - stats.inference_start_ms);
+        }
+        return "<Stats(num_prompt_tokens=" +
+            std::to_string(stats.num_prompt_tokens) +
+            ", num_generated_tokens=" +
+            std::to_string(stats.num_generated_tokens) +
+            ", tokens_per_second=" + std::to_string(tokens_per_second) +
+            ")>";
       });
 
   // Bind Image class
   py::class_<Image>(m, "Image")
-      .def(py::init<std::vector<uint8_t>, size_t, size_t, size_t>(),
-           py::arg("data"), py::arg("height"), py::arg("width"), py::arg("channels"))
-      .def_property_readonly("height", [](const Image& img) { return img.height_; })
-      .def_property_readonly("width", [](const Image& img) { return img.width_; })
-      .def_property_readonly("channels", [](const Image& img) { return img.channels_; })
+      .def(py::init<>())
+      .def_readwrite("data", &Image::data)
+      .def_readwrite("width", &Image::width)
+      .def_readwrite("height", &Image::height)
+      .def_readwrite("channels", &Image::channels)
       .def("__repr__", [](const Image& img) {
-        return "";
+        return "<Image(height=" + std::to_string(img.height) +
+            ", width=" + std::to_string(img.width) +
+            ", channels=" + std::to_string(img.channels) + ")>";
       });
 
   // Bind MultimodalInput
   py::class_<MultimodalInput>(m, "MultimodalInput")
-      .def_static("text", &MultimodalInput::text,
-                  "Create a text input", py::arg("text"))
-      .def_static("image", &MultimodalInput::image,
-                  "Create an image input", py::arg("image"))
+      .def(
+          py::init<const std::string&>(),
+          py::arg("text"),
+          "Create a MultimodalInput with text")
+      .def(
+          py::init<const Image&>(),
+          py::arg("image"),
+          "Create a MultimodalInput with an image")
       .def("is_text", &MultimodalInput::is_text)
       .def("is_image", &MultimodalInput::is_image)
-      .def("get_text", [](const MultimodalInput& input) -> py::object {
+      .def(
+          "get_text",
+          [](const MultimodalInput& input) -> py::object {
+            if (input.is_text()) {
+              return py::cast(input.get_text());
+            }
+            return py::none();
+          })
+      .def("__repr__", [](const MultimodalInput& input) -> std::string {
         if (input.is_text()) {
-          return py::cast(input.get_text());
-        }
-        return py::none();
-      })
-      .def("__repr__", [](const MultimodalInput& input) {
-        if (input.is_text()) {
-          return "<MultimodalInput(type=text, text=\"" + input.get_text().substr(0, 50) + (input.get_text().size() > 50 ? "..." : "") + "\">";
+          return "<MultimodalInput(type=text, text=\"" + input.get_text().substr(0, 50) + (input.get_text().size() > 50 ? "..." : "") + "\">";
         } else if (input.is_image()) {
           return "<MultimodalInput(type=image)>";
         }
         return "<MultimodalInput(type=unknown)>";
       });
 
-  // Bind helper functions
-  m.def("make_text_input", &make_text_input,
-        "Create a text input for multimodal processing",
-        py::arg("text"));
-
-  m.def("make_image_input", &make_image_input,
-        "Create an image input from a numpy array (H, W, C)",
-        py::arg("image_array"));
+  // Bind helper functions using lambdas
+  m.def(
+      "make_text_input",
+      [](const std::string& text) -> MultimodalInput {
+        return MultimodalInput(text);
+      },
+      "Create a text input for multimodal processing",
+      py::arg("text"));
+
+  m.def(
+      "make_image_input",
+      [](py::array_t<uint8_t> image_array) -> MultimodalInput {
+        // Get image dimensions
+        py::buffer_info buf = image_array.request();
+
+        if (buf.ndim != 3) {
+          throw std::runtime_error(
+              "Image array must be 3-dimensional (H, W, C)");
+        }
+
+        size_t height = buf.shape[0];
+        size_t width = buf.shape[1];
+        size_t channels = buf.shape[2];
+
+        if (channels != 3 && channels != 4) {
+          throw std::runtime_error(
+              "Image must have 3 (RGB) or 4 (RGBA) channels");
+        }
+
+        // Create Image object from numpy array
+        uint8_t* data = static_cast<uint8_t*>(buf.ptr);
+        std::vector<uint8_t> image_data(data, data + height * width * channels);
+
+        Image image;
+        image.data = std::move(image_data);
+        image.width = static_cast<int32_t>(width);
+        image.height = static_cast<int32_t>(height);
+        image.channels = static_cast<int32_t>(channels);
+        return MultimodalInput(std::move(image));
+      },
+      "Create an image input from a numpy array (H, W, C)",
+      py::arg("image_array"));
 
   // Bind PyMultimodalRunner
   py::class_<PyMultimodalRunner>(m, "MultimodalRunner")
-      .def(py::init<const std::string&, const std::string&, float>(),
-           py::arg("model_path"),
-           py::arg("tokenizer_path"),
-           py::arg("temperature") = 0.8f,
-           "Initialize a MultimodalRunner with model and tokenizer paths")
-      .def("generate", &PyMultimodalRunner::generate,
-           py::arg("inputs"),
-           py::arg("config"),
-           py::arg("token_callback") = py::none(),
-           py::arg("stats_callback") = py::none(),
-           "Generate text from multimodal inputs with optional callbacks")
-      .def("generate_text", &PyMultimodalRunner::generate_text,
-           py::arg("inputs"),
-           py::arg("config"),
-           "Generate text and return the complete result as a string")
-      .def("stop", &PyMultimodalRunner::stop,
-           "Stop the current generation")
-      .def("get_vocab_size", &PyMultimodalRunner::get_vocab_size,
-           "Get the vocabulary size of the model")
+      // Constructor with tokenizer path
+      .def(
+          py::init<
+              const std::string&,
+              const std::string&,
+              std::optional<std::string>>(),
+          py::arg("model_path"),
+          py::arg("tokenizer_path"),
+          py::arg("data_path") = py::none(),
+          "Initialize a MultimodalRunner with model and tokenizer paths")
+      .def(
+          "generate",
+          &PyMultimodalRunner::generate,
+          py::arg("inputs"),
+          py::arg("config"),
+          py::arg("token_callback") = py::none(),
+          py::arg("stats_callback") = py::none(),
+          "Generate text from multimodal inputs with optional callbacks")
+      .def("stop", &PyMultimodalRunner::stop, "Stop the current generation")
+      .def(
+          "reset",
+          &PyMultimodalRunner::reset,
+          "Reset the runner state and KV cache")
+      .def(
+          "get_vocab_size",
+          &PyMultimodalRunner::get_vocab_size,
+          "Get the vocabulary size of the model")
       .def("__repr__", [](const PyMultimodalRunner& runner) {
-        return "";
+        return "<MultimodalRunner>";
       });
 }
\ No newline at end of file
diff --git a/extension/llm/runner/test_pybindings.py b/extension/llm/runner/test_pybindings.py
new file mode 100644
index 00000000000..f914a785e70
--- /dev/null
+++ b/extension/llm/runner/test_pybindings.py
@@ -0,0 +1,413 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Unit tests for the ExecuTorch LLM Runner Python bindings.
+
+To run these tests:
+    python -m pytest test_pybindings.py -v
+"""
+
+import unittest
+import tempfile
+import numpy as np
+import os
+import sys
+from unittest.mock import Mock, patch, MagicMock
+
+# Try to import the module
+try:
+    import _llm_runner
+except ImportError:
+    print("Warning: _llm_runner module not found. Make sure it's built and in PYTHONPATH.")
+    sys.exit(1)
+
+
+class TestGenerationConfig(unittest.TestCase):
+    """Test the GenerationConfig class."""
+
+    def test_default_values(self):
+        """Test that GenerationConfig has correct default values."""
+        config = _llm_runner.GenerationConfig()
+
+        # Check defaults based on irunner.h
+        self.assertEqual(config.echo, True)
+        self.assertEqual(config.max_new_tokens, -1)
+        self.assertEqual(config.warming, False)
+        self.assertEqual(config.seq_len, -1)
+        self.assertAlmostEqual(config.temperature, 0.8, places=5)
+        self.assertEqual(config.num_bos, 0)
+        self.assertEqual(config.num_eos, 0)
+
+    def test_set_values(self):
+        """Test setting values on GenerationConfig."""
+        config = _llm_runner.GenerationConfig()
+
+        config.echo = False
+        config.max_new_tokens = 100
+        config.warming = True
+        config.seq_len = 512
+        config.temperature = 0.5
+        config.num_bos = 1
+        config.num_eos = 2
+
+        self.assertEqual(config.echo, False)
+        self.assertEqual(config.max_new_tokens, 100)
+        self.assertEqual(config.warming, True)
+        self.assertEqual(config.seq_len, 512)
+        self.assertAlmostEqual(config.temperature, 0.5, places=5)
+        self.assertEqual(config.num_bos, 1)
+        self.assertEqual(config.num_eos, 2)
+
+    def test_resolve_max_new_tokens(self):
+        """Test the resolve_max_new_tokens method."""
+        config = _llm_runner.GenerationConfig()
+
+        # Test case 1: Both seq_len and max_new_tokens are -1
+        config.seq_len = -1
+        config.max_new_tokens = -1
+        result = config.resolve_max_new_tokens(1024, 100)
+        self.assertEqual(result, 924)  # 1024 - 100
+
+        # Test case 2: Only max_new_tokens is specified
+        config.seq_len = -1
+        config.max_new_tokens = 200
+        result = config.resolve_max_new_tokens(1024, 100)
+        self.assertEqual(result, 200)  # min(200, 1024-100)
+
+        # Test case 3: Only seq_len is specified
+        config.seq_len = 512
+        
config.max_new_tokens = -1 + result = config.resolve_max_new_tokens(1024, 100) + self.assertEqual(result, 412) # min(512, 1024) - 100 + + # Test case 4: Both are specified + config.seq_len = 512 + config.max_new_tokens = 200 + result = config.resolve_max_new_tokens(1024, 100) + self.assertEqual(result, 200) # min(min(512, 1024) - 100, 200) + + # Test case 5: Result would be negative + config.seq_len = 50 + config.max_new_tokens = -1 + result = config.resolve_max_new_tokens(1024, 100) + self.assertEqual(result, 0) # max(0, 50 - 100) + + def test_repr(self): + """Test the string representation.""" + config = _llm_runner.GenerationConfig() + config.max_new_tokens = 100 + config.seq_len = 512 + config.temperature = 0.7 + + repr_str = repr(config) + self.assertIn("GenerationConfig", repr_str) + self.assertIn("max_new_tokens=100", repr_str) + self.assertIn("seq_len=512", repr_str) + self.assertIn("temperature=0.7", repr_str) + self.assertIn("echo=True", repr_str) + self.assertIn("warming=False", repr_str) + + +class TestStats(unittest.TestCase): + """Test the Stats class.""" + + def test_attributes(self): + """Test that Stats has all expected attributes.""" + stats = _llm_runner.Stats() + + # Check all timing attributes exist + self.assertTrue(hasattr(stats, 'SCALING_FACTOR_UNITS_PER_SECOND')) + self.assertTrue(hasattr(stats, 'model_load_start_ms')) + self.assertTrue(hasattr(stats, 'model_load_end_ms')) + self.assertTrue(hasattr(stats, 'inference_start_ms')) + self.assertTrue(hasattr(stats, 'token_encode_end_ms')) + self.assertTrue(hasattr(stats, 'model_execution_start_ms')) + self.assertTrue(hasattr(stats, 'model_execution_end_ms')) + self.assertTrue(hasattr(stats, 'prompt_eval_end_ms')) + self.assertTrue(hasattr(stats, 'first_token_ms')) + self.assertTrue(hasattr(stats, 'inference_end_ms')) + self.assertTrue(hasattr(stats, 'aggregate_sampling_time_ms')) + self.assertTrue(hasattr(stats, 'num_prompt_tokens')) + self.assertTrue(hasattr(stats, 'num_generated_tokens')) + + def test_scaling_factor(self): + """Test the scaling factor constant.""" + stats = _llm_runner.Stats() + self.assertEqual(stats.SCALING_FACTOR_UNITS_PER_SECOND, 1000) + + def test_methods(self): + """Test Stats methods.""" + stats = _llm_runner.Stats() + + # Test on_sampling_begin and on_sampling_end + stats.on_sampling_begin() + stats.on_sampling_end() + + # Test reset without all_stats + stats.model_load_start_ms = 100 + stats.model_load_end_ms = 200 + stats.inference_start_ms = 300 + stats.num_prompt_tokens = 10 + stats.num_generated_tokens = 20 + + stats.reset(False) + + # Model load times should be preserved + self.assertEqual(stats.model_load_start_ms, 100) + self.assertEqual(stats.model_load_end_ms, 200) + # Other stats should be reset + self.assertEqual(stats.inference_start_ms, 0) + self.assertEqual(stats.num_prompt_tokens, 0) + self.assertEqual(stats.num_generated_tokens, 0) + + # Test reset with all_stats + stats.reset(True) + self.assertEqual(stats.model_load_start_ms, 0) + self.assertEqual(stats.model_load_end_ms, 0) + + def test_to_json_string(self): + """Test JSON string conversion.""" + stats = _llm_runner.Stats() + stats.num_prompt_tokens = 10 + stats.num_generated_tokens = 20 + stats.model_load_start_ms = 100 + stats.model_load_end_ms = 200 + stats.inference_start_ms = 300 + stats.inference_end_ms = 1300 + + json_str = stats.to_json_string() + self.assertIn('"prompt_tokens":10', json_str) + self.assertIn('"generated_tokens":20', json_str) + self.assertIn('"model_load_start_ms":100', json_str) + 
self.assertIn('"model_load_end_ms":200', json_str) + + def test_repr(self): + """Test string representation.""" + stats = _llm_runner.Stats() + stats.num_prompt_tokens = 10 + stats.num_generated_tokens = 20 + stats.inference_start_ms = 1000 + stats.inference_end_ms = 2000 + + repr_str = repr(stats) + self.assertIn("Stats", repr_str) + self.assertIn("num_prompt_tokens=10", repr_str) + self.assertIn("num_generated_tokens=20", repr_str) + self.assertIn("tokens_per_second=20", repr_str) # 20 tokens / 1 second + + +class TestImage(unittest.TestCase): + """Test the Image class.""" + + def test_creation(self): + """Test creating an Image object.""" + image = _llm_runner.Image() + + # Set properties + image.data = [1, 2, 3, 4] + image.width = 2 + image.height = 2 + image.channels = 1 + + self.assertEqual(image.data, [1, 2, 3, 4]) + self.assertEqual(image.width, 2) + self.assertEqual(image.height, 2) + self.assertEqual(image.channels, 1) + + def test_repr(self): + """Test string representation.""" + image = _llm_runner.Image() + image.width = 640 + image.height = 480 + image.channels = 3 + + repr_str = repr(image) + self.assertIn("Image", repr_str) + self.assertIn("height=480", repr_str) + self.assertIn("width=640", repr_str) + self.assertIn("channels=3", repr_str) + + +class TestMultimodalInput(unittest.TestCase): + """Test the MultimodalInput class.""" + + def test_text_input(self): + """Test creating a text MultimodalInput.""" + # Test direct constructor + text_input = _llm_runner.MultimodalInput("Hello, world!") + self.assertTrue(text_input.is_text()) + self.assertFalse(text_input.is_image()) + self.assertEqual(text_input.get_text(), "Hello, world!") + + # Test helper function + text_input2 = _llm_runner.make_text_input("Test text") + self.assertTrue(text_input2.is_text()) + self.assertEqual(text_input2.get_text(), "Test text") + + def test_image_input(self): + """Test creating an image MultimodalInput.""" + # Create an image + image = _llm_runner.Image() + image.data = [255] * (100 * 100 * 3) + image.width = 100 + image.height = 100 + image.channels = 3 + + # Test direct constructor + image_input = _llm_runner.MultimodalInput(image) + self.assertTrue(image_input.is_image()) + self.assertFalse(image_input.is_text()) + + # Test helper function with numpy array + img_array = np.ones((50, 60, 3), dtype=np.uint8) * 128 + image_input2 = _llm_runner.make_image_input(img_array) + self.assertTrue(image_input2.is_image()) + self.assertFalse(image_input2.is_text()) + + def test_invalid_image_array(self): + """Test error handling for invalid image arrays.""" + # Wrong dimensions + with self.assertRaises(RuntimeError) as cm: + _llm_runner.make_image_input(np.ones((100,), dtype=np.uint8)) + self.assertIn("3-dimensional", str(cm.exception)) + + # Wrong number of channels + with self.assertRaises(RuntimeError) as cm: + _llm_runner.make_image_input(np.ones((100, 100, 2), dtype=np.uint8)) + self.assertIn("3 (RGB) or 4 (RGBA)", str(cm.exception)) + + def test_repr(self): + """Test string representation.""" + # Text input + text_input = _llm_runner.MultimodalInput("This is a test") + repr_str = repr(text_input) + self.assertIn("MultimodalInput", repr_str) + self.assertIn("type=text", repr_str) + self.assertIn("This is a test", repr_str) + + # Long text input (should be truncated) + long_text = "a" * 100 + text_input2 = _llm_runner.MultimodalInput(long_text) + repr_str2 = repr(text_input2) + self.assertIn("...", repr_str2) + + # Image input + image = _llm_runner.Image() + image_input = 
_llm_runner.MultimodalInput(image) + repr_str3 = repr(image_input) + self.assertIn("type=image", repr_str3) + + +class TestMultimodalRunner(unittest.TestCase): + """Test the MultimodalRunner class.""" + + def setUp(self): + """Set up test fixtures.""" + # Create temporary files for testing + self.temp_dir = tempfile.mkdtemp() + self.model_path = os.path.join(self.temp_dir, "model.pte") + self.tokenizer_path = os.path.join(self.temp_dir, "tokenizer.bin") + + # Create dummy files (these won't actually work, but we can test initialization failure) + with open(self.model_path, 'wb') as f: + f.write(b"dummy model") + with open(self.tokenizer_path, 'wb') as f: + f.write(b"dummy tokenizer") + + def tearDown(self): + """Clean up test fixtures.""" + import shutil + shutil.rmtree(self.temp_dir, ignore_errors=True) + + def test_initialization_failure(self): + """Test that initialization fails gracefully with invalid files.""" + with self.assertRaises(RuntimeError) as cm: + runner = _llm_runner.MultimodalRunner( + self.model_path, + self.tokenizer_path + ) + # Should fail because the tokenizer file is not valid + self.assertIn("Failed to", str(cm.exception)) + + +class TestHelperFunctions(unittest.TestCase): + """Test helper functions.""" + + def test_make_text_input(self): + """Test make_text_input helper.""" + text_input = _llm_runner.make_text_input("Hello") + self.assertTrue(text_input.is_text()) + self.assertEqual(text_input.get_text(), "Hello") + + def test_make_image_input(self): + """Test make_image_input helper.""" + # Create a test image array (RGB) + img_array = np.zeros((100, 150, 3), dtype=np.uint8) + img_array[:, :, 0] = 255 # Red channel + + image_input = _llm_runner.make_image_input(img_array) + self.assertTrue(image_input.is_image()) + + # Test with RGBA + img_array_rgba = np.ones((50, 50, 4), dtype=np.uint8) * 128 + image_input_rgba = _llm_runner.make_image_input(img_array_rgba) + self.assertTrue(image_input_rgba.is_image()) + + +class TestIntegration(unittest.TestCase): + """Integration tests for the module.""" + + def test_module_attributes(self): + """Test that the module has expected attributes.""" + # Classes + self.assertTrue(hasattr(_llm_runner, 'GenerationConfig')) + self.assertTrue(hasattr(_llm_runner, 'Stats')) + self.assertTrue(hasattr(_llm_runner, 'Image')) + self.assertTrue(hasattr(_llm_runner, 'MultimodalInput')) + self.assertTrue(hasattr(_llm_runner, 'MultimodalRunner')) + + # Helper functions + self.assertTrue(hasattr(_llm_runner, 'make_text_input')) + self.assertTrue(hasattr(_llm_runner, 'make_image_input')) + + def test_workflow_simulation(self): + """Test a simulated workflow (without actual model).""" + # Create configuration + config = _llm_runner.GenerationConfig() + config.max_new_tokens = 50 + config.temperature = 0.7 + config.echo = False + + # Create inputs + inputs = [] + + # Add text input + text = "Describe this image in detail:" + inputs.append(_llm_runner.make_text_input(text)) + + # Add image input + image_array = np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8) + inputs.append(_llm_runner.make_image_input(image_array)) + + # Verify inputs + self.assertEqual(len(inputs), 2) + self.assertTrue(inputs[0].is_text()) + self.assertTrue(inputs[1].is_image()) + self.assertEqual(inputs[0].get_text(), text) + + # Test Stats + stats = _llm_runner.Stats() + stats.num_prompt_tokens = 15 + stats.num_generated_tokens = 45 + stats.inference_start_ms = 1000 + stats.inference_end_ms = 3000 + + json_output = stats.to_json_string() + 
self.assertIsInstance(json_output, str) + self.assertIn("prompt_tokens", json_output) + self.assertIn("generated_tokens", json_output) \ No newline at end of file diff --git a/extension/llm/runner/utils.py b/extension/llm/runner/utils.py index 35a3db11a3d..af6b19a94e4 100644 --- a/extension/llm/runner/utils.py +++ b/extension/llm/runner/utils.py @@ -11,12 +11,14 @@ configuration creation, and data conversion. """ -from typing import Union, Tuple, Optional, Dict, Any -import numpy as np from pathlib import Path +from typing import Any, Optional, Tuple, Union + +import numpy as np try: from PIL import Image as PILImage + HAS_PIL = True except ImportError: HAS_PIL = False @@ -27,19 +29,19 @@ def load_image_from_file( image_path: Union[str, Path], target_size: Optional[Tuple[int, int]] = None, - mode: str = 'RGB' + mode: str = "RGB", ) -> np.ndarray: """ Load an image from file and optionally resize it. - + Args: image_path: Path to the image file target_size: Optional (width, height) tuple to resize the image mode: Image mode ('RGB', 'RGBA', 'L' for grayscale) - + Returns: NumPy array with shape (H, W, C) for color or (H, W) for grayscale - + Raises: FileNotFoundError: If the image file doesn't exist ImportError: If neither PIL nor OpenCV is available @@ -48,47 +50,47 @@ def load_image_from_file( image_path = Path(image_path) if not image_path.exists(): raise FileNotFoundError(f"Image file not found: {image_path}") - + if HAS_PIL: # Use PIL/Pillow image = PILImage.open(image_path) - + # Convert to requested mode if image.mode != mode: image = image.convert(mode) - + # Resize if requested if target_size is not None: image = image.resize(target_size, PILImage.Resampling.LANCZOS) - + # Convert to numpy array return np.array(image, dtype=np.uint8) else: # Try OpenCV try: import cv2 - + # Read image - if mode == 'L': + if mode == "L": image = cv2.imread(str(image_path), cv2.IMREAD_GRAYSCALE) else: image = cv2.imread(str(image_path), cv2.IMREAD_COLOR) - + if image is None: raise ValueError(f"Failed to load image: {image_path}") - + # Convert BGR to RGB if needed - if mode == 'RGB' and len(image.shape) == 3: + if mode == "RGB" and len(image.shape) == 3: image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - elif mode == 'RGBA' and len(image.shape) == 3: + elif mode == "RGBA" and len(image.shape) == 3: image = cv2.cvtColor(image, cv2.COLOR_BGR2RGBA) - + # Resize if requested if target_size is not None: image = cv2.resize(image, target_size, interpolation=cv2.INTER_LANCZOS4) - + return image.astype(np.uint8) - + except ImportError: raise ImportError( "Either PIL or OpenCV is required to load images from files. " @@ -101,27 +103,29 @@ def preprocess_image( target_size: Optional[Tuple[int, int]] = None, normalize: bool = False, mean: Optional[Tuple[float, float, float]] = None, - std: Optional[Tuple[float, float, float]] = None + std: Optional[Tuple[float, float, float]] = None, ) -> np.ndarray: """ Preprocess an image array for model input. 
- + Args: image: Input image as numpy array (H, W, C) target_size: Optional (width, height) tuple to resize the image normalize: Whether to normalize pixel values to [0, 1] mean: Mean values for normalization (per channel) std: Standard deviation values for normalization (per channel) - + Returns: Preprocessed image array - + Raises: ValueError: If image dimensions are invalid """ if image.ndim != 3: - raise ValueError(f"Image must be 3-dimensional (H, W, C), got shape {image.shape}") - + raise ValueError( + f"Image must be 3-dimensional (H, W, C), got shape {image.shape}" + ) + # Resize if needed if target_size is not None: if HAS_PIL: @@ -133,28 +137,34 @@ def preprocess_image( # Try OpenCV try: import cv2 + image = cv2.resize(image, target_size, interpolation=cv2.INTER_LANCZOS4) except ImportError: # Simple nearest neighbor resize as fallback from scipy import ndimage - factors = (target_size[1] / image.shape[0], target_size[0] / image.shape[1], 1) + + factors = ( + target_size[1] / image.shape[0], + target_size[0] / image.shape[1], + 1, + ) image = ndimage.zoom(image, factors, order=1) - + # Convert to float for normalization if normalize or mean is not None or std is not None: image = image.astype(np.float32) - + if normalize: image = image / 255.0 - + if mean is not None: mean_arr = np.array(mean).reshape(1, 1, -1) image = image - mean_arr - + if std is not None: std_arr = np.array(std).reshape(1, 1, -1) image = image / std_arr - + return image @@ -168,11 +178,11 @@ def create_generation_config( frequency_penalty: float = 0.0, echo: bool = False, seed: Optional[int] = None, - **kwargs + **kwargs, ) -> GenerationConfig: """ Create a GenerationConfig with sensible defaults. - + Args: max_new_tokens: Maximum number of tokens to generate (default: 1000) temperature: Sampling temperature, higher = more random (default: 0.8) @@ -184,10 +194,10 @@ def create_generation_config( echo: Whether to echo the input prompt (default: False) seed: Random seed for reproducibility (default: None) **kwargs: Additional parameters to set on the config - + Returns: A configured GenerationConfig object - + Example: >>> config = create_generation_config( ... max_new_tokens=100, @@ -196,7 +206,7 @@ def create_generation_config( ... ) """ config = GenerationConfig() - + # Set all parameters config.max_new_tokens = max_new_tokens config.temperature = temperature @@ -206,72 +216,31 @@ def create_generation_config( config.presence_penalty = presence_penalty config.frequency_penalty = frequency_penalty config.echo = echo - + if seed is not None: config.seed = seed - + # Set any additional parameters for key, value in kwargs.items(): if hasattr(config, key): setattr(config, key, value) else: raise ValueError(f"GenerationConfig has no parameter '{key}'") - - return config - -def batch_generate( - runner: 'MultimodalRunner', - batch_inputs: list, - config: Optional[GenerationConfig] = None, - show_progress: bool = True -) -> list: - """ - Generate text for multiple input batches. - - Args: - runner: The MultimodalRunner instance - batch_inputs: List of input lists, each containing multimodal inputs - config: Generation configuration (shared for all batches) - show_progress: Whether to show a progress bar - - Returns: - List of generated text strings - - Example: - >>> batch_inputs = [ - ... [make_text_input("Question 1")], - ... [make_text_input("Question 2")], - ... 
] - >>> results = batch_generate(runner, batch_inputs) - """ - results = [] - - if show_progress: - try: - from tqdm import tqdm - batch_inputs = tqdm(batch_inputs, desc="Generating") - except ImportError: - pass - - for inputs in batch_inputs: - result = runner.generate_text(inputs, config) - results.append(result) - - return results + return config def estimate_tokens(text: str, chars_per_token: float = 4.0) -> int: """ Estimate the number of tokens in a text string. - + This is a rough approximation and actual token count may vary depending on the tokenizer used. - + Args: text: Input text string chars_per_token: Average characters per token (default: 4.0) - + Returns: Estimated number of tokens """ @@ -281,10 +250,10 @@ def estimate_tokens(text: str, chars_per_token: float = 4.0) -> int: def format_stats(stats: Any) -> str: """ Format generation statistics for display. - + Args: stats: Stats object from the runner - + Returns: Formatted string with statistics """ @@ -299,4 +268,4 @@ def format_stats(stats: Any) -> str: f" Generated tokens: {stats.num_generated_tokens}", f" Tokens per second: {stats.get_tokens_per_second():.2f}", ] - return "\n".join(lines) \ No newline at end of file + return "\n".join(lines) diff --git a/setup.py b/setup.py index a35e0c96a9c..83e67f345c7 100644 --- a/setup.py +++ b/setup.py @@ -814,6 +814,7 @@ def run(self): # noqa C901 if cmake_cache.is_enabled("EXECUTORCH_BUILD_PYBIND"): cmake_build_args += ["--target", "portable_lib"] cmake_build_args += ["--target", "selective_build"] + cmake_build_args += ["--target", "_llm_runner"] if cmake_cache.is_enabled("EXECUTORCH_BUILD_EXTENSION_MODULE"): cmake_build_args += ["--target", "extension_module"] diff --git a/tools/cmake/preset/pybind.cmake b/tools/cmake/preset/pybind.cmake index c7ad94cd8be..95f54ed8de2 100644 --- a/tools/cmake/preset/pybind.cmake +++ b/tools/cmake/preset/pybind.cmake @@ -13,6 +13,8 @@ set_overridable_option(EXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT ON) set_overridable_option(EXECUTORCH_ENABLE_LOGGING ON) set_overridable_option(EXECUTORCH_LOG_LEVEL Info) set_overridable_option(EXECUTORCH_BUILD_XNNPACK ON) +set_overridable_option(EXECUTORCH_BUILD_EXTENSION_LLM ON) +set_overridable_option(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER ON) set_overridable_option(EXECUTORCH_BUILD_EXTENSION_TENSOR ON) set_overridable_option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL ON) set_overridable_option(EXECUTORCH_BUILD_KERNELS_LLM ON) From 568f50c267c0de8100a1fb2cbd0f4ef90c40fd41 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Fri, 12 Sep 2025 18:38:39 -0700 Subject: [PATCH 03/40] Add readme --- extension/llm/runner/README.md | 117 ++++++++ .../llm/runner/README_PYTHON_BINDINGS.md | 249 ------------------ 2 files changed, 117 insertions(+), 249 deletions(-) delete mode 100644 extension/llm/runner/README_PYTHON_BINDINGS.md diff --git a/extension/llm/runner/README.md b/extension/llm/runner/README.md index ab8ec8964dd..125944663ed 100644 --- a/extension/llm/runner/README.md +++ b/extension/llm/runner/README.md @@ -164,6 +164,123 @@ int main() { } ``` +## Python API + +The LLM Runner framework also provides Python bindings for easy integration with Python applications. The Python API mirrors the C++ interface while providing Pythonic convenience features. + +### Installation + +Build the Python bindings as part of the ExecuTorch build: + +```bash +# Build with Python bindings enabled +cmake -DPYTHON_EXECUTABLE=$(which python3) \ + -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \ + -DEXECUTORCH_BUILD_PYTHON_BINDINGS=ON \ + .. 
+make -j8 _llm_runner
+```
+
+### Quick Start - Python
+
+```python
+import _llm_runner
+import numpy as np
+
+# Create a multimodal runner
+runner = _llm_runner.MultimodalRunner(
+    model_path="/path/to/model.pte",
+    tokenizer_path="/path/to/tokenizer.bin"
+)
+
+# Create multimodal inputs
+inputs = []
+
+# Add text input
+inputs.append(_llm_runner.make_text_input("Describe this image:"))
+
+# Add image input from numpy array
+image_array = np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8)
+inputs.append(_llm_runner.make_image_input(image_array))
+
+# Configure generation
+config = _llm_runner.GenerationConfig()
+config.max_new_tokens = 100
+config.temperature = 0.7
+config.echo = False
+
+# Generate text with callback
+def token_callback(token: str):
+    print(token, end='', flush=True)
+
+def stats_callback(stats):
+    print(f"\nGenerated {stats.num_generated_tokens} tokens")
+    print(f"Tokens/sec: {stats.num_generated_tokens * 1000 / (stats.inference_end_ms - stats.inference_start_ms):.1f}")
+
+# Run generation
+runner.generate(inputs, config, token_callback, stats_callback)
+
+# Or get complete text result
+result = runner.generate_text(inputs, config)
+print(f"Generated text: {result}")
+```
+
+### Python API Features
+
+- **Type hints**: Full type annotations with `.pyi` stub files for IDE support
+- **NumPy integration**: Direct support for numpy arrays as image inputs
+- **Callbacks**: Optional token and statistics callbacks for streaming generation
+- **Exception handling**: Pythonic error handling with RuntimeError for failures
+- **Memory management**: Automatic resource cleanup with Python garbage collection
+
+### Python API Classes
+
+#### GenerationConfig
+```python
+config = _llm_runner.GenerationConfig()
+config.max_new_tokens = 50  # Maximum tokens to generate
+config.temperature = 0.8    # Sampling temperature
+config.echo = True          # Echo input prompt
+config.seq_len = 512        # Maximum sequence length
+config.num_bos = 1          # Number of BOS tokens
+config.num_eos = 1          # Number of EOS tokens
+```
+
+#### MultimodalInput
+```python
+# Text input
+text_input = _llm_runner.MultimodalInput("Hello, world!")
+# Or using helper
+text_input = _llm_runner.make_text_input("Hello, world!")
+
+# Image input
+image = _llm_runner.Image()
+image.data = [255] * (224 * 224 * 3)  # RGB data
+image.width = 224
+image.height = 224
+image.channels = 3
+image_input = _llm_runner.MultimodalInput(image)
+
+# Or from numpy array
+img_array = np.ones((224, 224, 3), dtype=np.uint8) * 128
+image_input = _llm_runner.make_image_input(img_array)
+```
+
+#### Stats
+```python
+# Access timing and performance statistics
+stats = _llm_runner.Stats()
+print(f"Model load time: {stats.model_load_end_ms - stats.model_load_start_ms}ms")
+print(f"Inference time: {stats.inference_end_ms - stats.inference_start_ms}ms")
+print(f"Tokens generated: {stats.num_generated_tokens}")
+print(f"Prompt tokens: {stats.num_prompt_tokens}")
+
+# JSON export
+json_str = stats.to_json_string()
+```
+
+For the full API surface and docstrings, see the type stubs in [`_llm_runner.pyi`](_llm_runner.pyi).
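+
+#### Resolving the token budget
+
+`GenerationConfig.resolve_max_new_tokens` combines `seq_len`, `max_new_tokens`,
+and the model's context length into the effective generation budget. A minimal
+sketch of how the limits interact; the numbers mirror the cases exercised in
+`test/test_pybindings.py`:
+
+```python
+config = _llm_runner.GenerationConfig()
+
+# Both limits unset (-1): use whatever the context window has left.
+config.seq_len = -1
+config.max_new_tokens = -1
+print(config.resolve_max_new_tokens(1024, 100))  # 924 == 1024 - 100
+
+# Both limits set: the tightest constraint wins.
+config.seq_len = 512
+config.max_new_tokens = 200
+print(config.resolve_max_new_tokens(1024, 100))  # 200 == min(min(512, 1024) - 100, 200)
+```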
+ ## Core Components ### Component Architecture diff --git a/extension/llm/runner/README_PYTHON_BINDINGS.md b/extension/llm/runner/README_PYTHON_BINDINGS.md deleted file mode 100644 index 105b05f4f1e..00000000000 --- a/extension/llm/runner/README_PYTHON_BINDINGS.md +++ /dev/null @@ -1,249 +0,0 @@ -# Python Bindings for MultimodalRunner - -## Overview - -This project provides Python bindings for the ExecuTorch MultimodalRunner, enabling Python developers to easily use the multimodal LLM runner for processing mixed inputs (text, images, audio) and generating text outputs. - -## Architecture - -The MultimodalRunner is designed for Large Language Models that can process multimodal inputs and generate text outputs. It supports models like: -- LLaVA (vision-language models) -- CLIP-based models -- Speech-to-text models -- Other multimodal transformers - -### Key Components - -1. **MultimodalRunner** - Main runner class for multimodal inference -2. **MultimodalInput** - Handles different input modalities (text, image, audio) -3. **GenerationConfig** - Configuration for text generation parameters -4. **Stats** - Performance monitoring and statistics -5. **Tokenizer** - Text tokenization and decoding - -## Project Structure - -``` -extension/llm/runner/ -├── multimodal_runner_pybindings.cpp # Python bindings implementation (NEW) -├── __init__.py # Python package initialization (NEW) -├── multimodal_runner.py # Python wrapper classes (NEW) -├── utils.py # Utility functions (NEW) -├── CMakeLists.txt # Existing - update to include Python bindings -└── test/ - ├── test_multimodal_runner.py # Unit tests for Python bindings (NEW) - └── test_generation.py # Generation tests (NEW) - └── [existing test files] # Existing C++ tests remain here -``` - -Note: We'll reuse the root-level `setup.py` and update the existing `CMakeLists.txt` rather than creating new ones. - -## Action Items - -### 1. Core Implementation Tasks - -#### High Priority -- [x] ~~**Create Python bindings file** (`multimodal_runner_pybindings.cpp`)~~ - - [x] ~~Bind MultimodalRunner class~~ - - [x] ~~Bind MultimodalInput and helper functions~~ - - [x] ~~Bind GenerationConfig struct~~ - - [x] ~~Bind Stats class for performance monitoring~~ - - [x] ~~Implement error handling and exception translation~~ - -#### Medium Priority -- [x] ~~**Update existing CMakeLists.txt** in `extension/llm/runner/`~~ - - [x] ~~Add Python bindings target when EXECUTORCH_BUILD_PYBIND is enabled~~ - - [x] ~~Configure pybind11 integration~~ - - [x] ~~Link with extension_llm_runner library~~ - - [x] ~~Handle tokenizers dependency~~ - - [x] ~~Set up proper include paths~~ - -- [x] ~~**Update root-level setup.py**~~ - - [x] ~~Add multimodal_runner to the extensions list~~ - - [x] ~~Ensure proper build configuration~~ - - [x] ~~Handle platform-specific configurations~~ - -#### Low Priority -- [x] ~~**Create Python wrapper files** in `extension/llm/runner/`~~ - - [x] ~~`__init__.py` - Package initialization~~ - - [x] ~~`multimodal_runner.py` - High-level Python API~~ - - [x] ~~`utils.py` - Utility functions for input preprocessing~~ - -### 2. 
Build System Integration - -- [ ] **Integrate with main CMake build** - - [ ] Add Python bindings compilation when EXECUTORCH_BUILD_PYBIND is enabled - - [ ] Update extension/llm/runner/CMakeLists.txt to build multimodal_runner_pybindings.cpp - - [ ] Ensure proper dependency resolution - -- [ ] **Handle dependencies** - - [ ] Link against existing tokenizers Python bindings - - [ ] Ensure Module and other dependencies are available - - [ ] Handle pybind11 version requirements - -### 3. Input/Output Handling - -- [ ] **Implement MultimodalInput Python bindings** - - [ ] Support for text inputs - - [ ] Support for image inputs (numpy arrays, PIL Images) - - [ ] Support for audio inputs (if applicable) - - [ ] Mixed input ordering support - -- [ ] **Implement callbacks** - - [ ] Token generation callback - - [ ] Statistics callback - - [ ] Progress reporting - -### 4. Testing and Documentation - -- [ ] **Create comprehensive tests** - - [ ] Unit tests for bindings - - [ ] Integration tests with sample models - - [ ] Performance benchmarks - - [ ] Memory leak tests - -- [ ] **Write documentation** - - [ ] API documentation with examples - - [ ] Installation guide - - [ ] Usage tutorials - - [ ] Model compatibility guide - -### 5. Example Scripts - -- [ ] **Create example scripts** - - [ ] Basic text generation - - [ ] Image + text (vision-language) example - - [ ] Batch processing example - - [ ] Streaming generation example - -## Installation Instructions - -### Prerequisites - -- Python >= 3.8 -- CMake >= 3.18 -- C++17 compatible compiler -- PyTorch (for tensor operations) -- pybind11 >= 2.6.0 - -### Building from Source - -```bash -# Clone the repository -git clone https://github.com/pytorch/executorch.git -cd executorch - -# Install dependencies -pip install -r requirements.txt - -# Build with Python bindings enabled -python setup.py install --cmake-args="-DEXECUTORCH_BUILD_PYBIND=ON" - -# Or for development -pip install -e . 
--config-settings editable_mode=compat -``` - -### Running Tests - -```bash -# Run the multimodal runner Python tests -python -m pytest extension/llm/runner/test/test_multimodal_runner.py -v -``` - -## Usage Example - -```python -from executorch.extension.llm.runner import MultimodalRunner, GenerationConfig -from executorch.extension.llm.runner.utils import make_text_input, make_image_input -import numpy as np - -# Initialize the runner -runner = MultimodalRunner( - model_path="path/to/model.pte", - tokenizer_path="path/to/tokenizer.bin" -) - -# Create multimodal inputs -image_array = np.random.rand(224, 224, 3) # Example image -inputs = [ - make_text_input("Describe this image:"), - make_image_input(image_array) # numpy array or PIL Image -] - -# Configure generation -config = GenerationConfig( - max_new_tokens=100, - temperature=0.7, - top_p=0.9 -) - -# Generate text with callbacks -def on_token(token): - print(token, end='', flush=True) - -def on_stats(stats): - print(f"\nTokens/sec: {stats.tokens_per_second:.2f}") - -runner.generate(inputs, config, token_callback=on_token, stats_callback=on_stats) - -# Or simpler usage without callbacks -response = runner.generate_text(inputs, config) -print(response) -``` - -## Technical Considerations - -### Memory Management -- Python bindings should properly handle memory ownership -- Use shared_ptr/unique_ptr appropriately -- Implement proper cleanup in destructors - -### Threading and GIL -- Consider GIL release during long-running operations -- Ensure thread safety for callbacks -- Handle Python exceptions in C++ code - -### Performance -- Minimize data copying between Python and C++ -- Use move semantics where possible -- Consider zero-copy tensor operations - -## Dependencies - -### Required -- executorch core libraries -- extension_llm_runner -- tokenizers library -- pybind11 - -### Optional -- numpy (for array handling) -- PIL/Pillow (for image processing) -- torch (for tensor operations) - -## Contributing - -Please follow the ExecuTorch contribution guidelines. Key points: -- Code should be formatted with clang-format -- Python code should follow PEP 8 -- Add comprehensive tests for new features -- Update documentation as needed - -## License - -This project is licensed under the BSD-style license found in the LICENSE file in the root directory of the ExecuTorch repository. - -## Next Steps - -1. **Review and approve this plan** with the team -2. **Start with core bindings** implementation -3. **Test with existing models** (LLaVA, etc.) -4. **Gather feedback** from early users -5. **Iterate and improve** based on usage patterns - -## Questions for Discussion - -1. Should we support async generation? -2. What level of integration with PyTorch tensors is needed? -3. Should we provide pre-built wheels or source-only distribution? -4. How should we handle model loading and caching? -5. What additional utilities would be helpful for users? 
\ No newline at end of file From 72fc953a0af27d3dadb464205881315b2bb6f985 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Fri, 12 Sep 2025 18:44:57 -0700 Subject: [PATCH 04/40] move test to test/ --- extension/llm/runner/{ => test}/test_pybindings.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename extension/llm/runner/{ => test}/test_pybindings.py (100%) diff --git a/extension/llm/runner/test_pybindings.py b/extension/llm/runner/test/test_pybindings.py similarity index 100% rename from extension/llm/runner/test_pybindings.py rename to extension/llm/runner/test/test_pybindings.py From e4ffbbeff6a946917f3ed77416bf288a833f08cf Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Mon, 15 Sep 2025 10:39:23 -0700 Subject: [PATCH 05/40] Fix tests --- extension/llm/runner/_llm_runner.pyi | 158 ++++++----- extension/llm/runner/test/test_pybindings.py | 261 ++++++++----------- 2 files changed, 178 insertions(+), 241 deletions(-) diff --git a/extension/llm/runner/_llm_runner.pyi b/extension/llm/runner/_llm_runner.pyi index 97d84b08a0e..e245301747b 100644 --- a/extension/llm/runner/_llm_runner.pyi +++ b/extension/llm/runner/_llm_runner.pyi @@ -4,291 +4,283 @@ Type stubs for _llm_runner module. This file provides type annotations for the ExecuTorch LLM Runner Python bindings. """ -from typing import List, Optional, Callable, Union +from typing import Callable, List, Optional, Union + import numpy as np from numpy.typing import NDArray class GenerationConfig: """Configuration for text generation.""" - + echo: bool """Whether to echo the input prompt in the output.""" - + max_new_tokens: int """Maximum number of new tokens to generate (-1 for auto).""" - + warming: bool """Whether this is a warmup run (affects perf benchmarking).""" - + seq_len: int """Maximum number of total tokens (-1 for auto).""" - + temperature: float """Temperature for sampling (higher = more random).""" - + num_bos: int """Number of BOS tokens to add to the prompt.""" - + num_eos: int """Number of EOS tokens to add to the prompt.""" - + def __init__(self) -> None: """Initialize GenerationConfig with default values.""" ... - - def resolve_max_new_tokens(self, max_context_len: int, num_prompt_tokens: int) -> int: + + def resolve_max_new_tokens( + self, max_context_len: int, num_prompt_tokens: int + ) -> int: """ Resolve the maximum number of new tokens to generate based on constraints. - + Args: max_context_len: The maximum context length supported by the model num_prompt_tokens: The number of tokens in the input prompt - + Returns: The resolved maximum number of new tokens to generate """ ... - - def __repr__(self) -> str: ... + def __repr__(self) -> str: ... 
class Stats: """Statistics for LLM generation performance.""" - + SCALING_FACTOR_UNITS_PER_SECOND: int """Scaling factor for timestamps (1000 for milliseconds).""" - + model_load_start_ms: int """Start time of model loading in milliseconds.""" - + model_load_end_ms: int """End time of model loading in milliseconds.""" - + inference_start_ms: int """Start time of inference in milliseconds.""" - + token_encode_end_ms: int """End time of tokenizer encoding in milliseconds.""" - + model_execution_start_ms: int """Start time of model execution in milliseconds.""" - + model_execution_end_ms: int """End time of model execution in milliseconds.""" - + prompt_eval_end_ms: int """End time of prompt evaluation in milliseconds.""" - + first_token_ms: int """Timestamp when the first generated token is emitted.""" - + inference_end_ms: int """End time of inference/generation in milliseconds.""" - + aggregate_sampling_time_ms: int """Total time spent in sampling across all tokens.""" - + num_prompt_tokens: int """Number of tokens in the input prompt.""" - + num_generated_tokens: int """Number of tokens generated.""" - + def on_sampling_begin(self) -> None: """Mark the beginning of a sampling operation.""" ... - + def on_sampling_end(self) -> None: """Mark the end of a sampling operation.""" ... - + def reset(self, all_stats: bool = False) -> None: """ Reset statistics. - + Args: all_stats: If True, reset all stats including model load times. If False, preserve model load times. """ ... - + def to_json_string(self) -> str: """Convert stats to JSON string representation.""" ... - - def __repr__(self) -> str: ... + def __repr__(self) -> str: ... class Image: """Container for image data.""" - + data: List[int] """Raw image data as a list of uint8 values.""" - + width: int """Image width in pixels.""" - + height: int """Image height in pixels.""" - + channels: int """Number of color channels (3 for RGB, 4 for RGBA).""" - + def __init__(self) -> None: """Initialize an empty Image.""" ... - - def __repr__(self) -> str: ... + def __repr__(self) -> str: ... class MultimodalInput: """Container for multimodal input data (text, image, etc.).""" - + def __init__(self, text: str) -> None: """ Create a MultimodalInput with text. - + Args: text: The input text string """ ... - + def __init__(self, image: Image) -> None: """ Create a MultimodalInput with an image. - + Args: image: The input image """ ... - + def is_text(self) -> bool: """Check if this input contains text.""" ... - + def is_image(self) -> bool: """Check if this input contains an image.""" ... - + def get_text(self) -> Optional[str]: """ Get the text content if this is a text input. - + Returns: The text string if this is a text input, None otherwise """ ... - - def __repr__(self) -> str: ... + def __repr__(self) -> str: ... class MultimodalRunner: """Runner for multimodal language models.""" - + def __init__( - self, - model_path: str, - tokenizer_path: str, - data_path: Optional[str] = None + self, model_path: str, tokenizer_path: str, data_path: Optional[str] = None ) -> None: """ Initialize a MultimodalRunner. - + Args: model_path: Path to the model file (.pte) tokenizer_path: Path to the tokenizer file data_path: Optional path to additional data file - + Raises: RuntimeError: If initialization fails """ ... 
- + def generate( self, inputs: List[MultimodalInput], config: GenerationConfig, token_callback: Optional[Callable[[str], None]] = None, - stats_callback: Optional[Callable[[Stats], None]] = None + stats_callback: Optional[Callable[[Stats], None]] = None, ) -> None: """ Generate text from multimodal inputs. - + Args: inputs: List of multimodal inputs (text, images, etc.) config: Generation configuration token_callback: Optional callback called for each generated token stats_callback: Optional callback called with generation statistics - + Raises: RuntimeError: If generation fails """ ... - + def generate_text( - self, - inputs: List[MultimodalInput], - config: GenerationConfig + self, inputs: List[MultimodalInput], config: GenerationConfig ) -> str: """ Generate text and return the complete result as a string. - + Args: inputs: List of multimodal inputs (text, images, etc.) config: Generation configuration - + Returns: The generated text as a string - + Raises: RuntimeError: If generation fails """ ... - + def stop(self) -> None: """Stop the current generation process.""" ... - + def reset(self) -> None: """Reset the runner state and KV cache.""" ... - + def get_vocab_size(self) -> int: """ Get the vocabulary size of the model. - + Returns: The vocabulary size, or -1 if not available """ ... - - def __repr__(self) -> str: ... + def __repr__(self) -> str: ... def make_text_input(text: str) -> MultimodalInput: """ Create a text input for multimodal processing. - + Args: text: The input text string - + Returns: A MultimodalInput containing the text """ ... - def make_image_input(image_array: NDArray[np.uint8]) -> MultimodalInput: """ Create an image input from a numpy array. - + Args: image_array: Numpy array with shape (H, W, C) where C is 3 (RGB) or 4 (RGBA) - + Returns: A MultimodalInput containing the image - + Raises: RuntimeError: If the array has invalid dimensions or number of channels """ - ... \ No newline at end of file + ... diff --git a/extension/llm/runner/test/test_pybindings.py b/extension/llm/runner/test/test_pybindings.py index f914a785e70..06c7392a227 100644 --- a/extension/llm/runner/test/test_pybindings.py +++ b/extension/llm/runner/test/test_pybindings.py @@ -12,28 +12,29 @@ python -m pytest test_pybindings.py -v """ -import unittest -import tempfile -import numpy as np import os -import sys -from unittest.mock import Mock, patch, MagicMock +import tempfile +import unittest -# Try to import the module -try: - import _llm_runner -except ImportError: - print("Warning: _llm_runner module not found. 
Make sure it's built and in PYTHONPATH.") - sys.exit(1) +import numpy as np +from executorch.extension.llm.runner import ( + GenerationConfig, + Image, + make_image_input, + make_text_input, + MultimodalInput, + MultimodalRunner, + Stats, +) class TestGenerationConfig(unittest.TestCase): """Test the GenerationConfig class.""" - + def test_default_values(self): """Test that GenerationConfig has correct default values.""" - config = _llm_runner.GenerationConfig() - + config = GenerationConfig() + # Check defaults based on irunner.h self.assertEqual(config.echo, True) self.assertEqual(config.max_new_tokens, -1) @@ -42,11 +43,11 @@ def test_default_values(self): self.assertAlmostEqual(config.temperature, 0.8, places=5) self.assertEqual(config.num_bos, 0) self.assertEqual(config.num_eos, 0) - + def test_set_values(self): """Test setting values on GenerationConfig.""" - config = _llm_runner.GenerationConfig() - + config = GenerationConfig() + config.echo = False config.max_new_tokens = 100 config.warming = True @@ -54,7 +55,7 @@ def test_set_values(self): config.temperature = 0.5 config.num_bos = 1 config.num_eos = 2 - + self.assertEqual(config.echo, False) self.assertEqual(config.max_new_tokens, 100) self.assertEqual(config.warming, True) @@ -62,48 +63,48 @@ def test_set_values(self): self.assertAlmostEqual(config.temperature, 0.5, places=5) self.assertEqual(config.num_bos, 1) self.assertEqual(config.num_eos, 2) - + def test_resolve_max_new_tokens(self): """Test the resolve_max_new_tokens method.""" - config = _llm_runner.GenerationConfig() - + config = GenerationConfig() + # Test case 1: Both seq_len and max_new_tokens are -1 config.seq_len = -1 config.max_new_tokens = -1 result = config.resolve_max_new_tokens(1024, 100) self.assertEqual(result, 924) # 1024 - 100 - + # Test case 2: Only max_new_tokens is specified config.seq_len = -1 config.max_new_tokens = 200 result = config.resolve_max_new_tokens(1024, 100) self.assertEqual(result, 200) # min(200, 1024-100) - + # Test case 3: Only seq_len is specified config.seq_len = 512 config.max_new_tokens = -1 result = config.resolve_max_new_tokens(1024, 100) self.assertEqual(result, 412) # min(512, 1024) - 100 - + # Test case 4: Both are specified config.seq_len = 512 config.max_new_tokens = 200 result = config.resolve_max_new_tokens(1024, 100) self.assertEqual(result, 200) # min(min(512, 1024) - 100, 200) - + # Test case 5: Result would be negative config.seq_len = 50 config.max_new_tokens = -1 result = config.resolve_max_new_tokens(1024, 100) self.assertEqual(result, 0) # max(0, 50 - 100) - + def test_repr(self): """Test the string representation.""" - config = _llm_runner.GenerationConfig() + config = GenerationConfig() config.max_new_tokens = 100 config.seq_len = 512 config.temperature = 0.7 - + repr_str = repr(config) self.assertIn("GenerationConfig", repr_str) self.assertIn("max_new_tokens=100", repr_str) @@ -115,48 +116,48 @@ def test_repr(self): class TestStats(unittest.TestCase): """Test the Stats class.""" - + def test_attributes(self): """Test that Stats has all expected attributes.""" - stats = _llm_runner.Stats() - + stats = Stats() + # Check all timing attributes exist - self.assertTrue(hasattr(stats, 'SCALING_FACTOR_UNITS_PER_SECOND')) - self.assertTrue(hasattr(stats, 'model_load_start_ms')) - self.assertTrue(hasattr(stats, 'model_load_end_ms')) - self.assertTrue(hasattr(stats, 'inference_start_ms')) - self.assertTrue(hasattr(stats, 'token_encode_end_ms')) - self.assertTrue(hasattr(stats, 'model_execution_start_ms')) - 
self.assertTrue(hasattr(stats, 'model_execution_end_ms')) - self.assertTrue(hasattr(stats, 'prompt_eval_end_ms')) - self.assertTrue(hasattr(stats, 'first_token_ms')) - self.assertTrue(hasattr(stats, 'inference_end_ms')) - self.assertTrue(hasattr(stats, 'aggregate_sampling_time_ms')) - self.assertTrue(hasattr(stats, 'num_prompt_tokens')) - self.assertTrue(hasattr(stats, 'num_generated_tokens')) - + self.assertTrue(hasattr(stats, "SCALING_FACTOR_UNITS_PER_SECOND")) + self.assertTrue(hasattr(stats, "model_load_start_ms")) + self.assertTrue(hasattr(stats, "model_load_end_ms")) + self.assertTrue(hasattr(stats, "inference_start_ms")) + self.assertTrue(hasattr(stats, "token_encode_end_ms")) + self.assertTrue(hasattr(stats, "model_execution_start_ms")) + self.assertTrue(hasattr(stats, "model_execution_end_ms")) + self.assertTrue(hasattr(stats, "prompt_eval_end_ms")) + self.assertTrue(hasattr(stats, "first_token_ms")) + self.assertTrue(hasattr(stats, "inference_end_ms")) + self.assertTrue(hasattr(stats, "aggregate_sampling_time_ms")) + self.assertTrue(hasattr(stats, "num_prompt_tokens")) + self.assertTrue(hasattr(stats, "num_generated_tokens")) + def test_scaling_factor(self): """Test the scaling factor constant.""" - stats = _llm_runner.Stats() + stats = Stats() self.assertEqual(stats.SCALING_FACTOR_UNITS_PER_SECOND, 1000) - + def test_methods(self): """Test Stats methods.""" - stats = _llm_runner.Stats() - + stats = Stats() + # Test on_sampling_begin and on_sampling_end stats.on_sampling_begin() stats.on_sampling_end() - + # Test reset without all_stats stats.model_load_start_ms = 100 stats.model_load_end_ms = 200 stats.inference_start_ms = 300 stats.num_prompt_tokens = 10 stats.num_generated_tokens = 20 - + stats.reset(False) - + # Model load times should be preserved self.assertEqual(stats.model_load_start_ms, 100) self.assertEqual(stats.model_load_end_ms, 200) @@ -164,36 +165,36 @@ def test_methods(self): self.assertEqual(stats.inference_start_ms, 0) self.assertEqual(stats.num_prompt_tokens, 0) self.assertEqual(stats.num_generated_tokens, 0) - + # Test reset with all_stats stats.reset(True) self.assertEqual(stats.model_load_start_ms, 0) self.assertEqual(stats.model_load_end_ms, 0) - + def test_to_json_string(self): """Test JSON string conversion.""" - stats = _llm_runner.Stats() + stats = Stats() stats.num_prompt_tokens = 10 stats.num_generated_tokens = 20 stats.model_load_start_ms = 100 stats.model_load_end_ms = 200 stats.inference_start_ms = 300 stats.inference_end_ms = 1300 - + json_str = stats.to_json_string() self.assertIn('"prompt_tokens":10', json_str) self.assertIn('"generated_tokens":20', json_str) self.assertIn('"model_load_start_ms":100', json_str) self.assertIn('"model_load_end_ms":200', json_str) - + def test_repr(self): """Test string representation.""" - stats = _llm_runner.Stats() + stats = Stats() stats.num_prompt_tokens = 10 stats.num_generated_tokens = 20 stats.inference_start_ms = 1000 stats.inference_end_ms = 2000 - + repr_str = repr(stats) self.assertIn("Stats", repr_str) self.assertIn("num_prompt_tokens=10", repr_str) @@ -203,29 +204,29 @@ def test_repr(self): class TestImage(unittest.TestCase): """Test the Image class.""" - + def test_creation(self): """Test creating an Image object.""" - image = _llm_runner.Image() - + image = Image() + # Set properties image.data = [1, 2, 3, 4] image.width = 2 image.height = 2 image.channels = 1 - + self.assertEqual(image.data, [1, 2, 3, 4]) self.assertEqual(image.width, 2) self.assertEqual(image.height, 2) 
self.assertEqual(image.channels, 1) - + def test_repr(self): """Test string representation.""" - image = _llm_runner.Image() + image = Image() image.width = 640 image.height = 480 image.channels = 3 - + repr_str = repr(image) self.assertIn("Image", repr_str) self.assertIn("height=480", repr_str) @@ -235,179 +236,123 @@ def test_repr(self): class TestMultimodalInput(unittest.TestCase): """Test the MultimodalInput class.""" - + def test_text_input(self): """Test creating a text MultimodalInput.""" # Test direct constructor - text_input = _llm_runner.MultimodalInput("Hello, world!") + text_input = MultimodalInput("Hello, world!") self.assertTrue(text_input.is_text()) self.assertFalse(text_input.is_image()) self.assertEqual(text_input.get_text(), "Hello, world!") - + # Test helper function - text_input2 = _llm_runner.make_text_input("Test text") + text_input2 = make_text_input("Test text") self.assertTrue(text_input2.is_text()) self.assertEqual(text_input2.get_text(), "Test text") - + def test_image_input(self): """Test creating an image MultimodalInput.""" # Create an image - image = _llm_runner.Image() + image = Image() image.data = [255] * (100 * 100 * 3) image.width = 100 image.height = 100 image.channels = 3 - + # Test direct constructor - image_input = _llm_runner.MultimodalInput(image) + image_input = MultimodalInput(image) self.assertTrue(image_input.is_image()) self.assertFalse(image_input.is_text()) - + # Test helper function with numpy array img_array = np.ones((50, 60, 3), dtype=np.uint8) * 128 - image_input2 = _llm_runner.make_image_input(img_array) + image_input2 = make_image_input(img_array) self.assertTrue(image_input2.is_image()) self.assertFalse(image_input2.is_text()) - + def test_invalid_image_array(self): """Test error handling for invalid image arrays.""" # Wrong dimensions with self.assertRaises(RuntimeError) as cm: - _llm_runner.make_image_input(np.ones((100,), dtype=np.uint8)) + make_image_input(np.ones((100,), dtype=np.uint8)) self.assertIn("3-dimensional", str(cm.exception)) - + # Wrong number of channels with self.assertRaises(RuntimeError) as cm: - _llm_runner.make_image_input(np.ones((100, 100, 2), dtype=np.uint8)) + make_image_input(np.ones((100, 100, 2), dtype=np.uint8)) self.assertIn("3 (RGB) or 4 (RGBA)", str(cm.exception)) - + def test_repr(self): """Test string representation.""" # Text input - text_input = _llm_runner.MultimodalInput("This is a test") + text_input = MultimodalInput("This is a test") repr_str = repr(text_input) self.assertIn("MultimodalInput", repr_str) self.assertIn("type=text", repr_str) self.assertIn("This is a test", repr_str) - + # Long text input (should be truncated) long_text = "a" * 100 - text_input2 = _llm_runner.MultimodalInput(long_text) + text_input2 = MultimodalInput(long_text) repr_str2 = repr(text_input2) self.assertIn("...", repr_str2) - + # Image input - image = _llm_runner.Image() - image_input = _llm_runner.MultimodalInput(image) + image = Image() + image_input = MultimodalInput(image) repr_str3 = repr(image_input) self.assertIn("type=image", repr_str3) class TestMultimodalRunner(unittest.TestCase): """Test the MultimodalRunner class.""" - + def setUp(self): """Set up test fixtures.""" # Create temporary files for testing self.temp_dir = tempfile.mkdtemp() self.model_path = os.path.join(self.temp_dir, "model.pte") self.tokenizer_path = os.path.join(self.temp_dir, "tokenizer.bin") - + # Create dummy files (these won't actually work, but we can test initialization failure) - with open(self.model_path, 'wb') as f: + with 
open(self.model_path, "wb") as f: f.write(b"dummy model") - with open(self.tokenizer_path, 'wb') as f: + with open(self.tokenizer_path, "wb") as f: f.write(b"dummy tokenizer") - + def tearDown(self): """Clean up test fixtures.""" import shutil + shutil.rmtree(self.temp_dir, ignore_errors=True) - + def test_initialization_failure(self): """Test that initialization fails gracefully with invalid files.""" with self.assertRaises(RuntimeError) as cm: - runner = _llm_runner.MultimodalRunner( - self.model_path, - self.tokenizer_path - ) + runner = MultimodalRunner(self.model_path, self.tokenizer_path) # Should fail because the tokenizer file is not valid self.assertIn("Failed to", str(cm.exception)) class TestHelperFunctions(unittest.TestCase): """Test helper functions.""" - + def test_make_text_input(self): """Test make_text_input helper.""" - text_input = _llm_runner.make_text_input("Hello") + text_input = make_text_input("Hello") self.assertTrue(text_input.is_text()) self.assertEqual(text_input.get_text(), "Hello") - + def test_make_image_input(self): """Test make_image_input helper.""" # Create a test image array (RGB) img_array = np.zeros((100, 150, 3), dtype=np.uint8) img_array[:, :, 0] = 255 # Red channel - - image_input = _llm_runner.make_image_input(img_array) + + image_input = make_image_input(img_array) self.assertTrue(image_input.is_image()) - + # Test with RGBA img_array_rgba = np.ones((50, 50, 4), dtype=np.uint8) * 128 - image_input_rgba = _llm_runner.make_image_input(img_array_rgba) + image_input_rgba = make_image_input(img_array_rgba) self.assertTrue(image_input_rgba.is_image()) - - -class TestIntegration(unittest.TestCase): - """Integration tests for the module.""" - - def test_module_attributes(self): - """Test that the module has expected attributes.""" - # Classes - self.assertTrue(hasattr(_llm_runner, 'GenerationConfig')) - self.assertTrue(hasattr(_llm_runner, 'Stats')) - self.assertTrue(hasattr(_llm_runner, 'Image')) - self.assertTrue(hasattr(_llm_runner, 'MultimodalInput')) - self.assertTrue(hasattr(_llm_runner, 'MultimodalRunner')) - - # Helper functions - self.assertTrue(hasattr(_llm_runner, 'make_text_input')) - self.assertTrue(hasattr(_llm_runner, 'make_image_input')) - - def test_workflow_simulation(self): - """Test a simulated workflow (without actual model).""" - # Create configuration - config = _llm_runner.GenerationConfig() - config.max_new_tokens = 50 - config.temperature = 0.7 - config.echo = False - - # Create inputs - inputs = [] - - # Add text input - text = "Describe this image in detail:" - inputs.append(_llm_runner.make_text_input(text)) - - # Add image input - image_array = np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8) - inputs.append(_llm_runner.make_image_input(image_array)) - - # Verify inputs - self.assertEqual(len(inputs), 2) - self.assertTrue(inputs[0].is_text()) - self.assertTrue(inputs[1].is_image()) - self.assertEqual(inputs[0].get_text(), text) - - # Test Stats - stats = _llm_runner.Stats() - stats.num_prompt_tokens = 15 - stats.num_generated_tokens = 45 - stats.inference_start_ms = 1000 - stats.inference_end_ms = 3000 - - json_output = stats.to_json_string() - self.assertIsInstance(json_output, str) - self.assertIn("prompt_tokens", json_output) - self.assertIn("generated_tokens", json_output) \ No newline at end of file From 1e76deda65172b2934b1019ce9aec87a3681edba Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Mon, 15 Sep 2025 11:41:31 -0700 Subject: [PATCH 06/40] Fix --- extension/llm/runner/__init__.py | 6 +- 
extension/llm/runner/test/test_pybindings.py | 91 +------------------- extension/llm/runner/utils.py | 2 +- 3 files changed, 6 insertions(+), 93 deletions(-) diff --git a/extension/llm/runner/__init__.py b/extension/llm/runner/__init__.py index 466c2101ab8..80d2768dd11 100644 --- a/extension/llm/runner/__init__.py +++ b/extension/llm/runner/__init__.py @@ -25,7 +25,7 @@ try: # Import shared components from the compiled C++ extension - from ._llm_runner import ( + from executorch.extension.llm.runner._llm_runner import ( # noqa: F401 GenerationConfig, Image, make_image_input, @@ -105,7 +105,9 @@ def create_text_input(self, text: str): """ return make_text_input(text) - def create_image_input(self, image: Union[str, Path, np.ndarray, "PILImage.Image"]): + def create_image_input( # noqa: C901 + self, image: Union[str, Path, np.ndarray, "PILImage.Image"] + ): """ Create an image input for multimodal processing. diff --git a/extension/llm/runner/test/test_pybindings.py b/extension/llm/runner/test/test_pybindings.py index 06c7392a227..3abb43b0042 100644 --- a/extension/llm/runner/test/test_pybindings.py +++ b/extension/llm/runner/test/test_pybindings.py @@ -24,7 +24,6 @@ make_text_input, MultimodalInput, MultimodalRunner, - Stats, ) @@ -114,94 +113,6 @@ def test_repr(self): self.assertIn("warming=False", repr_str) -class TestStats(unittest.TestCase): - """Test the Stats class.""" - - def test_attributes(self): - """Test that Stats has all expected attributes.""" - stats = Stats() - - # Check all timing attributes exist - self.assertTrue(hasattr(stats, "SCALING_FACTOR_UNITS_PER_SECOND")) - self.assertTrue(hasattr(stats, "model_load_start_ms")) - self.assertTrue(hasattr(stats, "model_load_end_ms")) - self.assertTrue(hasattr(stats, "inference_start_ms")) - self.assertTrue(hasattr(stats, "token_encode_end_ms")) - self.assertTrue(hasattr(stats, "model_execution_start_ms")) - self.assertTrue(hasattr(stats, "model_execution_end_ms")) - self.assertTrue(hasattr(stats, "prompt_eval_end_ms")) - self.assertTrue(hasattr(stats, "first_token_ms")) - self.assertTrue(hasattr(stats, "inference_end_ms")) - self.assertTrue(hasattr(stats, "aggregate_sampling_time_ms")) - self.assertTrue(hasattr(stats, "num_prompt_tokens")) - self.assertTrue(hasattr(stats, "num_generated_tokens")) - - def test_scaling_factor(self): - """Test the scaling factor constant.""" - stats = Stats() - self.assertEqual(stats.SCALING_FACTOR_UNITS_PER_SECOND, 1000) - - def test_methods(self): - """Test Stats methods.""" - stats = Stats() - - # Test on_sampling_begin and on_sampling_end - stats.on_sampling_begin() - stats.on_sampling_end() - - # Test reset without all_stats - stats.model_load_start_ms = 100 - stats.model_load_end_ms = 200 - stats.inference_start_ms = 300 - stats.num_prompt_tokens = 10 - stats.num_generated_tokens = 20 - - stats.reset(False) - - # Model load times should be preserved - self.assertEqual(stats.model_load_start_ms, 100) - self.assertEqual(stats.model_load_end_ms, 200) - # Other stats should be reset - self.assertEqual(stats.inference_start_ms, 0) - self.assertEqual(stats.num_prompt_tokens, 0) - self.assertEqual(stats.num_generated_tokens, 0) - - # Test reset with all_stats - stats.reset(True) - self.assertEqual(stats.model_load_start_ms, 0) - self.assertEqual(stats.model_load_end_ms, 0) - - def test_to_json_string(self): - """Test JSON string conversion.""" - stats = Stats() - stats.num_prompt_tokens = 10 - stats.num_generated_tokens = 20 - stats.model_load_start_ms = 100 - stats.model_load_end_ms = 200 - 
stats.inference_start_ms = 300 - stats.inference_end_ms = 1300 - - json_str = stats.to_json_string() - self.assertIn('"prompt_tokens":10', json_str) - self.assertIn('"generated_tokens":20', json_str) - self.assertIn('"model_load_start_ms":100', json_str) - self.assertIn('"model_load_end_ms":200', json_str) - - def test_repr(self): - """Test string representation.""" - stats = Stats() - stats.num_prompt_tokens = 10 - stats.num_generated_tokens = 20 - stats.inference_start_ms = 1000 - stats.inference_end_ms = 2000 - - repr_str = repr(stats) - self.assertIn("Stats", repr_str) - self.assertIn("num_prompt_tokens=10", repr_str) - self.assertIn("num_generated_tokens=20", repr_str) - self.assertIn("tokens_per_second=20", repr_str) # 20 tokens / 1 second - - class TestImage(unittest.TestCase): """Test the Image class.""" @@ -329,7 +240,7 @@ def tearDown(self): def test_initialization_failure(self): """Test that initialization fails gracefully with invalid files.""" with self.assertRaises(RuntimeError) as cm: - runner = MultimodalRunner(self.model_path, self.tokenizer_path) + MultimodalRunner(self.model_path, self.tokenizer_path, None) # Should fail because the tokenizer file is not valid self.assertIn("Failed to", str(cm.exception)) diff --git a/extension/llm/runner/utils.py b/extension/llm/runner/utils.py index af6b19a94e4..a1669e33068 100644 --- a/extension/llm/runner/utils.py +++ b/extension/llm/runner/utils.py @@ -23,7 +23,7 @@ except ImportError: HAS_PIL = False -from ._llm_runner import GenerationConfig +from executorch.extension.llm.runner._llm_runner import GenerationConfig # noqa: F401 def load_image_from_file( From 6fc63d7554e3af74fd1bcdf5752453a43d1d93d3 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Mon, 15 Sep 2025 14:35:43 -0700 Subject: [PATCH 07/40] Rename test --- extension/llm/runner/__init__.py | 2 +- extension/llm/runner/pybindings.cpp | 25 +++++++++++++++++++ ...ybindings.py => test_runner_pybindings.py} | 0 3 files changed, 26 insertions(+), 1 deletion(-) rename extension/llm/runner/test/{test_pybindings.py => test_runner_pybindings.py} (100%) diff --git a/extension/llm/runner/__init__.py b/extension/llm/runner/__init__.py index 80d2768dd11..f2203ae988e 100644 --- a/extension/llm/runner/__init__.py +++ b/extension/llm/runner/__init__.py @@ -238,7 +238,7 @@ def generate_text( if hasattr(config, key): setattr(config, key, value) - return self._runner.generate_text(inputs, config) + return self._runner.generate_text(inputs, config) # type: ignore[attr-defined] def stop(self): """Stop the current generation process.""" diff --git a/extension/llm/runner/pybindings.cpp b/extension/llm/runner/pybindings.cpp index 77d1e95c88f..12329baeafa 100644 --- a/extension/llm/runner/pybindings.cpp +++ b/extension/llm/runner/pybindings.cpp @@ -100,6 +100,24 @@ class PyMultimodalRunner { } } + std::string generate_text( + const std::vector& inputs, + const GenerationConfig& config) { + if (!runner_) { + throw std::runtime_error("Runner not initialized"); + } + + std::string generated_text; + auto cpp_token_callback = [&generated_text](const std::string& token) { + generated_text += token; + }; + Error error = + runner_->generate(inputs, config, cpp_token_callback, nullptr); + THROW_IF_ERROR(error, "Generation failed"); + + return generated_text; + } + void stop() { if (runner_) { runner_->stop(); @@ -306,6 +324,13 @@ PYBIND11_MODULE(_llm_runner, m) { py::arg("stats_callback") = py::none(), "Generate text from multimodal inputs with optional callbacks") .def("stop", &PyMultimodalRunner::stop, 
"Stop the current generation") + .def( + "generate_text", + &PyMultimodalRunner::generate_text, + py::arg("inputs"), + py::arg("config"), + "Generate text from multimodal inputs and return the complete " + "result") .def( "reset", &PyMultimodalRunner::reset, diff --git a/extension/llm/runner/test/test_pybindings.py b/extension/llm/runner/test/test_runner_pybindings.py similarity index 100% rename from extension/llm/runner/test/test_pybindings.py rename to extension/llm/runner/test/test_runner_pybindings.py From 4c1c1d09ed703cfe94a2894b368ce37adefe6eba Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Tue, 16 Sep 2025 14:20:17 -0700 Subject: [PATCH 08/40] make_image_input take tensor --- CMakeLists.txt | 9 ++--- extension/llm/runner/CMakeLists.txt | 9 +++-- extension/llm/runner/pybindings.cpp | 55 ++++++++++++++++++++++------- 3 files changed, 53 insertions(+), 20 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e419a45a879..483a199fb56 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -650,10 +650,6 @@ if(EXECUTORCH_BUILD_EXTENSION_LLM) list(APPEND _executorch_extensions tokenizers) endif() -if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner) - list(APPEND _executorch_extensions extension_llm_runner) -endif() if(EXECUTORCH_BUILD_EXTENSION_LLM_APPLE) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/apple) @@ -904,6 +900,11 @@ if(EXECUTORCH_BUILD_EXTENSION_TRAINING) list(APPEND _executorch_extensions extension_training) endif() +if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner) + list(APPEND _executorch_extensions extension_llm_runner) +endif() + if(EXECUTORCH_BUILD_KERNELS_LLM) # TODO: move all custom kernels to ${CMAKE_CURRENT_SOURCE_DIR}/kernels/custom add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/custom_ops) diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt index fedb7a91162..c231276149d 100644 --- a/extension/llm/runner/CMakeLists.txt +++ b/extension/llm/runner/CMakeLists.txt @@ -87,10 +87,13 @@ if(EXECUTORCH_BUILD_PYBIND) _llm_runner SHARED ${CMAKE_CURRENT_SOURCE_DIR}/pybindings.cpp ) + find_package_torch() + find_library( + TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib" + ) # Link with the extension_llm_runner library and its dependencies target_link_libraries( - _llm_runner PRIVATE extension_llm_runner executorch_core extension_module - extension_tensor tokenizers::tokenizers + _llm_runner PRIVATE extension_llm_runner tokenizers::tokenizers portable_lib ) # Set properties for the Python extension @@ -102,7 +105,7 @@ if(EXECUTORCH_BUILD_PYBIND) ) # Add include directories - target_include_directories(_llm_runner PRIVATE ${_common_include_directories}) + target_include_directories(_llm_runner PRIVATE ${_common_include_directories} ${TORCH_INCLUDE_DIRS}) install(TARGETS _llm_runner LIBRARY DESTINATION executorch/extension/llm/runner diff --git a/extension/llm/runner/pybindings.cpp b/extension/llm/runner/pybindings.cpp index 12329baeafa..6a99ce8727c 100644 --- a/extension/llm/runner/pybindings.cpp +++ b/extension/llm/runner/pybindings.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -271,27 +272,55 @@ PYBIND11_MODULE(_llm_runner, m) { m.def( "make_image_input", - [](py::array_t image_array) -> MultimodalInput { - // Get image dimensions - py::buffer_info buf = image_array.request(); + [](torch::Tensor image_tensor) -> MultimodalInput { + if 
(image_tensor.dim() == 4) { + if (image_tensor.size(0) != 1) { + throw std::runtime_error( + "Batch size for 4D image tensor must be 1"); + } + image_tensor = image_tensor.squeeze(0); + } - if (buf.ndim != 3) { + + if (image_tensor.dim() != 3) { throw std::runtime_error( - "Image array must be 3-dimensional (H, W, C)"); + "Image tensor must be 3-dimensional (H, W, C) or 4-dimensional (1, H, W, C)"); } - size_t height = buf.shape[0]; - size_t width = buf.shape[1]; - size_t channels = buf.shape[2]; + int64_t height, width, channels; + // Check for memory format and permute to CHW if necessary + if (image_tensor.is_contiguous(at::MemoryFormat::ChannelsLast)) { + // Input is HWC, permute to CHW + height = image_tensor.size(0); + width = image_tensor.size(1); + channels = image_tensor.size(2); + image_tensor = image_tensor.permute({2, 0, 1}); + } else if (image_tensor.is_contiguous(at::MemoryFormat::Contiguous)) { + // Input is CHW + channels = image_tensor.size(0); + height = image_tensor.size(1); + width = image_tensor.size(2); + } else { + throw std::runtime_error( + "Image tensor must be contiguous in either channels last (H, W, C) or contiguous (C, H, W) format."); + } if (channels != 3 && channels != 4) { throw std::runtime_error( "Image must have 3 (RGB) or 4 (RGBA) channels"); } - // Create Image object from numpy array - uint8_t* data = static_cast(buf.ptr); - std::vector image_data(data, data + height * width * channels); + if (image_tensor.scalar_type() != torch::kUInt8) { + if (image_tensor.max().item() <= 1.0) { + image_tensor = (image_tensor * 255).to(torch::kUInt8); + } else { + image_tensor = image_tensor.to(torch::kUInt8); + } + } + + image_tensor = image_tensor.contiguous(); + uint8_t* data = image_tensor.data_ptr(); + std::vector image_data(data, data + image_tensor.numel()); Image image; image.data = std::move(image_data); @@ -300,8 +329,8 @@ PYBIND11_MODULE(_llm_runner, m) { image.channels = static_cast(channels); return MultimodalInput(std::move(image)); }, - "Create an image input from a numpy array (H, W, C)", - py::arg("image_array")); + "Create an image input from a torch tensor (H, W, C), (1, H, W, C), (C, H, W), or (1, C, H, W)", + py::arg("image_tensor")); // Bind PyMultimodalRunner py::class_(m, "MultimodalRunner") From a182c0bc21c12ce327f1eb1250cb83712fbcc70e Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Wed, 17 Sep 2025 15:28:33 -0700 Subject: [PATCH 09/40] More changes --- CMakeLists.txt | 1 - extension/llm/runner/CMakeLists.txt | 7 ++-- extension/llm/runner/pybindings.cpp | 55 +++++++++++++++++++++-------- 3 files changed, 45 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 483a199fb56..0ce99bfe339 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -650,7 +650,6 @@ if(EXECUTORCH_BUILD_EXTENSION_LLM) list(APPEND _executorch_extensions tokenizers) endif() - if(EXECUTORCH_BUILD_EXTENSION_LLM_APPLE) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/apple) endif() diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt index c231276149d..8d985957ecc 100644 --- a/extension/llm/runner/CMakeLists.txt +++ b/extension/llm/runner/CMakeLists.txt @@ -93,7 +93,8 @@ if(EXECUTORCH_BUILD_PYBIND) ) # Link with the extension_llm_runner library and its dependencies target_link_libraries( - _llm_runner PRIVATE extension_llm_runner tokenizers::tokenizers portable_lib + _llm_runner PRIVATE extension_llm_runner tokenizers::tokenizers + portable_lib ) # Set properties for the Python extension @@ -105,7 +106,9 
@@ if(EXECUTORCH_BUILD_PYBIND) ) # Add include directories - target_include_directories(_llm_runner PRIVATE ${_common_include_directories} ${TORCH_INCLUDE_DIRS}) + target_include_directories( + _llm_runner PRIVATE ${_common_include_directories} ${TORCH_INCLUDE_DIRS} + ) install(TARGETS _llm_runner LIBRARY DESTINATION executorch/extension/llm/runner diff --git a/extension/llm/runner/pybindings.cpp b/extension/llm/runner/pybindings.cpp index 6a99ce8727c..92984b2e08f 100644 --- a/extension/llm/runner/pybindings.cpp +++ b/extension/llm/runner/pybindings.cpp @@ -219,15 +219,42 @@ PYBIND11_MODULE(_llm_runner, m) { // Bind Image class py::class_(m, "Image") - .def(py::init<>()) - .def_readwrite("data", &Image::data) - .def_readwrite("width", &Image::width) - .def_readwrite("height", &Image::height) - .def_readwrite("channels", &Image::channels) + .def( + py::init&&, int32_t, int32_t, int32_t>(), + py::arg("data"), + py::arg("width"), + py::arg("height"), + py::arg("channels")) + .def( + py::init&&, int32_t, int32_t, int32_t>(), + py::arg("data"), + py::arg("width"), + py::arg("height"), + py::arg("channels")) + .def("is_uint8", &Image::is_uint8) + .def("is_float", &Image::is_float) + .def_property_readonly("width", &Image::width) + .def_property_readonly("height", &Image::height) + .def_property_readonly("channels", &Image::channels) + .def_property_readonly( + "uint8_data", + static_cast& (Image::*)() const&>( + &Image::get_uint8_data)) + .def_property_readonly( + "float_data", + static_cast& (Image::*)() const&>( + &Image::get_float_data)) .def("__repr__", [](const Image& img) { - return ""; + std::string dtype = "unknown"; + if (img.is_uint8()) { + dtype = "uint8"; + } else if (img.is_float()) { + dtype = "float32"; + } + return ""; }); // Bind MultimodalInput @@ -281,7 +308,6 @@ PYBIND11_MODULE(_llm_runner, m) { image_tensor = image_tensor.squeeze(0); } - if (image_tensor.dim() != 3) { throw std::runtime_error( "Image tensor must be 3-dimensional (H, W, C) or 4-dimensional (1, H, W, C)"); @@ -322,12 +348,11 @@ PYBIND11_MODULE(_llm_runner, m) { uint8_t* data = image_tensor.data_ptr(); std::vector image_data(data, data + image_tensor.numel()); - Image image; - image.data = std::move(image_data); - image.width = static_cast(width); - image.height = static_cast(height); - image.channels = static_cast(channels); - return MultimodalInput(std::move(image)); + return MultimodalInput(Image( + std::move(image_data), + static_cast(width), + static_cast(height), + static_cast(channels))); }, "Create an image input from a torch tensor (H, W, C), (1, H, W, C), (C, H, W), or (1, C, H, W)", py::arg("image_tensor")); From 7b7f360ec96046a4d7a5647ba5205b0f175bb63b Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Thu, 18 Sep 2025 00:14:03 -0700 Subject: [PATCH 10/40] More changes --- examples/models/llava/main.cpp | 3 +- extension/llm/runner/pybindings.cpp | 44 ++++++++++++++++++----------- 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/examples/models/llava/main.cpp b/examples/models/llava/main.cpp index 3946a629ade..635fd7888d2 100644 --- a/examples/models/llava/main.cpp +++ b/examples/models/llava/main.cpp @@ -131,8 +131,7 @@ int32_t main(int32_t argc, char** argv) { #endif // Load tokenizer std::unique_ptr<::tokenizers::Tokenizer> tokenizer = - std::make_unique(); - tokenizer->load(tokenizer_path); + ::executorch::extension::llm::load_tokenizer(tokenizer_path); if (tokenizer == nullptr) { ET_LOG(Error, "Failed to load tokenizer from: %s", tokenizer_path); return 1; diff --git 
a/extension/llm/runner/pybindings.cpp b/extension/llm/runner/pybindings.cpp index 92984b2e08f..fe5f26f45fd 100644 --- a/extension/llm/runner/pybindings.cpp +++ b/extension/llm/runner/pybindings.cpp @@ -277,6 +277,14 @@ PYBIND11_MODULE(_llm_runner, m) { } return py::none(); }) + .def( + "get_image", + [](const MultimodalInput& input) -> py::object { + if (input.is_image()) { + return py::cast(input.get_image()); + } + return py::none(); + }) .def("__repr__", [](const MultimodalInput& input) -> std::string { if (input.is_text()) { return "() <= 1.0) { - image_tensor = (image_tensor * 255).to(torch::kUInt8); - } else { - image_tensor = image_tensor.to(torch::kUInt8); - } - } - image_tensor = image_tensor.contiguous(); - uint8_t* data = image_tensor.data_ptr(); - std::vector image_data(data, data + image_tensor.numel()); - - return MultimodalInput(Image( - std::move(image_data), - static_cast(width), - static_cast(height), - static_cast(channels))); + if (image_tensor.scalar_type() == torch::kUInt8) { + uint8_t* data = image_tensor.data_ptr(); + std::vector image_data(data, data + image_tensor.numel()); + return MultimodalInput(Image( + std::move(image_data), + static_cast(width), + static_cast(height), + static_cast(channels))); + } else if (image_tensor.scalar_type() == torch::kFloat) { + float* data = image_tensor.data_ptr(); + std::vector image_data(data, data + image_tensor.numel()); + return MultimodalInput(Image( + std::move(image_data), + static_cast(width), + static_cast(height), + static_cast(channels))); + } else { + throw std::runtime_error( + "Unsupported image tensor dtype. Only uint8 and float32 are supported."); + } }, "Create an image input from a torch tensor (H, W, C), (1, H, W, C), (C, H, W), or (1, C, H, W)", py::arg("image_tensor")); From 5be86d22bb896928291ae95a4a69fcb4e7e84885 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Thu, 18 Sep 2025 00:31:48 -0700 Subject: [PATCH 11/40] Address comments --- extension/llm/runner/llm_runner_helper.h | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/extension/llm/runner/llm_runner_helper.h b/extension/llm/runner/llm_runner_helper.h index 76f129774cf..191ea3ab090 100644 --- a/extension/llm/runner/llm_runner_helper.h +++ b/extension/llm/runner/llm_runner_helper.h @@ -121,21 +121,4 @@ ET_EXPERIMENTAL std::unique_ptr create_multimodal_runner( std::unique_ptr<::tokenizers::Tokenizer> tokenizer, std::optional data_path = std::nullopt); -/** - * @brief Creates a MultimodalRunner instance with a shared tokenizer - * - * This overload allows using a tokenizer that is shared/owned by Python or - * other code. The tokenizer must remain valid for the lifetime of the runner. 
- * - * @param model_path Path to the model file - * @param tokenizer Shared pointer to an initialized tokenizer instance - * @param data_path Optional path to additional .ptd required by the model - * @return std::unique_ptr Initialized MultimodalRunner - * instance, or nullptr on failure - */ -ET_EXPERIMENTAL std::unique_ptr create_multimodal_runner( - const std::string& model_path, - std::shared_ptr<::tokenizers::Tokenizer> tokenizer, - std::optional data_path = std::nullopt); - } // namespace executorch::extension::llm From 5742bafa9fb0dcbf89838754b543246d0819ff2e Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Fri, 19 Sep 2025 00:18:55 -0700 Subject: [PATCH 12/40] Add support for audio and token input --- extension/llm/runner/CMakeLists.txt | 7 +- extension/llm/runner/__init__.py | 244 +----------- extension/llm/runner/_llm_runner.pyi | 195 +++++++++- extension/llm/runner/pybindings.cpp | 190 +++++++++ extension/llm/runner/test.ipynb | 468 ++++++++++++++++++++++ extension/llm/runner/test2.ipynb | 561 +++++++++++++++++++++++++++ 6 files changed, 1414 insertions(+), 251 deletions(-) create mode 100644 extension/llm/runner/test.ipynb create mode 100644 extension/llm/runner/test2.ipynb diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt index 8d985957ecc..989f794ab07 100644 --- a/extension/llm/runner/CMakeLists.txt +++ b/extension/llm/runner/CMakeLists.txt @@ -104,7 +104,12 @@ if(EXECUTORCH_BUILD_PYBIND) CXX_VISIBILITY_PRESET "hidden" INTERPROCEDURAL_OPTIMIZATION TRUE ) - + if(APPLE) + set(RPATH "@loader_path/../../pybindings") + else() + set(RPATH "$ORIGIN/../../pybindings") + endif() + set_target_properties(_llm_runner PROPERTIES INSTALL_RPATH ${RPATH}) # Add include directories target_include_directories( _llm_runner PRIVATE ${_common_include_directories} ${TORCH_INCLUDE_DIRS} diff --git a/extension/llm/runner/__init__.py b/extension/llm/runner/__init__.py index f2203ae988e..6d878308677 100644 --- a/extension/llm/runner/__init__.py +++ b/extension/llm/runner/__init__.py @@ -28,10 +28,13 @@ from executorch.extension.llm.runner._llm_runner import ( # noqa: F401 GenerationConfig, Image, + make_audio_input, make_image_input, + make_raw_audio_input, make_text_input, + make_token_input, MultimodalInput, - MultimodalRunner as _MultimodalRunnerCpp, + MultimodalRunner, Stats, ) except ImportError: @@ -40,242 +43,6 @@ ) -# Define the high-level Python wrapper for MultimodalRunner -class MultimodalRunner: - """ - High-level Python wrapper for the ExecuTorch MultimodalRunner. - - This class provides a convenient interface for running multimodal language models - that can process text, images, and other modalities to generate text output. - - Args: - model_path: Path to the ExecuTorch model file (.pte) - tokenizer_path: Path to the tokenizer file - temperature: Default temperature for text generation (default: 0.8) - device: Device to run on (currently only 'cpu' is supported) - - Example: - >>> runner = MultimodalRunner("model.pte", "tokenizer.bin") - >>> inputs = [ - ... runner.create_text_input("Describe this image:"), - ... runner.create_image_input("image.jpg") - ... 
] - >>> response = runner.generate_text(inputs, max_new_tokens=100) - >>> print(response) - """ - - def __init__( - self, - model_path: Union[str, Path], - tokenizer_path: Union[str, Path], - temperature: float = 0.8, - device: str = "cpu", - ): - """Initialize the MultimodalRunner.""" - if device != "cpu": - raise ValueError( - f"Currently only 'cpu' device is supported, got '{device}'" - ) - - # Convert paths to strings - model_path = str(Path(model_path).resolve()) - tokenizer_path = str(Path(tokenizer_path).resolve()) - - # Validate paths exist - if not Path(model_path).exists(): - raise FileNotFoundError(f"Model file not found: {model_path}") - if not Path(tokenizer_path).exists(): - raise FileNotFoundError(f"Tokenizer file not found: {tokenizer_path}") - - # Initialize the C++ runner - self._runner = _MultimodalRunnerCpp(model_path, tokenizer_path, temperature) - self._model_path = model_path - self._tokenizer_path = tokenizer_path - self._default_temperature = temperature - - def create_text_input(self, text: str): - """ - Create a text input for multimodal processing. - - Args: - text: The input text string - - Returns: - A MultimodalInput object containing the text - """ - return make_text_input(text) - - def create_image_input( # noqa: C901 - self, image: Union[str, Path, np.ndarray, "PILImage.Image"] - ): - """ - Create an image input for multimodal processing. - - Args: - image: Can be: - - Path to an image file (str or Path) - - NumPy array with shape (H, W, C) where C is 3 (RGB) or 4 (RGBA) - - PIL Image object - - Returns: - A MultimodalInput object containing the image - - Raises: - ValueError: If the image format is not supported - FileNotFoundError: If the image file doesn't exist - """ - if isinstance(image, (str, Path)): - # Load image from file - image_path = Path(image) - if not image_path.exists(): - raise FileNotFoundError(f"Image file not found: {image_path}") - - if HAS_PIL: - pil_image = PILImage.open(image_path) - # Convert to RGB if necessary - if pil_image.mode != "RGB": - pil_image = pil_image.convert("RGB") - image = np.array(pil_image, dtype=np.uint8) - else: - # Try to use cv2 if available - try: - import cv2 - - image = cv2.imread(str(image_path)) - if image is None: - raise ValueError(f"Failed to load image: {image_path}") - # Convert BGR to RGB - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - except ImportError: - raise ImportError( - "Either PIL or OpenCV is required to load images from files. 
" - "Install with: pip install pillow or pip install opencv-python" - ) - - elif HAS_PIL and isinstance(image, PILImage.Image): - # Convert PIL Image to numpy array - if image.mode != "RGB": - image = image.convert("RGB") - image = np.array(image, dtype=np.uint8) - - elif isinstance(image, np.ndarray): - # Validate numpy array - if image.ndim != 3: - raise ValueError( - f"Image array must be 3-dimensional (H, W, C), got shape {image.shape}" - ) - if image.shape[2] not in [3, 4]: - raise ValueError( - f"Image must have 3 (RGB) or 4 (RGBA) channels, got {image.shape[2]}" - ) - if image.dtype != np.uint8: - # Convert to uint8 if necessary - if image.max() <= 1.0: - # Assume normalized [0, 1] range - image = (image * 255).astype(np.uint8) - else: - image = image.astype(np.uint8) - else: - raise ValueError(f"Unsupported image type: {type(image)}") - - return make_image_input(image) - - def generate( - self, - inputs: List[Any], - config: Optional[GenerationConfig] = None, - token_callback: Optional[Callable[[str], None]] = None, - stats_callback: Optional[Callable[[Any], None]] = None, - ): - """ - Generate text from multimodal inputs with streaming callbacks. - - Args: - inputs: List of multimodal inputs (text, images, etc.) - config: Generation configuration (uses defaults if None) - token_callback: Function called for each generated token - stats_callback: Function called with generation statistics - """ - if config is None: - config = GenerationConfig() - config.temperature = self._default_temperature - - self._runner.generate(inputs, config, token_callback, stats_callback) - - def generate_text( - self, - inputs: List[Any], - config: Optional[GenerationConfig] = None, - max_new_tokens: Optional[int] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - **kwargs, - ) -> str: - """ - Generate text from multimodal inputs and return the complete result. - - Args: - inputs: List of multimodal inputs (text, images, etc.) 
- config: Generation configuration (overrides other parameters if provided) - max_new_tokens: Maximum number of tokens to generate - temperature: Sampling temperature (0.0 to 1.0) - top_p: Top-p sampling parameter - **kwargs: Additional generation parameters - - Returns: - The generated text as a string - """ - if config is None: - config = GenerationConfig() - config.temperature = temperature or self._default_temperature - if max_new_tokens is not None: - config.max_new_tokens = max_new_tokens - if top_p is not None: - config.top_p = top_p - - # Set any additional parameters - for key, value in kwargs.items(): - if hasattr(config, key): - setattr(config, key, value) - - return self._runner.generate_text(inputs, config) # type: ignore[attr-defined] - - def stop(self): - """Stop the current generation process.""" - self._runner.stop() - - @property - def vocab_size(self) -> int: - """Get the vocabulary size of the model.""" - return self._runner.get_vocab_size() - - @property - def model_path(self) -> str: - """Get the path to the loaded model.""" - return self._model_path - - @property - def tokenizer_path(self) -> str: - """Get the path to the loaded tokenizer.""" - return self._tokenizer_path - - def __repr__(self) -> str: - return ( - f"MultimodalRunner(model='{Path(self._model_path).name}', " - f"tokenizer='{Path(self._tokenizer_path).name}', " - f"vocab_size={self.vocab_size})" - ) - - def __enter__(self): - """Context manager entry.""" - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - """Context manager exit - ensures cleanup.""" - self.stop() - return False - - # Import utility functions from .utils import create_generation_config, load_image_from_file, preprocess_image @@ -285,7 +52,10 @@ def __exit__(self, exc_type, exc_val, exc_tb): "Stats", "Image", "MultimodalInput", + "make_audio_input", + "make_raw_audio_input", "make_text_input", + "make_token_input", "make_image_input", "load_image_from_file", "preprocess_image", diff --git a/extension/llm/runner/_llm_runner.pyi b/extension/llm/runner/_llm_runner.pyi index e245301747b..785b776c816 100644 --- a/extension/llm/runner/_llm_runner.pyi +++ b/extension/llm/runner/_llm_runner.pyi @@ -7,6 +7,7 @@ This file provides type annotations for the ExecuTorch LLM Runner Python binding from typing import Callable, List, Optional, Union import numpy as np +import torch from numpy.typing import NDArray class GenerationConfig: @@ -123,26 +124,111 @@ class Stats: class Image: """Container for image data.""" + def __init__(self) -> None: + """Initialize an empty Image.""" + ... + + def __init__(self, data: List[int], width: int, height: int, channels: int) -> None: + """Initialize an Image with uint8 data.""" + ... + + def __init__( + self, data: List[float], width: int, height: int, channels: int + ) -> None: + """Initialize an Image with float data.""" + ... + + def is_uint8(self) -> bool: + """Check if image data is uint8 format.""" + ... + + def is_float(self) -> bool: + """Check if image data is float format.""" + ... + + @property + def width(self) -> int: + """Image width in pixels.""" + ... + + @property + def height(self) -> int: + """Image height in pixels.""" + ... + + @property + def channels(self) -> int: + """Number of color channels (3 for RGB, 4 for RGBA).""" + ... + + @property + def uint8_data(self) -> List[int]: + """Raw image data as uint8 values.""" + ... + + @property + def float_data(self) -> List[float]: + """Raw image data as float values.""" + ... + + def __repr__(self) -> str: ... 
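The uint8/float split above is easiest to see from the call site. A minimal sketch, assuming CHW-contiguous tensors (the layout the binding detects unambiguously); shapes and values are illustrative.

```python
import torch

from executorch.extension.llm.runner import make_image_input

# A contiguous (C, H, W) uint8 tensor is carried through as uint8.
rgb = torch.zeros(3, 224, 224, dtype=torch.uint8)
img = make_image_input(rgb).get_image()
assert img.is_uint8() and not img.is_float()
assert (img.channels, img.height, img.width) == (3, 224, 224)

# A float tensor now stays float32 instead of being quantized to uint8,
# so normalized (e.g. CLIP-style) pixel values survive intact.
norm = torch.rand(3, 336, 336)
assert make_image_input(norm).get_image().is_float()
```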
+ +class Audio: + """Container for preprocessed audio data.""" + data: List[int] - """Raw image data as a list of uint8 values.""" + """Raw audio data as a list of uint8 values.""" - width: int - """Image width in pixels.""" + batch_size: int + """Batch size of the audio data.""" - height: int - """Image height in pixels.""" + n_bins: int + """Number of frequency bins (for spectrograms).""" - channels: int - """Number of color channels (3 for RGB, 4 for RGBA).""" + n_frames: int + """Number of time frames.""" def __init__(self) -> None: - """Initialize an empty Image.""" + """Initialize an empty Audio.""" + ... + + def __init__( + self, data: List[int], batch_size: int, n_bins: int, n_frames: int + ) -> None: + """Initialize Audio with preprocessed data.""" + ... + + def __repr__(self) -> str: ... + +class RawAudio: + """Container for raw audio data.""" + + data: List[int] + """Raw audio data as a list of uint8 values.""" + + batch_size: int + """Batch size of the audio data.""" + + n_channels: int + """Number of audio channels (1 for mono, 2 for stereo).""" + + n_samples: int + """Number of audio samples.""" + + def __init__(self) -> None: + """Initialize an empty RawAudio.""" + ... + + def __init__( + self, data: List[int], batch_size: int, n_channels: int, n_samples: int + ) -> None: + """Initialize RawAudio with raw data.""" ... def __repr__(self) -> str: ... class MultimodalInput: - """Container for multimodal input data (text, image, etc.).""" + """Container for multimodal input data (text, image, audio, etc.).""" def __init__(self, text: str) -> None: """ @@ -162,6 +248,24 @@ class MultimodalInput: """ ... + def __init__(self, audio: Audio) -> None: + """ + Create a MultimodalInput with preprocessed audio. + + Args: + audio: The input audio data + """ + ... + + def __init__(self, raw_audio: RawAudio) -> None: + """ + Create a MultimodalInput with raw audio. + + Args: + raw_audio: The input raw audio data + """ + ... + def is_text(self) -> bool: """Check if this input contains text.""" ... @@ -170,6 +274,14 @@ class MultimodalInput: """Check if this input contains an image.""" ... + def is_audio(self) -> bool: + """Check if this input contains preprocessed audio.""" + ... + + def is_raw_audio(self) -> bool: + """Check if this input contains raw audio.""" + ... + def get_text(self) -> Optional[str]: """ Get the text content if this is a text input. @@ -179,6 +291,33 @@ class MultimodalInput: """ ... + def get_image(self) -> Optional[Image]: + """ + Get the image content if this is an image input. + + Returns: + The Image object if this is an image input, None otherwise + """ + ... + + def get_audio(self) -> Optional[Audio]: + """ + Get the audio content if this is an audio input. + + Returns: + The Audio object if this is an audio input, None otherwise + """ + ... + + def get_raw_audio(self) -> Optional[RawAudio]: + """ + Get the raw audio content if this is a raw audio input. + + Returns: + The RawAudio object if this is a raw audio input, None otherwise + """ + ... + def __repr__(self) -> str: ... class MultimodalRunner: @@ -270,17 +409,47 @@ def make_text_input(text: str) -> MultimodalInput: """ ... -def make_image_input(image_array: NDArray[np.uint8]) -> MultimodalInput: +def make_image_input(image_tensor: torch.Tensor) -> MultimodalInput: """ - Create an image input from a numpy array. + Create an image input from a torch tensor. 
Args: - image_array: Numpy array with shape (H, W, C) where C is 3 (RGB) or 4 (RGBA) + image_tensor: Torch tensor with shape (H, W, C), (1, H, W, C), (C, H, W), or (1, C, H, W) Returns: A MultimodalInput containing the image Raises: - RuntimeError: If the array has invalid dimensions or number of channels + RuntimeError: If the tensor has invalid dimensions or number of channels + """ + ... + +def make_audio_input(audio_tensor: torch.Tensor) -> MultimodalInput: + """ + Create a preprocessed audio input from a torch tensor. + + Args: + audio_tensor: Torch tensor with shape (batch_size, n_bins, n_frames) + + Returns: + A MultimodalInput containing the preprocessed audio + + Raises: + RuntimeError: If the tensor has invalid dimensions or dtype + """ + ... + +def make_raw_audio_input(audio_tensor: torch.Tensor) -> MultimodalInput: + """ + Create a raw audio input from a torch tensor. + + Args: + audio_tensor: Torch tensor with shape (batch_size, n_channels, n_samples) + + Returns: + A MultimodalInput containing the raw audio + + Raises: + RuntimeError: If the tensor has invalid dimensions or dtype """ ... diff --git a/extension/llm/runner/pybindings.cpp b/extension/llm/runner/pybindings.cpp index fe5f26f45fd..310d05ad59e 100644 --- a/extension/llm/runner/pybindings.cpp +++ b/extension/llm/runner/pybindings.cpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -257,18 +258,97 @@ PYBIND11_MODULE(_llm_runner, m) { ">"; }); + // Bind Audio class + py::class_