
Commit 6e71eec

Fix windows build
1 parent 4218c04 commit 6e71eec

2 files changed (+249, −69 lines)

extension/llm/runner/README.md

Lines changed: 246 additions & 68 deletions
@@ -166,120 +166,298 @@ int main() {

## Python API

The LLM Runner framework provides Python bindings for easy integration with Python applications. The Python API mirrors the C++ interface while providing Pythonic convenience features such as torch tensor support and Hugging Face compatibility.

### Installation

Build the Python bindings as part of the ExecuTorch build:

```bash
# Build from source with Python bindings enabled.
# In the executorch root directory:
bash install_executorch.sh
```
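
Once the script finishes, a quick import check confirms that the bindings landed in the active environment (a minimal sanity check, not part of the build itself):

```python
# Sanity check: the bindings should be importable after the build.
from executorch.extension.llm.runner import MultimodalRunner, GenerationConfig

print(MultimodalRunner, GenerationConfig)
```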

### Quick Start Examples

#### Basic Multimodal Generation

```python
from executorch.extension.llm.runner import (
    GenerationConfig, MultimodalRunner,
    make_text_input, make_image_input, make_audio_input
)
import torch

# Create a multimodal runner
runner = MultimodalRunner(
    model_path="/path/to/model.pte",
    tokenizer_path="/path/to/tokenizer.bin"
)

# Create multimodal inputs
inputs = []
inputs.append(make_text_input("What do you see in this image?"))

# Add an image from a torch tensor (supports both CHW and HWC formats)
image_tensor = torch.randint(0, 255, (3, 224, 224), dtype=torch.uint8)  # CHW format
inputs.append(make_image_input(image_tensor))

# Configure generation
config = GenerationConfig(
    max_new_tokens=100,
    temperature=0.7,
    echo=False
)

# Generate with streaming output
def token_callback(token: str):
    print(token, end='', flush=True)

def stats_callback(stats):
    print(f"\n[Stats] Generated {stats.num_generated_tokens} tokens")
    inference_time = stats.inference_end_ms - stats.inference_start_ms
    if inference_time > 0:
        tokens_per_sec = stats.num_generated_tokens * 1000 / inference_time
        print(f"[Stats] Speed: {tokens_per_sec:.1f} tokens/sec")

runner.generate(inputs, config, token_callback, stats_callback)
```
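
If streaming isn't needed, the same `inputs` and `config` can be passed to `generate_text`, which returns the completed result as one string:

```python
# Alternative to callbacks: get the complete result as one string.
result = runner.generate_text(inputs, config)
print(f"Generated text: {result}")
```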

#### Working with Different Input Types

```python
from executorch.extension.llm.runner import (
    MultimodalRunner, GenerationConfig,
    make_text_input, make_token_input, make_image_input,
    make_audio_input, make_raw_audio_input
)
import torch

runner = MultimodalRunner("model.pte", "tokenizer.bin")

# 1. Text input
text_input = make_text_input("Analyze this multimodal content:")

# 2. Pre-tokenized input (useful for chat templates)
token_ids = [1, 15043, 445, 2420]  # Example token IDs
token_input = make_token_input(token_ids)

# 3. Image input from a torch tensor.
#    Supported formats: (H,W,C), (C,H,W), (1,H,W,C), (1,C,H,W)
image_hwc = torch.randint(0, 255, (224, 224, 3), dtype=torch.uint8)  # HWC
image_input = make_image_input(image_hwc)

# Float tensors are also supported for normalized images
image_float = torch.rand(3, 224, 224, dtype=torch.float32)  # CHW, normalized
image_input_float = make_image_input(image_float)

# 4. Preprocessed audio input (e.g., mel spectrograms)
audio_features = torch.rand(1, 80, 100, dtype=torch.float32)  # (batch, n_bins, n_frames)
audio_input = make_audio_input(audio_features)

# 5. Raw audio input (for models with built-in audio processing)
raw_audio = torch.randint(0, 255, (1, 1, 16000), dtype=torch.uint8)  # (batch, channels, samples)
raw_audio_input = make_raw_audio_input(raw_audio)

# Combine inputs and generate
inputs = [text_input, image_input, audio_input]
config = GenerationConfig(max_new_tokens=50, temperature=0.8)
response = runner.generate_text(inputs, config)
print(f"Response: {response}")
```

#### Hugging Face Integration

```python
from executorch.extension.llm.runner import MultimodalRunner, GenerationConfig
from transformers import AutoProcessor
from PIL import Image
import torch

# Load the HF processor for your model
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

# Create the runner
runner = MultimodalRunner("llava_model.pte", "tokenizer.bin")

# Process inputs with the HF processor
image = Image.open("photo.jpg")
conversation = [
    {"role": "user", "content": [
        {"type": "text", "text": "What's in this image?"},
        {"type": "image"}
    ]}
]

# Apply the chat template and preprocess
prompt = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
inputs_hf = processor(prompt, image, return_tensors="pt")

# Generate using the HF inputs directly
config = GenerationConfig(max_new_tokens=100, temperature=0.7)
runner.generate_hf(
    inputs_hf,
    config,
    image_token_id=processor.tokenizer.convert_tokens_to_ids("<image>"),
    token_callback=lambda token: print(token, end='', flush=True)
)
```
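
Note that `generate_hf` consumes the processor's output dictionary directly, and `image_token_id` is looked up from the same processor's tokenizer so that the id the runner sees for the `<image>` placeholder matches the one produced by the chat template.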

#### Chat Session with State Management

```python
from executorch.extension.llm.runner import (
    MultimodalRunner, GenerationConfig, make_text_input, make_image_input
)
import torch

class ChatSession:
    def __init__(self, model_path: str, tokenizer_path: str):
        self.runner = MultimodalRunner(model_path, tokenizer_path)
        self.config = GenerationConfig(max_new_tokens=150, temperature=0.7, echo=False)

    def send_message(self, message: str) -> str:
        """Send a text message and get a response."""
        inputs = [make_text_input(message)]
        return self.runner.generate_text(inputs, self.config)

    def send_multimodal(self, text: str, image_tensor: torch.Tensor) -> str:
        """Send text + image and get a response."""
        inputs = [
            make_text_input(text),
            make_image_input(image_tensor)
        ]
        return self.runner.generate_text(inputs, self.config)

    def reset_conversation(self):
        """Reset the conversation state."""
        self.runner.reset()

# Usage
chat = ChatSession("model.pte", "tokenizer.bin")
print(chat.send_message("Hello! How are you?"))

# Continue the conversation (the KV cache maintains context)
print(chat.send_message("What's the weather like?"))

# Reset when starting a new conversation
chat.reset_conversation()
```

### Python API Classes

#### GenerationConfig

```python
from executorch.extension.llm.runner import GenerationConfig

# Create with defaults
config = GenerationConfig()

# Or specify parameters
config = GenerationConfig(
    max_new_tokens=100,  # Maximum tokens to generate (-1 = auto)
    temperature=0.8,     # Sampling temperature (0.0 = deterministic)
    echo=True,           # Echo the input prompt in the output
    seq_len=2048,        # Maximum sequence length (-1 = auto)
    num_bos=0,           # Number of BOS tokens
    num_eos=0            # Number of EOS tokens
)

# Modify after creation
config.temperature = 0.5
config.max_new_tokens = 50
```

#### MultimodalInput Types

```python
from executorch.extension.llm.runner import (
    MultimodalInput, make_text_input, make_token_input,
    make_image_input, make_audio_input
)
import torch

# Text input
text_input = make_text_input("Hello, world!")
print(text_input.is_text())   # True
print(text_input.get_text())  # "Hello, world!"

# Token input (pre-tokenized)
token_input = make_token_input([1, 2, 3, 4])
print(token_input.is_tokens())   # True
print(token_input.get_tokens())  # [1, 2, 3, 4]

# Image input from a torch tensor
image_tensor = torch.randint(0, 255, (224, 224, 3), dtype=torch.uint8)
image_input = make_image_input(image_tensor)
print(image_input.is_image())  # True
image = image_input.get_image()
print(f"Image: {image.width}x{image.height}x{image.channels}")

# Check input types safely
if text_input.is_text():
    text = text_input.get_text()
elif text_input.is_image():
    image = text_input.get_image()
```

#### Stats and Performance Monitoring

```python
def detailed_stats_callback(stats):
    """Comprehensive stats monitoring."""
    print("\n=== Generation Statistics ===")
    print(f"Prompt tokens: {stats.num_prompt_tokens}")
    print(f"Generated tokens: {stats.num_generated_tokens}")

    # Timing breakdown
    model_load_time = stats.model_load_end_ms - stats.model_load_start_ms
    if model_load_time > 0:
        print(f"Model load time: {model_load_time}ms")

    inference_time = stats.inference_end_ms - stats.inference_start_ms
    if inference_time > 0:
        print(f"Total inference time: {inference_time}ms")

        # Calculate throughput (guarded against division by zero)
        tokens_per_sec = stats.num_generated_tokens * 1000 / inference_time
        print(f"Generation speed: {tokens_per_sec:.1f} tokens/sec")

    # Time to first token
    if stats.first_token_ms > stats.inference_start_ms:
        ttft = stats.first_token_ms - stats.inference_start_ms
        print(f"Time to first token: {ttft}ms")

    # Export to JSON for logging
    json_stats = stats.to_json_string()
    print(f"JSON stats: {json_stats}")

# Use in generation
runner.generate(inputs, config, token_callback, detailed_stats_callback)
```
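
Because `to_json_string()` returns plain JSON, the stats can also be parsed with the standard library and appended to a log file (a small sketch; the exact keys inside the payload aren't documented here, so the record is stored as-is):

```python
import json

def log_stats(stats, path="generation_stats.jsonl"):
    # Parse the runner's JSON stats and append them as one JSONL record.
    record = json.loads(stats.to_json_string())
    with open(path, "a") as f:
        f.write(json.dumps(record) + "\n")
```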

### Error Handling

```python
from executorch.extension.llm.runner import (
    MultimodalRunner, GenerationConfig, make_image_input
)
import torch

try:
    runner = MultimodalRunner("model.pte", "tokenizer.bin")

    # An invalid image tensor will raise a RuntimeError
    invalid_image = torch.rand(2, 224, 224, 3)  # Batched input with batch size != 1 is unsupported
    inputs = [make_image_input(invalid_image)]

    config = GenerationConfig(max_new_tokens=50)
    runner.generate_text(inputs, config)

except RuntimeError as e:
    print(f"Generation failed: {e}")

except FileNotFoundError as e:
    print(f"Model or tokenizer file not found: {e}")
```

For more C++ API documentation and implementation details, see the [Core Components](#core-components) section below.

## Core Components

setup.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -814,6 +814,8 @@ def run(self):  # noqa C901
         if cmake_cache.is_enabled("EXECUTORCH_BUILD_PYBIND"):
             cmake_build_args += ["--target", "portable_lib"]
             cmake_build_args += ["--target", "selective_build"]
+        # TODO(larryliu0820): Temporarily disable building llm_runner for Windows
+        if cmake_cache.is_enabled("EXECUTORCH_BUILD_LLM_RUNNER") and not _is_windows():
             cmake_build_args += ["--target", "_llm_runner"]

@@ -886,7 +888,7 @@ def run(self):  # noqa C901
             dependent_cmake_flags=["EXECUTORCH_BUILD_PYBIND"],
         ),
         BuiltExtension(
-            src="extension/llm/runner/_llm_runner.*",
+            src="extension/llm/runner/_llm_runner.*",  # @lint-ignore https://github.com/pytorch/executorch/blob/cb3eba0d7f630bc8cec0a9cc1df8ae2f17af3f7a/scripts/lint_xrefs.sh
             modpath="executorch.extension.llm.runner._llm_runner",
             dependent_cmake_flags=["EXECUTORCH_BUILD_PYBIND"],
         ),
```
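
The `_is_windows()` helper referenced above is defined elsewhere in setup.py and isn't part of this diff; a platform check of this kind is typically implemented along these lines (an illustration, not the actual definition):

```python
import sys

def _is_windows() -> bool:
    # Hypothetical stand-in for the helper used above: True when
    # building on a Windows host, where the _llm_runner target is skipped.
    return sys.platform == "win32"
```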
