## Python API

The LLM Runner framework provides Python bindings for easy integration with Python applications. The Python API mirrors the C++ interface while providing Pythonic convenience features such as torch tensor support and Hugging Face compatibility.

### Installation

Build the Python bindings as part of the ExecuTorch build:

```bash
# Build from source with Python bindings enabled
# (run from the executorch root directory)
bash install_executorch.sh
```
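
After the build completes, a quick import check confirms the bindings are available (a minimal sanity check; the module path matches the imports used in the examples below):

```python
# Verify that the LLM Runner Python bindings import cleanly
from executorch.extension.llm.runner import MultimodalRunner, GenerationConfig
print("LLM Runner bindings OK")
```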

### Quick Start Examples

#### Basic Multimodal Generation

```python
from executorch.extension.llm.runner import (
    GenerationConfig, MultimodalRunner,
    make_text_input, make_image_input
)
import torch

# Create a multimodal runner
runner = MultimodalRunner(
    model_path="/path/to/model.pte",
    tokenizer_path="/path/to/tokenizer.bin"
)

# Create multimodal inputs
inputs = []
inputs.append(make_text_input("What do you see in this image?"))

# Add an image from a torch tensor (both CHW and HWC formats are supported)
image_tensor = torch.randint(0, 255, (3, 224, 224), dtype=torch.uint8)  # CHW format
inputs.append(make_image_input(image_tensor))

# Configure generation
config = GenerationConfig(
    max_new_tokens=100,
    temperature=0.7,
    echo=False
)

# Generate with streaming output
def token_callback(token: str):
    print(token, end="", flush=True)

def stats_callback(stats):
    print(f"\n[Stats] Generated {stats.num_generated_tokens} tokens")
    inference_time = stats.inference_end_ms - stats.inference_start_ms
    if inference_time > 0:
        tokens_per_sec = stats.num_generated_tokens * 1000 / inference_time
        print(f"[Stats] Speed: {tokens_per_sec:.1f} tokens/sec")

runner.generate(inputs, config, token_callback, stats_callback)
```
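
If you don't need token-by-token streaming, `generate_text` returns the complete response as a single string, using the same `runner`, `inputs`, and `config` as above:

```python
# Non-streaming alternative: collect the full response at once
response = runner.generate_text(inputs, config)
print(f"Response: {response}")
```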

#### Working with Different Input Types

```python
from executorch.extension.llm.runner import (
    MultimodalRunner, GenerationConfig,
    make_text_input, make_token_input, make_image_input,
    make_audio_input, make_raw_audio_input
)
import torch

runner = MultimodalRunner("model.pte", "tokenizer.bin")

# 1. Text input
text_input = make_text_input("Analyze this multimodal content:")

# 2. Pre-tokenized input (useful for chat templates)
token_ids = [1, 15043, 445, 2420]  # Example token IDs
token_input = make_token_input(token_ids)

# 3. Image input from a torch tensor
# Supports multiple formats: (H,W,C), (C,H,W), (1,H,W,C), (1,C,H,W)
image_hwc = torch.randint(0, 255, (224, 224, 3), dtype=torch.uint8)  # HWC
image_input = make_image_input(image_hwc)

# Float tensors are also supported for normalized images
image_float = torch.rand(3, 224, 224, dtype=torch.float32)  # CHW, normalized
image_input_float = make_image_input(image_float)

# 4. Preprocessed audio input (e.g., mel spectrograms)
audio_features = torch.rand(1, 80, 100, dtype=torch.float32)  # (batch, n_bins, n_frames)
audio_input = make_audio_input(audio_features)

# 5. Raw audio input (for models with built-in audio processing)
raw_audio = torch.randint(0, 255, (1, 1, 16000), dtype=torch.uint8)  # (batch, channels, samples)
raw_audio_input = make_raw_audio_input(raw_audio)

# Combine inputs and generate
inputs = [text_input, image_input, audio_input]
config = GenerationConfig(max_new_tokens=50, temperature=0.8)
response = runner.generate_text(inputs, config)
print(f"Response: {response}")
```

#### Hugging Face Integration

```python
from executorch.extension.llm.runner import MultimodalRunner, GenerationConfig
from transformers import AutoProcessor
from PIL import Image

# Load the HF processor for your model
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

# Create the runner
runner = MultimodalRunner("llava_model.pte", "tokenizer.bin")

# Process inputs with the HF processor
image = Image.open("photo.jpg")
conversation = [
    {"role": "user", "content": [
        {"type": "text", "text": "What's in this image?"},
        {"type": "image"}
    ]}
]

# Apply the chat template and preprocess
prompt = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
inputs_hf = processor(text=prompt, images=image, return_tensors="pt")

# Generate using the HF inputs directly
config = GenerationConfig(max_new_tokens=100, temperature=0.7)
runner.generate_hf(
    inputs_hf,
    config,
    image_token_id=processor.tokenizer.convert_tokens_to_ids("<image>"),
    token_callback=lambda token: print(token, end="", flush=True)
)
```

#### Chat Session with State Management

```python
import torch
from executorch.extension.llm.runner import (
    MultimodalRunner, GenerationConfig,
    make_text_input, make_image_input
)

class ChatSession:
    def __init__(self, model_path: str, tokenizer_path: str):
        self.runner = MultimodalRunner(model_path, tokenizer_path)
        self.config = GenerationConfig(max_new_tokens=150, temperature=0.7, echo=False)

    def send_message(self, message: str) -> str:
        """Send a text message and return the response."""
        inputs = [make_text_input(message)]
        return self.runner.generate_text(inputs, self.config)

    def send_multimodal(self, text: str, image_tensor: torch.Tensor) -> str:
        """Send text + image and return the response."""
        inputs = [
            make_text_input(text),
            make_image_input(image_tensor)
        ]
        return self.runner.generate_text(inputs, self.config)

    def reset_conversation(self):
        """Reset the conversation state."""
        self.runner.reset()

# Usage
chat = ChatSession("model.pte", "tokenizer.bin")
print(chat.send_message("Hello! How are you?"))

# Continue the conversation (the KV cache maintains context)
print(chat.send_message("What's the weather like?"))

# Reset when starting a new conversation
chat.reset_conversation()
```
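
The same session can mix modalities in a single turn, reusing the image tensor layouts from the earlier examples:

```python
# A CHW uint8 image tensor, as in the quick-start example
image = torch.randint(0, 255, (3, 224, 224), dtype=torch.uint8)
print(chat.send_multimodal("Describe this image.", image))
```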

### Python API Classes

#### GenerationConfig
```python
from executorch.extension.llm.runner import GenerationConfig

# Create with defaults
config = GenerationConfig()

# Or specify parameters
config = GenerationConfig(
    max_new_tokens=100,  # Maximum tokens to generate (-1 = auto)
    temperature=0.8,     # Sampling temperature (0.0 = deterministic)
    echo=True,           # Echo the input prompt in the output
    seq_len=2048,        # Maximum sequence length (-1 = auto)
    num_bos=0,           # Number of BOS tokens
    num_eos=0            # Number of EOS tokens
)

# Modify after creation
config.temperature = 0.5
config.max_new_tokens = 50
```
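
Since `temperature=0.0` makes decoding deterministic, one convenient pattern (a sketch using only the parameters documented above) is to keep separate configs for reproducible and exploratory decoding:

```python
# Deterministic (greedy) decoding for reproducible outputs
greedy_config = GenerationConfig(max_new_tokens=50, temperature=0.0)

# Higher temperature for more varied sampling
creative_config = GenerationConfig(max_new_tokens=50, temperature=1.0)
```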

#### MultimodalInput Types
```python
import torch
from executorch.extension.llm.runner import (
    MultimodalInput, make_text_input, make_token_input,
    make_image_input, make_audio_input
)

# Text input
text_input = make_text_input("Hello, world!")
print(text_input.is_text())   # True
print(text_input.get_text())  # "Hello, world!"

# Token input (pre-tokenized)
token_input = make_token_input([1, 2, 3, 4])
print(token_input.is_tokens())   # True
print(token_input.get_tokens())  # [1, 2, 3, 4]

# Image input from a torch tensor
image_tensor = torch.randint(0, 255, (224, 224, 3), dtype=torch.uint8)
image_input = make_image_input(image_tensor)
print(image_input.is_image())  # True
image = image_input.get_image()
print(f"Image: {image.width}x{image.height}x{image.channels}")

# Check input types safely
if text_input.is_text():
    text = text_input.get_text()
elif text_input.is_image():
    image = text_input.get_image()
```
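
The type-check accessors also make it easy to inspect a mixed input list before generation; a small helper sketch built only from the accessors shown above (the `describe_inputs` name is illustrative):

```python
def describe_inputs(inputs):
    """Print a short summary of each input's modality."""
    for i, inp in enumerate(inputs):
        if inp.is_text():
            print(f"{i}: text ({len(inp.get_text())} chars)")
        elif inp.is_tokens():
            print(f"{i}: tokens ({len(inp.get_tokens())} ids)")
        elif inp.is_image():
            img = inp.get_image()
            print(f"{i}: image {img.width}x{img.height}x{img.channels}")

describe_inputs([text_input, token_input, image_input])
```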

#### Stats and Performance Monitoring
```python
def detailed_stats_callback(stats):
    """Comprehensive stats monitoring."""
    print("\n=== Generation Statistics ===")
    print(f"Prompt tokens: {stats.num_prompt_tokens}")
    print(f"Generated tokens: {stats.num_generated_tokens}")

    # Timing breakdown
    model_load_time = stats.model_load_end_ms - stats.model_load_start_ms
    if model_load_time > 0:
        print(f"Model load time: {model_load_time} ms")

    inference_time = stats.inference_end_ms - stats.inference_start_ms
    if inference_time > 0:
        print(f"Total inference time: {inference_time} ms")

        # Calculate throughput (guarded against division by zero)
        tokens_per_sec = stats.num_generated_tokens * 1000 / inference_time
        print(f"Generation speed: {tokens_per_sec:.1f} tokens/sec")

    # Time to first token
    if stats.first_token_ms > stats.inference_start_ms:
        ttft = stats.first_token_ms - stats.inference_start_ms
        print(f"Time to first token: {ttft} ms")

    # Export to JSON for logging
    json_stats = stats.to_json_string()
    print(f"JSON stats: {json_stats}")

# Use in generation
runner.generate(inputs, config, token_callback, detailed_stats_callback)
```
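
Assuming `to_json_string()` emits valid JSON (as the name suggests), the export can feed structured logging directly:

```python
import json

def log_stats(stats):
    # Parse the exported stats string into a dict for structured logging
    record = json.loads(stats.to_json_string())
    print(record)
```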

### Error Handling

```python
from executorch.extension.llm.runner import (
    MultimodalRunner, GenerationConfig, make_image_input
)
import torch

try:
    runner = MultimodalRunner("model.pte", "tokenizer.bin")

    # An invalid image tensor will raise RuntimeError
    invalid_image = torch.rand(2, 224, 224, 3)  # Invalid: 4-D images must have batch size 1
    inputs = [make_image_input(invalid_image)]

    config = GenerationConfig(max_new_tokens=50)
    runner.generate_text(inputs, config)

except RuntimeError as e:
    print(f"Generation failed: {e}")

except FileNotFoundError as e:
    print(f"Model or tokenizer file not found: {e}")
```

For more C++ API documentation and implementation details, see the [Core Components](#core-components) section below.

## Core Components