diff --git a/.ci/docker/ci_commit_pins/optimum-executorch.txt b/.ci/docker/ci_commit_pins/optimum-executorch.txt index 30b9427824f..4cf99a4f78e 100644 --- a/.ci/docker/ci_commit_pins/optimum-executorch.txt +++ b/.ci/docker/ci_commit_pins/optimum-executorch.txt @@ -1 +1 @@ -828ae02053a6e0e20a2dfd6e737ba10c6f4dee6b +bd06b54e627fbfd354a2cffa4c80fb21883209a9 diff --git a/.ci/scripts/test_huggingface_optimum_model.py b/.ci/scripts/test_huggingface_optimum_model.py index 05b25299522..e5d815cfc00 100644 --- a/.ci/scripts/test_huggingface_optimum_model.py +++ b/.ci/scripts/test_huggingface_optimum_model.py @@ -43,7 +43,9 @@ def cli_export(command, model_dir): def check_causal_lm_output_quality( - model_id: str, generated_tokens: List[int], max_perplexity_threshold: float = 100.0 + model_id: str, + generated_tokens: List[int], + max_perplexity_threshold: float = 100.0, ): """ Evaluates the quality of text generated by a causal language model by calculating its perplexity. @@ -58,12 +60,24 @@ def check_causal_lm_output_quality( """ logging.info(f"Starting perplexity check with model '{model_id}' ...") # Load model - model = AutoModelForCausalLM.from_pretrained( - model_id, - low_cpu_mem_usage=True, - use_cache=False, - torch_dtype=torch.bfloat16, - ) + cls_name = AutoModelForCausalLM + if "llava" in model_id: + from transformers import LlavaForConditionalGeneration + + cls_name = LlavaForConditionalGeneration + try: + model = cls_name.from_pretrained( + model_id, + low_cpu_mem_usage=True, + use_cache=False, + torch_dtype=torch.bfloat16, + ) + except TypeError: + model = cls_name.from_pretrained( + model_id, + low_cpu_mem_usage=True, + torch_dtype=torch.bfloat16, + ) with torch.no_grad(): outputs = model(input_ids=generated_tokens, labels=generated_tokens) @@ -156,6 +170,86 @@ def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only assert check_causal_lm_output_quality(model_id, generated_tokens) is True +def test_llm_with_image_modality( + model_id, model_dir, recipe, *, quantize=True, run_only=False +): + command = [ + "optimum-cli", + "export", + "executorch", + "--model", + model_id, + "--task", + "multimodal-text-to-text", + "--recipe", + recipe, + "--output_dir", + model_dir, + "--use_custom_sdpa", + "--use_custom_kv_cache", + "--qlinear", + "8da4w", + "--qembedding", + "8w", + ] + if not run_only: + cli_export(command, model_dir) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer.save_pretrained(model_dir) + + # input + processor = AutoProcessor.from_pretrained(model_id) + image_url = "https://llava-vl.github.io/static/images/view.jpg" + conversation = [ + { + "role": "system", + "content": [ + { + "type": "text", + "text": "A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.", + } + ], + }, + { + "role": "user", + "content": [ + {"type": "image", "url": image_url}, + { + "type": "text", + "text": "What are the things I should be cautious about when I visit here?", + }, + ], + }, + ] + inputs = processor.apply_chat_template( + conversation, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + ) + + from executorch.extension.llm.runner import GenerationConfig, MultimodalRunner + + runner = MultimodalRunner(f"{model_dir}/model.pte", f"{model_dir}/tokenizer.model") + generated_text = runner.generate_text_hf( + inputs, + GenerationConfig(max_new_tokens=128, temperature=0, echo=False), + processor.image_token_id, + ) + print(f"\nGenerated text:\n\t{generated_text}") + # Free memory before loading eager for quality check + del runner + gc.collect() + assert ( + check_causal_lm_output_quality( + model_id, tokenizer.encode(generated_text, return_tensors="pt") + ) + is True + ) + + def test_fill_mask(model_id, model_dir, recipe, *, quantize=True, run_only=False): command = [ "optimum-cli", @@ -353,6 +447,9 @@ def test_vit(model_id, model_dir, recipe, *, quantize=False, run_only=False): required=False, help="When provided, write the pte file to this directory. Otherwise, a temporary directory is created for the test.", ) + parser.add_argument( + "--run_only", action="store_true", help="Skip export and only run the test" + ) args = parser.parse_args() _text_generation_mapping = { @@ -384,8 +481,16 @@ def test_vit(model_id, model_dir, recipe, *, quantize=False, run_only=False): "vit": ("google/vit-base-patch16-224", test_vit), } + _multimodal_model_mapping = { + "gemma3-4b": ("google/gemma-3-4b-it", test_llm_with_image_modality), + "llava": ("llava-hf/llava-1.5-7b-hf", test_llm_with_image_modality), + } + model_to_model_id_and_test_function = ( - _text_generation_mapping | _mask_fill_mapping | _misc_model_mapping + _text_generation_mapping + | _mask_fill_mapping + | _misc_model_mapping + | _multimodal_model_mapping ) if args.model not in model_to_model_id_and_test_function: @@ -400,4 +505,5 @@ def test_vit(model_id, model_dir, recipe, *, quantize=False, run_only=False): model_dir=tmp_dir if args.model_dir is None else args.model_dir, recipe=args.recipe, quantize=args.quantize, + run_only=args.run_only, ) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index f372be0e46f..4215db1e2ca 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -286,15 +286,20 @@ jobs: # Test selective build PYTHON_EXECUTABLE=python bash examples/selective_build/test_selective_build.sh "${BUILD_TOOL}" - test-llava-runner-linux: - name: test-llava-runner-linux + test-multimodal-linux: + if: ${{ !github.event.pull_request.head.repo.fork }} + name: test-multimodal-linux uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: id-token: write contents: read + secrets: inherit strategy: fail-fast: false + matrix: + model: ["gemma3-4b"] # llava gives segfault so not covering. 
with: + secrets-env: EXECUTORCH_HF_TOKEN runner: linux.24xlarge docker-image: ci-image:executorch-ubuntu-22.04-clang12 submodules: 'recursive' @@ -305,17 +310,20 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" + echo "::group::Setup ExecuTorch" PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake" - - # install Llava requirements - bash examples/models/llama/install_requirements.sh - bash examples/models/llava/install_requirements.sh - - # run python unittest - python -m unittest examples.models.llava.test.test_llava - - # run e2e (export, tokenizer and runner) - PYTHON_EXECUTABLE=python bash .ci/scripts/test_llava.sh + echo "::endgroup::" + + echo "::group::Setup Huggingface" + pip install -U "huggingface_hub[cli]" accelerate + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) + pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + echo "::endgroup::" + + echo "::group::Test ${{ matrix.model }}" + python .ci/scripts/test_huggingface_optimum_model.py --model ${{ matrix.model }} --quantize --recipe xnnpack + echo "::endgroup::" test-moshi-linux: name: test-moshi-linux diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 629c84847f6..362df17dc9b 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -616,34 +616,45 @@ jobs: bash .ci/scripts/test_torchao_huggingface_checkpoints.sh ${{ matrix.model }} ${{ matrix.test_with_runner && '--test_with_runner' || '' }} - # # TODO(jackzhxng): Runner consistently runs out of memory before test finishes. Try to find a more powerful runner. - # test-llava-runner-macos: - # name: test-llava-runner-macos - # uses: pytorch/test-infra/.github/workflows/macos_job.yml@main - # strategy: - # fail-fast: false - # with: - # runner: macos-14-xlarge - # python-version: '3.11' - # submodules: 'recursive' - # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # timeout: 900 - # script: | - # BUILD_TOOL=cmake - - # bash .ci/scripts/setup-conda.sh - # # Setup MacOS dependencies as there is no Docker support on MacOS atm - # GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}" - - # # install Llava requirements - # ${CONDA_RUN} bash examples/models/llama/install_requirements.sh - # ${CONDA_RUN} bash examples/models/llava/install_requirements.sh - - # # run python unittest - # ${CONDA_RUN} python -m unittest examples.models.llava.test.test_llava - - # # run e2e (export, tokenizer and runner) - # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llava.sh + test-multimodal-macos: + if: ${{ !github.event.pull_request.head.repo.fork }} + name: test-multimodal-macos + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + permissions: + id-token: write + contents: read + secrets: inherit + strategy: + fail-fast: false + matrix: + model: ["gemma3-4b"] # llava gives segfault so not covering. 
+ with: + secrets-env: EXECUTORCH_HF_TOKEN + runner: macos-15-xlarge + python-version: '3.11' + submodules: 'recursive' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 90 + script: | + echo "::group::Set up ExecuTorch" + bash .ci/scripts/setup-conda.sh + eval "$(conda shell.bash hook)" + + # Install requirements + ${CONDA_RUN} python install_executorch.py + echo "::endgroup::" + + echo "::group::Set up Huggingface" + ${CONDA_RUN} pip install -U "huggingface_hub[cli]" accelerate + ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) + ${CONDA_RUN} pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + ${CONDA_RUN} pip list + echo "::endgroup::" + + echo "::group::Test ${{ matrix.model }}" + ${CONDA_RUN} python .ci/scripts/test_huggingface_optimum_model.py --model ${{ matrix.model }} --quantize --recipe xnnpack + echo "::endgroup::" test-qnn-model: name: test-qnn-model diff --git a/CMakeLists.txt b/CMakeLists.txt index e419a45a879..0fbd77aeec7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -650,15 +650,6 @@ if(EXECUTORCH_BUILD_EXTENSION_LLM) list(APPEND _executorch_extensions tokenizers) endif() -if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner) - list(APPEND _executorch_extensions extension_llm_runner) -endif() - -if(EXECUTORCH_BUILD_EXTENSION_LLM_APPLE) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/apple) -endif() - if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util) install( @@ -904,6 +895,15 @@ if(EXECUTORCH_BUILD_EXTENSION_TRAINING) list(APPEND _executorch_extensions extension_training) endif() +if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner) + list(APPEND _executorch_extensions extension_llm_runner) +endif() + +if(EXECUTORCH_BUILD_EXTENSION_LLM_APPLE) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/apple) +endif() + if(EXECUTORCH_BUILD_KERNELS_LLM) # TODO: move all custom kernels to ${CMAKE_CURRENT_SOURCE_DIR}/kernels/custom add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/custom_ops) diff --git a/examples/models/llava/install_requirements.sh b/examples/models/llava/install_requirements.sh index 4dcdeea83bf..9dfccf11600 100755 --- a/examples/models/llava/install_requirements.sh +++ b/examples/models/llava/install_requirements.sh @@ -7,9 +7,4 @@ set -x -pip install transformers accelerate sentencepiece tiktoken - -# Run llama2/install requirements for torchao deps -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) - -bash "$SCRIPT_DIR"/../llama/install_requirements.sh +pip install git+https://github.com/huggingface/optimum-executorch.git@d4d3046738ca31b5542506aaa76a28d540600227 diff --git a/examples/models/llava/main.cpp b/examples/models/llava/main.cpp index 3946a629ade..635fd7888d2 100644 --- a/examples/models/llava/main.cpp +++ b/examples/models/llava/main.cpp @@ -131,8 +131,7 @@ int32_t main(int32_t argc, char** argv) { #endif // Load tokenizer std::unique_ptr<::tokenizers::Tokenizer> tokenizer = - std::make_unique(); - tokenizer->load(tokenizer_path); + ::executorch::extension::llm::load_tokenizer(tokenizer_path); if (tokenizer == nullptr) { ET_LOG(Error, "Failed to load tokenizer from: %s", tokenizer_path); return 1; diff --git 
a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt index cf8983db1fb..8d280b4eaf9 100644 --- a/extension/llm/runner/CMakeLists.txt +++ b/extension/llm/runner/CMakeLists.txt @@ -79,3 +79,43 @@ install( if(BUILD_TESTING) add_subdirectory(test) endif() + +# Python bindings for MultimodalRunner +if(EXECUTORCH_BUILD_PYBIND) + # Create the Python extension module for LLM runners + pybind11_add_module( + _llm_runner SHARED ${CMAKE_CURRENT_SOURCE_DIR}/pybindings.cpp + ) + + find_package_torch() + find_library( + TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib" + ) + # Link with the extension_llm_runner library and its dependencies + target_link_libraries( + _llm_runner PRIVATE extension_llm_runner tokenizers::tokenizers + portable_lib ${TORCH_PYTHON_LIBRARY} ${TORCH_LIBRARIES} + ) + + # Set properties for the Python extension + set_target_properties( + _llm_runner + PROPERTIES POSITION_INDEPENDENT_CODE ON + CXX_VISIBILITY_PRESET "hidden" + INTERPROCEDURAL_OPTIMIZATION TRUE + ) + if(APPLE) + set(RPATH "@loader_path/../../pybindings") + else() + set(RPATH "$ORIGIN/../../pybindings") + endif() + set_target_properties(_llm_runner PROPERTIES INSTALL_RPATH ${RPATH}) + # Add include directories + target_include_directories( + _llm_runner PRIVATE ${_common_include_directories} ${TORCH_INCLUDE_DIRS} + ) + + install(TARGETS _llm_runner + LIBRARY DESTINATION executorch/extension/llm/runner + ) +endif() diff --git a/extension/llm/runner/README.md b/extension/llm/runner/README.md index ab8ec8964dd..0bede23a228 100644 --- a/extension/llm/runner/README.md +++ b/extension/llm/runner/README.md @@ -164,6 +164,301 @@ int main() { } ``` +## Python API + +The LLM Runner framework provides Python bindings for easy integration with Python applications. The Python API mirrors the C++ interface while providing Pythonic convenience features like torch tensor support and Hugging Face compatibility. 
+ +### Installation + +Build the Python bindings as part of the ExecuTorch build: + +```bash +# Build from source with Python bindings enabled: +# In executorch root directory +bash install_executorch.sh +``` + +### Quick Start Examples + +#### Basic Multimodal Generation + +```python +from executorch.extension.llm.runner import ( + GenerationConfig, MultimodalRunner, + make_text_input, make_image_input, make_audio_input +) +import torch + +# Create a multimodal runner +runner = MultimodalRunner( + model_path="/path/to/model.pte", + tokenizer_path="/path/to/tokenizer.bin" +) + +# Create multimodal inputs +inputs = [] +inputs.append(make_text_input("What do you see in this image?")) + +# Add image from torch tensor (supports both CHW and HWC formats) +image_tensor = torch.randint(0, 255, (3, 224, 224), dtype=torch.uint8) # CHW format +inputs.append(make_image_input(image_tensor)) + +# Configure generation +config = GenerationConfig( + max_new_tokens=100, + temperature=0.7, + echo=False +) + +# Generate with streaming output +def token_callback(token: str): + print(token, end='', flush=True) + +def stats_callback(stats): + print(f"\n[Stats] Generated {stats.num_generated_tokens} tokens") + inference_time = stats.inference_end_ms - stats.inference_start_ms + if inference_time > 0: + tokens_per_sec = stats.num_generated_tokens * 1000 / inference_time + print(f"[Stats] Speed: {tokens_per_sec:.1f} tokens/sec") + +runner.generate(inputs, config, token_callback, stats_callback) +``` + +#### Working with Different Input Types + +```python +from executorch.extension.llm.runner import ( + MultimodalRunner, GenerationConfig, + make_text_input, make_token_input, make_image_input, + make_audio_input, make_raw_audio_input +) +import torch + +runner = MultimodalRunner("model.pte", "tokenizer.bin") + +# 1. Text input +text_input = make_text_input("Analyze this multimodal content:") + +# 2. Pre-tokenized input (useful for chat templates) +token_ids = [1, 15043, 445, 2420] # Example token IDs +token_input = make_token_input(token_ids) + +# 3. Image input from torch tensor +# Supports multiple formats: (H,W,C), (C,H,W), (1,H,W,C), (1,C,H,W) +image_hwc = torch.randint(0, 255, (224, 224, 3), dtype=torch.uint8) # HWC +image_input = make_image_input(image_hwc) + +# Float tensors also supported for normalized images +image_float = torch.rand(3, 224, 224, dtype=torch.float32) # CHW, normalized +image_input_float = make_image_input(image_float) + +# 4. Preprocessed audio input (e.g., mel spectrograms) +audio_features = torch.rand(1, 80, 100, dtype=torch.float32) # (batch, n_bins, n_frames) +audio_input = make_audio_input(audio_features) + +# 5. 
Raw audio input (for models with built-in audio processing) +raw_audio = torch.randint(0, 255, (1, 1, 16000), dtype=torch.uint8) # (batch, channels, samples) +raw_audio_input = make_raw_audio_input(raw_audio) + +# Combine inputs and generate +inputs = [text_input, image_input, audio_input] +config = GenerationConfig(max_new_tokens=50, temperature=0.8) +response = runner.generate_text(inputs, config) +print(f"Response: {response}") +``` + +#### Hugging Face Integration + +```python +from executorch.extension.llm.runner import MultimodalRunner, GenerationConfig +from transformers import AutoProcessor +from PIL import Image +import torch + +# Load HF processor for your model +processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") + +# Create runner +runner = MultimodalRunner("llava_model.pte", "tokenizer.bin") + +# Process inputs with HF processor +image = Image.open("photo.jpg") +conversation = [ + {"role": "user", "content": [ + {"type": "text", "text": "What's in this image?"}, + {"type": "image"} + ]} +] + +# Apply chat template and process +prompt = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True) +inputs_hf = processor(prompt, image, return_tensors="pt") + +# Generate using HF inputs directly +config = GenerationConfig(max_new_tokens=100, temperature=0.7) +runner.generate_hf( + inputs_hf, + config, + image_token_id=processor.tokenizer.convert_tokens_to_ids(""), + token_callback=lambda token: print(token, end='', flush=True) +) +``` + +#### Chat Session with State Management + +```python +from executorch.extension.llm.runner import MultimodalRunner, GenerationConfig, make_text_input + +class ChatSession: + def __init__(self, model_path: str, tokenizer_path: str): + self.runner = MultimodalRunner(model_path, tokenizer_path) + self.config = GenerationConfig(max_new_tokens=150, temperature=0.7, echo=False) + + def send_message(self, message: str) -> str: + """Send a message and get response""" + inputs = [make_text_input(message)] + response = self.runner.generate_text(inputs, self.config) + return response + + def send_multimodal(self, text: str, image_tensor: torch.Tensor) -> str: + """Send text + image and get response""" + inputs = [ + make_text_input(text), + make_image_input(image_tensor) + ] + response = self.runner.generate_text(inputs, self.config) + return response + + def reset_conversation(self): + """Reset the conversation state""" + self.runner.reset() + +# Usage +chat = ChatSession("model.pte", "tokenizer.bin") +print(chat.send_message("Hello! 
How are you?")) + +# Continue conversation (KV cache maintains context) +print(chat.send_message("What's the weather like?")) + +# Reset when starting new conversation +chat.reset_conversation() +``` + +### Python API Classes + +#### GenerationConfig +```python +from executorch.extension.llm.runner import GenerationConfig + +# Create with defaults +config = GenerationConfig() + +# Or specify parameters +config = GenerationConfig( + max_new_tokens=100, # Maximum tokens to generate (-1 = auto) + temperature=0.8, # Sampling temperature (0.0 = deterministic) + echo=True, # Echo input prompt in output + seq_len=2048, # Maximum sequence length (-1 = auto) + num_bos=0, # Number of BOS tokens + num_eos=0 # Number of EOS tokens +) + +# Modify after creation +config.temperature = 0.5 +config.max_new_tokens = 50 +``` + +#### MultimodalInput Types +```python +from executorch.extension.llm.runner import ( + MultimodalInput, make_text_input, make_token_input, + make_image_input, make_audio_input +) + +# Text input +text_input = make_text_input("Hello, world!") +print(text_input.is_text()) # True +print(text_input.get_text()) # "Hello, world!" + +# Token input (pre-tokenized) +token_input = make_token_input([1, 2, 3, 4]) +print(token_input.is_tokens()) # True +print(token_input.get_tokens()) # [1, 2, 3, 4] + +# Image input from torch tensor +import torch +image_tensor = torch.randint(0, 255, (224, 224, 3), dtype=torch.uint8) +image_input = make_image_input(image_tensor) +print(image_input.is_image()) # True +image = image_input.get_image() +print(f"Image: {image.width}x{image.height}x{image.channels}") + +# Check input types safely +if text_input.is_text(): + text = text_input.get_text() +elif text_input.is_image(): + image = text_input.get_image() +``` + +#### Stats and Performance Monitoring +```python +def detailed_stats_callback(stats): + """Comprehensive stats monitoring""" + print(f"\n=== Generation Statistics ===") + print(f"Prompt tokens: {stats.num_prompt_tokens}") + print(f"Generated tokens: {stats.num_generated_tokens}") + + # Timing breakdown + model_load_time = stats.model_load_end_ms - stats.model_load_start_ms + if model_load_time > 0: + print(f"Model load time: {model_load_time}ms") + + inference_time = stats.inference_end_ms - stats.inference_start_ms + if inference_time > 0: + print(f"Total inference time: {inference_time}ms") + + # Calculate throughput + tokens_per_sec = stats.num_generated_tokens * 1000 / inference_time + print(f"Generation speed: {tokens_per_sec:.1f} tokens/sec") + + # Time to first token + if stats.first_token_ms > stats.inference_start_ms: + ttft = stats.first_token_ms - stats.inference_start_ms + print(f"Time to first token: {ttft}ms") + + # Export to JSON for logging + json_stats = stats.to_json_string() + print(f"JSON stats: {json_stats}") + +# Use in generation +runner.generate(inputs, config, token_callback, detailed_stats_callback) +``` + +### Error Handling + +```python +from executorch.extension.llm.runner import MultimodalRunner, GenerationConfig +import torch + +try: + runner = MultimodalRunner("model.pte", "tokenizer.bin") + + # Invalid image tensor will raise RuntimeError + invalid_image = torch.rand(2, 224, 224, 3) # Wrong number of dimensions + inputs = [make_image_input(invalid_image)] + + config = GenerationConfig(max_new_tokens=50) + runner.generate_text(inputs, config) + +except RuntimeError as e: + print(f"Generation failed: {e}") + +except FileNotFoundError as e: + print(f"Model or tokenizer file not found: {e}") +``` + +For more C++ API 
documentation and implementation details, see the [Core Components](#core-components) section below. + ## Core Components ### Component Architecture diff --git a/extension/llm/runner/__init__.py b/extension/llm/runner/__init__.py new file mode 100644 index 00000000000..f62d62d3429 --- /dev/null +++ b/extension/llm/runner/__init__.py @@ -0,0 +1,235 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Python bindings for ExecuTorch MultimodalRunner. + +This module provides a Python interface to the ExecuTorch multimodal LLM runner, +enabling processing of mixed inputs (text, images, audio) and text generation. +""" + +try: + # Import shared components from the compiled C++ extension + from executorch.extension.llm.runner._llm_runner import ( # noqa: F401 + GenerationConfig, + Image, + make_audio_input, + make_image_input, + make_raw_audio_input, + make_text_input, + make_token_input, + MultimodalInput, + MultimodalRunner, + Stats, + ) +except ImportError: + raise RuntimeError( + "LLM runner is not installed. Please build ExecuTorch from source with EXECUTORCH_BUILD_PYBIND=ON" + ) + + +import logging +from typing import Callable, List, Optional, Union + +import torch +from transformers.feature_extraction_utils import BatchFeature + + +def _find_image_token_runs( + input_ids: torch.Tensor, image_token_id: Optional[int] +) -> List[tuple[int, int, int]]: + """Return contiguous runs (start, end, length) of image_token_id in input_ids. + + input_ids must be a 1D torch.Tensor. If image_token_id is None, returns an empty list. + """ + if image_token_id is None: + return [] + + ids_list = input_ids.tolist() + runs: List[tuple[int, int, int]] = [] + i = 0 + L = len(ids_list) + while i < L: + if ids_list[i] == image_token_id: + j = i + while j < L and ids_list[j] == image_token_id: + j += 1 + runs.append((i, j - 1, j - i)) + i = j + else: + i += 1 + + return runs + + +def _hf_to_multimodal_inputs( # noqa: C901 + inputs: BatchFeature, image_token_id: Optional[int] = None +) -> List[MultimodalInput]: + """Convert a HuggingFace AutoProcessor dict to ExecuTorch MultimodalInputs. + Currently only support 1 image inside the input. + + Args: + - inputs: A BatchFeature containing the input data. + - image_token_id: The token ID for the image, if present. + + `inputs` expected keys: + - 'input_ids': torch.Tensor of shape (L,) or (1, L) + - Optional 'pixel_values': torch.Tensor; if present, must also provide + 'image_token_id' (or alias 'image_token_index') and there must be + exactly one image token occurrence in input_ids. + + Raises: + RuntimeError: missing keys, invalid shapes/dtypes, or unsupported cases. 
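+
+    Example (illustrative only; the token IDs below are hypothetical and assume
+    image_token_id == 32000):
+        input_ids = [1, 32000, 32000, 9047] with pixel_values of shape (1, 3, 336, 336)
+        -> [make_token_input([1]),
+            make_image_input(pixel_values),
+            make_token_input([9047])]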
+ """ + if "input_ids" not in inputs: + raise RuntimeError("HF inputs dict must contain 'input_ids' (torch.Tensor)") + + input_ids = inputs["input_ids"] + if not isinstance(input_ids, torch.Tensor): + raise RuntimeError("'input_ids' must be a torch.Tensor") + + if input_ids.dim() == 2: + if input_ids.size(0) != 1: + raise RuntimeError( + "Expected 'input_ids' with batch size 1 when 2D (shape (1, L))" + ) + input_ids = input_ids.squeeze(0) + if input_ids.dim() != 1: + raise RuntimeError("'input_ids' must be 1D (L) or 2D with batch size 1") + + has_pixel_values = "pixel_values" in inputs + + # If pixel_values in dict, require image_token_id + if has_pixel_values and image_token_id is None: + raise RuntimeError("'pixel_values' provided but missing 'image_token_id'") + + # If there are image token ids but no pixel_values, it's an error + if ( + image_token_id is not None + and (input_ids == image_token_id).any().item() + and not has_pixel_values + ): + raise RuntimeError( + "Found image token(s) in input_ids but 'pixel_values' not provided" + ) + + # No images: return a single tokens input + if not has_pixel_values: + return [make_token_input(input_ids.to(torch.long).tolist())] + + # Determine number of images from pixel_values shape + pv = inputs["pixel_values"] + if not isinstance(pv, torch.Tensor): + raise RuntimeError( + "'pixel_values' must be a torch.Tensor, run with `return_tensors='pt'` in HF processor" + ) + if pv.dim() == 4: + num_images = int(pv.size(0)) + elif pv.dim() == 3: + num_images = 1 + else: + raise RuntimeError( + f"'pixel_values' must be 3D (C,H,W) or 4D (N,C,H,W)/(N,H,W,C), got shape {pv.shape}" + ) + + # Only support batch size 1 for now: + if num_images != 1: + raise RuntimeError("Only 1 image is supported for now") + # Find contiguous runs of image_token_id in input_ids + runs = _find_image_token_runs(input_ids, image_token_id) + + if len(runs) == 0: + raise RuntimeError( + "'pixel_values' provided but no occurrence of 'image_token_id' in input_ids" + ) + + # Support only one image/run for now; enforce exact match + if num_images != 1 or len(runs) != 1: + raise RuntimeError( + f"Mismatch between images and image token runs: images={num_images}, runs={len(runs)} (only batch=1 and a single contiguous run are supported)" + ) + + first, last, _ = runs[0] + + combined: List[MultimodalInput] = [] + if first > 0: + combined.append(make_token_input(input_ids[:first].to(torch.long).tolist())) + + # Use C++ checked creator for images (handles 3D/4D, CHW/HWC, uint8/float32) + combined.append(make_image_input(inputs["pixel_values"])) + + if (last + 1) < input_ids.numel(): + combined.append(make_token_input(input_ids[last + 1 :].to(torch.long).tolist())) + + return combined + + +def generate_hf( + runner: MultimodalRunner, + inputs: Union[BatchFeature, List[MultimodalInput]], + config: GenerationConfig, + image_token_id: Optional[int] = None, + token_callback: Optional[Callable[[str], None]] = None, + stats_callback: Optional[Callable[[Stats], None]] = None, +) -> None: + """Generate using an BatchFeature by converting to multimodal inputs internally, or using a list of MultimodalInput.""" + if isinstance(inputs, BatchFeature): + logging.info( + "Input is a BatchFeature, assuming it's coming from HF AutoProcessor.apply_chat_template(). Converting to multimodal inputs." 
+ ) + converted = _hf_to_multimodal_inputs(inputs, image_token_id=image_token_id) + elif isinstance(inputs, list) and all( + isinstance(i, MultimodalInput) for i in inputs + ): + converted = inputs + else: + raise RuntimeError( + "inputs must be either a BatchFeature (from HF AutoProcessor) or a list of MultimodalInput" + ) + + runner.generate(converted, config, token_callback, stats_callback) + + +def generate_text_hf( + runner: MultimodalRunner, + inputs: Union[BatchFeature, List[MultimodalInput]], + config: GenerationConfig, + image_token_id: Optional[int] = None, +) -> str: + """Generate using an BatchFeature by converting to multimodal inputs internally, or using a list of MultimodalInput.""" + if isinstance(inputs, BatchFeature): + logging.info( + "Input is a BatchFeature, assuming it's coming from HF AutoProcessor.apply_chat_template(). Converting to multimodal inputs." + ) + converted = _hf_to_multimodal_inputs(inputs, image_token_id=image_token_id) + elif isinstance(inputs, list) and all( + isinstance(i, MultimodalInput) for i in inputs + ): + converted = inputs + else: + raise RuntimeError( + "inputs must be either a BatchFeature (from HF AutoProcessor) or a list of MultimodalInput" + ) + + return runner.generate_text(converted, config) + + +setattr(MultimodalRunner, "generate_hf", generate_hf) # noqa B010 +setattr(MultimodalRunner, "generate_text_hf", generate_text_hf) # noqa B010 + + +__all__ = [ + "GenerationConfig", + "Image", + "make_audio_input", + "make_image_input", + "make_raw_audio_input", + "make_text_input", + "make_token_input", + "MultimodalInput", + "MultimodalRunner", + "Stats", +] diff --git a/extension/llm/runner/_llm_runner.pyi b/extension/llm/runner/_llm_runner.pyi new file mode 100644 index 00000000000..295601b092c --- /dev/null +++ b/extension/llm/runner/_llm_runner.pyi @@ -0,0 +1,523 @@ +""" +Type stubs for _llm_runner module. + +This file provides type annotations for the ExecuTorch LLM Runner Python bindings. +""" + +from typing import Callable, List, Optional, overload + +import torch + +class GenerationConfig: + """Configuration for text generation.""" + + echo: bool + """Whether to echo the input prompt in the output.""" + + max_new_tokens: int + """Maximum number of new tokens to generate (-1 for auto).""" + + warming: bool + """Whether this is a warmup run (affects perf benchmarking).""" + + seq_len: int + """Maximum number of total tokens (-1 for auto).""" + + temperature: float + """Temperature for sampling (higher = more random).""" + + num_bos: int + """Number of BOS tokens to add to the prompt.""" + + num_eos: int + """Number of EOS tokens to add to the prompt.""" + + def __init__( + self, + *, + echo: bool = True, + max_new_tokens: int = -1, + warming: bool = False, + seq_len: int = -1, + temperature: float = 0.8, + num_bos: int = 0, + num_eos: int = 0, + ) -> None: + """Initialize GenerationConfig with optional keyword arguments for all fields.""" + ... + + def resolve_max_new_tokens( + self, max_context_len: int, num_prompt_tokens: int + ) -> int: + """ + Resolve the maximum number of new tokens to generate based on constraints. + + Args: + max_context_len: The maximum context length supported by the model + num_prompt_tokens: The number of tokens in the input prompt + + Returns: + The resolved maximum number of new tokens to generate + """ + ... + + def __repr__(self) -> str: ... 
+ +class Stats: + """Statistics for LLM generation performance.""" + + SCALING_FACTOR_UNITS_PER_SECOND: int + """Scaling factor for timestamps (1000 for milliseconds).""" + + model_load_start_ms: int + """Start time of model loading in milliseconds.""" + + model_load_end_ms: int + """End time of model loading in milliseconds.""" + + inference_start_ms: int + """Start time of inference in milliseconds.""" + + token_encode_end_ms: int + """End time of tokenizer encoding in milliseconds.""" + + model_execution_start_ms: int + """Start time of model execution in milliseconds.""" + + model_execution_end_ms: int + """End time of model execution in milliseconds.""" + + prompt_eval_end_ms: int + """End time of prompt evaluation in milliseconds.""" + + first_token_ms: int + """Timestamp when the first generated token is emitted.""" + + inference_end_ms: int + """End time of inference/generation in milliseconds.""" + + aggregate_sampling_time_ms: int + """Total time spent in sampling across all tokens.""" + + num_prompt_tokens: int + """Number of tokens in the input prompt.""" + + num_generated_tokens: int + """Number of tokens generated.""" + + def on_sampling_begin(self) -> None: + """Mark the beginning of a sampling operation.""" + ... + + def on_sampling_end(self) -> None: + """Mark the end of a sampling operation.""" + ... + + def reset(self, all_stats: bool = False) -> None: + """ + Reset statistics. + + Args: + all_stats: If True, reset all stats including model load times. + If False, preserve model load times. + """ + ... + + def to_json_string(self) -> str: + """Convert stats to JSON string representation.""" + ... + + def __repr__(self) -> str: ... + +class Image: + """Container for image data.""" + + @overload + def __init__(self) -> None: + """Initialize an empty Image.""" + ... + + @overload + def __init__(self, data: List[int], width: int, height: int, channels: int) -> None: + """Initialize an Image with uint8 data.""" + ... + + @overload + def __init__( + self, data: List[float], width: int, height: int, channels: int + ) -> None: + """Initialize an Image with float data.""" + ... + + def is_uint8(self) -> bool: + """Check if image data is uint8 format.""" + ... + + def is_float(self) -> bool: + """Check if image data is float format.""" + ... + + @property + def width(self) -> int: + """Image width in pixels.""" + ... + + @property + def height(self) -> int: + """Image height in pixels.""" + ... + + @property + def channels(self) -> int: + """Number of color channels (3 for RGB, 4 for RGBA).""" + ... + + @property + def uint8_data(self) -> List[int]: + """Raw image data as uint8 values.""" + ... + + @property + def float_data(self) -> List[float]: + """Raw image data as float values.""" + ... + + def __repr__(self) -> str: ... + +class Audio: + """Container for preprocessed audio data.""" + + data: List[int] + """Raw audio data as a list of uint8 values.""" + + batch_size: int + """Batch size of the audio data.""" + + n_bins: int + """Number of frequency bins (for spectrograms).""" + + n_frames: int + """Number of time frames.""" + + @overload + def __init__(self) -> None: + """Initialize an empty Audio.""" + ... + + @overload + def __init__( + self, data: List[int], batch_size: int, n_bins: int, n_frames: int + ) -> None: + """Initialize Audio with preprocessed data.""" + ... + + def __repr__(self) -> str: ... 
+ +class RawAudio: + """Container for raw audio data.""" + + data: List[int] + """Raw audio data as a list of uint8 values.""" + + batch_size: int + """Batch size of the audio data.""" + + n_channels: int + """Number of audio channels (1 for mono, 2 for stereo).""" + + n_samples: int + """Number of audio samples.""" + + @overload + def __init__(self) -> None: + """Initialize an empty RawAudio.""" + ... + + @overload + def __init__( + self, data: List[int], batch_size: int, n_channels: int, n_samples: int + ) -> None: + """Initialize RawAudio with raw data.""" + ... + + def __repr__(self) -> str: ... + +class MultimodalInput: + """Container for multimodal input data (text, image, audio, etc.).""" + + @overload + def __init__(self, text: str) -> None: + """ + Create a MultimodalInput with text. + + Args: + text: The input text string + """ + ... + + @overload + def __init__(self, image: Image) -> None: + """ + Create a MultimodalInput with an image. + + Args: + image: The input image + """ + ... + + @overload + def __init__(self, audio: Audio) -> None: + """ + Create a MultimodalInput with preprocessed audio. + + Args: + audio: The input audio data + """ + ... + + @overload + def __init__(self, raw_audio: RawAudio) -> None: + """ + Create a MultimodalInput with raw audio. + + Args: + raw_audio: The input raw audio data + """ + ... + + def is_text(self) -> bool: + """Check if this input contains text.""" + ... + + def is_image(self) -> bool: + """Check if this input contains an image.""" + ... + + def is_audio(self) -> bool: + """Check if this input contains preprocessed audio.""" + ... + + def is_raw_audio(self) -> bool: + """Check if this input contains raw audio.""" + ... + + def get_text(self) -> Optional[str]: + """ + Get the text content if this is a text input. + + Returns: + The text string if this is a text input, None otherwise + """ + ... + + def get_image(self) -> Optional[Image]: + """ + Get the image content if this is an image input. + + Returns: + The Image object if this is an image input, None otherwise + """ + ... + + def get_audio(self) -> Optional[Audio]: + """ + Get the audio content if this is an audio input. + + Returns: + The Audio object if this is an audio input, None otherwise + """ + ... + + def get_raw_audio(self) -> Optional[RawAudio]: + """ + Get the raw audio content if this is a raw audio input. + + Returns: + The RawAudio object if this is a raw audio input, None otherwise + """ + ... + + def __repr__(self) -> str: ... + +class MultimodalRunner: + """Runner for multimodal language models.""" + + def __init__( + self, model_path: str, tokenizer_path: str, data_path: Optional[str] = None + ) -> None: + """ + Initialize a MultimodalRunner. + + Args: + model_path: Path to the model file (.pte) + tokenizer_path: Path to the tokenizer file + data_path: Optional path to additional data file + Raises: + RuntimeError: If initialization fails + """ + ... + + def generate( + self, + inputs: List[MultimodalInput], + config: GenerationConfig, + token_callback: Optional[Callable[[str], None]] = None, + stats_callback: Optional[Callable[[Stats], None]] = None, + ) -> None: + """ + Generate text from multimodal inputs. + + Args: + inputs: List of multimodal inputs (text, images, etc.) + config: Generation configuration + token_callback: Optional callback called for each generated token + stats_callback: Optional callback called with generation statistics + + Raises: + RuntimeError: If generation fails + """ + ... 
+ + def generate_hf( + self, + inputs: dict, + config: GenerationConfig, + token_callback: Optional[Callable[[str], None]] = None, + stats_callback: Optional[Callable[[Stats], None]] = None, + image_token_id: Optional[int] = None, + ) -> None: + """ + Generate text directly from a HuggingFace processor dict. + + Expects at least 'input_ids' (torch.Tensor). If 'pixel_values' is provided, + an 'image_token_id' (or 'image_token_index') must also be present to locate + the image position(s) in input_ids. + + Args: + inputs: HF processor outputs (e.g., from AutoProcessor.apply_chat_template) + config: Generation configuration + token_callback: Optional per-token callback + stats_callback: Optional stats callback + image_token_id: Optional image token ID (or index) + + Raises: + RuntimeError: If required keys are missing, shapes are invalid, or generation fails + """ + ... + + def prefill(self, inputs: List[MultimodalInput]) -> None: + """ + Prefill multimodal inputs (e.g., to rebuild KV cache from chat history) + without generating tokens. + + Args: + inputs: List of multimodal inputs to prefill + + Raises: + RuntimeError: If prefill fails + """ + ... + + def generate_text( + self, inputs: List[MultimodalInput], config: GenerationConfig + ) -> str: + """ + Generate text and return the complete result as a string. + + Args: + inputs: List of multimodal inputs (text, images, etc.) + config: Generation configuration + + Returns: + The generated text as a string + + Raises: + RuntimeError: If generation fails + """ + ... + + def generate_text_hf( + self, inputs: dict, config: GenerationConfig, image_token_id + ) -> str: + """ + Generate text directly from a HuggingFace processor dict and return as string. + + See generate_hf(inputs: dict, ...) for expected keys and constraints. + """ + ... + + def stop(self) -> None: + """Stop the current generation process.""" + ... + + def reset(self) -> None: + """Reset the runner state and KV cache.""" + ... + + def get_vocab_size(self) -> int: + """ + Get the vocabulary size of the model. + + Returns: + The vocabulary size, or -1 if not available + """ + ... + + def __repr__(self) -> str: ... + +def make_text_input(text: str) -> MultimodalInput: + """ + Create a text input for multimodal processing. + + Args: + text: The input text string + + Returns: + A MultimodalInput containing the text + """ + ... + +def make_image_input(image_tensor: torch.Tensor) -> MultimodalInput: + """ + Create an image input from a torch tensor. + + Args: + image_tensor: Torch tensor with shape (H, W, C), (1, H, W, C), (C, H, W), or (1, C, H, W) + + Returns: + A MultimodalInput containing the image + + Raises: + RuntimeError: If the tensor has invalid dimensions or number of channels + """ + ... + +def make_audio_input(audio_tensor: torch.Tensor) -> MultimodalInput: + """ + Create a preprocessed audio input from a torch tensor. + + Args: + audio_tensor: Torch tensor with shape (batch_size, n_bins, n_frames) + + Returns: + A MultimodalInput containing the preprocessed audio + + Raises: + RuntimeError: If the tensor has invalid dimensions or dtype + """ + ... + +def make_raw_audio_input(audio_tensor: torch.Tensor) -> MultimodalInput: + """ + Create a raw audio input from a torch tensor. + + Args: + audio_tensor: Torch tensor with shape (batch_size, n_channels, n_samples) + + Returns: + A MultimodalInput containing the raw audio + + Raises: + RuntimeError: If the tensor has invalid dimensions or dtype + """ + ... 
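Not part of the patch above: the stubs document `prefill()` only in prose, so the following is a minimal illustrative sketch of rebuilding chat history through `prefill()` and then continuing generation. The model/tokenizer paths, image tensor, and chat text are placeholders.

```python
# Illustrative sketch only; paths, image tensor, and chat text are placeholders.
import torch

from executorch.extension.llm.runner import (
    GenerationConfig,
    MultimodalRunner,
    make_image_input,
    make_text_input,
)

runner = MultimodalRunner("model.pte", "tokenizer.bin")

# Replay earlier turns into the KV cache without generating any tokens.
history = [
    make_text_input("USER: Describe this image.\n"),
    make_image_input(torch.randint(0, 255, (3, 224, 224), dtype=torch.uint8)),
    make_text_input("ASSISTANT: A wooden pier over a calm lake.\n"),
]
runner.prefill(history)

# Continue the conversation from the prefilled context.
config = GenerationConfig(max_new_tokens=64, temperature=0.7, echo=False)
reply = runner.generate_text(
    [make_text_input("USER: Is it safe to swim there?\nASSISTANT: ")], config
)
print(reply)
```

This mirrors the `ChatSession` example in the runner README, but rebuilds the KV cache explicitly rather than relying on back-to-back `generate_text` calls.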
diff --git a/extension/llm/runner/multimodal_input.h b/extension/llm/runner/multimodal_input.h index 737821f51e9..01f73e3314c 100644 --- a/extension/llm/runner/multimodal_input.h +++ b/extension/llm/runner/multimodal_input.h @@ -31,7 +31,7 @@ class ET_EXPERIMENTAL MultimodalInput { /// Type of multimodal input data enum class Type { TEXT, ///< Text string input - TOKENS, ///< Pre-tokenized input (vector of token IDs) + TOKENS, ///< Tokenizer encoded input (vector of token IDs) IMAGE, ///< Processed image input AUDIO, ///< Processed audio input RAW_AUDIO, ///< Raw unprocessed audio input (straight from audio file) diff --git a/extension/llm/runner/pybindings.cpp b/extension/llm/runner/pybindings.cpp new file mode 100644 index 00000000000..bcc6aba0f8e --- /dev/null +++ b/extension/llm/runner/pybindings.cpp @@ -0,0 +1,647 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace py = pybind11; +using namespace executorch::extension::llm; +using namespace executorch::extension; +using namespace executorch::runtime; + +// Helper macro for error handling +#define THROW_IF_ERROR(error, message, ...) \ + ({ \ + if ((error) != Error::Ok) { \ + char msg_buf[256]; \ + snprintf(msg_buf, sizeof(msg_buf), message, ##__VA_ARGS__); \ + throw std::runtime_error(msg_buf); \ + } \ + }) + +// Python wrapper class for MultimodalRunner +class PyMultimodalRunner { + public: + // Constructor that takes a tokenizer path + PyMultimodalRunner( + const std::string& model_path, + const std::string& tokenizer_path, + std::optional data_path = std::nullopt) { + // Load tokenizer using the helper function + auto tokenizer = + load_tokenizer(tokenizer_path, nullptr, std::nullopt, 0, 0); + if (!tokenizer) { + throw std::runtime_error( + "Failed to load tokenizer from: " + tokenizer_path); + } + + // Create multimodal runner using the helper function + runner_ = + create_multimodal_runner(model_path, std::move(tokenizer), data_path); + if (!runner_) { + throw std::runtime_error( + "Failed to create multimodal runner with model: " + model_path); + } + } + + void generate( + const std::vector& inputs, + const GenerationConfig& config, + py::object token_callback = py::none(), + py::object stats_callback = py::none()) { + if (!runner_) { + throw std::runtime_error("Runner not initialized"); + } + + // Convert Python callbacks to C++ std::function + std::function cpp_token_callback = nullptr; + if (!token_callback.is_none()) { + cpp_token_callback = [token_callback](const std::string& token) { + py::gil_scoped_acquire acquire; + token_callback(token); + }; + } + + std::function cpp_stats_callback = nullptr; + if (!stats_callback.is_none()) { + cpp_stats_callback = [stats_callback](const Stats& stats) { + py::gil_scoped_acquire acquire; + stats_callback(stats); + }; + } + + // Release GIL during generation + { + py::gil_scoped_release release; + Error error = runner_->generate( + inputs, config, cpp_token_callback, cpp_stats_callback); + THROW_IF_ERROR(error, "Generation failed"); + } + } + + std::string generate_text( + const std::vector& inputs, + const GenerationConfig& config) { + if (!runner_) { + throw std::runtime_error("Runner not initialized"); + } + + 
std::string generated_text; + auto cpp_token_callback = [&generated_text](const std::string& token) { + generated_text += token; + }; + Error error = + runner_->generate(inputs, config, cpp_token_callback, nullptr); + THROW_IF_ERROR(error, "Generation failed"); + + return generated_text; + } + + void stop() { + if (runner_) { + runner_->stop(); + } + } + + void reset() { + if (runner_) { + runner_->reset(); + } + } + + void prefill(std::vector inputs) { + if (!runner_) { + throw std::runtime_error("Runner not initialized"); + } + { + py::gil_scoped_release release; + Error error = runner_->prefill(inputs); + THROW_IF_ERROR(error, "Prefill failed"); + } + } + + // Note: Since the runner owns the tokenizer and metadata after creation, + // we cannot directly access them. This is a limitation of the current design. + // For now, we'll return a placeholder value. + int32_t get_vocab_size() const { + // TODO: Consider exposing metadata through the MultimodalRunner interface + return -1; // Indicate that vocab size is not available + } + + private: + std::unique_ptr runner_; +}; + +PYBIND11_MODULE(_llm_runner, m) { + m.doc() = "Python bindings for ExecuTorch LLM Runners"; + + // Initialize ExecuTorch runtime + runtime_init(); + + // Bind GenerationConfig + py::class_(m, "GenerationConfig") + // Constructor with keyword arguments for all fields (all optional via + // defaults) + .def( + py::init([](bool echo, + int32_t max_new_tokens, + bool warming, + int32_t seq_len, + float temperature, + int32_t num_bos, + int32_t num_eos) { + GenerationConfig cfg; + cfg.echo = echo; + cfg.max_new_tokens = max_new_tokens; + cfg.warming = warming; + cfg.seq_len = seq_len; + cfg.temperature = temperature; + cfg.num_bos = num_bos; + cfg.num_eos = num_eos; + return cfg; + }), + py::arg("echo") = true, + py::arg("max_new_tokens") = -1, + py::arg("warming") = false, + py::arg("seq_len") = -1, + py::arg("temperature") = 0.8f, + py::arg("num_bos") = 0, + py::arg("num_eos") = 0) + .def_readwrite("echo", &GenerationConfig::echo) + .def_readwrite("max_new_tokens", &GenerationConfig::max_new_tokens) + .def_readwrite("warming", &GenerationConfig::warming) + .def_readwrite("seq_len", &GenerationConfig::seq_len) + .def_readwrite("temperature", &GenerationConfig::temperature) + .def_readwrite("num_bos", &GenerationConfig::num_bos) + .def_readwrite("num_eos", &GenerationConfig::num_eos) + .def( + "resolve_max_new_tokens", + &GenerationConfig::resolve_max_new_tokens, + py::arg("max_context_len"), + py::arg("num_prompt_tokens"), + "Resolve the maximum number of new tokens to generate based on constraints") + .def("__repr__", [](const GenerationConfig& config) { + return ""; + }); + + // Bind Stats + py::class_(m, "Stats") + .def_readonly( + "SCALING_FACTOR_UNITS_PER_SECOND", + &Stats::SCALING_FACTOR_UNITS_PER_SECOND) + .def_readonly("model_load_start_ms", &Stats::model_load_start_ms) + .def_readonly("model_load_end_ms", &Stats::model_load_end_ms) + .def_readonly("inference_start_ms", &Stats::inference_start_ms) + .def_readonly("token_encode_end_ms", &Stats::token_encode_end_ms) + .def_readonly( + "model_execution_start_ms", &Stats::model_execution_start_ms) + .def_readonly("model_execution_end_ms", &Stats::model_execution_end_ms) + .def_readonly("prompt_eval_end_ms", &Stats::prompt_eval_end_ms) + .def_readonly("first_token_ms", &Stats::first_token_ms) + .def_readonly("inference_end_ms", &Stats::inference_end_ms) + .def_readonly( + "aggregate_sampling_time_ms", &Stats::aggregate_sampling_time_ms) + 
.def_readonly("num_prompt_tokens", &Stats::num_prompt_tokens) + .def_readonly("num_generated_tokens", &Stats::num_generated_tokens) + .def("on_sampling_begin", &Stats::on_sampling_begin) + .def("on_sampling_end", &Stats::on_sampling_end) + .def( + "reset", + &Stats::reset, + py::arg("all_stats") = false, + "Reset stats, optionally including model load times") + .def( + "to_json_string", + [](const Stats& stats) { return stats_to_json_string(stats); }, + "Convert stats to JSON string representation") + .def("__repr__", [](const Stats& stats) { + double tokens_per_second = 0.0; + if (stats.inference_end_ms > stats.inference_start_ms) { + tokens_per_second = static_cast(stats.num_generated_tokens) * + stats.SCALING_FACTOR_UNITS_PER_SECOND / + (stats.inference_end_ms - stats.inference_start_ms); + } + return ""; + }); + + // Bind Image class + py::class_(m, "Image") + .def( + py::init&&, int32_t, int32_t, int32_t>(), + py::arg("data"), + py::arg("width"), + py::arg("height"), + py::arg("channels")) + .def( + py::init&&, int32_t, int32_t, int32_t>(), + py::arg("data"), + py::arg("width"), + py::arg("height"), + py::arg("channels")) + .def("is_uint8", &Image::is_uint8) + .def("is_float", &Image::is_float) + .def_property_readonly("width", &Image::width) + .def_property_readonly("height", &Image::height) + .def_property_readonly("channels", &Image::channels) + .def_property_readonly( + "uint8_data", + static_cast& (Image::*)() const&>( + &Image::get_uint8_data)) + .def_property_readonly( + "float_data", + static_cast& (Image::*)() const&>( + &Image::get_float_data)) + .def("__repr__", [](const Image& img) { + std::string dtype = "unknown"; + if (img.is_uint8()) { + dtype = "uint8"; + } else if (img.is_float()) { + dtype = "float32"; + } + return ""; + }); + + // Bind Audio class + py::class_