diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index f59a29420d7..448987e8d6b 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -165,7 +165,7 @@ jobs:
         echo "::endgroup::"
 
   export-gemma3-cuda-artifact:
-    name: export-gemma3-cuda-artifact
+    name: export-gemma3-cuda-${{ matrix.quant.name }}
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
@@ -173,6 +173,19 @@
     secrets: inherit
     strategy:
       fail-fast: false
+      matrix:
+        quant:
+          - name: "non-quantized"
+            artifact: "gemma3-cuda-export"
+            extra_args: ""
+          # TODO: enable gemma3 quantization
+          # - name: "quantized-int4-tile-packed"
+          #   artifact: "gemma3-cuda-quantized-int4-tile-packed"
+          #   extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "gemma3-cuda-quantized-int4-weight-only"
+          #   # TODO: adding "--qlinear 4w" produces invalid results. Needs further investigation.
+          #   extra_args: "--qlinear_encoder 4w"
     with:
       timeout: 90
       secrets-env: EXECUTORCH_HF_TOKEN
@@ -198,7 +211,8 @@
         pip list
         echo "::endgroup::"
 
-        echo "::group::Export Gemma3"
+        echo "::group::Export Gemma3 (${{ matrix.quant.name }})"
+        EXTRA_ARGS="${{ matrix.quant.extra_args }}"
         optimum-cli export executorch \
           --model "google/gemma-3-4b-it" \
           --task "multimodal-text-to-text" \
@@ -212,7 +226,7 @@
         test -f aoti_cuda_blob.ptd
         echo "::endgroup::"
 
-        echo "::group::Store Gemma3 Artifacts"
+        echo "::group::Store Gemma3 Artifacts (${{ matrix.quant.name }})"
         mkdir -p "${RUNNER_ARTIFACT_DIR}/"
         cp model.pte "${RUNNER_ARTIFACT_DIR}/"
         cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
@@ -407,3 +421,87 @@
           exit $EXIT_CODE
         fi
         echo "::endgroup::"
+
+  test-gemma3-cuda-e2e:
+    name: test-gemma3-cuda-e2e-${{ matrix.format.name }}
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        format:
+          - name: "non-quantized"
+            artifact: "gemma3-cuda-export"
+          # TODO: enable quantized gemma3.
+          # - name: "quantized-int4-tile-packed"
+          #   artifact: "gemma3-cuda-quantized-int4-tile-packed"
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "gemma3-cuda-quantized-int4-weight-only"
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: ${{ matrix.format.artifact }}
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Gemma3 Artifacts (${{ matrix.format.name }})"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        TOKENIZER_URL="https://huggingface.co/unsloth/gemma-3-1b-it/resolve/main/tokenizer.json"
+        curl -L $TOKENIZER_URL -o tokenizer.json
+        ls -al model.pte aoti_cuda_blob.ptd tokenizer.json
+        IMAGE_PATH="docs/source/_static/img/et-logo.png"
+        echo "::endgroup::"
+
+        echo "::group::Build Gemma3 Runner"
+        cmake --preset llm \
+          -DEXECUTORCH_BUILD_CUDA=ON \
+          -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Bcmake-out -S.
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release
+
+        cmake -DEXECUTORCH_BUILD_CUDA=ON \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Sexamples/models/gemma3 \
+          -Bcmake-out/examples/models/gemma3/
+        cmake --build cmake-out/examples/models/gemma3 --target gemma3_e2e_runner --config Release
+        echo "::endgroup::"
+
+        echo "::group::Run Gemma3 Runner (${{ matrix.format.name }})"
+        set +e
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        OUTPUT=$(cmake-out/examples/models/gemma3/gemma3_e2e_runner \
+          --model_path model.pte \
+          --data_path aoti_cuda_blob.ptd \
+          --tokenizer_path tokenizer.json \
+          --image_path $IMAGE_PATH \
+          --temperature 0 2>&1)
+        EXIT_CODE=$?
+        set -e
+
+        echo "$OUTPUT"
+
+        if ! echo "$OUTPUT" | grep -iq "chip"; then
+          echo "Expected output 'chip' not found in output"
+          exit 1
+        fi
+
+        if [ $EXIT_CODE -ne 0 ]; then
+          echo "Unexpected exit code: $EXIT_CODE"
+          exit $EXIT_CODE
+        fi
+        echo "::endgroup::"
diff --git a/examples/models/gemma3/CMakeLists.txt b/examples/models/gemma3/CMakeLists.txt
new file mode 100644
index 00000000000..0be346d70f2
--- /dev/null
+++ b/examples/models/gemma3/CMakeLists.txt
@@ -0,0 +1,124 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+#
+# Simple CMake build system for the gemma3 e2e runner.
+#
+cmake_minimum_required(VERSION 3.24)
+project(gemma3)
+
+set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
+
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+
+if(CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$")
+  set(CMAKE_TOOLCHAIN_IOS ON)
+else()
+  set(CMAKE_TOOLCHAIN_IOS OFF)
+endif()
+
+if(NOT CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 17)
+endif()
+
+# Let files say "include <executorch/path/to/header.h>".
+set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+
+# Need this for gflags for some reason
+set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
+find_package(gflags REQUIRED)
+
+# Find `executorch` libraries, same as for gflags
+list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
+find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
+executorch_target_link_options_shared_lib(executorch)
+
+set(link_libraries executorch gflags)
+set(_srcs e2e_runner.cpp)
+
+list(
+  APPEND
+  link_libraries
+  optimized_native_cpu_ops_lib
+  quantized_ops_lib
+  custom_ops
+  cpublas
+  eigen_blas
+)
+executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
+executorch_target_link_options_shared_lib(quantized_ops_lib)
+executorch_target_link_options_shared_lib(custom_ops)
+
+# XNNPACK
+if(TARGET xnnpack_backend)
+  set(xnnpack_backend_libs xnnpack_backend XNNPACK xnnpack-microkernels-prod)
+  if(TARGET kleidiai)
+    list(APPEND xnnpack_backend_libs kleidiai)
+  endif()
+  list(APPEND link_libraries ${xnnpack_backend_libs})
+  executorch_target_link_options_shared_lib(xnnpack_backend)
+endif()
+
+# Add LLM runner and extension module
+if(NOT TARGET extension_llm_runner)
+  message(
+    FATAL_ERROR
+      "ExecuTorch must be installed with EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER enabled."
+ ) +endif() + +# Needed for cpuinfo where it uses android specific log lib +if(ANDROID) + list(APPEND link_libraries log) +endif() + +# stb_image: a lightweight library to load images +include(FetchContent) +FetchContent_Declare( + stb + GIT_REPOSITORY https://github.com/nothings/stb.git + GIT_TAG f0569113c93ad095470c54bf34a17b36646bbbb5 +) +FetchContent_MakeAvailable(stb) +# Add deprecated/ to use stb_image_resize.h for internal compatibility +list(APPEND _common_include_directories ${stb_SOURCE_DIR} + ${stb_SOURCE_DIR}/deprecated +) + +# Add the required ExecuTorch extensions for multimodal LLM runner +list( + APPEND + link_libraries + extension_llm_runner + extension_module + extension_data_loader + extension_tensor + extension_flat_tensor +) + +# Link CUDA backend +if(EXECUTORCH_BUILD_CUDA) + find_package(CUDAToolkit REQUIRED) + list(APPEND link_libraries aoti_cuda) + executorch_target_link_options_shared_lib(aoti_cuda) +endif() + +# Add tokenizers +list(APPEND link_libraries tokenizers::tokenizers) + +add_executable(gemma3_e2e_runner ${_srcs}) +if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") + target_link_options_gc_sections(gemma3_e2e_runner) + if(NOT APPLE) + target_link_options(gemma3_e2e_runner PRIVATE "LINKER:-s") + endif() +endif() + +target_include_directories( + gemma3_e2e_runner PUBLIC ${_common_include_directories} +) +target_link_libraries(gemma3_e2e_runner PUBLIC ${link_libraries}) +target_compile_options(gemma3_e2e_runner PUBLIC ${_common_compile_options}) diff --git a/examples/models/gemma3/e2e_runner.cpp b/examples/models/gemma3/e2e_runner.cpp new file mode 100644 index 00000000000..68f19e8296d --- /dev/null +++ b/examples/models/gemma3/e2e_runner.cpp @@ -0,0 +1,242 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define STB_IMAGE_IMPLEMENTATION +#include +#define STB_IMAGE_RESIZE_IMPLEMENTATION +#include + +#if defined(ET_USE_THREADPOOL) +#include +#include +#endif + +DEFINE_string( + model_path, + "multimodal.pte", + "Model serialized in flatbuffer format."); + +DEFINE_string(data_path, "", "Path to data file."); +DEFINE_string(tokenizer_path, "tokenizer.json", "Tokenizer stuff."); + +DEFINE_string(prompt, "What is in this image?", "Text prompt."); + +DEFINE_string(image_path, "", "Path to input image file."); + +DEFINE_double( + temperature, + 0.0f, + "Temperature; Default is 0. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic"); + +DEFINE_int32( + cpu_threads, + -1, + "Number of CPU threads for inference. 
+
+DEFINE_bool(warmup, false, "Whether to run a warmup run.");
+
+namespace {
+
+using ::executorch::extension::from_blob;
+using ::executorch::extension::Module;
+using ::executorch::extension::llm::Image;
+using ::executorch::extension::llm::make_image_input;
+using ::executorch::extension::llm::make_text_input;
+using ::executorch::extension::llm::MultimodalInput;
+using ::executorch::runtime::EValue;
+
+bool ends_with(const std::string& str, const std::string& suffix) {
+  return str.size() >= suffix.size() &&
+      str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
+}
+
+/**
+ * @brief Loads an image from a file and resizes it to 896x896.
+ *
+ * This function loads an image using stb_image and resizes it to the expected
+ * input size for Gemma3 (896x896). The image is converted to CHW (Channel,
+ * Height, Width) format, which is expected by the model.
+ *
+ * @param image_path Path to the image file (.jpg, .png, etc.)
+ * @return MultimodalInput containing the loaded and processed image data
+ * @throws std::runtime_error if image loading fails
+ */
+MultimodalInput loadImage(const std::string& image_path) {
+  if (!ends_with(image_path, ".jpg") && !ends_with(image_path, ".jpeg") &&
+      !ends_with(image_path, ".png") && !ends_with(image_path, ".bmp")) {
+    ET_LOG(
+        Error,
+        "Unsupported image file format: %s (only .jpg, .jpeg, .png, .bmp are supported)",
+        image_path.c_str());
+    throw std::runtime_error("Unsupported image file format");
+  }
+
+  int width, height, channels;
+  unsigned char* data =
+      stbi_load(image_path.c_str(), &width, &height, &channels, 0);
+  if (!data) {
+    ET_LOG(Error, "Failed to load image: %s", image_path.c_str());
+    throw std::runtime_error("Failed to load image");
+  }
+
+  ET_LOG(
+      Info,
+      "Loaded image: %s, original size: %dx%d, channels: %d",
+      image_path.c_str(),
+      width,
+      height,
+      channels);
+
+  // Resize to 896x896 (Gemma3 vision encoder input size)
+  const int target_size = 896;
+  std::vector<uint8_t> resized_data(target_size * target_size * channels);
+
+  int resize_result = stbir_resize_uint8(
+      data,
+      width,
+      height,
+      0,
+      resized_data.data(),
+      target_size,
+      target_size,
+      0,
+      channels);
+
+  if (!resize_result) {
+    stbi_image_free(data);
+    ET_LOG(Error, "Failed to resize image");
+    throw std::runtime_error("Failed to resize image");
+  }
+
+  // Convert from HWC (Height, Width, Channel) to CHW (Channel, Height, Width)
+  // and normalize uint8 [0, 255] to float32 [0.0, 1.0]
+  std::vector<float> chw_data(channels * target_size * target_size);
+  for (int h = 0; h < target_size; ++h) {
+    for (int w = 0; w < target_size; ++w) {
+      for (int c = 0; c < channels; ++c) {
+        uint8_t pixel_value =
+            resized_data[h * target_size * channels + w * channels + c];
+        chw_data[c * target_size * target_size + h * target_size + w] =
+            static_cast<float>(pixel_value) / 255.0f;
+      }
+    }
+  }
+
+  ET_LOG(
+      Info,
+      "Resized and converted image to CHW format (float32): %dx%d, channels: %d",
+      target_size,
+      target_size,
+      channels);
+
+  Image image(std::move(chw_data), target_size, target_size, channels);
+  stbi_image_free(data);
+
+  return make_image_input(std::move(image));
+}
+
+} // namespace
+
+int32_t main(int32_t argc, char** argv) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+  const char* model_path = FLAGS_model_path.c_str();
+  const char* tokenizer_path = FLAGS_tokenizer_path.c_str();
+  const char* prompt = FLAGS_prompt.c_str();
+  const char* image_path = FLAGS_image_path.c_str();
+  const char* data_path = FLAGS_data_path.c_str();
+  float temperature = FLAGS_temperature;
+  int32_t cpu_threads = FLAGS_cpu_threads;
+  bool warmup = FLAGS_warmup;
+
+#if defined(ET_USE_THREADPOOL)
+  uint32_t num_performant_cores = cpu_threads == -1
+      ? ::executorch::extension::cpuinfo::get_num_performant_cores()
+      : static_cast<uint32_t>(cpu_threads);
+  ET_LOG(
+      Info, "Resetting threadpool with num threads = %d", num_performant_cores);
+  if (num_performant_cores > 0) {
+    ::executorch::extension::threadpool::get_threadpool()
+        ->_unsafe_reset_threadpool(num_performant_cores);
+  }
+#endif
+
+  std::unique_ptr<::tokenizers::Tokenizer> tokenizer =
+      ::executorch::extension::llm::load_tokenizer(tokenizer_path);
+  if (tokenizer == nullptr) {
+    ET_LOG(Error, "Failed to load tokenizer from: %s", tokenizer_path);
+    return 1;
+  }
+
+  // Create multimodal runner
+  std::unique_ptr<::executorch::extension::llm::MultimodalRunner> runner =
+      ::executorch::extension::llm::create_multimodal_runner(
+          model_path, std::move(tokenizer), data_path);
+
+  if (runner == nullptr) {
+    ET_LOG(Error, "Failed to create multimodal runner");
+    return 1;
+  }
+
+  // Load runner
+  auto load_error = runner->load();
+  if (load_error != ::executorch::runtime::Error::Ok) {
+    ET_LOG(Error, "Failed to load multimodal runner");
+    return 1;
+  }
+
+  // Prepare inputs
+  std::vector<MultimodalInput> inputs = {
+      make_text_input("<start_of_turn>user\n"),
+      loadImage(image_path),
+      make_text_input(
+          std::string(prompt) + "<end_of_turn>\n<start_of_turn>model\n"),
+  };
+
+  ::executorch::extension::llm::GenerationConfig config;
+  config.max_new_tokens = 100;
+  config.temperature = temperature;
+
+  // Run warmup if requested
+  if (warmup) {
+    ET_LOG(Info, "Running warmup...");
+    auto warmup_error = runner->generate(inputs, config);
+    if (warmup_error != ::executorch::runtime::Error::Ok) {
+      ET_LOG(Error, "Failed to run warmup");
+      return 1;
+    }
+    runner->reset();
+  }
+
+  auto error = runner->generate(inputs, config);
+
+  if (error != ::executorch::runtime::Error::Ok) {
+    ET_LOG(Error, "Failed to generate with multimodal runner");
+    return 1;
+  }
+  ET_LOG(Info, "Generated successfully");
+
+  return 0;
+}
diff --git a/extension/llm/runner/multimodal_prefiller.cpp b/extension/llm/runner/multimodal_prefiller.cpp
index 8578187128f..ddf9c86f180 100644
--- a/extension/llm/runner/multimodal_prefiller.cpp
+++ b/extension/llm/runner/multimodal_prefiller.cpp
@@ -69,6 +69,11 @@ Result MultimodalPrefiller::prefill(
         image.is_uint8(),
         InvalidArgument,
         "Model expects uint8_t image data, but image has float data.");
+  } else if (expected_dtype == ::executorch::aten::ScalarType::BFloat16) {
+    ET_CHECK_OR_RETURN_ERROR(
+        image.is_float(),
+        InvalidArgument,
+        "Model expects BFloat16 image data; the image must be provided as float32 and is converted to bfloat16 afterwards, but it has uint8_t data.");
   } else {
     ET_CHECK_OR_RETURN_ERROR(
         false,
@@ -85,6 +90,16 @@ Result MultimodalPrefiller::prefill(
   ET_CHECK_OK_OR_RETURN_ERROR(
       image_tensor_result.error(), "Failed to convert image to tensor");
   auto image_tensor = image_tensor_result.get();
+
+  if (expected_dtype == ::executorch::aten::ScalarType::BFloat16) {
+    // Convert to bfloat16 for model input
+    auto image_tensor_return = convert_to_bfloat16(image_tensor);
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        image_tensor_return.error(),
+        "Failed to convert image tensor to bfloat16");
+    image_tensor = image_tensor_return.get();
+  }
+
   ET_LOG(
       Info,
       "Image tensor dim: %zu, dtype: %s",
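For context on the BFloat16 branch above: the prefiller receives the float32 CHW image tensor produced by the e2e runner and narrows it to bfloat16 before prefill via `convert_to_bfloat16()`. That helper operates on ExecuTorch tensors; the sketch below is illustrative only and assumes round-to-nearest-even narrowing on a raw pixel buffer (names like `chw_float_to_bf16` are hypothetical, not part of this change).

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Illustrative only: narrow a float32 value to bfloat16 (the upper 16 bits of
// the IEEE-754 float), rounding to nearest, ties to even. NaN handling is
// omitted since normalized pixel values are finite and in [0, 1].
static uint16_t float_to_bf16(float value) {
  uint32_t bits;
  std::memcpy(&bits, &value, sizeof(bits));
  // Bias the low 16 bits so that truncation rounds to nearest-even.
  const uint32_t rounding_bias = 0x7FFF + ((bits >> 16) & 1);
  return static_cast<uint16_t>((bits + rounding_bias) >> 16);
}

// Hypothetical helper: convert a float32 CHW pixel buffer (as built by
// loadImage) into bfloat16 storage, mirroring what the prefiller does on
// tensors before feeding a BFloat16 vision encoder.
std::vector<uint16_t> chw_float_to_bf16(const std::vector<float>& chw_pixels) {
  std::vector<uint16_t> out(chw_pixels.size());
  for (size_t i = 0; i < chw_pixels.size(); ++i) {
    out[i] = float_to_bf16(chw_pixels[i]);
  }
  return out;
}
```

Keeping the runner-side preprocessing in float32 and converting only at the prefill boundary means the same `loadImage()` path can serve both float32 and BFloat16 vision encoders, which is the design the new error message in `multimodal_prefiller.cpp` describes.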