diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h
index b2e36e0a1f2..1814ac2c717 100644
--- a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h
+++ b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h
@@ -6,6 +6,8 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#import <ExecuTorch/ExecuTorch.h>
+
 #import "ExecuTorchLLMConfig.h"
 
 NS_ASSUME_NONNULL_BEGIN
@@ -29,6 +31,16 @@ __attribute__((deprecated("This API is experimental.")))
 __attribute__((objc_subclassing_restricted))
 @interface ExecuTorchLLMImage : NSObject<NSCopying>
 
+/**
+ Initializes an image container from a tensor.
+
+ @param tensor   A tensor with shape {C, H, W} and dtype Byte or Float.
+ @return An initialized ExecuTorchLLMImage instance.
+*/
+- (instancetype)initWithTensor:(ExecuTorchTensor *)tensor
+    NS_DESIGNATED_INITIALIZER
+    NS_SWIFT_NAME(init(_:));
+
 /**
  Initializes an image container with the provided data and dimensions.
 
@@ -41,16 +53,21 @@ __attribute__((objc_subclassing_restricted))
 - (instancetype)initWithData:(NSData *)data
                        width:(NSInteger)width
                       height:(NSInteger)height
-                    channels:(NSInteger)channels
-    NS_DESIGNATED_INITIALIZER;
+                    channels:(NSInteger)channels;
+
+/**
+ Initializes an image container with the provided float data and dimensions.
 
+ @param data       Float image buffer.
+ @param width      Image width in pixels.
+ @param height     Image height in pixels.
+ @param channels   Number of channels.
+ @return An initialized ExecuTorchLLMImage instance.
+*/
 - (instancetype)initWithFloatData:(NSData *)data
                             width:(NSInteger)width
                            height:(NSInteger)height
-                         channels:(NSInteger)channels
-    NS_DESIGNATED_INITIALIZER;
-
-@property(nonatomic, readonly) NSData *data;
+                         channels:(NSInteger)channels;
 
 @property(nonatomic, readonly) NSInteger width;
 
@@ -60,6 +77,8 @@ __attribute__((objc_subclassing_restricted))
 
 @property(nonatomic, readonly) BOOL isFloat;
 
+@property(nonatomic, readonly) ExecuTorchTensor *tensor;
+
 + (instancetype)new NS_UNAVAILABLE;
 - (instancetype)init NS_UNAVAILABLE;
 
@@ -73,6 +92,16 @@ __attribute__((deprecated("This API is experimental.")))
 __attribute__((objc_subclassing_restricted))
 @interface ExecuTorchLLMAudio : NSObject<NSCopying>
 
+/**
+ Initializes an audio features container from a tensor.
+
+ @param tensor   A tensor with shape {batchSize, bins, frames} and dtype Byte or Float.
+ @return An initialized ExecuTorchLLMAudio instance.
+*/
+- (instancetype)initWithTensor:(ExecuTorchTensor *)tensor
+    NS_DESIGNATED_INITIALIZER
+    NS_SWIFT_NAME(init(_:));
+
 /**
  Initializes an audio features container with the provided data and shape.
 
@@ -85,16 +114,21 @@ __attribute__((objc_subclassing_restricted))
 - (instancetype)initWithData:(NSData *)data
                    batchSize:(NSInteger)batchSize
                         bins:(NSInteger)bins
-                      frames:(NSInteger)frames
-    NS_DESIGNATED_INITIALIZER;
+                      frames:(NSInteger)frames;
+
+/**
+ Initializes an audio features container with the provided float data and shape.
 
+ @param data        Float feature buffer.
+ @param batchSize   Batch dimension size.
+ @param bins        Number of frequency bins.
+ @param frames      Number of time frames.
+ @return An initialized ExecuTorchLLMAudio instance.
+*/
 - (instancetype)initWithFloatData:(NSData *)data
                         batchSize:(NSInteger)batchSize
                              bins:(NSInteger)bins
-                           frames:(NSInteger)frames
-    NS_DESIGNATED_INITIALIZER;
-
-@property(nonatomic, readonly) NSData *data;
+                           frames:(NSInteger)frames;
 
 @property(nonatomic, readonly) NSInteger batchSize;
 
@@ -104,6 +138,8 @@ __attribute__((objc_subclassing_restricted))
 
 @property(nonatomic, readonly) BOOL isFloat;
 
+@property(nonatomic, readonly) ExecuTorchTensor *tensor;
+
 + (instancetype)new NS_UNAVAILABLE;
 - (instancetype)init NS_UNAVAILABLE;
 
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm
index 964805053e2..c7ee1956033 100644
--- a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm
+++ b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm
@@ -21,35 +21,60 @@ @interface ExecuTorchLLMConfig ()
 
 @end
 
-@implementation ExecuTorchLLMImage
+@implementation ExecuTorchLLMImage {
+  ExecuTorchTensor *_tensor;
+}
+
+- (instancetype)initWithTensor:(ExecuTorchTensor *)tensor {
+  ET_CHECK(tensor);
+  if (self = [super init]) {
+    ET_CHECK_MSG(tensor.shape.count == 3, "Image tensor must be rank-3 {C,H,W}");
+    ExecuTorchDataType dataType = tensor.dataType;
+    ET_CHECK_MSG(dataType == ExecuTorchDataTypeByte || dataType == ExecuTorchDataTypeFloat,
+                 "Image tensor must be Byte or Float");
+    _tensor = tensor;
+  }
+  return self;
+}
 
 - (instancetype)initWithData:(NSData *)data
                        width:(NSInteger)width
                       height:(NSInteger)height
                     channels:(NSInteger)channels {
-  if (self = [super init]) {
-    _data = [data copy];
-    _width = width;
-    _height = height;
-    _channels = channels;
-    _isFloat = NO;
-  }
-  return self;
+  return [self initWithTensor:[[ExecuTorchTensor alloc]
+                                 initWithData:data
+                                        shape:@[@(channels), @(height), @(width)]
+                                      dataType:ExecuTorchDataTypeByte]];
 }
 
 - (instancetype)initWithFloatData:(NSData *)data
                             width:(NSInteger)width
                            height:(NSInteger)height
                          channels:(NSInteger)channels {
-  self = [super init];
-  if (self) {
-    _data = [data copy];
-    _width = width;
-    _height = height;
-    _channels = channels;
-    _isFloat = YES;
-  }
-  return self;
+  return [self initWithTensor:[[ExecuTorchTensor alloc]
+                                 initWithData:data
+                                        shape:@[@(channels), @(height), @(width)]
+                                      dataType:ExecuTorchDataTypeFloat]];
+}
+
+- (NSInteger)width {
+  return _tensor.shape[2].integerValue;
+}
+
+- (NSInteger)height {
+  return _tensor.shape[1].integerValue;
+}
+
+- (NSInteger)channels {
+  return _tensor.shape[0].integerValue;
+}
+
+- (BOOL)isFloat {
+  return _tensor.dataType == ExecuTorchDataTypeFloat;
+}
+
+- (ExecuTorchTensor *)tensor {
+  return _tensor;
 }
 
 - (id)copyWithZone:(NSZone *)zone {
@@ -58,35 +83,60 @@ - (id)copyWithZone:(NSZone *)zone {
 
 @end
 
-@implementation ExecuTorchLLMAudio
+@implementation ExecuTorchLLMAudio {
+  ExecuTorchTensor *_tensor;
+}
+
+- (instancetype)initWithTensor:(ExecuTorchTensor *)tensor {
+  ET_CHECK(tensor);
+  if (self = [super init]) {
+    ET_CHECK_MSG(tensor.shape.count == 3, "Audio tensor must be rank-3 {B,bins,frames}");
+    ExecuTorchDataType dataType = tensor.dataType;
+    ET_CHECK_MSG(dataType == ExecuTorchDataTypeByte || dataType == ExecuTorchDataTypeFloat,
+                 "Audio tensor must be Byte or Float");
+    _tensor = tensor;
+  }
+  return self;
+}
 
 - (instancetype)initWithData:(NSData *)data
                    batchSize:(NSInteger)batchSize
                         bins:(NSInteger)bins
                       frames:(NSInteger)frames {
-  if (self = [super init]) {
-    _data = [data copy];
-    _batchSize = batchSize;
-    _bins = bins;
-    _frames = frames;
-    _isFloat = NO;
-  }
-  return self;
+  return [self initWithTensor:
+      [[ExecuTorchTensor alloc] initWithData:data
+                                       shape:@[@(batchSize), @(bins), @(frames)]
+                                    dataType:ExecuTorchDataTypeByte]];
 }
 
 - (instancetype)initWithFloatData:(NSData *)data
                         batchSize:(NSInteger)batchSize
                              bins:(NSInteger)bins
                            frames:(NSInteger)frames {
-  self = [super init];
-  if (self) {
-    _data = [data copy];
-    _batchSize = batchSize;
-    _bins = bins;
-    _frames = frames;
-    _isFloat = YES;
-  }
-  return self;
+  return [self initWithTensor:
+      [[ExecuTorchTensor alloc] initWithData:data
+                                       shape:@[@(batchSize), @(bins), @(frames)]
+                                    dataType:ExecuTorchDataTypeFloat]];
+}
+
+- (NSInteger)batchSize {
+  return _tensor.shape[0].integerValue;
+}
+
+- (NSInteger)bins {
+  return _tensor.shape[1].integerValue;
+}
+
+- (NSInteger)frames {
+  return _tensor.shape[2].integerValue;
+}
+
+- (BOOL)isFloat {
+  return _tensor.dataType == ExecuTorchDataTypeFloat;
+}
+
+- (ExecuTorchTensor *)tensor {
+  return _tensor;
 }
 
 - (id)copyWithZone:(NSZone *)zone {
@@ -208,54 +258,16 @@ - (BOOL)generateWithInputs:(NSArray<ExecuTorchLLMMultimodalInput *> *)inputs
       case ExecuTorchLLMMultimodalInputTypeText:
         nativeInputs.emplace_back(llm::MultimodalInput(input.text.UTF8String));
         break;
-      case ExecuTorchLLMMultimodalInputTypeImage: {
-        ExecuTorchLLMImage *image = input.image;
-        if (image.isFloat) {
-          const float *buffer = (const float *)image.data.bytes;
-          size_t elementCount = (size_t)image.data.length / sizeof(float);
-          std::vector<float> data(buffer, buffer + elementCount);
-          nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
-            std::move(data),
-            (int32_t)image.width,
-            (int32_t)image.height,
-            (int32_t)image.channels
-          )));
-        } else {
-          const uint8_t *buffer = (const uint8_t *)image.data.bytes;
-          std::vector<uint8_t> data(buffer, buffer + image.data.length);
-          nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
-            std::move(data),
-            (int32_t)image.width,
-            (int32_t)image.height,
-            (int32_t)image.channels
-          )));
-        }
+      case ExecuTorchLLMMultimodalInputTypeImage:
+        nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
+          make_tensor_ptr(*reinterpret_cast<TensorPtr *>(input.image.tensor.nativeInstance))
+        )));
         break;
-      }
-      case ExecuTorchLLMMultimodalInputTypeAudio: {
-        ExecuTorchLLMAudio *audio = input.audio;
-        if (audio.isFloat) {
-          const float *buffer = (const float *)audio.data.bytes;
-          size_t elementCount = (size_t)audio.data.length / sizeof(float);
-          std::vector<float> data(buffer, buffer + elementCount);
-          nativeInputs.emplace_back(llm::MultimodalInput(llm::Audio(
-            std::move(data),
-            (int32_t)audio.batchSize,
-            (int32_t)audio.bins,
-            (int32_t)audio.frames
-          )));
-        } else {
-          const uint8_t *buffer = (const uint8_t *)audio.data.bytes;
-          std::vector<uint8_t> data(buffer, buffer + audio.data.length);
-          nativeInputs.emplace_back(llm::MultimodalInput(llm::Audio(
-            std::move(data),
-            (int32_t)audio.batchSize,
-            (int32_t)audio.bins,
-            (int32_t)audio.frames
-          )));
-        }
+      case ExecuTorchLLMMultimodalInputTypeAudio:
+        nativeInputs.emplace_back(llm::MultimodalInput(llm::Audio(
+          make_tensor_ptr(*reinterpret_cast<TensorPtr *>(input.audio.tensor.nativeInstance))
+        )));
         break;
-      }
       default: {
         if (error) {
           *error = [NSError errorWithDomain:ExecuTorchLLMErrorDomain
diff --git a/extension/llm/runner/audio.h b/extension/llm/runner/audio.h
index ce71513ed17..cc7e6b1714a 100644
--- a/extension/llm/runner/audio.h
+++ b/extension/llm/runner/audio.h
@@ -11,7 +11,6 @@
 #pragma once
 #include <executorch/runtime/platform/compiler.h>
 #include <cstdint>
-#include <variant>
 #include <vector>
 
 #include <executorch/extension/tensor/tensor.h>
@@ -41,27 +40,16 @@ struct ET_EXPERIMENTAL RawAudio {
  */
 class ET_EXPERIMENTAL Audio final {
  public:
-  // Default constructor
-  Audio() : batch_size_(0), n_bins_(0), n_frames_(0) {}
-
   // Constructor for uint8_t data
   Audio(
       std::vector<uint8_t>&& data,
       int32_t batch_size,
       int32_t n_bins,
       int32_t n_frames)
-      : data_(std::move(data)),
-        batch_size_(batch_size),
-        n_bins_(n_bins),
-        n_frames_(n_frames) {
-    ET_CHECK_MSG(
-        data_.index() == 0 &&
-            std::get<std::vector<uint8_t>>(data_).size() ==
-                static_cast<size_t>(batch_size * n_bins * n_frames),
-        "data.size() (%zu) does not match batch_size * n_bins * n_frames (%d)",
-        std::get<std::vector<uint8_t>>(data_).size(),
-        batch_size * n_bins * n_frames);
-  }
+      : Audio(make_tensor_ptr(
+            {batch_size, n_bins, n_frames},
+            std::move(data),
+            executorch::aten::ScalarType::Byte)) {}
 
   // Constructor for float data
   Audio(
@@ -69,89 +57,64 @@ class ET_EXPERIMENTAL Audio final {
       int32_t batch_size,
       int32_t n_bins,
       int32_t n_frames)
-      : data_(std::move(data)),
-        batch_size_(batch_size),
-        n_bins_(n_bins),
-        n_frames_(n_frames) {
-    ET_CHECK_MSG(
-        data_.index() == 1 &&
-            std::get<std::vector<float>>(data_).size() ==
-                static_cast<size_t>(batch_size * n_bins * n_frames),
-        "data.size() (%zu) does not match batch_size * n_bins * n_frames (%d)",
-        std::get<std::vector<float>>(data_).size(),
-        batch_size * n_bins * n_frames);
+      : Audio(make_tensor_ptr({batch_size, n_bins, n_frames}, std::move(data))) {}
+
+  explicit Audio(
+      executorch::extension::TensorPtr tensor) : tensor_(std::move(tensor)) {
+    ET_CHECK_MSG(tensor_, "Null tensor");
+    ET_CHECK_MSG(tensor_->dim() == 3, "Invalid tensor rank");
   }
 
   // Type checkers
   bool is_uint8() const {
-    return std::holds_alternative<std::vector<uint8_t>>(data_);
+    return tensor_->scalar_type() == ::executorch::aten::ScalarType::Byte;
   }
 
   bool is_float() const {
-    return std::holds_alternative<std::vector<float>>(data_);
+    return tensor_->scalar_type() == ::executorch::aten::ScalarType::Float;
   }
 
   // Data access
-  const std::vector<uint8_t>& get_uint8_data() const& {
-    return std::get<std::vector<uint8_t>>(data_);
-  }
-
-  std::vector<uint8_t>& get_uint8_data() & {
-    return std::get<std::vector<uint8_t>>(data_);
-  }
-
-  const std::vector<float>& get_float_data() const& {
-    return std::get<std::vector<float>>(data_);
+  const uint8_t* uint8_data() const {
+    ET_DCHECK_MSG(is_uint8(), "Dtype is not uint8");
+    return tensor_->const_data_ptr<uint8_t>();
   }
 
-  std::vector<float>& get_float_data() & {
-    return std::get<std::vector<float>>(data_);
+  const float* float_data() const {
+    ET_DCHECK_MSG(is_float(), "Dtype is not float");
+    return tensor_->const_data_ptr<float>();
   }
 
   int32_t get_batch_size() const {
-    return batch_size_;
+    return tensor_->size(0);
   }
   int32_t get_n_bins() const {
-    return n_bins_;
+    return tensor_->size(1);
   }
   int32_t get_n_frames() const {
-    return n_frames_;
+    return tensor_->size(2);
   }
   /**
    * Convert the audio data to a TensorPtr, with optional batch dimension.
    * The tensor will have shape (batch_size, n_bins, n_frames) or (1,
    * batch_size, n_bins, n_frames) if with_batch is true.
    */
-  executorch::runtime::Result<executorch::extension::TensorPtr> toTensor(
+  executorch::extension::TensorPtr tensor(
       bool with_batch = false) const {
-    std::vector<executorch::aten::SizesType> sizes = {
-        get_batch_size(), get_n_bins(), get_n_frames()};
     if (with_batch) {
-      sizes.insert(sizes.begin(), 1);
-    }
-    if (is_float()) {
-      return executorch::extension::from_blob(
-          const_cast<float*>(get_float_data().data()),
-          sizes,
-          ::executorch::aten::ScalarType::Float);
-    } else if (is_uint8()) {
-      return executorch::extension::from_blob(
-          const_cast<uint8_t*>(get_uint8_data().data()),
-          sizes,
-          ::executorch::aten::ScalarType::Byte);
+      return make_tensor_ptr(
+          *tensor_,
+          {1,
+           static_cast<executorch::aten::SizesType>(tensor_->size(0)),
+           static_cast<executorch::aten::SizesType>(tensor_->size(1)),
+           static_cast<executorch::aten::SizesType>(tensor_->size(2))});
     }
-    ET_LOG(
-        Error,
-        "Shouldn't reach here, audio data is not initialized with uint8_t or float vector.");
-    return ::executorch::runtime::Error::NotSupported;
+    return tensor_;
   }
 
  private:
   // Members
-  std::variant<std::vector<uint8_t>, std::vector<float>> data_;
-  int32_t batch_size_;
-  int32_t n_bins_;
-  int32_t n_frames_;
+  executorch::extension::TensorPtr tensor_;
 };
 
 } // namespace llm
diff --git a/extension/llm/runner/image.h b/extension/llm/runner/image.h
index dbdba273536..9c7746fff2a 100644
--- a/extension/llm/runner/image.h
+++ b/extension/llm/runner/image.h
@@ -10,9 +10,7 @@
 
 #pragma once
 #include <executorch/runtime/platform/compiler.h>
-#include <cstddef>
 #include <cstdint>
-#include <variant>
 #include <vector>
 
 #include <executorch/extension/tensor/tensor.h>
@@ -22,21 +20,19 @@ namespace executorch {
 namespace extension {
 namespace llm {
 
+// Wraps a rank-3 image tensor with dimensions ordered {channels, height, width}.
 class ET_EXPERIMENTAL Image {
  public:
-  // Default constructor
-  Image() : width_(0), height_(0), channels_(0) {}
-
   // Constructor for uint8_t data
   Image(
       std::vector<uint8_t>&& data,
       int32_t width,
       int32_t height,
       int32_t channels)
-      : data_(std::move(data)),
-        width_(width),
-        height_(height),
-        channels_(channels) {}
+      : Image(make_tensor_ptr(
+            {channels, height, width},
+            std::move(data),
+            executorch::aten::ScalarType::Byte)) {}
 
   // Constructor for float data
   Image(
@@ -44,78 +40,60 @@ class ET_EXPERIMENTAL Image {
       int32_t width,
       int32_t height,
       int32_t channels)
-      : data_(std::move(data)),
-        width_(width),
-        height_(height),
-        channels_(channels) {}
+      : Image(make_tensor_ptr({channels, height, width}, std::move(data))) {}
+
+  explicit Image(executorch::extension::TensorPtr tensor) : tensor_(std::move(tensor)) {
+    ET_CHECK_MSG(tensor_, "Null tensor");
+    ET_CHECK_MSG(tensor_->dim() == 3, "Invalid tensor rank");
+  }
 
   // Getters
-  int32_t width() const {
-    return width_;
+  int32_t channels() const {
+    return tensor_->size(0);
   }
+
   int32_t height() const {
-    return height_;
+    return tensor_->size(1);
   }
-  int32_t channels() const {
-    return channels_;
+
+  int32_t width() const {
+    return tensor_->size(2);
   }
 
   // Data access
   bool is_uint8() const {
-    return std::holds_alternative<std::vector<uint8_t>>(data_);
+    return tensor_->scalar_type() == ::executorch::aten::ScalarType::Byte;
   }
 
   bool is_float() const {
-    return std::holds_alternative<std::vector<float>>(data_);
-  }
-
-  const std::vector<uint8_t>& get_uint8_data() const& {
-    return std::get<std::vector<uint8_t>>(data_);
+    return tensor_->scalar_type() == ::executorch::aten::ScalarType::Float;
   }
 
-  std::vector<uint8_t>& get_uint8_data() & {
-    return std::get<std::vector<uint8_t>>(data_);
+  const uint8_t* uint8_data() const {
+    ET_DCHECK_MSG(is_uint8(), "Dtype is not uint8");
+    return tensor_->const_data_ptr<uint8_t>();
   }
 
-  const std::vector<float>& get_float_data() const& {
-    return std::get<std::vector<float>>(data_);
+  const float* float_data() const {
+    ET_DCHECK_MSG(is_float(), "Dtype is not float");
+    return tensor_->const_data_ptr<float>();
   }
 
-  std::vector<float>& get_float_data() & {
-    return std::get<std::vector<float>>(data_);
-  }
-
-  executorch::runtime::Result<executorch::extension::TensorPtr> toTensor(
+  executorch::extension::TensorPtr tensor(
       bool with_batch = false) const {
-    // Note: This creates a 3D tensor (CHW). The model might expect a 4D
-    // tensor (NCHW). The caller should handle reshaping if needed.
-    std::vector<executorch::aten::SizesType> sizes = {
-        channels(), height(), width()};
     if (with_batch) {
-      sizes.insert(sizes.begin(), 1);
-    }
-    if (is_float()) {
-      return executorch::extension::from_blob(
-          const_cast<float*>(get_float_data().data()),
-          sizes,
-          ::executorch::aten::ScalarType::Float);
-    } else if (is_uint8()) {
-      return executorch::extension::from_blob(
-          const_cast<uint8_t*>(get_uint8_data().data()),
-          sizes,
-          ::executorch::aten::ScalarType::Byte);
+      return make_tensor_ptr(
+          *tensor_,
+          {1,
+           executorch::aten::SizesType(tensor_->size(0)),
+           executorch::aten::SizesType(tensor_->size(1)),
+           executorch::aten::SizesType(tensor_->size(2))});
     }
-    ET_LOG(
-        Error, "Image data is not initialized with uint8_t or float vector.");
-    return ::executorch::runtime::Error::NotSupported;
+    return tensor_;
   }
 
  private:
-  // Assuming NCHW format
-  std::variant<std::vector<uint8_t>, std::vector<float>> data_;
-  int32_t width_;
-  int32_t height_;
-  int32_t channels_;
+  executorch::extension::TensorPtr tensor_;
 };
 
 } // namespace llm
diff --git a/extension/llm/runner/multimodal_prefiller.cpp b/extension/llm/runner/multimodal_prefiller.cpp
index 7f5a8356979..97d52268fd8 100644
--- a/extension/llm/runner/multimodal_prefiller.cpp
+++ b/extension/llm/runner/multimodal_prefiller.cpp
@@ -77,9 +77,7 @@ Result<uint64_t> MultimodalPrefiller::prefill(
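-    // The model might expect a 4D tensor (NCHW), but toTensor() returns a 3D
-    // tensor (CHW). Add a batch dimension of 1 if needed.
+    // The model might expect a 4D tensor (NCHW), but tensor() returns a 3D
+    // tensor (CHW) by default. Add a batch dimension of 1 if needed.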
     auto expected_dims = input_meta.sizes();
-    auto image_tensor = ET_UNWRAP(
-        image.toTensor(/*with_batch*/ expected_dims.size() == 4),
-        "Failed to convert image to tensor");
+    auto image_tensor = image.tensor(/*with_batch*/ expected_dims.size() == 4);
     ET_LOG(
         Info,
         "Image tensor dim: %zu, dtype: %s",
@@ -108,8 +106,7 @@ Result<uint64_t> MultimodalPrefiller::prefill(
     auto expected_dtype = input_meta.scalar_type();
 
     // Create tensor with original dtype
-    auto audio_tensor =
-        ET_UNWRAP(audio.toTensor(), "Failed to convert audio to tensor");
+    auto audio_tensor = audio.tensor();
 
     // Convert to expected dtype if needed
     if (audio_tensor->scalar_type() != expected_dtype) {
diff --git a/extension/llm/runner/pybindings.cpp b/extension/llm/runner/pybindings.cpp
index bcc6aba0f8e..f9d471edcc4 100644
--- a/extension/llm/runner/pybindings.cpp
+++ b/extension/llm/runner/pybindings.cpp
@@ -132,7 +132,7 @@ class PyMultimodalRunner {
     }
   }
 
-  void prefill(std::vector<MultimodalInput> inputs) {
+  void prefill(const std::vector<MultimodalInput>& inputs) {
     if (!runner_) {
       throw std::runtime_error("Runner not initialized");
     }
@@ -274,14 +274,29 @@ PYBIND11_MODULE(_llm_runner, m) {
       .def_property_readonly("width", &Image::width)
       .def_property_readonly("height", &Image::height)
       .def_property_readonly("channels", &Image::channels)
-      .def_property_readonly(
-          "uint8_data",
-          static_cast<const std::vector<uint8_t>& (Image::*)() const&>(
-              &Image::get_uint8_data))
-      .def_property_readonly(
-          "float_data",
-          static_cast<const std::vector<float>& (Image::*)() const&>(
-              &Image::get_float_data))
+      .def(
+          "tensor",
+          [](const Image& image, bool with_batch) {
+            return tensor_to_torch_tensor(*image.tensor(with_batch));
+          },
+          py::arg("with_batch") = false)
+      .def_buffer([](Image& image) -> py::buffer_info {
+        // Expose the image tensor's storage read-only via the buffer protocol.
+        auto tensor = image.tensor();
+        const auto scalar_type = tensor->scalar_type();
+        const py::ssize_t element_size = elementSize(scalar_type);
+        // format() returns std::string, so it cannot bind to `const auto*`.
+        const std::string format = scalar_type == aten::ScalarType::Byte
+          ? py::format_descriptor<uint8_t>::format()
+          : py::format_descriptor<float>::format();
+        std::vector<py::ssize_t> shape(tensor->sizes().begin(), tensor->sizes().end());
+        // buffer_info takes byte strides; strides() is assumed to be in elements.
+        std::vector<py::ssize_t> strides(tensor->strides().begin(), tensor->strides().end());
+        for (auto& stride : strides) stride *= element_size;
+        return py::buffer_info(
+          tensor->mutable_data_ptr(), element_size, format, tensor->dim(),
+          shape, strides, /*readonly=*/true);
+      })
       .def("__repr__", [](const Image& img) {
         std::string dtype = "unknown";
         if (img.is_uint8()) {
@@ -297,7 +312,6 @@ PYBIND11_MODULE(_llm_runner, m) {
 
   // Bind Audio class
   py::class_<Audio>(m, "Audio")
-      .def(py::init<>())
       .def(
           py::init<std::vector<uint8_t>&&, int32_t, int32_t, int32_t>(),
           py::arg("data"),
@@ -314,18 +328,32 @@ PYBIND11_MODULE(_llm_runner, m) {
           "Create preprocessed audio data (float32)")
       .def("is_uint8", &Audio::is_uint8)
       .def("is_float", &Audio::is_float)
-      .def_property_readonly(
-          "uint8_data",
-          static_cast<const std::vector<uint8_t>& (Audio::*)() const&>(
-              &Audio::get_uint8_data))
-      .def_property_readonly(
-          "float_data",
-          static_cast<const std::vector<float>& (Audio::*)() const&>(
-              &Audio::get_float_data))
       .def_property_readonly("batch_size", &Audio::get_batch_size)
       .def_property_readonly("n_bins", &Audio::get_n_bins)
       .def_property_readonly("n_frames", &Audio::get_n_frames)
-      .def("toTensor", &Audio::toTensor)
+      .def(
+          "tensor",
+          [](const Audio& audio, bool with_batch) {
+            return tensor_to_torch_tensor(*audio.tensor(with_batch));
+          },
+          py::arg("with_batch") = false)
+      .def_buffer([](Audio& audio) -> py::buffer_info {
+        // Expose the audio tensor's storage read-only via the buffer protocol.
+        auto tensor = audio.tensor();
+        const auto scalar_type = tensor->scalar_type();
+        const py::ssize_t element_size = elementSize(scalar_type);
+        // format() returns std::string, so it cannot bind to `const auto*`.
+        const std::string format = scalar_type == aten::ScalarType::Byte
+          ? py::format_descriptor<uint8_t>::format()
+          : py::format_descriptor<float>::format();
+        std::vector<py::ssize_t> shape(tensor->sizes().begin(), tensor->sizes().end());
+        // buffer_info takes byte strides; strides() is assumed to be in elements.
+        std::vector<py::ssize_t> strides(tensor->strides().begin(), tensor->strides().end());
+        for (auto& stride : strides) stride *= element_size;
+        return py::buffer_info(
+          tensor->mutable_data_ptr(), element_size, format, tensor->dim(),
+          shape, strides, /*readonly=*/true);
+      })
       .def("__repr__", [](const Audio& audio) {
         std::string dtype = "unknown";
         if (audio.is_uint8()) {
@@ -369,10 +397,6 @@ PYBIND11_MODULE(_llm_runner, m) {
           py::init<const std::vector<uint64_t>&>(),
           py::arg("tokens"),
           "Create a MultimodalInput with pre-tokenized tokens (List[int])")
-      .def(
-          py::init<const std::vector<uint64_t>&>(),
-          py::arg("tokens"),
-          "Create a MultimodalInput with pre-tokenized tokens (List[int])")
       .def(
           py::init<const Image&>(),
           py::arg("image"),
@@ -473,6 +497,14 @@ PYBIND11_MODULE(_llm_runner, m) {
   m.def(
       "make_image_input",
       [](torch::Tensor image_tensor) -> MultimodalInput {
+        if (!image_tensor.device().is_cpu()) {
+          throw std::runtime_error("Image tensor must be on CPU");
+        }
+        if (image_tensor.scalar_type() != torch::kUInt8 &&
+            image_tensor.scalar_type() != torch::kFloat) {
+          throw std::runtime_error(
+              "Unsupported image tensor dtype. Only uint8 and float32 are supported.");
+        }
         if (image_tensor.dim() == 4) {
           if (image_tensor.size(0) != 1) {
             throw std::runtime_error(
@@ -480,56 +512,18 @@ PYBIND11_MODULE(_llm_runner, m) {
           }
           image_tensor = image_tensor.squeeze(0);
         }
-
         if (image_tensor.dim() != 3) {
           throw std::runtime_error(
-              "Image tensor must be 3-dimensional (H, W, C) or 4-dimensional (1, H, W, C)");
+              "Image tensor must be 3D (H,W,C) or (C,H,W)");
         }
-
-        int64_t height, width, channels;
-        // Check for memory format and permute to CHW if necessary
-        if (image_tensor.is_contiguous(at::MemoryFormat::ChannelsLast)) {
-          // Input is HWC, permute to CHW
-          height = image_tensor.size(0);
-          width = image_tensor.size(1);
-          channels = image_tensor.size(2);
+        if (image_tensor.size(2) == 3 || image_tensor.size(2) == 4) {
           image_tensor = image_tensor.permute({2, 0, 1});
-        } else if (image_tensor.is_contiguous(at::MemoryFormat::Contiguous)) {
-          // Input is CHW
-          channels = image_tensor.size(0);
-          height = image_tensor.size(1);
-          width = image_tensor.size(2);
-        } else {
-          throw std::runtime_error(
-              "Image tensor must be contiguous in either channels last (H, W, C) or contiguous (C, H, W) format.");
         }
-
-        if (channels != 3 && channels != 4) {
+        if (!(image_tensor.size(0) == 3 || image_tensor.size(0) == 4)) {
           throw std::runtime_error(
               "Image must have 3 (RGB) or 4 (RGBA) channels");
         }
-
-        image_tensor = image_tensor.contiguous();
-        if (image_tensor.scalar_type() == torch::kUInt8) {
-          uint8_t* data = image_tensor.data_ptr<uint8_t>();
-          std::vector<uint8_t> image_data(data, data + image_tensor.numel());
-          return MultimodalInput(Image(
-              std::move(image_data),
-              static_cast<int32_t>(width),
-              static_cast<int32_t>(height),
-              static_cast<int32_t>(channels)));
-        } else if (image_tensor.scalar_type() == torch::kFloat) {
-          float* data = image_tensor.data_ptr<float>();
-          std::vector<float> image_data(data, data + image_tensor.numel());
-          return MultimodalInput(Image(
-              std::move(image_data),
-              static_cast<int32_t>(width),
-              static_cast<int32_t>(height),
-              static_cast<int32_t>(channels)));
-        } else {
-          throw std::runtime_error(
-              "Unsupported image tensor dtype. Only uint8 and float32 are supported.");
-        }
+        return MultimodalInput(Image(tensor_to_tensor_ptr(image_tensor)));
       },
       "Create an image input from a torch tensor (H, W, C), (1, H, W, C), (C, H, W), or (1, C, H, W)",
       py::arg("image_tensor"));
@@ -537,36 +531,15 @@ PYBIND11_MODULE(_llm_runner, m) {
   m.def(
       "make_audio_input",
       [](torch::Tensor audio_tensor) -> MultimodalInput {
-        if (audio_tensor.dim() != 3) {
+        if (audio_tensor.scalar_type() != torch::kUInt8 && audio_tensor.scalar_type() != torch::kFloat) { // dtype gate runs first: anything other than uint8/float32 throws
           throw std::runtime_error(
-              "Audio tensor must be 3-dimensional (batch_size, n_bins, n_frames)");
+              "Unsupported audio tensor dtype. Only uint8 and float32 are supported.");
         }
-
-        int64_t batch_size = audio_tensor.size(0);
-        int64_t n_bins = audio_tensor.size(1);
-        int64_t n_frames = audio_tensor.size(2);
-
-        audio_tensor = audio_tensor.contiguous();
-        if (audio_tensor.scalar_type() == torch::kUInt8) {
-          uint8_t* data = audio_tensor.data_ptr<uint8_t>();
-          std::vector<uint8_t> audio_data(data, data + audio_tensor.numel());
-          return MultimodalInput(Audio(
-              std::move(audio_data),
-              static_cast<int32_t>(batch_size),
-              static_cast<int32_t>(n_bins),
-              static_cast<int32_t>(n_frames)));
-        } else if (audio_tensor.scalar_type() == torch::kFloat) {
-          float* data = audio_tensor.data_ptr<float>();
-          std::vector<float> audio_data(data, data + audio_tensor.numel());
-          return MultimodalInput(Audio(
-              std::move(audio_data),
-              static_cast<int32_t>(batch_size),
-              static_cast<int32_t>(n_bins),
-              static_cast<int32_t>(n_frames)));
-        } else {
+        if (audio_tensor.dim() != 3) { // rank check runs second: the tensor must have exactly 3 dimensions
           throw std::runtime_error(
-              "Unsupported audio tensor dtype. Only uint8 and float32 are supported for preprocessed audio.");
+              "Audio tensor must be 3-dimensional (batch_size, n_bins, n_frames)");
         }
+        return MultimodalInput(Audio(tensor_to_tensor_ptr(audio_tensor))); // NOTE(review): no .contiguous() call on this path - TODO confirm tensor_to_tensor_ptr accepts non-contiguous tensors
       },
       "Create a preprocessed audio input from a torch tensor (batch_size, n_bins, n_frames)",
       py::arg("audio_tensor"));
diff --git a/extension/llm/runner/targets.bzl b/extension/llm/runner/targets.bzl
index e001e8fc154..18f8ccb4fd5 100644
--- a/extension/llm/runner/targets.bzl
+++ b/extension/llm/runner/targets.bzl
@@ -42,8 +42,12 @@ def define_common_targets():
 
         runtime.cxx_library(
             name = "text_decoder_runner" + aten_suffix,
-            exported_headers = ["text_decoder_runner.h"],
-            srcs = ["text_decoder_runner.cpp"],
+            exported_headers = [
+                "text_decoder_runner.h",
+            ],
+            srcs = [
+                "text_decoder_runner.cpp",
+            ],
             visibility = [
                 "@EXECUTORCH_CLIENTS",
             ],
@@ -59,8 +63,12 @@ def define_common_targets():
 
         runtime.cxx_library(
             name = "text_prefiller" + aten_suffix,
-            exported_headers = ["text_prefiller.h"],
-            srcs = ["text_prefiller.cpp"],
+            exported_headers = [
+                "text_prefiller.h",
+            ],
+            srcs = [
+                "text_prefiller.cpp",
+            ],
             visibility = [
                 "@EXECUTORCH_CLIENTS",
             ],
@@ -74,7 +82,9 @@ def define_common_targets():
 
         runtime.cxx_library(
             name = "text_token_generator" + aten_suffix,
-            exported_headers = ["text_token_generator.h"],
+            exported_headers = [
+                "text_token_generator.h",
+            ],
             visibility = [
                 "@EXECUTORCH_CLIENTS",
             ],
@@ -88,7 +98,10 @@ def define_common_targets():
 
         runtime.cxx_library(
             name = "image_prefiller" + aten_suffix,
-            exported_headers = ["image_prefiller.h", "image.h"],
+            exported_headers = [
+                "image.h",
+                "image_prefiller.h",
+            ],
             visibility = [
                 "@EXECUTORCH_CLIENTS",
             ],
@@ -104,20 +117,18 @@ def define_common_targets():
             name = "multimodal_runner_lib" + aten_suffix,
             exported_headers = [
                 "audio.h",
-                "image.h",
-                "wav_loader.h",
                 "multimodal_input.h",
                 "multimodal_runner.h",
                 "multimodal_prefiller.h",
                 "multimodal_decoder_runner.h",
+                "wav_loader.h",
             ],
             srcs = [
                 "multimodal_prefiller.cpp",
             ],
             exported_deps = [
-                ":text_decoder_runner" + aten_suffix,
-                ":text_prefiller" + aten_suffix,
                 ":image_prefiller" + aten_suffix,
+                ":text_prefiller" + aten_suffix,
                 ":text_token_generator" + aten_suffix,
             ],
         )
diff --git a/extension/llm/runner/test/test_multimodal_input.cpp b/extension/llm/runner/test/test_multimodal_input.cpp
index 85d45d69173..04714349716 100644
--- a/extension/llm/runner/test/test_multimodal_input.cpp
+++ b/extension/llm/runner/test/test_multimodal_input.cpp
@@ -71,7 +71,7 @@ TEST_F(MultimodalInputTest, ImageConstructorFromImage) {
   EXPECT_EQ(input.get_image().width(), 224);
   EXPECT_EQ(input.get_image().height(), 224);
   EXPECT_EQ(input.get_image().channels(), 3);
-  EXPECT_EQ(input.get_image().get_uint8_data().size(), 224 * 224 * 3);
+  EXPECT_EQ(input.get_image().tensor()->numel(), 224 * 224 * 3);
 }
 
 TEST_F(MultimodalInputTest, ImageConstructorFromRvalueImage) {
@@ -79,7 +79,7 @@ TEST_F(MultimodalInputTest, ImageConstructorFromRvalueImage) {
   int width = img.width();
   int height = img.height();
   int channels = img.channels();
-  size_t data_size = img.get_uint8_data().size();
+  size_t data_size = img.tensor()->numel();
 
   MultimodalInput input(std::move(img));
 
@@ -89,7 +89,7 @@ TEST_F(MultimodalInputTest, ImageConstructorFromRvalueImage) {
   EXPECT_EQ(input.get_image().width(), width);
   EXPECT_EQ(input.get_image().height(), height);
   EXPECT_EQ(input.get_image().channels(), channels);
-  EXPECT_EQ(input.get_image().get_uint8_data().size(), data_size);
+  EXPECT_EQ(input.get_image().tensor()->numel(), data_size);
 }
 
 // Test copy constructor and assignment
@@ -356,7 +356,7 @@ TEST_F(MultimodalInputTest, DifferentImageSizes) {
   EXPECT_EQ(input.get_image().width(), 32);
   EXPECT_EQ(input.get_image().height(), 32);
   EXPECT_EQ(input.get_image().channels(), 1);
-  EXPECT_EQ(input.get_image().get_uint8_data().size(), 32 * 32);
+  EXPECT_EQ(input.get_image().tensor()->numel(), 32 * 32);
 }
 
 // Test with empty text
diff --git a/extension/llm/runner/test/test_runner_pybindings.py b/extension/llm/runner/test/test_runner_pybindings.py
index f30226bf3e2..c242d5374cc 100644
--- a/extension/llm/runner/test/test_runner_pybindings.py
+++ b/extension/llm/runner/test/test_runner_pybindings.py
@@ -122,7 +122,7 @@ def test_creation(self):
         image = Image([1, 2, 3, 4], 2, 2, 1)
 
         # Properties are read-only
-        self.assertEqual(image.uint8_data, [1, 2, 3, 4])
+        self.assertEqual(memoryview(image).tobytes(), bytes([1, 2, 3, 4]))
         self.assertEqual(image.width, 2)
         self.assertEqual(image.height, 2)
         self.assertEqual(image.channels, 1)