diff --git a/.ci/scripts/build_llama_android.sh b/.ci/scripts/build_llama_android.sh index ed0fa5d16bb..1b22051533d 100644 --- a/.ci/scripts/build_llama_android.sh +++ b/.ci/scripts/build_llama_android.sh @@ -42,6 +42,7 @@ build_llama_runner() { popd ANDROID_ABI=arm64-v8a cmake -DBUCK2="${BUCK2}" \ + -DBUILD_TESTING=OFF \ -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK"/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="${ANDROID_ABI}" \ -DCMAKE_INSTALL_PREFIX=cmake-android-out \ diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index 4ed5ec308c5..88fedabba27 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -169,6 +169,7 @@ cmake_build_llama_runner() { popd dir="examples/models/llama" retry cmake \ + -DBUILD_TESTING=OFF \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \ -Bcmake-out/${dir} \ diff --git a/.ci/scripts/test_llama_torchao_lowbit.sh b/.ci/scripts/test_llama_torchao_lowbit.sh index ae9924c2a2b..3fe5fa0faea 100644 --- a/.ci/scripts/test_llama_torchao_lowbit.sh +++ b/.ci/scripts/test_llama_torchao_lowbit.sh @@ -40,6 +40,7 @@ cmake --build cmake-out -j16 --target install --config Release # Install llama runner with torchao cmake -DPYTHON_EXECUTABLE=python \ + -DBUILD_TESTING=OFF \ -DCMAKE_BUILD_TYPE=Release \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh index 9a0251c9a38..0d9f2b8b141 100644 --- a/.ci/scripts/test_llava.sh +++ b/.ci/scripts/test_llava.sh @@ -64,9 +64,10 @@ cmake_install_executorch_libraries_for_android() { LLAVA_COMMON_CMAKE_ARGS=" \ + -DBUILD_TESTING=OFF \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON" diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm b/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm index c2f01bf17b1..fc7f440d999 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm @@ -14,6 +14,7 @@ using executorch::extension::llm::GenerationConfig; using executorch::extension::llm::Image; +using executorch::extension::llm::TextLLMRunner; using executorch::runtime::Error; NSErrorDomain const LLaMARunnerErrorDomain = @"LLaMARunnerErrorDomain"; @@ -23,7 +24,7 @@ @interface LLaMARunner () @end @implementation LLaMARunner { - std::unique_ptr _runner; + std::unique_ptr _runner; } - (instancetype)initWithModelPath:(NSString*)modelPath @@ -31,7 +32,7 @@ - (instancetype)initWithModelPath:(NSString*)modelPath self = [super init]; if (self) { [ExecuTorchLog.sharedLog addSink:self]; - _runner = example::Runner::create( + _runner = example::create_llama_runner( modelPath.UTF8String, tokenizerPath.UTF8String); } return self; diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt index 0f7648ff65e..8c27de20845 100644 --- a/examples/models/llama/CMakeLists.txt +++ b/examples/models/llama/CMakeLists.txt @@ -220,7 +220,6 @@ endif() target_include_directories( llama_main PUBLIC ${_common_include_directories} - ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include ) target_link_libraries(llama_main PUBLIC llama_runner ${link_libraries}) 
target_compile_options(llama_main PUBLIC ${_common_compile_options}) diff --git a/examples/models/llama/main.cpp b/examples/models/llama/main.cpp index 1c1b6f62dc1..38009dd59ec 100644 --- a/examples/models/llama/main.cpp +++ b/examples/models/llama/main.cpp @@ -81,8 +81,13 @@ int32_t main(int32_t argc, char** argv) { } #endif // create llama runner - std::unique_ptr runner = - example::Runner::create(model_path, tokenizer_path, data_path); + std::unique_ptr<::executorch::extension::llm::TextLLMRunner> runner = + example::create_llama_runner(model_path, tokenizer_path, data_path); + + if (runner == nullptr) { + ET_LOG(Error, "Failed to create llama runner"); + return 1; + } if (warmup) { runner->warmup(prompt, /*max_new_tokens=*/seq_len); diff --git a/examples/models/llama/runner/CMakeLists.txt b/examples/models/llama/runner/CMakeLists.txt index fefee61092d..a73990edd96 100644 --- a/examples/models/llama/runner/CMakeLists.txt +++ b/examples/models/llama/runner/CMakeLists.txt @@ -52,23 +52,20 @@ else() add_library(llama_runner SHARED ${_llama_runner__srcs}) endif() +# For extension_llm_runner +if(NOT TARGET extension_llm_runner) + add_subdirectory( + ${EXECUTORCH_ROOT}/extension/llm/runner + ${CMAKE_CURRENT_BINARY_DIR}/../../../../extension/llm/runner + ) +endif() + set(llama_runner_deps executorch_core extension_data_loader extension_module - extension_tensor extension_flat_tensor + extension_tensor extension_flat_tensor extension_llm_runner ) target_link_libraries(llama_runner PUBLIC ${llama_runner_deps}) -target_include_directories( - llama_runner - INTERFACE ${_common_include_directories} -) - -# Include tokenizers dependency -set(CMAKE_POSITION_INDEPENDENT_CODE ON) -add_subdirectory( - ${EXECUTORCH_ROOT}/extension/llm/tokenizers - ${CMAKE_CURRENT_BINARY_DIR}/tokenizers -) target_link_libraries( llama_runner PUBLIC tokenizers ) diff --git a/examples/models/llama/runner/runner.cpp b/examples/models/llama/runner/runner.cpp index 119eedc704e..2ba2fdf9941 100644 --- a/examples/models/llama/runner/runner.cpp +++ b/examples/models/llama/runner/runner.cpp @@ -11,8 +11,7 @@ // The module takes in a string as input and emits a string as output. 
#include - -#include +#include #include #include @@ -26,41 +25,14 @@ using ::executorch::runtime::Result; namespace llm = ::executorch::extension::llm; -namespace { -static constexpr auto kEnableDynamicShape = "enable_dynamic_shape"; -static constexpr auto kBosId = "get_bos_id"; -static constexpr auto kEosIds = "get_eos_ids"; -static constexpr auto kMaxSeqLen = "get_max_seq_len"; -static constexpr auto kMaxContextLen = "get_max_context_len"; -static constexpr auto kVocabSize = "get_vocab_size"; -static constexpr auto kUseKVCache = "use_kv_cache"; -static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache"; - -std::unique_ptr<::tokenizers::Tokenizer> load_tokenizer( - const std::string& tokenizer_path) { - auto json_tokenizer = std::make_unique(); - if (json_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { - ET_LOG(Info, "Loaded json tokenizer"); - return json_tokenizer; - } - - auto tiktoken_tokenizer = get_tiktoken_for_llama(); - if (tiktoken_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { - ET_LOG(Info, "Loaded TikToken tokenizer"); - return tiktoken_tokenizer; - } - - auto bpe_tokenizer = std::make_unique<::tokenizers::Llama2cTokenizer>(); - if (bpe_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { - ET_LOG(Info, "Loaded BPE tokenizer"); - return bpe_tokenizer; - } - - return nullptr; +std::unique_ptr<::tokenizers::Tokenizer> load_llama_tokenizer( + const std::string& tokenizer_path, + Version version) { + auto special_tokens = get_special_tokens(version); + return llm::load_tokenizer(tokenizer_path, std::move(special_tokens)); } -} // namespace -std::unique_ptr Runner::create( +std::unique_ptr create_llama_runner( const std::string& model_path, const std::string& tokenizer_path, std::optional data_path, @@ -71,29 +43,10 @@ std::unique_ptr Runner::create( model_path.c_str(), tokenizer_path.c_str()); - // Create the Module - std::unique_ptr module; - if (data_path.has_value()) { - module = std::make_unique( - model_path, data_path.value(), Module::LoadMode::File); - } else { - module = std::make_unique(model_path, Module::LoadMode::File); - } - - // Initialize metadata with default values - std::unordered_map metadata({ - {kEnableDynamicShape, false}, - {kMaxSeqLen, 128}, - {kMaxContextLen, 128}, - {kUseKVCache, true}, - {kUseSDPAWithKVCache, false}, - }); - // Create and load tokenizer std::unique_ptr<::tokenizers::Tokenizer> tokenizer = - load_tokenizer(tokenizer_path); + load_llama_tokenizer(tokenizer_path, Version::Default); - // Fallback to BPE tokenizer if tiktoken fails if (tokenizer == nullptr) { ET_LOG( Info, @@ -101,279 +54,8 @@ std::unique_ptr Runner::create( tokenizer_path.c_str()); return nullptr; } - - ET_LOG(Info, "Reading metadata from model"); - - // Set tokenizer-related metadata - metadata[kBosId] = tokenizer->bos_tok(); - auto eos_ids = std::make_unique>( - std::unordered_set{tokenizer->eos_tok()}); - metadata[kVocabSize] = tokenizer->vocab_size(); - - // Read metadata from the model - auto method_names_result = module->method_names(); - if (method_names_result.error() != Error::Ok) { - ET_LOG(Error, "Failed reading method names"); - return nullptr; - } - const auto method_names = method_names_result.get(); - - for (auto& pair : metadata) { - const auto& method_name = pair.first; - auto& value = pair.second; - - if (method_names.count(method_name)) { - auto get_result = module->get(method_name); - value = get_result.get().toScalar().to(); - } else { - ET_LOG( - Info, - "Method %s not found, using the default value %" 
PRId64, - method_name.c_str(), - value); - } - ET_LOG(Info, "Metadata: %s = %" PRId64, method_name.c_str(), value); - } - - // Get EOS IDs if available - if (method_names.count(kEosIds)) { - eos_ids->clear(); - auto execute_result = module->execute(kEosIds); - if (execute_result.error() != Error::Ok) { - ET_LOG(Error, "Failed to execute %s", kEosIds); - return nullptr; - } - for (const auto& eos_id : execute_result.get()) { - auto value = eos_id.toScalar().to(); - eos_ids->emplace(value); - ET_LOG(Info, "eos_id = %" PRId64, value); - } - } - - // Create text_decoder_runner. Use a shared_ptr so that it can be shared with - // TextPrefiller and TextTokenGenerator - auto text_decoder_runner = std::make_unique( - module.get(), metadata.at(kUseKVCache)); - - // Create text_prefiller - auto text_prefiller = std::make_unique( - text_decoder_runner.get(), - metadata.at(kUseKVCache), - metadata.at(kEnableDynamicShape), - metadata.at(kMaxSeqLen)); - - // Create text_token_generator with stats - auto stats = std::make_unique(); - auto text_token_generator = std::make_unique( - tokenizer.get(), - text_decoder_runner.get(), - metadata.at(kUseKVCache), - std::move(eos_ids), - stats.get()); - - // Create and return the Runner instance - return std::make_unique( - std::move(metadata), - std::move(tokenizer), - std::move(module), - std::move(text_decoder_runner), - std::move(text_prefiller), - std::move(text_token_generator), - std::move(stats), - temperature); -} - -Runner::Runner( - std::unordered_map metadata, - std::unique_ptr<::tokenizers::Tokenizer> tokenizer, - std::unique_ptr<::executorch::extension::Module> module, - std::unique_ptr<::executorch::extension::llm::TextDecoderRunner> - text_decoder_runner, - std::unique_ptr<::executorch::extension::llm::TextPrefiller> text_prefiller, - std::unique_ptr<::executorch::extension::llm::TextTokenGenerator> - text_token_generator, - std::unique_ptr<::executorch::extension::llm::Stats> stats, - float temperature) - : tokenizer_(std::move(tokenizer)), - metadata_(std::move(metadata)), - module_(std::move(module)), - text_decoder_runner_(std::move(text_decoder_runner)), - text_prefiller_(std::move(text_prefiller)), - text_token_generator_(std::move(text_token_generator)), - stats_(std::move(stats)), - temperature_(temperature) { - // Note: This constructor assumes that text_prefiller and text_token_generator - // already have references to the Module and TextDecoderRunner they need -} - -bool Runner::is_loaded() const { - return text_prefiller_->is_loaded() && text_token_generator_->is_loaded(); -} - -Error Runner::load() { - if (is_loaded()) { - return Error::Ok; - } - ET_CHECK_OK_OR_RETURN_ERROR(text_prefiller_->load()); - ET_CHECK_OK_OR_RETURN_ERROR(text_token_generator_->load()); - return Error::Ok; -} - -// Don't print with the same priority during warmup -#define RUNNER_ET_LOG(warmup, format, ...) \ - if (warmup) { \ - ET_LOG(Debug, format, __VA_ARGS__); \ - } else { \ - ET_LOG(Info, format, __VA_ARGS__); \ - } - -Error Runner::generate( - const std::string& prompt, - const ::executorch::extension::llm::GenerationConfig& config, - std::function token_callback, - std::function stats_callback) { - // Prepare the inputs. - // Use ones-initialized inputs. 
- ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); - if (!is_loaded()) { - stats_->model_load_start_ms = llm::time_in_ms(); - ET_CHECK_OK_OR_RETURN_ERROR(load()); - stats_->model_load_end_ms = llm::time_in_ms(); - } - - if (config.warming) { - ET_LOG(Info, "Doing a warmup run..."); - } - - RUNNER_ET_LOG( - config.warming, - "RSS after loading model: %f MiB (0 if unsupported)", - llm::get_rss_bytes() / 1024.0 / 1024.0); - - // Wrap the token_callback with print function - std::function wrapped_callback = - [token_callback, config](const std::string& piece) { - if (!config.warming) { - llm::safe_printf(piece.c_str()); - fflush(stdout); - } - if (token_callback) { - token_callback(piece); - } - }; - // First token time only measures the time it takes to encode the prompt and - // return a response token. - - stats_->inference_start_ms = llm::time_in_ms(); - shouldStop_ = false; - - ::tokenizers::Result> encode_res = tokenizer_->encode( - prompt, - /* bos */ 0, - /* eos */ 0); - - ET_CHECK_TK_OK_OR_RETURN_ERROR( - encode_res.error(), "Failed to encode prompt %s", prompt.c_str()); - - // encode the (string) prompt into tokens sequence - std::vector prompt_tokens = encode_res.get(); - int num_prompt_tokens = prompt_tokens.size(); - - ET_CHECK_MSG(num_prompt_tokens >= 1, "Expected at least 1 prompt token"); - ET_CHECK_MSG( - num_prompt_tokens < metadata_.at(kMaxContextLen), - "num_prompt_tokens %d >= max_seq_len_ %" PRId64 - ", Max seq length exceeded - please increase max seq len value in your export script", - num_prompt_tokens, - metadata_.at(kMaxContextLen)); - - // Determine max_new_tokens using the GenerationConfig's resolve method - int max_new_tokens = config.resolve_max_new_tokens( - metadata_.at(kMaxContextLen), num_prompt_tokens); - - ET_LOG(Info, "Max new tokens resolved: %d", max_new_tokens); - - // Prefill first - // Here feed all tokens to the model and get the next predicted token - // after the prompt. After that we will enter generate loop. - - // print prompts - if (config.echo) { - wrapped_callback(prompt); - } - int64_t pos = 0; - auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos); - ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error()); - uint64_t cur_token = prefill_res.get(); - stats_->first_token_ms = llm::time_in_ms(); - stats_->prompt_eval_end_ms = llm::time_in_ms(); - - // print the first token from prefill. No prev_token so use cur_token for it. - wrapped_callback( - ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token))); - RUNNER_ET_LOG( - config.warming, - "RSS after prompt prefill: %f MiB (0 if unsupported)", - llm::get_rss_bytes() / 1024.0 / 1024.0); - - // start the main loop - prompt_tokens.push_back(cur_token); - - // Generate max_new_tokens - 1 because prefill already generated 1 token. - int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate( - prompt_tokens, - num_prompt_tokens, - max_new_tokens - 1, - temperature_ == -1.0f ? 
config.temperature : temperature_, - wrapped_callback)); - - stats_->inference_end_ms = llm::time_in_ms(); - if (!config.warming) { - printf("\n"); - } - RUNNER_ET_LOG( - config.warming, - "RSS after finishing text generation: %f MiB (0 if unsupported)", - llm::get_rss_bytes() / 1024.0 / 1024.0); - - if (num_generated_tokens == max_new_tokens) { - RUNNER_ET_LOG(config.warming, "Max new tokens %i reached!", max_new_tokens); - } - - stats_->num_prompt_tokens = num_prompt_tokens; - stats_->num_generated_tokens = num_generated_tokens; - - if (config.warming) { - ET_LOG(Info, "Warmup run finished!"); - } else { - // Do not print report during warmup - ::executorch::llm::print_report(*stats_); - } - if (stats_callback) { - stats_callback(*stats_); - } - - return Error::Ok; -} - -Error Runner::warmup(const std::string& prompt, int32_t max_new_tokens) { - // Create a GenerationConfig for warmup - llm::GenerationConfig config{ - .echo = false, .max_new_tokens = max_new_tokens, .warming = true}; - - // Call generate with the warmup config - Error err = generate(prompt, config); - - // Reset stats after warmup, not resetting the std::unique_ptr! - stats_->reset(); - return err; + return llm::create_text_llm_runner( + model_path, std::move(tokenizer), data_path); } -void Runner::stop() { - if (is_loaded()) { - text_token_generator_->stop(); - } else { - ET_LOG(Error, "Token generator is not loaded, cannot stop"); - } -} } // namespace example diff --git a/examples/models/llama/runner/runner.h b/examples/models/llama/runner/runner.h index e4e91db37d5..09a166b0109 100644 --- a/examples/models/llama/runner/runner.h +++ b/examples/models/llama/runner/runner.h @@ -19,74 +19,20 @@ #include #include -#include -#include -#include -#include -#include +#include #include namespace example { -class ET_EXPERIMENTAL Runner : public executorch::extension::llm::IRunner { - public: - // Static factory method to create a Runner instance - static std::unique_ptr create( - const std::string& model_path, - const std::string& tokenizer_path, - std::optional data_path = std::nullopt, - float temperature = -1.0f); +namespace llm = ::executorch::extension::llm; - // Constructor with dependency injection - explicit Runner( - std::unordered_map metadata, - std::unique_ptr<::tokenizers::Tokenizer> tokenizer, - std::unique_ptr<::executorch::extension::Module> module, - std::unique_ptr<::executorch::extension::llm::TextDecoderRunner> - text_decoder_runner, - std::unique_ptr<::executorch::extension::llm::TextPrefiller> - text_prefiller, - std::unique_ptr<::executorch::extension::llm::TextTokenGenerator> - text_token_generator, - std::unique_ptr<::executorch::extension::llm::Stats> stats, - float temperature = -1.0f); +std::unique_ptr create_llama_runner( + const std::string& model_path, + const std::string& tokenizer_path, + std::optional data_path = std::nullopt, + float temperature = -1.0f); - bool is_loaded() const override; - ::executorch::runtime::Error load() override; - ::executorch::runtime::Error generate( - const std::string& prompt, - const ::executorch::extension::llm::GenerationConfig& config, - std::function token_callback = {}, - std::function - stats_callback = {}) override; - ::executorch::runtime::Error warmup( - const std::string& prompt, - int32_t max_new_tokens); - void stop() override; - - private: - bool shouldStop_{false}; - - // Components - std::unique_ptr<::tokenizers::Tokenizer> tokenizer_; - std::unordered_map metadata_; - std::unique_ptr<::executorch::extension::Module> - module_; // Manage module's 
lifecycle, make sure it outlives - // text_decoder_runner_. - std::unique_ptr<::executorch::extension::llm::TextDecoderRunner> - text_decoder_runner_; // Manage text_decoder_runner_'s lifecycle, make - // sure it outlives text_prefiller_ & - // text_token_generator_. - std::unique_ptr<::executorch::extension::llm::TextPrefiller> text_prefiller_; - std::unique_ptr<::executorch::extension::llm::TextTokenGenerator> - text_token_generator_; - - // Stats - std::unique_ptr<::executorch::extension::llm::Stats> stats_; - - // temperature. - // Deprecated, we should rely on the temperature in GenerationConfig instead. - float temperature_ = -1.0f; -}; +std::unique_ptr load_llama_tokenizer( + const std::string& tokenizer_path); } // namespace example diff --git a/examples/models/llama/runner/targets.bzl b/examples/models/llama/runner/targets.bzl index 158202cf55a..e0a96af85bb 100644 --- a/examples/models/llama/runner/targets.bzl +++ b/examples/models/llama/runner/targets.bzl @@ -34,16 +34,12 @@ def define_common_targets(): visibility = [ "@EXECUTORCH_CLIENTS", ], + compiler_flags = [ + "-Wno-missing-prototypes", + ], exported_deps = [ "//executorch/backends/xnnpack:xnnpack_backend", - "//executorch/extension/llm/runner:irunner", - "//executorch/extension/llm/runner:stats", - "//executorch/extension/llm/runner:text_decoder_runner" + aten_suffix, - "//executorch/extension/llm/runner:text_prefiller" + aten_suffix, - "//executorch/extension/llm/runner:text_token_generator" + aten_suffix, - "//executorch/extension/evalue_util:print_evalue" + aten_suffix, - "//executorch/extension/module:module" + aten_suffix, - "//executorch/extension/tensor:tensor" + aten_suffix, + "//executorch/extension/llm/runner:runner_lib" + aten_suffix, "//executorch/kernels/quantized:generated_lib" + aten_suffix, "//executorch/runtime/core/exec_aten:lib" + aten_suffix, "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, diff --git a/examples/models/llama/runner/test/CMakeLists.txt b/examples/models/llama/runner/test/CMakeLists.txt deleted file mode 100644 index aa754b96da6..00000000000 --- a/examples/models/llama/runner/test/CMakeLists.txt +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# This file should be formatted with -# ~~~ -# cmake-format -i CMakeLists.txt -# ~~~ -# It should also be cmake-lint clean. -# - -cmake_minimum_required(VERSION 3.19) - -set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) - -include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) - -set(_test_srcs test_runner.cpp) - -et_cxx_test( - test_runner - SOURCES - ${_test_srcs} - EXTRA_LIBS - executorch -) diff --git a/examples/models/llama/runner/test/TARGETS b/examples/models/llama/runner/test/TARGETS deleted file mode 100644 index 97de7abe9b1..00000000000 --- a/examples/models/llama/runner/test/TARGETS +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. 
- -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/examples/models/llama/runner/test/targets.bzl b/examples/models/llama/runner/test/targets.bzl deleted file mode 100644 index 3b02360da08..00000000000 --- a/examples/models/llama/runner/test/targets.bzl +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") - -def define_common_targets(): - runtime.cxx_test( - name = "test_runner", - srcs = ["test_runner.cpp"], - deps = [ - "//executorch/examples/models/llama/runner:runner", - "//executorch/extension/llm/runner:irunner", - "//executorch/extension/llm/runner:stats", - "//executorch/extension/llm/runner:text_token_generator", - "//executorch/extension/llm/runner:text_decoder_runner", - "//executorch/extension/llm/runner:text_prefiller", - "//executorch/extension/module:module", - "//executorch/runtime/core:core", - "//executorch/runtime/platform:platform", - "//executorch/runtime/core/exec_aten/testing_util:tensor_util", - ], - ) diff --git a/examples/models/llama/tokenizer/llama_tiktoken.cpp b/examples/models/llama/tokenizer/llama_tiktoken.cpp index f595de3c4e7..7b98a6ca415 100644 --- a/examples/models/llama/tokenizer/llama_tiktoken.cpp +++ b/examples/models/llama/tokenizer/llama_tiktoken.cpp @@ -42,7 +42,9 @@ _get_default_special_tokens() { return special_tokens; } -std::unique_ptr> _get_special_tokens(Version version) { +} // namespace + +std::unique_ptr> get_special_tokens(Version version) { switch (version) { case Version::Multimodal: return get_multimodal_special_tokens(); @@ -51,11 +53,9 @@ std::unique_ptr> _get_special_tokens(Version version) { } } -} // namespace - std::unique_ptr get_tiktoken_for_llama(Version version) { return std::make_unique( - _get_special_tokens(version), kBOSTokenIndex, kEOSTokenIndex); + get_special_tokens(version), kBOSTokenIndex, kEOSTokenIndex); } std::unique_ptr> get_multimodal_special_tokens() { diff --git a/examples/models/llama/tokenizer/llama_tiktoken.h b/examples/models/llama/tokenizer/llama_tiktoken.h index a7f65eca29e..01d836ffbe6 100644 --- a/examples/models/llama/tokenizer/llama_tiktoken.h +++ b/examples/models/llama/tokenizer/llama_tiktoken.h @@ -20,6 +20,8 @@ enum class Version { std::unique_ptr<::tokenizers::Tiktoken> get_tiktoken_for_llama( Version version = Version::Default); +std::unique_ptr> get_special_tokens(Version version); + std::unique_ptr> get_multimodal_special_tokens(); } // namespace example diff --git a/examples/models/llava/runner/CMakeLists.txt b/examples/models/llava/runner/CMakeLists.txt index 1f9d6fa8e1d..016678e3c54 100644 --- a/examples/models/llava/runner/CMakeLists.txt +++ b/examples/models/llava/runner/CMakeLists.txt @@ -28,8 +28,6 @@ set(_common_include_directories ${EXECUTORCH_ROOT}/..) 
# build llava_runner library set(_llava_runner__srcs "${CMAKE_CURRENT_SOURCE_DIR}/llava_runner.cpp" - "${EXECUTORCH_ROOT}/extension/llm/sampler/sampler.cpp" - "${EXECUTORCH_ROOT}/extension/llm/tokenizers/src/llama2c_tokenizer.cpp" ) # extension llm runner lib @@ -45,8 +43,3 @@ set(llava_runner_deps executorch_core extension_data_loader extension_llm_runner ) target_link_libraries(llava_runner PUBLIC ${llava_runner_deps}) - -target_include_directories( - llava_runner INTERFACE ${_common_include_directories} - ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include -) diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index 03e26f089db..ad1c77a92b9 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -168,7 +168,9 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { std::optional data_path_str = data_path ? std::optional{data_path->toStdString()} : std::nullopt; - runner_ = example::Runner::create( + // TODO(larryliu0820): Use the API in text_llm_runner.h to create the + // runner. + runner_ = example::create_llama_runner( model_path->toStdString(), tokenizer_path->toStdString(), data_path_str); diff --git a/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj b/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj index b0cddfa808c..f6fe811b4ab 100644 --- a/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj +++ b/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj @@ -33,6 +33,7 @@ 30AA4B642DC0766800B1BE50 /* std_regex.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 30AA4B5E2DC0766800B1BE50 /* std_regex.cpp */; }; 30AA4B652DC0766800B1BE50 /* pre_tokenizer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 30AA4B5B2DC0766800B1BE50 /* pre_tokenizer.cpp */; }; 30AA4B662DC0766800B1BE50 /* re2_regex.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 30AA4B5C2DC0766800B1BE50 /* re2_regex.cpp */; }; + F22E9E1A2DF2CBB900EC5425 /* text_llm_runner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F22E9E192DF2CBB900EC5425 /* text_llm_runner.cpp */; }; F292B01D2D88AF3500BE6839 /* bpe_tokenizer_base.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F292B0162D88AF3500BE6839 /* bpe_tokenizer_base.cpp */; }; F292B0202D88AF3500BE6839 /* llama2c_tokenizer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F292B0172D88AF3500BE6839 /* llama2c_tokenizer.cpp */; }; F292B0212D88AF3500BE6839 /* tiktoken.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F292B01A2D88AF3500BE6839 /* tiktoken.cpp */; }; @@ -94,6 +95,8 @@ 30AA4B5D2DC0766800B1BE50 /* regex.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = regex.cpp; path = src/regex.cpp; sourceTree = ""; }; 30AA4B5E2DC0766800B1BE50 /* std_regex.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = std_regex.cpp; path = src/std_regex.cpp; sourceTree = ""; }; 30AA4B5F2DC0766800B1BE50 /* token_decoder.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = token_decoder.cpp; path = src/token_decoder.cpp; sourceTree = ""; }; + F22E9E182DF2CBB900EC5425 /* text_llm_runner.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = text_llm_runner.h; sourceTree = ""; }; + F22E9E192DF2CBB900EC5425 /* text_llm_runner.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = text_llm_runner.cpp; sourceTree = ""; }; F292B0162D88AF3500BE6839 /* bpe_tokenizer_base.cpp */ = 
{isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = bpe_tokenizer_base.cpp; path = src/bpe_tokenizer_base.cpp; sourceTree = ""; }; F292B0172D88AF3500BE6839 /* llama2c_tokenizer.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = llama2c_tokenizer.cpp; path = src/llama2c_tokenizer.cpp; sourceTree = ""; }; F292B01A2D88AF3500BE6839 /* tiktoken.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = tiktoken.cpp; path = src/tiktoken.cpp; sourceTree = ""; }; @@ -146,6 +149,8 @@ 032A73E02CAFBB7800932D36 /* runner */ = { isa = PBXGroup; children = ( + F22E9E182DF2CBB900EC5425 /* text_llm_runner.h */, + F22E9E192DF2CBB900EC5425 /* text_llm_runner.cpp */, 032A73D42CAFBB7800932D36 /* image.h */, 032A73D52CAFBB7800932D36 /* image_prefiller.h */, 032A73D62CAFBB7800932D36 /* multimodal_runner.h */, @@ -409,6 +414,7 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( + F22E9E1A2DF2CBB900EC5425 /* text_llm_runner.cpp in Sources */, 03B0118E2CAC567900054791 /* DynamicTestCase.m in Sources */, 032A74182CAFBB7800932D36 /* text_decoder_runner.cpp in Sources */, 032A741D2CAFBB7800932D36 /* text_prefiller.cpp in Sources */, diff --git a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm index e53a457939c..c56f054ae3b 100644 --- a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm +++ b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm @@ -74,7 +74,7 @@ @implementation LLaMATests NSString *tokenizerPath = resources[@"tokenizer"]; return @{ @"generate" : ^(XCTestCase *testCase){ - auto __block runner = example::Runner::create( + auto __block runner = example::create_llama_runner( modelPath.UTF8String, tokenizerPath.UTF8String); if (!runner) { XCTFail("Failed to create runner"); diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt index 9696737f471..05f47e55c74 100644 --- a/extension/llm/runner/CMakeLists.txt +++ b/extension/llm/runner/CMakeLists.txt @@ -43,16 +43,15 @@ target_include_directories( add_library(extension_llm_runner STATIC ${_extension_llm_runner__srcs}) -set(runner_deps executorch_core extension_module extension_tensor) +# add tokenizers +add_subdirectory( + ${EXECUTORCH_ROOT}/extension/llm/tokenizers + ${CMAKE_CURRENT_BINARY_DIR}/../../../extension/llm/tokenizers +) -target_link_libraries(extension_llm_runner PUBLIC ${runner_deps}) +set(runner_deps executorch_core extension_module extension_tensor tokenizers) -target_include_directories( - extension_llm_runner - PUBLIC - ${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/llama.cpp-unicode/include - ${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/llama.cpp-unicode/src -) +target_link_libraries(extension_llm_runner PUBLIC ${runner_deps}) target_include_directories( extension_llm_runner INTERFACE ${_common_include_directories} diff --git a/extension/llm/runner/targets.bzl b/extension/llm/runner/targets.bzl index 03b593cacf5..2e8231748ed 100644 --- a/extension/llm/runner/targets.bzl +++ b/extension/llm/runner/targets.bzl @@ -84,14 +84,26 @@ def define_common_targets(): name = "runner_lib" + aten_suffix, exported_headers = [ "multimodal_runner.h", + "text_llm_runner.h", + ], + srcs = [ + "text_llm_runner.cpp", ], visibility = [ "@EXECUTORCH_CLIENTS", ], + compiler_flags = [ + "-Wno-missing-prototypes", + ], exported_deps = [ ":image_prefiller" + aten_suffix, + ":irunner", ":text_decoder_runner" + aten_suffix, 
":text_prefiller" + aten_suffix, ":text_token_generator" + aten_suffix, + "//pytorch/tokenizers:hf_tokenizer", + "//pytorch/tokenizers:llama2c_tokenizer", + # "//pytorch/tokenizers:sentencepiece", # TODO(larryliu0820) Make sure this compiles in xplat. + "//pytorch/tokenizers:tiktoken", ], ) diff --git a/extension/llm/runner/test/CMakeLists.txt b/extension/llm/runner/test/CMakeLists.txt index b17a318a080..ac46f0021fb 100644 --- a/extension/llm/runner/test/CMakeLists.txt +++ b/extension/llm/runner/test/CMakeLists.txt @@ -17,12 +17,13 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) -set(_test_srcs generation_config_test.cpp) +set(_test_srcs test_generation_config.cpp test_text_llm_runner.cpp) et_cxx_test( - generation_config_test + test_runner SOURCES ${_test_srcs} EXTRA_LIBS executorch + extension_llm_runner ) diff --git a/extension/llm/runner/test/targets.bzl b/extension/llm/runner/test/targets.bzl index 9cdaad990bb..a5c8be7b6de 100644 --- a/extension/llm/runner/test/targets.bzl +++ b/extension/llm/runner/test/targets.bzl @@ -8,8 +8,8 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") def define_common_targets(): runtime.cxx_test( - name = "generation_config_test", - srcs = ["generation_config_test.cpp"], + name = "test_generation_config", + srcs = ["test_generation_config.cpp"], deps = [ "//executorch/extension/llm/runner:irunner", "//executorch/extension/llm/runner:stats", @@ -17,3 +17,13 @@ def define_common_targets(): "//executorch/runtime/platform:platform", ], ) + + runtime.cxx_test( + name = "test_text_llm_runner", + srcs = ["test_text_llm_runner.cpp"], + deps = [ + "//executorch/extension/llm/runner:irunner", + "//executorch/extension/llm/runner:runner_lib", + "//executorch/runtime/core/exec_aten/testing_util:tensor_util", + ], + ) diff --git a/extension/llm/runner/test/generation_config_test.cpp b/extension/llm/runner/test/test_generation_config.cpp similarity index 100% rename from extension/llm/runner/test/generation_config_test.cpp rename to extension/llm/runner/test/test_generation_config.cpp diff --git a/examples/models/llama/runner/test/test_runner.cpp b/extension/llm/runner/test/test_text_llm_runner.cpp similarity index 97% rename from examples/models/llama/runner/test/test_runner.cpp rename to extension/llm/runner/test/test_text_llm_runner.cpp index f158ca8515d..a9c2c680609 100644 --- a/examples/models/llama/runner/test/test_runner.cpp +++ b/extension/llm/runner/test/test_text_llm_runner.cpp @@ -7,18 +7,19 @@ * @lint-ignore-every CLANGTIDY facebook-hte-Deprecated */ -#include #include +#include +#include #include #include #include #include using namespace ::testing; -using namespace example; using executorch::extension::llm::GenerationConfig; using executorch::extension::llm::Stats; using executorch::extension::llm::TextDecoderRunner; +using executorch::extension::llm::TextLLMRunner; using executorch::extension::llm::TextPrefiller; using executorch::extension::llm::TextTokenGenerator; using executorch::runtime::Error; @@ -212,7 +213,7 @@ TEST_F(RunnerTest, GenerateCallsCallbackExactlyMaxNewTokensTimes) { tokenizer.get(), text_decoder_runner.get(), stats.get()); // Create a Runner with our mocked components - Runner runner( + TextLLMRunner runner( createDefaultMetadata(), std::unique_ptr<::tokenizers::Tokenizer>(tokenizer.release()), std::make_unique(), @@ -271,7 +272,7 @@ TEST_F(RunnerTest, WarmupCallsGenerateWithWarmingFlag) { tokenizer.get(), text_decoder_runner.get(), 
stats.get()); // Create a Runner with our mocked components - Runner runner( + TextLLMRunner runner( createDefaultMetadata(), std::move(tokenizer), std::make_unique(), @@ -305,7 +306,7 @@ TEST_F(RunnerTest, IsLoadedReturnsTrueWhenComponentsInitialized) { tokenizer.get(), text_decoder_runner.get(), stats.get()); // Create a Runner with our mocked components - Runner runner( + TextLLMRunner runner( createDefaultMetadata(), std::unique_ptr<::tokenizers::Tokenizer>(tokenizer.release()), std::make_unique(), diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp new file mode 100644 index 00000000000..9fa20d2646e --- /dev/null +++ b/extension/llm/runner/text_llm_runner.cpp @@ -0,0 +1,394 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + * @lint-ignore-every CLANGTIDY facebook-hte-Deprecated + */ + +// A simple llama2 runner that includes preprocessing and post processing logic. +// The module takes in a string as input and emits a string as output. + +#include +#include +#include +#include +#include + +namespace executorch::extension::llm { + +using ::executorch::extension::Module; +using ::executorch::runtime::Error; +using ::executorch::runtime::Result; + +static constexpr auto kEnableDynamicShape = "enable_dynamic_shape"; +static constexpr auto kBosId = "get_bos_id"; +static constexpr auto kEosIds = "get_eos_ids"; +static constexpr auto kMaxSeqLen = "get_max_seq_len"; +static constexpr auto kMaxContextLen = "get_max_context_len"; +static constexpr auto kVocabSize = "get_vocab_size"; +static constexpr auto kUseKVCache = "use_kv_cache"; +static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache"; + +TextLLMRunner::TextLLMRunner( + std::unordered_map metadata, + std::unique_ptr<::tokenizers::Tokenizer> tokenizer, + std::unique_ptr<::executorch::extension::Module> module, + std::unique_ptr text_decoder_runner, + std::unique_ptr text_prefiller, + std::unique_ptr text_token_generator, + std::unique_ptr stats, + float temperature) + : tokenizer_(std::move(tokenizer)), + metadata_(std::move(metadata)), + module_(std::move(module)), + text_decoder_runner_(std::move(text_decoder_runner)), + text_prefiller_(std::move(text_prefiller)), + text_token_generator_(std::move(text_token_generator)), + stats_(std::move(stats)), + temperature_(temperature) { + // Note: This constructor assumes that text_prefiller and text_token_generator + // already have references to the Module and TextDecoderRunner they need +} + +bool TextLLMRunner::is_loaded() const { + return text_prefiller_->is_loaded() && text_token_generator_->is_loaded(); +} + +Error TextLLMRunner::load() { + if (is_loaded()) { + return Error::Ok; + } + ET_CHECK_OK_OR_RETURN_ERROR(text_prefiller_->load()); + ET_CHECK_OK_OR_RETURN_ERROR(text_token_generator_->load()); + return Error::Ok; +} + +// Don't print with the same priority during warmup +#define RUNNER_ET_LOG(warmup, format, ...) \ + if (warmup) { \ + ET_LOG(Debug, format, __VA_ARGS__); \ + } else { \ + ET_LOG(Info, format, __VA_ARGS__); \ + } + +Error TextLLMRunner::generate( + const std::string& prompt, + const GenerationConfig& config, + std::function token_callback, + std::function stats_callback) { + // Prepare the inputs. + // Use ones-initialized inputs. 
+ ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); + if (!is_loaded()) { + stats_->model_load_start_ms = time_in_ms(); + ET_CHECK_OK_OR_RETURN_ERROR(load()); + stats_->model_load_end_ms = time_in_ms(); + } + + if (config.warming) { + ET_LOG(Info, "Doing a warmup run..."); + } + + RUNNER_ET_LOG( + config.warming, + "RSS after loading model: %f MiB (0 if unsupported)", + get_rss_bytes() / 1024.0 / 1024.0); + + // Wrap the token_callback with print function + std::function wrapped_callback = + [token_callback, config](const std::string& piece) { + if (!config.warming) { + llm::safe_printf(piece.c_str()); + fflush(stdout); + } + if (token_callback) { + token_callback(piece); + } + }; + // First token time only measures the time it takes to encode the prompt and + // return a response token. + + stats_->inference_start_ms = time_in_ms(); + shouldStop_ = false; + + ::tokenizers::Result> encode_res = tokenizer_->encode( + prompt, + /* bos */ 0, + /* eos */ 0); + + ET_CHECK_TK_OK_OR_RETURN_ERROR( + encode_res.error(), "Failed to encode prompt %s", prompt.c_str()); + + // encode the (string) prompt into tokens sequence + std::vector prompt_tokens = encode_res.get(); + int num_prompt_tokens = prompt_tokens.size(); + + ET_CHECK_MSG(num_prompt_tokens >= 1, "Expected at least 1 prompt token"); + ET_CHECK_MSG( + num_prompt_tokens < metadata_.at(kMaxContextLen), + "num_prompt_tokens %d >= max_seq_len_ %" PRId64 + ", Max seq length exceeded - please increase max seq len value in your export script", + num_prompt_tokens, + metadata_.at(kMaxContextLen)); + + // Determine max_new_tokens using the GenerationConfig's resolve method + int max_new_tokens = config.resolve_max_new_tokens( + metadata_.at(kMaxContextLen), num_prompt_tokens); + + ET_LOG(Info, "Max new tokens resolved: %d", max_new_tokens); + + // Prefill first + // Here feed all tokens to the model and get the next predicted token + // after the prompt. After that we will enter generate loop. + + // print prompts + if (config.echo) { + wrapped_callback(prompt); + } + int64_t pos = 0; + auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos); + ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error()); + uint64_t cur_token = prefill_res.get(); + stats_->first_token_ms = time_in_ms(); + stats_->prompt_eval_end_ms = time_in_ms(); + + // print the first token from prefill. No prev_token so use cur_token for it. + wrapped_callback( + ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token))); + RUNNER_ET_LOG( + config.warming, + "RSS after prompt prefill: %f MiB (0 if unsupported)", + get_rss_bytes() / 1024.0 / 1024.0); + + // start the main loop + prompt_tokens.push_back(cur_token); + + // Generate max_new_tokens - 1 because prefill already generated 1 token. + int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate( + prompt_tokens, + num_prompt_tokens, + max_new_tokens - 1, + temperature_ == -1.0f ? 
config.temperature : temperature_, + wrapped_callback)); + + stats_->inference_end_ms = time_in_ms(); + if (!config.warming) { + printf("\n"); + } + RUNNER_ET_LOG( + config.warming, + "RSS after finishing text generation: %f MiB (0 if unsupported)", + get_rss_bytes() / 1024.0 / 1024.0); + + if (num_generated_tokens == max_new_tokens) { + RUNNER_ET_LOG(config.warming, "Max new tokens %i reached!", max_new_tokens); + } + + stats_->num_prompt_tokens = num_prompt_tokens; + stats_->num_generated_tokens = num_generated_tokens; + + if (config.warming) { + ET_LOG(Info, "Warmup run finished!"); + } else { + // Do not print report during warmup + print_report(*stats_); + } + if (stats_callback) { + stats_callback(*stats_); + } + + return Error::Ok; +} + +Error TextLLMRunner::warmup(const std::string& prompt, int32_t max_new_tokens) { + // Create a GenerationConfig for warmup + GenerationConfig config{ + .echo = false, .max_new_tokens = max_new_tokens, .warming = true}; + + // Call generate with the warmup config + Error err = generate(prompt, config); + + // Reset stats after warmup, not resetting the std::unique_ptr! + stats_->reset(); + return err; +} + +void TextLLMRunner::stop() { + if (is_loaded()) { + text_token_generator_->stop(); + } else { + ET_LOG(Error, "Token generator is not loaded, cannot stop"); + } +} + +std::unique_ptr load_tokenizer( + const std::string& tokenizer_path, + std::unique_ptr> special_tokens, + std::optional pattern, + size_t bos_token_index, + size_t eos_token_index) { + auto json_tokenizer = std::make_unique(); + if (json_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { + ET_LOG(Info, "Loaded json tokenizer"); + return json_tokenizer; + } + std::unique_ptr<::tokenizers::Tiktoken> tiktoken_tokenizer; + if (special_tokens != nullptr && !pattern.has_value()) { + tiktoken_tokenizer = std::make_unique<::tokenizers::Tiktoken>( + std::move(special_tokens), bos_token_index, eos_token_index); + } else if (special_tokens != nullptr && pattern.has_value()) { + tiktoken_tokenizer = std::make_unique<::tokenizers::Tiktoken>( + pattern.value(), + std::move(special_tokens), + bos_token_index, + eos_token_index); + } else { + tiktoken_tokenizer = std::make_unique<::tokenizers::Tiktoken>(); + } + if (tiktoken_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { + ET_LOG(Info, "Loaded TikToken tokenizer"); + return tiktoken_tokenizer; + } + + auto bpe_tokenizer = std::make_unique<::tokenizers::Llama2cTokenizer>(); + if (bpe_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { + ET_LOG(Info, "Loaded BPE tokenizer"); + return bpe_tokenizer; + } + + return nullptr; +} + +std::unordered_map get_llm_metadata( + tokenizers::Tokenizer* tokenizer, + Module* module) { + // Initialize metadata with default values + std::unordered_map metadata({ + {llm::kEnableDynamicShape, false}, + {llm::kMaxSeqLen, 128}, + {llm::kMaxContextLen, 128}, + {llm::kUseKVCache, true}, + {llm::kUseSDPAWithKVCache, false}, + }); + + // Read metadata from the model + auto method_names_result = module->method_names(); + if (method_names_result.error() != Error::Ok) { + ET_LOG(Error, "Failed reading method names"); + return metadata; + } + const auto method_names = method_names_result.get(); + + for (auto& pair : metadata) { + const auto& method_name = pair.first; + auto& value = pair.second; + + if (method_names.count(method_name)) { + auto get_result = module->get(method_name); + value = get_result.get().toScalar().to(); + } else { + ET_LOG( + Info, + "Method %s not found, using the default 
value %" PRId64, + method_name.c_str(), + value); + } + ET_LOG(Info, "Metadata: %s = %" PRId64, method_name.c_str(), value); + } + // Set tokenizer-related metadata + metadata[llm::kBosId] = tokenizer->bos_tok(); + metadata[llm::kVocabSize] = tokenizer->vocab_size(); + return metadata; +} + +std::unordered_set get_eos_ids( + tokenizers::Tokenizer* tokenizer, + Module* module) { + std::unordered_set eos_ids = {tokenizer->eos_tok()}; + // Get EOS IDs if available + auto method_names_result = module->method_names(); + if (method_names_result.error() != Error::Ok) { + ET_LOG(Error, "Failed reading method names"); + return eos_ids; + } + const auto method_names = method_names_result.get(); + + if (method_names.count(llm::kEosIds)) { + eos_ids.clear(); + auto execute_result = module->execute(llm::kEosIds); + if (execute_result.error() != Error::Ok) { + ET_LOG(Error, "Failed to execute %s", llm::kEosIds); + return eos_ids; + } + for (const auto& eos_id : execute_result.get()) { + auto value = eos_id.toScalar().to(); + eos_ids.emplace(value); + ET_LOG(Info, "eos_id = %" PRId64, value); + } + } + return eos_ids; +} + +std::unique_ptr create_text_llm_runner( + const std::string& model_path, + std::unique_ptr<::tokenizers::Tokenizer> tokenizer, + std::optional data_path, + float temperature) { + // Sanity check tokenizer + if (!tokenizer || !tokenizer->is_loaded()) { + ET_LOG(Error, "Tokenizer is null or not loaded"); + return nullptr; + } + + // Create the Module + std::unique_ptr module; + if (data_path.has_value()) { + module = std::make_unique( + model_path, data_path.value(), Module::LoadMode::File); + } else { + module = std::make_unique(model_path, Module::LoadMode::File); + } + + // Get metadata from Module + ET_LOG(Info, "Reading metadata from model"); + auto metadata = llm::get_llm_metadata(tokenizer.get(), module.get()); + + auto eos_ids = std::make_unique>( + llm::get_eos_ids(tokenizer.get(), module.get())); + + // Create text_decoder_runner. Use a shared_ptr so that it can be shared with + // TextPrefiller and TextTokenGenerator + auto text_decoder_runner = std::make_unique( + module.get(), metadata.at(kUseKVCache)); + + // Create text_prefiller + auto text_prefiller = std::make_unique( + text_decoder_runner.get(), + metadata.at(kUseKVCache), + metadata.at(kEnableDynamicShape), + metadata.at(kMaxSeqLen)); + + // Create text_token_generator with stats + auto stats = std::make_unique(); + auto text_token_generator = std::make_unique( + tokenizer.get(), + text_decoder_runner.get(), + metadata.at(kUseKVCache), + std::move(eos_ids), + stats.get()); + + // Create and return the Runner instance + return std::make_unique( + std::move(metadata), + std::move(tokenizer), + std::move(module), + std::move(text_decoder_runner), + std::move(text_prefiller), + std::move(text_token_generator), + std::move(stats), + temperature); +} + +} // namespace executorch::extension::llm diff --git a/extension/llm/runner/text_llm_runner.h b/extension/llm/runner/text_llm_runner.h new file mode 100644 index 00000000000..715688ba82c --- /dev/null +++ b/extension/llm/runner/text_llm_runner.h @@ -0,0 +1,183 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// A simple llama2 runner that includes preprocessing and post processing logic. +// The module takes in a string as input and emits a string as output. 
+ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace executorch::extension::llm { + +class ET_EXPERIMENTAL TextLLMRunner : public IRunner { + public: + /** + * @brief Constructor for TextLLMRunner with dependency injection + * + * Creates a TextLLMRunner instance with all required components for text + * generation. + * + * @param metadata Key-value pairs containing model metadata (e.g., + * vocab_size, context_length) + * @param tokenizer Tokenizer for converting between text and token IDs + * @param module The underlying model module that performs inference + * @param text_decoder_runner Component responsible for running the decoder + * part of the model + * @param text_prefiller Component for handling the prefill phase of text + * generation + * @param text_token_generator Component for generating tokens during the + * decode phase + * @param stats Statistics tracking object for performance monitoring + * @param temperature Temperature parameter for controlling randomness in + * generation (deprecated). Please use GenerationConfig.temperature instead. + */ + explicit TextLLMRunner( + std::unordered_map metadata, + std::unique_ptr<::tokenizers::Tokenizer> tokenizer, + std::unique_ptr<::executorch::extension::Module> module, + std::unique_ptr text_decoder_runner, + std::unique_ptr text_prefiller, + std::unique_ptr text_token_generator, + std::unique_ptr stats, + float temperature = -1.0f); + + /** + * @brief Checks if the model is loaded and ready for inference + * + * @return bool True if the model is loaded, false otherwise + */ + bool is_loaded() const override; + /** + * @brief Loads the model and prepares it for inference + * + * This method initializes all components and prepares the model for text + * generation. + * + * @return ::executorch::runtime::Error Success or error status + */ + ::executorch::runtime::Error load() override; + /** + * @brief Generates text based on the provided prompt + * + * This method performs text generation using the loaded model. It processes + * the input prompt, runs the model in prefill and decode phases, and returns + * generated text through callbacks. + * + * @param prompt The input text to generate from + * @param config Configuration parameters for text generation (e.g., + * max_new_tokens, temperature) + * @param token_callback Function called for each generated token with the + * decoded text + * @param stats_callback Function called with performance statistics + * @return ::executorch::runtime::Error Success or error status + */ + ::executorch::runtime::Error generate( + const std::string& prompt, + const GenerationConfig& config, + std::function token_callback = {}, + std::function stats_callback = {}) override; + /** + * @brief Warms up the model with a sample prompt + * + * This method runs a complete generation cycle without returning results, + * which helps initialize internal caches and optimize subsequent inferences. + * + * @param prompt The sample prompt to use for warmup + * @param max_new_tokens Maximum number of tokens to generate during warmup + * @return ::executorch::runtime::Error Success or error status + */ + ::executorch::runtime::Error warmup( + const std::string& prompt, + int32_t max_new_tokens); + /** + * @brief Stops the ongoing text generation process + * + * This method signals the generator to stop producing new tokens and + * terminate the current generation process. 
+ */ + void stop() override; + + private: + bool shouldStop_{false}; + + // Components + std::unique_ptr<::tokenizers::Tokenizer> tokenizer_; + std::unordered_map metadata_; + std::unique_ptr<::executorch::extension::Module> + module_; // Manage module's lifecycle, make sure it outlives + // text_decoder_runner_. + std::unique_ptr + text_decoder_runner_; // Manage text_decoder_runner_'s lifecycle, make + // sure it outlives text_prefiller_ & + // text_token_generator_. + std::unique_ptr text_prefiller_; + std::unique_ptr text_token_generator_; + + // Stats + std::unique_ptr stats_; + + // temperature. + // Deprecated, we should rely on the temperature in GenerationConfig instead. + float temperature_ = -1.0f; +}; + +/** + * @brief Loads a tokenizer from the specified path + * + * This function creates and initializes a tokenizer from a file, with options + * to customize special tokens and regex patterns. + * + * @param tokenizer_path Path to the tokenizer file + * @param special_tokens Optional list of special tokens to add to the tokenizer + * @param pattern Optional regex pattern for tokenization + * @param bos_token_index Index of the beginning-of-sequence token + * @param eos_token_index Index of the end-of-sequence token + * @return std::unique_ptr Initialized tokenizer instance + */ +ET_EXPERIMENTAL std::unique_ptr load_tokenizer( + const std::string& tokenizer_path, + std::unique_ptr> special_tokens = nullptr, + std::optional pattern = std::nullopt, + size_t bos_token_index = 0, + size_t eos_token_index = 1); + +/** + * @brief Creates a TextLLMRunner instance with the specified model and + * tokenizer + * + * This factory function creates and initializes a TextLLMRunner with all + * necessary components for text generation using the specified model and + * tokenizer. + * + * @param model_path Path to the model file + * @param tokenizer Initialized tokenizer instance + * @param data_path Optional path to additional data required by the model + * @param temperature Optional temperature parameter for controlling randomness + * (deprecated) + * @return std::unique_ptr Initialized TextLLMRunner instance + */ +ET_EXPERIMENTAL std::unique_ptr create_text_llm_runner( + const std::string& model_path, + std::unique_ptr<::tokenizers::Tokenizer> tokenizer, + std::optional data_path = std::nullopt, + float temperature = -1.0f); + +} // namespace executorch::extension::llm diff --git a/test/run_oss_cpp_tests.sh b/test/run_oss_cpp_tests.sh index 422cd579d04..32368661b19 100755 --- a/test/run_oss_cpp_tests.sh +++ b/test/run_oss_cpp_tests.sh @@ -32,6 +32,7 @@ build_executorch() { if [ -x "$(command -v glslc)" ]; then BUILD_VULKAN="ON" fi + # -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \ TODO(larryliu0820): Fix the name collision between Abseil and XNNPACK and turn this on. cmake . 
\
     -DCMAKE_INSTALL_PREFIX=cmake-out \
     -DEXECUTORCH_USE_CPP_CODE_COVERAGE=ON \
@@ -40,7 +41,6 @@ build_executorch() {
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
diff --git a/tools/cmake/cmake_deps.toml b/tools/cmake/cmake_deps.toml
index 6f12c9d4413..a033fba4929 100644
--- a/tools/cmake/cmake_deps.toml
+++ b/tools/cmake/cmake_deps.toml
@@ -241,6 +241,17 @@ deps = [
   "executorch_core",
 ]
 
+[targets.extension_tokenizers]
+buck_targets = [
+  "//extension/llm/tokenizers:sentencepiece",
+  "//extension/llm/tokenizers:tiktoken",
+  "//extension/llm/tokenizers:hf_tokenizer",
+  "//extension/llm/tokenizers:llama2c_tokenizer",
+]
+filters = [
+  ".cpp$",
+]
+
 [targets.extension_llm_runner]
 buck_targets = [
   "//extension/llm/runner:runner_lib",
@@ -257,6 +268,7 @@ deps = [
   "extension_flat_tensor",
   "extension_runner_util",
   "extension_tensor",
+  "extension_tokenizers",
 ]
 
 [targets.extension_tensor]
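
Usage sketch (reviewer note, not part of the patch): with this change, callers migrate from example::Runner::create() to the free function example::create_llama_runner(), which now returns the generic executorch::extension::llm::TextLLMRunner. The sketch below mirrors the updated examples/models/llama/main.cpp flow; the include path, model/tokenizer file names, prompt, and callback bodies are illustrative assumptions, GenerationConfig is populated only with fields that appear in this diff (echo, max_new_tokens), and the callback parameter types follow the existing llm::IRunner interface.

// Minimal driver sketch against the new factory API (paths are placeholders).
#include <cinttypes>
#include <cstdio>
#include <memory>
#include <string>

#include <executorch/examples/models/llama/runner/runner.h>  // assumed include path

using executorch::extension::llm::GenerationConfig;
using executorch::extension::llm::Stats;
using executorch::extension::llm::TextLLMRunner;

int main() {
  // create_llama_runner() loads the tokenizer and builds a TextLLMRunner
  // around the .pte module; it returns nullptr on failure.
  std::unique_ptr<TextLLMRunner> runner =
      example::create_llama_runner("llama3_2.pte", "tokenizer.model");
  if (runner == nullptr) {
    return 1;
  }

  GenerationConfig config;
  config.echo = true;           // echo the prompt before generated tokens
  config.max_new_tokens = 128;  // resolved against the model's context length metadata

  // generate() streams decoded pieces through the token callback and reports
  // timing/token counts through the stats callback once generation finishes.
  auto err = runner->generate(
      "Tell me a short story.",
      config,
      [](const std::string& piece) { /* pieces are also printed by the runner itself */ },
      [](const Stats& stats) {
        printf("generated %" PRId64 " tokens\n", stats.num_generated_tokens);
      });
  return err == executorch::runtime::Error::Ok ? 0 : 1;
}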
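
A second sketch for the lower-level API this patch adds in extension/llm/runner/text_llm_runner.h: llm::load_tokenizer() tries an HF JSON tokenizer, then Tiktoken, then the Llama2c BPE tokenizer, and llm::create_text_llm_runner() wires up Module, TextDecoderRunner, TextPrefiller and TextTokenGenerator from the resulting tokenizer plus the model's metadata methods. The helper function make_runner is hypothetical, as is relying on load_tokenizer()'s default special tokens, pattern, and BOS/EOS indices; llama callers would normally go through example::create_llama_runner(), which passes get_special_tokens(Version::Default) instead.

// Sketch: build a TextLLMRunner without the llama-specific helpers.
#include <memory>
#include <string>

#include <executorch/extension/llm/runner/text_llm_runner.h>  // assumed include path

namespace llm = executorch::extension::llm;

// Hypothetical helper: returns nullptr if the tokenizer or model cannot be loaded.
std::unique_ptr<llm::TextLLMRunner> make_runner(
    const std::string& model_path,
    const std::string& tokenizer_path) {
  // Default arguments assumed: no extra special tokens, no custom regex
  // pattern, BOS index 0 and EOS index 1 (see the load_tokenizer()
  // declaration in the header above).
  auto tokenizer = llm::load_tokenizer(tokenizer_path);
  if (tokenizer == nullptr) {
    return nullptr;
  }
  // create_text_llm_runner() reads the exported metadata methods
  // (use_kv_cache, get_max_context_len, get_eos_ids, ...) to configure
  // prefill and token generation.
  return llm::create_text_llm_runner(model_path, std::move(tokenizer));
}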