diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9aa53004b03..5258e966233 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -871,6 +871,10 @@ if(EXECUTORCH_BUILD_WASM)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/wasm)
endif()
+if(EXECUTORCH_BUILD_TOKENIZERS_WASM)
+ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/wasm/tokenizers)
+endif()
+
if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/training)
list(APPEND _executorch_extensions extension_training)
diff --git a/extension/wasm/tokenizers/CMakeLists.txt b/extension/wasm/tokenizers/CMakeLists.txt
new file mode 100644
index 00000000000..03b7ea1ff6b
--- /dev/null
+++ b/extension/wasm/tokenizers/CMakeLists.txt
@@ -0,0 +1,41 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Please keep this file formatted by running:
+# ~~~
+# cmake-format -i CMakeLists.txt
+# ~~~
+
+cmake_minimum_required(VERSION 3.29)
+
+if(NOT CMAKE_CXX_STANDARD)
+ set(CMAKE_CXX_STANDARD 17)
+endif()
+
+if(NOT EMSCRIPTEN)
+ message(FATAL_ERROR "Emscripten is required to build this target")
+endif()
+
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
+endif()
+
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+set(_common_compile_options -Wno-deprecated-declarations -fPIC -Wall -Werror)
+set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+
+set(link_libraries)
+list(APPEND link_libraries embind tokenizers::tokenizers)
+
+add_library(tokenizers_wasm OBJECT tokenizers.cpp)
+
+target_compile_options(tokenizers_wasm PUBLIC ${_common_compile_options})
+target_include_directories(
+ tokenizers_wasm PUBLIC ${_common_include_directories}
+)
+
+target_link_libraries(tokenizers_wasm PUBLIC ${link_libraries})
diff --git a/extension/wasm/tokenizers/README.md b/extension/wasm/tokenizers/README.md
new file mode 100644
index 00000000000..e1c48992e94
--- /dev/null
+++ b/extension/wasm/tokenizers/README.md
@@ -0,0 +1,66 @@
+# Tokenizers JavaScript Bindings
+
+This directory contains the JavaScript bindings for the [LLM Tokenizers](../../llm/README.md#tokenizer) library.
+
+## Building
+
+To build Tokenizers for Wasm, make sure to use the `emcmake cmake` command and to have `EXECUTORCH_BUILD_TOKENIZERS_WASM` and `EXECUTORCH_BUILD_EXTENSION_LLM` enabled. For example:
+
+```bash
+# Configure the build with the Emscripten environment variables
+emcmake cmake . -DEXECUTORCH_BUILD_TOKENIZERS_WASM=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
+ -DCMAKE_BUILD_TYPE=Release \
+ -Bcmake-out-wasm
+
+# Build the Wasm extension
+cmake --build cmake-out-wasm --target tokenizers_wasm -j32
+```
+
+Emscripten modules are loaded into the global `Module` object by default. This means you cannot have multiple modules in the same page. If you are also using the ExecuTorch Wasm bindings, it is recommended to use the `MODULARIZE` option to avoid conflicts.
+
+In your CMakeLists.txt, add the following lines:
+
+```cmake
+add_executable(tokenizers_wasm_lib) # Emscripten outputs this as a JS and Wasm file
+target_link_libraries(tokenizers_wasm_lib PRIVATE tokenizers_wasm)
+target_link_options(tokenizers_wasm_lib PRIVATE -sMODULARIZE=1 -sEXPORT_NAME=loadTokenizers) # If EXPORT_NAME is not set, the default is Module, which will conflict with ExecuTorch
+```
+
+You can then access the module with `mod = await loadTokenizers();` or `loadTokenizers().then(mod => { /* ... */ });`.
+
+For example, to load the module in an HTML file, you can use the following:
+
+```html
+<script src="tokenizers_wasm_lib.js"></script>
+<script>
+  loadTokenizers().then((mod) => { /* ... */ });
+</script>
+```
+
+You can read more about Modularized Output in the [Emscripten docs](https://emscripten.org/docs/compiling/Modularized-Output.html).
+
+## JavaScript API
+
+### Supported Tokenizers
+- `HFTokenizer`
+- `SpTokenizer`
+- `Tiktoken`
+- `Llama2cTokenizer`
+
+### Tokenizer API
+- `load(data)`: Load tokenizer data from a file or a buffer.
+- `encode(text, bos=0, eos=0)`: Encode a string into a list of tokens with the number of bos tokens to prepend and eos tokens to append to the result.
+- `decode(tokens)`: Decode a list of tokens into a string.
+- `vocabSize`: The number of tokens in the vocabulary.
+- `eosTok`: The end-of-sequence token.
+- `bosTok`: The beginning-of-sequence token.
+- `isLoaded`: Whether the tokenizer is loaded.
diff --git a/extension/wasm/tokenizers/tokenizers.cpp b/extension/wasm/tokenizers/tokenizers.cpp
new file mode 100644
index 00000000000..b1558464f20
--- /dev/null
+++ b/extension/wasm/tokenizers/tokenizers.cpp
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <emscripten.h>
+#include <emscripten/bind.h>
+#include <emscripten/val.h>
+#include <executorch/runtime/platform/compiler.h>
+#include <pytorch/tokenizers/hf_tokenizer.h>
+#include <pytorch/tokenizers/llama2c_tokenizer.h>
+#include <pytorch/tokenizers/sentencepiece.h>
+#include <pytorch/tokenizers/tekken.h>
+#include <pytorch/tokenizers/tiktoken.h>
+
+using namespace emscripten;
+using tokenizers::Error;
+using tokenizers::HFTokenizer;
+using tokenizers::Llama2cTokenizer;
+using tokenizers::SPTokenizer;
+using tokenizers::Tekken;
+using tokenizers::Tiktoken;
+using tokenizers::Tokenizer;
+
+#define THROW_JS_ERROR(errorType, message, ...) \
+ ({ \
+ char msg_buf[256]; \
+ int len = snprintf(msg_buf, sizeof(msg_buf), message, ##__VA_ARGS__); \
+ if (len < sizeof(msg_buf)) { \
+ EM_ASM(throw new errorType(UTF8ToString($0)), msg_buf); \
+ } else { \
+ std::string msg; \
+ msg.resize(len); \
+ snprintf(&msg[0], len + 1, message, ##__VA_ARGS__); \
+ EM_ASM(throw new errorType(UTF8ToString($0)), msg.c_str()); \
+ } \
+ __builtin_unreachable(); \
+ })
+
+/// Throws a JavaScript Error with the provided message if `error` is not `Ok`.
+#define THROW_IF_ERROR(error, message, ...) \
+ ({ \
+ if ET_UNLIKELY ((error) != Error::Ok) { \
+ THROW_JS_ERROR(Error, message, ##__VA_ARGS__); \
+ } \
+ })
+
+namespace executorch {
+namespace extension {
+namespace wasm {
+namespace tokenizers {
+
+namespace {
+
+#define JS_FORALL_TOKENIZERS(_) \
+ _(HFTokenizer) \
+ _(Tiktoken) \
+ _(SPTokenizer) \
+ _(Llama2cTokenizer) \
+ _(Tekken)
+
+/**
+ * EXPERIMENTAL: JavaScript wrapper for Tokenizer.
+ */
+template <typename T>
+class ET_EXPERIMENTAL JsTokenizer {
+  static_assert(
+      std::is_base_of<Tokenizer, T>::value,
+      "T must be a subclass of Tokenizer");
+
+ public:
+  JsTokenizer() : tokenizer_(std::make_unique<T>()) {}
+ JsTokenizer(const JsTokenizer&) = delete;
+ JsTokenizer& operator=(const JsTokenizer&) = delete;
+ JsTokenizer(JsTokenizer&&) = default;
+ JsTokenizer& operator=(JsTokenizer&&) = default;
+
+ void load_from_uint8_array(val data) {
+ // Tokenizer API can't load from a buffer, so we need to write the buffer to
+ // a temporary file and load from there.
+ static const char* tmpFileName = "tokenizer_input_buffer.tmp";
+ FILE* tmp_file = fopen(tmpFileName, "wb");
+ if (tmp_file == nullptr) {
+ THROW_JS_ERROR(Error, "Failed to open file");
+ }
+    size_t length = data["length"].as<size_t>();
+    std::vector<uint8_t> buffer(length);
+    val memory_view = val(typed_memory_view(length, buffer.data()));
+    memory_view.call<void>("set", data);
+ fwrite(buffer.data(), sizeof(uint8_t), length, tmp_file);
+ fclose(tmp_file);
+ Error error = tokenizer_->load(tmpFileName);
+ THROW_IF_ERROR(error, "Failed to load tokenizer");
+ remove(tmpFileName);
+ }
+
+ void load(val data) {
+ if (data.isString()) {
+      Error error = tokenizer_->load(data.as<std::string>());
+ THROW_IF_ERROR(error, "Failed to load tokenizer");
+ } else if (data.instanceof (val::global("Uint8Array"))) {
+ return load_from_uint8_array(data);
+ } else if (data.instanceof (val::global("ArrayBuffer"))) {
+ return load_from_uint8_array(val::global("Uint8Array").new_(data));
+ } else {
+ THROW_JS_ERROR(
+ TypeError,
+ "Unsupported data type: %s",
+          data.typeOf().as<std::string>().c_str());
+ }
+ }
+
+ val encode(const std::string& text, int8_t bos, int8_t eos) const {
+ auto res = tokenizer_->encode(text, bos, eos);
+ THROW_IF_ERROR(res.error(), "Failed to encode text");
+ return val::array(res.get().begin(), res.get().end());
+ }
+
+ val encode(const std::string& text, int8_t bos) const {
+ return encode(text, bos, 0);
+ }
+
+ val encode(const std::string& text) const {
+ return encode(text, 0);
+ }
+
+ std::string decode(uint64_t prev, uint64_t current) const {
+ auto res = tokenizer_->decode(prev, current);
+ THROW_IF_ERROR(res.error(), "Failed to decode token");
+ return res.get();
+ }
+
+ uint64_t vocab_size() const {
+ return tokenizer_->vocab_size();
+ }
+
+ uint64_t bos_tok() const {
+ return tokenizer_->bos_tok();
+ }
+
+ uint64_t eos_tok() const {
+ return tokenizer_->eos_tok();
+ }
+
+ bool is_loaded() const {
+ return tokenizer_->is_loaded();
+ }
+
+ private:
+  std::unique_ptr<T> tokenizer_;
+};
+
+} // namespace
+
+EMSCRIPTEN_BINDINGS(TokenizerModule) {
+#define JS_BIND_TOKENIZER(NAME)                                             \
+  class_<JsTokenizer<NAME>>(#NAME)                                          \
+      .constructor<>()                                                      \
+      .function("load", &JsTokenizer<NAME>::load)                           \
+      .function(                                                            \
+          "encode",                                                         \
+          select_overload<val(const std::string&, int8_t, int8_t) const>(   \
+              &JsTokenizer<NAME>::encode))                                  \
+      .function(                                                            \
+          "encode",                                                         \
+          select_overload<val(const std::string&, int8_t) const>(           \
+              &JsTokenizer<NAME>::encode))                                  \
+      .function(                                                            \
+          "encode",                                                         \
+          select_overload<val(const std::string&) const>(                   \
+              &JsTokenizer<NAME>::encode))                                  \
+      .function("decode", &JsTokenizer<NAME>::decode)                       \
+      .property("vocabSize", &JsTokenizer<NAME>::vocab_size)                \
+      .property("bosTok", &JsTokenizer<NAME>::bos_tok)                      \
+      .property("eosTok", &JsTokenizer<NAME>::eos_tok)                      \
+      .property("isLoaded", &JsTokenizer<NAME>::is_loaded);
+ JS_FORALL_TOKENIZERS(JS_BIND_TOKENIZER)
+}
+
+} // namespace tokenizers
+} // namespace wasm
+} // namespace extension
+} // namespace executorch
diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake
index 76e7eba53cf..fb0dc0a4ade 100644
--- a/tools/cmake/preset/default.cmake
+++ b/tools/cmake/preset/default.cmake
@@ -155,6 +155,10 @@ define_overridable_option(
define_overridable_option(
EXECUTORCH_BUILD_WASM "Build the ExecuTorch JavaScript API" BOOL OFF
)
+define_overridable_option(
+ EXECUTORCH_BUILD_TOKENIZERS_WASM "Build the JavaScript Tokenizers API" BOOL
+ OFF
+)
if(EXECUTORCH_BUILD_ARM_BAREMETAL)
set(_default_executorch_build_pthreadpool OFF)
@@ -333,6 +337,11 @@ check_required_options_on(
EXECUTORCH_BUILD_EXTENSION_TENSOR
)
+check_required_options_on(
+ IF_ON EXECUTORCH_BUILD_TOKENIZERS_WASM REQUIRES
+ EXECUTORCH_BUILD_EXTENSION_LLM
+)
+
if(NOT EXISTS ${EXECUTORCH_PAL_DEFAULT_FILE_PATH})
message(
FATAL_ERROR