diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9aa53004b03..5258e966233 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -871,6 +871,10 @@ if(EXECUTORCH_BUILD_WASM)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/wasm)
 endif()
 
+if(EXECUTORCH_BUILD_TOKENIZERS_WASM)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/wasm/tokenizers)
+endif()
+
 if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/training)
   list(APPEND _executorch_extensions extension_training)
diff --git a/extension/wasm/tokenizers/CMakeLists.txt b/extension/wasm/tokenizers/CMakeLists.txt
new file mode 100644
index 00000000000..03b7ea1ff6b
--- /dev/null
+++ b/extension/wasm/tokenizers/CMakeLists.txt
@@ -0,0 +1,41 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Please keep this file formatted by running:
+# ~~~
+# cmake-format -i CMakeLists.txt
+# ~~~
+
+cmake_minimum_required(VERSION 3.29)
+
+if(NOT CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 17)
+endif()
+
+if(NOT EMSCRIPTEN)
+  message(FATAL_ERROR "Emscripten is required to build this target")
+endif()
+
+# Source root directory for executorch (three levels up from extension/wasm/tokenizers/).
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
+endif()
+
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+set(_common_compile_options -Wno-deprecated-declarations -fPIC -Wall -Werror)
+set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+ +set(link_libraries) +list(APPEND link_libraries embind tokenizers::tokenizers) + +add_library(tokenizers_wasm OBJECT tokenizers.cpp) + +target_compile_options(tokenizers_wasm PUBLIC ${_common_compile_options}) +target_include_directories( + tokenizers_wasm PUBLIC ${_common_include_directories} +) + +target_link_libraries(tokenizers_wasm PUBLIC ${link_libraries}) diff --git a/extension/wasm/tokenizers/README.md b/extension/wasm/tokenizers/README.md new file mode 100644 index 00000000000..e1c48992e94 --- /dev/null +++ b/extension/wasm/tokenizers/README.md @@ -0,0 +1,66 @@ +# Tokenizers JavaScript Bindings + +This directory contains the JavaScript bindings for the [LLM Tokenizers](../../llm/README.md#tokenizer) library. + +## Building + +To build Tokenizers for Wasm, make sure to use the `emcmake cmake` command and to have `EXECUTORCH_BUILD_TOKENIZERS_WASM` and `EXECUTORCH_BUILD_EXTENSION_LLM` enabled. For example: + +```bash +# Configure the build with the Emscripten environment variables +emcmake cmake . -DEXECUTORCH_BUILD_TOKENIZERS_WASM=ON \ + -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -Bcmake-out-wasm + +# Build the Wasm extension +cmake --build cmake-out-wasm --target tokenizers_wasm -j32 +``` + +Emscripten modules are loaded into the global `Module` object by default. This means you cannot have multiple modules in the same page. If you are also using the ExecuTorch Wasm bindings, it is recommended to use the `MODULARIZE` option to avoid conflicts. 
+
+In your CMakeLists.txt, add the following lines:
+
+```cmake
+add_executable(tokenizers_wasm_lib) # Emscripten outputs this as a JS and Wasm file
+target_link_libraries(tokenizers_wasm_lib PRIVATE tokenizers_wasm)
+target_link_options(tokenizers_wasm_lib PRIVATE -sMODULARIZE=1 -sEXPORT_NAME=loadTokenizers) # If EXPORT_NAME is not set, the default is Module, which will conflict with ExecuTorch
+```
+
+You can then access the module with `mod = await loadTokenizers();` or `loadTokenizers().then(mod => { /* ... */ });`.
+
+For example, to load the module in an HTML file, you can use the following:
+
+```html
+
+
+```
+
+You can read more about Modularized Output in the [Emscripten docs](https://emscripten.org/docs/compiling/Modularized-Output.html).
+
+## JavaScript API
+
+### Supported Tokenizers
+- `HFTokenizer`
+- `SPTokenizer`
+- `Tiktoken`
+- `Llama2cTokenizer`
+
+### Tokenizer API
+- `load(data)`: Load tokenizer data from a file or a buffer.
+- `encode(text, bos=0, eos=0)`: Encode a string into a list of tokens with the number of bos tokens to prepend and eos tokens to append to the result.
+- `decode(prev, current)`: Decode a single token into a string, given the previous token for context.
+- `vocabSize`: The number of tokens in the vocabulary.
+- `eosTok`: The end-of-sequence token.
+- `bosTok`: The beginning-of-sequence token.
+- `isLoaded`: Whether the tokenizer is loaded.
diff --git a/extension/wasm/tokenizers/tokenizers.cpp b/extension/wasm/tokenizers/tokenizers.cpp
new file mode 100644
index 00000000000..b1558464f20
--- /dev/null
+++ b/extension/wasm/tokenizers/tokenizers.cpp
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace emscripten; +using tokenizers::Error; +using tokenizers::HFTokenizer; +using tokenizers::Llama2cTokenizer; +using tokenizers::SPTokenizer; +using tokenizers::Tekken; +using tokenizers::Tiktoken; +using tokenizers::Tokenizer; + +#define THROW_JS_ERROR(errorType, message, ...) \ + ({ \ + char msg_buf[256]; \ + int len = snprintf(msg_buf, sizeof(msg_buf), message, ##__VA_ARGS__); \ + if (len < sizeof(msg_buf)) { \ + EM_ASM(throw new errorType(UTF8ToString($0)), msg_buf); \ + } else { \ + std::string msg; \ + msg.resize(len); \ + snprintf(&msg[0], len + 1, message, ##__VA_ARGS__); \ + EM_ASM(throw new errorType(UTF8ToString($0)), msg.c_str()); \ + } \ + __builtin_unreachable(); \ + }) + +/// Throws a JavaScript Error with the provided message if `error` is not `Ok`. +#define THROW_IF_ERROR(error, message, ...) \ + ({ \ + if ET_UNLIKELY ((error) != Error::Ok) { \ + THROW_JS_ERROR(Error, message, ##__VA_ARGS__); \ + } \ + }) + +namespace executorch { +namespace extension { +namespace wasm { +namespace tokenizers { + +namespace { + +#define JS_FORALL_TOKENIZERS(_) \ + _(HFTokenizer) \ + _(Tiktoken) \ + _(SPTokenizer) \ + _(Llama2cTokenizer) \ + _(Tekken) + +/** + * EXPERIMENTAL: JavaScript wrapper for Tokenizer. + */ +template +class ET_EXPERIMENTAL JsTokenizer { + static_assert( + std::is_base_of::value, + "T must be a subclass of Tokenizer"); + + public: + JsTokenizer() : tokenizer_(std::make_unique()) {} + JsTokenizer(const JsTokenizer&) = delete; + JsTokenizer& operator=(const JsTokenizer&) = delete; + JsTokenizer(JsTokenizer&&) = default; + JsTokenizer& operator=(JsTokenizer&&) = default; + + void load_from_uint8_array(val data) { + // Tokenizer API can't load from a buffer, so we need to write the buffer to + // a temporary file and load from there. 
+    static const char* tmpFileName = "tokenizer_input_buffer.tmp";
+    FILE* tmp_file = fopen(tmpFileName, "wb");
+    if (tmp_file == nullptr) {
+      THROW_JS_ERROR(Error, "Failed to open file");
+    }
+    size_t length = data["length"].as<size_t>();
+    std::vector<uint8_t> buffer(length);
+    val memory_view = val(typed_memory_view(length, buffer.data()));
+    memory_view.call<void>("set", data);
+    fwrite(buffer.data(), sizeof(uint8_t), length, tmp_file);
+    fclose(tmp_file);
+    Error error = tokenizer_->load(tmpFileName);
+    // Remove the temp file before checking the error so it is cleaned up
+    // even when load() fails (THROW_IF_ERROR throws and would skip it).
+    remove(tmpFileName);
+    THROW_IF_ERROR(error, "Failed to load tokenizer");
+  }
+
+  void load(val data) {
+    if (data.isString()) {
+      Error error = tokenizer_->load(data.as<std::string>());
+      THROW_IF_ERROR(error, "Failed to load tokenizer");
+    } else if (data.instanceof (val::global("Uint8Array"))) {
+      return load_from_uint8_array(data);
+    } else if (data.instanceof (val::global("ArrayBuffer"))) {
+      return load_from_uint8_array(val::global("Uint8Array").new_(data));
+    } else {
+      THROW_JS_ERROR(
+          TypeError,
+          "Unsupported data type: %s",
+          data.typeOf().as<std::string>().c_str());
+    }
+  }
+
+  val encode(const std::string& text, int8_t bos, int8_t eos) const {
+    auto res = tokenizer_->encode(text, bos, eos);
+    THROW_IF_ERROR(res.error(), "Failed to encode text");
+    return val::array(res.get().begin(), res.get().end());
+  }
+
+  val encode(const std::string& text, int8_t bos) const {
+    return encode(text, bos, 0);
+  }
+
+  val encode(const std::string& text) const {
+    return encode(text, 0);
+  }
+
+  std::string decode(uint64_t prev, uint64_t current) const {
+    auto res = tokenizer_->decode(prev, current);
+    THROW_IF_ERROR(res.error(), "Failed to decode token");
+    return res.get();
+  }
+
+  uint64_t vocab_size() const {
+    return tokenizer_->vocab_size();
+  }
+
+  uint64_t bos_tok() const {
+    return tokenizer_->bos_tok();
+  }
+
+  uint64_t eos_tok() const {
+    return tokenizer_->eos_tok();
+  }
+
+  bool is_loaded() const {
+    return tokenizer_->is_loaded();
+  }
+
+ private:
+  std::unique_ptr<T> tokenizer_;
+};
+
+} // namespace
+ +EMSCRIPTEN_BINDINGS(TokenizerModule) { +#define JS_BIND_TOKENIZER(NAME) \ + class_>(#NAME) \ + .constructor<>() \ + .function("load", &JsTokenizer::load) \ + .function( \ + "encode", \ + select_overload( \ + &JsTokenizer::encode)) \ + .function( \ + "encode", \ + select_overload( \ + &JsTokenizer::encode)) \ + .function( \ + "encode", \ + select_overload( \ + &JsTokenizer::encode)) \ + .function("decode", &JsTokenizer::decode) \ + .property("vocabSize", &JsTokenizer::vocab_size) \ + .property("bosTok", &JsTokenizer::bos_tok) \ + .property("eosTok", &JsTokenizer::eos_tok) \ + .property("isLoaded", &JsTokenizer::is_loaded); + JS_FORALL_TOKENIZERS(JS_BIND_TOKENIZER) +} + +} // namespace tokenizers +} // namespace wasm +} // namespace extension +} // namespace executorch diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake index 76e7eba53cf..fb0dc0a4ade 100644 --- a/tools/cmake/preset/default.cmake +++ b/tools/cmake/preset/default.cmake @@ -155,6 +155,10 @@ define_overridable_option( define_overridable_option( EXECUTORCH_BUILD_WASM "Build the ExecuTorch JavaScript API" BOOL OFF ) +define_overridable_option( + EXECUTORCH_BUILD_TOKENIZERS_WASM "Build the JavaScript Tokenizers API" BOOL + OFF +) if(EXECUTORCH_BUILD_ARM_BAREMETAL) set(_default_executorch_build_pthreadpool OFF) @@ -333,6 +337,11 @@ check_required_options_on( EXECUTORCH_BUILD_EXTENSION_TENSOR ) +check_required_options_on( + IF_ON EXECUTORCH_BUILD_TOKENIZERS_WASM REQUIRES + EXECUTORCH_BUILD_EXTENSION_LLM +) + if(NOT EXISTS ${EXECUTORCH_PAL_DEFAULT_FILE_PATH}) message( FATAL_ERROR