Commit 4fcd3d8

feat(json): switch JSON backend to ujson with RapidJSON support
1 parent 1619cc2

44 files changed: +16,739 −59 lines

.github/workflows/build.yml

Lines changed: 40 additions & 9 deletions

```diff
@@ -7,32 +7,63 @@ on:
     branches: [ "main" ]
 
 jobs:
-  build:
+  generate-data:
     runs-on: ubuntu-latest
-
     steps:
     - uses: actions/checkout@v3
       with:
         submodules: recursive
 
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.10'
+
     - name: Install dependencies
+      run: |
+        pip install modelscope transformers
+
+    - name: Generate Tests Data
+      run: |
+        cd tests
+        python generate_assets.py
+
+    - name: Upload models
+      uses: actions/upload-artifact@v4
+      with:
+        name: models
+        path: tests/models/
+
+  build-and-test:
+    needs: generate-data
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        ujson_use_rapidjson: ["OFF", "ON"]
+    name: build-test (RapidJSON=${{ matrix.ujson_use_rapidjson }})
+    steps:
+    - uses: actions/checkout@v3
+      with:
+        submodules: recursive
+
+    - name: Download models
+      uses: actions/download-artifact@v4
+      with:
+        name: models
+        path: tests/models/
+
+    - name: Install build dependencies
       run: |
         sudo apt-get update
         sudo apt-get install -y cmake g++
-        pip install modelscope transformers
 
     - name: Build
       run: |
         mkdir build
         cd build
-        cmake .. -DCMAKE_BUILD_TYPE=Release
+        cmake .. -DCMAKE_BUILD_TYPE=Release -DUJSON_USE_RAPIDJSON=${{ matrix.ujson_use_rapidjson }}
         make -j$(nproc)
 
-    - name: Generate Tests Data
-      run: |
-        cd tests
-        python generate_assets.py
-
     - name: Run Tests
       run: |
         cd build
```

README.md

Lines changed: 18 additions & 5 deletions

````diff
@@ -11,11 +11,9 @@ It provides a high-performance C++ implementation for modern LLM tokenization pi
 ## Features
 
 - **HuggingFace Compatible**: Loads directly from `tokenizer.json`.
-- **Comprehensive Support**: Supports BPE, WordPiece, and Unigram models.
-- **Complex Normalization**: Implements NFKC, Sequence, Prepend, Replace, and more.
-- **Advanced Pre-tokenization**: Supports ByteLevel, Digits, Split, and Regex-based patterns (GPT-2/4 style).
-- **Efficient**: Optimized C++ implementation using minimal dependencies.
-- **Self-Contained**: Includes pruned versions of optimizations like Oniguruma for minimal footprint.
+- **Dual JSON Backend**: Supports both `nlohmann/json` and `RapidJSON` via the `ujson` bridge.
+- **Efficient**: Optimized C++ implementation with nearly 2x faster loading using RapidJSON.
+- **Self-Contained**: Includes pruned Oniguruma for minimal footprint.
 
 ## Supported Models
 
@@ -42,7 +40,10 @@ The library allows easy loading and usage of tokenizers.
 ```bash
 mkdir build
 cd build
+# Default: uses nlohmann/json
 cmake ..
+# Optional: use RapidJSON for 2x faster loading
+cmake .. -DUJSON_USE_RAPIDJSON=ON
 make
 ```
 
@@ -82,6 +83,18 @@ int main() {
 }
 ```
 
+## Performance
+
+The library is optimized for loading speed, especially for large models. Using the `RapidJSON` backend provides a significant performance boost:
+
+| Metric (41 Models / 1691 Cases) | nlohmann/json | RapidJSON (via ujson) | Speedup |
+| :--- | :--- | :--- | :--- |
+| **Total Loading Time** | 92.40 s | 47.13 s | **1.96x** |
+| **Total Encode Time** | 0.25 s | 0.23 s | 1.07x |
+| **Total Time** | 92.65 s | 47.36 s | **1.95x** |
+
+*Benchmarks conducted on 41 different model architectures.*
+
 ## Documentation
 
 For deep technical details on the implementation and architecture, see [doc/implementation_details_CN.md](doc/implementation_details_CN.md).
````
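
The commit leaves the public API of the README's usage example unchanged; both JSON backends sit behind the same calls. For orientation, a minimal usage sketch built from the two entry points this diff touches in `src/tokenizer.cpp`, `tokenizer::AutoTokenizer::from_pretrained` and `apply_chat_template`; the model path and message contents are illustrative:

```cpp
#include <iostream>
#include "tokenizer.hpp"

int main() {
  // Illustrative path; from_pretrained reads tokenizer.json (and, if present,
  // tokenizer_config.json) from this directory and returns nullptr on failure.
  auto tok = tokenizer::AutoTokenizer::from_pretrained("tests/models/qwen");
  if (!tok) return 1;

  // ChatMessages entries are (role, content) pairs, as used in tokenizer.cpp.
  tokenizer::ChatMessages msgs = {{"user", "Hello!"}};
  std::cout << tok->apply_chat_template(msgs, /*add_gen=*/true) << std::endl;
  return 0;
}
```

Which backend parses the files underneath is decided entirely at build time by `-DUJSON_USE_RAPIDJSON`.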

README_CN.md

Lines changed: 19 additions & 6 deletions

````diff
@@ -10,12 +10,10 @@
 
 ## Key Features
 
-- **HuggingFace Compatible**: Loads standard `tokenizer.json` files directly, no conversion required.
-- **Comprehensive Support**: Supports the BPE (Byte-Pair Encoding), WordPiece, and Unigram algorithms.
-- **Complex Normalization**: Ships with NFKC, Sequence, Prepend, Replace, and other normalizers.
-- **Advanced Pre-tokenization**: Supports ByteLevel, Digits, Split, and complex Regex-based splitting (faithfully reproducing the GPT-2/4 style).
-- **Efficient and Lightweight**: Optimized C++ implementation with minimal dependencies.
-- **Self-Contained**: Bundles a heavily pruned Oniguruma regex engine, minimizing size while keeping strong Unicode support.
+- **HuggingFace Compatible**: Loads standard `tokenizer.json` files directly.
+- **Dual JSON Backend**: Supports both `nlohmann/json` and `RapidJSON` via the `ujson` bridge.
+- **High Performance**: Optimized C++ implementation; the RapidJSON backend loads roughly 2x faster.
+- **Lightweight and Self-Contained**: Bundles a pruned Oniguruma to minimize binary size.
 
 ## Supported Models
 
@@ -40,7 +38,10 @@
 ```bash
 mkdir build
 cd build
+# Default: uses nlohmann/json
 cmake ..
+# Optional: use RapidJSON for 2x faster loading
+cmake .. -DUJSON_USE_RAPIDJSON=ON
 make
 ```
 
@@ -80,6 +81,18 @@ int main() {
 }
 ```
 
+## Performance
+
+The library is heavily optimized for loading speed, especially for very large model configuration files. The `RapidJSON` backend delivers a significant speedup:
+
+| Metric (41 Models / 1691 Test Cases) | nlohmann/json | RapidJSON (via ujson) | Speedup |
+| :--- | :--- | :--- | :--- |
+| **Total Loading Time** | 92.40 s | 47.13 s | **1.96x** |
+| **Total Encode Time** | 0.25 s | 0.23 s | 1.07x |
+| **Total Time** | 92.65 s | 47.36 s | **1.95x** |
+
+*Benchmarks cover 41 different model architectures.*
+
 ## Documentation
 
 For an in-depth look at the project architecture and implementation, see [doc/implementation_details_CN.md](doc/implementation_details_CN.md).
````

doc/implementation_details_CN.md

Lines changed: 4 additions & 2 deletions

```diff
@@ -49,9 +49,11 @@
 * **Pruning**: Removed all non-UTF-8 encoding support (EUC-JP, SJIS, etc.) and the POSIX/GNU compatibility layers, keeping only the core regex engine.
 * **Size Optimization**: Adds very little to the final binary size, far less than pulling in ICU or the full Oniguruma.
 
-### 2. JSON Loading and Compatibility
+### 2. JSON Loading and Performance Optimization
 * Parses the standard HuggingFace `tokenizer.json` directly.
-* Uses `nlohmann::json` to handle complex nested configurations (e.g. a `Sequence` nested inside `normalizer`, itself nesting a `Replace`).
+* **ujson Bridge**: Introduces a `ujson` bridge layer that allows switching flexibly between `nlohmann/json` and `RapidJSON`.
+* **Backend Switching**: Developers can enable the RapidJSON backend via the `UJSON_USE_RAPIDJSON` macro.
+* **Performance Gain**: In large-scale tests across 40+ models, the RapidJSON backend cuts model loading time from ~92 s to ~47 s, a nearly **2x** loading speedup.
 * Implements factory-pattern loading for the polymorphic `pre_tokenizer` and `normalizer` types.
 
 ### 3. Unicode Handling
```
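
`ujson.hpp` itself is among the files hidden from this commit view, so the bridge's exact contents are not shown. Judging from the call sites in `src/tokenizer.cpp` below (`json::parse`, `dump()`, `value()`, `contains()`, `get<T>()`) and the `UJSON_USE_RAPIDJSON` option, a plausible minimal shape is a compile-time alias; the adapter header and namespace on the RapidJSON side are assumptions:

```cpp
// Hypothetical sketch of ujson.hpp (the real header is not shown in this
// commit view). It exposes one alias, ujson::json, with an nlohmann-style
// interface, and selects the backend at compile time.
#pragma once

#if defined(UJSON_USE_RAPIDJSON)
  // Assumed: a thin adapter giving rapidjson::Document an nlohmann-like
  // interface (parse/dump/value/contains/get<T>), since RapidJSON's native
  // API differs substantially from nlohmann's.
  #include "ujson_rapidjson_adapter.hpp"
  namespace ujson { using json = rapidjson_adapter::json; }
#else
  #include <nlohmann/json.hpp>
  namespace ujson { using json = nlohmann::json; }
#endif
```

With this shape, the single `using json = ujson::json;` added to `tokenizer.cpp` becomes the only coupling point, which is why the rest of the diff below is largely a mechanical `nlohmann::json` to `json` rename.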

src/tokenizer.cpp

Lines changed: 39 additions & 32 deletions

```diff
@@ -9,10 +9,14 @@
 #include <cmath>
 #include <oniguruma.h>
 #include <utf8proc/utf8proc.h>
+#include <iostream>
+#include "ujson.hpp"
 #include "jinja.hpp"
 
 namespace tokenizer {
 
+using json = ujson::json;
+
 // ==========================================
 // C++11 Polyfills
 // ==========================================
@@ -87,7 +91,7 @@ static std::string OnigurumaRegexEscape(const std::string& pattern) {
   return escaped;
 }
 
-static std::string get_token_content(const nlohmann::json& j) {
+static std::string get_token_content(const json& j) {
   if (j.is_string()) return j.get<std::string>();
   if (j.is_object() && j.contains("content")) return j["content"].get<std::string>();
   return "";
@@ -560,8 +564,8 @@ class BPEModel : public Model {
     return out;
   }
 
-  void load(const nlohmann::json& v, const nlohmann::json& m) {
-    for (auto it = v.begin(); it != v.end(); ++it) { vocab_[it.key()] = it.value(); id_to_token_[it.value()] = it.key(); }
+  void load(const json& v, const json& m) {
+    for (auto it = v.begin(); it != v.end(); ++it) { vocab_[it.key()] = it.value().get<int>(); id_to_token_[it.value().get<int>()] = it.key(); }
     int rank = 0;
     for (const auto& item : m) {
       std::string s1, s2;
@@ -585,10 +589,10 @@ class WordPieceModel : public Model {
   WordPieceModel(const std::string& unk = "[UNK]", const std::string& prefix = "##", int max_chars = 100)
     : unk_token_(unk), continuing_subword_prefix_(prefix), max_input_chars_per_word_(max_chars), unk_token_id_(-1) {}
 
-  void load(const nlohmann::json& v) {
+  void load(const json& v) {
     for (auto it = v.begin(); it != v.end(); ++it) {
-      vocab_[it.key()] = it.value();
-      id_to_token_[it.value()] = it.key();
+      vocab_[it.key()] = it.value().get<int>();
+      id_to_token_[it.value().get<int>()] = it.key();
     }
     auto it = vocab_.find(unk_token_);
     if (it != vocab_.end()) unk_token_id_ = it->second;
@@ -659,7 +663,7 @@ class UnigramModel : public Model {
   UnigramModel(int unk_id = 0, bool byte_fallback = false)
     : unk_token_id_(unk_id), byte_fallback_(byte_fallback) {}
 
-  void load(const nlohmann::json& v) {
+  void load(const json& v) {
     int idx = 0;
     for (const auto& item : v) {
       if (item.is_array() && item.size() >= 2) {
@@ -1059,7 +1063,7 @@ struct PreTrainedTokenizer::Impl {
     }
   }
 
-  bool load_from_json(PreTrainedTokenizer* public_api, const nlohmann::json& j) {
+  bool load_from_json(PreTrainedTokenizer* public_api, const json& j) {
     if (j.contains("model") && j["model"].is_object()) {
       std::string model_type = j["model"].value("type", "");
       // Auto-detect model type if not specified
@@ -1118,7 +1122,7 @@ struct PreTrainedTokenizer::Impl {
       if (j["model"].contains("byte_fallback")) byte_fallback = j["model"]["byte_fallback"].get<bool>();
 
       bool use_byte_level = false;
-      auto check_bl = [](const nlohmann::json& c) -> bool {
+      auto check_bl = [](const json& c) -> bool {
        if (!c.is_object()) return false;
        if (c.value("type", "") == "ByteLevel") return true;
        if (c.contains("pretokenizers")) {
@@ -1132,9 +1136,9 @@ struct PreTrainedTokenizer::Impl {
        }
        return false;
      };
-      if (check_bl(j.value("pre_tokenizer", nlohmann::json()))) use_byte_level = true;
-      if (check_bl(j.value("post_processor", nlohmann::json()))) use_byte_level = true;
-      if (check_bl(j.value("decoder", nlohmann::json()))) use_byte_level = true;
+      if (check_bl(j.value("pre_tokenizer", json()))) use_byte_level = true;
+      if (check_bl(j.value("post_processor", json()))) use_byte_level = true;
+      if (check_bl(j.value("decoder", json()))) use_byte_level = true;
 
       // If we have a ByteLevelPreTokenizer in the sequence, BPEModel should not do the mapping itself
       bool pt_has_byte_level = false;
@@ -1151,7 +1155,7 @@ struct PreTrainedTokenizer::Impl {
       }
     }
     if (j.contains("normalizer") && !j["normalizer"].is_null()) {
-      auto create_norm = [&](const nlohmann::json& s) -> std::shared_ptr<Normalizer> {
+      auto create_norm = [&](const json& s) -> std::shared_ptr<Normalizer> {
        std::string type = s.value("type", "");
        if (type == "NFKC") return std::make_shared<NFKCNormalizer>();
        if (type == "Precompiled") {
@@ -1199,7 +1203,7 @@ struct PreTrainedTokenizer::Impl {
      }
    }
    if (j.contains("decoder") && !j["decoder"].is_null()) {
-      auto create_dec = [&](const nlohmann::json& s) -> std::shared_ptr<Decoder> {
+      auto create_dec = [&](const json& s) -> std::shared_ptr<Decoder> {
        std::string type = s.value("type", "");
        if (type == "Replace") {
          std::string p;
@@ -1232,7 +1236,7 @@ struct PreTrainedTokenizer::Impl {
    }
    if (j.contains("pre_tokenizer") && !j["pre_tokenizer"].is_null()) {
      auto pt = j["pre_tokenizer"];
-      auto create_pt = [&](const nlohmann::json& s) -> std::shared_ptr<PreTokenizer> {
+      auto create_pt = [&](const json& s) -> std::shared_ptr<PreTokenizer> {
        std::string type = s.value("type", "");
        if (type == "Split") {
          std::string p;
@@ -1269,22 +1273,22 @@ struct PreTrainedTokenizer::Impl {
    }
    if (j.contains("post_processor") && !j["post_processor"].is_null()) {
      auto pp = j["post_processor"];
-      auto ptl = [&](const nlohmann::json& s) {
+      auto ptl = [&](const json& s) {
        std::vector<TemplateProcessing::Step> steps;
        if (s.contains("single")) {
-          for (auto& i : s["single"]) {
+          for (const auto& i : s["single"]) {
            if (i.contains("SpecialToken")) steps.push_back({true, public_api->token_to_id(i["SpecialToken"]["id"].get<std::string>())});
            else if (i.contains("Sequence")) steps.push_back({false, 0});
          }
          this->post_processor_ = std::make_shared<TemplateProcessing>(steps);
        }
      };
      if (pp.value("type", "") == "TemplateProcessing") ptl(pp);
-      else if (pp.value("type", "") == "Sequence" && pp.contains("processors")) { for (auto& s : pp["processors"]) if (s.value("type", "") == "TemplateProcessing") { ptl(s); break; } }
+      else if (pp.value("type", "") == "Sequence" && pp.contains("processors")) { for (const auto& s : pp["processors"]) if (s.value("type", "") == "TemplateProcessing") { ptl(s); break; } }
    }
    if (j.contains("added_tokens") && j["added_tokens"].is_array()) {
      std::vector<std::string> cs;
-      for (auto& item : j["added_tokens"]) {
+      for (const auto& item : j["added_tokens"]) {
        std::string c = item.value("content", ""); int id = item.value("id", -1);
        bool special = item.value("special", false);
        bool lstrip = item.value("lstrip", false);
@@ -1358,30 +1362,29 @@ void PreTrainedTokenizer::set_chat_template(const std::string& t) {
   impl_->chat_template_ = t;
   impl_->jinja_template_ = std::make_shared<jinja::Template>(t);
 }
-
 std::string PreTrainedTokenizer::apply_chat_template(const ChatMessages& msgs, bool add_gen) const {
   if (!impl_->jinja_template_) return "";
-  nlohmann::json j_msgs = nlohmann::json::array();
+  json j_msgs = json::array();
   for (const auto& m : msgs) j_msgs.push_back({{"role", m.first}, {"content", m.second}});
-  nlohmann::json extra;
+  json extra = json::object();
   extra["bos_token"] = id_to_token(impl_->special_tokens_.bos);
   extra["eos_token"] = id_to_token(impl_->special_tokens_.eos);
-  return impl_->jinja_template_->apply_chat_template(j_msgs, add_gen, nlohmann::json::array(), extra);
+  return impl_->jinja_template_->apply_chat_template(j_msgs, add_gen, json::array(), extra);
 }
 
 std::string PreTrainedTokenizer::apply_chat_template(const std::string& json_str, bool add_generation_prompt) const {
   if (!impl_->jinja_template_) return "";
-  auto j_msgs = nlohmann::json::parse(json_str, nullptr, false);
+  auto j_msgs = json::parse(json_str);
   if (!j_msgs.is_array()) return "";
-  nlohmann::json extra;
+  json extra = json::object();
   extra["bos_token"] = id_to_token(impl_->special_tokens_.bos);
   extra["eos_token"] = id_to_token(impl_->special_tokens_.eos);
-  return impl_->jinja_template_->apply_chat_template(j_msgs, add_generation_prompt, nlohmann::json::array(), extra);
+  return impl_->jinja_template_->apply_chat_template(j_msgs, add_generation_prompt, json::array(), extra);
 }
 
 bool PreTrainedTokenizer::load_from_json_str(const std::string& json_str) {
-  auto j = nlohmann::json::parse(json_str, nullptr, false);
-  if (j.is_discarded()) return false;
+  auto j = json::parse(json_str);
+  if (j.is_null()) return false;
   return impl_->load_from_json(this, j);
 }
 
@@ -1396,16 +1399,20 @@ void PreTrainedTokenizer::set_clean_up_tokenization_spaces(bool clean) {
 std::shared_ptr<PreTrainedTokenizer> AutoTokenizer::from_pretrained(const std::string& path) {
   auto tok = std::make_shared<PreTrainedTokenizer>();
   std::ifstream f(path + "/tokenizer.json"); if (!f.is_open()) return nullptr;
-  nlohmann::json j; f >> j;
+  std::stringstream ss_j; ss_j << f.rdbuf();
+  json j = json::parse(ss_j.str());
+  if (j.is_null()) return nullptr;
+
   std::ifstream fc(path + "/tokenizer_config.json");
   bool clean_up_spaces = false;
   if (fc.is_open()) {
-    nlohmann::json jc; fc >> jc; if (jc.contains("chat_template")) tok->set_chat_template(jc["chat_template"].get<std::string>());
+    std::stringstream ss_jc; ss_jc << fc.rdbuf();
+    json jc = json::parse(ss_jc.str());
+    if (jc.contains("chat_template")) tok->set_chat_template(jc["chat_template"].get<std::string>());
     clean_up_spaces = jc.value("clean_up_tokenization_spaces", false);
     j["config_overrides"] = jc;
   }
-  std::stringstream ss; ss << j;
-  if (!tok->load_from_json_str(ss.str())) return nullptr;
+  if (!tok->load_from_json_str(j.dump())) return nullptr;
   tok->set_clean_up_tokenization_spaces(clean_up_spaces);
   return tok;
 }
```
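
Two behavioral details of the bridge are visible above: vocabulary values now need an explicit `.get<int>()` (the bridge type evidently lacks nlohmann's implicit conversions), and a failed parse yields a null value checked with `is_null()`, replacing nlohmann's `parse(str, nullptr, false)` / `is_discarded()` idiom. A caller-side sketch of that contract, assuming `ujson::json::parse` returns null rather than throwing on malformed input:

```cpp
#include <iostream>
#include <string>
#include "ujson.hpp"

using json = ujson::json;

int main() {
  // Parse failure surfaces as a null value (as in load_from_json_str above),
  // not as nlohmann's discarded sentinel; non-throwing behavior is assumed.
  json bad = json::parse("{ not valid json ");
  std::cout << (bad.is_null() ? "parse failed" : "parsed") << std::endl;

  json ok = json::parse(R"({"model": {"type": "BPE"}})");
  if (!ok.is_null() && ok.contains("model") && ok["model"].is_object()) {
    // Values are read through explicit accessors, matching the diff's style.
    std::cout << "model type: " << ok["model"].value("type", "") << std::endl;
  }
  return 0;
}
```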

tests/test_main.cpp

Lines changed: 2 additions & 2 deletions

```diff
@@ -18,9 +18,9 @@
 #include "tokenizer.hpp"
 
 #include <utf8proc/utf8proc.h>
-#include <nlohmann/json.hpp>
+#include "ujson.hpp"
 
-using json = nlohmann::json;
+using json = ujson::json;
 
 // ==================== Color definitions ====================
 namespace Color {
```
