99#include < cmath>
1010#include < oniguruma.h>
1111#include < utf8proc/utf8proc.h>
12+ #include < iostream>
13+ #include " ujson.hpp"
1214#include " jinja.hpp"
1315
1416namespace tokenizer {
1517
18+ using json = ujson::json;
19+
1620// ==========================================
1721// C++11 Polyfills
1822// ==========================================
@@ -87,7 +91,7 @@ static std::string OnigurumaRegexEscape(const std::string& pattern) {
8791 return escaped;
8892}
8993
90- static std::string get_token_content (const nlohmann:: json& j) {
94+ static std::string get_token_content (const json& j) {
9195 if (j.is_string ()) return j.get <std::string>();
9296 if (j.is_object () && j.contains (" content" )) return j[" content" ].get <std::string>();
9397 return " " ;
@@ -560,8 +564,8 @@ class BPEModel : public Model {
560564 return out;
561565 }
562566
563- void load (const nlohmann:: json& v, const nlohmann:: json& m) {
564- for (auto it = v.begin (); it != v.end (); ++it) { vocab_[it.key ()] = it.value (); id_to_token_[it.value ()] = it.key (); }
567+ void load (const json& v, const json& m) {
568+ for (auto it = v.begin (); it != v.end (); ++it) { vocab_[it.key ()] = it.value (). get < int >() ; id_to_token_[it.value (). get < int > ()] = it.key (); }
565569 int rank = 0 ;
566570 for (const auto & item : m) {
567571 std::string s1, s2;
@@ -585,10 +589,10 @@ class WordPieceModel : public Model {
// Ctor: stores the unknown-token string, the continuing-subword prefix
// ("##"), and the per-word character cap; unk_token_id_ starts at -1 and
// is resolved against the vocabulary later, in load().
585589 WordPieceModel (const std::string& unk = " [UNK]" , const std::string& prefix = " ##" , int max_chars = 100 )
586590 : unk_token_(unk), continuing_subword_prefix_(prefix), max_input_chars_per_word_(max_chars), unk_token_id_(-1 ) {}
587591
588- void load (const nlohmann:: json& v) {
592+ void load (const json& v) {
589593 for (auto it = v.begin (); it != v.end (); ++it) {
590- vocab_[it.key ()] = it.value ();
591- id_to_token_[it.value ()] = it.key ();
594+ vocab_[it.key ()] = it.value (). get < int >() ;
595+ id_to_token_[it.value (). get < int >() ] = it.key ();
592596 }
593597 auto it = vocab_.find (unk_token_);
594598 if (it != vocab_.end ()) unk_token_id_ = it->second ;
@@ -659,7 +663,7 @@ class UnigramModel : public Model {
// Ctor: records the unknown-token id and whether byte-fallback encoding
// is enabled for characters absent from the vocabulary.
659663 UnigramModel (int unk_id = 0 , bool byte_fallback = false )
660664 : unk_token_id_(unk_id), byte_fallback_(byte_fallback) {}
661665
662- void load (const nlohmann:: json& v) {
666+ void load (const json& v) {
663667 int idx = 0 ;
664668 for (const auto & item : v) {
665669 if (item.is_array () && item.size () >= 2 ) {
@@ -1059,7 +1063,7 @@ struct PreTrainedTokenizer::Impl {
10591063 }
10601064 }
10611065
1062- bool load_from_json (PreTrainedTokenizer* public_api, const nlohmann:: json& j) {
1066+ bool load_from_json (PreTrainedTokenizer* public_api, const json& j) {
10631067 if (j.contains (" model" ) && j[" model" ].is_object ()) {
10641068 std::string model_type = j[" model" ].value (" type" , " " );
10651069 // Auto-detect model type if not specified
@@ -1118,7 +1122,7 @@ struct PreTrainedTokenizer::Impl {
11181122 if (j[" model" ].contains (" byte_fallback" )) byte_fallback = j[" model" ][" byte_fallback" ].get <bool >();
11191123
11201124 bool use_byte_level = false ;
1121- auto check_bl = [](const nlohmann:: json& c) -> bool {
1125+ auto check_bl = [](const json& c) -> bool {
11221126 if (!c.is_object ()) return false ;
11231127 if (c.value (" type" , " " ) == " ByteLevel" ) return true ;
11241128 if (c.contains (" pretokenizers" )) {
@@ -1132,9 +1136,9 @@ struct PreTrainedTokenizer::Impl {
11321136 }
11331137 return false ;
11341138 };
1135- if (check_bl (j.value (" pre_tokenizer" , nlohmann:: json ()))) use_byte_level = true ;
1136- if (check_bl (j.value (" post_processor" , nlohmann:: json ()))) use_byte_level = true ;
1137- if (check_bl (j.value (" decoder" , nlohmann:: json ()))) use_byte_level = true ;
1139+ if (check_bl (j.value (" pre_tokenizer" , json ()))) use_byte_level = true ;
1140+ if (check_bl (j.value (" post_processor" , json ()))) use_byte_level = true ;
1141+ if (check_bl (j.value (" decoder" , json ()))) use_byte_level = true ;
11381142
11391143 // If we have a ByteLevelPreTokenizer in the sequence, BPEModel should not do the mapping itself
11401144 bool pt_has_byte_level = false ;
@@ -1151,7 +1155,7 @@ struct PreTrainedTokenizer::Impl {
11511155 }
11521156 }
11531157 if (j.contains (" normalizer" ) && !j[" normalizer" ].is_null ()) {
1154- auto create_norm = [&](const nlohmann:: json& s) -> std::shared_ptr<Normalizer> {
1158+ auto create_norm = [&](const json& s) -> std::shared_ptr<Normalizer> {
11551159 std::string type = s.value (" type" , " " );
11561160 if (type == " NFKC" ) return std::make_shared<NFKCNormalizer>();
11571161 if (type == " Precompiled" ) {
@@ -1199,7 +1203,7 @@ struct PreTrainedTokenizer::Impl {
11991203 }
12001204 }
12011205 if (j.contains (" decoder" ) && !j[" decoder" ].is_null ()) {
1202- auto create_dec = [&](const nlohmann:: json& s) -> std::shared_ptr<Decoder> {
1206+ auto create_dec = [&](const json& s) -> std::shared_ptr<Decoder> {
12031207 std::string type = s.value (" type" , " " );
12041208 if (type == " Replace" ) {
12051209 std::string p;
@@ -1232,7 +1236,7 @@ struct PreTrainedTokenizer::Impl {
12321236 }
12331237 if (j.contains (" pre_tokenizer" ) && !j[" pre_tokenizer" ].is_null ()) {
12341238 auto pt = j[" pre_tokenizer" ];
1235- auto create_pt = [&](const nlohmann:: json& s) -> std::shared_ptr<PreTokenizer> {
1239+ auto create_pt = [&](const json& s) -> std::shared_ptr<PreTokenizer> {
12361240 std::string type = s.value (" type" , " " );
12371241 if (type == " Split" ) {
12381242 std::string p;
@@ -1269,22 +1273,22 @@ struct PreTrainedTokenizer::Impl {
12691273 }
12701274 if (j.contains (" post_processor" ) && !j[" post_processor" ].is_null ()) {
12711275 auto pp = j[" post_processor" ];
1272- auto ptl = [&](const nlohmann:: json& s) {
1276+ auto ptl = [&](const json& s) {
12731277 std::vector<TemplateProcessing::Step> steps;
12741278 if (s.contains (" single" )) {
1275- for (auto & i : s[" single" ]) {
1279+ for (const auto & i : s[" single" ]) {
12761280 if (i.contains (" SpecialToken" )) steps.push_back ({true , public_api->token_to_id (i[" SpecialToken" ][" id" ].get <std::string>())});
12771281 else if (i.contains (" Sequence" )) steps.push_back ({false , 0 });
12781282 }
12791283 this ->post_processor_ = std::make_shared<TemplateProcessing>(steps);
12801284 }
12811285 };
12821286 if (pp.value (" type" , " " ) == " TemplateProcessing" ) ptl (pp);
1283- else if (pp.value (" type" , " " ) == " Sequence" && pp.contains (" processors" )) { for (auto & s : pp[" processors" ]) if (s.value (" type" , " " ) == " TemplateProcessing" ) { ptl (s); break ; } }
1287+ else if (pp.value (" type" , " " ) == " Sequence" && pp.contains (" processors" )) { for (const auto & s : pp[" processors" ]) if (s.value (" type" , " " ) == " TemplateProcessing" ) { ptl (s); break ; } }
12841288 }
12851289 if (j.contains (" added_tokens" ) && j[" added_tokens" ].is_array ()) {
12861290 std::vector<std::string> cs;
1287- for (auto & item : j[" added_tokens" ]) {
1291+ for (const auto & item : j[" added_tokens" ]) {
12881292 std::string c = item.value (" content" , " " ); int id = item.value (" id" , -1 );
12891293 bool special = item.value (" special" , false );
12901294 bool lstrip = item.value (" lstrip" , false );
@@ -1358,30 +1362,29 @@ void PreTrainedTokenizer::set_chat_template(const std::string& t) {
13581362 impl_->chat_template_ = t;
13591363 impl_->jinja_template_ = std::make_shared<jinja::Template>(t);
13601364}
1361-
13621365std::string PreTrainedTokenizer::apply_chat_template (const ChatMessages& msgs, bool add_gen) const {
13631366 if (!impl_->jinja_template_ ) return " " ;
1364- nlohmann:: json j_msgs = nlohmann:: json::array ();
1367+ json j_msgs = json::array ();
13651368 for (const auto & m : msgs) j_msgs.push_back ({{" role" , m.first }, {" content" , m.second }});
1366- nlohmann:: json extra;
1369+ json extra = json::object () ;
13671370 extra[" bos_token" ] = id_to_token (impl_->special_tokens_ .bos );
13681371 extra[" eos_token" ] = id_to_token (impl_->special_tokens_ .eos );
1369- return impl_->jinja_template_ ->apply_chat_template (j_msgs, add_gen, nlohmann:: json::array (), extra);
1372+ return impl_->jinja_template_ ->apply_chat_template (j_msgs, add_gen, json::array (), extra);
13701373}
13711374
13721375std::string PreTrainedTokenizer::apply_chat_template (const std::string& json_str, bool add_generation_prompt) const {
13731376 if (!impl_->jinja_template_ ) return " " ;
1374- auto j_msgs = nlohmann:: json::parse (json_str, nullptr , false );
1377+ auto j_msgs = json::parse (json_str);
13751378 if (!j_msgs.is_array ()) return " " ;
1376- nlohmann:: json extra;
1379+ json extra = json::object () ;
13771380 extra[" bos_token" ] = id_to_token (impl_->special_tokens_ .bos );
13781381 extra[" eos_token" ] = id_to_token (impl_->special_tokens_ .eos );
1379- return impl_->jinja_template_ ->apply_chat_template (j_msgs, add_generation_prompt, nlohmann:: json::array (), extra);
1382+ return impl_->jinja_template_ ->apply_chat_template (j_msgs, add_generation_prompt, json::array (), extra);
13801383}
13811384
13821385bool PreTrainedTokenizer::load_from_json_str (const std::string& json_str) {
1383- auto j = nlohmann:: json::parse (json_str, nullptr , false );
1384- if (j.is_discarded ()) return false ;
1386+ auto j = json::parse (json_str);
1387+ if (j.is_null ()) return false ;
13851388 return impl_->load_from_json (this , j);
13861389}
13871390
@@ -1396,16 +1399,20 @@ void PreTrainedTokenizer::set_clean_up_tokenization_spaces(bool clean) {
13961399 std::shared_ptr<PreTrainedTokenizer> AutoTokenizer::from_pretrained (const std::string& path) {
13971400 auto tok = std::make_shared<PreTrainedTokenizer>();
13981401 std::ifstream f (path + " /tokenizer.json" ); if (!f.is_open ()) return nullptr ;
1399- nlohmann::json j; f >> j;
1402+ std::stringstream ss_j; ss_j << f.rdbuf ();
1403+ json j = json::parse (ss_j.str ());
1404+ if (j.is_null ()) return nullptr ;
1405+
14001406 std::ifstream fc (path + " /tokenizer_config.json" );
14011407 bool clean_up_spaces = false ;
14021408 if (fc.is_open ()) {
1403- nlohmann::json jc; fc >> jc; if (jc.contains (" chat_template" )) tok->set_chat_template (jc[" chat_template" ].get <std::string>());
1409+ std::stringstream ss_jc; ss_jc << fc.rdbuf ();
1410+ json jc = json::parse (ss_jc.str ());
1411+ if (jc.contains (" chat_template" )) tok->set_chat_template (jc[" chat_template" ].get <std::string>());
14041412 clean_up_spaces = jc.value (" clean_up_tokenization_spaces" , false );
14051413 j[" config_overrides" ] = jc;
14061414 }
1407- std::stringstream ss; ss << j;
1408- if (!tok->load_from_json_str (ss.str ())) return nullptr ;
1415+ if (!tok->load_from_json_str (j.dump ())) return nullptr ;
14091416 tok->set_clean_up_tokenization_spaces (clean_up_spaces);
14101417 return tok;
14111418 }
0 commit comments