Skip to content

Commit f88daa5

Browse files
committed
add qwen tokenizer
1 parent fd693ac commit f88daa5

File tree

8 files changed

+1278
-2
lines changed

8 files changed

+1278
-2
lines changed

clip.hpp

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66

77
/*================================================== CLIPTokenizer ===================================================*/
88

9-
std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
9+
__STATIC_INLINE__ std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
1010
std::regex re("<lora:([^:]+):([^>]+)>");
1111
std::smatch matches;
1212
std::unordered_map<std::string, float> filename2multiplier;
@@ -31,7 +31,7 @@ std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remov
3131
return std::make_pair(filename2multiplier, text);
3232
}
3333

34-
std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
34+
__STATIC_INLINE__ std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
3535
std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
3636
std::set<int> byte_set;
3737
for (int b = static_cast<int>('!'); b <= static_cast<int>('~'); ++b) {
@@ -398,6 +398,7 @@ class CLIPTokenizer {
398398
}
399399
for (auto& token : matches) {
400400
std::string token_str = token.str();
401+
LOG_DEBUG("%s", token_str.c_str());
401402
std::u32string utf32_token;
402403
for (int i = 0; i < token_str.length(); i++) {
403404
unsigned char b = token_str[i];

examples/cli/main.cpp

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,8 @@
2727

2828
#include "avi_writer.h"
2929

30+
#include "qwen.hpp"
31+
3032
#if defined(_WIN32)
3133
#define NOMINMAX
3234
#include <windows.h>
@@ -1138,6 +1140,20 @@ bool load_images_from_dir(const std::string dir,
11381140

11391141
int main(int argc, const char* argv[]) {
11401142
SDParams params;
1143+
params.verbose = true;
1144+
sd_set_log_callback(sd_log_cb, (void*)&params);
1145+
auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
1146+
return false;
1147+
};
1148+
// auto tokenizer = CLIPTokenizer();
1149+
auto tokenizer = Qwen::Qwen2Tokenizer();
1150+
std::string text("a lovely cat");
1151+
auto tokens = tokenizer.encode(text, on_new_token_cb);
1152+
for (auto token : tokens) {
1153+
std::cout << token << " ";
1154+
}
1155+
std::cout << std::endl;
1156+
exit(1);
11411157
parse_args(argc, argv, params);
11421158
params.sample_params.guidance.slg.layers = params.skip_layers.data();
11431159
params.sample_params.guidance.slg.layer_count = params.skip_layers.size();

model.cpp

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@
1616
#include "stable-diffusion.h"
1717
#include "util.h"
1818
#include "vocab.hpp"
19+
#include "vocab_qwen.hpp"
1920
#include "vocab_umt5.hpp"
2021

2122
#include "ggml-alloc.h"
@@ -1939,6 +1940,11 @@ std::string ModelLoader::load_merges() {
19391940
return merges_utf8_str;
19401941
}
19411942

1943+
// Returns the contents of the qwen2_merges_utf8_c_str array as a std::string.
// The (pointer, count) constructor copies exactly sizeof(qwen2_merges_utf8_c_str)
// bytes, so the result's length is the array's full size in bytes and does not
// depend on NUL termination.
// NOTE(review): qwen2_merges_utf8_c_str is presumably defined in vocab_qwen.hpp
// and holds the Qwen2 tokenizer merges text as UTF-8 — confirm.
std::string ModelLoader::load_qwen2_merges() {
1944+
std::string merges_utf8_str(reinterpret_cast<const char*>(qwen2_merges_utf8_c_str), sizeof(qwen2_merges_utf8_c_str));
1945+
return merges_utf8_str;
1946+
}
1947+
19421948
std::string ModelLoader::load_t5_tokenizer_json() {
19431949
std::string json_str(reinterpret_cast<const char*>(t5_tokenizer_json_str), sizeof(t5_tokenizer_json_str));
19441950
return json_str;

model.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -258,6 +258,7 @@ class ModelLoader {
258258
~ModelLoader() = default;
259259

260260
static std::string load_merges();
261+
static std::string load_qwen2_merges();
261262
static std::string load_t5_tokenizer_json();
262263
static std::string load_umt5_tokenizer_json();
263264
};

0 commit comments

Comments
 (0)