Skip to content

Commit 439ca3b

Browse files
committed
fix deepseek deseret regex
On Windows, when compiled with GCC, the C++ regex library failed to handle the Deseret characters.
1 parent cc2983d commit 439ca3b

File tree

2 files changed

+22
-1
lines changed

2 files changed

+22
-1
lines changed

src/llama-vocab.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
389389
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
390390
regex_exprs = {
391391
"[\r\n]",
392-
"\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
392+
"\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-\U00010400-\U0001044f𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
393393
"\\s?[!-/:-~!-/:-~‘-‟ -。]+",
394394
"\\s+$",
395395
"[一-龥ࠀ-一가-퟿]+",

src/unicode.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@
22
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
33
#endif
44

5+
#if defined(_WIN32)
6+
#define WIN32_LEAN_AND_MEAN
7+
#include <windows.h>
8+
#endif
9+
510
#include "unicode.h"
611
#include "unicode-data.h"
712

@@ -201,8 +206,24 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
201206
}
202207

203208
// Converts a UTF-8 encoded string into a wide string.
//
// On Windows the Win32 MultiByteToWideChar API is used instead of
// std::wstring_convert; elsewhere std::wstring_convert with
// std::codecvt_utf8<wchar_t> is used.
//
// The whole byte range of `s` is converted on both paths, so embedded NUL
// bytes are preserved rather than truncating the result.
//
// Throws std::invalid_argument on Windows if the conversion fails or the
// input is too long to be described by an `int` length.
static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
#ifdef _WIN32
    // MultiByteToWideChar rejects a zero source length, so handle "" up front.
    if (s.empty()) {
        return std::wstring();
    }

    // The API takes the source length as an int; refuse inputs that do not fit.
    const int src_len = static_cast<int>(s.size());
    if (src_len < 0 || static_cast<size_t>(src_len) != s.size()) {
        throw std::invalid_argument("failed to convert regex");
    }

    // First call: query how many wide characters the result needs.
    const int wide_len = MultiByteToWideChar(CP_UTF8, 0, s.data(), src_len, nullptr, 0);
    if (wide_len <= 0) {
        throw std::invalid_argument("failed to convert regex");
    }

    // Convert straight into the std::wstring's own storage: no manual
    // malloc/free, so nothing can leak if an exception is thrown.
    std::wstring result(static_cast<size_t>(wide_len), L'\0');
    const int written = MultiByteToWideChar(CP_UTF8, 0, s.data(), src_len, &result[0], wide_len);
    if (written <= 0) {
        throw std::invalid_argument("failed to convert regex");
    }
    result.resize(static_cast<size_t>(written));
    return result;
#else
    std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
    return conv.from_bytes(s);
#endif
}
207228

208229
static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {

0 commit comments

Comments
 (0)