Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion src/whisper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6026,6 +6026,19 @@ static inline bool should_split_on_word(const char * txt, bool split_on_word) {
return txt[0] == ' ';
}

// Returns the number of UTF-8 characters (code points), not bytes, in the
// NUL-terminated string `str`.
//
// Every byte that is not a continuation byte (bit pattern 10xxxxxx) is treated
// as the start of a character. The input is not validated, so a stray
// continuation byte simply contributes nothing to the total.
static int utf8_len(const char * str) {
    int n_chars = 0;
    for (const char * p = str; *p != '\0'; ++p) {
        // the top two bits being "10" marks a continuation byte
        const bool is_continuation = (*p & 0xC0) == 0x80;
        n_chars += is_continuation ? 0 : 1;
    }
    return n_chars;
}

static void whisper_exp_compute_token_level_timestamps_dtw(
struct whisper_context * ctx,
struct whisper_state * state,
Expand Down Expand Up @@ -6054,7 +6067,7 @@ static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_sta
}

const auto txt = whisper_token_to_str(&ctx, token.id);
const int cur = strlen(txt);
const int cur = utf8_len(txt); // Use UTF-8 character count instead of byte count

if (acc + cur > max_len && i > 0 && should_split_on_word(txt, split_on_word)) {
state.result_all.back().text = std::move(text);
Expand Down
Loading