Skip to content

Commit e6edc14

Browse files
SakuzyPeng and claude committed
feat(cli): add word-level LRC output with UTF-8 fix
Add new -olrcw/--output-lrc-word option for word-level LRC output with inline timestamps per token.

Key changes:
- Add output_lrc_word parameter and CLI option
- Implement output_lrc_word() function with per-token timestamps
- Fix UTF-8 multi-byte character handling (merge continuation bytes)
- Enable token_timestamps when output_lrc_word is set
- Handle diarize speaker prefix without breaking LRC format
- Update README.md with new option

The UTF-8 fix addresses issue #1798, where CJK characters were split across tokens with timestamps inserted between bytes.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 7aa8818 commit e6edc14

File tree

2 files changed

+106
-1
lines changed

2 files changed

+106
-1
lines changed

examples/cli/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ options:
3737
-ovtt, --output-vtt [false ] output result in a vtt file
3838
-osrt, --output-srt [false ] output result in a srt file
3939
-olrc, --output-lrc [false ] output result in a lrc file
40+
-olrcw, --output-lrc-word [false ] output result in a word-level lrc file
4041
-owts, --output-words [false ] output script for generating karaoke video
4142
-fp, --font-path [/System/Library/Fonts/Supplemental/Courier New Bold.ttf] path to a monospace font for karaoke video
4243
-ocsv, --output-csv [false ] output result in a CSV file

examples/cli/cli.cpp

Lines changed: 105 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ struct whisper_params {
6868
bool output_jsn = false;
6969
bool output_jsn_full = false;
7070
bool output_lrc = false;
71+
bool output_lrc_word = false; // word-level LRC output
7172
bool no_prints = false;
7273
bool print_special = false;
7374
bool print_colors = false;
@@ -179,6 +180,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
179180
else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
180181
else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
181182
else if (arg == "-olrc" || arg == "--output-lrc") { params.output_lrc = true; }
183+
else if (arg == "-olrcw"|| arg == "--output-lrc-word") { params.output_lrc_word = true; }
182184
else if (arg == "-fp" || arg == "--font-path") { params.font_path = ARGV_NEXT; }
183185
else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
184186
else if (arg == "-oj" || arg == "--output-json") { params.output_jsn = true; }
@@ -260,6 +262,7 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
260262
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
261263
fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
262264
fprintf(stderr, " -olrc, --output-lrc [%-7s] output result in a lrc file\n", params.output_lrc ? "true" : "false");
265+
fprintf(stderr, " -olrcw, --output-lrc-word [%-7s] output result in a word-level lrc file\n", params.output_lrc_word ? "true" : "false");
263266
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
264267
fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str());
265268
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
@@ -922,6 +925,106 @@ static void output_lrc(struct whisper_context * ctx, std::ofstream & fout, const
922925
}
923926
}
924927

928+
// Helper: report whether a byte carries the UTF-8 continuation pattern,
// i.e. its two most significant bits are "10" (10xxxxxx).
static bool is_utf8_continuation(unsigned char c) {
    // Shifting out the six payload bits leaves only the two marker bits.
    const unsigned char top_two_bits = c >> 6;
    return top_two_bits == 0x2;
}
932+
933+
// Helper: append "[mm:ss.xx]" followed by `text` to `line`.
//
// `timestamp` is interpreted as a count of 10 ms units. Nothing is appended
// when `text` is empty or `timestamp` is negative, so callers can flush a
// "no pending word yet" state without checking it themselves.
static void append_lrc_word(std::string & line, int64_t timestamp, const std::string & text) {
    const bool nothing_to_emit = timestamp < 0 || text.empty();
    if (nothing_to_emit) {
        return;
    }

    // Split the 10 ms count into minutes, seconds and hundredths of a second.
    const int64_t hundredths    = timestamp % 100;
    const int64_t whole_seconds = timestamp / 100;
    const int64_t seconds       = whole_seconds % 60;
    const int64_t minutes       = whole_seconds / 60;

    char stamp[16];
    snprintf(stamp, sizeof(stamp), "%02d:%02d.%02d",
             static_cast<int>(minutes), static_cast<int>(seconds), static_cast<int>(hundredths));

    line.append("[").append(stamp).append("]").append(text);
}
953+
954+
// Word-level LRC output with inline timestamps
955+
static void output_lrc_word(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
956+
fout << "[by:whisper.cpp]\n";
957+
958+
const int n_segments = whisper_full_n_segments(ctx);
959+
for (int i = 0; i < n_segments; ++i) {
960+
std::string line = "";
961+
const int n_tokens = whisper_full_n_tokens(ctx, i);
962+
963+
// Get speaker prefix if diarize is enabled (will be prepended to first word)
964+
std::string speaker_prefix = "";
965+
if (params.diarize && pcmf32s.size() == 2) {
966+
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
967+
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
968+
speaker_prefix = estimate_diarization_speaker(pcmf32s, t0, t1);
969+
}
970+
971+
std::string pending_text = "";
972+
int64_t pending_timestamp = -1;
973+
bool is_first_word = true;
974+
975+
for (int j = 0; j < n_tokens; ++j) {
976+
const char * token_text = whisper_full_get_token_text(ctx, i, j);
977+
whisper_token_data token_data = whisper_full_get_token_data(ctx, i, j);
978+
979+
// Skip special tokens (like [BLANK], timestamps, etc.)
980+
if (token_data.id >= whisper_token_eot(ctx)) {
981+
continue;
982+
}
983+
984+
// Skip empty tokens
985+
if (!token_text || !token_text[0]) {
986+
continue;
987+
}
988+
989+
// Use DTW timestamp if available, otherwise use t0
990+
int64_t t = (token_data.t_dtw >= 0) ? token_data.t_dtw : token_data.t0;
991+
if (t < 0) {
992+
// Fallback to segment start time if token timestamp is not available
993+
t = whisper_full_get_segment_t0(ctx, i);
994+
}
995+
996+
// Check if this token starts with a UTF-8 continuation byte
997+
bool is_continuation = is_utf8_continuation((unsigned char)token_text[0]);
998+
999+
if (is_continuation && !pending_text.empty()) {
1000+
// This token is a continuation of a multi-byte UTF-8 character
1001+
// Append to pending text without adding a new timestamp
1002+
pending_text += token_text;
1003+
} else {
1004+
// Flush pending text with its timestamp
1005+
append_lrc_word(line, pending_timestamp, pending_text);
1006+
1007+
// Start new pending, prepend speaker to first word
1008+
if (is_first_word && !speaker_prefix.empty()) {
1009+
pending_text = speaker_prefix + token_text;
1010+
is_first_word = false;
1011+
} else {
1012+
pending_text = token_text;
1013+
}
1014+
pending_timestamp = t;
1015+
}
1016+
}
1017+
1018+
// Flush remaining pending text
1019+
append_lrc_word(line, pending_timestamp, pending_text);
1020+
1021+
// Only output if we have actual content (line starts with timestamp)
1022+
if (!line.empty()) {
1023+
fout << line << "\n";
1024+
}
1025+
}
1026+
}
1027+
9251028

9261029
static void cb_log_disable(enum ggml_log_level , const char * , void * ) { }
9271030

@@ -1182,7 +1285,7 @@ int main(int argc, char ** argv) {
11821285
wparams.offset_ms = params.offset_t_ms;
11831286
wparams.duration_ms = params.duration_ms;
11841287

1185-
wparams.token_timestamps = params.output_wts || params.output_jsn_full || params.max_len > 0;
1288+
wparams.token_timestamps = params.output_wts || params.output_jsn_full || params.output_lrc_word || params.max_len > 0;
11861289
wparams.thold_pt = params.word_thold;
11871290
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
11881291
wparams.split_on_word = params.split_on_word;
@@ -1294,6 +1397,7 @@ int main(int argc, char ** argv) {
12941397
output_ext(csv, pcmf32s);
12951398
output_func(output_json, ".json", params.output_jsn, pcmf32s);
12961399
output_ext(lrc, pcmf32s);
1400+
output_func(output_lrc_word, ".word.lrc", params.output_lrc_word, pcmf32s);
12971401
output_func(output_score, ".score.txt", params.log_score, pcmf32s);
12981402

12991403
#undef output_ext

0 commit comments

Comments
 (0)