@@ -68,6 +68,7 @@ struct whisper_params {
6868 bool output_jsn = false ;
6969 bool output_jsn_full = false ;
7070 bool output_lrc = false ;
71+ bool output_lrc_word = false ; // word-level LRC output
7172 bool no_prints = false ;
7273 bool print_special = false ;
7374 bool print_colors = false ;
@@ -179,6 +180,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
179180 else if (arg == " -osrt" || arg == " --output-srt" ) { params.output_srt = true ; }
180181 else if (arg == " -owts" || arg == " --output-words" ) { params.output_wts = true ; }
181182 else if (arg == " -olrc" || arg == " --output-lrc" ) { params.output_lrc = true ; }
183+ else if (arg == " -olrcw" || arg == " --output-lrc-word" ) { params.output_lrc_word = true ; }
182184 else if (arg == " -fp" || arg == " --font-path" ) { params.font_path = ARGV_NEXT; }
183185 else if (arg == " -ocsv" || arg == " --output-csv" ) { params.output_csv = true ; }
184186 else if (arg == " -oj" || arg == " --output-json" ) { params.output_jsn = true ; }
@@ -260,6 +262,7 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
260262 fprintf (stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n " , params.output_vtt ? " true" : " false" );
261263 fprintf (stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n " , params.output_srt ? " true" : " false" );
262264 fprintf (stderr, " -olrc, --output-lrc [%-7s] output result in a lrc file\n " , params.output_lrc ? " true" : " false" );
265+ fprintf (stderr, " -olrcw, --output-lrc-word [%-7s] output result in a word-level lrc file\n " , params.output_lrc_word ? " true" : " false" );
263266 fprintf (stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n " , params.output_wts ? " true" : " false" );
264267 fprintf (stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n " , params.font_path .c_str ());
265268 fprintf (stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n " , params.output_csv ? " true" : " false" );
@@ -922,6 +925,106 @@ static void output_lrc(struct whisper_context * ctx, std::ofstream & fout, const
922925 }
923926}
924927
// Returns true when `c` is a UTF-8 continuation byte, i.e. its two most
// significant bits are `10` (values 0x80..0xBF). ASCII bytes and multi-byte
// lead bytes yield false.
static bool is_utf8_continuation(unsigned char c) {
    return (c & 0xC0) == 0x80;
}
932+
// Appends one "[mm:ss.xx]text" entry to `line`.
//
//   line      - LRC line under construction; modified in place.
//   timestamp - position of `text` in units of 10 ms (it is multiplied by 10
//               below to obtain milliseconds). A negative value means
//               "no timestamp".
//   text      - text placed immediately after the closing bracket.
//
// Nothing is appended when `text` is empty or `timestamp` is negative, which
// lets callers flush an "empty pending word" without special-casing it.
static void append_lrc_word(std::string & line, int64_t timestamp, const std::string & text) {
    if (text.empty() || timestamp < 0) {
        return;
    }

    // Split the timestamp into minutes / seconds / remaining milliseconds.
    int64_t msec = timestamp * 10;
    const int64_t min = msec / (1000 * 60);
    msec -= min * (1000 * 60);
    const int64_t sec = msec / 1000;
    msec -= sec * 1000;

    // The tag carries hundredths of a second, hence msec / 10.
    // snprintf bounds the write to the buffer size.
    char buf[16];
    snprintf(buf, sizeof(buf), "%02d:%02d.%02d",
             static_cast<int>(min), static_cast<int>(sec), static_cast<int>(msec / 10));

    line += "[";
    line += buf;
    line += "]";
    line += text;
}
953+
954+ // Word-level LRC output with inline timestamps
955+ static void output_lrc_word (struct whisper_context * ctx, std::ofstream & fout, const whisper_params & params, std::vector<std::vector<float >> pcmf32s) {
956+ fout << " [by:whisper.cpp]\n " ;
957+
958+ const int n_segments = whisper_full_n_segments (ctx);
959+ for (int i = 0 ; i < n_segments; ++i) {
960+ std::string line = " " ;
961+ const int n_tokens = whisper_full_n_tokens (ctx, i);
962+
963+ // Get speaker prefix if diarize is enabled (will be prepended to first word)
964+ std::string speaker_prefix = " " ;
965+ if (params.diarize && pcmf32s.size () == 2 ) {
966+ const int64_t t0 = whisper_full_get_segment_t0 (ctx, i);
967+ const int64_t t1 = whisper_full_get_segment_t1 (ctx, i);
968+ speaker_prefix = estimate_diarization_speaker (pcmf32s, t0, t1);
969+ }
970+
971+ std::string pending_text = " " ;
972+ int64_t pending_timestamp = -1 ;
973+ bool is_first_word = true ;
974+
975+ for (int j = 0 ; j < n_tokens; ++j) {
976+ const char * token_text = whisper_full_get_token_text (ctx, i, j);
977+ whisper_token_data token_data = whisper_full_get_token_data (ctx, i, j);
978+
979+ // Skip special tokens (like [BLANK], timestamps, etc.)
980+ if (token_data.id >= whisper_token_eot (ctx)) {
981+ continue ;
982+ }
983+
984+ // Skip empty tokens
985+ if (!token_text || !token_text[0 ]) {
986+ continue ;
987+ }
988+
989+ // Use DTW timestamp if available, otherwise use t0
990+ int64_t t = (token_data.t_dtw >= 0 ) ? token_data.t_dtw : token_data.t0 ;
991+ if (t < 0 ) {
992+ // Fallback to segment start time if token timestamp is not available
993+ t = whisper_full_get_segment_t0 (ctx, i);
994+ }
995+
996+ // Check if this token starts with a UTF-8 continuation byte
997+ bool is_continuation = is_utf8_continuation ((unsigned char )token_text[0 ]);
998+
999+ if (is_continuation && !pending_text.empty ()) {
1000+ // This token is a continuation of a multi-byte UTF-8 character
1001+ // Append to pending text without adding a new timestamp
1002+ pending_text += token_text;
1003+ } else {
1004+ // Flush pending text with its timestamp
1005+ append_lrc_word (line, pending_timestamp, pending_text);
1006+
1007+ // Start new pending, prepend speaker to first word
1008+ if (is_first_word && !speaker_prefix.empty ()) {
1009+ pending_text = speaker_prefix + token_text;
1010+ is_first_word = false ;
1011+ } else {
1012+ pending_text = token_text;
1013+ }
1014+ pending_timestamp = t;
1015+ }
1016+ }
1017+
1018+ // Flush remaining pending text
1019+ append_lrc_word (line, pending_timestamp, pending_text);
1020+
1021+ // Only output if we have actual content (line starts with timestamp)
1022+ if (!line.empty ()) {
1023+ fout << line << " \n " ;
1024+ }
1025+ }
1026+ }
1027+
9251028
9261029static void cb_log_disable (enum ggml_log_level , const char * , void * ) { }
9271030
@@ -1182,7 +1285,7 @@ int main(int argc, char ** argv) {
11821285 wparams.offset_ms = params.offset_t_ms ;
11831286 wparams.duration_ms = params.duration_ms ;
11841287
1185- wparams.token_timestamps = params.output_wts || params.output_jsn_full || params.max_len > 0 ;
1288+ wparams.token_timestamps = params.output_wts || params.output_jsn_full || params.output_lrc_word || params. max_len > 0 ;
11861289 wparams.thold_pt = params.word_thold ;
11871290 wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len ;
11881291 wparams.split_on_word = params.split_on_word ;
@@ -1294,6 +1397,7 @@ int main(int argc, char ** argv) {
12941397 output_ext (csv, pcmf32s);
12951398 output_func (output_json, " .json" , params.output_jsn , pcmf32s);
12961399 output_ext (lrc, pcmf32s);
1400+ output_func (output_lrc_word, " .word.lrc" , params.output_lrc_word , pcmf32s);
12971401 output_func (output_score, " .score.txt" , params.log_score , pcmf32s);
12981402
12991403#undef output_ext
0 commit comments