@@ -129,6 +129,9 @@ struct whisper_params {
129129 float vad_max_speech_duration_s = FLT_MAX;
130130 int vad_speech_pad_ms = 30 ;
131131 float vad_samples_overlap = 0 .1f ;
132+
133+ // Warmup parameters
134+ std::string warmup_file = " " ;
132135};
133136
134137void whisper_print_usage (int /* argc*/ , char ** argv, const whisper_params & params, const server_params& sparams) {
@@ -192,6 +195,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
192195 std::to_string (params.vad_max_speech_duration_s ).c_str ());
193196 fprintf (stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n " , params.vad_speech_pad_ms );
194197 fprintf (stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n " , params.vad_samples_overlap );
198+ fprintf (stderr, " -wf PATH, --warmup-file PATH [%-7s] path to audio file for model warmup\n " , params.warmup_file .c_str ());
195199 fprintf (stderr, " \n " );
196200}
197201
@@ -258,6 +262,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
258262 else if (arg == " -vmsd" || arg == " --vad-max-speech-duration-s" ) { params.vad_max_speech_duration_s = std::stof (argv[++i]); }
259263 else if (arg == " -vp" || arg == " --vad-speech-pad-ms" ) { params.vad_speech_pad_ms = std::stoi (argv[++i]); }
260264 else if (arg == " -vo" || arg == " --vad-samples-overlap" ) { params.vad_samples_overlap = std::stof (argv[++i]); }
265+ else if (arg == " -wf" || arg == " --warmup-file" ) { params.warmup_file = argv[++i]; }
261266 else {
262267 fprintf (stderr, " error: unknown argument: %s\n " , arg.c_str ());
263268 whisper_print_usage (argc, argv, params, sparams);
@@ -703,6 +708,41 @@ int main(int argc, char ** argv) {
703708
704709 // initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
705710 whisper_ctx_init_openvino_encoder (ctx, nullptr , params.openvino_encode_device .c_str (), nullptr );
711+
712+ // warmup model if warmup file is provided
713+ if (!params.warmup_file .empty ()) {
714+ printf (" Warming up model with audio file: %s\n " , params.warmup_file .c_str ());
715+ std::vector<float > pcmf32_warmup;
716+ std::vector<std::vector<float >> pcmf32s_warmup;
717+
718+ if (read_audio_data (params.warmup_file , pcmf32_warmup, pcmf32s_warmup, false )) {
719+ whisper_full_params wparams = whisper_full_default_params (WHISPER_SAMPLING_GREEDY);
720+ wparams.print_realtime = false ;
721+ wparams.print_progress = false ;
722+ wparams.print_timestamps = false ;
723+ wparams.print_special = false ;
724+ wparams.translate = false ;
725+ wparams.language = " en" ;
726+ wparams.n_threads = params.n_threads ;
727+ wparams.n_max_text_ctx = 128 ;
728+ wparams.no_context = true ;
729+ wparams.single_segment = true ;
730+ wparams.audio_ctx = 768 ;
731+
732+ const auto t_start = std::chrono::high_resolution_clock::now ();
733+
734+ if (whisper_full_parallel (ctx, wparams, pcmf32_warmup.data (), pcmf32_warmup.size (), 1 ) == 0 ) {
735+ const auto t_end = std::chrono::high_resolution_clock::now ();
736+ const auto t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count ();
737+ printf (" Model warmup completed in %d ms\n " , (int )t_ms);
738+ } else {
739+ fprintf (stderr, " warning: model warmup failed\n " );
740+ }
741+ } else {
742+ fprintf (stderr, " warning: failed to read warmup audio file '%s'\n " , params.warmup_file .c_str ());
743+ }
744+ }
745+
706746 state.store (SERVER_STATE_READY);
707747
708748
0 commit comments