1
#include <cstdio>
#include <cstdlib>
#include <exception>
#include <iostream>
#include <stdexcept>
#include <string>

#include "stream_components_audio.h"
#include "stream_components_params.h"
#include "stream_components_output.h"
#include "stream_components_server.h"
6
+
7
+ using namespace stream_components ;
8
+
9
+ struct whisper_params {
10
+ audio_params audio;
11
+ server_params server;
12
+
13
+ void initialize () {
14
+ audio.initialize ();
15
+ server.initialize ();
16
+ }
17
+ };
18
+
19
+
20
+ void whisper_print_usage (int argc, char **argv, const whisper_params ¶ms);
21
+
22
+ bool whisper_params_parse (int argc, char **argv, whisper_params ¶ms) {
23
+ for (int i = 1 ; i < argc; i++) {
24
+ std::string arg = argv[i];
25
+
26
+ if (arg == " -h" || arg == " --help" ) {
27
+ whisper_print_usage (argc, argv, params);
28
+ exit (0 );
29
+ } else if (arg == " -t" || arg == " --threads" ) { params.server .n_threads = std::stoi (argv[++i]); }
30
+ else if (arg == " --step" ) { params.audio .step_ms = std::stoi (argv[++i]); }
31
+ else if (arg == " --length" ) { params.audio .length_ms = std::stoi (argv[++i]); }
32
+ else if (arg == " --keep" ) { params.audio .keep_ms = std::stoi (argv[++i]); }
33
+ else if (arg == " -c" || arg == " --capture" ) { params.audio .capture_id = std::stoi (argv[++i]); }
34
+ // else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
35
+ else if (arg == " -ac" || arg == " --audio-ctx" ) { params.audio .audio_ctx = std::stoi (argv[++i]); }
36
+ else if (arg == " -vth" || arg == " --vad-thold" ) { params.audio .vad_thold = std::stof (argv[++i]); }
37
+ else if (arg == " -fth" || arg == " --freq-thold" ) { params.audio .freq_thold = std::stof (argv[++i]); }
38
+ else if (arg == " -su" || arg == " --speed-up" ) { params.server .speed_up = true ; }
39
+ else if (arg == " -tr" || arg == " --translate" ) { params.server .translate = true ; }
40
+ else if (arg == " -nf" || arg == " --no-fallback" ) { params.server .no_fallback = true ; }
41
+ // else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
42
+ else if (arg == " -kc" || arg == " --keep-context" ) { params.server .no_context = false ; }
43
+ else if (arg == " -l" || arg == " --language" ) { params.server .language = argv[++i]; }
44
+ else if (arg == " -m" || arg == " --model" ) { params.server .model = argv[++i]; }
45
+ // else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
46
+ else if (arg == " -tdrz" || arg == " --tinydiarize" ) { params.server .tinydiarize = true ; }
47
+ // else if (arg == "-sa" || arg == "--save-audio") { params.save_audio = true; }
48
+
49
+ else {
50
+ fprintf (stderr, " error: unknown argument: %s\n " , arg.c_str ());
51
+ whisper_print_usage (argc, argv, params);
52
+ exit (0 );
53
+ }
54
+ }
55
+
56
+ return true ;
57
+ }
58
+
59
+ void whisper_print_usage (int /* argc*/ , char **argv, const whisper_params ¶ms) {
60
+ fprintf (stderr, " \n " );
61
+ fprintf (stderr, " usage: %s [options]\n " , argv[0 ]);
62
+ fprintf (stderr, " \n " );
63
+ fprintf (stderr, " options:\n " );
64
+ fprintf (stderr, " -h, --help [default] show this help message and exit\n " );
65
+ fprintf (stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n " ,
66
+ params.server .n_threads );
67
+ fprintf (stderr, " --step N [%-7d] audio step size in milliseconds\n " , params.audio .step_ms );
68
+ fprintf (stderr, " --length N [%-7d] audio length in milliseconds\n " , params.audio .length_ms );
69
+ fprintf (stderr, " --keep N [%-7d] audio to keep from previous step in ms\n " , params.audio .keep_ms );
70
+ fprintf (stderr, " -c ID, --capture ID [%-7d] capture device ID\n " , params.audio .capture_id );
71
+ // fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
72
+ fprintf (stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n " , params.audio .audio_ctx );
73
+ fprintf (stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n " , params.audio .vad_thold );
74
+ fprintf (stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n " , params.audio .freq_thold );
75
+ fprintf (stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n " ,
76
+ params.server .speed_up ? " true" : " false" );
77
+ fprintf (stderr, " -tr, --translate [%-7s] translate from source language to english\n " ,
78
+ params.server .translate ? " true" : " false" );
79
+ fprintf (stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n " ,
80
+ params.server .no_fallback ? " true" : " false" );
81
+ // fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
82
+ fprintf (stderr, " -kc, --keep-context [%-7s] keep context between audio chunks\n " ,
83
+ params.server .no_context ? " false" : " true" );
84
+ fprintf (stderr, " -l LANG, --language LANG [%-7s] spoken language\n " , params.server .language .c_str ());
85
+ fprintf (stderr, " -m FNAME, --model FNAME [%-7s] model path\n " , params.server .model .c_str ());
86
+ // fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
87
+ fprintf (stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n " ,
88
+ params.server .tinydiarize ? " true" : " false" );
89
+ // fprintf(stderr, " -sa, --save-audio [%-7s] save the recorded audio to a file\n", params.save_audio ? "true" : "false");
90
+ fprintf (stderr, " \n " );
91
+ }
92
+
93
+ int main (int argc, char **argv) {
94
+
95
+ // Read parameters...
96
+ whisper_params params;
97
+
98
+ if (whisper_params_parse (argc, argv, params) == false ) {
99
+ return 1 ;
100
+ }
101
+
102
+ // Compute derived parameters
103
+ params.initialize ();
104
+
105
+ // Check parameters
106
+ if (params.server .language != " auto" && whisper_lang_id (params.server .language .c_str ()) == -1 ) {
107
+ fprintf (stderr, " error: unknown language '%s'\n " , params.server .language .c_str ());
108
+ whisper_print_usage (argc, argv, params);
109
+ exit (0 );
110
+ }
111
+
112
+ // Instantiate the audio input
113
+ stream_components::LocalSDLMicrophone audio (params.audio );
114
+
115
+ // Instantiate the server
116
+ stream_components::WhisperServer server (params.server , params.audio );
117
+
118
+ // Print the 'header'...
119
+ WhisperOutput::server_to_json (std::cout, params.server , server.ctx );
120
+
121
+ // Run until Ctrl + C
122
+ bool is_running = true ;
123
+ while (is_running) {
124
+
125
+ // handle Ctrl + C
126
+ is_running = sdl_poll_events ();
127
+ if (!is_running) {
128
+ break ;
129
+ }
130
+
131
+ // get next audio section
132
+ auto pcmf32 = audio.get_next ();
133
+
134
+ // get the whisper output
135
+ auto result = server.process (pcmf32.data (), pcmf32.size ());
136
+
137
+ // write the output as json to stdout (for this example)
138
+ if (result) {
139
+ result->transcription_to_json (std::cout);
140
+ }
141
+ }
142
+
143
+ std::cout << " EXITED MAIN LOOP" << std::endl;
144
+ return 0 ;
145
+ }
0 commit comments