Update whisper params to match the struct in whisper.h (vocodedev#517)

ClericalAid · Christian · web-flow · commit 19f97fa39749 · 2024-04-02T14:21:32.000-07:00
Co-authored-by: Christian <chrish@greybok-debian.lan>
diff --git a/vocode/utils/whisper_cpp/whisper_params.py b/vocode/utils/whisper_cpp/whisper_params.py
@@ -5,13 +5,14 @@ class WhisperFullParams(ctypes.Structure):
     _fields_ = [
         ("strategy", ctypes.c_int),
         #
-        ("n_max_text_ctx", ctypes.c_int),
         ("n_threads", ctypes.c_int),
+        ("n_max_text_ctx", ctypes.c_int),
         ("offset_ms", ctypes.c_int),
         ("duration_ms", ctypes.c_int),
         #
         ("translate", ctypes.c_bool),
         ("no_context", ctypes.c_bool),
+        ("no_timestamps", ctypes.c_bool),
         ("single_segment", ctypes.c_bool),
         ("print_special", ctypes.c_bool),
         ("print_progress", ctypes.c_bool),
@@ -26,13 +27,17 @@ class WhisperFullParams(ctypes.Structure):
         ("max_tokens", ctypes.c_int),
         #
         ("speed_up", ctypes.c_bool),
+        ("debug_mode", ctypes.c_bool),
         ("audio_ctx", ctypes.c_int),
         #
+        ("tdrz_enable", ctypes.c_bool),
+        #
         ("initial_prompt", ctypes.c_char_p),
         ("prompt_tokens", ctypes.c_void_p),
         ("prompt_n_tokens", ctypes.c_int),
         #
         ("language", ctypes.c_char_p),
+        ("detect_language", ctypes.c_bool),
         #
         ("suppress_blank", ctypes.c_bool),
         ("suppress_non_speech_tokens", ctypes.c_bool),
@@ -47,7 +52,7 @@ class WhisperFullParams(ctypes.Structure):
         ("no_speech_thold", ctypes.c_float),
         #
         ("greedy", ctypes.c_int * 1),
-        ("beam_search", ctypes.c_int * 3),
+        ("beam_search", ctypes.c_int * 2),
         #
         ("new_segment_callback", ctypes.c_void_p),
         ("new_segment_callback_user_data", ctypes.c_void_p),
@@ -60,4 +65,9 @@ class WhisperFullParams(ctypes.Structure):
         #
         ("logits_filter_callback", ctypes.c_void_p),
         ("logits_filter_callback_user_data", ctypes.c_void_p),
+        #
+        ("grammar_rules", ctypes.POINTER(ctypes.c_void_p)),
+        ("n_grammar_rules", ctypes.c_size_t),
+        ("i_start_rule", ctypes.c_size_t),
+        ("grammar_penalty", ctypes.c_float),
     ]

Original file line number	Diff line number	Diff line change
`@@ -5,13 +5,14 @@ class WhisperFullParams(ctypes.Structure):`
`5`	`5`	`_fields_ = [`
`6`	`6`	`("strategy", ctypes.c_int),`
`7`	`7`	`#`
`8`		`- ("n_max_text_ctx", ctypes.c_int),`
`9`	`8`	`("n_threads", ctypes.c_int),`
	`9`	`+ ("n_max_text_ctx", ctypes.c_int),`
`10`	`10`	`("offset_ms", ctypes.c_int),`
`11`	`11`	`("duration_ms", ctypes.c_int),`
`12`	`12`	`#`
`13`	`13`	`("translate", ctypes.c_bool),`
`14`	`14`	`("no_context", ctypes.c_bool),`
	`15`	`+ ("no_timestamps", ctypes.c_bool),`
`15`	`16`	`("single_segment", ctypes.c_bool),`
`16`	`17`	`("print_special", ctypes.c_bool),`
`17`	`18`	`("print_progress", ctypes.c_bool),`
`@@ -26,13 +27,17 @@ class WhisperFullParams(ctypes.Structure):`
`26`	`27`	`("max_tokens", ctypes.c_int),`
`27`	`28`	`#`
`28`	`29`	`("speed_up", ctypes.c_bool),`
	`30`	`+ ("debug_mode", ctypes.c_bool),`
`29`	`31`	`("audio_ctx", ctypes.c_int),`
`30`	`32`	`#`
	`33`	`+ ("tdrz_enable", ctypes.c_bool),`
	`34`	`+ #`
`31`	`35`	`("initial_prompt", ctypes.c_char_p),`
`32`	`36`	`("prompt_tokens", ctypes.c_void_p),`
`33`	`37`	`("prompt_n_tokens", ctypes.c_int),`
`34`	`38`	`#`
`35`	`39`	`("language", ctypes.c_char_p),`
	`40`	`+ ("detect_language", ctypes.c_bool),`
`36`	`41`	`#`
`37`	`42`	`("suppress_blank", ctypes.c_bool),`
`38`	`43`	`("suppress_non_speech_tokens", ctypes.c_bool),`
`@@ -47,7 +52,7 @@ class WhisperFullParams(ctypes.Structure):`
`47`	`52`	`("no_speech_thold", ctypes.c_float),`
`48`	`53`	`#`
`49`	`54`	`("greedy", ctypes.c_int * 1),`
`50`		`- ("beam_search", ctypes.c_int * 3),`
	`55`	`+ ("beam_search", ctypes.c_int * 2),`
`51`	`56`	`#`
`52`	`57`	`("new_segment_callback", ctypes.c_void_p),`
`53`	`58`	`("new_segment_callback_user_data", ctypes.c_void_p),`
`@@ -60,4 +65,9 @@ class WhisperFullParams(ctypes.Structure):`
`60`	`65`	`#`
`61`	`66`	`("logits_filter_callback", ctypes.c_void_p),`
`62`	`67`	`("logits_filter_callback_user_data", ctypes.c_void_p),`
	`68`	`+ #`
	`69`	`+ ("grammar_rules", ctypes.POINTER(ctypes.c_void_p)),`
	`70`	`+ ("n_grammar_rules", ctypes.c_size_t),`
	`71`	`+ ("i_start_rule", ctypes.c_size_t),`
	`72`	`+ ("grammar_penalty", ctypes.c_float),`
`63`	`73`	`]`