import subprocess, sys, os, json
from datetime import datetime
from vosk import Model, KaldiRecognizer

# PCM decode target: 16 kHz, mono, signed 16-bit little-endian.
SAMPLE_RATE = 16_000
# Each s16le sample is 2 bytes, so one second of audio is this many bytes.
BYTES_PER_SECOND = SAMPLE_RATE * 2

class Transcriber:
    """Sliding-window transcriber built on a Vosk speech model.

    Input media is decoded with ffmpeg to 16 kHz mono s16le PCM, then a
    fresh ``KaldiRecognizer`` is run over each fixed-size window of bytes,
    so every window is recognized independently of the previous ones.
    """

    def __init__(self, model_path, window_size_sec=5, stride_sec=1):
        """
        model_path: path to a Vosk model directory.
        window_size_sec: context window size in seconds (e.g., 5 seconds).
        stride_sec: step size in seconds between window starts (e.g., 1 second).
        """
        self.model = Model(model_path)
        self.window_size = window_size_sec
        self.stride = stride_sec

    def _recognize_chunk(self, chunk):
        """Run a fresh recognizer over one PCM chunk and return its text.

        Prefers the finalized result; if the recognizer did not finalize on
        this chunk, falls back to the partial hypothesis (may be "").
        """
        rec = KaldiRecognizer(self.model, SAMPLE_RATE)
        rec.SetWords(True)
        if rec.AcceptWaveform(chunk):
            return json.loads(rec.Result()).get("text", "")
        return json.loads(rec.PartialResult()).get("partial", "")

    def transcribe(self, filename):
        """Transcribe *filename* with a sliding window.

        Returns a dict with start/end wall-clock timestamps, elapsed time,
        the window/stride settings, and ``"transcription"``: a dict mapping
        a running frame index to ``{"start_sec", "end_sec", "text"}``.

        Raises:
            FileNotFoundError: if *filename* does not exist.
            RuntimeError: if ffmpeg exits with a non-zero status.
        """
        # Validate the path before doing any work (the original first built a
        # recognizer that was never used -- that dead work is removed).
        if not os.path.exists(filename):
            raise FileNotFoundError(filename)

        # Decode anything ffmpeg understands to raw 16 kHz mono s16le PCM
        # written to stdout.
        ffmpeg_command = [
            "ffmpeg",
            "-nostdin",
            "-loglevel", "quiet",
            "-i", filename,
            "-ar", str(SAMPLE_RATE),
            "-ac", "1",
            "-f", "s16le",
            "-",
        ]

        transcription = {}
        start_time = datetime.now()

        # read() drains the whole stream, so no custom bufsize is needed and
        # there is no pipe-deadlock risk.
        with subprocess.Popen(ffmpeg_command, stdout=subprocess.PIPE) as process:
            audio = process.stdout.read()
        # BUG FIX: the original ignored ffmpeg failures and silently returned
        # an empty transcription. Popen.__exit__ has already waited on the
        # child, so returncode is set here.
        if process.returncode != 0:
            raise RuntimeError(
                f"ffmpeg exited with code {process.returncode} for {filename!r}"
            )

        # Convert window/stride to byte counts. int() tolerates fractional
        # seconds; BYTES_PER_SECOND is even, so whole-second windows stay
        # aligned to 2-byte s16le samples.
        window_bytes = int(self.window_size * BYTES_PER_SECOND)
        stride_bytes = int(self.stride * BYTES_PER_SECOND)

        total_len = len(audio)
        frame_index = 0
        covered = 0  # bytes covered by the windows emitted so far

        # Slide a full window across the audio; each window gets its own
        # fresh recognizer via _recognize_chunk.
        for start in range(0, total_len - window_bytes + 1, stride_bytes):
            transcription[frame_index] = {
                "start_sec": start // BYTES_PER_SECOND,
                "end_sec": (start + window_bytes) // BYTES_PER_SECOND,
                "text": self._recognize_chunk(audio[start:start + window_bytes]),
            }
            frame_index += 1
            covered = start + window_bytes

        # Tail handling. BUG FIX: the original tested
        # ``total_len % stride_bytes != 0``, which skipped genuine tails when
        # the length happened to be a stride multiple, duplicated the final
        # window when the loop already reached the end, and produced a
        # negative start_sec for audio shorter than one window. Emit a tail
        # only when the loop did not reach the end of the audio.
        if covered < total_len:
            tail = audio[-window_bytes:]  # whole audio when shorter than a window
            if tail:
                transcription[frame_index] = {
                    "start_sec": max(0, total_len - window_bytes) // BYTES_PER_SECOND,
                    "end_sec": total_len // BYTES_PER_SECOND,
                    "text": self._recognize_chunk(tail),
                }

        end_time = datetime.now()
        time_elapsed = end_time - start_time

        return {
            "start_time": start_time.isoformat(),
            "end_time": end_time.isoformat(),
            "elapsed_time": str(time_elapsed),
            "window_size": self.window_size,
            "stride": self.stride,
            "transcription": transcription,  # {index: {start_sec, end_sec, text}}
        }