11import requests , json , os
2- import librosa
2+ import wave
3+ import numpy as np
4+ import threading
5+ from maix import app , time
36
47class SensevoiceClient :
def __init__(self, model="", url="http://0.0.0.0:12347", lauguage="auto", stream=False):
    """Create a client and validate the local preconditions.

    Args:
        model: Path to the model file; it must exist on disk.
        url: Base URL of the HTTP service this client talks to.
        lauguage: Language setting, stored as ``self.launguage``
            (the spelling is kept so existing callers keep working).
        stream: Whether the client is used in streaming mode.

    Raises:
        ValueError: If the "npu"/"ai_isp" system config value is non-zero,
            or if ``model`` does not exist.
    """
    self.model, self.url = model, url
    self.stream = stream
    self.launguage = lauguage

    # Bookkeeping for the background startup thread.
    self.thread = None
    self.thread_is_exit = False
    self.thread_exit_code = 0

    # The AI ISP setting must be off before this client can be used.
    self.last_ai_isp = int(app.get_sys_config_kv("npu", "ai_isp", "0"))
    if self.last_ai_isp != 0:
        raise ValueError("Please turn off AI ISP first, try: app.set_sys_config_kv('npu', 'ai_isp', '0')")

    if os.path.exists(model):
        return
    raise ValueError(f'Model {self.model} is not existed!')
23+
1024 def _check_service (self ):
1125 try :
1226 response = requests .get (self .url + '/status' )
@@ -16,12 +30,13 @@ def _check_service(self):
1630 return False
1731
1832 def _start_service (self ):
19- import time
2033 if not self ._check_service ():
2134 os .system ("systemctl start sensevoice.service" )
2235
36+ count = 0
2337 while not self ._check_service ():
24- print ("Waiting for service to start..." )
38+ count += 1
39+ print (f"Waiting for service to start({ count } )..." )
2540 time .sleep (1 )
2641
2742 return True
@@ -62,58 +77,159 @@ def _stop_model(self):
6277 except Exception as e :
6378 return False
6479
65- def start (self ):
80+ def _start_model_thread (self ):
81+ print ('Start service...' )
6682 if self ._start_service ():
6783 print ("Service started successfully." )
6884 else :
6985 print ("Failed to start service." )
86+ self .thread_is_exit = True
87+ self .thread_exit_code = 1
7088 return False
7189
90+ print ('Start model...' )
7291 if self ._start_model ():
7392 print ("Model started successfully." )
7493 else :
7594 print ("Failed to start model." )
95+ self .thread_is_exit = True
96+ self .thread_exit_code = 1
7697 return False
98+
99+ self .thread_is_exit = True
100+ self .thread_exit_code = 0
77101 return True
78102
79- def stop_model (self ):
80- self ._stop_model ()
def start(self):
    """Kick off service/model startup in a background daemon thread.

    Returns immediately. If a previous startup thread is still running,
    no second thread is spawned.
    """
    if self.thread is not None and self.thread.is_alive():
        # Startup already in progress; a duplicate thread would race on
        # thread_is_exit / thread_exit_code.
        return

    # Reset both result fields so a previous run's outcome cannot leak
    # into this one.
    self.thread_exit_code = 0
    self.thread_is_exit = False
    self.thread = threading.Thread(target=self._start_model_thread, daemon=True)
    self.thread.start()
107+
def is_ready(self, block=False):
    """Report whether the model is loaded.

    Args:
        block: If False, check once and return. If True, keep polling
            (one ``time.sleep(1)`` between polls) until the status is
            "loaded", the startup thread has finished, or the application
            asks to exit.

    Returns:
        True if ``_get_status()`` reports "loaded", or if the startup
        thread finished with exit code 0; otherwise False.
    """
    while not app.need_exit():
        if self._get_status() == "loaded":
            return True
        if not block:
            return False

        time.sleep(1)

        # Once the startup thread is done, its exit code decides the result.
        if self.thread_is_exit:
            return self.thread_exit_code == 0

    return False
81122
def stop(self):
    """Call ``_stop_model()`` and then ``_stop_service()``, in that order."""
    for shutdown_step in (self._stop_model, self._stop_service):
        shutdown_step()
85126
86- def get_wave_form (self , path ):
87- waveform , _ = librosa .load (path , sr = 16000 )
def load_wav_with_wave(self, path, sr=16000):
    """Load a PCM WAV file as mono float32 samples at the target sample rate.

    Args:
        path: WAV file to open with the ``wave`` module.
        sr: Target sample rate. The audio is resampled when the file's
            rate differs.

    Returns:
        Tuple ``(audio_data, sr)`` where ``audio_data`` is a 1-D float32
        numpy array scaled to roughly [-1, 1].

    Raises:
        ValueError: If the sample width is not 1, 2 or 4 bytes.
    """
    with wave.open(path, 'rb') as wav_file:
        n_channels = wav_file.getnchannels()
        sampwidth = wav_file.getsampwidth()
        framerate = wav_file.getframerate()
        frames = wav_file.readframes(wav_file.getnframes())

    # 8-bit WAV samples are unsigned (silence is 128); wider samples are signed.
    dtype_map = {1: np.uint8, 2: np.int16, 4: np.int32}
    if sampwidth not in dtype_map:
        raise ValueError(f"Unsupported sample width: {sampwidth}")

    dtype = dtype_map[sampwidth]
    audio_data = np.frombuffer(frames, dtype=dtype).astype(np.float32)

    # Convert to float32 in range [-1, 1]
    if sampwidth == 1:
        audio_data = (audio_data - np.float32(128)) / np.float32(127)
    else:
        audio_data = audio_data / np.float32(np.iinfo(dtype).max)

    # Interleaved multi-channel audio is averaged down to mono so callers
    # always receive a 1-D waveform.
    if n_channels > 1:
        audio_data = audio_data.reshape(-1, n_channels).mean(axis=1)

    # Resample if needed (scipy is only imported when it is actually used)
    if framerate != sr:
        from scipy import signal
        audio_data = signal.resample_poly(audio_data, sr, framerate, axis=0)

    return audio_data.astype(np.float32, copy=False), sr
163+
def load_with_pcm(self, frames, sr=16000, bits=16, channels=1):
    """Convert raw 16 kHz / 16-bit / mono PCM bytes into float32 samples.

    Args:
        frames: Buffer holding the raw PCM samples.
        sr: Sample rate; must be 16000.
        bits: Bits per sample; must be 16.
        channels: Channel count; must be 1.

    Returns:
        Tuple ``(audio_data, sr)`` where ``audio_data`` is a float32 numpy
        array obtained by dividing the int16 samples by the int16 maximum.

    Raises:
        ValueError: If any of ``sr``, ``bits`` or ``channels`` differs from
            the single supported format.
    """
    if (sr, bits, channels) != (16000, 16, 1):
        raise ValueError("Only support samplerate = 16000, bits=16, channels=1")

    sample_type = np.int16
    full_scale = np.iinfo(sample_type).max

    # Reinterpret the bytes as int16, then scale to roughly [-1, 1].
    samples = np.frombuffer(frames, dtype=sample_type).astype(np.float32)
    return samples / full_scale, sr
177+
178+ def get_wave_form (self , data :str | bytes ): # data is path or pcm data
179+ if isinstance (data , str ):
180+ waveform , _ = self .load_wav_with_wave (data , sr = 16000 )
181+ elif isinstance (data , bytes ):
182+ waveform , _ = self .load_with_pcm (data , sr = 16000 )
183+ else :
184+ raise ValueError ("Not support this data type" , type (data ))
88185 return waveform
89186
90- def refer (self , filepath ):
def refer(self, path=None, audio_data=None):
    """Run one non-streaming recognition request and return the text.

    Args:
        path: Audio file path; used when truthy.
        audio_data: Audio buffer; used when ``path`` is falsy.

    Returns:
        The first element of the "text" field in the JSON reply, or ""
        when the client is in streaming mode, the reply has no text, the
        HTTP status is not 200, or the request raises.

    Raises:
        ValueError: If neither ``path`` nor ``audio_data`` is provided.
    """
    if self.stream:
        print("Streaming mode, use refer_stream() instead.")
        return ""

    # ``path`` takes precedence over ``audio_data``.
    source = path or audio_data
    if not source:
        raise ValueError("You need input path or audio_data")
    waveform = self.get_wave_form(source)

    # NOTE(review): the language is hard-coded to "auto" here rather than
    # taken from the constructor argument - confirm that this is intended.
    payload = {
        "audio_data": waveform.tolist(),
        "sample_rate": 16000,
        "launguage": "auto",
    }

    try:
        response = requests.post(self.url + '/asr', json=payload)
        if response.status_code != 200:
            print(f"Requests failed: {response.status_code}")
            return ""

        text = json.loads(response.text).get("text", "")
        return text[0] if len(text) > 0 else ""
    except Exception as e:
        print("Requests failed:", e)
        return ""
111220
112- def refer_stream (self , filepath ):
221+ def refer_stream (self , path = None , audio_data = None ):
113222 if not self .stream :
114223 print ("Streaming mode, use refer() instead." )
115224 return ""
116- waveform = self .get_wave_form (filepath )
225+
226+ if path :
227+ waveform = self .get_wave_form (path )
228+ elif audio_data :
229+ waveform = self .get_wave_form (audio_data )
230+ else :
231+ raise ValueError ("You need input path or audio_data" )
232+
117233 data = {
118234 "audio_data" : waveform .tolist (),
119235 "sample_rate" : 16000 ,
@@ -131,16 +247,41 @@ def refer_stream(self, filepath):
131247 print ("Requests failed:" , e )
132248 return ""
133249
# Demo: recognise either a microphone recording or a bundled WAV file.
open_microphone = False
stream = True
model_path = "/root/models/sensevoice-maixcam2"
client = SensevoiceClient(model=model_path + "/model.mud", stream=stream)
client.start()
if client.is_ready(block=True) is False:
    print("Failed to start service or model.")
    exit()

# Pick the audio source once; it is passed on as a keyword argument below.
if open_microphone:
    from maix import audio
    recorder = audio.Recorder(sample_rate=16000, channel=1)
    recorder.volume(100)
    print('Recording for 3 seconds..')
    audio_data = recorder.record(3 * 1000)
    source = {"audio_data": audio_data}
else:
    audio_file = "/maixapp/share/audio/demo.wav"
    source = {"path": audio_file}

if stream:
    print('start refer stream')
    for text in client.refer_stream(**source):
        print(text)
else:
    print('start refer')
    text = client.refer(**source)
    print(text)


# You can comment out the call below to save time on the next startup,
# but the background service will then keep occupying CMM memory.
client.stop()
0 commit comments