Skip to content

Commit f31def8

Browse files
committed
* optimize app_speech/app_yoloworld & add image generation app
1 parent 73bb883 commit f31def8

File tree

19 files changed

+3197
-145
lines changed

19 files changed

+3197
-145
lines changed

examples/audio/asr/sensevoice/asr_sensevoice.py

Lines changed: 163 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,26 @@
11
import requests, json, os
2-
import librosa
2+
import wave
3+
import numpy as np
4+
import threading
5+
from maix import app, time
36

47
class SensevoiceClient:
58
def __init__(self, model = "", url="http://0.0.0.0:12347", lauguage="auto", stream=False):
    """Set up a client for the SenseVoice HTTP service.

    Args:
        model: Path of the model description file; it must exist on disk.
        url: Base URL of the service (endpoint paths such as ``/status``
            and ``/asr`` are appended to it by the other methods).
        lauguage: Language setting, stored as ``self.launguage`` (both
            spellings are kept as-is because callers may rely on them).
        stream: True when the client will be used through
            ``refer_stream()``, False when it will be used through
            ``refer()``.

    Raises:
        ValueError: If the ``npu``/``ai_isp`` system setting is non-zero,
            or if ``model`` does not exist.
    """
    # Plain configuration values.
    self.model, self.url = model, url
    self.stream = stream
    self.launguage = lauguage

    # State shared with the background start-up thread.
    self.thread = None
    self.thread_is_exit = False
    self.thread_exit_code = 0

    # The AI ISP setting is checked first, so it wins when both checks fail.
    ai_isp_setting = app.get_sys_config_kv("npu", "ai_isp", "0")
    self.last_ai_isp = int(ai_isp_setting)
    if self.last_ai_isp != 0:
        raise ValueError("Please turn off AI ISP first, try: app.set_sys_config_kv('npu', 'ai_isp', '0')")

    model_found = os.path.exists(model)
    if not model_found:
        raise ValueError(f'Model {self.model} is not existed!')
23+
1024
def _check_service(self):
1125
try:
1226
response = requests.get(self.url + '/status')
@@ -16,12 +30,13 @@ def _check_service(self):
1630
return False
1731

1832
def _start_service(self):
    """Make sure the SenseVoice background service is reachable.

    If the status check fails, the systemd unit ``sensevoice.service`` is
    started, and the method then polls ``_check_service()`` (with a
    ``time.sleep(1)`` between polls) until it succeeds.

    Returns:
        True once the service answers the status check, False if the
        application is asked to exit while still waiting.
    """
    if not self._check_service():
        os.system("systemctl start sensevoice.service")

    attempts = 0
    while not self._check_service():
        # Without this check the loop could never end when the service
        # fails to come up, and the caller's failure branch was dead code.
        if app.need_exit():
            return False
        attempts += 1
        print(f"Waiting for service to start({attempts})...")
        time.sleep(1)

    return True
@@ -62,58 +77,159 @@ def _stop_model(self):
6277
except Exception as e:
6378
return False
6479

65-
def start(self):
80+
def _start_model_thread(self):
    """Worker body: start the service, then start the model.

    The outcome is published through two attributes that other threads
    poll: ``thread_exit_code`` (0 on success, 1 on any failure) and
    ``thread_is_exit`` (True once this worker is done).

    Returns:
        True when both steps succeeded, False otherwise.
    """
    exit_code = 1  # pessimistic default; only a full success clears it
    try:
        print('Start service...')
        if not self._start_service():
            print("Failed to start service.")
            return False
        print("Service started successfully.")

        print('Start model...')
        if not self._start_model():
            print("Failed to start model.")
            return False
        print("Model started successfully.")

        exit_code = 0
        return True
    finally:
        # Publish the code BEFORE the "finished" flag so a poller that sees
        # thread_is_exit == True never reads a stale exit code. Doing this
        # in ``finally`` also marks the worker as finished (with a failure
        # code) when one of the steps raises.
        self.thread_exit_code = exit_code
        self.thread_is_exit = True
78102

79-
def stop_model(self):
80-
self._stop_model()
103+
def start(self):
    """Begin service/model start-up on a background daemon thread.

    Returns immediately with None; poll ``is_ready()`` for the outcome.
    Calling it again while a previous start-up is still running is a
    no-op, so two workers never run the start-up sequence concurrently.
    """
    if self.thread is not None and self.thread.is_alive():
        return

    # Clear the result of any earlier run so a stale failure code cannot
    # be read as the result of this one.
    self.thread_exit_code = 0
    self.thread_is_exit = False
    self.thread = threading.Thread(target=self._start_model_thread, daemon=True)
    self.thread.start()
107+
108+
def is_ready(self, block=False):
    """Report whether the model is loaded.

    Args:
        block: When False, answer from a single status query. When True,
            keep polling (with ``time.sleep(1)`` between polls) until the
            model is loaded, the start-up thread has finished, or the
            application is asked to exit.

    Returns:
        True if the status is ``"loaded"`` or the start-up thread finished
        with exit code 0; False otherwise.
    """
    while not app.need_exit():
        if self._get_status() == "loaded":
            return True
        if not block:
            return False

        time.sleep(1)
        # The start-up thread is done: its exit code decides the answer.
        if self.thread_is_exit:
            return self.thread_exit_code == 0

    return False
81122

82123
def stop(self):
    """Shut down by calling ``_stop_model()`` and then ``_stop_service()``."""
    for shutdown_step in (self._stop_model, self._stop_service):
        shutdown_step()
85126

86-
def get_wave_form(self, path):
87-
waveform, _ = librosa.load(path, sr=16000)
127+
def load_wav_with_wave(self, path, sr=16000):
    """Load a PCM WAV file as mono float32 samples at the requested rate.

    Args:
        path: Path of the WAV file (anything ``wave.open`` accepts).
        sr: Target sample rate in Hz; the audio is resampled when the
            file's rate differs.

    Returns:
        ``(samples, sr)`` where ``samples`` is a 1-D float32 array scaled
        to roughly [-1, 1]. Multi-channel files are averaged down to one
        channel.

    Raises:
        ValueError: If the sample width is not 1, 2 or 4 bytes.
    """
    # width -> (dtype, bias, full scale). WAV PCM is little-endian, and
    # 8-bit samples are unsigned with a mid-point of 128.
    sample_formats = {
        1: (np.dtype("u1"), 128.0, float(np.iinfo(np.int8).max)),
        2: (np.dtype("<i2"), 0.0, float(np.iinfo(np.int16).max)),
        4: (np.dtype("<i4"), 0.0, float(np.iinfo(np.int32).max)),
    }

    with wave.open(path, 'rb') as wav_file:
        n_channels = wav_file.getnchannels()
        sampwidth = wav_file.getsampwidth()
        framerate = wav_file.getframerate()
        if sampwidth not in sample_formats:
            raise ValueError(f"Unsupported sample width: {sampwidth}")
        frames = wav_file.readframes(wav_file.getnframes())

    dtype, bias, full_scale = sample_formats[sampwidth]

    # Convert to float32 in range [-1, 1]
    audio_data = np.frombuffer(frames, dtype=dtype).astype(np.float32)
    audio_data = (audio_data - np.float32(bias)) / np.float32(full_scale)

    # Samples are interleaved per frame; average the channels so callers
    # always receive a 1-D mono signal.
    if n_channels > 1:
        audio_data = audio_data.reshape(-1, n_channels).mean(axis=1)

    # Resample if needed (scipy is only imported on this path).
    if framerate != sr:
        from scipy import signal
        audio_data = signal.resample_poly(audio_data, sr, framerate, axis=0)
        audio_data = audio_data.astype(np.float32, copy=False)

    return audio_data, sr
163+
164+
def load_with_pcm(self, frames, sr=16000, bits=16, channels=1):
    """Decode a raw PCM buffer into float32 samples.

    Args:
        frames: Buffer of 16-bit integer samples.
        sr: Sample rate; must be 16000.
        bits: Bits per sample; must be 16.
        channels: Channel count; must be 1.

    Returns:
        ``(samples, sr)`` where ``samples`` is a float32 array obtained by
        dividing each sample by the largest int16 value.

    Raises:
        ValueError: If any of ``sr``, ``bits``, ``channels`` differs from
            the single supported format.
    """
    only_supported = (16000, 16, 1)
    if (sr, bits, channels) != only_supported:
        raise ValueError("Only support samplerate = 16000, bits=16, channels=1")

    pcm_type = np.int16
    full_scale = np.iinfo(pcm_type).max

    # Reinterpret the buffer as integers, then scale into [-1, 1].
    samples = np.frombuffer(frames, dtype=pcm_type).astype(np.float32)
    return samples / full_scale, sr
177+
178+
def get_wave_form(self, data: "str | bytes | bytearray | memoryview | os.PathLike"):  # data is path or pcm data
    """Turn a WAV file path or a raw PCM buffer into a waveform.

    Args:
        data: A file path (``str`` or ``os.PathLike``), which is loaded with
            ``load_wav_with_wave``, or a byte buffer (``bytes``,
            ``bytearray`` or ``memoryview``), which is decoded with
            ``load_with_pcm``. Both are requested at 16000 Hz.

    Returns:
        The waveform produced by the selected loader.

    Raises:
        ValueError: If ``data`` is none of the supported types.
    """
    # The annotation is quoted so it is never evaluated at definition
    # time; the ``X | Y`` form would fail on interpreters older than 3.10.
    if isinstance(data, (str, os.PathLike)):
        waveform, _ = self.load_wav_with_wave(os.fspath(data), sr=16000)
    elif isinstance(data, (bytes, bytearray, memoryview)):
        # Normalise to immutable bytes; this is a no-op for bytes input.
        waveform, _ = self.load_with_pcm(bytes(data), sr=16000)
    else:
        raise ValueError("Not support this data type", type(data))
    return waveform
89186

90-
def refer(self, filepath):
187+
def refer(self, path=None, audio_data=None):
    """Run one-shot (non-streaming) recognition and return the text.

    Args:
        path: Path of a WAV file to recognise. Takes precedence over
            ``audio_data`` when both are given.
        audio_data: Raw PCM buffer to recognise.

    Returns:
        The recognised text, or ``""`` when the client is in streaming
        mode, the request fails, or the service returns no text.

    Raises:
        ValueError: If neither ``path`` nor ``audio_data`` is provided.
    """
    if self.stream:
        print("Streaming mode, use refer_stream() instead.")
        return ""

    if path:
        waveform = self.get_wave_form(path)
    elif audio_data:
        waveform = self.get_wave_form(audio_data)
    else:
        raise ValueError("You need input path or audio_data")

    data = {
        "audio_data": waveform.tolist(),
        "sample_rate": 16000,
        # Forward the language chosen at construction time instead of a
        # hard-coded "auto" (the default is still "auto"). The key keeps
        # its original spelling because it is part of the request format.
        "launguage": self.launguage
    }

    try:
        response = requests.post(self.url + '/asr', json=data)
        if response.status_code != 200:
            print(f"Requests failed: {response.status_code}")
            return ""

        res = json.loads(response.text)
        text = res.get("text", "")
        # NOTE(review): "text" is presumably a list of results, of which
        # the first is returned; a plain string is returned whole rather
        # than being cut down to its first character.
        if isinstance(text, str):
            return text
        return text[0] if len(text) > 0 else ""
    except Exception as e:
        # Best effort by design: any request/decoding problem yields "".
        print("Requests failed:", e)
        return ""
111220

112-
def refer_stream(self, filepath):
221+
def refer_stream(self, path=None, audio_data=None):
113222
if not self.stream:
114223
print("Streaming mode, use refer() instead.")
115224
return ""
116-
waveform = self.get_wave_form(filepath)
225+
226+
if path:
227+
waveform = self.get_wave_form(path)
228+
elif audio_data:
229+
waveform = self.get_wave_form(audio_data)
230+
else:
231+
raise ValueError("You need input path or audio_data")
232+
117233
data = {
118234
"audio_data": waveform.tolist(),
119235
"sample_rate": 16000,
@@ -131,16 +247,41 @@ def refer_stream(self, filepath):
131247
print("Requests failed:", e)
132248
return ""
133249

250+
# Demo settings.
open_microphone = False  # True: record from the microphone; False: use the demo WAV file
stream = True            # True: refer_stream(); False: refer()
model_path = "/root/models/sensevoice-maixcam2"

client = SensevoiceClient(model=model_path+"/model.mud", stream=stream)
client.start()
if client.is_ready(block=True) is False:
    print("Failed to start service or model.")
    exit()

# Pick the audio source once; it is passed to the client as keyword arguments.
if open_microphone:
    from maix import audio
    recorder = audio.Recorder(sample_rate=16000, channel=1)
    recorder.volume(100)
    print('Recording for 3 seconds..')
    audio_data = recorder.record(3 * 1000)  # presumably milliseconds, matching the message above
    source = {"audio_data": audio_data}
else:
    audio_file = "/maixapp/share/audio/demo.wav"
    source = {"path": audio_file}

if stream:
    print('start refer stream')
    for text in client.refer_stream(**source):
        print(text)
else:
    print('start refer')
    text = client.refer(**source)
    print(text)

# Commenting out the call below makes the next start-up faster, but the
# background service then keeps occupying CMM memory.
client.stop()

examples/audio/asr/whisper/asr_whisper.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from maix import nn, audio
22

33
# Only MaixCAM2 supports this model.
4-
whisper = nn.Whisper(model="/root/models/whisper-base/whisper-base.mud")
4+
lauguage = 'zh' # 'en' or 'zh'
5+
whisper = nn.Whisper(model="/root/models/whisper-base/whisper-base.mud", language=lauguage)
56

67
use_default_file = True
78
transcribe_pcm_data = False

0 commit comments

Comments
 (0)