1+
2+ from maix import nn , audio , time , display , app , image , touchscreen
3+ import threading
4+ from queue import Queue , Empty
5+ import re
6+
class App:
    """Touch-driven voice assistant for a MaixPy device.

    One interaction goes: wait for a touch -> (when webrtcvad is available)
    wait until speech is detected -> record a fixed-length clip -> transcribe
    it with Whisper -> send the text to the Qwen LLM -> synthesize the
    streamed reply with MeloTTS and play it.

    Two daemon threads decouple synthesis and playback from the UI loop:
    ``tts_thread`` turns queued text into PCM and ``player_thread`` plays the
    queued PCM.
    """

    def __init__(self):
        """Create the display, touchscreen, audio devices, models and workers.

        If the system config reports AI ISP as enabled, an error screen is
        shown until the user touches the screen (or exit is requested) and
        initialization stops there: the exit flag is set, so ``run()`` will
        return immediately and none of the models are loaded.
        """
        image.load_font("sourcehansans", "/maixapp/share/font/SourceHanSansCN-Regular.otf", size=20)
        image.set_default_font("sourcehansans")
        self.disp = display.Display()
        self.disp_w = 320
        self.disp_h = 240
        self.__show_load_info('loading touchscreen..')
        self.ts = touchscreen.TouchScreen()

        self.exit_img = image.load('./assets/exit.jpg')

        self.__show_load_info('loading recorder..')
        self.default_wav_path = "/root/audio.wav"
        self.default_record_samplerate = 16000
        self.default_record_volume = 70
        self.recorder = audio.Recorder(sample_rate=self.default_record_samplerate)
        self.recorder.volume(self.default_record_volume)

        self.__show_load_info("loading webrtcvad..")
        # webrtcvad is optional: without it a touch starts recording directly.
        self.vad = None
        self.vad_duration_ms = 30
        try:
            import webrtcvad
            vad = webrtcvad.Vad()
            vad.set_mode(3)
            self.vad = vad
        except Exception as e:
            # Best effort on purpose, but say why VAD is off instead of
            # silently swallowing everything (including KeyboardInterrupt).
            print('webrtcvad unavailable, VAD disabled:', e)

        self.__show_load_info('loading player..')
        self.player = audio.Player(sample_rate=44100)
        self.player.volume(50)
        self.player_queue = Queue(100)
        self.player_thread = threading.Thread(target=self.player_thread_handle, daemon=True)
        self.player_thread.start()

        self.__show_load_info('loading whisper..')
        ai_isp_on = bool(int(app.get_sys_config_kv("npu", "ai_isp", "1")))
        if ai_isp_on:
            img = image.Image(self.disp_w, self.disp_h, bg=image.COLOR_BLACK)
            err_msg = "You need edit /boot/configs to set ai_isp_on to 0"
            err_msg_size = image.string_size(err_msg)
            img.draw_string((img.width() - err_msg_size.width()) // 2,
                            (img.height() - err_msg_size.height()) // 2,
                            err_msg, image.COLOR_RED)
            self.disp.show(img)
            while not app.need_exit():
                ts_data = self.ts.read()
                if ts_data[2]:
                    app.set_exit_flag(True)
                time.sleep_ms(100)
            # Exit has been requested; loading the models now would only
            # delay shutdown, so stop initializing here.
            return
        self.whisper = nn.Whisper(model="/root/models/whisper-base/whisper-base.mud", language="en")

        self.__show_load_info('loading llm..')
        # Alternative model: /root/models/Qwen2.5-0.5B-Instruct/model.mud
        self.llm = nn.Qwen("/root/models/Qwen2.5-1.5B-Instruct/model.mud")
        self.llm.set_system_prompt("You are Qwen, created by Alibaba Cloud. You are a helpful assistant.")
        self.llm.set_reply_callback(self.__llm_on_reply)
        # LLM output received so far that has not been handed to TTS yet.
        self.llm_last_msg = ""

        self.__show_load_info('loading melotts..')
        self.tts = nn.MeloTTS(model="/root/models/melotts/melotts-zh.mud", speed=0.8, language='en')

        self.tts_queue = Queue(100)
        self.tts_thread = threading.Thread(target=self.tts_thread_handle, daemon=True)
        self.tts_thread.start()

    def player_thread_handle(self):
        """Worker loop: play every PCM buffer that arrives on ``player_queue``."""
        while not app.need_exit():
            try:
                # Queue timeouts are in seconds; wake up twice a second so
                # the exit flag is noticed promptly.
                pcm = self.player_queue.get(timeout=0.5)
            except Empty:
                continue
            print('play start')
            t = time.ticks_ms()
            self.player.play(pcm)
            print('player cost', time.ticks_ms() - t)
            print('play finish')

    def tts_thread_handle(self):
        """Worker loop: synthesize queued text and queue the PCM for playback."""
        while not app.need_exit():
            try:
                # Queue timeouts are in seconds (see player_thread_handle).
                msg = self.tts_queue.get(timeout=0.5)
            except Empty:
                continue
            print('tts queue get:', msg)
            t = time.ticks_ms()
            pcm = self.tts.infer(msg, output_pcm=True)
            print('tts infer cost', time.ticks_ms() - t)
            self.player_queue.put(pcm)

    @staticmethod
    def __split_segments(text):
        """Split ``text`` into delimiter-terminated segments plus a remainder.

        A segment is a run of text ending in one of ``, . ! ?`` and keeps its
        own delimiter. Returns ``(segments, remainder)`` where ``remainder``
        is whatever follows the last delimiter (possibly empty).
        """
        segments = []
        consumed = 0
        for match in re.finditer(r"[^,.!?]*[,.!?]", text):
            segments.append(match.group())
            consumed = match.end()
        return segments, text[consumed:]

    def __llm_on_reply(self, obj, resp):
        """LLM reply callback: show the reply so far and feed TTS incrementally.

        ``resp.msg_new`` is appended to the pending buffer; every segment that
        is now terminated by punctuation is queued for TTS, and the
        unterminated tail stays buffered for the next call.
        """
        print(resp.msg_new, end="")
        img = image.Image(self.disp_w, self.disp_h, bg=image.COLOR_BLACK)
        self.__draw_string_upper_center(img, text="Run LLM..", color=image.COLOR_GREEN)
        img.draw_string(0, 30, resp.msg, image.COLOR_WHITE)
        self.disp.show(img)

        segments, self.llm_last_msg = self.__split_segments(self.llm_last_msg + resp.msg_new)
        for segment in segments:
            # A segment that is only a delimiter (e.g. the 2nd dot of "..")
            # has nothing to pronounce.
            if segment[:-1].strip():
                self.tts_queue.put(segment)

    def __show_load_info(self, text: str, x: int = 0, y: int = 0, color: image.Color = image.COLOR_WHITE):
        """Show a single status line on an otherwise black screen.

        Args:
            text: Message to draw.
            x: Left position; 0 means "center horizontally".
            y: Top position; 0 means "center vertically".
            color: Text color.
        """
        if self.disp:
            str_size = image.string_size(text)
            img = image.Image(self.disp_w, self.disp_h, bg=image.COLOR_BLACK)
            if x == 0:
                x = (img.width() - str_size.width()) // 2
            if y == 0:
                y = (img.height() - str_size.height()) // 2
            img.draw_string(x, y, text, color)
            self.disp.show(img)

    def __draw_string_upper_center(self, img, y: int = 8, text: str = "", color: image.Color = image.COLOR_WHITE):
        """Draw ``text`` horizontally centered on ``img`` at vertical offset ``y``."""
        text_size = image.string_size(text)
        x = (img.width() - text_size.width()) // 2
        img.draw_string(x, y, text, color)

    def __reset_recorder(self, save_file: bool):
        """Replace ``self.recorder`` with a freshly constructed one.

        Args:
            save_file: When true the recorder is constructed with
                ``default_wav_path`` (presumably so the capture is written to
                that file - run() later hands the same path to Whisper);
                otherwise it is constructed without a path.
        """
        if self.recorder:
            # Drop the old recorder before constructing its replacement.
            del self.recorder
        if save_file:
            self.recorder = audio.Recorder(self.default_wav_path, self.default_record_samplerate)
        else:
            self.recorder = audio.Recorder(sample_rate=self.default_record_samplerate)
        self.recorder.volume(self.default_record_volume)

    def run(self):
        """Run the UI / state-machine loop until exit is requested.

        Each iteration redraws the screen for the current state, handles the
        exit button in the top-left corner, then performs one step of the
        state machine: IDLE -> (VAD ->) SPEAKING -> TRANSCRIBE -> LLM -> TTS
        -> IDLE.
        """
        class Status:
            IDLE = 0
            SPEAKING = 1
            TRANSCRIBE = 2
            TTS = 3
            LLM = 4
            VAD = 5

        # Title shown per state; IDLE falls back to the "waiting" prompt.
        titles = {
            Status.VAD: "VAD..",
            Status.SPEAKING: "Speaking..",
            Status.TRANSCRIBE: "Transcribing..",
            Status.LLM: "Run LLM..",
            Status.TTS: "Run MelloTTS..",
        }
        # States whose screen shows only the title, no transcript/reply text.
        title_only = (Status.VAD, Status.SPEAKING, Status.TRANSCRIBE)

        status = Status.IDLE
        record_ms = 4000
        asr_result = None
        llm_result = None
        start_vad = False

        exit_img_x = 0
        exit_img_y = 0

        while not app.need_exit():
            img = image.Image(self.disp_w, self.disp_h, bg=image.COLOR_BLACK)
            ts_data = self.ts.read()

            title = titles.get(status, "Waiting press touchscreen..")
            self.__draw_string_upper_center(img, text=title, color=image.COLOR_GREEN)
            if status not in title_only:
                # Prefer the pending transcript, else the last LLM reply.
                text = asr_result or llm_result
                if text:
                    img.draw_string(0, 30, text, image.COLOR_WHITE)

            img.draw_image(exit_img_x, exit_img_y, self.exit_img)
            if (ts_data[2]
                    and 0 <= ts_data[0] <= self.exit_img.width() + exit_img_x * 2
                    and 0 <= ts_data[1] <= self.exit_img.height() + exit_img_y * 2):
                print('exit')
                app.set_exit_flag(True)
            self.disp.show(img)

            if status == Status.IDLE:
                if ts_data[2]:
                    if self.vad:
                        start_vad = not start_vad
                        status = Status.VAD
                    else:
                        status = Status.SPEAKING
            elif status == Status.VAD:
                if not self.vad:
                    status = Status.SPEAKING
                elif start_vad:
                    # NOTE(review): webrtcvad only accepts 10/20/30 ms frames
                    # of 16-bit mono PCM - confirm record() returns exactly
                    # vad_duration_ms of audio.
                    pcm = self.recorder.record(self.vad_duration_ms)
                    if pcm and len(pcm) > 0:
                        is_speech = self.vad.is_speech(pcm, self.default_record_samplerate)
                        if is_speech:
                            start_vad = False
                            status = Status.SPEAKING
            elif status == Status.SPEAKING:
                self.__reset_recorder(True)
                self.recorder.record(record_ms)
                self.recorder.finish()
                self.__reset_recorder(False)
                status = Status.TRANSCRIBE
            elif status == Status.TRANSCRIBE:
                asr_result = self.whisper.transcribe(self.default_wav_path)
                print(asr_result)
                status = Status.LLM
            elif status == Status.LLM:
                if asr_result:
                    reply = self.llm.send(asr_result)
                    llm_result = reply.msg
                    self.llm.clear_context()
                    print(llm_result)
                    # The reply callback only queues punctuation-terminated
                    # segments; speak whatever is left and start the next
                    # reply with an empty buffer.
                    tail = self.llm_last_msg.strip()
                    self.llm_last_msg = ""
                    if tail:
                        self.tts_queue.put(tail)
                status = Status.TTS
                asr_result = None
            elif status == Status.TTS:
                if self.tts_queue.empty():
                    status = Status.IDLE
            else:
                status = Status.IDLE
            time.sleep_ms(5)
250+
if __name__ == '__main__':
    # Script entry point: construct the assistant, then hand control to its
    # main loop, which returns once exit has been requested.
    application = App()
    application.run()
0 commit comments