Skip to content

Commit 415fb74

Browse files
fixed speech recognition
1 parent 5237497 commit 415fb74

File tree

2 files changed

+49
-17
lines changed

2 files changed

+49
-17
lines changed

scripts/respeaker_node.py

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import usb.core
88
import usb.util
99
import pyaudio
10+
import wave
1011
import math
1112
import numpy as np
1213
import tf.transformations as T
@@ -15,6 +16,7 @@
1516
import struct
1617
import sys
1718
import time
19+
import speech_recognition as SR
1820
from audio_common_msgs.msg import AudioData
1921
from geometry_msgs.msg import PoseStamped
2022
from std_msgs.msg import Bool, Int32, ColorRGBA
@@ -169,12 +171,14 @@ def read(self, name):
169171
usb.util.CTRL_IN | usb.util.CTRL_TYPE_VENDOR | usb.util.CTRL_RECIPIENT_DEVICE,
170172
0, cmd, id, length, self.TIMEOUT)
171173

174+
#response = struct.unpack(b'ii', struct.pack(b'ii',*response))
172175
response = struct.unpack(b'ii', bytes([x for x in response]))
173176

174177
if data[2] == 'int':
175178
result = response[0]
176179
else:
177180
result = response[0] * (2.**response[1])
181+
#result = response[0]
178182

179183
return result
180184

@@ -224,6 +228,7 @@ def __init__(self, on_audio, channels=None, suppress_error=True):
224228
self.rate = 16000
225229
self.bitwidth = 2
226230
self.bitdepth = 16
231+
self.i = 0
227232

228233
# find device
229234
count = self.pyaudio.get_device_count()
@@ -266,6 +271,7 @@ def __init__(self, on_audio, channels=None, suppress_error=True):
266271
input_device_index=self.device_index,
267272
)
268273

274+
269275
def __del__(self):
270276
self.stop()
271277
try:
@@ -279,15 +285,25 @@ def __del__(self):
279285
except:
280286
pass
281287

288+
282289
def stream_callback(self, in_data, frame_count, time_info, status):
290+
283291
# split channel
284292
data = np.frombuffer(in_data, dtype=np.int16)
285293
chunk_per_channel = np.math.ceil(len(data) / self.available_channels)
286294
data = np.reshape(data, (chunk_per_channel, self.available_channels))
287295
for chan in self.channels:
288-
chan_data = data[:, chan]
296+
chan_data = bytearray(data[:, chan].tobytes())
297+
298+
## save this chunk to file
299+
#self.i += 1
300+
#sound_data = SR.AudioData(chan_data, self.rate, self.bitwidth)
301+
#with open(str(chan) + " one chunck " + str(self.i) + ".wav","wb") as f:
302+
# f.write(sound_data.get_wav_data())
303+
289304
# invoke callback
290-
self.on_audio(str(chan_data), chan)
305+
self.on_audio(chan_data, chan)
306+
291307
return None, pyaudio.paContinue
292308

293309
def start(self):
@@ -315,7 +331,7 @@ def __init__(self):
315331
#
316332
self.respeaker = RespeakerInterface()
317333
self.respeaker_audio = RespeakerAudio(self.on_audio, suppress_error=suppress_pyaudio_error)
318-
self.speech_audio_buffer = str()
334+
self.speech_audio_buffer = bytearray()
319335
self.is_speeching = False
320336
self.speech_stopped = rospy.Time(0)
321337
self.prev_is_voice = None
@@ -333,16 +349,20 @@ def __init__(self):
333349
# start
334350
self.speech_prefetch_bytes = int(
335351
self.speech_prefetch * self.respeaker_audio.rate * self.respeaker_audio.bitdepth / 8.0)
336-
self.speech_prefetch_buffer = str()
352+
self.speech_prefetch_buffer = bytearray()
337353
self.respeaker_audio.start()
338354
self.info_timer = rospy.Timer(rospy.Duration(1.0 / self.update_rate),
339355
self.on_timer)
340356
self.timer_led = None
341357
self.sub_led = rospy.Subscriber("status_led", ColorRGBA, self.on_status_led)
358+
self.big_data0 = []
359+
self.out = wave.open("/home/pi/Desktop/test.wav", 'wb')
360+
self.out.setparams((1, 2, 16000, 1024, "NONE", "NONE"))
342361

343362
def on_shutdown(self):
344363
try:
345364
self.respeaker.close()
365+
self.out.close()
346366
except:
347367
pass
348368
finally:
@@ -377,15 +397,19 @@ def on_status_led(self, msg):
377397
oneshot=True)
378398

379399
def on_audio(self, data, channel):
400+
401+
if channel == 0:
402+
self.out.writeframes(data)
403+
380404
self.pub_audios[channel].publish(AudioData(data=data))
381405
if channel == self.main_channel:
382406
self.pub_audio.publish(AudioData(data=data))
383407
if self.is_speeching:
384408
if len(self.speech_audio_buffer) == 0:
385409
self.speech_audio_buffer = self.speech_prefetch_buffer
386-
self.speech_audio_buffer += data
410+
for x in data: self.speech_audio_buffer += bytearray([x])
387411
else:
388-
self.speech_prefetch_buffer += data
412+
for x in data: self.speech_prefetch_buffer += bytearray([x])
389413
self.speech_prefetch_buffer = self.speech_prefetch_buffer[-self.speech_prefetch_bytes:]
390414

391415
def on_timer(self, event):
@@ -427,12 +451,12 @@ def on_timer(self, event):
427451
buf = self.speech_audio_buffer
428452
self.speech_audio_buffer = str()
429453
self.is_speeching = False
430-
duration = 8.0 * len(buf) * self.respeaker_audio.bitwidth
454+
duration = len(buf) * self.respeaker_audio.bitwidth * 8.0
431455
duration = duration / self.respeaker_audio.rate / self.respeaker_audio.bitdepth
432456
rospy.loginfo("Speech detected for %.3f seconds" % duration)
433457
if self.speech_min_duration <= duration < self.speech_max_duration:
434458

435-
self.pub_speech_audio.publish(AudioData(data=bytes(buf,"utf8")))
459+
self.pub_speech_audio.publish(AudioData(data=list(buf)))
436460

437461

438462
if __name__ == '__main__':

scripts/speech_to_text.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@
1515
class SpeechToText(object):
1616
def __init__(self):
1717
# format of input audio data
18-
self.sample_rate = rospy.get_param("~sample_rate", 16000)
19-
self.sample_width = rospy.get_param("~sample_width", 2)
18+
self.sample_rate = 16000 #rospy.get_param("~sample_rate", 16000)
19+
self.sample_width = 2 #rospy.get_param("~sample_width", 2)
2020
# language of STT service
21-
self.language = rospy.get_param("~language", "ja-JP")
21+
self.language = "en-US" #rospy.get_param("~language", "en-US")
2222
# ignore voice input while the robot is speaking
2323
self.self_cancellation = rospy.get_param("~self_cancellation", True)
2424
# time to assume as SPEAKING after tts service is finished
@@ -41,7 +41,7 @@ def __init__(self):
4141

4242
self.pub_speech = rospy.Publisher(
4343
"speech_to_text", SpeechRecognitionCandidates, queue_size=1)
44-
self.sub_audio = rospy.Subscriber("audio", AudioData, self.audio_cb)
44+
self.sub_audio = rospy.Subscriber("speech_audio", AudioData, self.audio_cb)
4545

4646
def tts_timer_cb(self, event):
4747
stamp = event.current_real
@@ -60,23 +60,31 @@ def tts_timer_cb(self, event):
6060
self.last_tts = stamp
6161
if stamp - self.last_tts > self.tts_tolerance:
6262
rospy.logdebug("END CANCELLATION")
63-
self.is_canceling = False
63+
self.is_canceling = Falser
6464

6565
def audio_cb(self, msg):
6666
if self.is_canceling:
6767
rospy.loginfo("Speech is cancelled")
6868
return
69-
data = SR.AudioData(msg.data, self.sample_rate, self.sample_width)
69+
data = SR.AudioData(bytes(msg.data), self.sample_rate, self.sample_width)
70+
with open(str(len(msg.data)) + ".wav","wb") as f:
71+
f.write(data.get_wav_data())
72+
7073
try:
7174
rospy.loginfo("Waiting for result %d" % len(data.get_raw_data()))
72-
result = self.recognizer.recognize_google(
73-
data, language=self.language)
74-
msg = SpeechRecognitionCandidates(transcript=[result])
75+
result = self.recognizer.recognize_sphinx(
76+
data, language=self.language, show_all=False) #, key=None
77+
rospy.loginfo(result)
78+
79+
msg = SpeechRecognitionCandidates(transcript=result)
7580
self.pub_speech.publish(msg)
81+
7682
except SR.UnknownValueError as e:
7783
rospy.logerr("Failed to recognize: %s" % str(e))
84+
rospy.loginfo("value error")
7885
except SR.RequestError as e:
7986
rospy.logerr("Failed to recognize: %s" % str(e))
87+
rospy.loginfo("request error")
8088

8189

8290
if __name__ == '__main__':

0 commit comments

Comments
 (0)