Skip to content

Commit 71aaaff

Browse files
committed
fix: also send WAV header
1 parent 66869fe commit 71aaaff

File tree

1 file changed

+39
-22
lines changed

1 file changed

+39
-22
lines changed

src/arduino/app_bricks/local_asr/local_asr.py

Lines changed: 39 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ def _transcribe_stream(self, duration: int = 0, audio_source: BaseMicrophone | b
284284
future = None
285285

286286
try:
287-
logger.info(f"Creating transcription session with model={self.model}, language={self.language}")
287+
logger.debug(f"Creating transcription session with model={self.model}, language={self.language}")
288288
# Create transcription session
289289
url = f"{self.api_base_url}/transcriptions/create"
290290
data = {
@@ -365,7 +365,7 @@ def _transcribe_stream(self, duration: int = 0, audio_source: BaseMicrophone | b
365365
pass # Ignore any errors during cleanup
366366
raise
367367

368-
except TimeoutError:
368+
except (TimeoutError, asyncio.TimeoutError):
369369
raise
370370

371371
except Exception as e:
@@ -380,7 +380,7 @@ def _asyncio_loop(self):
380380
Dedicated thread for running the asyncio event loop.
381381
Manages transcription sessions posted via run_coroutine_threadsafe.
382382
"""
383-
logger.info("Asyncio event loop starting")
383+
logger.debug("Asyncio event loop starting")
384384

385385
self._worker_loop = asyncio.new_event_loop()
386386
asyncio.set_event_loop(self._worker_loop)
@@ -404,7 +404,7 @@ async def keep_alive():
404404
)
405405
self._worker_loop.close()
406406
self._worker_loop = None
407-
logger.info("Asyncio event loop stopped")
407+
logger.debug("Asyncio event loop stopped")
408408

409409
async def _transcription_session_handler(self, session_info: MicSessionInfo | WAVSessionInfo):
410410
"""
@@ -414,7 +414,7 @@ async def _transcription_session_handler(self, session_info: MicSessionInfo | WA
414414
session_id = session_info.session_id
415415

416416
async with websockets.connect(self.ws_url) as websocket:
417-
logger.info(f"WebSocket connected for session {session_id}")
417+
logger.debug(f"WebSocket connected for session {session_id}")
418418

419419
# Start audio sending and transcription receiving tasks
420420
if isinstance(session_info, MicSessionInfo):
@@ -501,7 +501,7 @@ def _mic_reader_loop(self, mic: BaseMicrophone):
501501
and publishing to its subscribers.
502502
"""
503503
mic_id = id(mic)
504-
logger.info(f"Audio reader thread starting for mic {mic_id}")
504+
logger.debug(f"Audio reader thread starting for mic {mic_id}")
505505

506506
try:
507507
while not self._stop_worker.is_set():
@@ -511,15 +511,15 @@ def _mic_reader_loop(self, mic: BaseMicrophone):
511511
self._audio_stream_router.publish(mic, audio_chunk)
512512
else:
513513
# No subscribers left, exit the thread
514-
logger.info(f"No more subscribers for mic {mic_id}, stopping reader thread")
514+
logger.debug(f"No more subscribers for mic {mic_id}, stopping reader thread")
515515
break
516516

517517
except Exception as e:
518518
logger.error(f"Audio reader thread error for mic {mic_id}: {e}")
519519
raise
520520
finally:
521521
self._audio_stream_router.unregister_thread(mic)
522-
logger.info(f"Audio reader thread stopped for mic {mic_id}")
522+
logger.debug(f"Audio reader thread stopped for mic {mic_id}")
523523

524524
async def _send_wav_audio(self, websocket: websockets.ClientConnection, session_info: WAVSessionInfo):
525525
"""Sends audio chunks from WAV numpy array to WebSocket"""
@@ -529,17 +529,34 @@ async def _send_wav_audio(self, websocket: websockets.ClientConnection, session_
529529
session_id = session_info.session_id
530530
wav_audio = session_info.wav_audio
531531

532-
with wave.open(io.BytesIO(wav_audio), 'rb') as wf:
533-
sample_rate = wf.getframerate()
534-
num_channels = wf.getnchannels()
535-
sample_width = wf.getsampwidth()
536-
frames = wf.readframes(wf.getnframes())
537-
538-
# Calculate chunk size for ~100ms of audio
539-
chunk_duration = 0.1
540-
chunk_size = int(chunk_duration * sample_rate * num_channels * sample_width)
541-
542532
try:
533+
# Parse WAV file to get format information and calculate header size
534+
with wave.open(io.BytesIO(wav_audio), 'rb') as wf:
535+
sample_rate = wf.getframerate()
536+
num_channels = wf.getnchannels()
537+
sample_width = wf.getsampwidth()
538+
num_frames = wf.getnframes()
539+
540+
frames_size = num_frames * num_channels * sample_width
541+
header_size = len(wav_audio) - frames_size
542+
543+
header = wav_audio[:header_size]
544+
frames = wav_audio[header_size:]
545+
546+
header_base64 = base64.b64encode(header).decode('utf-8')
547+
header_message = {
548+
"message_type": "transcriptions_session_audio",
549+
"message_source": "audio_analytics_api",
550+
"session_id": session_id,
551+
"type": "input_audio",
552+
"data": header_base64
553+
}
554+
await websocket.send(json.dumps(header_message))
555+
556+
# Calculate chunk size for ~100ms of audio
557+
chunk_duration = 0.1
558+
chunk_size = int(chunk_duration * sample_rate * num_channels * sample_width)
559+
543560
for i in range(0, len(frames), chunk_size):
544561
iteration_start = time.perf_counter()
545562

@@ -548,16 +565,14 @@ async def _send_wav_audio(self, websocket: websockets.ClientConnection, session_
548565

549566
audio_chunk = frames[i:i + chunk_size]
550567
audio_base64 = base64.b64encode(audio_chunk).decode('utf-8')
551-
552-
message = {
568+
chunk_message = {
553569
"message_type": "transcriptions_session_audio",
554570
"message_source": "audio_analytics_api",
555571
"session_id": session_id,
556572
"type": "input_audio",
557573
"data": audio_base64
558574
}
559-
560-
await websocket.send(json.dumps(message))
575+
await websocket.send(json.dumps(chunk_message))
561576

562577
# Account for processing time to maintain real-time simulation
563578
elapsed = time.perf_counter() - iteration_start
@@ -567,6 +582,7 @@ async def _send_wav_audio(self, websocket: websockets.ClientConnection, session_
567582
except asyncio.CancelledError:
568583
logger.debug(f"Array audio sending cancelled for session {session_id}")
569584
raise
585+
570586
except Exception as e:
571587
logger.error(f"Error sending array audio for session {session_id}: {e}")
572588
raise
@@ -619,6 +635,7 @@ async def _receive_transcription(self, websocket: websockets.ClientConnection, s
619635
except asyncio.CancelledError:
620636
logger.debug(f"Receive task cancelled for session {session_id}")
621637
raise
638+
622639
except Exception as e:
623640
logger.error(f"Error receiving transcription for {session_id}: {e}")
624641
raise

0 commit comments

Comments
 (0)