Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@
MAX_RETRY_DELAY = 30.0
INITIAL_RETRY_DELAY = 1.0

_DEFAULT_SPEECH_VOLUME_THRESHOLD = 0.001


@dataclass
class _PersonaplexOptions:
Expand All @@ -51,6 +53,7 @@ class _PersonaplexOptions:
text_prompt: str
seed: int | None
silence_threshold_ms: int
speech_volume_threshold: float
use_ssl: bool = False


Expand Down Expand Up @@ -89,22 +92,25 @@ def __init__(
text_prompt: str = "You are a helpful assistant.",
seed: int | None = None,
silence_threshold_ms: int = DEFAULT_SILENCE_THRESHOLD_MS,
speech_volume_threshold: float = _DEFAULT_SPEECH_VOLUME_THRESHOLD,
http_session: aiohttp.ClientSession | None = None,
) -> None:
"""Initialize the PersonaPlex RealtimeModel.

Args:
base_url: WebSocket URL of the PersonaPlex server
base_url (str): WebSocket URL of the PersonaPlex server
(e.g. "ws://localhost:8998"). If not set, reads from
PERSONAPLEX_URL env var. Defaults to "ws://localhost:8998".
voice: Voice prompt to use. One of the 18 available voices
(e.g. "NATF2", "NATM0", "VARF1").
text_prompt: System instruction / persona description for
the model. Set at connection time.
seed: Optional seed for reproducible generation.
silence_threshold_ms: Duration of silence (no audio from server)
before finalizing a generation. Default 500ms.
http_session: Optional aiohttp session to reuse.
voice (str): Voice prompt to use. One of the 18 available voices
(e.g. "NATF2", "NATM0", "VARF1"). Defaults to "NATF2".
text_prompt (str): System instruction / persona description for
the model. Set at connection time. Defaults to "You are a helpful assistant."
seed (int | None): Optional seed for reproducible generation.
silence_threshold_ms (int): Duration of silence (no audio from server)
before finalizing a generation. Defaults to 500ms.
speech_volume_threshold (float): Peak volume (0.0–1.0) below which an
audio frame is treated as a filler frame and ignored. Defaults to 0.001.
http_session (aiohttp.ClientSession | None): Optional aiohttp session to reuse.
"""
super().__init__(
capabilities=llm.RealtimeCapabilities(
Expand All @@ -131,6 +137,7 @@ def __init__(
text_prompt=text_prompt,
seed=seed,
silence_threshold_ms=silence_threshold_ms,
speech_volume_threshold=speech_volume_threshold,
use_ssl=use_ssl,
)

Expand Down Expand Up @@ -429,7 +436,6 @@ async def _send_task(self, ws_conn: aiohttp.ClientWebSocketResponse) -> None:
# Queued frames are sent immediately — they're the first audio the
# server's recv_loop will see.
await self._handshake_event.wait()

async for msg in self._msg_ch:
if self._session_should_close.is_set():
break
Expand Down Expand Up @@ -531,6 +537,10 @@ def _handle_audio_data(self, opus_payload: bytes) -> None:
if pcm_float is None or len(pcm_float) == 0:
return

peak = float(np.abs(pcm_float).max())
if peak < self._opts.speech_volume_threshold:
return
Comment on lines +540 to +542
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How big is this frame? Don't we need a moving average?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They're streaming 500ms by 500ms?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh the 500ms is the timeout, it streams normally

i think the frame is 80 ms. i don't think we need a moving average if the frames are consistently almost silent

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but isn't 80ms too short? what if it's just a pause between two words?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

when i tested it, pauses between words and sentences didn't trigger the timeout, should we increase the timeout to be safe?

one thing to note is that for interruptions, the model replies so quickly that the interruption is included in the original turn, though i don't see a way around this right now

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes sense! The single-frame peak check works because the silence timeout is the real guard — you'd need ~10 consecutive filler frames (~800ms) before EOT triggers, so a brief pause between words won't cause a false positive. Increasing the default timeout to 800ms sounds like a good safety margin to me.


# Convert float32 to int16 PCM
pcm_int16 = np.clip(pcm_float * 32768.0, -32768, 32767).astype(np.int16)
pcm_bytes = pcm_int16.tobytes()
Expand All @@ -542,7 +552,7 @@ def _handle_audio_data(self, opus_payload: bytes) -> None:
gen = self._current_generation
assert gen is not None

if gen._first_token_timestamp is None and len(pcm_bytes) > 0:
if gen._first_token_timestamp is None:
gen._first_token_timestamp = time.time()

frame = rtc.AudioFrame(
Expand All @@ -554,7 +564,6 @@ def _handle_audio_data(self, opus_payload: bytes) -> None:
with contextlib.suppress(utils.aio.channel.ChanClosed):
gen.audio_ch.send_nowait(frame)

# Reset silence timer on every audio frame
self._reset_silence_timer()

except Exception as e:
Expand Down Expand Up @@ -716,7 +725,6 @@ def _cancel_silence_timer(self) -> None:

def _on_silence_timeout(self) -> None:
if self._current_generation and not self._current_generation._done:
logger.debug("Silence detected, finalizing generation")
self._finalize_generation(interrupted=False)

# -- Internal: audio resampling --
Expand Down
Loading