2 changes: 1 addition & 1 deletion ai/ai-endpoints/audio-virtual-assistant/README.md
@@ -6,7 +6,7 @@ This project illustrates how to put Automatic Speech Recognition (ASR), Large Lan

- create the `.env` file:
```
ASR_GRPC_ENDPOINT=nvr-asr-en-us.endpoints-grpc.kepler.ai.cloud.ovh.net:443
ASR_AI_ENDPOINT=https://whisper-large-v3.endpoints.kepler.ai.cloud.ovh.net/api/openai_compat/v1
TTS_GRPC_ENDPOINT=nvr-tts-en-us.endpoints-grpc.kepler.ai.cloud.ovh.net:443
LLM_AI_ENDPOINT=https://mixtral-8x7b-instruct-v01.endpoints.kepler.ai.cloud.ovh.net/api/openai_compat/v1
OVH_AI_ENDPOINTS_ACCESS_TOKEN=<ai-endpoints-api-token>
163 changes: 89 additions & 74 deletions ai/ai-endpoints/audio-virtual-assistant/audio-virtual-assistant-app.py
@@ -8,68 +8,94 @@

# access the environment variables from the .env file
load_dotenv()
ai_endpoint_token = os.getenv("OVH_AI_ENDPOINTS_ACCESS_TOKEN")

# automatic speech recognition - question transcription
def asr_transcription(question):

asr_service = riva.client.ASRService(

riva.client.Auth(uri=os.environ.get('ASR_GRPC_ENDPOINT'), use_ssl=True,
metadata_args=[["authorization", f"bearer {ai_endpoint_token}"]])
)

# set up config
asr_config = riva.client.RecognitionConfig(
language_code="en-US", # languages: en-US
max_alternatives=1,
enable_automatic_punctuation=True,
audio_channel_count = 1,
)

# get asr model response
response = asr_service.offline_recognize(question, asr_config)

return response.results[0].alternatives[0].transcript

# text to speech - answer synthesis
def tts_synthesis(response):

tts_service = riva.client.SpeechSynthesisService(

riva.client.Auth(uri=os.environ.get('TTS_GRPC_ENDPOINT'), use_ssl=True,
metadata_args=[["authorization", f"bearer {ai_endpoint_token}"]])
)

# set up config
sample_rate_hz = 48000
req = {
"language_code" : "en-US", # languages: en-US
"encoding" : riva.client.AudioEncoding.LINEAR_PCM ,
"sample_rate_hz" : sample_rate_hz, # sample rate: 48KHz audio
"voice_name" : "English-US.Female-1" # voices: `English-US.Female-1`, `English-US.Male-1`
}

# return response
req["text"] = response
response = tts_service.synthesize(**req)

return np.frombuffer(response.audio, dtype=np.int16), sample_rate_hz


ASR_AI_ENDPOINT = os.environ.get('ASR_AI_ENDPOINT')
TTS_GRPC_ENDPOINT = os.environ.get('TTS_GRPC_ENDPOINT')
LLM_AI_ENDPOINT = os.environ.get('LLM_AI_ENDPOINT')
OVH_AI_ENDPOINTS_ACCESS_TOKEN = os.environ.get('OVH_AI_ENDPOINTS_ACCESS_TOKEN')

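# chat completion client for the LLM endpoint (OpenAI-compatible API)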
llm_client = OpenAI(
base_url=LLM_AI_ENDPOINT,
api_key=OVH_AI_ENDPOINTS_ACCESS_TOKEN
)

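# Riva gRPC client for text-to-speech synthesis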
tts_client = riva.client.SpeechSynthesisService(
riva.client.Auth(
uri=TTS_GRPC_ENDPOINT,
use_ssl=True,
metadata_args=[["authorization", f"bearer {OVH_AI_ENDPOINTS_ACCESS_TOKEN}"]]
)
)

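# OpenAI-compatible client for the Whisper ASR endpoint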
asr_client = OpenAI(
base_url=ASR_AI_ENDPOINT,
api_key=OVH_AI_ENDPOINTS_ACCESS_TOKEN
)

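# automatic speech recognition - question transcription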
def asr_transcription(question, asr_client):
return asr_client.audio.transcriptions.create(
model="whisper-large-v3",
file=question
).text

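# large language model - answer generation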
def llm_answer(input, llm_client):
response = llm_client.chat.completions.create(
model="Mixtral-8x7B-Instruct-v0.1",
messages=input,
temperature=0,
max_tokens=1024,
)
msg = response.choices[0].message.content

return msg

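# text to speech - answer synthesis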
def tts_synthesis(response, tts_client):
# Split response into chunks of max 1000 characters
max_chunk_length = 1000
words = response.split()
chunks = []
current_chunk = ""

for word in words:
if len(current_chunk) + len(word) + 1 <= max_chunk_length:
current_chunk += " " + word if current_chunk else word
else:
chunks.append(current_chunk)
current_chunk = word
if current_chunk:
chunks.append(current_chunk)

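    # buffer for the concatenated audio, synthesized as 16 kHz LINEAR_PCM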
all_audio = np.array([], dtype=np.int16)
sample_rate_hz = 16000

# Process each chunk and concatenate the resulting audio
for text in chunks:
req = {
"language_code": "en-US",
"encoding": riva.client.AudioEncoding.LINEAR_PCM,
"sample_rate_hz": sample_rate_hz,
"voice_name": "English-US.Female-1",
"text": text.strip(),
}
synthesized = tts_client.synthesize(**req)
audio_segment = np.frombuffer(synthesized.audio, dtype=np.int16)
all_audio = np.concatenate((all_audio, audio_segment))

return all_audio, sample_rate_hz


# streamlit interface
with st.container():
st.title("💬 Audio Virtual Assistant Chatbot")

with st.container(height=600):
messages = st.container()

if "messages" not in st.session_state:
st.session_state["messages"] = [{"role": "system", "content":
"Hello, I'm AVA!", "avatar":"🤖"}]

st.session_state["messages"] = [{"role": "system", "content": "Hello, I'm AVA!", "avatar":"🤖"}]

for msg in st.session_state.messages:
messages.chat_message(msg["role"],
avatar=msg["avatar"]).write(msg["content"])
messages.chat_message(msg["role"], avatar=msg["avatar"]).write(msg["content"])

with st.container():

@@ -81,28 +107,17 @@ def tts_synthesis(response):
use_container_width=True,
key='recorder'
)

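    # transcribe the recorded audio into a text prompt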
if recording:
user_question = asr_transcription(recording['bytes'])
user_question = asr_transcription(recording['bytes'], asr_client)

if prompt := user_question:
client = OpenAI(base_url=os.getenv("LLM_AI_ENDPOINT"),
api_key=ai_endpoint_token)
st.session_state.messages.append({"role": "user", "content":
prompt, "avatar":"👤"})
st.session_state.messages.append({"role": "user", "content": prompt, "avatar":"👤"})
messages.chat_message("user", avatar="👤").write(prompt)
response = client.chat.completions.create(
model="Mixtral-8x7B-Instruct-v0.1",
messages=st.session_state.messages,
temperature=0,
max_tokens=1024,
)
msg = response.choices[0].message.content
st.session_state.messages.append({"role": "system", "content":
msg, "avatar": "🤖"})
msg = llm_answer(st.session_state.messages, llm_client)
st.session_state.messages.append({"role": "assistant", "content": msg, "avatar": "🤖"})
messages.chat_message("system", avatar="🤖").write(msg)

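            # synthesize the answer and play it back automatically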
if msg is not None:
audio_samples, sample_rate_hz = tts_synthesis(msg)
placeholder.audio(audio_samples,
sample_rate=sample_rate_hz, autoplay=True)
audio_samples, sample_rate_hz = tts_synthesis(msg, tts_client)
placeholder.audio(audio_samples, sample_rate=sample_rate_hz, autoplay=True)