diff --git a/ai/ai-endpoints/audio-virtual-assistant/README.md b/ai/ai-endpoints/audio-virtual-assistant/README.md
index 7948f1d..78d1bf4 100644
--- a/ai/ai-endpoints/audio-virtual-assistant/README.md
+++ b/ai/ai-endpoints/audio-virtual-assistant/README.md
@@ -6,7 +6,7 @@ This project illustrate how to put Automatic Speech Recognition (ASR), Large Lan
 - create the `.env` file:
 
 ```
-ASR_GRPC_ENDPOINT=nvr-asr-en-us.endpoints-grpc.kepler.ai.cloud.ovh.net:443
+ASR_AI_ENDPOINT=https://whisper-large-v3.endpoints.kepler.ai.cloud.ovh.net/api/openai_compat/v1
 TTS_GRPC_ENDPOINT=nvr-tts-en-us.endpoints-grpc.kepler.ai.cloud.ovh.net:443
 LLM_AI_ENDPOINT=https://mixtral-8x7b-instruct-v01.endpoints.kepler.ai.cloud.ovh.net/api/openai_compat/v1
 OVH_AI_ENDPOINTS_ACCESS_TOKEN=
diff --git a/ai/ai-endpoints/audio-virtual-assistant/audio-virtual-assistant-app.py b/ai/ai-endpoints/audio-virtual-assistant/audio-virtual-assistant-app.py
index eeb51b3..a4fdbb6 100644
--- a/ai/ai-endpoints/audio-virtual-assistant/audio-virtual-assistant-app.py
+++ b/ai/ai-endpoints/audio-virtual-assistant/audio-virtual-assistant-app.py
@@ -8,68 +8,94 @@
 # access the environment variables from the .env file
 load_dotenv()
 
-ai_endpoint_token = os.getenv("OVH_AI_ENDPOINTS_ACCESS_TOKEN")
-
-# automatic speech recognition - question transcription
-def asr_transcription(question):
-
-    asr_service = riva.client.ASRService(
-
-riva.client.Auth(uri=os.environ.get('ASR_GRPC_ENDPOINT'), use_ssl=True,
-metadata_args=[["authorization", f"bearer {ai_endpoint_token}"]])
-    )
-
-    # set up config
-    asr_config = riva.client.RecognitionConfig(
-        language_code="en-US", # languages: en-US
-        max_alternatives=1,
-        enable_automatic_punctuation=True,
-        audio_channel_count = 1,
-    )
-
-    # get asr model response
-    response = asr_service.offline_recognize(question, asr_config)
-
-    return response.results[0].alternatives[0].transcript
-
-# text to speech - answer synthesis
-def tts_synthesis(response):
-
-    tts_service = riva.client.SpeechSynthesisService(
-
-riva.client.Auth(uri=os.environ.get('TTS_GRPC_ENDPOINT'), use_ssl=True,
-metadata_args=[["authorization", f"bearer {ai_endpoint_token}"]])
-    )
-
-    # set up config
-    sample_rate_hz = 48000
-    req = {
-        "language_code" : "en-US", # languages: en-US
-        "encoding" : riva.client.AudioEncoding.LINEAR_PCM ,
-        "sample_rate_hz" : sample_rate_hz, # sample rate: 48KHz audio
-        "voice_name" : "English-US.Female-1" # voices: `English-US.Female-1`, `English-US.Male-1`
-    }
-
-    # return response
-    req["text"] = response
-    response = tts_service.synthesize(**req)
-
-    return np.frombuffer(response.audio, dtype=np.int16), sample_rate_hz
-
+
+ASR_AI_ENDPOINT = os.environ.get('ASR_AI_ENDPOINT')
+TTS_GRPC_ENDPOINT = os.environ.get('TTS_GRPC_ENDPOINT')
+LLM_AI_ENDPOINT = os.environ.get('LLM_AI_ENDPOINT')
+OVH_AI_ENDPOINTS_ACCESS_TOKEN = os.environ.get('OVH_AI_ENDPOINTS_ACCESS_TOKEN')
+
+llm_client = OpenAI(
+    base_url=LLM_AI_ENDPOINT,
+    api_key=OVH_AI_ENDPOINTS_ACCESS_TOKEN
+)
+
+tts_client = riva.client.SpeechSynthesisService(
+    riva.client.Auth(
+        uri=TTS_GRPC_ENDPOINT,
+        use_ssl=True,
+        metadata_args=[["authorization", f"bearer {OVH_AI_ENDPOINTS_ACCESS_TOKEN}"]]
+    )
+)
+
+asr_client = OpenAI(
+    base_url=ASR_AI_ENDPOINT,
+    api_key=OVH_AI_ENDPOINTS_ACCESS_TOKEN
+)
+
+def asr_transcription(question, asr_client):
+    return asr_client.audio.transcriptions.create(
+        model="whisper-large-v3",
+        file=question
+    ).text
+
+def llm_answer(input, llm_client):
+    response = llm_client.chat.completions.create(
+        model="Mixtral-8x7B-Instruct-v0.1",
+        messages=input,
+        temperature=0,
+        max_tokens=1024,
+    )
+    msg = response.choices[0].message.content
+
+    return msg
+
+def tts_synthesis(response, tts_client):
+    # Split response into chunks of max 1000 characters
+    max_chunk_length = 1000
+    words = response.split()
+    chunks = []
+    current_chunk = ""
+
+    for word in words:
+        if len(current_chunk) + len(word) + 1 <= max_chunk_length:
+            current_chunk += " " + word if current_chunk else word
+        else:
+            chunks.append(current_chunk)
+            current_chunk = word
+    if current_chunk:
+        chunks.append(current_chunk)
+
+    all_audio = np.array([], dtype=np.int16)
+    sample_rate_hz = 16000
+
+    # Process each chunk and concatenate the resulting audio
+    for text in chunks:
+        req = {
+            "language_code": "en-US",
+            "encoding": riva.client.AudioEncoding.LINEAR_PCM,
+            "sample_rate_hz": sample_rate_hz,
+            "voice_name": "English-US.Female-1",
+            "text": text.strip(),
+        }
+        synthesized = tts_client.synthesize(**req)
+        audio_segment = np.frombuffer(synthesized.audio, dtype=np.int16)
+        all_audio = np.concatenate((all_audio, audio_segment))
+
+    return all_audio, sample_rate_hz
+
+
 # streamlit interface
 with st.container():
     st.title("💬 Audio Virtual Assistant Chatbot")
-    
+
     with st.container(height=600):
         messages = st.container()
-        
+
         if "messages" not in st.session_state:
-            st.session_state["messages"] = [{"role": "system", "content":
-"Hello, I'm AVA!", "avatar":"🤖"}]
-            
+            st.session_state["messages"] = [{"role": "system", "content": "Hello, I'm AVA!", "avatar":"🤖"}]
+
         for msg in st.session_state.messages:
-            messages.chat_message(msg["role"],
-avatar=msg["avatar"]).write(msg["content"])
+            messages.chat_message(msg["role"], avatar=msg["avatar"]).write(msg["content"])
 
 with st.container():
@@ -81,28 +107,17 @@ def tts_synthesis(response):
             use_container_width=True,
             key='recorder'
         )
-        
+
         if recording:
-            user_question = asr_transcription(recording['bytes'])
-            
+            user_question = asr_transcription(recording['bytes'], asr_client)
+
             if prompt := user_question:
-                client = OpenAI(base_url=os.getenv("LLM_AI_ENDPOINT"),
-api_key=ai_endpoint_token)
-                st.session_state.messages.append({"role": "user", "content":
-prompt, "avatar":"👤"})
+                st.session_state.messages.append({"role": "user", "content": prompt, "avatar":"👤"})
                 messages.chat_message("user", avatar="👤").write(prompt)
-                response = client.chat.completions.create(
-                    model="Mixtral-8x7B-Instruct-v0.1",
-                    messages=st.session_state.messages,
-                    temperature=0,
-                    max_tokens=1024,
-                )
-                msg = response.choices[0].message.content
-                st.session_state.messages.append({"role": "system", "content":
-msg, "avatar": "🤖"})
+                msg = llm_answer(st.session_state.messages, llm_client)
+                st.session_state.messages.append({"role": "assistant", "content": msg, "avatar": "🤖"})
                 messages.chat_message("system", avatar="🤖").write(msg)
 
                 if msg is not None:
-                    audio_samples, sample_rate_hz = tts_synthesis(msg)
-                    placeholder.audio(audio_samples,
-sample_rate=sample_rate_hz, autoplay=True)
+                    audio_samples, sample_rate_hz = tts_synthesis(msg, tts_client)
+                    placeholder.audio(audio_samples, sample_rate=sample_rate_hz, autoplay=True)
\ No newline at end of file
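
Note (outside the patch): a minimal standalone sketch of the call pattern this change introduces, with Whisper reached through the OpenAI-compatible ASR endpoint and Mixtral through the chat endpoint. It assumes the `.env` values from the README above, `openai >= 1.x`, `python-dotenv`, and a hypothetical local recording `question.wav`; it is an illustration, not part of the repository.

```python
# Standalone sketch (assumption: not part of the repo) of the ASR -> LLM flow.
import os

from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

token = os.environ["OVH_AI_ENDPOINTS_ACCESS_TOKEN"]
asr_client = OpenAI(base_url=os.environ["ASR_AI_ENDPOINT"], api_key=token)
llm_client = OpenAI(base_url=os.environ["LLM_AI_ENDPOINT"], api_key=token)

# Speech to text via the OpenAI-compatible transcription route.
# "question.wav" is a hypothetical sample recording.
with open("question.wav", "rb") as audio_file:
    transcript = asr_client.audio.transcriptions.create(
        model="whisper-large-v3",
        file=audio_file,
    ).text

# Text answer from the Mixtral chat completion endpoint.
answer = llm_client.chat.completions.create(
    model="Mixtral-8x7B-Instruct-v0.1",
    messages=[{"role": "user", "content": transcript}],
    temperature=0,
    max_tokens=1024,
)
print(answer.choices[0].message.content)
```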