2 changes: 1 addition & 1 deletion ai/ai-endpoints/audio-virtual-assistant/README.md
@@ -6,7 +6,7 @@ This project illustrates how to put Automatic Speech Recognition (ASR), Large Lan

- create the `.env` file:
```
ASR_GRPC_ENDPOINT=nvr-asr-en-us.endpoints-grpc.kepler.ai.cloud.ovh.net:443
ASR_AI_ENDPOINT=https://whisper-large-v3.endpoints.kepler.ai.cloud.ovh.net/api/openai_compat/v1
TTS_GRPC_ENDPOINT=nvr-tts-en-us.endpoints-grpc.kepler.ai.cloud.ovh.net:443
LLM_AI_ENDPOINT=https://mixtral-8x7b-instruct-v01.endpoints.kepler.ai.cloud.ovh.net/api/openai_compat/v1
OVH_AI_ENDPOINTS_ACCESS_TOKEN=<ai-endpoints-api-token>
163 changes: 89 additions & 74 deletions ai/ai-endpoints/audio-virtual-assistant/audio-virtual-assistant-app.py
@@ -8,68 +8,94 @@

# access the environment variables from the .env file
load_dotenv()
ai_endpoint_token = os.getenv("OVH_AI_ENDPOINTS_ACCESS_TOKEN")

# automatic speech recognition - question transcription
def asr_transcription(question):

asr_service = riva.client.ASRService(

riva.client.Auth(uri=os.environ.get('ASR_GRPC_ENDPOINT'), use_ssl=True,
metadata_args=[["authorization", f"bearer {ai_endpoint_token}"]])
)

# set up config
asr_config = riva.client.RecognitionConfig(
language_code="en-US", # languages: en-US
max_alternatives=1,
enable_automatic_punctuation=True,
audio_channel_count = 1,
)

# get asr model response
response = asr_service.offline_recognize(question, asr_config)

return response.results[0].alternatives[0].transcript

# text to speech - answer synthesis
def tts_synthesis(response):

tts_service = riva.client.SpeechSynthesisService(

riva.client.Auth(uri=os.environ.get('TTS_GRPC_ENDPOINT'), use_ssl=True,
metadata_args=[["authorization", f"bearer {ai_endpoint_token}"]])
)

# set up config
sample_rate_hz = 48000
req = {
"language_code" : "en-US", # languages: en-US
"encoding" : riva.client.AudioEncoding.LINEAR_PCM ,
"sample_rate_hz" : sample_rate_hz, # sample rate: 48KHz audio
"voice_name" : "English-US.Female-1" # voices: `English-US.Female-1`, `English-US.Male-1`
}

# return response
req["text"] = response
response = tts_service.synthesize(**req)

return np.frombuffer(response.audio, dtype=np.int16), sample_rate_hz


ASR_AI_ENDPOINT = os.environ.get('ASR_AI_ENDPOINT')
TTS_GRPC_ENDPOINT = os.environ.get('TTS_GRPC_ENDPOINT')
LLM_AI_ENDPOINT = os.environ.get('LLM_AI_ENDPOINT')
OVH_AI_ENDPOINTS_ACCESS_TOKEN = os.environ.get('OVH_AI_ENDPOINTS_ACCESS_TOKEN')

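# chat completion client for the LLM endpoint (OpenAI-compatible API)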
llm_client = OpenAI(
base_url=LLM_AI_ENDPOINT,
api_key=OVH_AI_ENDPOINTS_ACCESS_TOKEN
)

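# Riva gRPC client for text-to-speech synthesis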
tts_client = riva.client.SpeechSynthesisService(
riva.client.Auth(
uri=TTS_GRPC_ENDPOINT,
use_ssl=True,
metadata_args=[["authorization", f"bearer {OVH_AI_ENDPOINTS_ACCESS_TOKEN}"]]
)
)

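# OpenAI-compatible client for the Whisper ASR endpoint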
asr_client = OpenAI(
base_url=ASR_AI_ENDPOINT,
api_key=OVH_AI_ENDPOINTS_ACCESS_TOKEN
)

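# automatic speech recognition - question transcription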
def asr_transcription(question, asr_client):
return asr_client.audio.transcriptions.create(
model="whisper-large-v3",
file=question
).text

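# large language model - answer generation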
def llm_answer(input, llm_client):
response = llm_client.chat.completions.create(
model="Mixtral-8x7B-Instruct-v0.1",
messages=input,
temperature=0,
max_tokens=1024,
)
msg = response.choices[0].message.content

return msg

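# text to speech - answer synthesis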
def tts_synthesis(response, tts_client):
# Split response into chunks of max 1000 characters
max_chunk_length = 1000
words = response.split()
chunks = []
current_chunk = ""

for word in words:
if len(current_chunk) + len(word) + 1 <= max_chunk_length:
current_chunk += " " + word if current_chunk else word
else:
chunks.append(current_chunk)
current_chunk = word
if current_chunk:
chunks.append(current_chunk)

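    # buffer for the concatenated audio, synthesized as 16 kHz LINEAR_PCM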
all_audio = np.array([], dtype=np.int16)
sample_rate_hz = 16000

# Process each chunk and concatenate the resulting audio
for text in chunks:
req = {
"language_code": "en-US",
"encoding": riva.client.AudioEncoding.LINEAR_PCM,
"sample_rate_hz": sample_rate_hz,
"voice_name": "English-US.Female-1",
"text": text.strip(),
}
synthesized = tts_client.synthesize(**req)
audio_segment = np.frombuffer(synthesized.audio, dtype=np.int16)
all_audio = np.concatenate((all_audio, audio_segment))

return all_audio, sample_rate_hz


# streamlit interface
with st.container():
st.title("💬 Audio Virtual Assistant Chatbot")

with st.container(height=600):
messages = st.container()

if "messages" not in st.session_state:
st.session_state["messages"] = [{"role": "system", "content":
"Hello, I'm AVA!", "avatar":"🤖"}]

st.session_state["messages"] = [{"role": "system", "content": "Hello, I'm AVA!", "avatar":"🤖"}]

for msg in st.session_state.messages:
messages.chat_message(msg["role"],
avatar=msg["avatar"]).write(msg["content"])
messages.chat_message(msg["role"], avatar=msg["avatar"]).write(msg["content"])

with st.container():

@@ -81,28 +107,17 @@ def tts_synthesis(response):
use_container_width=True,
key='recorder'
)

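    # transcribe the recorded audio into a text prompt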
if recording:
user_question = asr_transcription(recording['bytes'])
user_question = asr_transcription(recording['bytes'], asr_client)

if prompt := user_question:
client = OpenAI(base_url=os.getenv("LLM_AI_ENDPOINT"),
api_key=ai_endpoint_token)
st.session_state.messages.append({"role": "user", "content":
prompt, "avatar":"👤"})
st.session_state.messages.append({"role": "user", "content": prompt, "avatar":"👤"})
messages.chat_message("user", avatar="👤").write(prompt)
response = client.chat.completions.create(
model="Mixtral-8x7B-Instruct-v0.1",
messages=st.session_state.messages,
temperature=0,
max_tokens=1024,
)
msg = response.choices[0].message.content
st.session_state.messages.append({"role": "system", "content":
msg, "avatar": "🤖"})
msg = llm_answer(st.session_state.messages, llm_client)
st.session_state.messages.append({"role": "assistant", "content": msg, "avatar": "🤖"})
messages.chat_message("system", avatar="🤖").write(msg)

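            # synthesize the answer and play it back automatically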
if msg is not None:
audio_samples, sample_rate_hz = tts_synthesis(msg)
placeholder.audio(audio_samples,
sample_rate=sample_rate_hz, autoplay=True)
audio_samples, sample_rate_hz = tts_synthesis(msg, tts_client)
placeholder.audio(audio_samples, sample_rate=sample_rate_hz, autoplay=True)