-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
85 lines (75 loc) · 3.49 KB
/
app.py
File metadata and controls
85 lines (75 loc) · 3.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# app.py
# Streamlit front-end for VoiceGenAI: converts user text to speech via the
# Hugging Face Inference API (default) or an optional local TTS fallback.
import streamlit as st
from dotenv import load_dotenv
import os
from tts_utils import synthesize_and_save, synthesize_local_tts, OUTPUT_DIR
import base64
import time
# Load .env so HF_TOKEN (read below via os.getenv) is available.
load_dotenv()
# Page chrome; set_page_config must be the first Streamlit call in the script.
st.set_page_config(page_title="VoiceGenAI — Text to Speech", page_icon="🎤", layout="centered")
st.title("🎤 VoiceGenAI — Text → Speech")
st.caption("Fast demo: Hugging Face Inference API (default) with optional local TTS fallback")
# Sidebar: settings.
st.sidebar.header("Settings")

# Available models; the "tts_models/..." entry is served by the local TTS path.
_MODEL_OPTIONS = [
    "facebook/mms-tts-eng",
    "espnet/kan-bayashi-ljspeech",
    "tts_models/en/ljspeech/tacotron2-DDC (local optional)",
]
model_choice = st.sidebar.selectbox(
    "TTS Model (Hugging Face)",
    options=_MODEL_OPTIONS,
    index=0,
)
use_local = model_choice.startswith("tts_models")

# Token is optional but strongly recommended for the hosted Inference API.
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    st.sidebar.warning("HF_TOKEN not found in environment. Put your token in .env or export HF_TOKEN.")

st.sidebar.markdown("**Output options**")
format_choice = st.sidebar.selectbox("Output format (what to save)", ["wav", "mp3"])
st.sidebar.write("Generated files saved to:", OUTPUT_DIR)
# Main UI: text entry plus the synthesize action.
col1, col2 = st.columns([3, 1])
with col1:
    # Pre-fill with a preset (set by the buttons in col2) when one is loaded.
    # Previously the preset was only displayed and never fed into the text box.
    default_text = st.session_state.get(
        'preset',
        "Welcome to VoiceGenAI — convert text to realistic speech.",
    )
    text_input = st.text_area("Enter text to synthesize", value=default_text, height=180)
    if st.button("Synthesize"):
        if not text_input.strip():
            st.error("Please enter some text.")
        else:
            with st.spinner("Synthesizing..."):
                try:
                    if use_local:
                        # Local TTS path; the helper chooses the output filename.
                        out_fp = synthesize_local_tts(text_input)
                    else:
                        # HF Inference API path with a timestamped output name.
                        fname = f"voicegen_{int(time.time())}.wav"
                        out_fp = synthesize_and_save(text=text_input, model=model_choice, out_filename=fname)
                    st.success(f"Saved ▶ {out_fp}")
                    # Read once and close the handle (the original leaked it).
                    with open(out_fp, "rb") as fh:
                        audio_bytes = fh.read()
                    # Inline audio player.
                    st.audio(audio_bytes)
                    # Download link; derive the MIME subtype from the file's real
                    # extension instead of always claiming audio/wav.
                    ext = os.path.splitext(out_fp)[1].lstrip(".").lower() or "wav"
                    b64 = base64.b64encode(audio_bytes).decode()
                    st.markdown(f"[Download audio file](data:audio/{ext};base64,{b64})", unsafe_allow_html=True)
                except Exception as e:
                    # Demo app: surface the full traceback in the UI.
                    st.exception(e)
with col2:
    st.header("Quick presets")
    # Each button drops canned text into session state for later use.
    if st.button("Short Greeting"):
        st.session_state['preset'] = "Hi, this is VoiceGenAI. Have a great day!"
    if st.button("Technical Demo"):
        st.session_state['preset'] = "This is a demo of text-to-speech synthesis for developer workflows."
    # Show whichever preset (if any) is currently loaded.
    loaded_preset = st.session_state.get('preset')
    if loaded_preset is not None:
        st.write("Loaded preset:")
        st.write(loaded_preset)
        st.write("Click Synthesize to generate audio.")
st.markdown("---")
st.info("Notes: For highest quality voices across languages, use the Hugging Face Inference API with a token. Local TTS (coqui) is available as an option but requires installing TTS and model download.")

# Listing of previously generated audio files, newest first (top 10 by mtime).
st.markdown("## Generated files")
import glob
files = sorted(glob.glob(os.path.join(OUTPUT_DIR, "*")), key=os.path.getmtime, reverse=True)[:10]
for f in files:
    st.write(f)
    with st.expander("Play & download"):
        # Read once and close the handle (the original leaked one per file).
        with open(f, "rb") as fh:
            audio_bytes = fh.read()
        st.audio(audio_bytes)
        # Derive the MIME subtype from the file's real extension instead of
        # labelling every file audio/wav.
        ext = os.path.splitext(f)[1].lstrip(".").lower() or "wav"
        b64 = base64.b64encode(audio_bytes).decode()
        st.markdown(f"[Download](data:audio/{ext};base64,{b64})", unsafe_allow_html=True)