diff --git a/src/text2speech/Whisper_TTS/Dockerfile b/src/text2speech/Whisper_TTS/Dockerfile new file mode 100644 index 0000000..633ebc1 --- /dev/null +++ b/src/text2speech/Whisper_TTS/Dockerfile @@ -0,0 +1,13 @@ +FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime + +WORKDIR /app + +COPY requirements.txt /app/ + +RUN pip install --no-cache-dir -r requirements.txt + +COPY . /app/ + +EXPOSE 8000 + +CMD ["python", "api.py"] diff --git a/src/text2speech/Whisper_TTS/README.md b/src/text2speech/Whisper_TTS/README.md new file mode 100644 index 0000000..7090e37 --- /dev/null +++ b/src/text2speech/Whisper_TTS/README.md @@ -0,0 +1,107 @@ +This project uses WhisperSpeech to convert text to speech, with an option for voice cloning. +reference - https://github.com/collabora/WhisperSpeech + +## Endpoints + +### Text-to-Speech + +**Endpoint:** `/tts` +**Method:** `POST` +**Description:** Converts text to speech and returns an audio file. + +**Request Body:** +```json +{ + "text": "Your text here", + "language": "en", + "cps": 15, + "use_voice_cloning": false +} +``` + +- `text`: The text to be converted to speech. +- `language`: The language for the speech output (default: "en"). +- `cps`: Characters per second for speech synthesis (default: 10.5). +- `use_voice_cloning`: Optional flag to enable voice cloning (default: false). + +**Example Request:** + +Basic TTS: +```bash +curl -X POST http://localhost:8000/tts \ + -H "Content-Type: application/json" \ + -d '{ + "text": "Hello, world! 
This is a test of the text-to-speech system.", + "language": "en", + "cps": 15 + }' \ + --output output.wav +``` + +Text-to-Speech with Voice Cloning: +```bash +curl -X POST http://localhost:8000/tts \ + -H "Content-Type: application/json" \ + -d '{ + "text": "This is a voice cloning test.", + "language": "en", + "cps": 15, + "use_voice_cloning": true + }' \ + --output cloned_output.wav +``` + +**Notes:** + +- For voice cloning to work, ensure that `reference_audio.wav` is present in the directory the server is started from (normally the same directory as the server script). + +## Docker Deployment + +1. **Build the Docker Image:** + ```bash + docker build -t quart-tts-gpu-app . + ``` + +2. **Run the Docker Container:** + ```bash + docker run --gpus all -p 8000:8000 quart-tts-gpu-app + ``` + +## Setting Up Without Docker + +1. **Clone the Repository:** + +2. **Set Up a Virtual Environment (Optional but recommended):** + + ```bash + python -m venv venv + source venv/bin/activate # On Windows, use `venv\Scripts\activate` + ``` + +3. **Install Dependencies:** + + Ensure `requirements.txt` is present in the directory, then run: + ```bash + pip install --no-cache-dir -r requirements.txt + ``` + +4. **Prepare the Reference Audio (For Voice Cloning):** + + Place `reference_audio.wav` in the same directory as your `api.py` script if you plan to use voice cloning. + +5. **Run the Quart Application:** + + ```bash + python api.py + ``` + + The service will be accessible at `http://localhost:8000`. + +6. 
**Example Output:**
+   - output with cloning
+     https://github.com/user-attachments/assets/fa4bc138-e17a-4e1c-a88a-8dd5f37e0db4
+
+
+   - output without cloning
+     https://github.com/user-attachments/assets/1bb4ee27-3730-4cca-85f0-f72629d7c640
+
diff --git a/src/text2speech/Whisper_TTS/api.py b/src/text2speech/Whisper_TTS/api.py
new file mode 100644
index 0000000..4efeac8
--- /dev/null
+++ b/src/text2speech/Whisper_TTS/api.py
@@ -0,0 +1,78 @@
+"""Quart HTTP service exposing WhisperSpeech text-to-speech at ``POST /tts``."""
+import logging
+import os
+
+from quart import Quart, request, send_file
+
+from model import Model
+from request import ModelRequest
+
+app = Quart(__name__)
+# Populated by startup() so the pipeline is loaded once, when serving begins.
+model = None
+
+logging.basicConfig(level=logging.DEBUG)
+
+# Speaker reference clip for voice cloning, looked up relative to the working directory.
+REFERENCE_AUDIO_PATH = 'reference_audio.wav'
+
+
+@app.before_serving
+async def startup():
+    """Create the shared Model instance before the first request is handled."""
+    global model
+    app.logger.info("Initializing model...")
+    model = Model()
+    app.logger.info("Model initialized successfully")
+
+
+@app.route('/tts', methods=['POST'])
+async def text_to_speech():
+    """Synthesize speech for the posted JSON payload and return the audio file.
+
+    Returns 400 with a JSON error when the body is not a JSON object, when
+    ``text`` is missing/blank, or when the payload fails to parse; returns 500
+    with a JSON error when synthesis fails.
+    """
+    app.logger.info("Received request for text-to-speech")
+
+    # silent=True makes a missing or malformed JSON body come back as None
+    # instead of raising, so it can be reported as a client error.
+    data = await request.get_json(silent=True)
+    if not isinstance(data, dict):
+        return {"error": "Request body must be a JSON object"}, 400
+    app.logger.debug(f"Received data: {data}")
+
+    text = data.get('text')
+    if not isinstance(text, str) or not text.strip():
+        return {"error": "'text' must be a non-empty string"}, 400
+
+    try:
+        req = ModelRequest(data)
+    except (TypeError, ValueError) as e:
+        # A payload that cannot be parsed is the caller's mistake, not ours.
+        return {"error": str(e)}, 400
+
+    try:
+        if data.get('use_voice_cloning', False):
+            app.logger.info("Voice cloning requested")
+            if not os.path.exists(REFERENCE_AUDIO_PATH):
+                raise FileNotFoundError("Reference audio file not found in the current directory")
+            with open(REFERENCE_AUDIO_PATH, 'rb') as voice_cloning_audio:
+                result = await model.inference(req, voice_cloning_audio)
+        else:
+            result = await model.inference(req)
+
+        app.logger.info("Text-to-speech process completed successfully")
+        return await send_file(result['output_file'])
+    except Exception as e:
+        # Top-level boundary: keep the traceback in the log, answer with JSON.
+        app.logger.exception("Error in text_to_speech: %s", e)
+        return {"error": str(e)}, 500
+
+
+if __name__ == "__main__":
+    # Debug mode is opt-in via the TTS_DEBUG environment variable: the server
+    # listens on all interfaces, so it should not run in debug mode by default.
+    debug_enabled = os.environ.get('TTS_DEBUG', '').lower() in ('1', 'true', 'yes')
+    app.run(host='0.0.0.0', port=8000, debug=debug_enabled)
diff --git a/src/text2speech/Whisper_TTS/audio/output_with_cloning.wav 
b/src/text2speech/Whisper_TTS/audio/output_with_cloning.wav
new file mode 100644
index 0000000..d35778e
Binary files /dev/null and b/src/text2speech/Whisper_TTS/audio/output_with_cloning.wav differ
diff --git a/src/text2speech/Whisper_TTS/audio/output_without_cloning.wav b/src/text2speech/Whisper_TTS/audio/output_without_cloning.wav
new file mode 100644
index 0000000..33f3e37
Binary files /dev/null and b/src/text2speech/Whisper_TTS/audio/output_without_cloning.wav differ
diff --git a/src/text2speech/Whisper_TTS/model.py b/src/text2speech/Whisper_TTS/model.py
new file mode 100644
index 0000000..5965811
--- /dev/null
+++ b/src/text2speech/Whisper_TTS/model.py
@@ -0,0 +1,84 @@
+"""WhisperSpeech pipeline wrapper that renders text to WAV files on disk."""
+import logging
+import os
+import uuid
+
+import torchaudio
+from whisperspeech.pipeline import Pipeline
+
+from request import ModelRequest
+
+logger = logging.getLogger(__name__)
+
+
+class Model:
+    """Text-to-speech model backed by a WhisperSpeech ``Pipeline``."""
+
+    # Speaker reference clip handed to the pipeline when cloning is requested.
+    REFERENCE_AUDIO_PATH = 'reference_audio.wav'
+    # Directory (relative to the working directory) that receives generated files.
+    OUTPUT_DIR = 'audio'
+
+    def __init__(self):
+        self.pipe = Pipeline(s2a_ref='collabora/whisperspeech:s2a-q4-tiny-en+pl.model')
+
+    def generate_uuid(self):
+        """Return a fresh random UUID used to keep output file names unique."""
+        return uuid.uuid4()
+
+    async def inference(self, request: ModelRequest, voice_cloning_audio=None):
+        """Synthesize ``request.text`` and write the result under ``audio/``.
+
+        Args:
+            request: Parsed payload providing text, language, cps and the
+                ``use_voice_cloning`` flag.
+            voice_cloning_audio: Only its truthiness is used, as confirmation
+                that a reference clip is available; the clip itself is passed
+                to the pipeline by path.
+
+        Returns:
+            A dict ``{"output_file": path}`` naming the WAV file written.
+        """
+        reference_audio_path = None
+        if request.use_voice_cloning and voice_cloning_audio:
+            reference_audio_path = self.REFERENCE_AUDIO_PATH
+        audio = self.text_to_speech(
+            request.text,
+            language=request.language,
+            cps=request.cps,
+            reference_audio_path=reference_audio_path,
+        )
+
+        prefix = "output_cloned" if reference_audio_path else "output"
+        output_file = f"{self.OUTPUT_DIR}/{prefix}_{self.generate_uuid()}.wav"
+        self.save_audio(audio, output_file)
+        return {"output_file": output_file}
+
+    def text_to_speech(self, text, language='en', cps=10.5, reference_audio_path=None):
+        """Generate audio for ``text`` and return it as a CPU tensor.
+
+        ``reference_audio_path``, when given, is forwarded to the pipeline as
+        the ``speaker`` argument. A 1-D result gets a leading channel axis.
+        """
+        generate_kwargs = {'lang': language, 'cps': cps}
+        if reference_audio_path:
+            generate_kwargs['speaker'] = reference_audio_path
+        audio = self.pipe.generate(text, **generate_kwargs)
+
+        audio_cpu = audio.cpu().squeeze()
+        if audio_cpu.dim() == 1:
+            # torchaudio.save needs a 2-D tensor, so restore the channel axis.
+            audio_cpu = audio_cpu.unsqueeze(0)
+        return audio_cpu
+
+    def save_audio(self, audio, output_file, sample_rate=24000):
+        """Write ``audio`` to ``output_file`` as a ``PCM_F``-encoded file.
+
+        The parent directory is created first so a missing output directory
+        does not make the save fail.
+        """
+        parent_dir = os.path.dirname(output_file)
+        if parent_dir:
+            os.makedirs(parent_dir, exist_ok=True)
+        torchaudio.save(output_file, audio, sample_rate=sample_rate, encoding="PCM_F")
+        logger.info("Generated audio file: %s", output_file)
diff --git a/src/text2speech/Whisper_TTS/reference_audio.wav b/src/text2speech/Whisper_TTS/reference_audio.wav
new file mode 100644
index 0000000..6ff5296
Binary files /dev/null and b/src/text2speech/Whisper_TTS/reference_audio.wav differ
diff --git a/src/text2speech/Whisper_TTS/request.py b/src/text2speech/Whisper_TTS/request.py
new file mode 100644
index 0000000..252fa1e
--- /dev/null
+++ b/src/text2speech/Whisper_TTS/request.py
@@ -0,0 +1,41 @@
+"""Request payload model for the TTS service."""
+
+
+class ModelRequest:
+    """Synthesis parameters extracted from a ``/tts`` JSON payload.
+
+    Missing or null fields fall back to defaults: empty text, language
+    ``'en'``, ``cps`` 10.5 and voice cloning disabled.
+    """
+
+    DEFAULT_LANGUAGE = 'en'
+    DEFAULT_CPS = 10.5
+
+    def __init__(self, data):
+        """Populate the request from ``data``.
+
+        Args:
+            data: Mapping with optional ``text``, ``language``, ``cps`` and
+                ``use_voice_cloning`` keys; ``None`` is treated as empty.
+
+        Raises:
+            ValueError: If ``cps`` is not a positive number.
+        """
+        data = data or {}
+        self.text = data.get('text') or ''
+        self.language = data.get('language') or self.DEFAULT_LANGUAGE
+        self.cps = self._parse_cps(data.get('cps'))
+        self.use_voice_cloning = bool(data.get('use_voice_cloning', False))
+
+    @classmethod
+    def _parse_cps(cls, raw_cps):
+        """Coerce ``raw_cps`` to a positive float, defaulting when it is ``None``."""
+        if raw_cps is None:
+            return cls.DEFAULT_CPS
+        try:
+            cps = float(raw_cps)
+        except (TypeError, ValueError):
+            raise ValueError(f"cps must be a number, got {raw_cps!r}") from None
+        if not cps > 0:  # written this way so NaN is rejected as well
+            raise ValueError(f"cps must be positive, got {raw_cps!r}")
+        return cps
diff --git a/src/text2speech/Whisper_TTS/requirements.txt b/src/text2speech/Whisper_TTS/requirements.txt
new file mode 100644
index 0000000..2921823
--- /dev/null
+++ b/src/text2speech/Whisper_TTS/requirements.txt
@@ -0,0 +1,5 @@
+quart
+torch
+torchaudio
+whisperspeech
+webdataset
\ No newline at end of file