From 51f591a7e5f9e67b055093254e97dd0a7bf95fb8 Mon Sep 17 00:00:00 2001 From: Naman Thapliyal Date: Sat, 19 Jul 2025 03:34:50 +0530 Subject: [PATCH 1/9] Fastapi app --- Dockerfile | 44 ++++++++++++++++++++ fastapi_app.py | 106 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 150 insertions(+) create mode 100644 Dockerfile create mode 100644 fastapi_app.py diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..b4565a0f --- /dev/null +++ b/Dockerfile @@ -0,0 +1,44 @@ +# File: services/OpenVoice/Dockerfile +# Usa l'immagine di base di Ubuntu +FROM ubuntu:22.04 + +# Aggiorna il sistema e installa le dipendenze necessarie +RUN apt-get update && DEBIEN_FRONTEND=noninteractive apt-get install -y \ + sudo \ + python3.9 \ + python3-distutils \ + python3-pip \ + ffmpeg \ + git + +# Aggiorna pip +RUN pip install --upgrade pip + +# Imposta il working directory nel container +WORKDIR /app + +# Installa openai-whisper +RUN git clone https://github.com/myshell-ai/OpenVoice openvoice + +# Install FastAPI and Uvicorn, and other dependencies +RUN pip install uvicorn fastapi python-multipart langid faster-whisper whisper-timestamped unidecode eng-to-ipa pypinyin cn2an + +# Imposta il working directory nel container +WORKDIR /app/openvoice + +RUN pip install -e . 
+RUN pip install soundfile librosa inflect jieba silero + +RUN apt -y install -qq aria2 unzip +RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/camenduru/OpenVoice/resolve/main/checkpoints_1226.zip -d /app/openvoice -o checkpoints_1226.zip +RUN unzip /app/openvoice/checkpoints_1226.zip +RUN mv /app/openvoice/checkpoints /app/openvoice/openvoice/checkpoints +RUN mv /app/openvoice/resources /app/openvoice/openvoice/resources + +EXPOSE 7860 + +# Set the working directory to the openvoice directory where fastapi_app.py will reside +WORKDIR /app/openvoice/openvoice + +# Command to run the FastAPI application with Uvicorn +CMD ["uvicorn", "fastapi_app:app", "--host", "0.0.0.0", "--port", "7860"] diff --git a/fastapi_app.py b/fastapi_app.py new file mode 100644 index 00000000..013333b0 --- /dev/null +++ b/fastapi_app.py @@ -0,0 +1,106 @@ +from fastapi import FastAPI, File, UploadFile, Form, HTTPException +from fastapi.responses import FileResponse +import os +import torch +import langid +from openvoice import se_extractor +from openvoice.api import BaseSpeakerTTS, ToneColorConverter +import shutil + +app = FastAPI() + +# Configuration from openvoice_app.py +en_ckpt_base = 'checkpoints/base_speakers/EN' +zh_ckpt_base = 'checkpoints/base_speakers/ZH' +ckpt_converter = 'checkpoints/converter' +device = 'cuda' if torch.cuda.is_available() else 'cpu' +output_dir = 'outputs' +os.makedirs(output_dir, exist_ok=True) + +# Load models +en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device) +en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth') +zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device) +zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_base}/checkpoint.pth') +tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device) +tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth') + +# Load speaker embeddings +en_source_default_se = 
torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device) +en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth').to(device) +zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth').to(device) + +supported_languages = ['zh', 'en'] + +@app.post("/synthesize/") +async def synthesize_speech( + prompt: str = Form(...), + style: str = Form(...), + audio_file: UploadFile = File(...), + agree: bool = Form(...) +): + if not agree: + raise HTTPException(status_code=400, detail="Please accept the Terms & Condition!") + + # Save the uploaded audio file temporarily + temp_audio_path = os.path.join(output_dir, audio_file.filename) + with open(temp_audio_path, "wb") as buffer: + shutil.copyfileobj(audio_file.file, buffer) + + language_predicted = langid.classify(prompt)[0].strip() + print(f"Detected language: {language_predicted}") + + if language_predicted not in supported_languages: + os.remove(temp_audio_path) + raise HTTPException(status_code=400, detail=f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}") + + if language_predicted == "zh": + tts_model = zh_base_speaker_tts + source_se = zh_source_se + language = 'Chinese' + if style not in ['default']: + os.remove(temp_audio_path) + raise HTTPException(status_code=400, detail=f"The style {style} is not supported for Chinese, which should be in ['default']") + else: + tts_model = en_base_speaker_tts + if style == 'default': + source_se = en_source_default_se + else: + source_se = en_source_style_se + language = 'English' + if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']: + os.remove(temp_audio_path) + raise HTTPException(status_code=400, detail=f"The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']") + + if len(prompt) < 2: + os.remove(temp_audio_path) 
+ raise HTTPException(status_code=400, detail="Please give a longer prompt text") + if len(prompt) > 200: + os.remove(temp_audio_path) + raise HTTPException(status_code=400, detail="Text length limited to 200 characters for this demo, please try shorter text.") + + try: + target_se, audio_name = se_extractor.get_se(temp_audio_path, tone_color_converter, target_dir='processed', vad=True) + except Exception as e: + os.remove(temp_audio_path) + raise HTTPException(status_code=500, detail=f"Get target tone color error: {str(e)}") + + src_path = os.path.join(output_dir, 'tmp.wav') + tts_model.tts(prompt, src_path, speaker=style, language=language) + + save_path = os.path.join(output_dir, 'output.wav') + encode_message = "@MyShell" + tone_color_converter.convert( + audio_src_path=src_path, + src_se=source_se, + tgt_se=target_se, + output_path=save_path, + message=encode_message + ) + + # Clean up temporary files + os.remove(temp_audio_path) + os.remove(src_path) + + return FileResponse(save_path, media_type="audio/wav", filename="synthesized_audio.wav") + From 2b1f76c88f0fcaedfec3dd1608d0ef757a61ded6 Mon Sep 17 00:00:00 2001 From: Naman Thapliyal Date: Sat, 19 Jul 2025 03:36:18 +0530 Subject: [PATCH 2/9] Fastapi app comments added --- Dockerfile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index b4565a0f..80064f44 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,8 @@ # File: services/OpenVoice/Dockerfile -# Usa l'immagine di base di Ubuntu +# Use Ubuntu base image FROM ubuntu:22.04 -# Aggiorna il sistema e installa le dipendenze necessarie +# Update the system and install necessary dependencies RUN apt-get update && DEBIEN_FRONTEND=noninteractive apt-get install -y \ sudo \ python3.9 \ @@ -11,19 +11,19 @@ RUN apt-get update && DEBIEN_FRONTEND=noninteractive apt-get install -y \ ffmpeg \ git -# Aggiorna pip +# Upgrade pip RUN pip install --upgrade pip -# Imposta il working directory nel container +# Set the 
working directory in the container WORKDIR /app -# Installa openai-whisper +# Install openai-whisper RUN git clone https://github.com/myshell-ai/OpenVoice openvoice # Install FastAPI and Uvicorn, and other dependencies RUN pip install uvicorn fastapi python-multipart langid faster-whisper whisper-timestamped unidecode eng-to-ipa pypinyin cn2an -# Imposta il working directory nel container +# Set the working directory in the container WORKDIR /app/openvoice RUN pip install -e . From 0a6bcd917c5f05289fea2ab991e4118a1bda6f96 Mon Sep 17 00:00:00 2001 From: Naman Thapliyal Date: Sat, 19 Jul 2025 09:40:32 +0530 Subject: [PATCH 3/9] FastApi app production level --- Dockerfile | 4 ++-- fastapi_app.py | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 80064f44..f2d250af 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ # File: services/OpenVoice/Dockerfile # Use Ubuntu base image -FROM ubuntu:22.04 +FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 # Update the system and install necessary dependencies RUN apt-get update && DEBIEN_FRONTEND=noninteractive apt-get install -y \ @@ -18,7 +18,7 @@ RUN pip install --upgrade pip WORKDIR /app # Install openai-whisper -RUN git clone https://github.com/myshell-ai/OpenVoice openvoice +RUN git clone https://github.com/namanthapliyal/OpenVoice.git openvoice # Install FastAPI and Uvicorn, and other dependencies RUN pip install uvicorn fastapi python-multipart langid faster-whisper whisper-timestamped unidecode eng-to-ipa pypinyin cn2an diff --git a/fastapi_app.py b/fastapi_app.py index 013333b0..a31d91d4 100644 --- a/fastapi_app.py +++ b/fastapi_app.py @@ -32,6 +32,10 @@ supported_languages = ['zh', 'en'] +@app.get("/") +async def root(): + return {"message": "Welcome to the OpenVoice API! 
Server is up and running!"} + @app.post("/synthesize/") async def synthesize_speech( prompt: str = Form(...), From b583fb6ddfb2e9c6e71dd6c77659bfd664245cf5 Mon Sep 17 00:00:00 2001 From: Naman Thapliyal Date: Sat, 19 Jul 2025 10:02:18 +0530 Subject: [PATCH 4/9] Updated readme for usage --- docs/USAGE.md | 7 +++-- docs/docker_usage.md | 74 ++++++++++++++++++++++++++++++++++++++++++++ fastapi_app.py | 3 -- 3 files changed, 78 insertions(+), 6 deletions(-) create mode 100644 docs/docker_usage.md diff --git a/docs/USAGE.md b/docs/USAGE.md index ff051a83..861e0326 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -4,8 +4,8 @@ - [Quick Use](#quick-use): directly use OpenVoice without installation. - [Linux Install](#linux-install): for researchers and developers only. - - [V1](#openvoice-v1) - - [V2](#openvoice-v2) + - [V1](#openvoice-v1) + - [V2](#openvoice-v2) - [Install on Other Platforms](#install-on-other-platforms): unofficial installation guide contributed by the community ## Quick Use @@ -63,6 +63,7 @@ Please see [`demo_part2.ipynb`](../demo_part2.ipynb) for an example for language Download the checkpoint from [here](https://myshell-public-repo-host.s3.amazonaws.com/openvoice/checkpoints_v2_0417.zip) and extract it to the `checkpoints_v2` folder. Install [MeloTTS](https://github.com/myshell-ai/MeloTTS): + ``` pip install git+https://github.com/myshell-ai/MeloTTS.git python -m unidic download @@ -70,7 +71,6 @@ python -m unidic download **Demo Usage.** Please see [`demo_part3.ipynb`](../demo_part3.ipynb) for example usage of OpenVoice V2. Now it natively supports English, Spanish, French, Chinese, Japanese and Korean. 
- ## Install on Other Platforms This section provides the unofficial installation guides by open-source contributors in the community: @@ -79,5 +79,6 @@ This section provides the unofficial installation guides by open-source contribu - [Guide](https://github.com/Alienpups/OpenVoice/blob/main/docs/USAGE_WINDOWS.md) by [@Alienpups](https://github.com/Alienpups) - You are welcome to contribute if you have a better installation guide. We will list you here. - Docker + - [Guide] (https://github.com/namanthapliyal/OpenVoice/docs/docker_usage.md) by [@namanthapliyal](https://github.com/namanthapliyal/) - [Guide](https://github.com/StevenJSCF/OpenVoice/blob/update-docs/docs/DF_USAGE.md) by [@StevenJSCF](https://github.com/StevenJSCF) - You are welcome to contribute if you have a better installation guide. We will list you here. diff --git a/docs/docker_usage.md b/docs/docker_usage.md new file mode 100644 index 00000000..10d5813e --- /dev/null +++ b/docs/docker_usage.md @@ -0,0 +1,74 @@ +## Local Development Setup + +Follow these steps to set up and run the application locally for development and debugging. + +### 1. Clone the Repository + +First, clone this repository to your local machine: + +```bash +git clone https://github.com/namanthapliyal/OpenVoice.git +cd ./OpenVoice +``` + +### 2. Build the Docker Image + +In the root location of the project, build the Docker image using the following command: + +```bash +docker build -t openvoice-fastapi . +``` + +This command will: + +Pull the nvidia/cuda base image. +Install necessary system dependencies and Python packages. +Clone the OpenVoice library. +Download pre-trained checkpoints required for voice synthesis. +Set up the working directory and expose the application port. +Build the Docker image. + +This process may take some time, especially during the initial download of the base image and checkpoints. + +### 3. Run the Docker Container + +Once the image is built, you can run a container from it. 
To enable GPU acceleration and map the application's port to your host machine, use the following command: + +```bash +docker run --gpus all -p 7860:7860 openvoice-fastapi + +``` + +- --gpus all: Exposes all available NVIDIA GPUs on your host to the container. Ensure the NVIDIA Container Toolkit is correctly installed. +- -p 7860:7860: Maps port 7860 inside the container (where FastAPI runs) to port 7860 on your host machine. + +The FastAPI application will now be accessible at http://localhost:7860. + +### 4. Interact with the API + +You can test the API using curl or any API client (like Postman, Insomnia, or your browser for GET requests). The primary endpoint is /synthesize/ which accepts POST requests with multipart/form-data. + +Example curl Request: + +```bash +curl -X POST "http://localhost:7860/synthesize/" \ + -H "accept: application/json" \ + -H "Content-Type: multipart/form-data" \ + -F "prompt=This is a test sentence for voice synthesis." \ + -F "style=default" \ + -F "audio_file=@/path/to/your/reference_audio.mp3" \ + -F "agree=true" \ + --output synthesized_audio.wav +``` + +Parameters: + +- prompt (string, required): The text to be synthesized. It must be 2 to 200 characters long and be detected as English or Chinese. +- style (string, required): The speaking style. Supported values: default, whispering, shouting, excited, cheerful, terrified, angry, sad, friendly. (Note: Chinese only supports default). +- audio_file (file, required): An audio file (.mp3 or .wav) of the reference speaker whose voice you want to clone. +- agree (boolean, optional): No longer read by the endpoint (the terms-and-conditions check was removed), so it can be omitted; the example above still sends it, which is harmless. + +The API will return the synthesized audio as a .wav file. + +Output Directory +Synthesized audio files and temporary processing files will be stored in the outputs/ directory within the container. For local debugging, you might want to mount a volume to persist these outputs on your host machine. 
diff --git a/fastapi_app.py b/fastapi_app.py index a31d91d4..73c9ffc3 100644 --- a/fastapi_app.py +++ b/fastapi_app.py @@ -41,10 +41,7 @@ async def synthesize_speech( prompt: str = Form(...), style: str = Form(...), audio_file: UploadFile = File(...), - agree: bool = Form(...) ): - if not agree: - raise HTTPException(status_code=400, detail="Please accept the Terms & Condition!") # Save the uploaded audio file temporarily temp_audio_path = os.path.join(output_dir, audio_file.filename) From ef6ab7fca31511e09027e033a755c29c0ae2546e Mon Sep 17 00:00:00 2001 From: Naman Thapliyal Date: Sat, 19 Jul 2025 10:03:35 +0530 Subject: [PATCH 5/9] Updated readme for usage --- docs/USAGE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/USAGE.md b/docs/USAGE.md index 861e0326..33debbe5 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -79,6 +79,6 @@ This section provides the unofficial installation guides by open-source contribu - [Guide](https://github.com/Alienpups/OpenVoice/blob/main/docs/USAGE_WINDOWS.md) by [@Alienpups](https://github.com/Alienpups) - You are welcome to contribute if you have a better installation guide. We will list you here. - Docker - - [Guide] (https://github.com/namanthapliyal/OpenVoice/docs/docker_usage.md) by [@namanthapliyal](https://github.com/namanthapliyal/) + - [Guide](https://github.com/namanthapliyal/OpenVoice/blob/main/docs/docker_usage.md) by [@namanthapliyal](https://github.com/namanthapliyal/) - [Guide](https://github.com/StevenJSCF/OpenVoice/blob/update-docs/docs/DF_USAGE.md) by [@StevenJSCF](https://github.com/StevenJSCF) - You are welcome to contribute if you have a better installation guide. We will list you here. 
From af9b9e63c97735098cbd20e889cf93330eb5c55a Mon Sep 17 00:00:00 2001 From: Naman Thapliyal Date: Sat, 19 Jul 2025 10:06:54 +0530 Subject: [PATCH 6/9] Updated readme for usage --- docs/docker_usage.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/docker_usage.md b/docs/docker_usage.md index 10d5813e..c43962bf 100644 --- a/docs/docker_usage.md +++ b/docs/docker_usage.md @@ -72,3 +72,7 @@ The API will return the synthesized audio as a .wav file. Output Directory Synthesized audio files and temporary processing files will be stored in the outputs/ directory within the container. For local debugging, you might want to mount a volume to persist these outputs on your host machine. + +### 5. Access Swagger Doc + +You can access the Swagger UI documentation by navigating to http://localhost:7860/docs in your web browser. This provides an interactive API reference and allows you to test the API endpoints directly through the UI. From 8c04fa3980209aa124303dbbe2a667591418cce8 Mon Sep 17 00:00:00 2001 From: Naman Thapliyal Date: Sat, 19 Jul 2025 19:57:46 +0530 Subject: [PATCH 7/9] updated requirements --- requirements.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/requirements.txt b/requirements.txt index 8ddba70d..8ffb0d89 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,15 @@ cn2an==0.5.22 jieba==0.42.1 gradio==3.48.0 langid==1.1.6 +# Core web server +uvicorn +fastapi +python-multipart + +# Whisper and language processing +eng-to-ipa + + +# Audio and text processing +soundfile +silero From b1929569bb0538afc8d4e584dda7870a9beb7316 Mon Sep 17 00:00:00 2001 From: Naman Thapliyal Date: Sat, 19 Jul 2025 20:20:54 +0530 Subject: [PATCH 8/9] updated requirements --- requirements.txt | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8ffb0d89..b3cccb77 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +# OpenVoice core requirements, with pinned 
versions for compatibility librosa==0.9.1 faster-whisper==0.9.0 pydub==0.25.1 @@ -14,15 +15,9 @@ cn2an==0.5.22 jieba==0.42.1 gradio==3.48.0 langid==1.1.6 -# Core web server -uvicorn + +# Add extra requirements for your FastAPI wrapper fastapi +uvicorn python-multipart -# Whisper and language processing -eng-to-ipa - - -# Audio and text processing -soundfile -silero From 8b3571bda08b6d72b1b8cf3e62cfd2b841d7f038 Mon Sep 17 00:00:00 2001 From: Naman Thapliyal Date: Sat, 19 Jul 2025 20:57:43 +0530 Subject: [PATCH 9/9] optimized docker file --- Dockerfile | 47 +++++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/Dockerfile b/Dockerfile index f2d250af..98585b80 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,44 +1,39 @@ -# File: services/OpenVoice/Dockerfile -# Use Ubuntu base image FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 -# Update the system and install necessary dependencies -RUN apt-get update && DEBIEN_FRONTEND=noninteractive apt-get install -y \ - sudo \ - python3.9 \ - python3-distutils \ +# Install Python 3.10 and pip, as well as other dependencies +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y \ + python3.10 \ + python3.10-distutils \ python3-pip \ + sudo \ ffmpeg \ - git + git \ + aria2 \ + unzip && \ + rm -rf /var/lib/apt/lists/* -# Upgrade pip -RUN pip install --upgrade pip +# Optional: ensure python3 points to python3.10 +RUN ln -sf /usr/bin/python3.10 /usr/bin/python3 -# Set the working directory in the container WORKDIR /app -# Install openai-whisper +# Clone OpenVoice (or use COPY for local code) RUN git clone https://github.com/namanthapliyal/OpenVoice.git openvoice -# Install FastAPI and Uvicorn, and other dependencies -RUN pip install uvicorn fastapi python-multipart langid faster-whisper whisper-timestamped unidecode eng-to-ipa pypinyin cn2an - -# Set the working directory in the container WORKDIR /app/openvoice -RUN pip install -e . 
-RUN pip install soundfile librosa inflect jieba silero +# Install Python dependencies +RUN python3 -m pip install --upgrade pip && \ + python3 -m pip install --no-cache-dir -r requirements.txt && \ + python3 -m pip install --no-cache-dir -e . -RUN apt -y install -qq aria2 unzip -RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/camenduru/OpenVoice/resolve/main/checkpoints_1226.zip -d /app/openvoice -o checkpoints_1226.zip -RUN unzip /app/openvoice/checkpoints_1226.zip -RUN mv /app/openvoice/checkpoints /app/openvoice/openvoice/checkpoints -RUN mv /app/openvoice/resources /app/openvoice/openvoice/resources +# Download and place checkpoints/resources +RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/camenduru/OpenVoice/resolve/main/checkpoints_1226.zip -d /app/openvoice -o checkpoints_1226.zip && \ + unzip /app/openvoice/checkpoints_1226.zip && \ + rm checkpoints_1226.zip EXPOSE 7860 -# Set the working directory to the openvoice directory where fastapi_app.py will reside -WORKDIR /app/openvoice/openvoice -# Command to run the FastAPI application with Uvicorn CMD ["uvicorn", "fastapi_app:app", "--host", "0.0.0.0", "--port", "7860"]