diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index 413d0476bb65..7f82f2eeb75a 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -105,6 +105,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "9" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-12-voxcpm' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "voxcpm" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "12" cuda-minor-version: "9" @@ -353,6 +366,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-13-voxcpm' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "voxcpm" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "13" cuda-minor-version: "0" @@ -680,6 +706,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'hipblas' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-rocm-hipblas-voxcpm' + runs-on: 'arc-runner-set' + base-image: "rocm/dev-ubuntu-24.04:6.4.4" + skip-drivers: 'false' + backend: "voxcpm" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'hipblas' cuda-major-version: "" cuda-minor-version: "" @@ -890,6 +929,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'intel' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-intel-voxcpm' + runs-on: 'arc-runner-set' + base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04" + skip-drivers: 'false' + backend: "voxcpm" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'intel' cuda-major-version: "" cuda-minor-version: "" @@ -1343,6 +1395,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64,linux/arm64' + tag-latest: 'auto' + tag-suffix: '-cpu-voxcpm' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "voxcpm" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: '' cuda-major-version: "" cuda-minor-version: "" diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml index 0d01cde73e37..6c87a3b08a69 100644 --- a/.github/workflows/test-extra.yml +++ b/.github/workflows/test-extra.yml @@ -284,4 +284,23 @@ jobs: - name: Test pocket-tts run: | make --jobs=5 --output-sync=target -C backend/python/pocket-tts - make --jobs=5 --output-sync=target -C backend/python/pocket-tts test \ No newline at end of file + make --jobs=5 --output-sync=target -C backend/python/pocket-tts test + tests-voxcpm: + runs-on: ubuntu-latest + steps: + - name: Clone + uses: actions/checkout@v6 + with: + submodules: true + - name: Dependencies + run: | + sudo apt-get update + sudo apt-get install 
build-essential ffmpeg + sudo apt-get install -y ca-certificates cmake curl patch python3-pip + # Install UV + curl -LsSf https://astral.sh/uv/install.sh | sh + pip install --user --no-cache-dir grpcio-tools==1.64.1 + - name: Test voxcpm + run: | + make --jobs=5 --output-sync=target -C backend/python/voxcpm + make --jobs=5 --output-sync=target -C backend/python/voxcpm test \ No newline at end of file diff --git a/Makefile b/Makefile index 9bc95063e4d9..63687731f034 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Disable parallel execution for backend builds -.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/moonshine backends/pocket-tts +.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/moonshine backends/pocket-tts backends/voxcpm GOCMD=go GOTEST=$(GOCMD) test @@ -317,6 +317,7 @@ prepare-test-extra: protogen-python $(MAKE) -C backend/python/vibevoice $(MAKE) -C backend/python/moonshine $(MAKE) -C backend/python/pocket-tts + $(MAKE) -C backend/python/voxcpm test-extra: prepare-test-extra $(MAKE) -C backend/python/transformers test @@ -326,6 +327,7 @@ test-extra: prepare-test-extra $(MAKE) -C backend/python/vibevoice test $(MAKE) -C backend/python/moonshine test $(MAKE) -C backend/python/pocket-tts test + $(MAKE) -C backend/python/voxcpm test DOCKER_IMAGE?=local-ai DOCKER_AIO_IMAGE?=local-ai-aio @@ -459,6 +461,7 @@ BACKEND_CHATTERBOX = chatterbox|python|.|false|true BACKEND_VIBEVOICE = vibevoice|python|.|--progress=plain|true BACKEND_MOONSHINE = moonshine|python|.|false|true BACKEND_POCKET_TTS = pocket-tts|python|.|false|true +BACKEND_VOXCPM = voxcpm|python|.|false|true # Helper function to build docker image for a backend # Usage: $(call docker-build-backend,BACKEND_NAME,DOCKERFILE_TYPE,BUILD_CONTEXT,PROGRESS_FLAG,NEEDS_BACKEND_ARG) @@ -505,12 +508,13 @@ $(eval $(call generate-docker-build-target,$(BACKEND_CHATTERBOX))) $(eval $(call generate-docker-build-target,$(BACKEND_VIBEVOICE))) $(eval $(call generate-docker-build-target,$(BACKEND_MOONSHINE))) $(eval $(call generate-docker-build-target,$(BACKEND_POCKET_TTS))) +$(eval $(call generate-docker-build-target,$(BACKEND_VOXCPM))) # Pattern rule for docker-save targets docker-save-%: backend-images docker save local-ai-backend:$* -o backend-images/$*.tar -docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-bark docker-build-chatterbox docker-build-vibevoice docker-build-exllama2 docker-build-moonshine docker-build-pocket-tts +docker-build-backends: docker-build-llama-cpp 
docker-build-rerankers docker-build-vllm docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-bark docker-build-chatterbox docker-build-vibevoice docker-build-exllama2 docker-build-moonshine docker-build-pocket-tts docker-build-voxcpm ######################################################## ### END Backends diff --git a/backend/index.yaml b/backend/index.yaml index 916d070ab959..f7e90ad76b2d 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -428,6 +428,25 @@ nvidia-l4t-cuda-12: "nvidia-l4t-vibevoice" nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vibevoice" icon: https://avatars.githubusercontent.com/u/6154722?s=200&v=4 +- &voxcpm + urls: + - https://github.com/ModelBest/VoxCPM + description: | + VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech. + tags: + - text-to-speech + - TTS + license: mit + name: "voxcpm" + alias: "voxcpm" + capabilities: + nvidia: "cuda12-voxcpm" + intel: "intel-voxcpm" + amd: "rocm-voxcpm" + default: "cpu-voxcpm" + nvidia-cuda-13: "cuda13-voxcpm" + nvidia-cuda-12: "cuda12-voxcpm" + icon: https://avatars.githubusercontent.com/u/6154722?s=200&v=4 - &pocket-tts urls: - https://github.com/kyutai-labs/pocket-tts @@ -1613,6 +1632,66 @@ uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-vibevoice" mirrors: - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-vibevoice +## voxcpm +- !!merge <<: *voxcpm + name: "voxcpm-development" + capabilities: + nvidia: "cuda12-voxcpm-development" + intel: "intel-voxcpm-development" + amd: "rocm-voxcpm-development" + default: "cpu-voxcpm-development" + nvidia-cuda-13: "cuda13-voxcpm-development" + nvidia-cuda-12: "cuda12-voxcpm-development" +- !!merge <<: *voxcpm + name: "cpu-voxcpm" + uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-voxcpm" + mirrors: + - localai/localai-backends:latest-cpu-voxcpm +- !!merge <<: *voxcpm + name: "cpu-voxcpm-development" + uri: "quay.io/go-skynet/local-ai-backends:master-cpu-voxcpm" + mirrors: + - localai/localai-backends:master-cpu-voxcpm +- !!merge <<: *voxcpm + name: "cuda12-voxcpm" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-voxcpm" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-12-voxcpm +- !!merge <<: *voxcpm + name: "cuda12-voxcpm-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-voxcpm" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-12-voxcpm +- !!merge <<: *voxcpm + name: "cuda13-voxcpm" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-voxcpm" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-13-voxcpm +- !!merge <<: *voxcpm + name: "cuda13-voxcpm-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-voxcpm" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-13-voxcpm +- !!merge <<: *voxcpm + name: "intel-voxcpm" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-voxcpm" + mirrors: + - localai/localai-backends:latest-gpu-intel-voxcpm +- !!merge <<: *voxcpm + name: "intel-voxcpm-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-voxcpm" + mirrors: + - localai/localai-backends:master-gpu-intel-voxcpm +- !!merge <<: *voxcpm + name: "rocm-voxcpm" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-voxcpm" + mirrors: + - localai/localai-backends:latest-gpu-rocm-hipblas-voxcpm +- !!merge <<: *voxcpm + name: 
"rocm-voxcpm-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-voxcpm" + mirrors: + - localai/localai-backends:master-gpu-rocm-hipblas-voxcpm ## pocket-tts - !!merge <<: *pocket-tts name: "pocket-tts-development" diff --git a/backend/python/voxcpm/Makefile b/backend/python/voxcpm/Makefile new file mode 100644 index 000000000000..bfcf684aec23 --- /dev/null +++ b/backend/python/voxcpm/Makefile @@ -0,0 +1,23 @@ +.PHONY: voxcpm +voxcpm: + bash install.sh + +.PHONY: run +run: voxcpm + @echo "Running voxcpm..." + bash run.sh + @echo "voxcpm run." + +.PHONY: test +test: voxcpm + @echo "Testing voxcpm..." + bash test.sh + @echo "voxcpm tested." + +.PHONY: protogen-clean +protogen-clean: + $(RM) backend_pb2_grpc.py backend_pb2.py + +.PHONY: clean +clean: protogen-clean + rm -rf venv __pycache__ diff --git a/backend/python/voxcpm/backend.py b/backend/python/voxcpm/backend.py new file mode 100644 index 000000000000..84bb99e96021 --- /dev/null +++ b/backend/python/voxcpm/backend.py @@ -0,0 +1,245 @@ +#!/usr/bin/env python3 +""" +This is an extra gRPC server of LocalAI for VoxCPM +""" +from concurrent import futures +import time +import argparse +import signal +import sys +import os +import traceback +import numpy as np +import soundfile as sf +from voxcpm import VoxCPM + +import backend_pb2 +import backend_pb2_grpc +import torch + +import grpc + +def is_float(s): + """Check if a string can be converted to float.""" + try: + float(s) + return True + except ValueError: + return False + +def is_int(s): + """Check if a string can be converted to int.""" + try: + int(s) + return True + except ValueError: + return False + +_ONE_DAY_IN_SECONDS = 60 * 60 * 24 + +# If MAX_WORKERS are specified in the environment use it, otherwise default to 1 +MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) + +# Implement the BackendServicer class with the service methods +class BackendServicer(backend_pb2_grpc.BackendServicer): + """ + BackendServicer is the class that implements the gRPC service + """ + def Health(self, request, context): + return backend_pb2.Reply(message=bytes("OK", 'utf-8')) + + def LoadModel(self, request, context): + # Get device + if torch.cuda.is_available(): + print("CUDA is available", file=sys.stderr) + device = "cuda" + else: + print("CUDA is not available", file=sys.stderr) + device = "cpu" + mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + if mps_available: + device = "mps" + if not torch.cuda.is_available() and request.CUDA: + return backend_pb2.Result(success=False, message="CUDA is not available") + + # Normalize potential 'mpx' typo to 'mps' + if device == "mpx": + print("Note: device 'mpx' detected, treating it as 'mps'.", file=sys.stderr) + device = "mps" + + # Validate mps availability if requested + if device == "mps" and not torch.backends.mps.is_available(): + print("Warning: MPS not available. 
Falling back to CPU.", file=sys.stderr) + device = "cpu" + + self.device = device + + options = request.Options + + # empty dict + self.options = {} + + # The options are a list of strings in this form optname:optvalue + # We are storing all the options in a dict so we can use it later when + # generating the audio + for opt in options: + if ":" not in opt: + continue + key, value = opt.split(":", 1) # Split only on first colon + # if value is a number, convert it to the appropriate type + if is_float(value): + value = float(value) + elif is_int(value): + value = int(value) + elif value.lower() in ["true", "false"]: + value = value.lower() == "true" + self.options[key] = value + + # Get model path from request + model_path = request.Model + if not model_path: + model_path = "openbmb/VoxCPM1.5" + + try: + print(f"Loading model from {model_path}", file=sys.stderr) + self.model = VoxCPM.from_pretrained(model_path) + print(f"Model loaded successfully on device: {self.device}", file=sys.stderr) + except Exception as err: + return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") + + return backend_pb2.Result(message="Model loaded successfully", success=True) + + def TTS(self, request, context): + try: + # Get generation parameters from options with defaults + cfg_value = self.options.get("cfg_value", 2.0) + inference_timesteps = self.options.get("inference_timesteps", 10) + normalize = self.options.get("normalize", False) + denoise = self.options.get("denoise", False) + retry_badcase = self.options.get("retry_badcase", True) + retry_badcase_max_times = self.options.get("retry_badcase_max_times", 3) + retry_badcase_ratio_threshold = self.options.get("retry_badcase_ratio_threshold", 6.0) + use_streaming = self.options.get("streaming", False) + + # Handle voice cloning via prompt_wav_path and prompt_text + prompt_wav_path = None + prompt_text = None + + # Priority: request.voice > AudioPath > options + if hasattr(request, 'voice') and request.voice: + # If voice is provided, try to use it as a path + if os.path.exists(request.voice): + prompt_wav_path = request.voice + elif hasattr(request, 'ModelFile') and request.ModelFile: + model_file_base = os.path.dirname(request.ModelFile) + potential_path = os.path.join(model_file_base, request.voice) + if os.path.exists(potential_path): + prompt_wav_path = potential_path + elif hasattr(request, 'ModelPath') and request.ModelPath: + potential_path = os.path.join(request.ModelPath, request.voice) + if os.path.exists(potential_path): + prompt_wav_path = potential_path + + if hasattr(request, 'AudioPath') and request.AudioPath: + if os.path.isabs(request.AudioPath): + prompt_wav_path = request.AudioPath + elif hasattr(request, 'ModelFile') and request.ModelFile: + model_file_base = os.path.dirname(request.ModelFile) + prompt_wav_path = os.path.join(model_file_base, request.AudioPath) + elif hasattr(request, 'ModelPath') and request.ModelPath: + prompt_wav_path = os.path.join(request.ModelPath, request.AudioPath) + else: + prompt_wav_path = request.AudioPath + + # Get prompt_text from options if available + if "prompt_text" in self.options: + prompt_text = self.options["prompt_text"] + + # Prepare text + text = request.text.strip() + + print(f"Generating audio with cfg_value: {cfg_value}, inference_timesteps: {inference_timesteps}, streaming: {use_streaming}", file=sys.stderr) + + # Generate audio + if use_streaming: + # Streaming generation + chunks = [] + for chunk in self.model.generate_streaming( + text=text, + 
prompt_wav_path=prompt_wav_path, + prompt_text=prompt_text, + cfg_value=cfg_value, + inference_timesteps=inference_timesteps, + normalize=normalize, + denoise=denoise, + retry_badcase=retry_badcase, + retry_badcase_max_times=retry_badcase_max_times, + retry_badcase_ratio_threshold=retry_badcase_ratio_threshold, + ): + chunks.append(chunk) + wav = np.concatenate(chunks) + else: + # Non-streaming generation + wav = self.model.generate( + text=text, + prompt_wav_path=prompt_wav_path, + prompt_text=prompt_text, + cfg_value=cfg_value, + inference_timesteps=inference_timesteps, + normalize=normalize, + denoise=denoise, + retry_badcase=retry_badcase, + retry_badcase_max_times=retry_badcase_max_times, + retry_badcase_ratio_threshold=retry_badcase_ratio_threshold, + ) + + # Get sample rate from model + sample_rate = self.model.tts_model.sample_rate + + # Save output + sf.write(request.dst, wav, sample_rate) + print(f"Saved output to {request.dst}", file=sys.stderr) + + except Exception as err: + print(f"Error in TTS: {err}", file=sys.stderr) + print(traceback.format_exc(), file=sys.stderr) + return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") + + return backend_pb2.Result(success=True) + +def serve(address): + server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), + options=[ + ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB + ]) + backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) + server.add_insecure_port(address) + server.start() + print("Server started. Listening on: " + address, file=sys.stderr) + + # Define the signal handler function + def signal_handler(sig, frame): + print("Received termination signal. Shutting down...") + server.stop(0) + sys.exit(0) + + # Set the signal handlers for SIGINT and SIGTERM + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + try: + while True: + time.sleep(_ONE_DAY_IN_SECONDS) + except KeyboardInterrupt: + server.stop(0) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run the gRPC server.") + parser.add_argument( + "--addr", default="localhost:50051", help="The address to bind the server to." 
+ ) + args = parser.parse_args() + + serve(args.addr) diff --git a/backend/python/voxcpm/install.sh b/backend/python/voxcpm/install.sh new file mode 100755 index 000000000000..9d167d8292ac --- /dev/null +++ b/backend/python/voxcpm/install.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -e + +backend_dir=$(dirname $0) +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +installRequirements + +# Apply patch to fix PyTorch compatibility issue in voxcpm +# This fixes the "Dimension out of range" error in scaled_dot_product_attention +# by changing .contiguous() to .unsqueeze(0) in the attention module +# The patch is needed because voxcpm's initialization test generation fails with +# certain PyTorch versions due to a bug in scaled_dot_product_attention +# https://github.com/OpenBMB/VoxCPM/issues/71#issuecomment-3441789452 +VOXCPM_PATH=$(python -c "import voxcpm; import os; print(os.path.dirname(voxcpm.__file__))" 2>/dev/null || echo "") +if [ -n "$VOXCPM_PATH" ] && [ -f "$VOXCPM_PATH/modules/minicpm4/model.py" ]; then + echo "Applying patch to voxcpm at $VOXCPM_PATH/modules/minicpm4/model.py" + # Replace .contiguous() with .unsqueeze(0) for the three lines in the attention forward_step method + # This fixes the dimension error in scaled_dot_product_attention + sed -i 's/query_states = query_states\.contiguous()/query_states = query_states.unsqueeze(0)/g' "$VOXCPM_PATH/modules/minicpm4/model.py" + sed -i 's/key_cache = key_cache\.contiguous()/key_cache = key_cache.unsqueeze(0)/g' "$VOXCPM_PATH/modules/minicpm4/model.py" + sed -i 's/value_cache = value_cache\.contiguous()/value_cache = value_cache.unsqueeze(0)/g' "$VOXCPM_PATH/modules/minicpm4/model.py" + echo "Patch applied successfully" +else + echo "Warning: Could not find voxcpm installation to apply patch (path: ${VOXCPM_PATH:-not found})" +fi diff --git a/backend/python/voxcpm/protogen.sh b/backend/python/voxcpm/protogen.sh new file mode 100755 index 000000000000..df3325c6f94c --- /dev/null +++ b/backend/python/voxcpm/protogen.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -e + +backend_dir=$(dirname $0) +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +runProtogen diff --git a/backend/python/voxcpm/requirements-cpu.txt b/backend/python/voxcpm/requirements-cpu.txt new file mode 100644 index 000000000000..a6369ef0100f --- /dev/null +++ b/backend/python/voxcpm/requirements-cpu.txt @@ -0,0 +1,6 @@ +--extra-index-url https://download.pytorch.org/whl/cpu +torch +soundfile +numpy +voxcpm +torchcodec \ No newline at end of file diff --git a/backend/python/voxcpm/requirements-cublas12.txt b/backend/python/voxcpm/requirements-cublas12.txt new file mode 100644 index 000000000000..0482e1408a20 --- /dev/null +++ b/backend/python/voxcpm/requirements-cublas12.txt @@ -0,0 +1,5 @@ +--extra-index-url https://download.pytorch.org/whl/cu121 +torch +soundfile +numpy +voxcpm diff --git a/backend/python/voxcpm/requirements-cublas13.txt b/backend/python/voxcpm/requirements-cublas13.txt new file mode 100644 index 000000000000..a17b28fa7dca --- /dev/null +++ b/backend/python/voxcpm/requirements-cublas13.txt @@ -0,0 +1,5 @@ +--extra-index-url https://download.pytorch.org/whl/cu130 +torch +soundfile +numpy +voxcpm diff --git a/backend/python/voxcpm/requirements-hipblas.txt b/backend/python/voxcpm/requirements-hipblas.txt new file mode 100644 index 000000000000..7541c8149db8 --- /dev/null +++ 
b/backend/python/voxcpm/requirements-hipblas.txt @@ -0,0 +1,5 @@ +--extra-index-url https://download.pytorch.org/whl/rocm6.3 +torch==2.7.1+rocm6.3 +soundfile +numpy +voxcpm diff --git a/backend/python/voxcpm/requirements-intel.txt b/backend/python/voxcpm/requirements-intel.txt new file mode 100644 index 000000000000..4a1780a68cbc --- /dev/null +++ b/backend/python/voxcpm/requirements-intel.txt @@ -0,0 +1,8 @@ +--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +intel-extension-for-pytorch==2.3.110+xpu +torch==2.5.1+cxx11.abi +oneccl_bind_pt==2.8.0+xpu +setuptools +soundfile +numpy +voxcpm diff --git a/backend/python/voxcpm/requirements-l4t12.txt b/backend/python/voxcpm/requirements-l4t12.txt new file mode 100644 index 000000000000..5967d6fd9d87 --- /dev/null +++ b/backend/python/voxcpm/requirements-l4t12.txt @@ -0,0 +1,5 @@ +--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu129/ +torch +soundfile +numpy +voxcpm diff --git a/backend/python/voxcpm/requirements-l4t13.txt b/backend/python/voxcpm/requirements-l4t13.txt new file mode 100644 index 000000000000..a17b28fa7dca --- /dev/null +++ b/backend/python/voxcpm/requirements-l4t13.txt @@ -0,0 +1,5 @@ +--extra-index-url https://download.pytorch.org/whl/cu130 +torch +soundfile +numpy +voxcpm diff --git a/backend/python/voxcpm/requirements-mps.txt b/backend/python/voxcpm/requirements-mps.txt new file mode 100644 index 000000000000..bebe7af62dbb --- /dev/null +++ b/backend/python/voxcpm/requirements-mps.txt @@ -0,0 +1,4 @@ +torch +soundfile +numpy +voxcpm diff --git a/backend/python/voxcpm/requirements.txt b/backend/python/voxcpm/requirements.txt new file mode 100644 index 000000000000..cc1cd74bd4de --- /dev/null +++ b/backend/python/voxcpm/requirements.txt @@ -0,0 +1,7 @@ +grpcio==1.76.0 +protobuf +certifi +packaging==24.1 +soundfile +numpy +voxcpm diff --git a/backend/python/voxcpm/run.sh b/backend/python/voxcpm/run.sh new file mode 100755 index 000000000000..eae121f37b0b --- /dev/null +++ b/backend/python/voxcpm/run.sh @@ -0,0 +1,9 @@ +#!/bin/bash +backend_dir=$(dirname $0) +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +startBackend $@ diff --git a/backend/python/voxcpm/test.py b/backend/python/voxcpm/test.py new file mode 100644 index 000000000000..c45462cfc73a --- /dev/null +++ b/backend/python/voxcpm/test.py @@ -0,0 +1,51 @@ +""" +A test script to test the gRPC service +""" +import unittest +import subprocess +import time +import backend_pb2 +import backend_pb2_grpc + +import grpc + + +class TestBackendServicer(unittest.TestCase): + """ + TestBackendServicer is the class that tests the gRPC service + """ + def setUp(self): + """ + This method sets up the gRPC service by starting the server + """ + self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"]) + time.sleep(30) + + def tearDown(self) -> None: + """ + This method tears down the gRPC service by terminating the server + """ + self.service.terminate() + self.service.wait() + + def test_load_model(self): + """ + This method tests if the model is loaded successfully + """ + try: + self.setUp() + print("Starting test_load_model") + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="openbmb/VoxCPM1.5")) + print(response) + self.assertTrue(response.success) + self.assertEqual(response.message, "Model loaded successfully") + 
tts_request = backend_pb2.TTSRequest(text="VoxCPM is an innovative end-to-end TTS model from ModelBest.") + tts_response = stub.TTS(tts_request) + self.assertIsNotNone(tts_response) + except Exception as err: + print(err) + self.fail("LoadModel service failed") + finally: + self.tearDown() diff --git a/backend/python/voxcpm/test.sh b/backend/python/voxcpm/test.sh new file mode 100755 index 000000000000..eb59f2aaf3f3 --- /dev/null +++ b/backend/python/voxcpm/test.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -e + +backend_dir=$(dirname $0) +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +runUnittests diff --git a/core/http/middleware/request.go b/core/http/middleware/request.go index 115a001498d9..2af25b279919 100644 --- a/core/http/middleware/request.go +++ b/core/http/middleware/request.go @@ -491,6 +491,8 @@ func (re *RequestExtractor) SetOpenResponsesRequest(c echo.Context) error { return echo.ErrBadRequest } + // Convert input items to Messages (this will be done in the endpoint handler) + // We store the input in the request for the endpoint to process cfg, ok := c.Get(CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig) if !ok || cfg == nil { return echo.ErrBadRequest
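For reference, a model configuration exercising the new backend's option handling might look like the sketch below. The option names (cfg_value, inference_timesteps, streaming, prompt_text) and the optname:optvalue format come directly from the LoadModel/TTS handlers in backend.py above, which split each entry on the first colon; the file name, the model alias, and the exact LocalAI config keys (name, backend, parameters, options) are illustrative assumptions and are not part of this change.

# models/voxcpm.yaml — hypothetical example, not included in this diff
name: voxcpm
backend: voxcpm
parameters:
  # Repo passed to VoxCPM.from_pretrained(); backend.py falls back to openbmb/VoxCPM1.5 when empty
  model: openbmb/VoxCPM1.5
options:
  # Parsed by backend.py as optname:optvalue pairs (floats, ints, and true/false are auto-converted)
  - cfg_value:2.0
  - inference_timesteps:10
  - streaming:false
  # Optional transcript of a reference wav, used together with request.voice / AudioPath for voice cloning
  # - prompt_text:Transcript of the reference audio

A request along the lines of curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{"model": "voxcpm", "input": "Hello from VoxCPM"}' should then reach the TTS handler, with the optional voice field resolving to a prompt wav for cloning as implemented in backend.py.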