Skip to content

Commit 4b91c6e

Browse files
committed
feat: add VoxCPM tts backend
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
1 parent ec15988 commit 4b91c6e

21 files changed

+599
-21
lines changed

.github/workflows/backend.yml

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,19 @@ jobs:
118118
dockerfile: "./backend/Dockerfile.python"
119119
context: "./"
120120
ubuntu-version: '2404'
121+
# Added build entry for the voxcpm backend: cublas build against CUDA 12.9,
# linux/amd64 only, built from ./backend/Dockerfile.python on ubuntu:24.04.
# Images from this entry carry the '-gpu-nvidia-cuda-12-voxcpm' tag suffix.
# NOTE(review): presumably an item of the job's build matrix `include` list —
# the enclosing key is outside this hunk; confirm.
- build-type: 'cublas'
122+
cuda-major-version: "12"
123+
cuda-minor-version: "9"
124+
platforms: 'linux/amd64'
125+
tag-latest: 'auto'
126+
tag-suffix: '-gpu-nvidia-cuda-12-voxcpm'
127+
runs-on: 'ubuntu-latest'
128+
base-image: "ubuntu:24.04"
129+
skip-drivers: 'false'
130+
backend: "voxcpm"
131+
dockerfile: "./backend/Dockerfile.python"
132+
context: "./"
133+
ubuntu-version: '2404'
121134
- build-type: 'cublas'
122135
cuda-major-version: "12"
123136
cuda-minor-version: "9"
@@ -366,6 +379,19 @@ jobs:
366379
dockerfile: "./backend/Dockerfile.python"
367380
context: "./"
368381
ubuntu-version: '2404'
382+
# Added build entry for the voxcpm backend: cublas build against CUDA 13.0,
# linux/amd64 only, built from ./backend/Dockerfile.python on ubuntu:24.04.
# Images from this entry carry the '-gpu-nvidia-cuda-13-voxcpm' tag suffix.
# NOTE(review): presumably an item of the job's build matrix `include` list —
# the enclosing key is outside this hunk; confirm.
- build-type: 'cublas'
383+
cuda-major-version: "13"
384+
cuda-minor-version: "0"
385+
platforms: 'linux/amd64'
386+
tag-latest: 'auto'
387+
tag-suffix: '-gpu-nvidia-cuda-13-voxcpm'
388+
runs-on: 'ubuntu-latest'
389+
base-image: "ubuntu:24.04"
390+
skip-drivers: 'false'
391+
backend: "voxcpm"
392+
dockerfile: "./backend/Dockerfile.python"
393+
context: "./"
394+
ubuntu-version: '2404'
369395
- build-type: 'cublas'
370396
cuda-major-version: "13"
371397
cuda-minor-version: "0"
@@ -719,6 +745,19 @@ jobs:
719745
dockerfile: "./backend/Dockerfile.python"
720746
context: "./"
721747
ubuntu-version: '2404'
748+
# Added build entry for the voxcpm backend: hipblas (ROCm) build, linux/amd64
# only, on the rocm/dev-ubuntu-24.04:6.4.4 base image. The CUDA version fields
# are intentionally empty for this non-CUDA build. Runs on 'arc-runner-set'
# rather than 'ubuntu-latest'.
# Images from this entry carry the '-gpu-rocm-hipblas-voxcpm' tag suffix.
- build-type: 'hipblas'
749+
cuda-major-version: ""
750+
cuda-minor-version: ""
751+
platforms: 'linux/amd64'
752+
tag-latest: 'auto'
753+
tag-suffix: '-gpu-rocm-hipblas-voxcpm'
754+
runs-on: 'arc-runner-set'
755+
base-image: "rocm/dev-ubuntu-24.04:6.4.4"
756+
skip-drivers: 'false'
757+
backend: "voxcpm"
758+
dockerfile: "./backend/Dockerfile.python"
759+
context: "./"
760+
ubuntu-version: '2404'
722761
- build-type: 'hipblas'
723762
cuda-major-version: ""
724763
cuda-minor-version: ""
@@ -942,6 +981,19 @@ jobs:
942981
dockerfile: "./backend/Dockerfile.python"
943982
context: "./"
944983
ubuntu-version: '2404'
984+
# Added build entry for the voxcpm backend: Intel build, linux/amd64 only, on
# the intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04 base image. The CUDA
# version fields are intentionally empty. Runs on 'arc-runner-set'.
# Images from this entry carry the '-gpu-intel-voxcpm' tag suffix.
- build-type: 'intel'
985+
cuda-major-version: ""
986+
cuda-minor-version: ""
987+
platforms: 'linux/amd64'
988+
tag-latest: 'auto'
989+
tag-suffix: '-gpu-intel-voxcpm'
990+
runs-on: 'arc-runner-set'
991+
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
992+
skip-drivers: 'false'
993+
backend: "voxcpm"
994+
dockerfile: "./backend/Dockerfile.python"
995+
context: "./"
996+
ubuntu-version: '2404'
945997
- build-type: 'intel'
946998
cuda-major-version: ""
947999
cuda-minor-version: ""
@@ -1341,6 +1393,19 @@ jobs:
13411393
dockerfile: "./backend/Dockerfile.python"
13421394
context: "./"
13431395
ubuntu-version: '2404'
1396+
# Added build entry for the voxcpm backend: CPU build (empty build-type and
# CUDA fields) on ubuntu:24.04. Unlike the GPU entries for this backend, it is
# built for both linux/amd64 and linux/arm64.
# Images from this entry carry the '-cpu-voxcpm' tag suffix.
- build-type: ''
1397+
cuda-major-version: ""
1398+
cuda-minor-version: ""
1399+
platforms: 'linux/amd64,linux/arm64'
1400+
tag-latest: 'auto'
1401+
tag-suffix: '-cpu-voxcpm'
1402+
runs-on: 'ubuntu-latest'
1403+
base-image: "ubuntu:24.04"
1404+
skip-drivers: 'false'
1405+
backend: "voxcpm"
1406+
dockerfile: "./backend/Dockerfile.python"
1407+
context: "./"
1408+
ubuntu-version: '2404'
13441409
- build-type: ''
13451410
cuda-major-version: ""
13461411
cuda-minor-version: ""

.github/workflows/test-extra.yml

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -304,22 +304,22 @@ jobs:
304304
run: |
305305
make --jobs=5 --output-sync=target -C backend/python/qwen-tts
306306
make --jobs=5 --output-sync=target -C backend/python/qwen-tts test
307-
# tests-vibevoice:
308-
# runs-on: bigger-runner
309-
# steps:
310-
# - name: Clone
311-
# uses: actions/checkout@v6
312-
# with:
313-
# submodules: true
314-
# - name: Dependencies
315-
# run: |
316-
# sudo apt-get update
317-
# sudo apt-get install -y build-essential ffmpeg
318-
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip wget
319-
# # Install UV
320-
# curl -LsSf https://astral.sh/uv/install.sh | sh
321-
# pip install --user --no-cache-dir --break-system-packages grpcio-tools==1.64.1
322-
# - name: Test vibevoice
323-
# run: |
324-
# make --jobs=5 --output-sync=target -C backend/python/vibevoice
325-
# make --jobs=5 --output-sync=target -C backend/python/vibevoice test
307+
# CI job for the voxcpm Python backend: checks out the repository with
# submodules, installs the build/test prerequisites, then runs the backend's
# default make target followed by its `test` target.
tests-voxcpm:
308+
runs-on: ubuntu-latest
309+
steps:
310+
- name: Clone
311+
uses: actions/checkout@v6
312+
with:
313+
submodules: true
314+
# Every `apt-get install` passes -y so the step never waits on (or aborts at)
# a confirmation prompt, regardless of how the runner image configures apt.
- name: Dependencies
315+
run: |
316+
sudo apt-get update
317+
sudo apt-get install -y build-essential ffmpeg
318+
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
319+
# Install UV
320+
curl -LsSf https://astral.sh/uv/install.sh | sh
321+
pip install --user --no-cache-dir grpcio-tools==1.64.1
322+
- name: Test voxcpm
323+
run: |
324+
make --jobs=5 --output-sync=target -C backend/python/voxcpm
325+
make --jobs=5 --output-sync=target -C backend/python/voxcpm test

Makefile

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Disable parallel execution for backend builds
2-
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts
2+
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/voxcpm
33

44
GOCMD=go
55
GOTEST=$(GOCMD) test
@@ -319,6 +319,7 @@ prepare-test-extra: protogen-python
319319
$(MAKE) -C backend/python/moonshine
320320
$(MAKE) -C backend/python/pocket-tts
321321
$(MAKE) -C backend/python/qwen-tts
322+
$(MAKE) -C backend/python/voxcpm
322323

323324
test-extra: prepare-test-extra
324325
$(MAKE) -C backend/python/transformers test
@@ -330,6 +331,7 @@ test-extra: prepare-test-extra
330331
$(MAKE) -C backend/python/moonshine test
331332
$(MAKE) -C backend/python/pocket-tts test
332333
$(MAKE) -C backend/python/qwen-tts test
334+
$(MAKE) -C backend/python/voxcpm test
333335

334336
DOCKER_IMAGE?=local-ai
335337
DOCKER_AIO_IMAGE?=local-ai-aio
@@ -462,6 +464,7 @@ BACKEND_VIBEVOICE = vibevoice|python|.|--progress=plain|true
462464
BACKEND_MOONSHINE = moonshine|python|.|false|true
463465
BACKEND_POCKET_TTS = pocket-tts|python|.|false|true
464466
BACKEND_QWEN_TTS = qwen-tts|python|.|false|true
467+
BACKEND_VOXCPM = voxcpm|python|.|false|true
465468

466469
# Helper function to build docker image for a backend
467470
# Usage: $(call docker-build-backend,BACKEND_NAME,DOCKERFILE_TYPE,BUILD_CONTEXT,PROGRESS_FLAG,NEEDS_BACKEND_ARG)
@@ -507,12 +510,13 @@ $(eval $(call generate-docker-build-target,$(BACKEND_VIBEVOICE)))
507510
$(eval $(call generate-docker-build-target,$(BACKEND_MOONSHINE)))
508511
$(eval $(call generate-docker-build-target,$(BACKEND_POCKET_TTS)))
509512
$(eval $(call generate-docker-build-target,$(BACKEND_QWEN_TTS)))
513+
$(eval $(call generate-docker-build-target,$(BACKEND_VOXCPM)))
510514

511515
# Pattern rule for docker-save targets
512516
docker-save-%: backend-images
513517
docker save local-ai-backend:$* -o backend-images/$*.tar
514518

515-
docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts
519+
docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-voxcpm
516520

517521
########################################################
518522
### END Backends

backend/index.yaml

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,25 @@
414414
nvidia-l4t-cuda-12: "nvidia-l4t-qwen-tts"
415415
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-qwen-tts"
416416
icon: https://cdn-avatars.huggingface.co/v1/production/uploads/620760a26e3b7210c2ff1943/-s1gyJfvbE1RgO5iBeNOi.png
417+
# Base gallery entry for the VoxCPM text-to-speech backend. It is declared as
# the YAML anchor `&voxcpm` so that the per-platform entries later in this file
# can merge these shared fields (urls, description, tags, license, icon) and
# override only what differs.
- &voxcpm
418+
urls:
419+
- https://github.com/ModelBest/VoxCPM
420+
description: |
421+
VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech.
422+
tags:
423+
- text-to-speech
424+
- TTS
425+
license: mit
426+
name: "voxcpm"
427+
alias: "voxcpm"
# Each value below is the `name` of a concrete voxcpm image entry defined in
# this file; `default` points at the CPU image.
# NOTE(review): presumably the key is matched against the detected host
# capability to pick the image — the selection logic is not visible here.
capabilities:
429+
nvidia: "cuda12-voxcpm"
430+
intel: "intel-voxcpm"
431+
amd: "rocm-voxcpm"
432+
default: "cpu-voxcpm"
433+
nvidia-cuda-13: "cuda13-voxcpm"
434+
nvidia-cuda-12: "cuda12-voxcpm"
435+
icon: https://avatars.githubusercontent.com/u/6154722?s=200&v=4
417436
- &pocket-tts
418437
urls:
419438
- https://github.com/kyutai-labs/pocket-tts
@@ -1652,6 +1671,66 @@
16521671
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-qwen-tts"
16531672
mirrors:
16541673
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-qwen-tts
1674+
## voxcpm
# Development variant of the voxcpm meta entry: merges the shared fields from
# the `*voxcpm` anchor, then overrides `name` and points every capability at
# the corresponding "-development" image entry.
1675+
- !!merge <<: *voxcpm
1676+
name: "voxcpm-development"
1677+
capabilities:
1678+
nvidia: "cuda12-voxcpm-development"
1679+
intel: "intel-voxcpm-development"
1680+
amd: "rocm-voxcpm-development"
1681+
default: "cpu-voxcpm-development"
1682+
nvidia-cuda-13: "cuda13-voxcpm-development"
1683+
nvidia-cuda-12: "cuda12-voxcpm-development"
1684+
# Concrete voxcpm image entries. Each one merges the shared `*voxcpm` fields
# and adds a `name`, the quay.io image `uri`, and a localai/localai-backends
# mirror with the same tag. Every platform appears twice: the plain name uses
# the "latest-" tag, the "-development" name uses the "master-" tag.
# CPU images.
- !!merge <<: *voxcpm
1685+
name: "cpu-voxcpm"
1686+
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-voxcpm"
1687+
mirrors:
1688+
- localai/localai-backends:latest-cpu-voxcpm
1689+
- !!merge <<: *voxcpm
1690+
name: "cpu-voxcpm-development"
1691+
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-voxcpm"
1692+
mirrors:
1693+
- localai/localai-backends:master-cpu-voxcpm
1694+
# NVIDIA CUDA 12 images.
- !!merge <<: *voxcpm
1695+
name: "cuda12-voxcpm"
1696+
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-voxcpm"
1697+
mirrors:
1698+
- localai/localai-backends:latest-gpu-nvidia-cuda-12-voxcpm
1699+
- !!merge <<: *voxcpm
1700+
name: "cuda12-voxcpm-development"
1701+
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-voxcpm"
1702+
mirrors:
1703+
- localai/localai-backends:master-gpu-nvidia-cuda-12-voxcpm
1704+
# NVIDIA CUDA 13 images.
- !!merge <<: *voxcpm
1705+
name: "cuda13-voxcpm"
1706+
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-voxcpm"
1707+
mirrors:
1708+
- localai/localai-backends:latest-gpu-nvidia-cuda-13-voxcpm
1709+
- !!merge <<: *voxcpm
1710+
name: "cuda13-voxcpm-development"
1711+
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-voxcpm"
1712+
mirrors:
1713+
- localai/localai-backends:master-gpu-nvidia-cuda-13-voxcpm
1714+
# Intel GPU images.
- !!merge <<: *voxcpm
1715+
name: "intel-voxcpm"
1716+
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-voxcpm"
1717+
mirrors:
1718+
- localai/localai-backends:latest-gpu-intel-voxcpm
1719+
- !!merge <<: *voxcpm
1720+
name: "intel-voxcpm-development"
1721+
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-voxcpm"
1722+
mirrors:
1723+
- localai/localai-backends:master-gpu-intel-voxcpm
1724+
# AMD ROCm (hipblas) images.
- !!merge <<: *voxcpm
1725+
name: "rocm-voxcpm"
1726+
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-voxcpm"
1727+
mirrors:
1728+
- localai/localai-backends:latest-gpu-rocm-hipblas-voxcpm
1729+
- !!merge <<: *voxcpm
1730+
name: "rocm-voxcpm-development"
1731+
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-voxcpm"
1732+
mirrors:
1733+
- localai/localai-backends:master-gpu-rocm-hipblas-voxcpm
16551734
## pocket-tts
16561735
- !!merge <<: *pocket-tts
16571736
name: "pocket-tts-development"

backend/python/voxcpm/Makefile

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Makefile for the voxcpm Python backend. Every target is a command rather
# than a file, so each one is declared .PHONY.
# NOTE(review): the diff view has stripped leading whitespace; in the real
# file each recipe line must begin with a hard TAB.

# voxcpm: first rule in the file and therefore the default goal; runs
# install.sh through bash.
.PHONY: voxcpm
2+
voxcpm:
3+
bash install.sh
4+
5+
# run: depends on voxcpm (so install.sh runs first), then executes run.sh.
# The echo lines are silenced with @ so only their output is printed.
.PHONY: run
6+
run: voxcpm
7+
@echo "Running voxcpm..."
8+
bash run.sh
9+
@echo "voxcpm run."
10+
11+
# test: depends on voxcpm (so install.sh runs first), then executes test.sh.
.PHONY: test
12+
test: voxcpm
13+
@echo "Testing voxcpm..."
14+
bash test.sh
15+
@echo "voxcpm tested."
16+
17+
# protogen-clean: deletes backend_pb2_grpc.py and backend_pb2.py (presumably
# generated protobuf/gRPC stubs — confirm). $(RM) is make's built-in `rm -f`,
# so missing files are not an error.
.PHONY: protogen-clean
18+
protogen-clean:
19+
$(RM) backend_pb2_grpc.py backend_pb2.py
20+
21+
# clean: runs protogen-clean first, then removes the venv and __pycache__
# directories.
.PHONY: clean
22+
clean: protogen-clean
23+
rm -rf venv __pycache__

0 commit comments

Comments
 (0)