Support tts (#602)

QinYuuuu · Dev Agent · web-flow · commit 335093b5146b · 2025-12-16T17:20:39.000+08:00
Co-authored-by: Dev Agent <dev-agent@example.com>
diff --git a/builder/store/database/deploy_task.go b/builder/store/database/deploy_task.go
@@ -51,7 +51,7 @@ type Deploy struct {
 	SecureLevel int `json:"secure_level"`
 	// 0-space, 1-inference, 2-finetune, 3-serverless, 4-evaluation, 5-notebook
 	Type          int                `json:"type"`
-	Task          types.PipelineTask `bun:",nullzero" json:"task"` //text-generation,text-to-image
+	Task          types.PipelineTask `bun:",nullzero" json:"task"` //text-generation,text-to-image,text-to-speech
 	UserUUID      string             `bun:"," json:"user_uuid"`
 	SKU           string             `bun:"," json:"sku"`
 	OrderDetailID int64              `bun:"," json:"order_detail_id"`
diff --git a/common/types/repo.go b/common/types/repo.go
@@ -93,6 +93,7 @@ const (
 	SentenceSimilarity PipelineTask    = "sentence-similarity"
 	TaskAutoDetection  PipelineTask    = "task-auto-detection"
 	VideoText2Text     PipelineTask    = "video-text-to-text"
+	TextToSpeech       PipelineTask    = "text-to-speech"
 	LlamaCpp           InferenceEngine = "llama.cpp"
 	TEI                InferenceEngine = "tei"
 	Ktransformers      InferenceEngine = "ktransformers"
diff --git a/component/callback/git_callback.go b/component/callback/git_callback.go
@@ -561,6 +561,9 @@ func GetPipelineTaskFromTags(tags []database.Tag) types.PipelineTask {
 		if tag.Name == string(types.Text2Image) {
 			return types.Text2Image
 		}
+		if tag.Name == string(types.TextToSpeech) {
+			return types.TextToSpeech
+		}
 	}
 	return ""
 }
diff --git a/component/model.go b/component/model.go
@@ -1370,6 +1370,9 @@ func GetBuiltInTaskFromTags(tags []database.Tag) string {
 		if tag.Name == string(types.ImageText2Text) {
 			return tag.Name
 		}
+		if tag.Name == string(types.TextToSpeech) {
+			return tag.Name
+		}
 	}
 	return string(types.TextGeneration)
 }
diff --git a/configs/inference/audio-fish.json b/configs/inference/audio-fish.json
@@ -0,0 +1,20 @@
+{
+  "engine_name": "hf-inference-toolkit",
+  "enabled": 1,
+  "container_port": 8000,
+  "model_format": "safetensors",
+  "engine_images": [
+    {
+      "compute_type": "gpu",
+      "image": "opencsghq/fish-speech:server-cuda",
+      "driver_version": "12.6",
+      "engine_version": "1.5.1"
+    },
+    {
+      "compute_type": "cpu",
+      "image": "opencsghq/fish-speech:server-cpu",
+      "engine_version": "1.5.1"
+    }
+  ],
+  "supported_models": ["openaudio-s1-mini"]
+}
diff --git a/docker/inference/Dockerfile.fishaudio-cpu b/docker/inference/Dockerfile.fishaudio-cpu
@@ -0,0 +1,27 @@
+FROM docker.1ms.run/fishaudio/fish-speech:server-cpu
+USER root
+RUN apt-get update && \
+    apt-get install -y dumb-init && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Switch to the user from base image (fish, UID 1000) to install packages
+# Install directly into the virtual environment to avoid uv sync delays
+USER 1000:1000
+WORKDIR /app
+# Point pip and uv at the Aliyun mirror; uv ignores pip's config, so it needs --index-url itself
+RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple && \
+    uv pip install --index-url https://mirrors.aliyun.com/pypi/simple --no-cache-dir csghub-sdk==0.7.10
+
+USER root
+COPY ./fishaudio/ /etc/csghub/
+RUN chmod +x /etc/csghub/*.sh
+
+WORKDIR /workspace/
+RUN curl -L -o references.tar.gz https://git-devops.opencsg.com/opensource/public_files/-/raw/main/references.tar.gz && \
+    tar -xzf references.tar.gz && \
+    rm references.tar.gz
+ENV HUGGINGFACE_HUB_CACHE=/workspace/ \
+    HF_HUB_ENABLE_HF_TRANSFER=0
+ENV PORT=8000
+EXPOSE 8000
+ENTRYPOINT [ "/usr/bin/dumb-init", "--" ]
+CMD ["/etc/csghub/serve.sh"]
diff --git a/docker/inference/Dockerfile.fishaudio-gpu b/docker/inference/Dockerfile.fishaudio-gpu
@@ -0,0 +1,27 @@
+FROM docker.1ms.run/fishaudio/fish-speech:server-cuda
+
+USER root
+RUN apt-get update && \
+    apt-get install -y dumb-init && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Switch to the user from base image (fish, UID 1000) to install packages
+# Install directly into the virtual environment to avoid uv sync delays
+USER 1000:1000
+WORKDIR /app
+RUN uv pip install --index-url https://mirrors.aliyun.com/pypi/simple --no-cache-dir csghub-sdk==0.7.10
+
+USER root
+COPY ./fishaudio/ /etc/csghub/
+RUN chmod +x /etc/csghub/*.sh
+
+WORKDIR /workspace/
+RUN curl -L -o references.tar.gz https://git-devops.opencsg.com/opensource/public_files/-/raw/main/references.tar.gz && \
+    tar -xzf references.tar.gz && \
+    rm references.tar.gz
+ENV HUGGINGFACE_HUB_CACHE=/workspace/ \
+    HF_HUB_ENABLE_HF_TRANSFER=0
+ENV PORT=8000
+ENV COMPILE=1
+EXPOSE 8000
+ENTRYPOINT [ "/usr/bin/dumb-init", "--" ]
+CMD ["/etc/csghub/serve.sh"]
diff --git a/docker/inference/fishaudio/README.md b/docker/inference/fishaudio/README.md
diff --git a/docker/inference/fishaudio/entry.py b/docker/inference/fishaudio/entry.py
diff --git a/docker/inference/fishaudio/serve.sh b/docker/inference/fishaudio/serve.sh

Original file line number	Diff line number	Diff line change
`@@ -561,6 +561,9 @@ func GetPipelineTaskFromTags(tags []database.Tag) types.PipelineTask {`
`561`	`561`	`if tag.Name == string(types.Text2Image) {`
`562`	`562`	`return types.Text2Image`
`563`	`563`	`}`
	`564`	`+ if tag.Name == string(types.TextToSpeech) {`
	`565`	`+ return types.TextToSpeech`
	`566`	`+ }`
`564`	`567`	`}`
`565`	`568`	`return ""`
`566`	`569`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1370,6 +1370,9 @@ func GetBuiltInTaskFromTags(tags []database.Tag) string {`
`1370`	`1370`	`if tag.Name == string(types.ImageText2Text) {`
`1371`	`1371`	`return tag.Name`
`1372`	`1372`	`}`
	`1373`	`+ if tag.Name == string(types.TextToSpeech) {`
	`1374`	`+ return tag.Name`
	`1375`	`+ }`
`1373`	`1376`	`}`
`1374`	`1377`	`return string(types.TextGeneration)`
`1375`	`1378`	`}`