edge_tts 支持对语速的控制

shell-nlp · shell-nlp · commit d7fa9d3ae9e6 · 2024-12-23T17:47:48.000+08:00
diff --git a/gpt_server/openai_api_protocol/custom_api_protocol.py b/gpt_server/openai_api_protocol/custom_api_protocol.py
@@ -30,7 +30,9 @@ class SpeechRequest(BaseModel):
     )
     speed: Optional[float] = Field(
         default=1.0,
-        description="The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.",
+        description="The speed of the generated audio. Select a value from 0.25 to 5.0. 1.0 is the default.",
+        ge=0,
+        le=5,
     )
 
 
diff --git a/gpt_server/serving/openai_api_server.py b/gpt_server/serving/openai_api_server.py
@@ -3,9 +3,8 @@
 - Chat Completions. (Reference: https://platform.openai.com/docs/api-reference/chat)
 - Completions. (Reference: https://platform.openai.com/docs/api-reference/completions)
 - Embeddings. (Reference: https://platform.openai.com/docs/api-reference/embeddings)
-
-Usage:
-python3 -m fastchat.serve.openai_api_server
+- Moderations. (Reference: https://platform.openai.com/docs/api-reference/moderations)
+- Audio. (Reference: https://platform.openai.com/docs/api-reference/audio)
 """
 
 import asyncio
@@ -721,7 +720,14 @@ async def speech(request: SpeechRequest):
         )
     filename = f"{uuid.uuid4()}.mp3"
     output_path = os.path.join(OUTPUT_DIR, filename)
-    communicate = edge_tts.Communicate(text=request.input, voice=request.voice)
+    rate = 1.0
+    if request.speed >= 1:
+        rate = f"+{int((request.speed - 1) * 100)}%"
+    else:
+        rate = f"-{int((1-request.speed) * 100)}%"
+    communicate = edge_tts.Communicate(
+        text=request.input, voice=request.voice, rate=rate
+    )
     await communicate.save(output_path)
     return FileResponse(output_path, media_type="audio/mpeg", filename=filename)
 
diff --git a/gpt_server/version.py b/gpt_server/version.py
@@ -1,6 +1,6 @@
 from typing import Tuple
 
-__version__ = "0.3.2"
+__version__ = "0.3.5"
 short_version = __version__
 
 
diff --git a/tests/test_tts.py b/tests/test_tts.py
@@ -1,12 +1,25 @@
 from pathlib import Path
 from openai import OpenAI
+import asyncio
+import edge_tts
+from rich import print
 
-# 新版本 opnai
-client = OpenAI(api_key="EMPTY", base_url="http://localhost:8082/v1")
-speech_file_path = Path(__file__).parent / "speech.mp3"
-response = client.audio.speech.create(
-    model="edge_tts",
-    voice="zh-CN-YunxiNeural",
-    input="你好啊，我是人工智能。",
-)
-response.write_to_file(speech_file_path)
+
+async def main():
+    list_voices = await edge_tts.list_voices()
+    zh_list_voices = [i["ShortName"] for i in list_voices if "zh-CN" in i["ShortName"]]
+    print(f"支持以下中文voice: \n{zh_list_voices}")
+    # 新版本 openai
+    client = OpenAI(api_key="EMPTY", base_url="http://localhost:8082/v1")
+    speech_file_path = Path(__file__).parent / "speech.mp3"
+    response = client.audio.speech.create(
+        model="edge_tts",
+        voice="zh-CN-YunxiNeural",
+        input="你好啊，我是人工智能。",
+        speed=1.0,
+    )
+    response.write_to_file(speech_file_path)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())

Original file line number	Diff line number	Diff line change
`@@ -30,7 +30,9 @@ class SpeechRequest(BaseModel):`
`30`	`30`	`)`
`31`	`31`	`speed: Optional[float] = Field(`
`32`	`32`	`default=1.0,`
`33`		`- description="The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.",`
	`33`	`+ description="The speed of the generated audio. Select a value from 0.25 to 5.0. 1.0 is the default.",`
	`34`	`+ ge=0,`
	`35`	`+ le=5,`
`34`	`36`	`)`
`35`	`37`
`36`	`38`