
Commit 926ea8c

Refactor Docker configurations and update test mocks for development routers
1 parent e8c1284 · commit 926ea8c

11 files changed: 63 additions and 49 deletions


.gitignore

Lines changed: 0 additions & 1 deletion
@@ -6,7 +6,6 @@ ui/data/*
 *.db
 *.pyc
 *.pth
-*.pt
 
 Kokoro-82M/*
 __pycache__/

api/src/routers/openai_compatible.py

Lines changed: 1 addition & 0 deletions
@@ -93,6 +93,7 @@ async def create_speech(
                 "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
                 "X-Accel-Buffering": "no",  # Disable proxy buffering
                 "Cache-Control": "no-cache",  # Prevent caching
+                "Transfer-Encoding": "chunked",  # Enable chunked transfer encoding
             },
         )
     else:
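
For orientation, the header block above sits in the streaming branch of create_speech, which returns a FastAPI StreamingResponse. A minimal sketch of that shape, assuming a standalone FastAPI app, a fixed media type, and a placeholder chunk generator (none of these is the actual router code):

# Hedged sketch: illustrates the StreamingResponse headers added in this commit.
# The endpoint path, media type, and generator below are assumptions for the example.
from fastapi import FastAPI
from fastapi.responses import StreamingResponse

app = FastAPI()

@app.post("/v1/audio/speech")
async def create_speech_sketch():
    async def audio_chunks():
        yield b"\x00\x00" * 1024  # placeholder audio bytes

    return StreamingResponse(
        audio_chunks(),
        media_type="audio/mpeg",
        headers={
            "Content-Disposition": "attachment; filename=speech.mp3",
            "X-Accel-Buffering": "no",       # Disable proxy buffering
            "Cache-Control": "no-cache",     # Prevent caching
            "Transfer-Encoding": "chunked",  # Enable chunked transfer encoding
        },
    )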

api/src/services/audio.py

Lines changed: 11 additions & 12 deletions
@@ -104,7 +104,7 @@ def convert_audio(
             # Raw 16-bit PCM samples, no header
             buffer.write(normalized_audio.tobytes())
         elif output_format == "wav":
-            # Always use soundfile for WAV to ensure proper headers and normalization
+            # WAV format with headers
             sf.write(
                 buffer,
                 normalized_audio,
@@ -113,14 +113,14 @@ def convert_audio(
                 subtype="PCM_16",
             )
         elif output_format == "mp3":
-            # Use format settings or defaults
+            # MP3 format with proper framing
             settings = format_settings.get("mp3", {}) if format_settings else {}
             settings = {**AudioService.DEFAULT_SETTINGS["mp3"], **settings}
             sf.write(
                 buffer, normalized_audio, sample_rate, format="MP3", **settings
             )
-
         elif output_format == "opus":
+            # Opus format in OGG container
             settings = format_settings.get("opus", {}) if format_settings else {}
             settings = {**AudioService.DEFAULT_SETTINGS["opus"], **settings}
             sf.write(
@@ -131,8 +131,8 @@ def convert_audio(
                 subtype="OPUS",
                 **settings,
             )
-
         elif output_format == "flac":
+            # FLAC format with proper framing
             if is_first_chunk:
                 logger.info("Starting FLAC stream...")
             settings = format_settings.get("flac", {}) if format_settings else {}
@@ -145,15 +145,14 @@ def convert_audio(
                 subtype="PCM_16",
                 **settings,
             )
+        elif output_format == "aac":
+            raise ValueError(
+                "Format aac not currently supported. Supported formats are: wav, mp3, opus, flac, pcm."
+            )
         else:
-            if output_format == "aac":
-                raise ValueError(
-                    "Format aac not supported. Supported formats are: wav, mp3, opus, flac, pcm."
-                )
-            else:
-                raise ValueError(
-                    f"Format {output_format} not supported. Supported formats are: wav, mp3, opus, flac, pcm."
-                )
+            raise ValueError(
+                f"Format {output_format} not supported. Supported formats are: wav, mp3, opus, flac, pcm, aac."
+            )
 
         buffer.seek(0)
         return buffer.getvalue()
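
A small usage sketch of the refactored error path. The call shape follows test_audio_service.py; the numpy input and the import path (api.src.services.audio) are assumptions based on the file layout in this commit:

# Hedged sketch of the unsupported-format path after this refactor; the input
# array is arbitrary test data, mirroring the tests' use of np.zeros.
import numpy as np
from api.src.services.audio import AudioService

audio = np.zeros(24000, dtype=np.float32)  # one second of silence at 24 kHz

try:
    AudioService.convert_audio(audio, 24000, "aac")
except ValueError as err:
    # The dedicated aac branch now raises before the generic fallback,
    # so the message names aac as "not currently supported".
    print(err)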

api/src/services/tts_service.py

Lines changed: 2 additions & 1 deletion
@@ -177,14 +177,15 @@ async def generate_audio_stream(
                     )
 
                     if chunk_audio is not None:
-                        # Convert chunk with proper header handling
+                        # Convert chunk with proper streaming header handling
                         chunk_bytes = AudioService.convert_audio(
                             chunk_audio,
                             24000,
                             output_format,
                             is_first_chunk=is_first,
                             normalizer=stream_normalizer,
                             is_last_chunk=(next_chunk is None),  # Last if no next chunk
+                            stream=True  # Ensure proper streaming format handling
                         )
 
                         yield chunk_bytes
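
For context, stream=True is passed per chunk so each output format gets streaming-safe framing. A rough consumer sketch follows; the TTSService constructor and the generator's argument names other than output_format are illustrative assumptions, not the documented API:

# Hedged sketch of a streaming consumer; argument names here (text, voice)
# are assumptions about generate_audio_stream's signature.
import asyncio
from api.src.services.tts_service import TTSService

async def save_stream(text: str, path: str) -> None:
    service = TTSService()
    with open(path, "wb") as out:
        async for chunk_bytes in service.generate_audio_stream(text, "af_bella", output_format="mp3"):
            out.write(chunk_bytes)  # each chunk already carries streaming-safe framing

asyncio.run(save_stream("Hello there!", "speech.mp3"))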

api/tests/conftest.py

Lines changed: 2 additions & 2 deletions
@@ -181,7 +181,7 @@ def mock_tts_service(monkeypatch):
     # Mock TTSModel.generate_from_tokens since we call it directly
     mock_generate = Mock(return_value=np.zeros(48000))
     monkeypatch.setattr(
-        "api.src.routers.text_processing.TTSModel.generate_from_tokens", mock_generate
+        "api.src.routers.development.TTSModel.generate_from_tokens", mock_generate
     )
 
     return mock_service
@@ -192,5 +192,5 @@ def mock_audio_service(monkeypatch):
     """Mock AudioService"""
     mock_service = Mock()
     mock_service.convert_audio.return_value = b"mock audio data"
-    monkeypatch.setattr("api.src.routers.text_processing.AudioService", mock_service)
+    monkeypatch.setattr("api.src.routers.development.AudioService", mock_service)
     return mock_service

api/tests/test_audio_service.py

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@ def test_convert_to_aac_raises_error(sample_audio):
     audio_data, sample_rate = sample_audio
     with pytest.raises(
         ValueError,
-        match="Format aac not supported. Supported formats are: wav, mp3, opus, flac, pcm.",
+        match="Failed to convert audio to aac: Format aac not currently supported. Supported formats are: wav, mp3, opus, flac, pcm.",
     ):
         AudioService.convert_audio(audio_data, sample_rate, "aac")
 

api/tests/test_text_processing.py

Lines changed: 5 additions & 5 deletions
@@ -20,8 +20,8 @@ async def async_client():
 @pytest.mark.asyncio
 async def test_phonemize_endpoint(async_client):
     """Test phoneme generation endpoint"""
-    with patch("api.src.routers.text_processing.phonemize") as mock_phonemize, patch(
-        "api.src.routers.text_processing.tokenize"
+    with patch("api.src.routers.development.phonemize") as mock_phonemize, patch(
+        "api.src.routers.development.tokenize"
     ) as mock_tokenize:
         # Setup mocks
         mock_phonemize.return_value = "həlˈoʊ"
@@ -56,7 +56,7 @@ async def test_generate_from_phonemes(
 ):
     """Test audio generation from phonemes"""
     with patch(
-        "api.src.routers.text_processing.TTSService", return_value=mock_tts_service
+        "api.src.routers.development.TTSService", return_value=mock_tts_service
     ):
         response = await async_client.post(
             "/text/generate_from_phonemes",
@@ -76,7 +76,7 @@ async def test_generate_from_phonemes_invalid_voice(async_client, mock_tts_servi
     """Test audio generation with invalid voice"""
     mock_tts_service._get_voice_path.return_value = None
     with patch(
-        "api.src.routers.text_processing.TTSService", return_value=mock_tts_service
+        "api.src.routers.development.TTSService", return_value=mock_tts_service
     ):
         response = await async_client.post(
             "/text/generate_from_phonemes",
@@ -111,7 +111,7 @@ async def test_generate_from_phonemes_invalid_speed(async_client, monkeypatch):
 async def test_generate_from_phonemes_empty_phonemes(async_client, mock_tts_service):
     """Test audio generation with empty phonemes"""
     with patch(
-        "api.src.routers.text_processing.TTSService", return_value=mock_tts_service
+        "api.src.routers.development.TTSService", return_value=mock_tts_service
     ):
         response = await async_client.post(
             "/text/generate_from_phonemes",

docker-compose.cpu.yml

Lines changed: 8 additions & 7 deletions
@@ -26,11 +26,11 @@ services:
       start_period: 1s
 
   kokoro-tts:
-    # image: ghcr.io/remsky/kokoro-fastapi:latest-cpu
-    # Uncomment below to build from source instead of using the released image
-    build:
-      context: .
-      dockerfile: Dockerfile.cpu
+    image: ghcr.io/remsky/kokoro-fastapi:latest-cpu
+    # Uncomment below (and comment out above) to build from source instead of using the released image
+    # build:
+    #   context: .
+    #   dockerfile: Dockerfile.cpu
     volumes:
       - ./api/src:/app/api/src
       - ./Kokoro-82M:/app/Kokoro-82M
@@ -52,8 +52,8 @@ services:
 
   # Gradio UI service [Comment out everything below if you don't need it]
   gradio-ui:
-    # image: ghcr.io/remsky/kokoro-fastapi:latest-ui
-    # Uncomment below to build from source instead of using the released image
+    image: ghcr.io/remsky/kokoro-fastapi:latest-ui
+    # Uncomment below (and comment out above) to build from source instead of using the released image
     build:
       context: ./ui
     ports:
@@ -63,3 +63,4 @@ services:
       - ./ui/app.py:/app/app.py  # Mount app.py for hot reload
     environment:
       - GRADIO_WATCH=True  # Enable hot reloading
+      - PYTHONUNBUFFERED=1  # Ensure Python output is not buffered

docker-compose.yml

Lines changed: 18 additions & 8 deletions
@@ -32,10 +32,10 @@ services:
       start_period: 1s
 
   kokoro-tts:
-    # image: ghcr.io/remsky/kokoro-fastapi:latest
-    # Uncomment below to build from source instead of using the released image
-    build:
-      context: .
+    image: ghcr.io/remsky/kokoro-fastapi:latest
+    # Uncomment below (and comment out above) to build from source instead of using the released image
+    # build:
+    #   context: .
     volumes:
       - ./api/src:/app/api/src
       - ./Kokoro-82M:/app/Kokoro-82M
@@ -50,20 +50,30 @@ services:
             - driver: nvidia
               count: 1
               capabilities: [gpu]
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8880/v1/audio/voices"]
+      interval: 10s
+      timeout: 5s
+      retries: 30
+      start_period: 30s
     depends_on:
       model-fetcher:
        condition: service_healthy
 
   # Gradio UI service [Comment out everything below if you don't need it]
   gradio-ui:
-    # image: ghcr.io/remsky/kokoro-fastapi:latest-ui
-    # Uncomment below to build from source instead of using the released image
-    build:
-      context: ./ui
+    image: ghcr.io/remsky/kokoro-fastapi:latest-ui
+    # Uncomment below (and comment out above) to build from source instead of using the released image
+    # build:
+    #   context: ./ui
     ports:
       - "7860:7860"
     volumes:
       - ./ui/data:/app/ui/data
       - ./ui/app.py:/app/app.py  # Mount app.py for hot reload
     environment:
      - GRADIO_WATCH=True  # Enable hot reloading
+      - PYTHONUNBUFFERED=1  # Ensure Python output is not buffered
+    depends_on:
+      kokoro-tts:
+        condition: service_healthy
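
The new healthcheck simply curls /v1/audio/voices until it responds, and the gradio-ui service now waits on that condition. The same readiness probe can be reproduced from Python, e.g. in scripts that wait for the container before sending requests; this is a standard-library sketch whose defaults mirror the compose values above:

# Hedged sketch mirroring the compose healthcheck: poll the voices endpoint
# until it answers 200 or the retry budget is exhausted.
import time
import urllib.request

def wait_for_kokoro(url: str = "http://localhost:8880/v1/audio/voices",
                    retries: int = 30, interval: float = 10.0) -> bool:
    for _ in range(retries):
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    return True
        except OSError:
            pass  # service not up yet; retry after the interval
        time.sleep(interval)
    return False

if __name__ == "__main__":
    print("kokoro-tts healthy:", wait_for_kokoro())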

examples/openai_streaming_audio.py

Lines changed: 14 additions & 11 deletions
@@ -1,23 +1,25 @@
 #!/usr/bin/env rye run python
-# %%
 import time
 from pathlib import Path
 
 from openai import OpenAI
 
 # gets OPENAI_API_KEY from your environment variables
-openai = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed-for-local")
+openai = OpenAI(base_url="http://localhost:8880/v1", api_key="not-needed-for-local")
 
 speech_file_path = Path(__file__).parent / "speech.mp3"
 
 
-
-
-
-
 def main() -> None:
     stream_to_speakers()
 
+    # Create text-to-speech audio file
+    with openai.audio.speech.with_streaming_response.create(
+        model="kokoro",
+        voice="af_bella",
+        input="the quick brown fox jumped over the lazy dogs",
+    ) as response:
+        response.stream_to_file(speech_file_path)
 
 
 def stream_to_speakers() -> None:
@@ -31,9 +33,12 @@ def stream_to_speakers() -> None:
 
     with openai.audio.speech.with_streaming_response.create(
         model="kokoro",
-        voice=VOICE,
-        response_format="mp3",  # similar to WAV, but without a header chunk at the start.
-        input="""My dear sir, that is just where you are wrong. That is just where the whole world has gone wrong. We are always getting away from the present moment. Our mental existences, which are immaterial and have no dimensions, are passing along the Time-Dimension""",
+        voice="af_bella",
+        response_format="pcm",  # similar to WAV, but without a header chunk at the start.
+        input="""I see skies of blue and clouds of white
+The bright blessed days, the dark sacred nights
+And I think to myself
+What a wonderful world""",
     ) as response:
         print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms")
         for chunk in response.iter_bytes(chunk_size=1024):
@@ -44,5 +49,3 @@ def stream_to_speakers() -> None:
 
 if __name__ == "__main__":
     main()
-
-# %%
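
Since the example now requests response_format="pcm", the streamed chunks are headerless 16-bit samples at 24 kHz. One possible way to play them as they arrive is sketched below with pyaudio; the example's actual audio backend is not shown in this diff, so treat the backend choice as an assumption:

# Hedged sketch of playing raw PCM chunks with pyaudio; the repo example may
# use a different audio backend. Assumes mono 16-bit PCM at 24 kHz.
import pyaudio

def play_pcm_chunks(chunks, sample_rate: int = 24000) -> None:
    pa = pyaudio.PyAudio()
    stream = pa.open(format=pyaudio.paInt16, channels=1, rate=sample_rate, output=True)
    try:
        for chunk in chunks:
            stream.write(chunk)  # chunks are the bytes from response.iter_bytes(...)
    finally:
        stream.stop_stream()
        stream.close()
        pa.terminate()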
