Skip to content

Commit 90d7d10

Browse files
authored
Feature: 🚀 Add Audio Content Support to OpenAISpec Request (#439)
* feat: add AudioContent and InputAudio models to support audio data in chat messages
* chore: add fixtures for OpenAI request data with audio support in WAV and FLAC formats
* test: add end-to-end tests for OpenAI Spec audio input handling with different audio formats
* test: rename test function for audio input parity with OpenAI Spec
* fix: update comment style for data attribute in InputAudio model
* refactor: remove unnecessary line breaks
1 parent b3d4a97 commit 90d7d10

File tree

4 files changed

+102
-1
lines changed

4 files changed

+102
-1
lines changed

src/litserve/specs/openai.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,16 @@ class ImageContent(BaseModel):
7777
image_url: Union[str, ImageContentURL]
7878

7979

80+
# Inline audio payload of an "input_audio" content part.
# Documented with comments rather than a docstring so the generated schema is unchanged.
class InputAudio(BaseModel):
    # Base64-encoded audio data, carried as text.
    data: str
    # Validation accepts only these two format tags; any other value is rejected.
    format: Literal["wav", "mp3"]
83+
84+
85+
# One audio element of a chat message's content list.
# Documented with comments rather than a docstring so the generated schema is unchanged.
class AudioContent(BaseModel):
    # Fixed tag identifying this content part as audio.
    type: Literal["input_audio"]
    # The audio payload itself (base64 data plus its format tag).
    input_audio: InputAudio
88+
89+
8090
class Function(BaseModel):
8191
name: str
8292
description: str
@@ -133,7 +143,7 @@ class ResponseFormatJSONSchema(BaseModel):
133143

134144
class ChatMessage(BaseModel):
135145
role: str
136-
content: Optional[Union[str, List[Union[TextContent, ImageContent]]]] = None
146+
content: Optional[Union[str, List[Union[TextContent, ImageContent, AudioContent]]]] = None
137147
name: Optional[str] = None
138148
tool_calls: Optional[List[ToolCall]] = None
139149
tool_call_id: Optional[str] = None

tests/conftest.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14+
import base64
1415
import time
1516
from typing import Generator
1617

@@ -193,6 +194,44 @@ def openai_request_data_with_image():
193194
}
194195

195196

197+
@pytest.fixture
def openai_request_data_with_audio_wav(openai_request_data):
    """Return the base request with its messages replaced by a text + "wav" audio user message."""
    # Ten placeholder bytes (1..10) stand in for audio; they are not a real WAV stream.
    placeholder_bytes = bytes(range(1, 11))
    audio_b64 = base64.b64encode(placeholder_bytes).decode("utf-8")

    text_part = {"type": "text", "text": "What is in this recording?"}
    audio_part = {"type": "input_audio", "input_audio": {"data": audio_b64, "format": "wav"}}

    # Shallow copy is enough: only the top-level "messages" key is rebound.
    payload = openai_request_data.copy()
    payload["messages"] = [{"role": "user", "content": [text_part, audio_part]}]
    return payload
214+
215+
216+
@pytest.fixture
def openai_request_data_with_audio_flac(openai_request_data):
    """Return the base request with its messages replaced by a text + "flac" audio user message.

    "flac" is not one of the values allowed by ``InputAudio.format``, so this payload
    is suited to exercising the validation-failure path.
    """
    # Ten placeholder bytes (1..10) stand in for audio; they are not a real FLAC stream.
    placeholder_bytes = bytes(range(1, 11))
    audio_b64 = base64.b64encode(placeholder_bytes).decode("utf-8")

    text_part = {"type": "text", "text": "What is in this recording?"}
    audio_part = {"type": "input_audio", "input_audio": {"data": audio_b64, "format": "flac"}}

    # Shallow copy is enough: only the top-level "messages" key is rebound.
    payload = openai_request_data.copy()
    payload["messages"] = [{"role": "user", "content": [text_part, audio_part]}]
    return payload
233+
234+
196235
@pytest.fixture
197236
def openai_request_data_with_tools():
198237
return {

tests/e2e/test_e2e.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,34 @@ def test_openai_parity_with_image_input():
183183
)
184184

185185

186+
@e2e_from_file("tests/e2e/default_openaispec.py")
def test_openai_parity_with_audio_input(openai_request_data_with_audio_wav):
    """Send an audio-bearing chat request through the OpenAI client, non-streaming then streaming."""
    client = OpenAI(base_url="http://127.0.0.1:8000/v1", api_key="lit")  # api_key is required, but unused
    messages = openai_request_data_with_audio_wav["messages"]

    # Non-streaming call: the whole answer arrives in a single message.
    completion = client.chat.completions.create(model="lit", messages=messages)
    assert completion.choices[0].message.content == "This is a generated output", (
        f"Server didn't return expected output\nOpenAI client output: {completion}"
    )

    # Streaming call: compare each chunk's delta with the expected sequence.
    stream = client.chat.completions.create(model="lit", messages=messages, stream=True)
    expected_deltas = ["This is a generated output", None]
    # NOTE(review): zip stops at the shorter iterable, so a stream that yields fewer
    # chunks than expected would not fail here. Confirm whether that is intended.
    for chunk, expected_delta in zip(stream, expected_deltas):
        assert chunk.choices[0].delta.content == expected_delta, (
            f"Server didn't return expected output.\nOpenAI client output: {chunk}"
        )
212+
213+
186214
@e2e_from_file("tests/e2e/default_openaispec_tools.py")
187215
def test_openai_parity_with_tools():
188216
client = OpenAI(

tests/test_specs.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,30 @@ async def test_openai_spec_with_image(openai_request_data_with_image):
147147
)
148148

149149

150+
@pytest.mark.asyncio
async def test_openai_spec_with_audio(openai_request_data_with_audio_wav, openai_request_data_with_audio_flac):
    """Check that a "wav" audio request succeeds and a "flac" one is rejected with a 422."""
    endpoint = "/v1/chat/completions"
    unsupported_format_msg = "Input should be 'wav' or 'mp3'"
    server = ls.LitServer(TestAPI(), spec=OpenAISpec())

    with wrap_litserve_start(server) as server:
        async with LifespanManager(server.app) as manager, AsyncClient(
            transport=ASGITransport(app=manager.app), base_url="http://test"
        ) as ac:
            # Supported format: the request validates and the API's output is returned.
            wav_resp = await ac.post(endpoint, json=openai_request_data_with_audio_wav, timeout=10)
            assert wav_resp.status_code == 200, "Status code should be 200"

            generated = wav_resp.json()["choices"][0]["message"]["content"]
            assert generated == "This is a generated output", (
                "LitAPI predict response should match with the generated output"
            )

            # Unsupported format: request validation should fail before prediction.
            flac_resp = await ac.post(endpoint, json=openai_request_data_with_audio_flac, timeout=10)
            assert flac_resp.status_code == 422, "Status code should be 422"
            validation_errors = flac_resp.json()["detail"]
            assert any(err["msg"] == unsupported_format_msg for err in validation_errors), (
                "Error message for unsupported audio format should be present"
            )
172+
173+
150174
@pytest.mark.asyncio
151175
async def test_override_encode(openai_request_data):
152176
server = ls.LitServer(TestAPIWithCustomEncode(), spec=OpenAISpec())

0 commit comments

Comments
 (0)