|
| 1 | +# SPDX-License-Identifier: Apache-2.0 |
| 2 | +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project |
| 3 | +# test_audio.py |
| 4 | +import base64 |
| 5 | +from pathlib import Path |
| 6 | +from unittest.mock import patch |
| 7 | + |
| 8 | +import numpy as np |
| 9 | +import pytest |
| 10 | + |
| 11 | +from vllm.multimodal.audio import (AudioMediaIO, AudioResampler, |
| 12 | + resample_audio_librosa, |
| 13 | + resample_audio_scipy) |
| 14 | + |
| 15 | + |
@pytest.fixture
def dummy_audio():
    """Provide a small 1-D float waveform with five ascending samples."""
    samples = [0.0, 0.1, 0.2, 0.3, 0.4]
    return np.array(samples, dtype=float)
| 19 | + |
| 20 | + |
def test_resample_audio_librosa(dummy_audio):
    """Check that resample_audio_librosa delegates to librosa.resample.

    librosa.resample is patched, so this only verifies the arguments that
    are forwarded and that the patched return value comes back to the caller.
    """
    patch_target = "vllm.multimodal.audio.librosa.resample"

    with patch(patch_target) as fake_resample:
        fake_resample.return_value = dummy_audio * 2
        out = resample_audio_librosa(
            dummy_audio,
            orig_sr=44100,
            target_sr=22050,
        )

    # The mock keeps its call record after the patch is undone.
    fake_resample.assert_called_once_with(
        dummy_audio,
        orig_sr=44100,
        target_sr=22050,
    )
    expected = dummy_audio * 2
    assert np.all(out == expected)
| 31 | + |
| 32 | + |
def test_resample_audio_scipy(dummy_audio):
    """Exercise resample_audio_scipy for halving, doubling and equal rates.

    With the five-sample fixture the test expects 3 samples when the rate is
    halved, 10 samples when it is doubled, and an array equal to the input
    when the source and target rates match.
    """
    out_down = resample_audio_scipy(dummy_audio, orig_sr=4, target_sr=2)
    out_up = resample_audio_scipy(dummy_audio, orig_sr=2, target_sr=4)
    out_same = resample_audio_scipy(dummy_audio, orig_sr=4, target_sr=4)

    assert len(out_down) == 3
    assert len(out_up) == 10
    # np.array_equal also compares shapes, so a wrong-but-broadcastable
    # result (e.g. a single sample) cannot pass the way it could with
    # np.all(a == b).
    assert np.array_equal(out_same, dummy_audio)
| 41 | + |
| 42 | + |
@pytest.mark.xfail(
    reason="resample_audio_scipy is buggy for non-integer ratios")
def test_resample_audio_scipy_non_integer_ratio(dummy_audio):
    """Resample with a 5 -> 3 rate ratio and check length, type and values.

    Marked xfail: the reason string records that the scipy path is currently
    considered buggy for non-integer ratios.
    """
    src_rate, dst_rate = 5, 3
    out = resample_audio_scipy(dummy_audio,
                               orig_sr=src_rate,
                               target_sr=dst_rate)

    # Output length should scale with the rate ratio, rounded to an integer.
    expected_len = int(round(len(dummy_audio) * dst_rate / src_rate))
    assert len(out) == expected_len

    assert isinstance(out, np.ndarray)
    assert np.isfinite(out).all()
| 53 | + |
| 54 | + |
def test_audio_resampler_librosa_calls_resample(dummy_audio):
    """AudioResampler(method="librosa") should call resample_audio_librosa.

    The module-level helper is patched; the test checks the forwarded
    sample rates and that the patched return value is handed back.
    """
    resampler = AudioResampler(target_sr=22050, method="librosa")
    helper_path = "vllm.multimodal.audio.resample_audio_librosa"

    with patch(helper_path) as fake_resample:
        fake_resample.return_value = dummy_audio
        out = resampler.resample(dummy_audio, orig_sr=44100)

    fake_resample.assert_called_once_with(
        dummy_audio,
        orig_sr=44100,
        target_sr=22050,
    )
    assert np.all(out == dummy_audio)
| 65 | + |
| 66 | + |
def test_audio_resampler_scipy_calls_resample(dummy_audio):
    """AudioResampler(method="scipy") should call resample_audio_scipy.

    The module-level helper is patched; the test checks the forwarded
    sample rates and that the patched return value is handed back.
    """
    resampler = AudioResampler(target_sr=22050, method="scipy")
    helper_path = "vllm.multimodal.audio.resample_audio_scipy"

    with patch(helper_path) as fake_resample:
        fake_resample.return_value = dummy_audio
        out = resampler.resample(dummy_audio, orig_sr=44100)

    fake_resample.assert_called_once_with(
        dummy_audio,
        orig_sr=44100,
        target_sr=22050,
    )
    assert np.all(out == dummy_audio)
| 76 | + |
| 77 | + |
def test_audio_resampler_invalid_method(dummy_audio):
    """Resampling with an unrecognised method name must raise ValueError."""
    # Construction sits outside the raises block: only resample() is
    # expected to raise.
    bad_resampler = AudioResampler(target_sr=22050, method="invalid")

    with pytest.raises(ValueError):
        bad_resampler.resample(dummy_audio, orig_sr=44100)
| 82 | + |
| 83 | + |
def test_audio_resampler_no_target_sr(dummy_audio):
    """Resampling without a configured target rate must raise RuntimeError."""
    # Construction sits outside the raises block: only resample() is
    # expected to raise.
    unconfigured = AudioResampler(target_sr=None)

    with pytest.raises(RuntimeError):
        unconfigured.resample(dummy_audio, orig_sr=44100)
| 88 | + |
| 89 | + |
@pytest.fixture
def dummy_audio_bytes():
    """Provide a placeholder byte string standing in for encoded audio."""
    payload = b"FAKEAUDIOBYTES"
    return payload
| 93 | + |
| 94 | + |
def test_audio_media_io_load_bytes(dummy_audio_bytes):
    """load_bytes should call librosa.load once and return its result.

    librosa.load is patched, so the placeholder bytes are never decoded.
    """
    audio_io = AudioMediaIO()
    fake_loaded = (np.array([0.1, 0.2]), 16000)

    with patch("vllm.multimodal.audio.librosa.load",
               return_value=fake_loaded) as fake_load:
        out = audio_io.load_bytes(dummy_audio_bytes)

    fake_load.assert_called_once()
    waveform, sample_rate = out[0], out[1]
    assert isinstance(waveform, np.ndarray)
    assert sample_rate == 16000
| 103 | + |
| 104 | + |
def test_audio_media_io_load_base64(dummy_audio_bytes):
    """load_base64 should call load_bytes once and return its result.

    AudioMediaIO.load_bytes is patched on the class, so only the call into
    it is observed.
    """
    audio_io = AudioMediaIO()
    encoded = base64.b64encode(dummy_audio_bytes).decode("utf-8")
    fake_loaded = (np.array([0.1, 0.2]), 16000)

    with patch.object(AudioMediaIO, "load_bytes",
                      return_value=fake_loaded) as fake_load_bytes:
        out = audio_io.load_base64("audio/wav", encoded)

    fake_load_bytes.assert_called_once()
    waveform, sample_rate = out[0], out[1]
    assert isinstance(waveform, np.ndarray)
    assert sample_rate == 16000
| 114 | + |
| 115 | + |
def test_audio_media_io_load_file():
    """load_file should pass the path to librosa.load with sr=None.

    librosa.load is patched, so the path does not need to exist on disk.
    """
    audio_io = AudioMediaIO()
    wav_path = Path("/fake/path.wav")
    fake_loaded = (np.array([0.1, 0.2]), 16000)

    with patch("vllm.multimodal.audio.librosa.load",
               return_value=fake_loaded) as fake_load:
        out = audio_io.load_file(wav_path)

    fake_load.assert_called_once_with(wav_path, sr=None)
    waveform, sample_rate = out[0], out[1]
    assert isinstance(waveform, np.ndarray)
    assert sample_rate == 16000
| 125 | + |
| 126 | + |
def test_audio_media_io_encode_base64(dummy_audio):
    """encode_base64 should base64-encode whatever soundfile.write emits.

    soundfile.write is patched with a stand-in that writes a fixed payload
    into the buffer it receives, so the decoded output can be compared
    against that payload.
    """
    audio_io = AudioMediaIO()
    fake_wav = b"dummy_wav_data"

    def write_to_buffer(buffer, *_args, **_kwargs):
        # Only the first positional argument (the buffer) is used.
        buffer.write(fake_wav)

    with patch("vllm.multimodal.audio.soundfile.write",
               side_effect=write_to_buffer) as fake_write:
        out = audio_io.encode_base64((dummy_audio, 16000))

    assert base64.b64decode(out) == fake_wav
    fake_write.assert_called_once()
0 commit comments