|
26 | 26 | NASA_AUDIO_MP3, |
27 | 27 | NASA_VIDEO, |
28 | 28 | SINE_MONO_S32, |
| 29 | + SINE_MONO_S32_44100, |
| 30 | + SINE_MONO_S32_8000, |
29 | 31 | ) |
30 | 32 |
|
31 | 33 |
|
@@ -1088,3 +1090,65 @@ def test_format_conversion(self): |
1088 | 1090 |
|
1089 | 1091 | reference_frames = asset.get_frame_data_by_range(start=0, stop=asset.num_frames) |
1090 | 1092 | torch.testing.assert_close(all_samples.data, reference_frames) |
| 1093 | + |
| 1094 | + @pytest.mark.parametrize( |
| 1095 | + "start_seconds, stop_seconds", |
| 1096 | + ( |
| 1097 | + (0, None), |
| 1098 | + (0, 4), |
| 1099 | + (0, 3), |
| 1100 | + (2, None), |
| 1101 | + (2, 3), |
| 1102 | + ), |
| 1103 | + ) |
| 1104 | + def test_sample_rate_conversion(self, start_seconds, stop_seconds): |
| 1105 | + # When start_seconds is not exactly 0, we have to increase the tolerance |
| 1106 | + # a bit. This is because sample_rate conversion relies on a sliding |
| 1107 | + # window of samples: if we start a stream in the middle, the first few |
| 1108 | + # samples aren't able to take advantage of the preceding samples. |
| 1109 | + atol = 1e-4 if start_seconds == 0 else 1e-2 |
| 1110 | + rtol = 1e-6 |
| 1111 | + |
| 1112 | + # Upsample |
| 1113 | + decoder = AudioDecoder(SINE_MONO_S32_44100.path) |
| 1114 | + assert decoder.metadata.sample_rate == 44_100 |
| 1115 | + frames_44100_native = decoder.get_samples_played_in_range( |
| 1116 | + start_seconds=start_seconds, stop_seconds=stop_seconds |
| 1117 | + ) |
| 1118 | + assert frames_44100_native.sample_rate == 44_100 |
| 1119 | + |
| 1120 | + decoder = AudioDecoder(SINE_MONO_S32.path, sample_rate=44_100) |
| 1121 | + frames_upsampled_to_44100 = decoder.get_samples_played_in_range( |
| 1122 | + start_seconds=start_seconds, stop_seconds=stop_seconds |
| 1123 | + ) |
| 1124 | + assert decoder.metadata.sample_rate == 16_000 |
| 1125 | + assert frames_upsampled_to_44100.sample_rate == 44_100 |
| 1126 | + |
| 1127 | + torch.testing.assert_close( |
| 1128 | + frames_upsampled_to_44100.data, |
| 1129 | + frames_44100_native.data, |
| 1130 | + atol=atol, |
| 1131 | + rtol=rtol, |
| 1132 | + ) |
| 1133 | + |
| 1134 | + # Downsample |
| 1135 | + decoder = AudioDecoder(SINE_MONO_S32_8000.path) |
| 1136 | + assert decoder.metadata.sample_rate == 8000 |
| 1137 | + frames_8000_native = decoder.get_samples_played_in_range( |
| 1138 | + start_seconds=start_seconds, stop_seconds=stop_seconds |
| 1139 | + ) |
| 1140 | + assert frames_8000_native.sample_rate == 8000 |
| 1141 | + |
| 1142 | + decoder = AudioDecoder(SINE_MONO_S32.path, sample_rate=8000) |
| 1143 | + frames_downsampled_to_8000 = decoder.get_samples_played_in_range( |
| 1144 | + start_seconds=start_seconds, stop_seconds=stop_seconds |
| 1145 | + ) |
| 1146 | + assert decoder.metadata.sample_rate == 16_000 |
| 1147 | + assert frames_downsampled_to_8000.sample_rate == 8000 |
| 1148 | + |
| 1149 | + torch.testing.assert_close( |
| 1150 | + frames_downsampled_to_8000.data, |
| 1151 | + frames_8000_native.data, |
| 1152 | + atol=atol, |
| 1153 | + rtol=rtol, |
| 1154 | + ) |
0 commit comments