Skip to content

Commit 914d276

Browse files
authored
fix: Update client to properly infer that FLAC files (along with other browser-renderable audio types) are loaded as DatapointType.AUDIO (#783)
This was previously system-dependent. Some systems would load this type from /etc/mime.types, but it was not defined in Python's built-in mimetypes module.
1 parent 8e5a002 commit 914d276

File tree

2 files changed

+16
-0
lines changed

2 files changed

+16
-0
lines changed

kolena/dataset/dataset.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,9 @@ def _infer_datatype_value_from_url(url: str) -> str:
114114
return datatype
115115
elif url.endswith(".pcd"):
116116
return DatapointType.POINT_CLOUD.value
117+
# Explicit handling for formats not consistently in Python's built-in mimetypes
118+
elif url.lower().endswith((".flac", ".ogg", ".opus", ".aac", ".m4a")):
119+
return DatapointType.AUDIO.value
117120

118121
return DatapointType.TABULAR.value
119122

tests/unit/dataset/test_dataset.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,12 @@
5252
("test.pcd", DatapointType.POINT_CLOUD),
5353
("gcp://summary.pdf", DatapointType.DOCUMENT),
5454
("//my.mp3", DatapointType.AUDIO),
55+
("s3://bucket/audio.flac", DatapointType.AUDIO),
56+
("https://example.com/audio.FLAC", DatapointType.AUDIO),
57+
("audio.ogg", DatapointType.AUDIO),
58+
("/path/to/audio.opus", DatapointType.AUDIO),
59+
("audio.aac", DatapointType.AUDIO),
60+
("audio.m4a", DatapointType.AUDIO),
5561
(None, DatapointType.TABULAR),
5662
(123, DatapointType.TABULAR),
5763
],
@@ -72,6 +78,13 @@ def test__infer_datatype_value(uri: Any, expected: str) -> None:
7278
(".csv", DatapointType.DOCUMENT),
7379
(".pdf", DatapointType.DOCUMENT),
7480
(".mp3", DatapointType.AUDIO),
81+
(".flac", DatapointType.AUDIO),
82+
("flac", DatapointType.AUDIO),
83+
("FLAC", DatapointType.AUDIO),
84+
(".ogg", DatapointType.AUDIO),
85+
(".opus", DatapointType.AUDIO),
86+
(".aac", DatapointType.AUDIO),
87+
(".m4a", DatapointType.AUDIO),
7588
("tabular", DatapointType.TABULAR),
7689
(None, DatapointType.TABULAR),
7790
(123, DatapointType.TABULAR),

0 commit comments

Comments
 (0)