Skip to content

Commit c0a712b

Browse files
rdheekonda and monoxgas
authored
feat: Add Comprehensive Dreadnode Data Types: Image, Audio, Video, Table, and Object3D (#48)
* Add log object support for custom data types: Image, Video, Table, Audio, and 3D Objects * Fix ruff errors * Fix mypy errors * Updated poetry lock * Add missing dependency * Fix object identification to maintain schema uniqueness while preserving storage efficiency --------- Co-authored-by: monoxgas <monoxgas@gmail.com>
1 parent e6dfb71 commit c0a712b

File tree

18 files changed

+2360
-139
lines changed

18 files changed

+2360
-139
lines changed

dreadnode/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from dreadnode.data_types import Audio, Image, Object3D, Table, Video
12
from dreadnode.main import DEFAULT_INSTANCE, Dreadnode
23
from dreadnode.metric import Metric, MetricDict, Scorer
34
from dreadnode.object import Object
@@ -30,15 +31,22 @@
3031
__version__ = VERSION
3132

3233
__all__ = [
34+
"Audio",
3335
"Dreadnode",
36+
"Image",
3437
"Metric",
3538
"MetricDict",
3639
"Object",
40+
"Object3D",
41+
"Run",
3742
"RunSpan",
3843
"Scorer",
3944
"Span",
45+
"Table",
4046
"Task",
4147
"TaskSpan",
48+
"Video",
49+
"__version__",
4250
"api",
4351
"configure",
4452
"link_objects",

dreadnode/data_types/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from .audio import Audio
2+
from .image import Image
3+
from .object_3d import Object3D
4+
from .table import Table
5+
from .video import Video
6+
7+
__all__ = ["Audio", "Image", "Object3D", "Table", "Video"]

dreadnode/data_types/audio.py

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
import io
2+
import typing as t
3+
from pathlib import Path
4+
5+
import numpy as np
6+
import soundfile as sf # type: ignore # noqa: PGH003
7+
from pydub import AudioSegment # type: ignore # noqa: PGH003
8+
9+
from dreadnode.data_types.base_data_type import BaseDataType
10+
11+
# Every input form ``Audio`` accepts: a file path (str or Path), a numpy array,
# raw bytes, or a pydub AudioSegment.
AudioDataType: t.TypeAlias = str | Path | np.ndarray[t.Any, t.Any] | bytes | AudioSegment
12+
13+
14+
class Audio(BaseDataType):
    """
    Audio media type for Dreadnode logging.

    Supports:
    - Local file paths (str or Path)
    - Numpy arrays with sample rate
    - Raw bytes
    - Pydub AudioSegment object

    The input is stored as-is at construction time; all conversion and
    validation happens lazily when ``to_serializable`` is called.
    """

    def __init__(
        self,
        data: AudioDataType,
        sample_rate: int | None = None,
        caption: str | None = None,
        format: str | None = None,
    ):
        """
        Initialize an Audio object.

        Args:
            data: The audio data, which can be:
                - A path to a local audio file (str or Path)
                - A numpy array (requires sample_rate)
                - Raw bytes
                - A pydub AudioSegment
            sample_rate: Required when using numpy arrays
            caption: Optional caption for the audio
            format: Optional format to use. When omitted, file paths fall back
                to their suffix and every other input falls back to "wav".
        """
        self._data = data
        self._sample_rate = sample_rate
        self._caption = caption
        self._format = format

    def to_serializable(self) -> tuple[t.Any, dict[str, t.Any]]:
        """
        Serialize the audio data to bytes and return with metadata.

        Returns:
            A tuple of (audio_bytes, metadata_dict)

        Raises:
            FileNotFoundError: If the data is a str/Path that does not exist.
            TypeError: If the data is not one of the supported input types.
            ValueError: If the data is a numpy array and no sample_rate was given.
        """
        audio_bytes, format_name, sample_rate, duration = self._process_audio_data()
        metadata = self._generate_metadata(format_name, sample_rate, duration)
        return audio_bytes, metadata

    def _process_audio_data(self) -> tuple[bytes, str, int | None, float | None]:
        """
        Dispatch to the handler that matches the type of the stored data.

        Returns:
            A tuple of (audio_bytes, format_name, sample_rate, duration)

        Raises:
            FileNotFoundError: If the data is a str/Path that does not exist.
            TypeError: If the data is not one of the supported input types.
        """
        data = self._data
        if isinstance(data, (str, Path)):
            # A str/Path is only ever interpreted as a file path, so a missing
            # file must be reported as such rather than as an unsupported type.
            if not Path(data).exists():
                raise FileNotFoundError(f"Audio file not found: {data}")
            return self._process_file_path()
        if isinstance(data, np.ndarray):
            return self._process_numpy_array()
        if isinstance(data, bytes):
            return self._process_raw_bytes()
        if isinstance(data, AudioSegment):
            return self._process_pydub_audio_segment()
        raise TypeError(f"Unsupported audio data type: {type(data)}")

    def _process_file_path(self) -> tuple[bytes, str, int | None, float | None]:
        """
        Process audio from file path. Obtain sample rate and duration using soundfile.

        The file content is returned unchanged. The format name is the explicit
        ``format`` argument if given, otherwise the lower-cased file suffix,
        otherwise "wav".

        Returns:
            A tuple of (audio_bytes, format_name, sample_rate, duration)
        """
        path = Path(str(self._data))
        audio_bytes = path.read_bytes()
        format_name = self._format or path.suffix.lstrip(".").lower() or "wav"

        with sf.SoundFile(str(path)) as f:
            # An explicitly supplied sample rate wins over the one in the file
            # header; duration is always derived from the file's own rate.
            sample_rate = self._sample_rate or f.samplerate
            duration = f.frames / f.samplerate

        return audio_bytes, format_name, sample_rate, duration

    def _process_numpy_array(self) -> tuple[bytes, str, int | None, float | None]:
        """
        Encode a numpy array with soundfile (WAV unless ``format`` was given).

        Returns:
            A tuple of (audio_bytes, format_name, sample_rate, duration)

        Raises:
            ValueError: If no sample_rate was supplied.
            TypeError: If the stored data is not a numpy array.
        """
        if self._sample_rate is None:
            raise ValueError('Argument "sample_rate" is required when using numpy arrays.')
        # Validate before encoding so a wrong type is rejected up front instead
        # of after soundfile has already been handed the data.
        if not isinstance(self._data, np.ndarray):
            raise TypeError("Invalid data type for numpy array processing.")

        format_name = self._format or "wav"
        buffer = io.BytesIO()
        sf.write(buffer, self._data, self._sample_rate, format=format_name)
        audio_bytes = buffer.getvalue()

        # The length of the first axis is taken as the number of frames.
        duration = len(self._data) / float(self._sample_rate)

        return audio_bytes, format_name, self._sample_rate, duration

    def _process_raw_bytes(self) -> tuple[bytes, str, int | None, float | None]:
        """
        Process raw bytes. Format is determined by the provided format argument.

        The bytes are passed through untouched, so no duration is computed and
        the sample rate is whatever the caller supplied (possibly None).

        Returns:
            A tuple of (audio_bytes, format_name, sample_rate, duration)

        Raises:
            TypeError: If the stored data is not bytes.
        """
        format_name = self._format or "wav"
        if not isinstance(self._data, bytes):
            raise TypeError("Raw bytes are expected for this processing method.")
        return self._data, format_name, self._sample_rate, None

    def _process_pydub_audio_segment(self) -> tuple[bytes, str, int | None, float | None]:
        """
        Process pydub AudioSegment to bytes.

        Returns:
            A tuple of (audio_bytes, format_name, sample_rate, duration)

        Raises:
            TypeError: If the stored data is not an AudioSegment.
        """
        if not isinstance(self._data, AudioSegment):
            raise TypeError("AudioSegment is expected for this processing method.")

        # The segment's own frame rate is used; the sample_rate argument is ignored here.
        sample_rate = self._data.frame_rate

        format_name = self._format or "wav"
        buffer = io.BytesIO()
        self._data.export(buffer, format=format_name)
        audio_bytes = buffer.getvalue()

        # PyDUB provides duration in milliseconds, convert to seconds for consistency
        duration = len(self._data) / 1000.0

        return audio_bytes, format_name, sample_rate, duration

    def _generate_metadata(
        self, format_name: str, sample_rate: int | None, duration: float | None
    ) -> dict[str, str | int | float | None]:
        """
        Generate metadata for the audio data.

        Args:
            format_name: Format/extension of the serialized bytes.
            sample_rate: Sample rate, omitted from the metadata when None.
            duration: Duration in seconds, omitted from the metadata when None.

        Returns:
            A dictionary of metadata
        """
        metadata: dict[str, str | int | float | None] = {
            "extension": format_name.lower(),
            "x-python-datatype": "dreadnode.Audio.bytes",
        }

        # Record where the audio came from.
        if isinstance(self._data, (str, Path)):
            metadata["source-type"] = "file"
            metadata["source-path"] = str(self._data)
        elif isinstance(self._data, np.ndarray):
            metadata["source-type"] = "numpy.ndarray"
        elif isinstance(self._data, bytes):
            metadata["source-type"] = "bytes"
        elif isinstance(self._data, AudioSegment):
            metadata["source-type"] = "pydub.AudioSegment"

        if sample_rate is not None:
            metadata["sample-rate"] = sample_rate

        if duration is not None:
            metadata["duration"] = duration

        # Add pydub-specific metadata if available
        if isinstance(self._data, AudioSegment):
            metadata["channels"] = self._data.channels
            metadata["sample-width"] = self._data.sample_width

        if self._caption:
            metadata["caption"] = self._caption

        return metadata
dreadnode/data_types/base_data_type.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import typing as t
2+
from abc import ABC, abstractmethod
3+
4+
5+
class BaseDataType(ABC):
    """Abstract parent of every data type that can be logged with Dreadnode."""

    @abstractmethod
    def to_serializable(self) -> tuple[t.Any, dict[str, t.Any]]:
        """
        Produce the serializable form of this object.

        Concrete data types must override this method; the class cannot be
        instantiated until they do.

        Returns:
            A ``(data, metadata)`` pair where:
            - data: the serialized payload
            - metadata: additional information describing this data type
        """

0 commit comments

Comments
 (0)