 
 from pydantic import BaseModel
 
+if t.TYPE_CHECKING:
+    from pandas import DataFrame as PandasDataFrame
+
 from .backends import BaseBackend, get_registry
+from .backends.inmemory import InMemoryBackend
 
 # For backwards compatibility, use typing_extensions for older Python versions
 try:
@@ -190,6 +194,59 @@ def load(
             # Unvalidated mode - keep as dicts but wrapped in Dataset API
             return cls(name, backend, None, dict_data)
 
+    @classmethod
+    def from_pandas(
+        cls: t.Type[Self],
+        dataframe: "PandasDataFrame",
+        name: str,
+        backend: t.Union[BaseBackend, str],
+        data_model: t.Optional[t.Type[T]] = None,
+        **kwargs,
+    ) -> Self:
+        """Create a DataTable from a pandas DataFrame.
+
+        Args:
+            dataframe: The pandas DataFrame to convert
+            name: Name of the dataset
+            backend: Either a BaseBackend instance or a backend name string (e.g., "local/csv")
+            data_model: Optional Pydantic model for validation
+            **kwargs: Additional arguments passed to the backend constructor (when using a string backend)
+
+        Returns:
+            DataTable instance with data from the DataFrame
+
+        Examples:
+            # Using a string backend name
+            dataset = Dataset.from_pandas(df, "my_data", "local/csv", root_dir="./data")
+
+            # Using a backend instance
+            backend = LocalCSVBackend(root_dir="./data")
+            dataset = Dataset.from_pandas(df, "my_data", backend)
+        """
+        try:
+            import pandas as pd
+        except ImportError:
+            raise ImportError(
+                "pandas is not installed. Please install it to use this function."
+            )
+
+        if not isinstance(dataframe, pd.DataFrame):
+            raise TypeError(f"Expected pandas DataFrame, got {type(dataframe)}")
+
+        # Convert DataFrame to list of dictionaries
+        dict_data = dataframe.to_dict(orient="records")
+
+        # Resolve backend if string
+        backend = cls._resolve_backend(backend, **kwargs)
+
+        if data_model:
+            # Validated mode - convert dicts to Pydantic models
+            validated_data = [data_model(**d) for d in dict_data]
+            return cls(name, backend, data_model, validated_data)
+        else:
+            # Unvalidated mode - keep as dicts but wrapped in DataTable API
+            return cls(name, backend, None, dict_data)
+
     def save(self) -> None:
         """Save dataset - converts to dicts if needed"""
         dict_data: t.List[t.Dict[str, t.Any]] = []
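
A minimal usage sketch for the new from_pandas classmethod. Illustrative only: the "local/csv" backend string, root_dir argument, and LocalCSVBackend come from the docstring above, while the EvalRecord model, the column names, and having Dataset already imported are assumptions.

    import pandas as pd
    from pydantic import BaseModel


    class EvalRecord(BaseModel):  # hypothetical schema for the validated path
        question: str
        score: float


    df = pd.DataFrame(
        [
            {"question": "What is 2 + 2?", "score": 1.0},
            {"question": "Name a prime number.", "score": 0.5},
        ]
    )

    # Unvalidated mode: rows are kept as plain dicts behind the DataTable API.
    raw = Dataset.from_pandas(df, "my_data", "local/csv", root_dir="./data")

    # Validated mode: each row is parsed into the Pydantic model.
    typed = Dataset.from_pandas(
        df, "my_data", "local/csv", data_model=EvalRecord, root_dir="./data"
    )
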
@@ -252,6 +309,27 @@ def validate_with(self, data_model: t.Type[T]) -> Self:
             data=validated_data,
         )
 
+    def to_pandas(self) -> "PandasDataFrame":
+        """Convert the dataset to a pandas DataFrame."""
+        try:
+            import pandas as pd
+        except ImportError:
+            raise ImportError(
+                "pandas is not installed. Please install it to use this function."
+            )
+
+        # Convert data to list of dictionaries
+        dict_data: t.List[t.Dict[str, t.Any]] = []
+        for item in self._data:
+            if isinstance(item, BaseModel):
+                dict_data.append(item.model_dump())
+            elif isinstance(item, dict):
+                dict_data.append(item)
+            else:
+                raise TypeError(f"Unexpected type in dataset: {type(item)}")
+
+        return pd.DataFrame(dict_data)
+
     def append(self, item: t.Union[t.Dict, BaseModel]) -> None:
         """Add item to dataset with validation if model exists"""
         if self.data_model is not None:
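
The reverse direction with the new to_pandas method, continuing the sketch above under the same assumptions: BaseModel rows are model_dump()'d and dict rows are passed through before the DataFrame is built.

    round_tripped = typed.to_pandas()
    assert list(round_tripped.columns) == ["question", "score"]
    assert len(round_tripped) == len(df)
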
@@ -290,21 +368,17 @@ def __str__(self):
 
         return f"{self.DATATABLE_TYPE}(name={self.name}, {data_model_str} len={len(self._data)})"
 
-    __repr__ = __str__
-
-
-class Dataset(DataTable[T]):
-    """Dataset class for managing dataset entries.
-
-    Inherits all functionality from DataTable. This class represents
-    datasets specifically (as opposed to experiments).
-    """
+    def get_row_value(self, row, key: str):
+        """Helper method to get value from row (dict or BaseModel)"""
 
-    DATATABLE_TYPE = "Dataset"
+        if isinstance(row, dict):
+            return row.get(key)
+        else:
+            return getattr(row, key, None)
 
     def train_test_split(
         self, test_size: float = 0.2, random_state: t.Optional[int] = None
-    ) -> t.Tuple["Dataset[T]", "Dataset[T]"]:
+    ) -> t.Tuple["DataTable[T]", "DataTable[T]"]:
         """Split the dataset into training and testing sets.
 
         Args:
@@ -327,6 +401,9 @@ def train_test_split(
         split_index = int(len(self._data) * (1 - test_size))
 
         # Create new dataset instances with proper initialization
+        # Use inmemory backend for split datasets (temporary datasets)
+        inmemory_backend = InMemoryBackend()
+
         # Handle type-safe constructor calls based on data_model presence
         if self.data_model is not None:
             # Validated dataset case - data should be List[T]
@@ -335,14 +412,14 @@ def train_test_split(
 
             train_dataset = type(self)(
                 name=f"{self.name}_train",
-                backend=self.backend,
+                backend=inmemory_backend,
                 data_model=self.data_model,
                 data=train_data,
             )
 
             test_dataset = type(self)(
                 name=f"{self.name}_test",
-                backend=self.backend,
+                backend=inmemory_backend,
                 data_model=self.data_model,
                 data=test_data,
             )
@@ -353,16 +430,32 @@ def train_test_split(
 
             train_dataset = type(self)(
                 name=f"{self.name}_train",
-                backend=self.backend,
+                backend=inmemory_backend,
                 data_model=None,
                 data=train_data,
             )
 
             test_dataset = type(self)(
                 name=f"{self.name}_test",
-                backend=self.backend,
+                backend=inmemory_backend,
                 data_model=None,
                 data=test_data,
            )
 
+        # save to inmemory backend
+        train_dataset.save()
+        test_dataset.save()
+
         return train_dataset, test_dataset
+
+    __repr__ = __str__
+
+
+class Dataset(DataTable[T]):
+    """Dataset class for managing dataset entries.
+
+    Inherits all functionality from DataTable. This class represents
+    datasets specifically (as opposed to experiments).
+    """
+
+    DATATABLE_TYPE = "Dataset"
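
A short sketch of how train_test_split behaves after this change, reusing the hypothetical raw dataset from the earlier sketch: both splits are now built on a fresh InMemoryBackend and saved there, rather than written back through the parent dataset's backend.

    train, test = raw.train_test_split(test_size=0.25, random_state=42)

    print(train.name)  # "my_data_train" (~75% of the rows)
    print(test.name)   # "my_data_test" (~25% of the rows)

    # The splits behave like any other DataTable, e.g. they can be exported again:
    train_df = train.to_pandas()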