@@ -112,11 +112,8 @@ def test_from_features(catalog):
112112 params = "parent" ,
113113 output = {"file" : File , "t1" : MyFr },
114114 )
115- df1 = ds .to_pandas ()
116-
117- assert df1 [["t1.nnn" , "t1.count" ]].equals (
118- pd .DataFrame ({"t1.nnn" : ["n1" , "n2" , "n1" ], "t1.count" : [3 , 5 , 1 ]})
119- )
115+ for i , (_ , t1 ) in enumerate (ds .iterate ()):
116+ assert t1 == features [i ]
120117
121118
122119def test_preserve_feature_schema (catalog ):
@@ -212,33 +209,33 @@ class _TestFr(BaseModel):
212209 params = "t1" ,
213210 output = {"x" : _TestFr },
214211 )
212+ # assert ds.collect() == 1
215213
216- df = ds .to_pandas ()
214+ for i , (x ,) in enumerate (ds .iterate ()):
215+ assert isinstance (x , _TestFr )
217216
218- assert df ["x.my_name" ].tolist () == ["n1" , "n2" , "n1" ]
219- assert np .allclose (df ["x.sqrt" ], [math .sqrt (x ) for x in [3 , 5 , 1 ]])
220- with pytest .raises (KeyError ):
221- df ["x.t1.nnn" ]
217+ fr = features [i ]
218+ test_fr = _TestFr (file = File (name = "" ), sqrt = math .sqrt (fr .count ), my_name = fr .nnn )
219+ assert x == test_fr
222220
223221
def test_map(catalog):
    """Check that ``map`` yields one ``_TestFr`` per input feature under signal ``x``.

    Every element of ``features`` (defined outside this test) is mapped to a
    ``_TestFr`` holding the square root of its ``count`` and its ``nnn`` with a
    ``"_suf"`` suffix; the collected ``x`` values must equal that list.
    """

    class _TestFr(BaseModel):
        sqrt: float
        my_name: str

    source = DataChain.from_values(t1=features)
    dc = source.map(
        x=lambda m_fr: _TestFr(
            my_name=m_fr.nnn + "_suf",
            sqrt=math.sqrt(m_fr.count),
        ),
        params="t1",
        output={"x": _TestFr},
    )

    # Expected records are derived from the same inputs the chain was built from.
    expected = [
        _TestFr(my_name=fr.nnn + "_suf", sqrt=math.sqrt(fr.count)) for fr in features
    ]
    actual = dc.collect_one("x")
    assert actual == expected
242239
243240
244241def test_agg (catalog ):
@@ -247,26 +244,31 @@ class _TestFr(BaseModel):
247244 cnt : int
248245 my_name : str
249246
250- df = (
251- DataChain .from_values (t1 = features )
252- .agg (
253- x = lambda frs : [
254- _TestFr (
255- f = File (name = "" ),
256- cnt = sum (f .count for f in frs ),
257- my_name = "-" .join ([fr .nnn for fr in frs ]),
258- )
259- ],
260- partition_by = C .t1 .nnn ,
261- params = "t1" ,
262- output = {"x" : _TestFr },
263- )
264- .to_pandas ()
247+ dc = DataChain .from_values (t1 = features ).agg (
248+ x = lambda frs : [
249+ _TestFr (
250+ f = File (name = "" ),
251+ cnt = sum (f .count for f in frs ),
252+ my_name = "-" .join ([fr .nnn for fr in frs ]),
253+ )
254+ ],
255+ partition_by = C .t1 .nnn ,
256+ params = "t1" ,
257+ output = {"x" : _TestFr },
265258 )
266259
267- assert len (df ) == 2
268- assert df ["x.my_name" ].tolist () == ["n1-n1" , "n2" ]
269- assert df ["x.cnt" ].tolist () == [4 , 5 ]
260+ assert dc .collect_one ("x" ) == [
261+ _TestFr (
262+ f = File (name = "" ),
263+ cnt = sum (fr .count for fr in features if fr .nnn == "n1" ),
264+ my_name = "-" .join ([fr .nnn for fr in features if fr .nnn == "n1" ]),
265+ ),
266+ _TestFr (
267+ f = File (name = "" ),
268+ cnt = sum (fr .count for fr in features if fr .nnn == "n2" ),
269+ my_name = "-" .join ([fr .nnn for fr in features if fr .nnn == "n2" ]),
270+ ),
271+ ]
270272
271273
272274def test_agg_two_params (catalog ):
@@ -294,10 +296,8 @@ class _TestFr(BaseModel):
294296 output = {"x" : _TestFr },
295297 )
296298
297- df = ds .to_pandas ()
298- assert len (df ) == 2
299- assert df ["x.my_name" ].tolist () == ["n1-n1" , "n2" ]
300- assert df ["x.cnt" ].tolist () == [12 , 15 ]
299+ assert ds .collect_one ("x.my_name" ) == ["n1-n1" , "n2" ]
300+ assert ds .collect_one ("x.cnt" ) == [12 , 15 ]
301301
302302
303303def test_agg_simple_iterator (catalog ):
@@ -356,10 +356,8 @@ def func(key, val) -> Iterator[tuple[File, _ImageGroup]]:
356356 values = [1 , 5 , 9 ]
357357 ds = DataChain .from_values (key = keys , val = values ).agg (x = func , partition_by = C ("key" ))
358358
359- df = ds .to_pandas ()
360- assert len (df ) == 2
361- assert df ["x_1.name" ].tolist () == ["n1-n1" , "n2" ]
362- assert df ["x_1.size" ].tolist () == [10 , 5 ]
359+ assert ds .collect_one ("x_1.name" ) == ["n1-n1" , "n2" ]
360+ assert ds .collect_one ("x_1.size" ) == [10 , 5 ]
363361
364362
365363def test_agg_tuple_result_generator (catalog ):
@@ -376,10 +374,8 @@ def func(key, val) -> Generator[tuple[File, _ImageGroup], None, None]:
376374 values = [1 , 5 , 9 ]
377375 ds = DataChain .from_values (key = keys , val = values ).agg (x = func , partition_by = C ("key" ))
378376
379- df = ds .to_pandas ()
380- assert len (df ) == 2
381- assert df ["x_1.name" ].tolist () == ["n1-n1" , "n2" ]
382- assert df ["x_1.size" ].tolist () == [10 , 5 ]
377+ assert ds .collect_one ("x_1.name" ) == ["n1-n1" , "n2" ]
378+ assert ds .collect_one ("x_1.size" ) == [10 , 5 ]
383379
384380
385381def test_iterate (catalog ):
@@ -829,15 +825,15 @@ def test_from_features_object_name(tmp_dir, catalog):
829825 values = ["odd" if num % 2 else "even" for num in fib ]
830826
831827 dc = DataChain .from_values (fib = fib , odds = values , object_name = "custom" )
832- assert "custom.fib" in dc .to_pandas ().columns
828+ assert "custom.fib" in dc .to_pandas (flatten = True ).columns
833829
834830
def test_parse_tabular_object_name(tmp_dir, catalog):
    """Check that ``parse_tabular(object_name=...)`` names the parsed signal.

    ``DF_DATA`` (defined outside this test) is written to a parquet file under
    ``tmp_dir`` and read back through ``DataChain.from_storage``. With
    ``object_name="tbl"`` the flattened pandas frame must expose the
    ``first_name`` column as ``tbl.first_name``.
    """
    path = tmp_dir / "test.parquet"
    pd.DataFrame(DF_DATA).to_parquet(path)

    # The object name and the asserted column prefix must be the same
    # whitespace-free identifier.
    dc = DataChain.from_storage(path.as_uri()).parse_tabular(object_name="tbl")
    assert "tbl.first_name" in dc.to_pandas(flatten=True).columns
841837
842838
843839def test_sys_feature (tmp_dir , catalog ):
@@ -868,3 +864,12 @@ def test_sys_feature(tmp_dir, catalog):
868864 MyFr (nnn = "n1" , count = 1 ),
869865 ]
870866 assert "sys" not in ds_no_sys .catalog .get_dataset ("ds_no_sys" ).feature_schema
867+
868+
def test_to_pandas_multi_level(catalog):
    """Check that ``to_pandas()`` without ``flatten`` nests fields under their signal.

    Indexing the resulting frame by the signal name ``t1`` must give a
    sub-frame whose columns are the feature's own fields (``nnn`` and
    ``count``), with values in the order of ``features``.

    The ``catalog`` fixture is requested like in the other tests of this
    module. NOTE(review): presumably this keeps the chain on the test catalog
    instead of a default one -- confirm against the fixture definition.
    """
    df = DataChain.from_values(t1=features).to_pandas()

    assert "t1" in df.columns

    # Look the nested frame up once and check both of its fields.
    t1 = df["t1"]
    assert "nnn" in t1.columns
    assert "count" in t1.columns
    assert t1["nnn"].tolist() == ["n1", "n2", "n1"]
    assert t1["count"].tolist() == [3, 5, 1]
0 commit comments