@@ -230,19 +230,42 @@ def is_lambda(fn):
230
230
231
231
232
232
@pytest .mark .parametrize (
233
- 'entries' ,
233
+ 'entries,expected_records ' ,
234
234
[
235
- [
236
- {'text' : 'Dummy example 0' , 'index' : 0 },
237
- {'text' : 'Dummy example 1' , 'index' : 1 },
238
- ],
239
- [
240
- {'text' : 'Dummy example 0' , 'index' : 0 },
241
- {'text' : None , 'index' : 1 },
242
- ],
235
+ (
236
+ [
237
+ {'text' : 'Dummy example 0' , 'index' : 0 },
238
+ ],
239
+ [
240
+ {'text' : b'Dummy example 0' , 'index' : 0 },
241
+ ],
242
+ ),
243
+ (
244
+ [
245
+ {'text' : 'Dummy example 0' , 'index' : 0 },
246
+ {'text' : None , 'index' : 1 },
247
+ ],
248
+ [
249
+ {'text' : b'Dummy example 0' , 'index' : 0 },
250
+ {'text' : None , 'index' : 1 },
251
+ ],
252
+ ),
253
+ (
254
+ [],
255
+ [],
256
+ ),
257
+ # If entries is None, dummy_croissant_file will create two dummy
258
+ # entries.
259
+ (
260
+ None ,
261
+ [
262
+ {'text' : b'Dummy example 0' , 'index' : 0 },
263
+ {'text' : b'Dummy example 1' , 'index' : 1 },
264
+ ],
265
+ ),
243
266
],
244
267
)
245
- def test_dummy_croissant_file (entries ):
268
+ def test_dummy_croissant_file (entries , expected_records ):
246
269
with test_utils .dummy_croissant_file (entries = entries ) as croissant_file :
247
270
dataset = mlc .Dataset (jsonld = croissant_file )
248
271
@@ -255,9 +278,7 @@ def test_dummy_croissant_file(entries):
255
278
assert [record_set .id for record_set in dataset .metadata .record_sets ] == [
256
279
'jsonl'
257
280
]
281
+ if entries is not None :
282
+ assert len (tuple (dataset .records ('jsonl' ))) == len (expected_records )
258
283
for i , record in enumerate (dataset .records ('jsonl' )):
259
- assert record ['index' ] == entries [i ]['index' ]
260
- if record ['text' ] is not None :
261
- assert record ['text' ].decode () == entries [i ]['text' ]
262
- else :
263
- assert record ['text' ] == entries [i ]['text' ]
284
+ assert record == expected_records [i ]
0 commit comments