Skip to content

Commit 5b1d12d

Browse files
authored
Replace Sequence by List (#7634)
* no more sequence * docs * fix tests * fix tests and add backward compatibility utilities * fix tests * last fix * last * fix docstrings * again
1 parent d041311 commit 5b1d12d

31 files changed

+497
-601
lines changed

docs/source/about_dataset_features.mdx

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,20 +32,21 @@ Refer to [`Value`] for a full list of supported data types.
3232

3333
The [`ClassLabel`] feature informs 🤗 Datasets the `label` column contains two classes. The classes are labeled `not_equivalent` and `equivalent`. Labels are stored as integers in the dataset. When you retrieve the labels, [`ClassLabel.int2str`] and [`ClassLabel.str2int`] carries out the conversion from integer value to label name, and vice versa.
3434

35-
If your data type contains a list of objects, then you want to use the [`Sequence`] feature. Remember the SQuAD dataset?
35+
If your data type contains a list of objects, then you want to use the [`List`] feature. Remember the SQuAD dataset?
3636

3737
```py
3838
>>> from datasets import load_dataset
3939
>>> dataset = load_dataset('rajpurkar/squad', split='train')
4040
>>> dataset.features
41-
{'answers': Sequence(feature={'text': Value(dtype='string'), 'answer_start': Value(dtype='int32')}, length=-1),
42-
'context': Value(dtype='string'),
43-
'id': Value(dtype='string'),
44-
'question': Value(dtype='string'),
45-
'title': Value(dtype='string')}
41+
{'id': Value(dtype='string'),
42+
'title': Value(dtype='string'),
43+
'context': Value(dtype='string'),
44+
'question': Value(dtype='string'),
45+
'answers': {'text': List(feature=Value(dtype='string'), length=-1),
46+
'answer_start': List(feature=Value(dtype='int32'), length=-1)}}
4647
```
4748

48-
The `answers` field is constructed using the [`Sequence`] feature because it contains two subfields, `text` and `answer_start`, which are lists of `string` and `int32`, respectively.
49+
The `answers` field is constructed using a dict of features because it contains two subfields, `text` and `answer_start`, which are lists of `string` and `int32`, respectively.
4950

5051
<Tip>
5152

docs/source/package_reference/main_classes.mdx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,8 @@ Dictionary with split names as keys ('train', 'test' for example), and `Iterable
235235

236236
[[autodoc]] datasets.LargeList
237237

238+
[[autodoc]] datasets.List
239+
238240
[[autodoc]] datasets.Sequence
239241

240242
### Translation

docs/source/process.mdx

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -265,11 +265,12 @@ Sometimes a column can be a nested structure of several types. Take a look at th
265265
>>> from datasets import load_dataset
266266
>>> dataset = load_dataset("rajpurkar/squad", split="train")
267267
>>> dataset.features
268-
{'answers': Sequence(feature={'text': Value(dtype='string'), 'answer_start': Value(dtype='int32')}, length=-1),
269-
'context': Value(dtype='string'),
270-
'id': Value(dtype='string'),
271-
'question': Value(dtype='string'),
272-
'title': Value(dtype='string')}
268+
{'id': Value(dtype='string'),
269+
'title': Value(dtype='string'),
270+
'context': Value(dtype='string'),
271+
'question': Value(dtype='string'),
272+
'answers': {'text': List(feature=Value(dtype='string'), length=-1),
273+
'answer_start': List(feature=Value(dtype='int32'), length=-1)}}
273274
```
274275

275276
The `answers` field contains two subfields: `text` and `answer_start`. Use the [`~Dataset.flatten`] function to extract the subfields into their own separate columns:

src/datasets/arrow_dataset.py

Lines changed: 34 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,12 @@
7676
from .arrow_writer import ArrowWriter, OptimizedTypedSequence
7777
from .data_files import sanitize_patterns
7878
from .download.streaming_download_manager import xgetsize
79-
from .features import Audio, ClassLabel, Features, Image, Sequence, Value, Video
79+
from .features import Audio, ClassLabel, Features, Image, List, Value, Video
8080
from .features.features import (
8181
FeatureType,
8282
_align_features,
8383
_check_if_features_can_be_aligned,
84+
_fix_for_backward_compatible_features,
8485
generate_from_arrow_type,
8586
pandas_types_mapper,
8687
require_decoding,
@@ -897,6 +898,8 @@ def from_pandas(
897898
f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}"
898899
)
899900
features = features if features is not None else info.features if info is not None else None
901+
if features is not None:
902+
features = _fix_for_backward_compatible_features(features)
900903
if info is None:
901904
info = DatasetInfo()
902905
info.features = features
@@ -942,6 +945,8 @@ def from_polars(
942945
f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}"
943946
)
944947
features = features if features is not None else info.features if info is not None else None
948+
if features is not None:
949+
features = _fix_for_backward_compatible_features(features)
945950
if info is None:
946951
info = DatasetInfo()
947952
info.features = features
@@ -987,6 +992,8 @@ def from_dict(
987992
f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}"
988993
)
989994
features = features if features is not None else info.features if info is not None else None
995+
if features is not None:
996+
features = _fix_for_backward_compatible_features(features)
990997
arrow_typed_mapping = {}
991998
for col, data in mapping.items():
992999
if isinstance(data, (pa.Array, pa.ChunkedArray)):
@@ -1950,14 +1957,14 @@ def class_encode_column(self, column: str, include_nulls: bool = False) -> "Data
19501957
>>> from datasets import load_dataset
19511958
>>> ds = load_dataset("boolq", split="validation")
19521959
>>> ds.features
1953-
{'answer': Value(dtype='bool', id=None),
1954-
'passage': Value(dtype='string', id=None),
1955-
'question': Value(dtype='string', id=None)}
1960+
{'answer': Value(dtype='bool'),
1961+
'passage': Value(dtype='string'),
1962+
'question': Value(dtype='string')}
19561963
>>> ds = ds.class_encode_column('answer')
19571964
>>> ds.features
1958-
{'answer': ClassLabel(num_classes=2, names=['False', 'True'], id=None),
1959-
'passage': Value(dtype='string', id=None),
1960-
'question': Value(dtype='string', id=None)}
1965+
{'answer': ClassLabel(num_classes=2, names=['False', 'True']),
1966+
'passage': Value(dtype='string'),
1967+
'question': Value(dtype='string')}
19611968
```
19621969
"""
19631970
# Sanity checks
@@ -2028,11 +2035,12 @@ def flatten(self, new_fingerprint: Optional[str] = None, max_depth=16) -> "Datas
20282035
>>> from datasets import load_dataset
20292036
>>> ds = load_dataset("rajpurkar/squad", split="train")
20302037
>>> ds.features
2031-
{'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None),
2032-
'context': Value(dtype='string', id=None),
2033-
'id': Value(dtype='string', id=None),
2034-
'question': Value(dtype='string', id=None),
2035-
'title': Value(dtype='string', id=None)}
2038+
{'id': Value(dtype='string'),
2039+
'title': Value(dtype='string'),
2040+
'context': Value(dtype='string'),
2041+
'question': Value(dtype='string'),
2042+
'answers': {'text': List(feature=Value(dtype='string'), length=-1),
2043+
'answer_start': List(feature=Value(dtype='int32'), length=-1)}}
20362044
>>> ds.flatten()
20372045
Dataset({
20382046
features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'],
@@ -2100,15 +2108,15 @@ def cast(
21002108
>>> from datasets import load_dataset, ClassLabel, Value
21012109
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
21022110
>>> ds.features
2103-
{'label': ClassLabel(names=['neg', 'pos'], id=None),
2104-
'text': Value(dtype='string', id=None)}
2111+
{'label': ClassLabel(names=['neg', 'pos']),
2112+
'text': Value(dtype='string')}
21052113
>>> new_features = ds.features.copy()
21062114
>>> new_features['label'] = ClassLabel(names=['bad', 'good'])
21072115
>>> new_features['text'] = Value('large_string')
21082116
>>> ds = ds.cast(new_features)
21092117
>>> ds.features
2110-
{'label': ClassLabel(names=['bad', 'good'], id=None),
2111-
'text': Value(dtype='large_string', id=None)}
2118+
{'label': ClassLabel(names=['bad', 'good']),
2119+
'text': Value(dtype='large_string')}
21122120
```
21132121
"""
21142122
if sorted(features) != sorted(self._data.column_names):
@@ -2117,6 +2125,7 @@ def cast(
21172125
f"as the columns in the dataset: {self._data.column_names}"
21182126
)
21192127

2128+
features = _fix_for_backward_compatible_features(features)
21202129
schema = features.arrow_schema
21212130
format = self.format
21222131
dataset = self.with_format("arrow")
@@ -2158,14 +2167,15 @@ def cast_column(self, column: str, feature: FeatureType, new_fingerprint: Option
21582167
>>> from datasets import load_dataset, ClassLabel
21592168
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
21602169
>>> ds.features
2161-
{'label': ClassLabel(names=['neg', 'pos'], id=None),
2162-
'text': Value(dtype='string', id=None)}
2170+
{'label': ClassLabel(names=['neg', 'pos']),
2171+
'text': Value(dtype='string')}
21632172
>>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))
21642173
>>> ds.features
2165-
{'label': ClassLabel(names=['bad', 'good'], id=None),
2166-
'text': Value(dtype='string', id=None)}
2174+
{'label': ClassLabel(names=['bad', 'good']),
2175+
'text': Value(dtype='string')}
21672176
```
21682177
"""
2178+
feature = _fix_for_backward_compatible_features(feature)
21692179
if hasattr(feature, "decode_example"):
21702180
dataset = copy.deepcopy(self)
21712181
dataset._info.features[column] = feature
@@ -3082,6 +3092,9 @@ def map(
30823092
if fn_kwargs is None:
30833093
fn_kwargs = {}
30843094

3095+
if features is not None:
3096+
features = _fix_for_backward_compatible_features(features)
3097+
30853098
if num_proc is not None and num_proc > len(self):
30863099
num_proc = len(self)
30873100
logger.warning(
@@ -6350,7 +6363,7 @@ def process_label_ids(batch):
63506363
features[label_column] = (
63516364
ClassLabel(num_classes=len(label_names), names=label_names)
63526365
if isinstance(label_feature, ClassLabel)
6353-
else Sequence(ClassLabel(num_classes=len(label_names), names=label_names))
6366+
else List(ClassLabel(num_classes=len(label_names), names=label_names))
63546367
)
63556368
return self.map(process_label_ids, features=features, batched=True, desc="Aligning the labels")
63566369

src/datasets/builder.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -513,7 +513,7 @@ def get_all_exported_dataset_infos(cls) -> DatasetInfosDict:
513513
>>> from datasets import load_dataset_builder
514514
>>> ds_builder = load_dataset_builder('vivos')
515515
>>> ds_builder.get_all_exported_dataset_infos()
516-
{'default': DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string', id=None), 'path': Value(dtype='string', id=None), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)}
516+
{'default': DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string'), 'path': Value(dtype='string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)}
517517
```
518518
"""
519519
return DatasetInfosDict.from_directory(cls.get_imported_module_dir())
@@ -527,7 +527,7 @@ def get_exported_dataset_info(self) -> DatasetInfo:
527527
>>> from datasets import load_dataset_builder
528528
>>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')
529529
>>> ds_builder.get_exported_dataset_info()
530-
DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string', id=None), 'path': Value(dtype='string', id=None), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)
530+
DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string'), 'path': Value(dtype='string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)
531531
```
532532
"""
533533
return self.get_all_exported_dataset_infos().get(self.config.name, DatasetInfo())

src/datasets/dataset_dict.py

Lines changed: 28 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -201,11 +201,12 @@ def flatten(self, max_depth=16) -> "DatasetDict":
201201
>>> from datasets import load_dataset
202202
>>> ds = load_dataset("rajpurkar/squad")
203203
>>> ds["train"].features
204-
{'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None),
205-
'context': Value(dtype='string', id=None),
206-
'id': Value(dtype='string', id=None),
207-
'question': Value(dtype='string', id=None),
208-
'title': Value(dtype='string', id=None)}
204+
{'id': Value(dtype='string'),
205+
'title': Value(dtype='string'),
206+
'context': Value(dtype='string'),
207+
'question': Value(dtype='string'),
208+
'answers.text': List(feature=Value(dtype='string'), length=-1),
209+
'answers.answer_start': List(feature=Value(dtype='int32'), length=-1)}
209210
>>> ds.flatten()
210211
DatasetDict({
211212
train: Dataset({
@@ -288,15 +289,15 @@ def cast(self, features: Features) -> "DatasetDict":
288289
>>> from datasets import load_dataset, ClassLabel, Value
289290
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes")
290291
>>> ds["train"].features
291-
{'label': ClassLabel(names=['neg', 'pos'], id=None),
292-
'text': Value(dtype='string', id=None)}
292+
{'label': ClassLabel(names=['neg', 'pos']),
293+
'text': Value(dtype='string')}
293294
>>> new_features = ds["train"].features.copy()
294295
>>> new_features['label'] = ClassLabel(names=['bad', 'good'])
295296
>>> new_features['text'] = Value('large_string')
296297
>>> ds = ds.cast(new_features)
297298
>>> ds["train"].features
298-
{'label': ClassLabel(names=['bad', 'good'], id=None),
299-
'text': Value(dtype='large_string', id=None)}
299+
{'label': ClassLabel(names=['bad', 'good']),
300+
'text': Value(dtype='large_string')}
300301
```
301302
"""
302303
self._check_values_type()
@@ -320,12 +321,12 @@ def cast_column(self, column: str, feature) -> "DatasetDict":
320321
>>> from datasets import load_dataset, ClassLabel
321322
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes")
322323
>>> ds["train"].features
323-
{'label': ClassLabel(names=['neg', 'pos'], id=None),
324-
'text': Value(dtype='string', id=None)}
324+
{'label': ClassLabel(names=['neg', 'pos']),
325+
'text': Value(dtype='string')}
325326
>>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))
326327
>>> ds["train"].features
327-
{'label': ClassLabel(names=['bad', 'good'], id=None),
328-
'text': Value(dtype='string', id=None)}
328+
{'label': ClassLabel(names=['bad', 'good']),
329+
'text': Value(dtype='string')}
329330
```
330331
"""
331332
self._check_values_type()
@@ -512,14 +513,14 @@ def class_encode_column(self, column: str, include_nulls: bool = False) -> "Data
512513
>>> from datasets import load_dataset
513514
>>> ds = load_dataset("boolq")
514515
>>> ds["train"].features
515-
{'answer': Value(dtype='bool', id=None),
516-
'passage': Value(dtype='string', id=None),
517-
'question': Value(dtype='string', id=None)}
516+
{'answer': Value(dtype='bool'),
517+
'passage': Value(dtype='string'),
518+
'question': Value(dtype='string')}
518519
>>> ds = ds.class_encode_column("answer")
519520
>>> ds["train"].features
520-
{'answer': ClassLabel(num_classes=2, names=['False', 'True'], id=None),
521-
'passage': Value(dtype='string', id=None),
522-
'question': Value(dtype='string', id=None)}
521+
{'answer': ClassLabel(num_classes=2, names=['False', 'True']),
522+
'passage': Value(dtype='string'),
523+
'question': Value(dtype='string')}
523524
```
524525
"""
525526
self._check_values_type()
@@ -2379,12 +2380,12 @@ def cast_column(self, column: str, feature: FeatureType) -> "IterableDatasetDict
23792380
>>> from datasets import load_dataset, ClassLabel
23802381
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True)
23812382
>>> ds["train"].features
2382-
{'label': ClassLabel(names=['neg', 'pos'], id=None),
2383-
'text': Value(dtype='string', id=None)}
2383+
{'label': ClassLabel(names=['neg', 'pos']),
2384+
'text': Value(dtype='string')}
23842385
>>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))
23852386
>>> ds["train"].features
2386-
{'label': ClassLabel(names=['bad', 'good'], id=None),
2387-
'text': Value(dtype='string', id=None)}
2387+
{'label': ClassLabel(names=['bad', 'good']),
2388+
'text': Value(dtype='string')}
23882389
```
23892390
"""
23902391
return IterableDatasetDict(
@@ -2415,15 +2416,15 @@ def cast(
24152416
>>> from datasets import load_dataset
24162417
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True)
24172418
>>> ds["train"].features
2418-
{'label': ClassLabel(names=['neg', 'pos'], id=None),
2419-
'text': Value(dtype='string', id=None)}
2419+
{'label': ClassLabel(names=['neg', 'pos']),
2420+
'text': Value(dtype='string')}
24202421
>>> new_features = ds["train"].features.copy()
24212422
>>> new_features['label'] = ClassLabel(names=['bad', 'good'])
24222423
>>> new_features['text'] = Value('large_string')
24232424
>>> ds = ds.cast(new_features)
24242425
>>> ds["train"].features
2425-
{'label': ClassLabel(names=['bad', 'good'], id=None),
2426-
'text': Value(dtype='large_string', id=None)}
2426+
{'label': ClassLabel(names=['bad', 'good']),
2427+
'text': Value(dtype='large_string')}
24272428
```
24282429
"""
24292430
return IterableDatasetDict({k: dataset.cast(features=features) for k, dataset in self.items()})

src/datasets/features/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"ClassLabel",
88
"Features",
99
"LargeList",
10+
"List",
1011
"Sequence",
1112
"Value",
1213
"Image",
@@ -16,7 +17,7 @@
1617
"Pdf",
1718
]
1819
from .audio import Audio
19-
from .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, LargeList, Sequence, Value
20+
from .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, LargeList, List, Sequence, Value
2021
from .image import Image
2122
from .pdf import Pdf
2223
from .translation import Translation, TranslationVariableLanguages

0 commit comments

Comments
 (0)