Skip to content

Commit 5b1d12d

Browse files
authored
Replace Sequence by List (#7634)
* no more sequence * docs * fix tests * fix tests and add backward compatibility utilities * fix tests * last fix * last * fix docstrings * again
1 parent d041311 commit 5b1d12d

31 files changed

+497
-601
lines changed

docs/source/about_dataset_features.mdx

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,20 +32,21 @@ Refer to [`Value`] for a full list of supported data types.
3232

3333
The [`ClassLabel`] feature informs 🤗 Datasets the `label` column contains two classes. The classes are labeled `not_equivalent` and `equivalent`. Labels are stored as integers in the dataset. When you retrieve the labels, [`ClassLabel.int2str`] and [`ClassLabel.str2int`] carries out the conversion from integer value to label name, and vice versa.
3434

35-
If your data type contains a list of objects, then you want to use the [`Sequence`] feature. Remember the SQuAD dataset?
35+
If your data type contains a list of objects, then you want to use the [`List`] feature. Remember the SQuAD dataset?
3636

3737
```py
3838
>>> from datasets import load_dataset
3939
>>> dataset = load_dataset('rajpurkar/squad', split='train')
4040
>>> dataset.features
41-
{'answers': Sequence(feature={'text': Value(dtype='string'), 'answer_start': Value(dtype='int32')}, length=-1),
42-
'context': Value(dtype='string'),
43-
'id': Value(dtype='string'),
44-
'question': Value(dtype='string'),
45-
'title': Value(dtype='string')}
41+
{'id': Value(dtype='string'),
42+
'title': Value(dtype='string'),
43+
'context': Value(dtype='string'),
44+
'question': Value(dtype='string'),
45+
'answers': {'text': List(feature=Value(dtype='string'), length=-1),
46+
'answer_start': List(feature=Value(dtype='int32'), length=-1)}}
4647
```
4748

48-
The `answers` field is constructed using the [`Sequence`] feature because it contains two subfields, `text` and `answer_start`, which are lists of `string` and `int32`, respectively.
49+
The `answers` field is constructed using a dict of features because it contains two subfields, `text` and `answer_start`, which are lists of `string` and `int32`, respectively.
4950

5051
<Tip>
5152

docs/source/package_reference/main_classes.mdx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,8 @@ Dictionary with split names as keys ('train', 'test' for example), and `Iterable
235235

236236
[[autodoc]] datasets.LargeList
237237

238+
[[autodoc]] datasets.List
239+
238240
[[autodoc]] datasets.Sequence
239241

240242
### Translation

docs/source/process.mdx

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -265,11 +265,12 @@ Sometimes a column can be a nested structure of several types. Take a look at th
265265
>>> from datasets import load_dataset
266266
>>> dataset = load_dataset("rajpurkar/squad", split="train")
267267
>>> dataset.features
268-
{'answers': Sequence(feature={'text': Value(dtype='string'), 'answer_start': Value(dtype='int32')}, length=-1),
269-
'context': Value(dtype='string'),
270-
'id': Value(dtype='string'),
271-
'question': Value(dtype='string'),
272-
'title': Value(dtype='string')}
268+
{'id': Value(dtype='string'),
269+
'title': Value(dtype='string'),
270+
'context': Value(dtype='string'),
271+
'question': Value(dtype='string'),
272+
'answers': {'text': List(feature=Value(dtype='string'), length=-1),
273+
'answer_start': List(feature=Value(dtype='int32'), length=-1)}}
273274
```
274275

275276
The `answers` field contains two subfields: `text` and `answer_start`. Use the [`~Dataset.flatten`] function to extract the subfields into their own separate columns:

src/datasets/arrow_dataset.py

Lines changed: 34 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,12 @@
7676
from .arrow_writer import ArrowWriter, OptimizedTypedSequence
7777
from .data_files import sanitize_patterns
7878
from .download.streaming_download_manager import xgetsize
79-
from .features import Audio, ClassLabel, Features, Image, Sequence, Value, Video
79+
from .features import Audio, ClassLabel, Features, Image, List, Value, Video
8080
from .features.features import (
8181
FeatureType,
8282
_align_features,
8383
_check_if_features_can_be_aligned,
84+
_fix_for_backward_compatible_features,
8485
generate_from_arrow_type,
8586
pandas_types_mapper,
8687
require_decoding,
@@ -897,6 +898,8 @@ def from_pandas(
897898
f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}"
898899
)
899900
features = features if features is not None else info.features if info is not None else None
901+
if features is not None:
902+
features = _fix_for_backward_compatible_features(features)
900903
if info is None:
901904
info = DatasetInfo()
902905
info.features = features
@@ -942,6 +945,8 @@ def from_polars(
942945
f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}"
943946
)
944947
features = features if features is not None else info.features if info is not None else None
948+
if features is not None:
949+
features = _fix_for_backward_compatible_features(features)
945950
if info is None:
946951
info = DatasetInfo()
947952
info.features = features
@@ -987,6 +992,8 @@ def from_dict(
987992
f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}"
988993
)
989994
features = features if features is not None else info.features if info is not None else None
995+
if features is not None:
996+
features = _fix_for_backward_compatible_features(features)
990997
arrow_typed_mapping = {}
991998
for col, data in mapping.items():
992999
if isinstance(data, (pa.Array, pa.ChunkedArray)):
@@ -1950,14 +1957,14 @@ def class_encode_column(self, column: str, include_nulls: bool = False) -> "Data
19501957
>>> from datasets import load_dataset
19511958
>>> ds = load_dataset("boolq", split="validation")
19521959
>>> ds.features
1953-
{'answer': Value(dtype='bool', id=None),
1954-
'passage': Value(dtype='string', id=None),
1955-
'question': Value(dtype='string', id=None)}
1960+
{'answer': Value(dtype='bool'),
1961+
'passage': Value(dtype='string'),
1962+
'question': Value(dtype='string')}
19561963
>>> ds = ds.class_encode_column('answer')
19571964
>>> ds.features
1958-
{'answer': ClassLabel(num_classes=2, names=['False', 'True'], id=None),
1959-
'passage': Value(dtype='string', id=None),
1960-
'question': Value(dtype='string', id=None)}
1965+
{'answer': ClassLabel(num_classes=2, names=['False', 'True']),
1966+
'passage': Value(dtype='string'),
1967+
'question': Value(dtype='string')}
19611968
```
19621969
"""
19631970
# Sanity checks
@@ -2028,11 +2035,12 @@ def flatten(self, new_fingerprint: Optional[str] = None, max_depth=16) -> "Datas
20282035
>>> from datasets import load_dataset
20292036
>>> ds = load_dataset("rajpurkar/squad", split="train")
20302037
>>> ds.features
2031-
{'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None),
2032-
'context': Value(dtype='string', id=None),
2033-
'id': Value(dtype='string', id=None),
2034-
'question': Value(dtype='string', id=None),
2035-
'title': Value(dtype='string', id=None)}
2038+
{'id': Value(dtype='string'),
2039+
'title': Value(dtype='string'),
2040+
'context': Value(dtype='string'),
2041+
'question': Value(dtype='string'),
2042+
'answers': {'text': List(feature=Value(dtype='string'), length=-1),
2043+
'answer_start': List(feature=Value(dtype='int32'), length=-1)}}
20362044
>>> ds.flatten()
20372045
Dataset({
20382046
features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'],
@@ -2100,15 +2108,15 @@ def cast(
21002108
>>> from datasets import load_dataset, ClassLabel, Value
21012109
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
21022110
>>> ds.features
2103-
{'label': ClassLabel(names=['neg', 'pos'], id=None),
2104-
'text': Value(dtype='string', id=None)}
2111+
{'label': ClassLabel(names=['neg', 'pos']),
2112+
'text': Value(dtype='string')}
21052113
>>> new_features = ds.features.copy()
21062114
>>> new_features['label'] = ClassLabel(names=['bad', 'good'])
21072115
>>> new_features['text'] = Value('large_string')
21082116
>>> ds = ds.cast(new_features)
21092117
>>> ds.features
2110-
{'label': ClassLabel(names=['bad', 'good'], id=None),
2111-
'text': Value(dtype='large_string', id=None)}
2118+
{'label': ClassLabel(names=['bad', 'good']),
2119+
'text': Value(dtype='large_string')}
21122120
```
21132121
"""
21142122
if sorted(features) != sorted(self._data.column_names):
@@ -2117,6 +2125,7 @@ def cast(
21172125
f"as the columns in the dataset: {self._data.column_names}"
21182126
)
21192127

2128+
features = _fix_for_backward_compatible_features(features)
21202129
schema = features.arrow_schema
21212130
format = self.format
21222131
dataset = self.with_format("arrow")
@@ -2158,14 +2167,15 @@ def cast_column(self, column: str, feature: FeatureType, new_fingerprint: Option
21582167
>>> from datasets import load_dataset, ClassLabel
21592168
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
21602169
>>> ds.features
2161-
{'label': ClassLabel(names=['neg', 'pos'], id=None),
2162-
'text': Value(dtype='string', id=None)}
2170+
{'label': ClassLabel(names=['neg', 'pos']),
2171+
'text': Value(dtype='string')}
21632172
>>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))
21642173
>>> ds.features
2165-
{'label': ClassLabel(names=['bad', 'good'], id=None),
2166-
'text': Value(dtype='string', id=None)}
2174+
{'label': ClassLabel(names=['bad', 'good']),
2175+
'text': Value(dtype='string')}
21672176
```
21682177
"""
2178+
feature = _fix_for_backward_compatible_features(feature)
21692179
if hasattr(feature, "decode_example"):
21702180
dataset = copy.deepcopy(self)
21712181
dataset._info.features[column] = feature
@@ -3082,6 +3092,9 @@ def map(
30823092
if fn_kwargs is None:
30833093
fn_kwargs = {}
30843094

3095+
if features is not None:
3096+
features = _fix_for_backward_compatible_features(features)
3097+
30853098
if num_proc is not None and num_proc > len(self):
30863099
num_proc = len(self)
30873100
logger.warning(
@@ -6350,7 +6363,7 @@ def process_label_ids(batch):
63506363
features[label_column] = (
63516364
ClassLabel(num_classes=len(label_names), names=label_names)
63526365
if isinstance(label_feature, ClassLabel)
6353-
else Sequence(ClassLabel(num_classes=len(label_names), names=label_names))
6366+
else List(ClassLabel(num_classes=len(label_names), names=label_names))
63546367
)
63556368
return self.map(process_label_ids, features=features, batched=True, desc="Aligning the labels")
63566369

src/datasets/builder.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -513,7 +513,7 @@ def get_all_exported_dataset_infos(cls) -> DatasetInfosDict:
513513
>>> from datasets import load_dataset_builder
514514
>>> ds_builder = load_dataset_builder('vivos')
515515
>>> ds_builder.get_all_exported_dataset_infos()
516-
{'default': DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string', id=None), 'path': Value(dtype='string', id=None), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)}
516+
{'default': DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string'), 'path': Value(dtype='string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)}
517517
```
518518
"""
519519
return DatasetInfosDict.from_directory(cls.get_imported_module_dir())
@@ -527,7 +527,7 @@ def get_exported_dataset_info(self) -> DatasetInfo:
527527
>>> from datasets import load_dataset_builder
528528
>>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')
529529
>>> ds_builder.get_exported_dataset_info()
530-
DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string', id=None), 'path': Value(dtype='string', id=None), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)
530+
DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string'), 'path': Value(dtype='string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)
531531
```
532532
"""
533533
return self.get_all_exported_dataset_infos().get(self.config.name, DatasetInfo())

src/datasets/dataset_dict.py

Lines changed: 28 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -201,11 +201,12 @@ def flatten(self, max_depth=16) -> "DatasetDict":
201201
>>> from datasets import load_dataset
202202
>>> ds = load_dataset("rajpurkar/squad")
203203
>>> ds["train"].features
204-
{'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None),
205-
'context': Value(dtype='string', id=None),
206-
'id': Value(dtype='string', id=None),
207-
'question': Value(dtype='string', id=None),
208-
'title': Value(dtype='string', id=None)}
204+
{'id': Value(dtype='string'),
205+
'title': Value(dtype='string'),
206+
'context': Value(dtype='string'),
207+
'question': Value(dtype='string'),
208+
'answers.text': List(feature=Value(dtype='string'), length=-1),
209+
'answers.answer_start': List(feature=Value(dtype='int32'), length=-1)}
209210
>>> ds.flatten()
210211
DatasetDict({
211212
train: Dataset({
@@ -288,15 +289,15 @@ def cast(self, features: Features) -> "DatasetDict":
288289
>>> from datasets import load_dataset, ClassLabel, Value
289290
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes")
290291
>>> ds["train"].features
291-
{'label': ClassLabel(names=['neg', 'pos'], id=None),
292-
'text': Value(dtype='string', id=None)}
292+
{'label': ClassLabel(names=['neg', 'pos']),
293+
'text': Value(dtype='string')}
293294
>>> new_features = ds["train"].features.copy()
294295
>>> new_features['label'] = ClassLabel(names=['bad', 'good'])
295296
>>> new_features['text'] = Value('large_string')
296297
>>> ds = ds.cast(new_features)
297298
>>> ds["train"].features
298-
{'label': ClassLabel(names=['bad', 'good'], id=None),
299-
'text': Value(dtype='large_string', id=None)}
299+
{'label': ClassLabel(names=['bad', 'good']),
300+
'text': Value(dtype='large_string')}
300301
```
301302
"""
302303
self._check_values_type()
@@ -320,12 +321,12 @@ def cast_column(self, column: str, feature) -> "DatasetDict":
320321
>>> from datasets import load_dataset, ClassLabel
321322
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes")
322323
>>> ds["train"].features
323-
{'label': ClassLabel(names=['neg', 'pos'], id=None),
324-
'text': Value(dtype='string', id=None)}
324+
{'label': ClassLabel(names=['neg', 'pos']),
325+
'text': Value(dtype='string')}
325326
>>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))
326327
>>> ds["train"].features
327-
{'label': ClassLabel(names=['bad', 'good'], id=None),
328-
'text': Value(dtype='string', id=None)}
328+
{'label': ClassLabel(names=['bad', 'good']),
329+
'text': Value(dtype='string')}
329330
```
330331
"""
331332
self._check_values_type()
@@ -512,14 +513,14 @@ def class_encode_column(self, column: str, include_nulls: bool = False) -> "Data
512513
>>> from datasets import load_dataset
513514
>>> ds = load_dataset("boolq")
514515
>>> ds["train"].features
515-
{'answer': Value(dtype='bool', id=None),
516-
'passage': Value(dtype='string', id=None),
517-
'question': Value(dtype='string', id=None)}
516+
{'answer': Value(dtype='bool'),
517+
'passage': Value(dtype='string'),
518+
'question': Value(dtype='string')}
518519
>>> ds = ds.class_encode_column("answer")
519520
>>> ds["train"].features
520-
{'answer': ClassLabel(num_classes=2, names=['False', 'True'], id=None),
521-
'passage': Value(dtype='string', id=None),
522-
'question': Value(dtype='string', id=None)}
521+
{'answer': ClassLabel(num_classes=2, names=['False', 'True']),
522+
'passage': Value(dtype='string'),
523+
'question': Value(dtype='string')}
523524
```
524525
"""
525526
self._check_values_type()
@@ -2379,12 +2380,12 @@ def cast_column(self, column: str, feature: FeatureType) -> "IterableDatasetDict
23792380
>>> from datasets import load_dataset, ClassLabel
23802381
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True)
23812382
>>> ds["train"].features
2382-
{'label': ClassLabel(names=['neg', 'pos'], id=None),
2383-
'text': Value(dtype='string', id=None)}
2383+
{'label': ClassLabel(names=['neg', 'pos']),
2384+
'text': Value(dtype='string')}
23842385
>>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))
23852386
>>> ds["train"].features
2386-
{'label': ClassLabel(names=['bad', 'good'], id=None),
2387-
'text': Value(dtype='string', id=None)}
2387+
{'label': ClassLabel(names=['bad', 'good']),
2388+
'text': Value(dtype='string')}
23882389
```
23892390
"""
23902391
return IterableDatasetDict(
@@ -2415,15 +2416,15 @@ def cast(
24152416
>>> from datasets import load_dataset
24162417
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True)
24172418
>>> ds["train"].features
2418-
{'label': ClassLabel(names=['neg', 'pos'], id=None),
2419-
'text': Value(dtype='string', id=None)}
2419+
{'label': ClassLabel(names=['neg', 'pos']),
2420+
'text': Value(dtype='string')}
24202421
>>> new_features = ds["train"].features.copy()
24212422
>>> new_features['label'] = ClassLabel(names=['bad', 'good'])
24222423
>>> new_features['text'] = Value('large_string')
24232424
>>> ds = ds.cast(new_features)
24242425
>>> ds["train"].features
2425-
{'label': ClassLabel(names=['bad', 'good'], id=None),
2426-
'text': Value(dtype='large_string', id=None)}
2426+
{'label': ClassLabel(names=['bad', 'good']),
2427+
'text': Value(dtype='large_string')}
24272428
```
24282429
"""
24292430
return IterableDatasetDict({k: dataset.cast(features=features) for k, dataset in self.items()})

src/datasets/features/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"ClassLabel",
88
"Features",
99
"LargeList",
10+
"List",
1011
"Sequence",
1112
"Value",
1213
"Image",
@@ -16,7 +17,7 @@
1617
"Pdf",
1718
]
1819
from .audio import Audio
19-
from .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, LargeList, Sequence, Value
20+
from .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, LargeList, List, Sequence, Value
2021
from .image import Image
2122
from .pdf import Pdf
2223
from .translation import Translation, TranslationVariableLanguages

0 commit comments

Comments
 (0)