@@ -76,11 +76,12 @@
 from .arrow_writer import ArrowWriter, OptimizedTypedSequence
 from .data_files import sanitize_patterns
 from .download.streaming_download_manager import xgetsize
-from .features import Audio, ClassLabel, Features, Image, Sequence, Value, Video
+from .features import Audio, ClassLabel, Features, Image, List, Value, Video
 from .features.features import (
     FeatureType,
     _align_features,
     _check_if_features_can_be_aligned,
+    _fix_for_backward_compatible_features,
     generate_from_arrow_type,
     pandas_types_mapper,
     require_decoding,
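For reviewers less familiar with the rename, here is a minimal sketch of the old versus new spelling of the same sequence feature. The reprs are taken from the docstring updates further down in this diff; exact constructor signatures may differ between `datasets` releases, and the newly imported `_fix_for_backward_compatible_features` helper is what keeps the old spelling accepted by the methods patched below:

```python
# Sketch only, assuming a `datasets` release that exposes the new `List` feature type.
from datasets import Features, List, Sequence, Value

# Legacy spelling: a Sequence wrapping a dict of sub-features.
legacy = Features({"answers": Sequence({"text": Value("string"), "answer_start": Value("int32")})})

# New spelling, matching the updated docstrings in this diff: a plain dict of List features.
new = Features({"answers": {"text": List(Value("string")), "answer_start": List(Value("int32"))}})
```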
@@ -897,6 +898,8 @@ def from_pandas(
897898 f"Features specified in `features` and `info.features` can't be different:\n { features } \n { info .features } "
898899 )
899900 features = features if features is not None else info .features if info is not None else None
901+ if features is not None :
902+ features = _fix_for_backward_compatible_features (features )
900903 if info is None :
901904 info = DatasetInfo ()
902905 info .features = features
@@ -942,6 +945,8 @@ def from_polars(
942945 f"Features specified in `features` and `info.features` can't be different:\n { features } \n { info .features } "
943946 )
944947 features = features if features is not None else info .features if info is not None else None
948+ if features is not None :
949+ features = _fix_for_backward_compatible_features (features )
945950 if info is None :
946951 info = DatasetInfo ()
947952 info .features = features
@@ -987,6 +992,8 @@ def from_dict(
987992 f"Features specified in `features` and `info.features` can't be different:\n { features } \n { info .features } "
988993 )
989994 features = features if features is not None else info .features if info is not None else None
995+ if features is not None :
996+ features = _fix_for_backward_compatible_features (features )
990997 arrow_typed_mapping = {}
991998 for col , data in mapping .items ():
992999 if isinstance (data , (pa .Array , pa .ChunkedArray )):
@@ -1950,14 +1957,14 @@ def class_encode_column(self, column: str, include_nulls: bool = False) -> "Data
         >>> from datasets import load_dataset
         >>> ds = load_dataset("boolq", split="validation")
         >>> ds.features
-        {'answer': Value(dtype='bool', id=None),
-         'passage': Value(dtype='string', id=None),
-         'question': Value(dtype='string', id=None)}
+        {'answer': Value(dtype='bool'),
+         'passage': Value(dtype='string'),
+         'question': Value(dtype='string')}
         >>> ds = ds.class_encode_column('answer')
         >>> ds.features
-        {'answer': ClassLabel(num_classes=2, names=['False', 'True'], id=None),
-         'passage': Value(dtype='string', id=None),
-         'question': Value(dtype='string', id=None)}
+        {'answer': ClassLabel(num_classes=2, names=['False', 'True']),
+         'passage': Value(dtype='string'),
+         'question': Value(dtype='string')}
         ```
         """
         # Sanity checks
@@ -2028,11 +2035,12 @@ def flatten(self, new_fingerprint: Optional[str] = None, max_depth=16) -> "Datas
         >>> from datasets import load_dataset
         >>> ds = load_dataset("rajpurkar/squad", split="train")
         >>> ds.features
-        {'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None),
-         'context': Value(dtype='string', id=None),
-         'id': Value(dtype='string', id=None),
-         'question': Value(dtype='string', id=None),
-         'title': Value(dtype='string', id=None)}
+        {'id': Value(dtype='string'),
+         'title': Value(dtype='string'),
+         'context': Value(dtype='string'),
+         'question': Value(dtype='string'),
+         'answers': {'text': List(feature=Value(dtype='string'), length=-1),
+          'answer_start': List(feature=Value(dtype='int32'), length=-1)}}
         >>> ds.flatten()
         Dataset({
             features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'],
@@ -2100,15 +2108,15 @@ def cast(
         >>> from datasets import load_dataset, ClassLabel, Value
         >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
         >>> ds.features
-        {'label': ClassLabel(names=['neg', 'pos'], id=None),
-         'text': Value(dtype='string', id=None)}
+        {'label': ClassLabel(names=['neg', 'pos']),
+         'text': Value(dtype='string')}
         >>> new_features = ds.features.copy()
         >>> new_features['label'] = ClassLabel(names=['bad', 'good'])
         >>> new_features['text'] = Value('large_string')
         >>> ds = ds.cast(new_features)
         >>> ds.features
-        {'label': ClassLabel(names=['bad', 'good'], id=None),
-         'text': Value(dtype='large_string', id=None)}
+        {'label': ClassLabel(names=['bad', 'good']),
+         'text': Value(dtype='large_string')}
         ```
         """
         if sorted(features) != sorted(self._data.column_names):
@@ -2117,6 +2125,7 @@ def cast(
21172125 f"as the columns in the dataset: { self ._data .column_names } "
21182126 )
21192127
2128+ features = _fix_for_backward_compatible_features (features )
21202129 schema = features .arrow_schema
21212130 format = self .format
21222131 dataset = self .with_format ("arrow" )
@@ -2158,14 +2167,15 @@ def cast_column(self, column: str, feature: FeatureType, new_fingerprint: Option
         >>> from datasets import load_dataset, ClassLabel
         >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
         >>> ds.features
-        {'label': ClassLabel(names=['neg', 'pos'], id=None),
-         'text': Value(dtype='string', id=None)}
+        {'label': ClassLabel(names=['neg', 'pos']),
+         'text': Value(dtype='string')}
         >>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))
         >>> ds.features
-        {'label': ClassLabel(names=['bad', 'good'], id=None),
-         'text': Value(dtype='string', id=None)}
+        {'label': ClassLabel(names=['bad', 'good']),
+         'text': Value(dtype='string')}
         ```
         """
+        feature = _fix_for_backward_compatible_features(feature)
         if hasattr(feature, "decode_example"):
             dataset = copy.deepcopy(self)
             dataset._info.features[column] = feature
@@ -3082,6 +3092,9 @@ def map(
         if fn_kwargs is None:
             fn_kwargs = {}

+        if features is not None:
+            features = _fix_for_backward_compatible_features(features)
+
         if num_proc is not None and num_proc > len(self):
             num_proc = len(self)
             logger.warning(
@@ -6350,7 +6363,7 @@ def process_label_ids(batch):
         features[label_column] = (
             ClassLabel(num_classes=len(label_names), names=label_names)
             if isinstance(label_feature, ClassLabel)
-            else Sequence(ClassLabel(num_classes=len(label_names), names=label_names))
+            else List(ClassLabel(num_classes=len(label_names), names=label_names))
         )
         return self.map(process_label_ids, features=features, batched=True, desc="Aligning the labels")

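Taken together, these call sites route user-supplied features through the backward-compatibility shim before use. A rough usage sketch of the intended effect (hypothetical example, not a test from this PR; the printed repr is an assumption and may differ):

```python
# Sketch: legacy Sequence-based features should still be accepted after this change
# and be normalized to the new List-based representation internally.
from datasets import Dataset, Features, Sequence, Value

ds = Dataset.from_dict({"tokens": [["a", "b"], ["c"]]})
# cast() now applies _fix_for_backward_compatible_features to the provided Features
ds = ds.cast(Features({"tokens": Sequence(Value("string"))}))
print(ds.features)  # expected to show a List feature rather than a Sequence
```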