@@ -85,6 +85,7 @@ def array_datatype_converter(
85
85
feature : type_utils .TfdsDType | feature_lib .FeatureConnector | None ,
86
86
field : mlc .Field ,
87
87
dtype_mapping : Mapping [type_utils .TfdsDType , type_utils .TfdsDType ],
88
+ language : str | None = None ,
88
89
):
89
90
"""Includes the given feature in a sequence or tensor feature.
90
91
@@ -97,6 +98,10 @@ def array_datatype_converter(
97
98
field: The mlc.Field object.
98
99
dtype_mapping: A mapping of dtypes to the corresponding dtypes that will be
99
100
used in TFDS.
101
+ language: For Croissant JSON-LD that includes multi-lingual descriptions, the
102
+ language code to use to extract the description to be used in TFDS. If
103
+ None, it will extract the description in English or the first available
104
+ language in the dictionary.
100
105
101
106
Returns:
102
107
A sequence or tensor feature including the inner feature.
@@ -108,7 +113,7 @@ def array_datatype_converter(
108
113
field_dtype = field .data_type
109
114
110
115
description = croissant_utils .extract_localized_string (
111
- field .description , field_name = 'description'
116
+ field .description , language = language , field_name = 'description'
112
117
)
113
118
114
119
if len (field .array_shape_tuple ) == 1 :
@@ -129,6 +134,7 @@ def datatype_converter(
129
134
field : mlc .Field ,
130
135
int_dtype : type_utils .TfdsDType = np .int64 ,
131
136
float_dtype : type_utils .TfdsDType = np .float32 ,
137
+ language : str | None = None ,
132
138
):
133
139
"""Converts a Croissant field to a TFDS-compatible feature.
134
140
@@ -137,6 +143,10 @@ def datatype_converter(
137
143
int_dtype: The dtype to use for TFDS integer features. Defaults to np.int64.
138
144
float_dtype: The dtype to use for TFDS float features. Defaults to
139
145
np.float32.
146
+ language: For Croissant JSON-LD that includes multi-lingual descriptions, the
147
+ language code to use to extract the description to be used in TFDS. If
148
+ None, it will extract the description in English or the first available
149
+ language in the dictionary.
140
150
141
151
Returns:
142
152
Converted datatype for TFDS, or None when a Field does not specify a type.
@@ -156,7 +166,7 @@ def datatype_converter(
156
166
157
167
field_data_type = field .data_type
158
168
description = croissant_utils .extract_localized_string (
159
- field .description , field_name = 'description'
169
+ field .description , language = language , field_name = 'description'
160
170
)
161
171
162
172
if not field_data_type :
@@ -165,7 +175,10 @@ def datatype_converter(
165
175
feature = features_dict .FeaturesDict (
166
176
{
167
177
subfield .id : datatype_converter (
168
- subfield , int_dtype = int_dtype , float_dtype = float_dtype
178
+ subfield ,
179
+ int_dtype = int_dtype ,
180
+ float_dtype = float_dtype ,
181
+ language = language ,
169
182
)
170
183
for subfield in field .sub_fields
171
184
},
@@ -215,6 +228,7 @@ def datatype_converter(
215
228
feature = feature ,
216
229
field = field ,
217
230
dtype_mapping = dtype_mapping ,
231
+ language = language ,
218
232
)
219
233
# If the field is repeated, we return a sequence feature. `field.repeated` is
220
234
# deprecated starting from Croissant 1.1, but we still support it for
0 commit comments