@@ -85,6 +85,7 @@ def array_datatype_converter(
8585 feature : type_utils .TfdsDType | feature_lib .FeatureConnector | None ,
8686 field : mlc .Field ,
8787 dtype_mapping : Mapping [type_utils .TfdsDType , type_utils .TfdsDType ],
88+ language : str | None = None ,
8889):
8990 """Includes the given feature in a sequence or tensor feature.
9091
@@ -97,6 +98,10 @@ def array_datatype_converter(
9798 field: The mlc.Field object.
9899 dtype_mapping: A mapping of dtypes to the corresponding dtypes that will be
99100 used in TFDS.
101+ language: For Croissant JSON-LD files that include multilingual
102+ descriptions, the language code used to select the description for TFDS.
103+ If None, the description in English or in the first available language
104+ in the dictionary is used.
100105
101106 Returns:
102107 A sequence or tensor feature including the inner feature.
@@ -108,7 +113,7 @@ def array_datatype_converter(
108113 field_dtype = field .data_type
109114
110115 description = croissant_utils .extract_localized_string (
111- field .description , field_name = 'description'
116+ field .description , language = language , field_name = 'description'
112117 )
113118
114119 if len (field .array_shape_tuple ) == 1 :
@@ -129,6 +134,7 @@ def datatype_converter(
129134 field : mlc .Field ,
130135 int_dtype : type_utils .TfdsDType = np .int64 ,
131136 float_dtype : type_utils .TfdsDType = np .float32 ,
137+ language : str | None = None ,
132138):
133139 """Converts a Croissant field to a TFDS-compatible feature.
134140
@@ -137,6 +143,10 @@ def datatype_converter(
137143 int_dtype: The dtype to use for TFDS integer features. Defaults to np.int64.
138144 float_dtype: The dtype to use for TFDS float features. Defaults to
139145 np.float32.
146+ language: For Croissant JSON-LD files that include multilingual
147+ descriptions, the language code used to select the description for TFDS.
148+ If None, the description in English or in the first available language
149+ in the dictionary is used.
140150
141151 Returns:
142152 Converted datatype for TFDS, or None when a Field does not specify a type.
@@ -156,7 +166,7 @@ def datatype_converter(
156166
157167 field_data_type = field .data_type
158168 description = croissant_utils .extract_localized_string (
159- field .description , field_name = 'description'
169+ field .description , language = language , field_name = 'description'
160170 )
161171
162172 if not field_data_type :
@@ -165,7 +175,10 @@ def datatype_converter(
165175 feature = features_dict .FeaturesDict (
166176 {
167177 subfield .id : datatype_converter (
168- subfield , int_dtype = int_dtype , float_dtype = float_dtype
178+ subfield ,
179+ int_dtype = int_dtype ,
180+ float_dtype = float_dtype ,
181+ language = language ,
169182 )
170183 for subfield in field .sub_fields
171184 },
@@ -215,6 +228,7 @@ def datatype_converter(
215228 feature = feature ,
216229 field = field ,
217230 dtype_mapping = dtype_mapping ,
231+ language = language ,
218232 )
219233 # If the field is repeated, we return a sequence feature. `field.repeated` is
220234 # deprecated starting from Croissant 1.1, but we still support it for
0 commit comments