Skip to content

Commit 77d00eb

Browse files
author
The TensorFlow Datasets Authors
committed
Add arguments for multilingual datasets support in the CroissantBuilder.
PiperOrigin-RevId: 800453696
1 parent 4627fce commit 77d00eb

File tree

1 file changed

+17
-3
lines changed

1 file changed

+17
-3
lines changed

tensorflow_datasets/core/dataset_builders/croissant_builder.py

Lines changed: 17 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -85,6 +85,7 @@ def array_datatype_converter(
8585
feature: type_utils.TfdsDType | feature_lib.FeatureConnector | None,
8686
field: mlc.Field,
8787
dtype_mapping: Mapping[type_utils.TfdsDType, type_utils.TfdsDType],
88+
language: str | None = None,
8889
):
8990
"""Includes the given feature in a sequence or tensor feature.
9091
@@ -97,6 +98,10 @@ def array_datatype_converter(
9798
field: The mlc.Field object.
9899
dtype_mapping: A mapping of dtypes to the corresponding dtypes that will be
99100
used in TFDS.
101+
language: For Croissant JSON-LD files which include multi-lingual descriptions, the
102+
language code to use to extract the description to be used in TFDS. If
103+
None, it will extract the description in English or the first available
104+
language in the dictionary.
100105
101106
Returns:
102107
A sequence or tensor feature including the inner feature.
@@ -108,7 +113,7 @@ def array_datatype_converter(
108113
field_dtype = field.data_type
109114

110115
description = croissant_utils.extract_localized_string(
111-
field.description, field_name='description'
116+
field.description, language=language, field_name='description'
112117
)
113118

114119
if len(field.array_shape_tuple) == 1:
@@ -129,6 +134,7 @@ def datatype_converter(
129134
field: mlc.Field,
130135
int_dtype: type_utils.TfdsDType = np.int64,
131136
float_dtype: type_utils.TfdsDType = np.float32,
137+
language: str | None = None,
132138
):
133139
"""Converts a Croissant field to a TFDS-compatible feature.
134140
@@ -137,6 +143,10 @@ def datatype_converter(
137143
int_dtype: The dtype to use for TFDS integer features. Defaults to np.int64.
138144
float_dtype: The dtype to use for TFDS float features. Defaults to
139145
np.float32.
146+
language: For Croissant JSON-LD files which include multi-lingual descriptions, the
147+
language code to use to extract the description to be used in TFDS. If
148+
None, it will extract the description in English or the first available
149+
language in the dictionary.
140150
141151
Returns:
142152
Converted datatype for TFDS, or None when a Field does not specify a type.
@@ -156,7 +166,7 @@ def datatype_converter(
156166

157167
field_data_type = field.data_type
158168
description = croissant_utils.extract_localized_string(
159-
field.description, field_name='description'
169+
field.description, language=language, field_name='description'
160170
)
161171

162172
if not field_data_type:
@@ -165,7 +175,10 @@ def datatype_converter(
165175
feature = features_dict.FeaturesDict(
166176
{
167177
subfield.id: datatype_converter(
168-
subfield, int_dtype=int_dtype, float_dtype=float_dtype
178+
subfield,
179+
int_dtype=int_dtype,
180+
float_dtype=float_dtype,
181+
language=language,
169182
)
170183
for subfield in field.sub_fields
171184
},
@@ -215,6 +228,7 @@ def datatype_converter(
215228
feature=feature,
216229
field=field,
217230
dtype_mapping=dtype_mapping,
231+
language=language,
218232
)
219233
# If the field is repeated, we return a sequence feature. `field.repeated` is
220234
# deprecated starting from Croissant 1.1, but we still support it for

0 commit comments

Comments
 (0)