|
| 1 | +# coding=utf-8 |
| 2 | +# Copyright 2024 The TensorFlow Datasets Authors. |
| 3 | +# |
| 4 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | +# you may not use this file except in compliance with the License. |
| 6 | +# You may obtain a copy of the License at |
| 7 | +# |
| 8 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | +# |
| 10 | +# Unless required by applicable law or agreed to in writing, software |
| 11 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | +# See the License for the specific language governing permissions and |
| 14 | +# limitations under the License. |
| 15 | + |
| 16 | +"""Utility functions to convert from other formats to TFDS conventions.""" |
| 17 | + |
| 18 | +from collections.abc import Mapping, Sequence |
| 19 | +import datetime |
| 20 | +from typing import Any |
| 21 | + |
| 22 | +from etils import epath |
| 23 | +import numpy as np |
| 24 | +from tensorflow_datasets.core import features as feature_lib |
| 25 | +from tensorflow_datasets.core import lazy_imports_lib |
| 26 | +from tensorflow_datasets.core.utils import dtype_utils |
| 27 | +from tensorflow_datasets.core.utils import py_utils |
| 28 | + |
# PNG-encoded bytes of a 1x1 pixel, black image. Used as the default value
# when an image is missing (None).
_DEFAULT_IMG = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x02\x00\x00\x00\x90wS\xde\x00\x00\x00\x0cIDATx\x9cc```\x00\x00\x00\x04\x00\x01\xf6\x178U\x00\x00\x00\x00IEND\xaeB`\x82'
| 30 | + |
| 31 | + |
def to_tfds_name(name: str) -> str:
  """Turns an arbitrary name into one that TFDS accepts.

  Names coming from Huggingface may use characters that TFDS does not support.
  A Huggingface dataset can, for instance, be called `a/b`, whereas TFDS would
  interpret the `b` part as the config.

  Examples:
    - `name='codeparrot/github-code'` becomes `codeparrot__github_code`.

  Args:
    name: The name to convert.

  Returns:
    A name that follows the TFDS conventions (applies to dataset names, config
    names and split names).
  """
  lowered = name.lower()
  # The slash is replaced first so that it ends up as a double underscore.
  without_slashes = lowered.replace('/', '__')
  return py_utils.make_valid_name(without_slashes)
| 51 | + |
| 52 | + |
def _get_default_value(
    feature: feature_lib.FeatureConnector,
) -> Mapping[str, Any] | Sequence[Any] | bytes | int | float | bool:
  """Builds the placeholder used when the value of `feature` is missing.

  Formats other than TFDS can be loose about typing; HuggingFace, for example,
  accepts None values. As long as `tfds.features.Optional` does not exist, a
  missing value is replaced by a constant that depends on the feature:

  * `FeaturesDict`: a dict holding the default value of each sub-feature.
  * `Sequence`: an empty list, or a dict of empty lists when the sequence wraps
    a `FeaturesDict`.
  * `Image`: the bytes of a PNG of 1px, black.
  * string dtype: `b''`.
  * integer dtype: `np.iinfo(dtype).min`.
  * floating dtype: `np.finfo(dtype).min`.
  * bool dtype: `False`.

  For numbers, 0 or -1 are deliberately not used because they can be genuine
  values of the dataset. To detect a default value, compare against e.g.:

  ```
  np.iinfo(np.int32).min  # for integers
  np.finfo(np.float32).min  # for floats
  ```

  Args:
    feature: The TFDS feature whose default value is requested.

  Returns:
    The default value for the feature.

  Raises:
    TypeError: If the dtype of the feature is not recognized.
  """
  if isinstance(feature, feature_lib.FeaturesDict):
    return {key: _get_default_value(sub) for key, sub in feature.items()}

  if isinstance(feature, feature_lib.Sequence):
    if isinstance(feature.feature, feature_lib.FeaturesDict):
      # A sequence of dicts is represented as a dict of (here empty) lists.
      return {key: [] for key in feature.feature.keys()}
    return []

  if isinstance(feature, feature_lib.Image):
    # Empty PNG image of 1x1 pixel, black.
    return _DEFAULT_IMG

  dtype = feature.np_dtype
  if dtype_utils.is_string(dtype):
    return b''
  if dtype_utils.is_integer(dtype):
    return np.iinfo(dtype).min
  if dtype_utils.is_floating(dtype):
    return np.finfo(dtype).min
  if dtype_utils.is_bool(dtype):
    return False
  raise TypeError(f'Could not recognize the dtype of {feature}')
| 107 | + |
| 108 | + |
| 109 | +def to_tfds_value(value: Any, feature: feature_lib.FeatureConnector) -> Any: |
| 110 | + """Converts a value to a TFDS compatible value. |
| 111 | +
|
| 112 | + Args: |
| 113 | + value: The value to be converted to follow TFDS conventions. |
| 114 | + feature: The TFDS feature for which we want the compatible value. |
| 115 | +
|
| 116 | + Returns: |
| 117 | + The TFDS compatible value. |
| 118 | +
|
| 119 | + Raises: |
| 120 | + TypeError: If couldn't recognize the given feature type. |
| 121 | + """ |
| 122 | + match value: |
| 123 | + case None: |
| 124 | + return _get_default_value(feature) |
| 125 | + case datetime.datetime(): |
| 126 | + return int(value.timestamp()) |
| 127 | + |
| 128 | + match feature: |
| 129 | + case feature_lib.ClassLabel() | feature_lib.Scalar(): |
| 130 | + return value |
| 131 | + case feature_lib.FeaturesDict(): |
| 132 | + return { |
| 133 | + name: to_tfds_value(value.get(name), inner_feature) |
| 134 | + for name, inner_feature in feature.items() |
| 135 | + } |
| 136 | + case feature_lib.Sequence(): |
| 137 | + match value: |
| 138 | + case dict(): |
| 139 | + # Should be a dict of lists: |
| 140 | + return { |
| 141 | + name: [ |
| 142 | + to_tfds_value(inner_hf_value, inner_feature) |
| 143 | + for inner_hf_value in value.get(name) |
| 144 | + ] |
| 145 | + for name, inner_feature in feature.feature.items() |
| 146 | + } |
| 147 | + case list(): |
| 148 | + return [ |
| 149 | + to_tfds_value(inner_hf_value, feature.feature) |
| 150 | + for inner_hf_value in value |
| 151 | + ] |
| 152 | + case _: |
| 153 | + return [value] |
| 154 | + case feature_lib.Audio(): |
| 155 | + if array := value.get('array'): |
| 156 | + # Hugging Face uses floats, TFDS uses integers. |
| 157 | + return [int(sample * feature.sample_rate) for sample in array] |
| 158 | + elif (path := value.get('path')) and (path := epath.Path(path)).exists(): |
| 159 | + return path |
| 160 | + case feature_lib.Image(): |
| 161 | + value: lazy_imports_lib.lazy_imports.PIL_Image.Image |
| 162 | + # Ensure RGB format for PNG encoding. |
| 163 | + return value.convert('RGB') |
| 164 | + case feature_lib.Tensor(): |
| 165 | + if isinstance(value, float): |
| 166 | + # In some cases, for example when loading jsonline files using pandas, |
| 167 | + # empty non-float values, such as strings, are converted to float nan. |
| 168 | + # We spot those occurrences as the feature.np_dtype is not float. |
| 169 | + if np.isnan(value) and not dtype_utils.is_floating(feature.np_dtype): |
| 170 | + return _get_default_value(feature) |
| 171 | + return value |
| 172 | + |
| 173 | + raise TypeError( |
| 174 | + f'Conversion of value {value} to feature {feature} is not supported.' |
| 175 | + ) |
0 commit comments