From 98b39c6008f8f1d278b063087df7f9526b86fa2c Mon Sep 17 00:00:00 2001 From: Arjun Dinesh Jagdale <142811259+ArjunJagdale@users.noreply.github.com> Date: Tue, 24 Jun 2025 22:15:06 +0530 Subject: [PATCH 1/2] Add ignore_decode_errors option to Image feature for robust decoding (#7612) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR implements support for robust image decoding in the `Image` feature, as discussed in issue #7612. ## 🔧 What was added - A new boolean field: `ignore_decode_errors` (default: `False`) - If set to `True`, any exceptions during decoding will be caught, and `None` will be returned instead of raising an error ```python features = Features({ "image": Image(decode=True, ignore_decode_errors=True), }) ``` This enables robust iteration over potentially corrupted datasets — especially useful when streaming datasets like WebDataset or image-heavy public sets where sample corruption is common. ## 🧪 Behavior * If `ignore_decode_errors=False` (default), decoding behaves exactly as before * If `True`, decoding errors are caught, and a warning is emitted: ``` [Image.decode_example] Skipped corrupted image: ... ``` ## 🧵 Linked issue Closes #7612 Let me know if you'd like a follow-up test PR. Happy to write one! 
--- src/datasets/features/image.py | 68 ++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 31 deletions(-) diff --git a/src/datasets/features/image.py b/src/datasets/features/image.py index ad2e6bdfaec..6390633f9a6 100644 --- a/src/datasets/features/image.py +++ b/src/datasets/features/image.py @@ -84,6 +84,10 @@ class Image: mode: Optional[str] = None decode: bool = True + + # addition - 1 + ignore_decode_errors: bool = False + id: Optional[str] = field(default=None, repr=False) # Automatically constructed dtype: ClassVar[str] = "PIL.Image.Image" @@ -132,23 +136,10 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict, np.ndarray, " f"An image sample should have one of 'path' or 'bytes' but they are missing or None in {value}." ) - def decode_example(self, value: dict, token_per_repo_id=None) -> "PIL.Image.Image": + def decode_example(self, value: dict, token_per_repo_id=None) -> Optional["PIL.Image.Image"]: """Decode example image file into image data. - Args: - value (`str` or `dict`): - A string with the absolute image file path, a dictionary with - keys: - - - `path`: String with absolute or relative image file path. - - `bytes`: The bytes of the image file. - token_per_repo_id (`dict`, *optional*): - To access and decode - image files from private repositories on the Hub, you can pass - a dictionary repo_id (`str`) -> token (`bool` or `str`). - - Returns: - `PIL.Image.Image` + Returns None if `ignore_decode_errors=True` and decoding fails. """ if not self.decode: raise RuntimeError("Decoding is disabled for this feature. 
Please use Image(decode=True) instead.") @@ -159,14 +150,15 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "PIL.Image.Imag else: raise ImportError("To support decoding images, please install 'Pillow'.") - if token_per_repo_id is None: - token_per_repo_id = {} + try: + if token_per_repo_id is None: + token_per_repo_id = {} - path, bytes_ = value["path"], value["bytes"] - if bytes_ is None: - if path is None: - raise ValueError(f"An image should have one of 'path' or 'bytes' but both are None in {value}.") - else: + path, bytes_ = value["path"], value["bytes"] + + if bytes_ is None: + if path is None: + raise ValueError(f"An image should have one of 'path' or 'bytes' but both are None in {value}.") if is_local_path(path): image = PIL.Image.open(path) else: @@ -178,20 +170,34 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "PIL.Image.Imag ) source_url_fields = string_to_dict(source_url, pattern) token = ( - token_per_repo_id.get(source_url_fields["repo_id"]) if source_url_fields is not None else None + token_per_repo_id.get(source_url_fields["repo_id"]) + if source_url_fields is not None else None ) download_config = DownloadConfig(token=token) with xopen(path, "rb", download_config=download_config) as f: bytes_ = BytesIO(f.read()) image = PIL.Image.open(bytes_) - else: - image = PIL.Image.open(BytesIO(bytes_)) - image.load() # to avoid "Too many open files" errors - if image.getexif().get(PIL.Image.ExifTags.Base.Orientation) is not None: - image = PIL.ImageOps.exif_transpose(image) - if self.mode and self.mode != image.mode: - image = image.convert(self.mode) - return image + else: + image = PIL.Image.open(BytesIO(bytes_)) + + image.load() # to avoid "Too many open files" errors + + if image.getexif().get(PIL.Image.ExifTags.Base.Orientation) is not None: + image = PIL.ImageOps.exif_transpose(image) + + if self.mode and self.mode != image.mode: + image = image.convert(self.mode) + + return image + + except Exception as e: + if 
self.ignore_decode_errors: + warnings.warn(f"[Image.decode_example] Skipped corrupted image: {e}") + return None + else: + raise + + def flatten(self) -> Union["FeatureType", dict[str, "FeatureType"]]: """If in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary.""" From 92a2fad730d808f58d7b6eda8b4e334f54a24451 Mon Sep 17 00:00:00 2001 From: Arjun Dinesh Jagdale <142811259+ArjunJagdale@users.noreply.github.com> Date: Fri, 4 Jul 2025 12:31:35 +0530 Subject: [PATCH 2/2] feat(image): add robust EXIF handling to `decode_example` for graceful image decoding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit extends the `ignore_decode_errors=True` behavior in the `Image` feature to separately handle failures in EXIF metadata decoding (e.g., `.getexif()` errors). What was added: - `image.getexif()` and EXIF orientation correction (`ImageOps.exif_transpose`) are now wrapped in a separate try/except block. - If EXIF metadata is malformed (e.g., invalid UTF-8), it will be skipped gracefully *only if* `ignore_decode_errors=True`. - A warning is logged: `[Image.decode_example] Skipped EXIF metadata: ...` - The image will still be returned and used if valid. This change ensures that otherwise-decodable images are not discarded solely due to corrupt metadata. 
Issues addressed: - Closes #7612 — Enables robust streaming over corrupted image samples - Fully satisfies #7632 — Allows casting image columns without halting on invalid data - Resolves #7668 — Avoids crash on malformed EXIF while retaining the image Backward compatibility: - Existing behavior remains unchanged when `ignore_decode_errors=False` (default) - Only opt-in users will see this behavior --- src/datasets/features/image.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/datasets/features/image.py b/src/datasets/features/image.py index 6390633f9a6..d59747b0ce1 100644 --- a/src/datasets/features/image.py +++ b/src/datasets/features/image.py @@ -182,8 +182,15 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> Optional["PIL.I image.load() # to avoid "Too many open files" errors - if image.getexif().get(PIL.Image.ExifTags.Base.Orientation) is not None: - image = PIL.ImageOps.exif_transpose(image) + try: + exif = image.getexif() + if exif.get(PIL.Image.ExifTags.Base.Orientation) is not None: + image = PIL.ImageOps.exif_transpose(image) + except Exception as exif_err: + if self.ignore_decode_errors: + warnings.warn(f"[Image.decode_example] Skipped EXIF metadata: {exif_err}") + else: + raise if self.mode and self.mode != image.mode: image = image.convert(self.mode)