2 changes: 1 addition & 1 deletion setup.py
@@ -232,7 +232,7 @@

setup(
name="datasets",
version="4.4.2.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
version="4.4.3.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
description="HuggingFace community-driven open-source library of datasets",
long_description=open("README.md", encoding="utf-8").read(),
long_description_content_type="text/markdown",
2 changes: 1 addition & 1 deletion src/datasets/__init__.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "4.4.2.dev0"
__version__ = "4.4.3.dev0"

from .arrow_dataset import Column, Dataset
from .arrow_reader import ReadInstruction
4 changes: 2 additions & 2 deletions src/datasets/builder.py
@@ -1514,7 +1514,7 @@ def _rename_shard(shard_and_job: tuple[int]):
fpath.replace(SUFFIX, ""),
)

if total_original_shards > 1:
if total_original_shards > 1 and config.SAVE_ORIGINAL_SHARD_LENGTHS:
split_generator.split_info.original_shard_lengths = [
original_shard_length
for original_shard_lengths in original_shard_lengths_per_job
@@ -1792,7 +1792,7 @@ def _rename_shard(shard_id_and_job: tuple[int]):
fpath.replace(SUFFIX, ""),
)

if total_original_shards > 1:
if total_original_shards > 1 and config.SAVE_ORIGINAL_SHARD_LENGTHS:
split_generator.split_info.original_shard_lengths = [
original_shard_length
for original_shard_lengths in original_shard_lengths_per_job
3 changes: 3 additions & 0 deletions src/datasets/config.py
@@ -167,6 +167,9 @@
DEFAULT_EXTRACTED_DATASETS_PATH = os.path.join(DEFAULT_DOWNLOADED_DATASETS_PATH, EXTRACTED_DATASETS_DIR)
EXTRACTED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_EXTRACTED_DATASETS_PATH", DEFAULT_EXTRACTED_DATASETS_PATH))

# Cached dataset info options
SAVE_ORIGINAL_SHARD_LENGTHS = False

# Download count for the website
HF_UPDATE_DOWNLOAD_COUNTS = (
os.environ.get("HF_UPDATE_DOWNLOAD_COUNTS", "AUTO").upper() in ENV_VARS_TRUE_AND_AUTO_VALUES
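For context, a minimal usage sketch of the new flag. The repository name is hypothetical, and `original_shard_lengths` is assumed to be exposed on the split's `SplitInfo`, as referenced in builder.py above:

import datasets
from datasets import load_dataset

# Opt in before the dataset is downloaded/prepared; the flag defaults to False.
datasets.config.SAVE_ORIGINAL_SHARD_LENGTHS = True

ds = load_dataset("username/some_sharded_dataset", split="train")  # hypothetical repo
split_info = ds.info.splits["train"]
# Populated only when the source split had more than one original shard.
print(split_info.original_shard_lengths)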
7 changes: 6 additions & 1 deletion src/datasets/features/features.py
@@ -42,7 +42,7 @@
from ..utils.py_utils import asdict, first_non_null_value, zip_dict
from .audio import Audio
from .image import Image, encode_pil_image
from .nifti import Nifti
from .nifti import Nifti, encode_nibabel_image
from .pdf import Pdf, encode_pdfplumber_pdf
from .translation import Translation, TranslationVariableLanguages
from .video import Video
@@ -307,6 +307,9 @@ def _cast_to_python_objects(obj: Any, only_1d_for_numpy: bool, optimize_list_cas
if config.PDFPLUMBER_AVAILABLE and "pdfplumber" in sys.modules:
import pdfplumber

if config.NIBABEL_AVAILABLE and "nibabel" in sys.modules:
import nibabel as nib

if config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules:
from torchcodec.decoders import AudioDecoder, VideoDecoder

@@ -380,6 +383,8 @@ def _cast_to_python_objects(obj: Any, only_1d_for_numpy: bool, optimize_list_cas
return encode_pil_image(obj), True
elif config.PDFPLUMBER_AVAILABLE and "pdfplumber" in sys.modules and isinstance(obj, pdfplumber.pdf.PDF):
return encode_pdfplumber_pdf(obj), True
elif config.NIBABEL_AVAILABLE and "nibabel" in sys.modules and isinstance(obj, nib.analyze.AnalyzeImage):
return encode_nibabel_image(obj, force_bytes=True), True
elif isinstance(obj, pd.Series):
return (
_cast_to_python_objects(
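A hedged sketch of what this branch enables: building a dataset directly from in-memory nibabel images, assuming nibabel is installed and that Dataset.from_dict routes nibabel objects through the new branch the way it does for PIL images (Nifti1Image inherits from nib.analyze.AnalyzeImage, so it should match the isinstance check above):

import numpy as np
import nibabel as nib
from datasets import Dataset, Features
from datasets.features.nifti import Nifti

# An in-memory image with no backing file; it is serialized to bytes on encode.
img = nib.Nifti1Image(np.zeros((4, 4, 4), dtype=np.float32), affine=np.eye(4))
ds = Dataset.from_dict({"scan": [img]}, features=Features({"scan": Nifti()}))
print(ds[0]["scan"])  # decoded back into a nibabel image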
7 changes: 4 additions & 3 deletions src/datasets/features/nifti.py
@@ -27,7 +27,7 @@ class Nifti1ImageWrapper(nib.nifti1.Nifti1Image):

def __init__(self, nifti_image: nib.nifti1.Nifti1Image):
super().__init__(
dataobj=nifti_image.get_fdata(),
dataobj=nifti_image.dataobj,
affine=nifti_image.affine,
header=nifti_image.header,
extra=nifti_image.extra,
@@ -300,7 +300,7 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.BinaryA
return array_cast(storage, self.pa_type)


def encode_nibabel_image(img: "nib.Nifti1Image") -> dict[str, Optional[Union[str, bytes]]]:
def encode_nibabel_image(img: "nib.Nifti1Image", force_bytes: bool = False) -> dict[str, Optional[Union[str, bytes]]]:
"""
Encode a nibabel image object into a dictionary.

@@ -309,11 +309,12 @@ def encode_nibabel_image(img: "nib.Nifti1Image") -> dict[str, Optional[Union[str

Args:
img: A nibabel image object (e.g., Nifti1Image).
force_bytes: If `True`, always serialize to bytes even if a file path exists. Needed so that uploads embed the image bytes rather than a local file path.

Returns:
dict: A dictionary with "path" or "bytes" field.
"""
if hasattr(img, "file_map") and img.file_map is not None:
if hasattr(img, "file_map") and img.file_map is not None and not force_bytes:
filename = img.file_map["image"].filename
return {"path": filename, "bytes": None}

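A short sketch of the two encoding modes; the file name is a placeholder for any local NIfTI file:

import nibabel as nib
from datasets.features.nifti import encode_nibabel_image

img = nib.load("scan.nii.gz")  # hypothetical local file, so img.file_map carries its path
print(encode_nibabel_image(img))                    # {"path": "scan.nii.gz", "bytes": None}
print(encode_nibabel_image(img, force_bytes=True))  # "bytes" holds the serialized image instead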
28 changes: 27 additions & 1 deletion src/datasets/fingerprint.py
@@ -49,7 +49,33 @@ class _TempCacheDir:
"""

def __init__(self):
self.name = tempfile.mkdtemp(prefix=config.TEMP_CACHE_DIR_PREFIX)
# Check if TMPDIR is set and handle the case where it doesn't exist
tmpdir = os.environ.get("TMPDIR") or os.environ.get("TEMP") or os.environ.get("TMP")
# Normalize the path to handle any path resolution issues
if tmpdir:
tmpdir = os.path.normpath(tmpdir)
if not os.path.exists(tmpdir):
# Auto-create the directory if it doesn't exist
# This prevents tempfile from silently falling back to /tmp
try:
os.makedirs(tmpdir, exist_ok=True)
logger.info(f"Created TMPDIR directory: {tmpdir}")
except OSError as e:
raise OSError(
f"TMPDIR is set to '{tmpdir}' but the directory does not exist and could not be created: {e}. "
"Please create it manually or unset TMPDIR to fall back to the default temporary directory."
) from e
# If tmpdir exists, verify it's actually a directory and writable
elif not os.path.isdir(tmpdir):
raise OSError(
f"TMPDIR is set to '{tmpdir}' but it is not a directory. "
"Please point TMPDIR to a writable directory or unset it to fall back to the default temporary directory."
)

# Explicitly pass the directory to mkdtemp to ensure TMPDIR is respected
# This works even if tempfile.gettempdir() was already called and cached
# Pass dir=None if tmpdir is None to use default temp directory
self.name = tempfile.mkdtemp(prefix=config.TEMP_CACHE_DIR_PREFIX, dir=tmpdir)
self._finalizer = weakref.finalize(self, self._cleanup)

def _cleanup(self):
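The explicit dir= argument matters because tempfile caches its default directory on first use; a minimal sketch of that pitfall (paths are hypothetical):

import os
import tempfile

tempfile.gettempdir()                      # first call caches the default, e.g. /tmp
os.environ["TMPDIR"] = "./my_tmpdir"       # set (or changed) afterwards
os.makedirs(os.environ["TMPDIR"], exist_ok=True)   # the patched code auto-creates a missing TMPDIR
print(tempfile.gettempdir())               # still the cached default; the new TMPDIR is ignored
print(tempfile.mkdtemp(dir=os.environ["TMPDIR"]))  # respects TMPDIR because dir= is explicit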
9 changes: 9 additions & 0 deletions src/datasets/table.py
@@ -2120,6 +2120,15 @@ def embed_array_storage(array: pa.Array, feature: "FeatureType", token_per_repo_

if isinstance(array, pa.ExtensionArray):
array = array.storage

# Force contiguous copy for sliced list arrays to avoid SIGKILL crash.
# When ds.shard() or ds.select() creates a sliced view, array.values returns
# values with internal offset references that can cause PyArrow's C++ layer
# to crash when processing nested types like Sequence(Nifti()).
if pa.types.is_list(array.type) or pa.types.is_large_list(array.type):
if hasattr(array, "offset") and array.offset > 0:
array = pa.concat_arrays([array])

if hasattr(feature, "embed_storage"):
return feature.embed_storage(array, token_per_repo_id=token_per_repo_id)
elif pa.types.is_struct(array.type):
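A standalone illustration of the offset behaviour this guards against and of the contiguous-copy trick, using plain integers instead of the nested Nifti/Image case:

import pyarrow as pa

arr = pa.array([[1, 2], [3], [], [4, 5, 6]], type=pa.list_(pa.int64()))
sliced = arr.slice(1, 2)                   # simulates ds.shard() / ds.select()
print(sliced.offset)                       # 1: the view still points into the parent buffers
contiguous = pa.concat_arrays([sliced])    # concatenating a single array copies it into fresh buffers
print(contiguous.offset, len(contiguous))  # 0 2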
3 changes: 3 additions & 0 deletions tests/conftest.py
@@ -27,6 +27,9 @@ def set_test_cache_config(tmp_path_factory, monkeypatch):
test_extracted_datasets_path = test_hf_datasets_cache / "downloads" / "extracted"
monkeypatch.setattr("datasets.config.EXTRACTED_DATASETS_PATH", str(test_extracted_datasets_path))

# Used by the dataset viewer; we may enable it by default in the future
monkeypatch.setattr("datasets.config.SAVE_ORIGINAL_SHARD_LENGTHS", True)


@pytest.fixture(autouse=True)
def disable_implicit_token(monkeypatch):
122 changes: 122 additions & 0 deletions tests/features/test_embed_storage_sliced.py
@@ -0,0 +1,122 @@
"""Tests for embed_array_storage with sliced/sharded arrays.

Regression tests for SIGKILL crash when processing sliced/sharded Arrow tables
with nested types like Sequence(Nifti()) or Sequence(Image()).
"""
Comment on lines 1 to 5

⚠️ Potential issue | 🟡 Minor

Update placeholder issue number in docstring.

Same as in table.py, the placeholder XXXX should be replaced with the actual issue number.

Suggested change:
-Regression tests for https://github.com/huggingface/datasets/issues/XXXX
+Regression tests for https://github.com/huggingface/datasets/issues/6


import pyarrow as pa

from datasets.features import Image, List
from datasets.table import embed_array_storage

from ..utils import require_nibabel


class TestEmbedArrayStorageSliced:
"""Tests for embed_array_storage with sliced/sharded arrays."""

def test_embed_array_storage_sliced_list_image(self, shared_datadir):
"""embed_array_storage should work on sliced ListArray with Image.

This is a regression test for SIGKILL when processing sharded datasets
with Sequence(Image()) or similar nested types.
"""
image_file = str(shared_datadir / "test_image_rgb.jpg")

# Create a ListArray with 4 items
array = pa.array(
[
[{"bytes": None, "path": image_file}],
[{"bytes": None, "path": image_file}, {"bytes": None, "path": image_file}],
[],
[{"bytes": None, "path": image_file}],
],
type=pa.list_(Image.pa_type),
)

# Slice it (simulates ds.shard() or ds.select())
sliced = array.slice(1, 2) # Items 1 and 2

# Verify the array is actually sliced (this is the problematic case)
assert sliced.offset == 1, "Expected sliced array to have non-zero offset"

# This should NOT crash with SIGKILL
embedded = embed_array_storage(sliced, List(Image()))

# The fix should make the result contiguous (offset = 0)
assert embedded.offset == 0, "Result should be contiguous after fix"
assert len(embedded) == 2
# Item 0 of sliced = Item 1 of original (has 2 images)
assert len(embedded[0].as_py()) == 2
# Item 1 of sliced = Item 2 of original (empty list)
assert len(embedded[1].as_py()) == 0

@require_nibabel
def test_embed_array_storage_sliced_list_nifti(self, shared_datadir):
"""embed_array_storage should work on sliced ListArray with Nifti.

This is the specific case that crashed in the ARC dataset upload.
"""
from datasets.features.nifti import Nifti

nifti_path = str(shared_datadir / "test_nifti.nii.gz")

# Create a ListArray with 4 items (Sequence(Nifti()))
array = pa.array(
[
[{"bytes": None, "path": nifti_path}],
[{"bytes": None, "path": nifti_path}, {"bytes": None, "path": nifti_path}],
[], # Empty list - this also triggered the crash
[{"bytes": None, "path": nifti_path}],
],
type=pa.list_(Nifti.pa_type),
)

# Slice it (simulates ds.shard())
sliced = array.slice(1, 2)

# Verify the array is actually sliced
assert sliced.offset == 1, "Expected sliced array to have non-zero offset"

# This should NOT crash with SIGKILL
embedded = embed_array_storage(sliced, List(Nifti()))

# The fix should make the result contiguous (offset = 0)
assert embedded.offset == 0, "Result should be contiguous after fix"
assert len(embedded) == 2
# Verify bytes were embedded
assert embedded[0].as_py()[0]["bytes"] is not None

def test_embed_array_storage_sliced_large_list(self, shared_datadir):
"""embed_array_storage should work on sliced LargeListArray."""
image_file = str(shared_datadir / "test_image_rgb.jpg")

# Create a LargeListArray with 4 items
from datasets.features import LargeList

array = pa.array(
[
[{"bytes": None, "path": image_file}],
[{"bytes": None, "path": image_file}, {"bytes": None, "path": image_file}],
[],
[{"bytes": None, "path": image_file}],
],
type=pa.large_list(Image.pa_type),
)

# Slice it
sliced = array.slice(1, 2)

# Verify the array is actually sliced
assert sliced.offset == 1, "Expected sliced array to have non-zero offset"

# This should NOT crash with SIGKILL
embedded = embed_array_storage(sliced, LargeList(Image()))

# The fix should make the result contiguous (offset = 0)
assert embedded.offset == 0, "Result should be contiguous after fix"
assert len(embedded) == 2
# Item 0 of sliced = Item 1 of original (has 2 images)
assert len(embedded[0].as_py()) == 2
# Verify bytes were embedded
assert embedded[0].as_py()[0]["bytes"] is not None
19 changes: 19 additions & 0 deletions tests/features/test_nifti.py
@@ -128,3 +128,22 @@ def test_load_zipped_file_locally(shared_datadir):

ds = load_dataset("niftifolder", data_files=nifti_path)
assert isinstance(ds["train"][0]["nifti"], nib.nifti1.Nifti1Image)


@require_nibabel
def test_nifti_lazy_loading(shared_datadir):
import nibabel as nib
import numpy as np

nifti_path = str(shared_datadir / "test_nifti.nii.gz")
nifti = Nifti()
encoded_example = nifti.encode_example(nifti_path)
decoded_example = nifti.decode_example(encoded_example)

# Verify that the data object is an ArrayProxy (lazy) and not a numpy array (dense)
assert nib.is_proxy(decoded_example.dataobj)
assert not isinstance(decoded_example.dataobj, np.ndarray)

# Verify that we can still access the data
data = decoded_example.get_fdata()
assert data.shape == (80, 80, 10)
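
For reference, a small sketch of what the dataobj change in nifti.py preserves (the file name is hypothetical):

import nibabel as nib
import numpy as np

img = nib.load("scan.nii.gz")             # hypothetical local file
print(nib.is_proxy(img.dataobj))          # True: voxels stay on disk until requested
dense = img.get_fdata()                   # materializes the full float64 volume in memory
print(isinstance(dense, np.ndarray))      # True
# Passing dataobj=img.dataobj to the wrapper keeps the lazy proxy; passing
# get_fdata() would have loaded the whole volume eagerly.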