2 changes: 1 addition & 1 deletion setup.py
@@ -232,7 +232,7 @@

setup(
name="datasets",
version="4.4.2.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
version="4.4.3.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
description="HuggingFace community-driven open-source library of datasets",
long_description=open("README.md", encoding="utf-8").read(),
long_description_content_type="text/markdown",
2 changes: 1 addition & 1 deletion src/datasets/__init__.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "4.4.2.dev0"
__version__ = "4.4.3.dev0"

from .arrow_dataset import Column, Dataset
from .arrow_reader import ReadInstruction
4 changes: 2 additions & 2 deletions src/datasets/builder.py
@@ -1514,7 +1514,7 @@ def _rename_shard(shard_and_job: tuple[int]):
fpath.replace(SUFFIX, ""),
)

if total_original_shards > 1:
if total_original_shards > 1 and config.SAVE_ORIGINAL_SHARD_LENGTHS:
split_generator.split_info.original_shard_lengths = [
original_shard_length
for original_shard_lengths in original_shard_lengths_per_job
@@ -1792,7 +1792,7 @@ def _rename_shard(shard_id_and_job: tuple[int]):
fpath.replace(SUFFIX, ""),
)

if total_original_shards > 1:
if total_original_shards > 1 and config.SAVE_ORIGINAL_SHARD_LENGTHS:
split_generator.split_info.original_shard_lengths = [
original_shard_length
for original_shard_lengths in original_shard_lengths_per_job
3 changes: 3 additions & 0 deletions src/datasets/config.py
@@ -167,6 +167,9 @@
DEFAULT_EXTRACTED_DATASETS_PATH = os.path.join(DEFAULT_DOWNLOADED_DATASETS_PATH, EXTRACTED_DATASETS_DIR)
EXTRACTED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_EXTRACTED_DATASETS_PATH", DEFAULT_EXTRACTED_DATASETS_PATH))

# Cached dataset info options
SAVE_ORIGINAL_SHARD_LENGTHS = False

# Download count for the website
HF_UPDATE_DOWNLOAD_COUNTS = (
os.environ.get("HF_UPDATE_DOWNLOAD_COUNTS", "AUTO").upper() in ENV_VARS_TRUE_AND_AUTO_VALUES
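For context, a minimal usage sketch of the new flag. The repository name is hypothetical, and `original_shard_lengths` is assumed to be exposed on the split's `SplitInfo`, as referenced in builder.py above:

import datasets
from datasets import load_dataset

# Opt in before the dataset is downloaded/prepared; the flag defaults to False.
datasets.config.SAVE_ORIGINAL_SHARD_LENGTHS = True

ds = load_dataset("username/some_sharded_dataset", split="train")  # hypothetical repo
split_info = ds.info.splits["train"]
# Populated only when the source split had more than one original shard.
print(split_info.original_shard_lengths)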
7 changes: 6 additions & 1 deletion src/datasets/features/features.py
@@ -42,7 +42,7 @@
from ..utils.py_utils import asdict, first_non_null_value, zip_dict
from .audio import Audio
from .image import Image, encode_pil_image
from .nifti import Nifti
from .nifti import Nifti, encode_nibabel_image
from .pdf import Pdf, encode_pdfplumber_pdf
from .translation import Translation, TranslationVariableLanguages
from .video import Video
@@ -307,6 +307,9 @@ def _cast_to_python_objects(obj: Any, only_1d_for_numpy: bool, optimize_list_cas
if config.PDFPLUMBER_AVAILABLE and "pdfplumber" in sys.modules:
import pdfplumber

if config.NIBABEL_AVAILABLE and "nibabel" in sys.modules:
import nibabel as nib

if config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules:
from torchcodec.decoders import AudioDecoder, VideoDecoder

@@ -380,6 +383,8 @@ def _cast_to_python_objects(obj: Any, only_1d_for_numpy: bool, optimize_list_cas
return encode_pil_image(obj), True
elif config.PDFPLUMBER_AVAILABLE and "pdfplumber" in sys.modules and isinstance(obj, pdfplumber.pdf.PDF):
return encode_pdfplumber_pdf(obj), True
elif config.NIBABEL_AVAILABLE and "nibabel" in sys.modules and isinstance(obj, nib.analyze.AnalyzeImage):
return encode_nibabel_image(obj, force_bytes=True), True
elif isinstance(obj, pd.Series):
return (
_cast_to_python_objects(
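A hedged sketch of what this branch enables: building a dataset directly from in-memory nibabel images, assuming nibabel is installed and that Dataset.from_dict routes nibabel objects through the new branch the way it does for PIL images (Nifti1Image inherits from nib.analyze.AnalyzeImage, so it should match the isinstance check above):

import numpy as np
import nibabel as nib
from datasets import Dataset, Features
from datasets.features.nifti import Nifti

# An in-memory image with no backing file; it is serialized to bytes on encode.
img = nib.Nifti1Image(np.zeros((4, 4, 4), dtype=np.float32), affine=np.eye(4))
ds = Dataset.from_dict({"scan": [img]}, features=Features({"scan": Nifti()}))
print(ds[0]["scan"])  # decoded back into a nibabel image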
7 changes: 4 additions & 3 deletions src/datasets/features/nifti.py
@@ -27,7 +27,7 @@ class Nifti1ImageWrapper(nib.nifti1.Nifti1Image):

def __init__(self, nifti_image: nib.nifti1.Nifti1Image):
super().__init__(
dataobj=nifti_image.get_fdata(),
dataobj=nifti_image.dataobj,
affine=nifti_image.affine,
header=nifti_image.header,
extra=nifti_image.extra,
@@ -300,7 +300,7 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.BinaryA
return array_cast(storage, self.pa_type)


def encode_nibabel_image(img: "nib.Nifti1Image") -> dict[str, Optional[Union[str, bytes]]]:
def encode_nibabel_image(img: "nib.Nifti1Image", force_bytes: bool = False) -> dict[str, Optional[Union[str, bytes]]]:
"""
Encode a nibabel image object into a dictionary.

@@ -309,11 +309,12 @@ def encode_nibabel_image(img: "nib.Nifti1Image") -> dict[str, Optional[Union[str

Args:
img: A nibabel image object (e.g., Nifti1Image).
force_bytes: If `True`, always serialize to bytes even if a file path exists. Needed so that uploads embed the image bytes rather than a local file path.

Returns:
dict: A dictionary with "path" or "bytes" field.
"""
if hasattr(img, "file_map") and img.file_map is not None:
if hasattr(img, "file_map") and img.file_map is not None and not force_bytes:
filename = img.file_map["image"].filename
return {"path": filename, "bytes": None}

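A short sketch of the two encoding modes; the file name is a placeholder for any local NIfTI file:

import nibabel as nib
from datasets.features.nifti import encode_nibabel_image

img = nib.load("scan.nii.gz")  # hypothetical local file, so img.file_map carries its path
print(encode_nibabel_image(img))                    # {"path": "scan.nii.gz", "bytes": None}
print(encode_nibabel_image(img, force_bytes=True))  # "bytes" holds the serialized image instead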
28 changes: 27 additions & 1 deletion src/datasets/fingerprint.py
@@ -49,7 +49,33 @@ class _TempCacheDir:
"""

def __init__(self):
self.name = tempfile.mkdtemp(prefix=config.TEMP_CACHE_DIR_PREFIX)
# Check if TMPDIR is set and handle the case where it doesn't exist
tmpdir = os.environ.get("TMPDIR") or os.environ.get("TEMP") or os.environ.get("TMP")
# Normalize the path to handle any path resolution issues
if tmpdir:
tmpdir = os.path.normpath(tmpdir)
if not os.path.exists(tmpdir):
# Auto-create the directory if it doesn't exist
# This prevents tempfile from silently falling back to /tmp
try:
os.makedirs(tmpdir, exist_ok=True)
logger.info(f"Created TMPDIR directory: {tmpdir}")
except OSError as e:
raise OSError(
f"TMPDIR is set to '{tmpdir}' but the directory does not exist and could not be created: {e}. "
"Please create it manually or unset TMPDIR to fall back to the default temporary directory."
) from e
# If tmpdir exists, verify it's actually a directory and writable
elif not os.path.isdir(tmpdir):
raise OSError(
f"TMPDIR is set to '{tmpdir}' but it is not a directory. "
"Please point TMPDIR to a writable directory or unset it to fall back to the default temporary directory."
)

# Explicitly pass the directory to mkdtemp to ensure TMPDIR is respected
# This works even if tempfile.gettempdir() was already called and cached
# Pass dir=None if tmpdir is None to use default temp directory
self.name = tempfile.mkdtemp(prefix=config.TEMP_CACHE_DIR_PREFIX, dir=tmpdir)
self._finalizer = weakref.finalize(self, self._cleanup)

def _cleanup(self):
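The explicit dir= argument matters because tempfile caches its default directory on first use; a minimal sketch of that pitfall (paths are hypothetical):

import os
import tempfile

tempfile.gettempdir()                      # first call caches the default, e.g. /tmp
os.environ["TMPDIR"] = "./my_tmpdir"       # set (or changed) afterwards
os.makedirs(os.environ["TMPDIR"], exist_ok=True)   # the patched code auto-creates a missing TMPDIR
print(tempfile.gettempdir())               # still the cached default; the new TMPDIR is ignored
print(tempfile.mkdtemp(dir=os.environ["TMPDIR"]))  # respects TMPDIR because dir= is explicit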
9 changes: 9 additions & 0 deletions src/datasets/table.py
@@ -2120,6 +2120,15 @@ def embed_array_storage(array: pa.Array, feature: "FeatureType", token_per_repo_

if isinstance(array, pa.ExtensionArray):
array = array.storage

# Force contiguous copy for sliced list arrays to avoid SIGKILL crash.
# When ds.shard() or ds.select() creates a sliced view, array.values returns
# values with internal offset references that can cause PyArrow's C++ layer
# to crash when processing nested types like Sequence(Nifti()).
if pa.types.is_list(array.type) or pa.types.is_large_list(array.type):
if hasattr(array, "offset") and array.offset > 0:
array = pa.concat_arrays([array])

if hasattr(feature, "embed_storage"):
return feature.embed_storage(array, token_per_repo_id=token_per_repo_id)
elif pa.types.is_struct(array.type):
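A standalone illustration of the offset behaviour this guards against and of the contiguous-copy trick, using plain integers instead of the nested Nifti/Image case:

import pyarrow as pa

arr = pa.array([[1, 2], [3], [], [4, 5, 6]], type=pa.list_(pa.int64()))
sliced = arr.slice(1, 2)                   # simulates ds.shard() / ds.select()
print(sliced.offset)                       # 1: the view still points into the parent buffers
contiguous = pa.concat_arrays([sliced])    # concatenating a single array copies it into fresh buffers
print(contiguous.offset, len(contiguous))  # 0 2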
3 changes: 3 additions & 0 deletions tests/conftest.py
@@ -27,6 +27,9 @@ def set_test_cache_config(tmp_path_factory, monkeypatch):
test_extracted_datasets_path = test_hf_datasets_cache / "downloads" / "extracted"
monkeypatch.setattr("datasets.config.EXTRACTED_DATASETS_PATH", str(test_extracted_datasets_path))

# Used by the dataset viewer; we may enable it by default in the future
monkeypatch.setattr("datasets.config.SAVE_ORIGINAL_SHARD_LENGTHS", True)


@pytest.fixture(autouse=True)
def disable_implicit_token(monkeypatch):
122 changes: 122 additions & 0 deletions tests/features/test_embed_storage_sliced.py
@@ -0,0 +1,122 @@
"""Tests for embed_array_storage with sliced/sharded arrays.

Regression tests for SIGKILL crash when processing sliced/sharded Arrow tables
with nested types like Sequence(Nifti()) or Sequence(Image()).
"""
Comment on lines 1 to 5

⚠️ Potential issue | 🟡 Minor

Update placeholder issue number in docstring.

Same as in table.py, the placeholder XXXX should be replaced with the actual issue number.

Suggested change:
-Regression tests for https://github.com/huggingface/datasets/issues/XXXX
+Regression tests for https://github.com/huggingface/datasets/issues/6


import pyarrow as pa

from datasets.features import Image, List
from datasets.table import embed_array_storage

from ..utils import require_nibabel


class TestEmbedArrayStorageSliced:
"""Tests for embed_array_storage with sliced/sharded arrays."""

def test_embed_array_storage_sliced_list_image(self, shared_datadir):
"""embed_array_storage should work on sliced ListArray with Image.

This is a regression test for SIGKILL when processing sharded datasets
with Sequence(Image()) or similar nested types.
"""
image_file = str(shared_datadir / "test_image_rgb.jpg")

# Create a ListArray with 4 items
array = pa.array(
[
[{"bytes": None, "path": image_file}],
[{"bytes": None, "path": image_file}, {"bytes": None, "path": image_file}],
[],
[{"bytes": None, "path": image_file}],
],
type=pa.list_(Image.pa_type),
)

# Slice it (simulates ds.shard() or ds.select())
sliced = array.slice(1, 2) # Items 1 and 2

# Verify the array is actually sliced (this is the problematic case)
assert sliced.offset == 1, "Expected sliced array to have non-zero offset"

# This should NOT crash with SIGKILL
embedded = embed_array_storage(sliced, List(Image()))

# The fix should make the result contiguous (offset = 0)
assert embedded.offset == 0, "Result should be contiguous after fix"
assert len(embedded) == 2
# Item 0 of sliced = Item 1 of original (has 2 images)
assert len(embedded[0].as_py()) == 2
# Item 1 of sliced = Item 2 of original (empty list)
assert len(embedded[1].as_py()) == 0

@require_nibabel
def test_embed_array_storage_sliced_list_nifti(self, shared_datadir):
"""embed_array_storage should work on sliced ListArray with Nifti.

This is the specific case that crashed in the ARC dataset upload.
"""
from datasets.features.nifti import Nifti

nifti_path = str(shared_datadir / "test_nifti.nii.gz")

# Create a ListArray with 4 items (Sequence(Nifti()))
array = pa.array(
[
[{"bytes": None, "path": nifti_path}],
[{"bytes": None, "path": nifti_path}, {"bytes": None, "path": nifti_path}],
[], # Empty list - this also triggered the crash
[{"bytes": None, "path": nifti_path}],
],
type=pa.list_(Nifti.pa_type),
)

# Slice it (simulates ds.shard())
sliced = array.slice(1, 2)

# Verify the array is actually sliced
assert sliced.offset == 1, "Expected sliced array to have non-zero offset"

# This should NOT crash with SIGKILL
embedded = embed_array_storage(sliced, List(Nifti()))

# The fix should make the result contiguous (offset = 0)
assert embedded.offset == 0, "Result should be contiguous after fix"
assert len(embedded) == 2
# Verify bytes were embedded
assert embedded[0].as_py()[0]["bytes"] is not None

def test_embed_array_storage_sliced_large_list(self, shared_datadir):
"""embed_array_storage should work on sliced LargeListArray."""
image_file = str(shared_datadir / "test_image_rgb.jpg")

# Create a LargeListArray with 4 items
from datasets.features import LargeList

array = pa.array(
[
[{"bytes": None, "path": image_file}],
[{"bytes": None, "path": image_file}, {"bytes": None, "path": image_file}],
[],
[{"bytes": None, "path": image_file}],
],
type=pa.large_list(Image.pa_type),
)

# Slice it
sliced = array.slice(1, 2)

# Verify the array is actually sliced
assert sliced.offset == 1, "Expected sliced array to have non-zero offset"

# This should NOT crash with SIGKILL
embedded = embed_array_storage(sliced, LargeList(Image()))

# The fix should make the result contiguous (offset = 0)
assert embedded.offset == 0, "Result should be contiguous after fix"
assert len(embedded) == 2
# Item 0 of sliced = Item 1 of original (has 2 images)
assert len(embedded[0].as_py()) == 2
# Verify bytes were embedded
assert embedded[0].as_py()[0]["bytes"] is not None
19 changes: 19 additions & 0 deletions tests/features/test_nifti.py
@@ -128,3 +128,22 @@ def test_load_zipped_file_locally(shared_datadir):

ds = load_dataset("niftifolder", data_files=nifti_path)
assert isinstance(ds["train"][0]["nifti"], nib.nifti1.Nifti1Image)


@require_nibabel
def test_nifti_lazy_loading(shared_datadir):
import nibabel as nib
import numpy as np

nifti_path = str(shared_datadir / "test_nifti.nii.gz")
nifti = Nifti()
encoded_example = nifti.encode_example(nifti_path)
decoded_example = nifti.decode_example(encoded_example)

# Verify that the data object is an ArrayProxy (lazy) and not a numpy array (dense)
assert nib.is_proxy(decoded_example.dataobj)
assert not isinstance(decoded_example.dataobj, np.ndarray)

# Verify that we can still access the data
data = decoded_example.get_fdata()
assert data.shape == (80, 80, 10)
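
For reference, a small sketch of what the dataobj change in nifti.py preserves (the file name is hypothetical):

import nibabel as nib
import numpy as np

img = nib.load("scan.nii.gz")             # hypothetical local file
print(nib.is_proxy(img.dataobj))          # True: voxels stay on disk until requested
dense = img.get_fdata()                   # materializes the full float64 volume in memory
print(isinstance(dense, np.ndarray))      # True
# Passing dataobj=img.dataobj to the wrapper keeps the lazy proxy; passing
# get_fdata() would have loaded the whole volume eagerly.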