diff --git a/pyproject.toml b/pyproject.toml
index 511a9e0d744..abac4a9d90a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,4 +24,5 @@ filterwarnings = [
 markers = [
     "unit: unit test",
     "integration: integration test",
+    "high_memory: tests requiring significant RAM (>=16GB)",
 ]
diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py
index 3174f5cf206..606a007af90 100644
--- a/src/datasets/arrow_writer.py
+++ b/src/datasets/arrow_writer.py
@@ -524,7 +524,9 @@ def _build_schema(self, inferred_schema: pa.Schema):
 
     def _build_writer(self, inferred_schema: pa.Schema):
         self._schema, self._features = self._build_schema(inferred_schema)
-        self.pa_writer = pa.RecordBatchStreamWriter(self.stream, self._schema)
+        self.pa_writer = pa.RecordBatchStreamWriter(
+            self.stream, self._schema, options=pa.ipc.IpcWriteOptions(allow_64bit=True)
+        )
 
     @property
     def schema(self):
diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py
index 88259767ae0..93215184d9f 100644
--- a/src/datasets/features/features.py
+++ b/src/datasets/features/features.py
@@ -1535,9 +1535,13 @@ def list_of_pa_arrays_to_pyarrow_listarray(l_arr: list[Optional[pa.Array]]) -> p
         [0] + [len(arr) for arr in l_arr], dtype=object
     )  # convert to dtype object to allow None insertion
     offsets = np.insert(offsets, null_indices, None)
-    offsets = pa.array(offsets, type=pa.int32())
     values = pa.concat_arrays(l_arr)
-    return pa.ListArray.from_arrays(offsets, values)
+    try:
+        offsets = pa.array(offsets, type=pa.int32())
+        return pa.ListArray.from_arrays(offsets, values)
+    except pa.lib.ArrowInvalid:
+        offsets = pa.array(offsets, type=pa.int64())
+        return pa.LargeListArray.from_arrays(offsets, values)
 
 
 def list_of_np_array_to_pyarrow_listarray(l_arr: list[np.ndarray], type: pa.DataType = None) -> pa.ListArray:
diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
index 8e76952d6ca..e40bce33296 100644
--- a/tests/test_arrow_dataset.py
+++ b/tests/test_arrow_dataset.py
@@ -4783,3 +4783,26 @@ def test_from_polars_save_to_disk_and_load_from_disk_round_trip_with_large_list(
 def test_polars_round_trip():
     ds = Dataset.from_dict({"x": [[1, 2], [3, 4, 5]], "y": ["a", "b"]})
     assert isinstance(Dataset.from_polars(ds.to_polars()), Dataset)
+
+
+@pytest.mark.high_memory
+def test_map_int32_overflow():
+    # GH: 7821
+    # A single 2**31-element uint16 array (~4GB) overflows int32 list offsets.
+    # Marked as high_memory (>=16GB) since Arrow conversion makes extra copies.
+    def process_batch(batch):
+        res = []
+        for _ in batch["id"]:
+            res.append(np.zeros(2**31, dtype=np.uint16))
+
+        return {"audio": res}
+
+    ds = Dataset.from_dict({"id": [0]})
+    mapped_ds = ds.map(
+        process_batch,
+        batched=True,
+        batch_size=1,
+        num_proc=None,
+        remove_columns=ds.column_names,
+    )
+    assert isinstance(mapped_ds, Dataset)
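
The `features.py` change can be exercised in isolation. The sketch below mirrors the try/except fallback at small scale; the helper name `chunks_to_listarray` is hypothetical and not part of this PR. Offsets are tried as int32 first, and only when the cumulative element count exceeds the int32 range does the builder switch to 64-bit offsets and a `LargeListArray`:

```python
import numpy as np
import pyarrow as pa


def chunks_to_listarray(chunks: list[pa.Array]) -> pa.Array:
    # Offsets are cumulative element counts: [0, len(c0), len(c0) + len(c1), ...]
    offsets = np.cumsum([0] + [len(c) for c in chunks], dtype=object)
    values = pa.concat_arrays(chunks)
    try:
        # int32 offsets cap the total element count at 2**31 - 1
        return pa.ListArray.from_arrays(pa.array(offsets, type=pa.int32()), values)
    except pa.lib.ArrowInvalid:
        # Past that cap, fall back to 64-bit offsets (LargeListArray)
        return pa.LargeListArray.from_arrays(pa.array(offsets, type=pa.int64()), values)


arr = chunks_to_listarray([pa.array([1, 2]), pa.array([3, 4, 5])])
print(arr.type)  # list<item: int64> -- small inputs stay on the int32 path
```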
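For the `arrow_writer.py` change: by default, Arrow IPC writers reject buffers longer than 2**31 - 1 bytes, which is what `allow_64bit=True` lifts. A minimal standalone sketch of the writer wiring, assuming only pyarrow (the schema and data here are illustrative, not from the PR):

```python
import pyarrow as pa

# One large_list row; in the real failure case its child buffer exceeds
# 2**31 - 1 bytes, but it is kept tiny here just to show the wiring.
batch = pa.record_batch(
    [pa.array([[1, 2, 3]], type=pa.large_list(pa.uint16()))], names=["audio"]
)

sink = pa.BufferOutputStream()
writer = pa.RecordBatchStreamWriter(
    sink, batch.schema, options=pa.ipc.IpcWriteOptions(allow_64bit=True)
)
writer.write_batch(batch)
writer.close()
```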