
Commit 67b5bd9

remove unused keys dedupe
1 parent: 1f33908

6 files changed: 7 additions, 225 deletions

src/datasets/arrow_writer.py

Lines changed: 3 additions & 47 deletions
@@ -16,7 +16,7 @@
 import json
 import sys
 from collections.abc import Iterable
-from typing import Any, Optional, Union
+from typing import Any, Optional
 
 import fsspec
 import numpy as np
@@ -40,7 +40,6 @@
 )
 from .filesystems import is_remote_filesystem
 from .info import DatasetInfo
-from .keyhash import DuplicatedKeysError, KeyHasher
 from .table import array_cast, cast_array_to_feature, embed_table_storage, table_cast
 from .utils import logging
 from .utils.py_utils import asdict, convert_file_size_to_int, first_non_null_non_empty_value
@@ -414,8 +413,6 @@ def __init__(
         stream: Optional[pa.NativeFile] = None,
         fingerprint: Optional[str] = None,
         writer_batch_size: Optional[int] = None,
-        hash_salt: Optional[str] = None,
-        check_duplicates: Optional[bool] = False,
         disable_nullable: bool = False,
         update_features: bool = False,
         with_metadata: bool = True,
@@ -435,13 +432,6 @@ def __init__(
         self._features = None
         self._schema = None
 
-        if hash_salt is not None:
-            # Create KeyHasher instance using split name as hash salt
-            self._hasher = KeyHasher(hash_salt)
-        else:
-            self._hasher = KeyHasher("")
-
-        self._check_duplicates = check_duplicates
         self._disable_nullable = disable_nullable
 
         if stream is None:
@@ -592,51 +582,21 @@ def write_rows_on_file(self):
     def write(
         self,
         example: dict[str, Any],
-        key: Optional[Union[str, int, bytes]] = None,
         writer_batch_size: Optional[int] = None,
     ):
         """Add a given (Example,Key) pair to the write-pool of examples which is written to file.
 
         Args:
             example: the Example to add.
-            key: Optional, a unique identifier(str, int or bytes) associated with each example
         """
-        # Utilize the keys and duplicate checking when `self._check_duplicates` is passed True
-        if self._check_duplicates:
-            # Create unique hash from key and store as (key, example) pairs
-            hash = self._hasher.hash(key)
-            self.current_examples.append((example, hash))
-            # Maintain record of keys and their respective hashes for checking duplicates
-            self.hkey_record.append((hash, key))
-        else:
-            # Store example as a tuple so as to keep the structure of `self.current_examples` uniform
-            self.current_examples.append((example, ""))
+        # Store example as a tuple so as to keep the structure of `self.current_examples` uniform
+        self.current_examples.append((example, ""))
 
         if writer_batch_size is None:
             writer_batch_size = self.writer_batch_size
         if writer_batch_size is not None and len(self.current_examples) >= writer_batch_size:
-            if self._check_duplicates:
-                self.check_duplicate_keys()
-                # Re-initializing to empty list for next batch
-                self.hkey_record = []
-
             self.write_examples_on_file()
 
-    def check_duplicate_keys(self):
-        """Raises error if duplicates found in a batch"""
-        tmp_record = set()
-        for hash, key in self.hkey_record:
-            if hash in tmp_record:
-                duplicate_key_indices = [
-                    str(self._num_examples + index)
-                    for index, (duplicate_hash, _) in enumerate(self.hkey_record)
-                    if duplicate_hash == hash
-                ]
-
-                raise DuplicatedKeysError(key, duplicate_key_indices)
-            else:
-                tmp_record.add(hash)
-
     def write_row(self, row: pa.Table, writer_batch_size: Optional[int] = None):
         """Add a given single-row Table to the write-pool of rows which is written to file.
 
@@ -721,10 +681,6 @@ def write_table(self, pa_table: pa.Table, writer_batch_size: Optional[int] = Non
     def finalize(self, close_stream=True):
         self.write_rows_on_file()
         # In case current_examples < writer_batch_size, but user uses finalize()
-        if self._check_duplicates:
-            self.check_duplicate_keys()
-            # Re-initializing to empty list for next batch
-            self.hkey_record = []
        self.write_examples_on_file()
         # If schema is known, infer features even if no examples were written
         if self.pa_writer is None and self.schema:
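After this change, `ArrowWriter.write` takes only the example (plus an optional `writer_batch_size`); the `key`, `hash_salt`, and `check_duplicates` arguments are gone. A minimal usage sketch, mirroring the updated test in tests/test_arrow_writer.py further down this diff:

    import pyarrow as pa
    from datasets.arrow_writer import ArrowWriter

    output = pa.BufferOutputStream()
    with ArrowWriter(stream=output, writer_batch_size=2) as writer:
        # No key / hash_salt / check_duplicates arguments any more: each example
        # is appended to the write pool and flushed once writer_batch_size is reached.
        writer.write({"col_1": "foo", "col_2": 1})
        writer.write({"col_1": "bar", "col_2": 2})
        num_examples, num_bytes = writer.finalize()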

src/datasets/builder.py

Lines changed: 1 addition & 18 deletions
@@ -59,7 +59,6 @@
 from .fingerprint import Hasher
 from .info import DatasetInfo, PostProcessedInfo
 from .iterable_dataset import ArrowExamplesIterable, ExamplesIterable, IterableDataset
-from .keyhash import DuplicatedKeysError
 from .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH, camelcase_to_snakecase
 from .splits import Split, SplitDict, SplitGenerator, SplitInfo
 from .streaming import extend_dataset_builder_for_streaming
@@ -979,13 +978,6 @@ def _download_and_prepare(self, dl_manager, verification_mode, **prepare_split_k
                        + "\nOriginal error:\n"
                        + str(e)
                    ) from None
-                # If check_duplicates is set to True , then except DuplicatedKeysError
-                except DuplicatedKeysError as e:
-                    raise DuplicatedKeysError(
-                        e.key,
-                        e.duplicate_key_indices,
-                        fix_msg=f"To avoid duplicate keys, please fix the dataset splits for {self.name}",
-                    ) from None
         dl_manager.manage_extracted_files()
 
         if verification_mode == VerificationMode.BASIC_CHECKS or verification_mode == VerificationMode.ALL_CHECKS:
@@ -1400,7 +1392,6 @@ def _generate_examples(self, **kwargs) -> Iterable[tuple[int, int], dict[str, An
     def _prepare_split(
         self,
         split_generator: SplitGenerator,
-        check_duplicate_keys: bool,
         file_format="arrow",
         num_proc: Optional[int] = None,
         max_shard_size: Optional[Union[int, str]] = None,
@@ -1440,7 +1431,6 @@ def _prepare_split(
             "file_format": file_format,
             "max_shard_size": max_shard_size,
             "split_info": split_info,
-            "check_duplicate_keys": check_duplicate_keys,
         }
 
         if num_proc is None or num_proc == 1:
@@ -1558,7 +1548,6 @@ def _prepare_split_single(
         file_format: str,
         max_shard_size: int,
         split_info: SplitInfo,
-        check_duplicate_keys: bool,
         job_id: int,
     ) -> Iterable[tuple[int, bool, tuple[int, int, Features, int, int, int]]]:
         generator = self._generate_examples(**gen_kwargs)
@@ -1575,8 +1564,6 @@ def _prepare_split_single(
                 features=self.info.features,
                 path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                 writer_batch_size=self._writer_batch_size,
-                hash_salt=split_info.name,
-                check_duplicates=check_duplicate_keys,
                 storage_options=self._fs.storage_options,
                 embed_local_files=embed_local_files,
             )
@@ -1594,13 +1581,11 @@ def _prepare_split_single(
                        features=writer._features,
                        path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                        writer_batch_size=self._writer_batch_size,
-                        hash_salt=split_info.name,
-                        check_duplicates=check_duplicate_keys,
                        storage_options=self._fs.storage_options,
                        embed_local_files=embed_local_files,
                    )
                example = self.info.features.encode_example(record) if self.info.features is not None else record
-                writer.write(example, (input_shard_idx, example_idx))
+                writer.write(example)
                if len(input_shard_lengths) == input_shard_idx:
                    input_shard_lengths.append(1)
                else:
@@ -1634,8 +1619,6 @@ def _download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_
         super()._download_and_prepare(
             dl_manager,
             verification_mode,
-            check_duplicate_keys=verification_mode == VerificationMode.BASIC_CHECKS
-            or verification_mode == VerificationMode.ALL_CHECKS,
             **prepare_splits_kwargs,
         )
 
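One consequence visible in the `_prepare_split_single` hunk above: `_generate_examples` still yields `(key, example)` pairs, but the key is no longer forwarded to the writer, so duplicate keys are no longer detected during `download_and_prepare`. A hypothetical builder illustrating the unchanged yield contract (the class name and data are made up for illustration):

    import datasets

    class MyDataset(datasets.GeneratorBasedBuilder):
        def _info(self):
            return datasets.DatasetInfo(
                features=datasets.Features({"text": datasets.Value("string")})
            )

        def _split_generators(self, dl_manager):
            return [datasets.SplitGenerator(name=datasets.Split.TRAIN)]

        def _generate_examples(self):
            # Keys remain part of the yield signature, but after this commit
            # ArrowWriter ignores them (no hashing, no duplicate check).
            yield 0, {"text": "first example"}
            yield 0, {"text": "same key again, now accepted silently"}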

src/datasets/keyhash.py

Lines changed: 0 additions & 104 deletions
This file was deleted.
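Judging from the imports removed elsewhere in this commit, the deleted module exposed `KeyHasher`, `DuplicatedKeysError`, and `InvalidKeyError`. A rough sketch of how it was used before this change (not the exact deleted implementation):

    # Pre-commit usage; datasets.keyhash no longer exists after this change.
    from datasets.keyhash import KeyHasher

    hasher = KeyHasher("train")          # the split name served as the hash salt
    key_hash = hasher.hash("example-0")  # hash used by ArrowWriter to detect duplicate keys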

src/datasets/utils/info_utils.py

Lines changed: 1 addition & 2 deletions
@@ -29,8 +29,7 @@ class VerificationMode(enum.Enum):
 
     | | Verification checks |
     |---------------------------|------------------------------------------------------------------------------ |
-    | `ALL_CHECKS` | Split checks, uniqueness of the keys yielded in case of the GeneratorBuilder |
-    | | and the validity (number of files, checksums, etc.) of downloaded files |
+    | `ALL_CHECKS` | Split checks and validity (number of files, checksums) of downloaded files |
     | `BASIC_CHECKS` (default) | Same as `ALL_CHECKS` but without checking downloaded files |
     | `NO_CHECKS` | None |
 
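With the key-uniqueness check removed, the verification modes now differ only in split checks and downloaded-file validation. A small usage sketch (the dataset name below is a placeholder):

    from datasets import load_dataset
    from datasets.utils.info_utils import VerificationMode

    # ALL_CHECKS still verifies split sizes and downloaded files (counts, checksums),
    # but no longer checks uniqueness of the keys yielded by a generator-based builder.
    ds = load_dataset("username/placeholder_dataset", verification_mode=VerificationMode.ALL_CHECKS)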

tests/test_arrow_writer.py

Lines changed: 2 additions & 34 deletions
@@ -14,7 +14,6 @@
 from datasets.arrow_writer import ArrowWriter, OptimizedTypedSequence, ParquetWriter, TypedSequence
 from datasets.features import Array2D, ClassLabel, Features, Image, Value
 from datasets.features.features import Array2DExtensionType, cast_to_python_objects
-from datasets.keyhash import DuplicatedKeysError, InvalidKeyError
 
 from .utils import require_pil
 
@@ -133,46 +132,15 @@ def test_write_with_features():
     assert features == Features.from_arrow_schema(schema)
 
 
-@pytest.mark.parametrize("writer_batch_size", [None, 1, 10])
-def test_key_datatype(writer_batch_size):
-    output = pa.BufferOutputStream()
-    with ArrowWriter(
-        stream=output,
-        writer_batch_size=writer_batch_size,
-        hash_salt="split_name",
-        check_duplicates=True,
-    ) as writer:
-        with pytest.raises(InvalidKeyError):
-            writer.write({"col_1": "foo", "col_2": 1}, key=[1, 2])
-            num_examples, num_bytes = writer.finalize()
-
-
-@pytest.mark.parametrize("writer_batch_size", [None, 2, 10])
-def test_duplicate_keys(writer_batch_size):
-    output = pa.BufferOutputStream()
-    with ArrowWriter(
-        stream=output,
-        writer_batch_size=writer_batch_size,
-        hash_salt="split_name",
-        check_duplicates=True,
-    ) as writer:
-        with pytest.raises(DuplicatedKeysError):
-            writer.write({"col_1": "foo", "col_2": 1}, key=10)
-            writer.write({"col_1": "bar", "col_2": 2}, key=10)
-            num_examples, num_bytes = writer.finalize()
-
-
 @pytest.mark.parametrize("writer_batch_size", [None, 2, 10])
 def test_write_with_keys(writer_batch_size):
     output = pa.BufferOutputStream()
     with ArrowWriter(
         stream=output,
         writer_batch_size=writer_batch_size,
-        hash_salt="split_name",
-        check_duplicates=True,
     ) as writer:
-        writer.write({"col_1": "foo", "col_2": 1}, key=1)
-        writer.write({"col_1": "bar", "col_2": 2}, key=2)
+        writer.write({"col_1": "foo", "col_2": 1})
+        writer.write({"col_1": "bar", "col_2": 2})
         num_examples, num_bytes = writer.finalize()
     assert num_examples == 2
     assert num_bytes > 0

tests/test_builder.py

Lines changed: 0 additions & 20 deletions
@@ -666,26 +666,6 @@ def test_generator_based_download_and_prepare(self):
                os.path.exists(os.path.join(tmp_dir, builder.dataset_name, "default", "0.0.0", "dataset_info.json"))
            )
 
-        # Test that duplicated keys are ignored if verification_mode is "no_checks"
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            builder = DummyGeneratorBasedBuilder(cache_dir=tmp_dir)
-            with patch("datasets.builder.ArrowWriter", side_effect=ArrowWriter) as mock_arrow_writer:
-                builder.download_and_prepare(
-                    download_mode=DownloadMode.FORCE_REDOWNLOAD, verification_mode=VerificationMode.NO_CHECKS
-                )
-                mock_arrow_writer.assert_called_once()
-                args, kwargs = mock_arrow_writer.call_args_list[0]
-                self.assertFalse(kwargs["check_duplicates"])
-
-                mock_arrow_writer.reset_mock()
-
-                builder.download_and_prepare(
-                    download_mode=DownloadMode.FORCE_REDOWNLOAD, verification_mode=VerificationMode.BASIC_CHECKS
-                )
-                mock_arrow_writer.assert_called_once()
-                args, kwargs = mock_arrow_writer.call_args_list[0]
-                self.assertTrue(kwargs["check_duplicates"])
-
     def test_cache_dir_no_args(self):
         with tempfile.TemporaryDirectory() as tmp_dir:
             builder = DummyGeneratorBasedBuilder(cache_dir=tmp_dir, data_dir=None, data_files=None)
