Skip to content

Commit 58dda42

Browse files
authored
Don't save original_shard_lengths by default for backward compat (#7906)
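Don't save `original_shard_lengths` by default, for backward compatibility: writing it to the split info is now gated behind `config.SAVE_ORIGINAL_SHARD_LENGTHS`, which defaults to `False`.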
1 parent 8ea8fe5 commit 58dda42

File tree

3 files changed

+8
-2
lines changed

3 files changed

+8
-2
lines changed

src/datasets/builder.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1514,7 +1514,7 @@ def _rename_shard(shard_and_job: tuple[int]):
15141514
fpath.replace(SUFFIX, ""),
15151515
)
15161516

1517-
if total_original_shards > 1:
1517+
if total_original_shards > 1 and config.SAVE_ORIGINAL_SHARD_LENGTHS:
15181518
split_generator.split_info.original_shard_lengths = [
15191519
original_shard_length
15201520
for original_shard_lengths in original_shard_lengths_per_job
@@ -1792,7 +1792,7 @@ def _rename_shard(shard_id_and_job: tuple[int]):
17921792
fpath.replace(SUFFIX, ""),
17931793
)
17941794

1795-
if total_original_shards > 1:
1795+
if total_original_shards > 1 and config.SAVE_ORIGINAL_SHARD_LENGTHS:
17961796
split_generator.split_info.original_shard_lengths = [
17971797
original_shard_length
17981798
for original_shard_lengths in original_shard_lengths_per_job

src/datasets/config.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,9 @@
167167
DEFAULT_EXTRACTED_DATASETS_PATH = os.path.join(DEFAULT_DOWNLOADED_DATASETS_PATH, EXTRACTED_DATASETS_DIR)
168168
EXTRACTED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_EXTRACTED_DATASETS_PATH", DEFAULT_EXTRACTED_DATASETS_PATH))
169169

170+
# Cached dataset info options
171+
SAVE_ORIGINAL_SHARD_LENGTHS = False
172+
170173
# Download count for the website
171174
HF_UPDATE_DOWNLOAD_COUNTS = (
172175
os.environ.get("HF_UPDATE_DOWNLOAD_COUNTS", "AUTO").upper() in ENV_VARS_TRUE_AND_AUTO_VALUES

tests/conftest.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ def set_test_cache_config(tmp_path_factory, monkeypatch):
2727
test_extracted_datasets_path = test_hf_datasets_cache / "downloads" / "extracted"
2828
monkeypatch.setattr("datasets.config.EXTRACTED_DATASETS_PATH", str(test_extracted_datasets_path))
2929

30+
# Used by the dataset viewer; we may enable it by default in the future.
31+
monkeypatch.setattr("datasets.config.SAVE_ORIGINAL_SHARD_LENGTHS", True)
32+
3033

3134
@pytest.fixture(autouse=True)
3235
def disable_implicit_token(monkeypatch):

0 commit comments

Comments
 (0)