@@ -19,7 +19,9 @@
 import contextlib
 import copy
 import fnmatch
+import hashlib
 import inspect
+import json
 import os
 import posixpath
 import shutil
@@ -89,14 +91,26 @@
 from .utils.sharding import _number_of_shards_in_gen_kwargs, _split_gen_kwargs
 from .utils.track import tracked_list

-
 if TYPE_CHECKING:
     from .load import DatasetModule


 logger = logging.get_logger(__name__)


+def hash_dict(d):
+    """Hash a dictionary into a short hex string (8 characters)."""
+    def sanitize(obj):
+        if isinstance(obj, dict):
+            return {str(k): sanitize(v) for k, v in obj.items()}
+        elif isinstance(obj, (list, tuple)):
+            return [sanitize(i) for i in obj]
+        else:
+            return str(obj)
+    normalized = json.dumps(sanitize(d), sort_keys=True)
+    return hashlib.sha256(normalized.encode("utf-8")).hexdigest()[:8]
+
+
 class InvalidConfigName(ValueError):
     pass

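A quick standalone sanity check of the new `hash_dict` helper (the example kwargs below are made up): because `json.dumps(..., sort_keys=True)` normalizes the sanitized dict before hashing, key order does not change the digest.

```python
import hashlib
import json


def hash_dict(d):
    """Hash a dictionary into a short hex string (8 characters)."""
    def sanitize(obj):
        if isinstance(obj, dict):
            return {str(k): sanitize(v) for k, v in obj.items()}
        elif isinstance(obj, (list, tuple)):
            return [sanitize(i) for i in obj]
        else:
            return str(obj)
    normalized = json.dumps(sanitize(d), sort_keys=True)
    return hashlib.sha256(normalized.encode("utf-8")).hexdigest()[:8]


# Illustrative kwargs; key order does not affect the result.
a = hash_dict({"sep": ";", "data_files": ["train.csv", "test.csv"]})
b = hash_dict({"data_files": ["train.csv", "test.csv"], "sep": ";"})
assert a == b and len(a) == 8
```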
@@ -391,7 +405,7 @@ def __init__(
         if not is_remote_url(self._cache_dir_root):
             os.makedirs(self._cache_dir_root, exist_ok=True)
         lock_path = os.path.join(
-            self._cache_dir_root, Path(self._cache_dir).as_posix().replace("/", "_") + ".lock"
+            self._cache_dir_root, Path(self._relative_data_dir()).as_posix().replace("/", "_") + ".lock"
         )
         with FileLock(lock_path):
             if os.path.exists(self._cache_dir):  # check if data exist
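For context, a small sketch of how the lock filename is now derived from the relative data dir rather than the absolute cache dir (the paths below are made up for illustration):

```python
import os
from pathlib import Path

# Hypothetical values, for illustration only.
cache_dir_root = "/home/user/.cache/huggingface/datasets"
relative_data_dir = "my_dataset/default/0.0.0"

lock_path = os.path.join(
    cache_dir_root, Path(relative_data_dir).as_posix().replace("/", "_") + ".lock"
)
print(lock_path)
# /home/user/.cache/huggingface/datasets/my_dataset_default_0.0.0.lock
```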
@@ -577,11 +591,27 @@ def _create_builder_config(
                 download_config=DownloadConfig(token=self.token, storage_options=self.storage_options),
             )

-        # compute the config id that is going to be used for caching
+        runtime_only_config_keys = {"drop_metadata", "drop_labels", "drop_audio", "drop_text", "drop_images"}
+        hashable_config_kwargs = {k: v for k, v in config_kwargs.items() if k not in runtime_only_config_keys}
+
         config_id = builder_config.create_config_id(
-            config_kwargs,
+            hashable_config_kwargs,
             custom_features=custom_features,
         )
+
+        if (
+            builder_config.name in self.builder_configs
+            and builder_config != self.builder_configs[builder_config.name]
+        ):
+            builder_config.name = f"custom-{hash_dict(hashable_config_kwargs)}"
+            while builder_config.name in self.builder_configs:
+                builder_config.name += "-x"
+            config_id = builder_config.create_config_id(
+                hashable_config_kwargs,
+                custom_features=custom_features,
+            )
+            logger.info(f"Renamed conflicting config to: {builder_config.name}")
+
         is_custom = (config_id not in self.builder_configs) and config_id != "default"
         if is_custom:
             logger.info(f"Using custom data configuration {config_id}")
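The renaming logic above can be sketched in isolation. The helper below is hypothetical: it uses a plain set in place of `self.builder_configs`, omits the equality check against the already-registered config for brevity, and relies on the `hash_dict` helper added earlier in this diff.

```python
def resolve_conflicting_name(existing_names, requested_name, hashable_config_kwargs):
    # On a name collision, switch to a hash-based name and append "-x"
    # until the name is unique among the registered configs.
    name = requested_name
    if name in existing_names:
        name = f"custom-{hash_dict(hashable_config_kwargs)}"
        while name in existing_names:
            name += "-x"
    return name


existing = {"default", "plain_text"}
print(resolve_conflicting_name(existing, "default", {"sep": "\t"}))
# e.g. "custom-1a2b3c4d"
```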
@@ -1659,15 +1689,19 @@ def _prepare_split_single(
         shard_id = 0
         num_examples_progress_update = 0
         try:
+            path = fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}")
+            logger.debug("Creating directory: %s", os.path.dirname(path))
+            os.makedirs(os.path.dirname(path), exist_ok=True)
             writer = writer_class(
                 features=self.info.features,
-                path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
+                path=path,
                 writer_batch_size=self._writer_batch_size,
                 hash_salt=split_info.name,
                 check_duplicates=check_duplicate_keys,
                 storage_options=self._fs.storage_options,
                 embed_local_files=embed_local_files,
             )
+
             try:
                 _time = time.time()
                 for key, record in generator:
@@ -1678,9 +1712,12 @@ def _prepare_split_single(
                         total_num_examples += num_examples
                         total_num_bytes += num_bytes
                         shard_id += 1
+                        path = fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}")
+                        logger.debug("Creating directory: %s", os.path.dirname(path))
+                        os.makedirs(os.path.dirname(path), exist_ok=True)
                         writer = writer_class(
                             features=writer._features,
-                            path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
+                            path=path,
                             writer_batch_size=self._writer_batch_size,
                             hash_salt=split_info.name,
                             check_duplicates=check_duplicate_keys,
@@ -1908,9 +1945,12 @@ def _prepare_split_single(
         shard_id = 0
         num_examples_progress_update = 0
         try:
+            path = fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}")
+            logger.debug("Creating directory: %s", os.path.dirname(path))
+            os.makedirs(os.path.dirname(path), exist_ok=True)
             writer = writer_class(
                 features=self.info.features,
-                path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
+                path=path,
                 writer_batch_size=self._writer_batch_size,
                 storage_options=self._fs.storage_options,
                 embed_local_files=embed_local_files,
@@ -1925,9 +1965,12 @@ def _prepare_split_single(
                         total_num_examples += num_examples
                         total_num_bytes += num_bytes
                         shard_id += 1
+                        path = fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}")
+                        logger.debug("Creating directory: %s", os.path.dirname(path))
+                        os.makedirs(os.path.dirname(path), exist_ok=True)
                         writer = writer_class(
                             features=writer._features,
-                            path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
+                            path=path,
                             writer_batch_size=self._writer_batch_size,
                             storage_options=self._fs.storage_options,
                             embed_local_files=embed_local_files,
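The four `_prepare_split_single` hunks above all apply the same pattern: resolve the `SSSSS` (shard) and `JJJJJ` (job) placeholders up front, create the target directory, then pass the resolved `path` to the writer. A minimal sketch with a made-up `fpath` template:

```python
import os

# Hypothetical shard path template; the real one is built by the builder.
fpath = "/tmp/hf_cache/my_dataset-train-JJJJJ-SSSSS.arrow"
job_id, shard_id = 3, 0

path = fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}")
os.makedirs(os.path.dirname(path), exist_ok=True)  # ensure the directory exists before writing
print(path)  # /tmp/hf_cache/my_dataset-train-00003-00000.arrow
```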