1919import contextlib
2020import copy
2121import fnmatch
22+ import hashlib
2223import inspect
24+ import json
2325import os
2426import posixpath
2527import shutil
8991from .utils .sharding import _number_of_shards_in_gen_kwargs , _split_gen_kwargs
9092from .utils .track import tracked_list
9193
92-
# Import DatasetModule only when a static type checker is running;
# a runtime import here would likely create a circular import with
# `.load` — TODO confirm against the full module.
if TYPE_CHECKING:
    from .load import DatasetModule


# Module-level logger, named after this module per the project's
# `logging.get_logger(__name__)` convention.
logger = logging.get_logger(__name__)
9899
99100
def hash_dict(d):
    """Hash a dictionary into a short hex string (8 characters)."""

    def _normalize(value):
        # Recursively coerce the structure into something JSON-serializable:
        # dict keys and all leaves become strings, tuples become lists.
        if isinstance(value, dict):
            return {str(key): _normalize(item) for key, item in value.items()}
        if isinstance(value, (list, tuple)):
            return [_normalize(item) for item in value]
        return str(value)

    # sort_keys makes the serialization order-independent, so equal dicts
    # always produce the same digest.
    canonical = json.dumps(_normalize(d), sort_keys=True)
    digest = hashlib.sha256(canonical.encode("utf-8"))
    return digest.hexdigest()[:8]
112+
113+
class InvalidConfigName(ValueError):
    """Raised when a dataset builder config name is not acceptable."""
@@ -391,7 +405,7 @@ def __init__(
391405 if not is_remote_url (self ._cache_dir_root ):
392406 os .makedirs (self ._cache_dir_root , exist_ok = True )
393407 lock_path = os .path .join (
394- self ._cache_dir_root , Path (self ._cache_dir ).as_posix ().replace ("/" , "_" ) + ".lock"
408+ self ._cache_dir_root , Path (self ._relative_data_dir () ).as_posix ().replace ("/" , "_" ) + ".lock"
395409 )
396410 with FileLock (lock_path ):
397411 if os .path .exists (self ._cache_dir ): # check if data exist
@@ -577,11 +591,27 @@ def _create_builder_config(
577591 download_config = DownloadConfig (token = self .token , storage_options = self .storage_options ),
578592 )
579593
580- # compute the config id that is going to be used for caching
594+ runtime_only_config_keys = {"drop_metadata" , "drop_labels" , "drop_audio" , "drop_text" , "drop_images" }
595+ hashable_config_kwargs = {k : v for k , v in config_kwargs .items () if k not in runtime_only_config_keys }
596+
581597 config_id = builder_config .create_config_id (
582- config_kwargs ,
598+ hashable_config_kwargs ,
583599 custom_features = custom_features ,
584600 )
601+
602+ if (
603+ builder_config .name in self .builder_configs
604+ and builder_config != self .builder_configs [builder_config .name ]
605+ ):
606+ builder_config .name = f"custom-{ hash_dict (hashable_config_kwargs )} "
607+ while builder_config .name in self .builder_configs :
608+ builder_config .name += "-x"
609+ config_id = builder_config .create_config_id (
610+ hashable_config_kwargs ,
611+ custom_features = custom_features ,
612+ )
613+ logger .info (f"Renamed conflicting config to: { builder_config .name } " )
614+
585615 is_custom = (config_id not in self .builder_configs ) and config_id != "default"
586616 if is_custom :
587617 logger .info (f"Using custom data configuration { config_id } " )
@@ -1659,15 +1689,19 @@ def _prepare_split_single(
16591689 shard_id = 0
16601690 num_examples_progress_update = 0
16611691 try :
1692+ path = fpath .replace ("SSSSS" , f"{ shard_id :05d} " ).replace ("JJJJJ" , f"{ job_id :05d} " )
1693+ logger .debug ("Creating directory: %s" , os .path .dirname (path ))
1694+ os .makedirs (os .path .dirname (path ), exist_ok = True )
16621695 writer = writer_class (
16631696 features = self .info .features ,
1664- path = fpath . replace ( "SSSSS" , f" { shard_id :05d } " ). replace ( "JJJJJ" , f" { job_id :05d } " ) ,
1697+ path = path ,
16651698 writer_batch_size = self ._writer_batch_size ,
16661699 hash_salt = split_info .name ,
16671700 check_duplicates = check_duplicate_keys ,
16681701 storage_options = self ._fs .storage_options ,
16691702 embed_local_files = embed_local_files ,
16701703 )
1704+
16711705 try :
16721706 _time = time .time ()
16731707 for key , record in generator :
@@ -1678,9 +1712,12 @@ def _prepare_split_single(
16781712 total_num_examples += num_examples
16791713 total_num_bytes += num_bytes
16801714 shard_id += 1
1715+ path = fpath .replace ("SSSSS" , f"{ shard_id :05d} " ).replace ("JJJJJ" , f"{ job_id :05d} " )
1716+ logger .debug ("Creating directory: %s" , os .path .dirname (path ))
1717+ os .makedirs (os .path .dirname (path ), exist_ok = True )
16811718 writer = writer_class (
16821719 features = writer ._features ,
1683- path = fpath . replace ( "SSSSS" , f" { shard_id :05d } " ). replace ( "JJJJJ" , f" { job_id :05d } " ) ,
1720+ path = path ,
16841721 writer_batch_size = self ._writer_batch_size ,
16851722 hash_salt = split_info .name ,
16861723 check_duplicates = check_duplicate_keys ,
@@ -1908,9 +1945,12 @@ def _prepare_split_single(
19081945 shard_id = 0
19091946 num_examples_progress_update = 0
19101947 try :
1948+ path = fpath .replace ("SSSSS" , f"{ shard_id :05d} " ).replace ("JJJJJ" , f"{ job_id :05d} " )
1949+ logger .debug ("Creating directory: %s" , os .path .dirname (path ))
1950+ os .makedirs (os .path .dirname (path ), exist_ok = True )
19111951 writer = writer_class (
19121952 features = self .info .features ,
1913- path = fpath . replace ( "SSSSS" , f" { shard_id :05d } " ). replace ( "JJJJJ" , f" { job_id :05d } " ) ,
1953+ path = path ,
19141954 writer_batch_size = self ._writer_batch_size ,
19151955 storage_options = self ._fs .storage_options ,
19161956 embed_local_files = embed_local_files ,
@@ -1925,9 +1965,12 @@ def _prepare_split_single(
19251965 total_num_examples += num_examples
19261966 total_num_bytes += num_bytes
19271967 shard_id += 1
1968+ path = fpath .replace ("SSSSS" , f"{ shard_id :05d} " ).replace ("JJJJJ" , f"{ job_id :05d} " )
1969+ logger .debug ("Creating directory: %s" , os .path .dirname (path ))
1970+ os .makedirs (os .path .dirname (path ), exist_ok = True )
19281971 writer = writer_class (
19291972 features = writer ._features ,
1930- path = fpath . replace ( "SSSSS" , f" { shard_id :05d } " ). replace ( "JJJJJ" , f" { job_id :05d } " ) ,
1973+ path = path ,
19311974 writer_batch_size = self ._writer_batch_size ,
19321975 storage_options = self ._fs .storage_options ,
19331976 embed_local_files = embed_local_files ,
0 commit comments