@@ -31,6 +31,7 @@ class FolderBasedBuilderConfig(datasets.BuilderConfig):
3131 features : Optional [datasets .Features ] = None
3232 drop_labels : bool = None
3333 drop_metadata : bool = None
34+ metadata_filenames : list [str ] = None
3435 filters : Optional [Union [ds .Expression , list [tuple ], list [list [tuple ]]]] = None
3536
3637 def __post_init__ (self ):
@@ -76,6 +77,7 @@ def _split_generators(self, dl_manager):
7677 do_analyze = not self .config .drop_labels or not self .config .drop_metadata
7778 labels , path_depths = set (), set ()
7879 metadata_files = collections .defaultdict (set )
80+ metadata_filenames = self .config .metadata_filenames or self .METADATA_FILENAMES
7981
8082 def analyze (files_or_archives , downloaded_files_or_dirs , split ):
8183 if len (downloaded_files_or_dirs ) == 0 :
@@ -91,12 +93,12 @@ def analyze(files_or_archives, downloaded_files_or_dirs, split):
9193 if not self .config .drop_labels :
9294 labels .add (os .path .basename (os .path .dirname (original_file )))
9395 path_depths .add (count_path_segments (original_file ))
94- elif os .path .basename (original_file ) in self . METADATA_FILENAMES :
96+ elif os .path .basename (original_file ) in metadata_filenames :
9597 metadata_files [split ].add ((original_file , downloaded_file ))
9698 else :
9799 original_file_name = os .path .basename (original_file )
98100 logger .debug (
99- f"The file '{ original_file_name } ' was ignored: it is not a { self .BASE_COLUMN_NAME } , and is not { self . METADATA_FILENAMES } either."
101+ f"The file '{ original_file_name } ' was ignored: it is not a { self .BASE_COLUMN_NAME } , and is not { metadata_filenames } either."
100102 )
101103 else :
102104 archives , downloaded_dirs = files_or_archives , downloaded_files_or_dirs
@@ -108,13 +110,13 @@ def analyze(files_or_archives, downloaded_files_or_dirs, split):
108110 if not self .config .drop_labels :
109111 labels .add (os .path .basename (os .path .dirname (downloaded_dir_file )))
110112 path_depths .add (count_path_segments (downloaded_dir_file ))
111- elif os .path .basename (downloaded_dir_file ) in self . METADATA_FILENAMES :
113+ elif os .path .basename (downloaded_dir_file ) in metadata_filenames :
112114 metadata_files [split ].add ((None , downloaded_dir_file ))
113115 else :
114116 archive_file_name = os .path .basename (archive )
115117 original_file_name = os .path .basename (downloaded_dir_file )
116118 logger .debug (
117- f"The file '{ original_file_name } ' from the archive '{ archive_file_name } ' was ignored: it is not a { self .BASE_COLUMN_NAME } , and is not { self . METADATA_FILENAMES } either."
119+ f"The file '{ original_file_name } ' from the archive '{ archive_file_name } ' was ignored: it is not a { self .BASE_COLUMN_NAME } , and is not { metadata_filenames } either."
118120 )
119121
120122 data_files = self .config .data_files
@@ -257,11 +259,12 @@ def _set_feature(feature):
257259
258260 def _split_files_and_archives (self , data_files ):
259261 files , archives = [], []
262+ metadata_filenames = self .config .metadata_filenames or self .METADATA_FILENAMES
260263 for data_file in data_files :
261264 _ , data_file_ext = os .path .splitext (data_file )
262265 if data_file_ext .lower () in self .EXTENSIONS :
263266 files .append (data_file )
264- elif os .path .basename (data_file ) in self . METADATA_FILENAMES :
267+ elif os .path .basename (data_file ) in metadata_filenames :
265268 files .append (data_file )
266269 else :
267270 archives .append (data_file )
0 commit comments