@@ -486,21 +486,27 @@ def from_sacr_dir(
486486 tokenizer : PreTrainedTokenizerFast ,
487487 max_span_size : int ,
488488 lang : str ,
489+ ignored_files : Optional [List [str ]] = None ,
489490 ** kwargs ,
490491 ) -> CoreferenceDataset :
491492 """
492493 :param path: path to a directory containing .sacr files
493494 :param tokenizer:
494495 :param max_span_size:
495496 :param lang: MosesTokenizer language ('en', 'fr', 'de'...)
497+ :param ignored_files: list of filenames to ignore
496498 :param kwargs: passed to ``open``
497499 """
498500 path = os .path .expanduser (path )
499501
500502 documents = []
501503 m_tokenizer = MosesTokenizer (lang = lang )
502504
503- for fpath in tqdm (sorted (glob .glob (f"{ path } /*.sacr" ))):
505+ paths = sorted (glob .glob (f"{ path } /*.sacr" ))
506+ if not ignored_files is None :
507+ paths = [p for p in paths if not os .path .basename (p ) in ignored_files ]
508+
509+ for fpath in tqdm (paths ):
504510 with open (fpath , ** kwargs ) as f :
505511 text = f .read ().replace ("\n " , " " )
506512
@@ -614,7 +620,11 @@ def load_fr_litbank_dataset(
614620):
615621 root_path = os .path .expanduser (root_path .rstrip ("/" ))
616622 return CoreferenceDataset .from_sacr_dir (
617- f"{ root_path } /sacr/All_Entites" , tokenizer , max_span_size , "en"
623+ f"{ root_path } /sacr/All_Entites" ,
624+ tokenizer ,
625+ max_span_size ,
626+ "en" ,
627+ ignored_files = ["schema.sacr" ],
618628 )
619629
620630
0 commit comments