Skip to content

Commit a839f0c

Browse files
committed
Ignore non-sacr files (schema.sacr) when loading fr-litbank
1 parent 2d0fd07 commit a839f0c

File tree

1 file changed

+12
-2
lines changed

1 file changed

+12
-2
lines changed

tibert/bertcoref.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -486,21 +486,27 @@ def from_sacr_dir(
486486
tokenizer: PreTrainedTokenizerFast,
487487
max_span_size: int,
488488
lang: str,
489+
ignored_files: Optional[List[str]] = None,
489490
**kwargs,
490491
) -> CoreferenceDataset:
491492
"""
492493
:param path: path to a directory containing .sacr files
493494
:param tokenizer:
494495
:param max_span_size:
495496
:param lang: MosesTokenizer language ('en', 'fr', 'de'...)
497+
:param ignored_files: list of filenames to ignore
496498
:param kwargs: passed to ``open``
497499
"""
498500
path = os.path.expanduser(path)
499501

500502
documents = []
501503
m_tokenizer = MosesTokenizer(lang=lang)
502504

503-
for fpath in tqdm(sorted(glob.glob(f"{path}/*.sacr"))):
505+
paths = sorted(glob.glob(f"{path}/*.sacr"))
506+
if not ignored_files is None:
507+
paths = [p for p in paths if not os.path.basename(p) in ignored_files]
508+
509+
for fpath in tqdm(paths):
504510
with open(fpath, **kwargs) as f:
505511
text = f.read().replace("\n", " ")
506512

@@ -614,7 +620,11 @@ def load_fr_litbank_dataset(
614620
):
615621
root_path = os.path.expanduser(root_path.rstrip("/"))
616622
return CoreferenceDataset.from_sacr_dir(
617-
f"{root_path}/sacr/All_Entites", tokenizer, max_span_size, "en"
623+
f"{root_path}/sacr/All_Entites",
624+
tokenizer,
625+
max_span_size,
626+
"en",
627+
ignored_files=["schema.sacr"],
618628
)
619629

620630

0 commit comments

Comments
 (0)