Skip to content

Commit bfbd9a1

Browse files
committed
Print out where the biomed files come from and their sizes
1 parent 531feaf commit bfbd9a1

File tree

1 file changed

+3
-1
lines changed

1 file changed: +3 −1 lines changed

stanza/utils/datasets/prepare_tokenizer_treebank.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1298,7 +1298,9 @@ def build_bio_dataset(paths, udbase_dir, tokenizer_dir, handparsed_dir, short_na
12981298
else:
12991299
sents = []
13001300
bio_file = os.path.join(paths["BIO_UD_DIR"], "UD_English-%s" % bio_dataset.upper(), "en_%s-ud-%s.conllu" % (bio_dataset.lower(), dataset))
1301-
sents.extend(read_sentences_from_conllu(bio_file))
1301+
new_sents = read_sentences_from_conllu(bio_file)
1302+
print("Read %d sentences from %s" % (len(new_sents), bio_file))
1303+
sents.extend(new_sents)
13021304
write_sentences_to_conllu(output_conllu, sents)
13031305

13041306
def build_combined_english_gum_dataset(udbase_dir, tokenizer_dir, short_name, dataset, augment):

0 commit comments

Comments (0)