Skip to content

Commit abe76f7

Browse files
committed
add subset arg to pipeline
1 parent 7b3b2db commit abe76f7

File tree

2 files changed

+6
-5
lines changed

2 files changed

+6
-5
lines changed

tools/nemo/generate_dataset_list_files.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,13 @@
77
logger = logging.getLogger(__name__)
88

99

10-
def generate_dataset_list_files(dateset_list, dataset_folder, dest, mode, subset_pattern):
10+
def generate_dataset_list_files(dataset_list, dataset_folder, dest, mode, subset_pattern):
1111
if os.path.exists(dest):
1212
logger.info(f"Reading dataset list from {dest} (already exists)")
1313
with open(dest) as f:
1414
return f.read().strip().split("\n")
1515
new_list = []
16-
with open(dateset_list) as f:
16+
with open(dataset_list) as f:
1717
datasets = f.read().strip().split("\n")
1818

1919
patterns = ""

tools/nemo/pipeline_prepare_nemo_data.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
parser.add_argument("--datasets_folder", help="Dataset folder", type=str, default=None)
2020
parser.add_argument("--output_wav_dir", help="Output wav directory", type=str, default="processed_dataset")
2121
parser.add_argument("--manifest_dir", default="input_manifests")
22+
parser.add_argument("--subset_pattern", default="nocasepunc_max30", type=str)
2223
# Options for creating a tokenizer using all splits
2324
parser.add_argument("--create_tokenizer", default=None, help="Folder to save tokenizer (if not set, no tokenizer is created)")
2425
parser.add_argument("--vocab_size", help="Vocab size", type=int, default=1024)
@@ -78,7 +79,7 @@
7879
datasets_folder,
7980
dest=os.path.join(tmp_manifest_dir, "datasets_list", "train_datasets"),
8081
mode="train",
81-
subset_pattern="nocasepunc_max30",
82+
subset_pattern=args.subset_pattern,
8283
)
8384
if args.test_input_datasets:
8485
splits_to_process.append("test")
@@ -87,7 +88,7 @@
8788
datasets_folder,
8889
dest=os.path.join(tmp_manifest_dir, "datasets_list", "test_datasets"),
8990
mode="test",
90-
subset_pattern="nocasepunc_max30",
91+
subset_pattern=args.subset_pattern,
9192
)
9293
if args.dev_input_datasets:
9394
splits_to_process.append("dev")
@@ -96,7 +97,7 @@
9697
datasets_folder,
9798
dest=os.path.join(tmp_manifest_dir, "datasets_list", "dev_datasets"),
9899
mode="dev",
99-
subset_pattern="nocasepunc_max30",
100+
subset_pattern=args.subset_pattern,
100101
)
101102
if len(splits_to_process) == 0:
102103
raise ValueError("No splits to process")

0 commit comments

Comments (0)