|
19 | 19 | parser.add_argument("--datasets_folder", help="Dataset folder", type=str, default=None) |
20 | 20 | parser.add_argument("--output_wav_dir", help="Output wav directory", type=str, default="processed_dataset") |
21 | 21 | parser.add_argument("--manifest_dir", default="input_manifests") |
| 22 | + parser.add_argument("--subset_pattern", default="nocasepunc_max30", type=str) |
22 | 23 | # Options for creating a tokenizer using all splits |
23 | 24 | parser.add_argument("--create_tokenizer", default=None, help="Folder to save tokenizer (if not set, no tokenizer is created)") |
24 | 25 | parser.add_argument("--vocab_size", help="Vocab size", type=int, default=1024) |
|
78 | 79 | datasets_folder, |
79 | 80 | dest=os.path.join(tmp_manifest_dir, "datasets_list", "train_datasets"), |
80 | 81 | mode="train", |
81 | | - subset_pattern="nocasepunc_max30", |
| 82 | + subset_pattern=args.subset_pattern, |
82 | 83 | ) |
83 | 84 | if args.test_input_datasets: |
84 | 85 | splits_to_process.append("test") |
|
87 | 88 | datasets_folder, |
88 | 89 | dest=os.path.join(tmp_manifest_dir, "datasets_list", "test_datasets"), |
89 | 90 | mode="test", |
90 | | - subset_pattern="nocasepunc_max30", |
| 91 | + subset_pattern=args.subset_pattern, |
91 | 92 | ) |
92 | 93 | if args.dev_input_datasets: |
93 | 94 | splits_to_process.append("dev") |
|
96 | 97 | datasets_folder, |
97 | 98 | dest=os.path.join(tmp_manifest_dir, "datasets_list", "dev_datasets"), |
98 | 99 | mode="dev", |
99 | | - subset_pattern="nocasepunc_max30", |
| 100 | + subset_pattern=args.subset_pattern, |
100 | 101 | ) |
101 | 102 | if len(splits_to_process) == 0: |
102 | 103 | raise ValueError("No splits to process") |
|
0 commit comments