Skip to content

Commit 7156f0b

Browse files
committed
add docs
Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
1 parent 400d50e commit 7156f0b

File tree

1 file changed

+51
-22
lines changed

1 file changed

+51
-22
lines changed
Lines changed: 51 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,52 @@
1+
documentation: |
2+
Unlabeled data processing pipeline
3+
##################################
4+
5+
This pipeline processes unlabeled data for iterative pseudo-labeling training.
6+
7+
The pipeline performs the following steps:
8+
1. Creates initial manifest by searching all WAV files in raw_data_dir folder
9+
2. Counts duration of each WAV file
10+
3. Identifies the language using the langid_ambernet NeMo model
11+
4. Filters out audios that are marked with a different language tag
12+
5. Filters out audios that are too long to be processed
13+
6. Applies VAD algorithm from NeMo repository
14+
7. Forms segments by joining adjacent segments up to duration threshold
15+
8. Splits long audios into short segments
16+
9. Removes empty files and extra fields from the manifest
17+
18+
Required inputs:
19+
- workspace_dir: Directory for intermediate files that contains subfolders:
20+
${workspace_dir}/wavs/ - folder with source long files
21+
${workspace_dir}/sdp/ - folder to keep manifests
22+
${workspace_dir}/sdp/vad/ - folder to keep temporary files from VAD algorithm
23+
${workspace_dir}/splited_wavs/ - folder to keep split short files
24+
25+
- language_short: 2-letter language code
26+
- nemo_path: Path to NeMo installation
27+
- final_manifest: Path for final output manifest
28+
129
processors_to_run: "0:"
2-
workspace_dir: /mnt/ssd8/multilang/portuguese/yt/sdp
3-
final_manifest: ${workspace_dir}/final_manifest.json
4-
nemo_path: /home/nkarpov/workspace/NeMo_old
30+
workspace_dir: ??? # /mnt/ssd8/multilang/portuguese/yt
31+
manifest_dir: ${workspace_dir}/sdp
32+
language_short: pt
33+
nemo_path: ??? # /home/nkarpov/workspace/NeMo_old
34+
final_manifest: ${manifest_dir}/final_manifest.json
535

636
processors:
737
- _target_: sdp.processors.CreateInitialManifestByExt
8-
raw_data_dir: /mnt/ssd8/multilang/portuguese/yt/wavs
38+
raw_data_dir: ${workspace_dir}/wavs
939
extension: wav
1040
output_file_key: audio_filepath
11-
output_manifest_file: ${workspace_dir}/manifest0.json
41+
output_manifest_file: ${manifest_dir}/manifest0.json
1242

1343
- _target_: sdp.processors.GetAudioDuration
1444
audio_filepath_key: audio_filepath
1545
duration_key: duration
16-
output_manifest_file: ${workspace_dir}/manifest1.json
46+
output_manifest_file: ${manifest_dir}/manifest1.json
1747

1848
- _target_: sdp.processors.AudioLid
19-
output_manifest_file: ${workspace_dir}/manifest2.json
49+
output_manifest_file: ${manifest_dir}/manifest2.json
2050
input_audio_key: audio_filepath
2151
output_lang_key: audio_lang
2252
device: cuda
@@ -25,53 +55,52 @@ processors:
2555
num_segments: 3
2656

2757
- _target_: sdp.processors.PreserveByValue
28-
output_manifest_file: ${workspace_dir}/manifest3.json
58+
output_manifest_file: ${manifest_dir}/manifest3.json
2959
input_value_key: audio_lang
30-
target_value: pt
60+
target_value: ${language_short}
3161

3262
- _target_: sdp.processors.PreserveByValue
33-
output_manifest_file: ${workspace_dir}/manifest4.json
63+
output_manifest_file: ${manifest_dir}/manifest4.json
3464
input_value_key: duration
3565
operator: le
3666
target_value: 20000.0
3767

3868
- _target_: sdp.processors.Subprocess
39-
cmd: "rm -rf ${workspace_dir}/vad/*"
69+
cmd: "rm -rf ${manifest_dir}/vad/*"
4070

4171
- _target_: sdp.processors.Subprocess
42-
input_manifest_file: ${workspace_dir}/manifest4.json
43-
output_manifest_file: ${workspace_dir}/vad
72+
input_manifest_file: ${manifest_dir}/manifest4.json
73+
output_manifest_file: ${manifest_dir}/vad
4474
input_manifest_arg: "manifest_filepath"
4575
output_manifest_arg: "output_dir"
46-
cmd: "python ${nemo_path}/examples/asr/asr_vad/speech_to_text_with_vad.py audio_type=wav \
47-
vad_model=vad_multilingual_frame_marblenet vad_config=${nemo_path}/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml"
76+
cmd: "python ${nemo_path}/examples/asr/asr_vad/speech_to_text_with_vad.py audio_type=wav vad_model=vad_multilingual_frame_marblenet vad_config=${nemo_path}/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml"
4877

4978
- _target_: sdp.processors.RenameFields
50-
input_manifest_file: ${workspace_dir}/vad/temp_manifest_vad_rttm-onset0.3-offset0.3-pad_onset0.2-pad_offset0.2-min_duration_on0.2-min_duration_off0.2-filter_speech_firstTrue.json
51-
output_manifest_file: ${workspace_dir}/manifest7.json
79+
input_manifest_file: ${manifest_dir}/vad/temp_manifest_vad_rttm-onset0.3-offset0.3-pad_onset0.2-pad_offset0.2-min_duration_on0.2-min_duration_off0.2-filter_speech_firstTrue.json
80+
output_manifest_file: ${manifest_dir}/manifest7.json
5281
rename_fields: {"audio_filepath":"source_filepath"}
5382

5483
- _target_: sdp.processors.nemo.rttm.GetRttmSegments
55-
output_manifest_file: ${workspace_dir}/manifest8.json
84+
output_manifest_file: ${manifest_dir}/manifest8.json
5685
rttm_key: rttm_file
5786
output_file_key: audio_segments
5887
duration_key: duration
5988
duration_threshold: 20.0
6089

6190
- _target_: sdp.processors.nemo.rttm.SplitAudioFile
62-
output_manifest_file: ${workspace_dir}/manifest9.json
63-
splited_audio_dir: /mnt/ssd8/multilang/portuguese/yt/splited_wavs/
91+
output_manifest_file: ${manifest_dir}/manifest9.json
92+
splited_audio_dir: ${workspace_dir}/splited_wavs/
6493
segments_key: audio_segments
6594
duration_key: duration
6695
input_file_key: source_filepath
6796
output_file_key: audio_filepath
6897

6998
- _target_: sdp.processors.PreserveByValue
70-
output_manifest_file: ${workspace_dir}/manifest10.json
99+
output_manifest_file: ${manifest_dir}/manifest10.json
71100
input_value_key: duration
72101
operator: gt
73102
target_value: 0.0
74103

75104
- _target_: sdp.processors.KeepOnlySpecifiedFields
76-
output_manifest_file: ${workspace_dir}/manifest11.json
105+
output_manifest_file: ${final_manifest}
77106
fields_to_keep: ["audio_filepath", "duration"]

0 commit comments

Comments
 (0)