1+ documentation : |
2+ Unlabeled data processing pipeline
3+ ############################
4+
5+ This pipeline processes unlabeled data for iterative pseudo-labeling training.
6+
7+ The pipeline performs the following steps:
8+ 1. Creates initial manifest by searching all WAV files in raw_data_dir folder
9+ 2. Counts duration of each WAV file
10+ 3. Identifies the language using the langid_ambernet NeMo model
11+ 4. Filters out audio files that are tagged with a different language
12+ 5. Filters out audio files that are too long to be processed
13+ 6. Applies VAD algorithm from NeMo repository
14+ 7. Forms segments by joining adjacent segments up to duration threshold
15+ 8. Splits long audios into short segments
16+ 9. Removes empty files and extra fields from the manifest
17+
18+ Required inputs:
19+ - workspace_dir: Directory for intermediate files that contains the subfolders:
20+ ${workspace_dir}/wavs/ - folder with the long source files
21+ ${workspace_dir}/sdp/ - folder to keep manifests
22+ ${workspace_dir}/sdp/vad/ - folder to keep temporary files from the VAD algorithm
23+ ${workspace_dir}/splited_wavs/ - folder to keep the short split files
24+
25+ - language_short: 2-letter language code
26+ - nemo_path: Path to NeMo installation
27+ - final_manifest: Path for final output manifest
28+
129processors_to_run : " 0:"
2- workspace_dir : /mnt/ssd8/multilang/portuguese/yt/sdp
3- final_manifest : ${workspace_dir}/final_manifest.json
4- nemo_path : /home/nkarpov/workspace/NeMo_old
30+ workspace_dir : ??? # /mnt/ssd8/multilang/portuguese/yt
31+ manifest_dir : ${workspace_dir}/sdp
32+ language_short : pt
33+ nemo_path : ??? # /home/nkarpov/workspace/NeMo_old
34+ final_manifest : ${manifest_dir}/final_manifest.json
535
636processors :
737 - _target_ : sdp.processors.CreateInitialManifestByExt
8- raw_data_dir : /mnt/ssd8/multilang/portuguese/yt /wavs
38+ raw_data_dir : ${workspace_dir} /wavs
939 extension : wav
1040 output_file_key : audio_filepath
11- output_manifest_file : ${workspace_dir }/manifest0.json
41+ output_manifest_file : ${manifest_dir }/manifest0.json
1242
1343 - _target_ : sdp.processors.GetAudioDuration
1444 audio_filepath_key : audio_filepath
1545 duration_key : duration
16- output_manifest_file : ${workspace_dir }/manifest1.json
46+ output_manifest_file : ${manifest_dir }/manifest1.json
1747
1848 - _target_ : sdp.processors.AudioLid
19- output_manifest_file : ${workspace_dir }/manifest2.json
49+ output_manifest_file : ${manifest_dir }/manifest2.json
2050 input_audio_key : audio_filepath
2151 output_lang_key : audio_lang
2252 device : cuda
@@ -25,53 +55,52 @@ processors:
2555 num_segments : 3
2656
2757 - _target_ : sdp.processors.PreserveByValue
28- output_manifest_file : ${workspace_dir }/manifest3.json
58+ output_manifest_file : ${manifest_dir }/manifest3.json
2959 input_value_key : audio_lang
30- target_value : pt
60+ target_value : ${language_short}
3161
3262 - _target_ : sdp.processors.PreserveByValue
33- output_manifest_file : ${workspace_dir }/manifest4.json
63+ output_manifest_file : ${manifest_dir }/manifest4.json
3464 input_value_key : duration
3565 operator : le
3666 target_value : 20000.0
3767
3868 - _target_ : sdp.processors.Subprocess
39- cmd : " rm -rf ${workspace_dir }/vad/*"
69+ cmd : " rm -rf ${manifest_dir }/vad/*"
4070
4171 - _target_ : sdp.processors.Subprocess
42- input_manifest_file : ${workspace_dir }/manifest4.json
43- output_manifest_file : ${workspace_dir }/vad
72+ input_manifest_file : ${manifest_dir }/manifest4.json
73+ output_manifest_file : ${manifest_dir }/vad
4474 input_manifest_arg : " manifest_filepath"
4575 output_manifest_arg : " output_dir"
46- cmd : " python ${nemo_path}/examples/asr/asr_vad/speech_to_text_with_vad.py audio_type=wav \
47- vad_model=vad_multilingual_frame_marblenet vad_config=${nemo_path}/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml"
76+ cmd : " python ${nemo_path}/examples/asr/asr_vad/speech_to_text_with_vad.py audio_type=wav vad_model=vad_multilingual_frame_marblenet vad_config=${nemo_path}/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml"
4877
4978 - _target_ : sdp.processors.RenameFields
50- input_manifest_file : ${workspace_dir }/vad/temp_manifest_vad_rttm-onset0.3-offset0.3-pad_onset0.2-pad_offset0.2-min_duration_on0.2-min_duration_off0.2-filter_speech_firstTrue.json
51- output_manifest_file : ${workspace_dir }/manifest7.json
79+ input_manifest_file : ${manifest_dir }/vad/temp_manifest_vad_rttm-onset0.3-offset0.3-pad_onset0.2-pad_offset0.2-min_duration_on0.2-min_duration_off0.2-filter_speech_firstTrue.json
80+ output_manifest_file : ${manifest_dir }/manifest7.json
5281 rename_fields : {"audio_filepath":"source_filepath"}
5382
5483 - _target_ : sdp.processors.nemo.rttm.GetRttmSegments
55- output_manifest_file : ${workspace_dir }/manifest8.json
84+ output_manifest_file : ${manifest_dir }/manifest8.json
5685 rttm_key : rttm_file
5786 output_file_key : audio_segments
5887 duration_key : duration
5988 duration_threshold : 20.0
6089
6190 - _target_ : sdp.processors.nemo.rttm.SplitAudioFile
62- output_manifest_file : ${workspace_dir }/manifest9.json
63- splited_audio_dir : /mnt/ssd8/multilang/portuguese/yt /splited_wavs/
91+ output_manifest_file : ${manifest_dir }/manifest9.json
92+ splited_audio_dir : ${workspace_dir} /splited_wavs/
6493 segments_key : audio_segments
6594 duration_key : duration
6695 input_file_key : source_filepath
6796 output_file_key : audio_filepath
6897
6998 - _target_ : sdp.processors.PreserveByValue
70- output_manifest_file : ${workspace_dir }/manifest10.json
99+ output_manifest_file : ${manifest_dir }/manifest10.json
71100 input_value_key : duration
72101 operator : gt
73102 target_value : 0.0
74103
75104 - _target_ : sdp.processors.KeepOnlySpecifiedFields
76- output_manifest_file : ${workspace_dir}/manifest11.json
105+ output_manifest_file : ${final_manifest}
77106 fields_to_keep : ["audio_filepath", "duration"]
0 commit comments