diff --git a/README.md b/README.md index 7132a031..d56406d7 100644 --- a/README.md +++ b/README.md @@ -26,13 +26,20 @@ The full API is described in the documentation page [https://hyperion-ml.readthe ### Prerequisites We use anaconda or miniconda, though you should be able to make it work in other python distributions - To start, you should create a new enviroment and install PyTorch>=1.9, (older versions are not supported any longer) e.g.: + To start, you should create a new environment and install PyTorch: ``` -conda create --name ${your_env} python=3.8 +conda create --name ${your_env} python=3.11 conda activate ${your_env} -conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=10.2 -c pytorch +conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia ``` -In next Hyperion versions, we will upgrade to Pytorch>=1.9 and drop compatibility with older PyTorch versions. + +For systems with a CUDA 10.2 driver: +``` +conda create --name ${your_env} python=3.10 +conda activate ${your_env} +conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=10.2 -c pytorch +``` + ### Installing Hyperion diff --git a/egs/commonvoice/v1/cmd.sh b/egs/commonvoice/v1/cmd.sh new file mode 100755 index 00000000..697d5219 --- /dev/null +++ b/egs/commonvoice/v1/cmd.sh @@ -0,0 +1,33 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
+ +if [ "$(hostname -d)" == "cm.gemini" ];then + export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 10G" + #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +elif [ "$(hostname -d)" == "cm.cluster" ];then + export train_cmd="slurm.pl --config conf/slurm.conf --mem 4G" + export cuda_cmd="slurm.pl --config conf/slurm.conf --mem 20G" + export cuda_eval_cmd="$train_cmd" +else + export train_cmd="run.pl" + export cuda_cmd="run.pl" + export cuda_eval_cmd="$train_cmd" + #export train_cmd="queue.pl --config conf/clsp.conf --mem 4G " + #export cuda_cmd="queue.pl --config conf/clsp.conf --mem 20G" + #export cuda_eval_cmd="$train_cmd" +fi + diff --git a/egs/commonvoice/v1/conf/clsp.conf b/egs/commonvoice/v1/conf/clsp.conf new file mode 100644 index 00000000..1c75f327 --- /dev/null +++ b/egs/commonvoice/v1/conf/clsp.conf @@ -0,0 +1,16 @@ + +# Default configuration +command sbatch --export=PATH +#command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* --mem-per-cpu $0 +# option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 +option gpu=* -p GPU-shared --gres=gpu:$0 -c $0 # Recommend allocating more CPU than, or equal to the number of GPU +#option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[1345679]*|c2[12357]*' +#option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' + diff --git a/egs/commonvoice/v1/conf/coe_gpu_bigmem.conf b/egs/commonvoice/v1/conf/coe_gpu_bigmem.conf new file mode 100644 index 00000000..a7a2ce40 --- /dev/null +++ b/egs/commonvoice/v1/conf/coe_gpu_bigmem.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]* diff --git a/egs/commonvoice/v1/conf/coe_gpu_long.conf b/egs/commonvoice/v1/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/commonvoice/v1/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/commonvoice/v1/conf/coe_gpu_rtx.conf b/egs/commonvoice/v1/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/commonvoice/v1/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y 
-sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff --git a/egs/commonvoice/v1/conf/coe_gpu_short.conf b/egs/commonvoice/v1/conf/coe_gpu_short.conf new file mode 100644 index 00000000..81de5cb7 --- /dev/null +++ b/egs/commonvoice/v1/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/commonvoice/v1/conf/coe_gpu_v100.conf b/egs/commonvoice/v1/conf/coe_gpu_v100.conf new file mode 100644 index 00000000..69326b82 --- /dev/null +++ b/egs/commonvoice/v1/conf/coe_gpu_v100.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100 diff --git a/egs/commonvoice/v1/conf/infer.yaml b/egs/commonvoice/v1/conf/infer.yaml new file mode 100644 index 00000000..1f0ebfa7 --- /dev/null +++ b/egs/commonvoice/v1/conf/infer.yaml @@ -0,0 +1,2 @@ +beam_width: 5 +decoding_method: time_sync_beam_search \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/reverb_noise_aug.yaml b/egs/commonvoice/v1/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..4fdf8068 --- /dev/null +++ b/egs/commonvoice/v1/conf/reverb_noise_aug.yaml @@ -0,0 +1,35 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + min_snr: 3 + max_snr: 18 + diff --git a/egs/commonvoice/v1/conf/slurm.conf b/egs/commonvoice/v1/conf/slurm.conf new file mode 100644 index 00000000..423d9133 --- /dev/null +++ b/egs/commonvoice/v1/conf/slurm.conf @@ -0,0 +1,15 @@ +# Default configuration +command sbatch --export=PATH +option name=* --job-name $0 +default time=24:00:00 +option time=* --time $0 +option mem=* --mem-per-cpu $0 +option mem=0 +option num_threads=* --cpus-per-task $0 +option num_threads=1 --cpus-per-task 1 +option num_nodes=* --nodes $0 +default gpu=0 +option gpu=0 +option gpu=* -p a100 -A jvillal7_gpu --ntasks-per-node 4 --gres=gpu:$0 -c $0 # Recommend allocating more CPU than, or equal to the number of GPU +# note: the --max-jobs-run option is supported as a special 
case +# by slurm.pl and you don't have to handle it in the config file. diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml new file mode 100644 index 00000000..15e06f93 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml @@ -0,0 +1,91 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40 + max_audio_length: 15. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.5 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + joiner_type: film_joiner + hid_feats: 512 + feat_fusion_method: film-weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v2.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v2.0.yaml new file mode 100644 index 00000000..ba71c8ff --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v2.0.yaml @@ -0,0 +1,91 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40 + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 256 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm_residual + joiner: + hid_feats: 512 + feat_fusion_method: film-weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml new file mode 100644 index 00000000..b391f50c --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml @@ -0,0 +1,77 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40. + max_audio_length: 20. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40. + max_audio_length: 20. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + transducer: + decoder: + prune_range: 15 + joiner: + joiner_type: film_joiner +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml new file mode 100644 index 00000000..a9a755ee --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml @@ -0,0 +1,77 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15. + max_audio_length: 15. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + transducer: + decoder: + prune_range: 15 + reduction: mean + override_dropouts: false +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.0.yaml new file mode 100644 index 00000000..48ad726c --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.0.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 256 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm_residual + joiner: + hid_feats: 512 + feat_fusion_method: film-weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.1.yaml new file mode 100644 index 00000000..d6c995e8 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.1.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 256 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.yaml new file mode 100644 index 00000000..db1005b1 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 256 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.3.yaml new file mode 100644 index 00000000..e436c876 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.3.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 256 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-fused-feature + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml new file mode 100644 index 00000000..208a094c --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml @@ -0,0 +1,94 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + film_type: tanh + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 256 + predictor: + film_type: tanh + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml new file mode 100644 index 00000000..6d7317f7 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-fused-feature + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.8 + decay_steps: 45000 + hold_steps: 40000 + min_lr: 4e-5 + warmup_steps: 3000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.6.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.6.yaml new file mode 100644 index 00000000..a3f25ffd --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.6.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-fused-feature + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.0001 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.8 + decay_steps: 45000 + hold_steps: 40000 + min_lr: 4e-5 + warmup_steps: 3000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v6.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v6.0.yaml new file mode 100644 index 00000000..d2f01bd9 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v6.0.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-fused-feature + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.0001 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 4500 + hold_steps: 4000 + min_lr: 4e-5 + warmup_steps: 3000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v7.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v7.0.yaml new file mode 100644 index 00000000..9ab275a6 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v7.0.yaml @@ -0,0 +1,98 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + # override_condition: true + use_condition: true + condition_size: 128 + condition_components: + - attention + condition_type: "one-hot" + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-fused-feature + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 30000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml new file mode 100644 index 00000000..7a5b5dd1 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15 + max_audio_length: 12. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15 + max_audio_length: 12. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + transducer: + decoder: + prune_range: 15 + reduction: mean +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_4langs_stage1_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_4langs_stage1_v4.0.yaml new file mode 100644 index 00000000..465cfcdb --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_4langs_stage1_v4.0.yaml @@ -0,0 +1,87 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 1.0 + num_chunks_per_seg_epoch: 0.6 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + reduction: mean + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml new file mode 100644 index 00000000..39c61fa7 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml @@ -0,0 +1,83 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50. 
+ min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.0.yaml new file mode 100644 index 00000000..4718389d --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.0.yaml @@ -0,0 +1,86 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 80 + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 1.0 + num_chunks_per_seg_epoch: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 80 + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 1.0 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.2 + rnn_dropout_rate: 0.2 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + 
use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.1.yaml new file mode 100644 index 00000000..f41f8dad --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.1.yaml @@ -0,0 +1,86 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.2.yaml new file mode 100644 index 00000000..fbadc196 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.2.yaml @@ -0,0 +1,86 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40 + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.2 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40 + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + 
base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 10 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml new file mode 100644 index 00000000..7e059b3b --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml @@ -0,0 +1,86 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v6.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v6.0.yaml new file mode 100644 index 00000000..54ccd48e --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v6.0.yaml @@ -0,0 +1,88 @@ +# for LoRA ASR +data: + train: + dataset: + 
wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + use_lora: true + transducer: + decoder: + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-lora + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.1.yaml new file mode 100644 index 00000000..9db63d77 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.1.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40. + max_audio_length: 20. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40. + max_audio_length: 20. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + transducer: + decoder: + prune_range: 15 + override_dropouts: false +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.2.yaml new file mode 100644 index 00000000..85970fa6 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.2.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40 + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.2 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40 + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + transducer: + decoder: + override_dropouts: false +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml new file mode 100644 index 00000000..2833099f --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15. + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15. + max_audio_length: 15. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + transducer: + decoder: + prune_range: 15 + override_dropouts: false + reduction: mean +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml new file mode 100644 index 00000000..43e6ba3a --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml @@ -0,0 +1,135 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15. + max_audio_length: 15. 
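These recipes pair a small per-step batch (capped by `max_batch_length` seconds of audio per GPU) with a much larger `eff_batch_size`; the trainer presumably reaches the effective batch by accumulating gradients over several steps and GPUs before each optimizer update. A back-of-the-envelope sketch of that bookkeeping, with assumed per-step batch size and world size:

```python
def grad_accum_steps(eff_batch_size: int, batch_size_per_gpu: int, num_gpus: int) -> int:
    """Forward/backward passes to accumulate before one optimizer step (ceil division)."""
    global_batch = batch_size_per_gpu * num_gpus
    return max(1, -(-eff_batch_size // global_batch))


# e.g. roughly 8 utterances fit in one duration-capped batch on each of 4 GPUs
print(grad_accum_steps(eff_batch_size=128, batch_size_per_gpu=8, num_gpus=4))  # 4
```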
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + rnnt_loss: k2_pruned + reduction: mean + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + + languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 1024 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. + dropout_rate: 0.3 + hid_act: swish + + loss_weight_transducer: 0.1 + loss_weight_lid: 1.0 + lid_length: 3.0 + # feat_fusion_method: weighted-avg + feat_fusion_start: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml new file mode 100644 index 00000000..faa265a3 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml +trainer: + optim: + opt_type: sgd + lr: 0.15 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + #decay_steps: 4200 + #hold_steps: 1500 + decay_steps: 16000 + hold_steps: 32000 + min_lr: 4e-4 + warmup_steps: 5000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git 
a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.1.yaml new file mode 100644 index 00000000..cf1a549f --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.1.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + #decay_steps: 4200 + #hold_steps: 1500 + decay_steps: 16000 + hold_steps: 32000 + min_lr: 4e-4 + warmup_steps: 5000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.2.yaml new file mode 100644 index 00000000..12b8c371 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.2.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn1024x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.15 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + #decay_steps: 4200 + #hold_steps: 1500 + decay_steps: 16000 + hold_steps: 32000 + min_lr: 4e-4 + warmup_steps: 5000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v5.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v5.1.yaml new file mode 100644 index 00000000..a15272d4 --- 
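The `exp_lr` scheduler used throughout these configs appears to warm the learning rate up over `warmup_steps`, hold it for `hold_steps`, then decay it by `decay_rate` every `decay_steps`, with `min_lr` as a floor. A sketch under that reading (not the exact Hyperion code; whether the hold is counted from step 0 or from the end of warmup is an assumption), using numbers like those in the v4.x configs above:

```python
def exp_lr(step, lr=0.15, warmup_steps=5000, hold_steps=32000,
           decay_steps=16000, decay_rate=0.5, min_lr=4e-4):
    """Piecewise schedule: linear warmup, hold, then exponential decay with a floor."""
    if step < warmup_steps:
        return lr * step / max(1, warmup_steps)
    if step < warmup_steps + hold_steps:
        return lr
    decayed = lr * decay_rate ** ((step - warmup_steps - hold_steps) / decay_steps)
    return max(decayed, min_lr)


for s in (1000, 20000, 60000, 150000):
    print(s, round(exp_lr(s), 5))
```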
/dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v5.1.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 128 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn1024x3_do0.5.yaml +trainer: + optim: + opt_type: sgd + lr: 0.15 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + #decay_steps: 4200 + #hold_steps: 1500 + decay_steps: 16000 + hold_steps: 32000 + min_lr: 4e-4 + warmup_steps: 5000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml new file mode 100644 index 00000000..221698d0 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml @@ -0,0 +1,67 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.2 + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + languageid: + cos_scale: 32.0 +trainer: + optim: + opt_type: sgd + lr: 0.0005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + #decay_steps: 4200 + #hold_steps: 1500 + decay_steps: 16000 + hold_steps: 32000 + min_lr: 4e-4 + warmup_steps: 5000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 1024 + train_mode: full + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..afe885a3 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + sampler: + 
#sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 2 + drop_last: false + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 2 + drop_last: true + data_loader: + num_workers: 1 +model: wav2vec2xlsr300m_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..c06e46e8 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,67 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 2 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.5 + + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 2 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.5 + data_loader: + num_workers: 1 +model: wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml new file mode 100644 index 00000000..0bb34b23 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 85. 
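In the bucketing samplers, `max_batch_length` is expressed in seconds of audio rather than in utterances: segments of similar duration are grouped and a batch is filled until its total length would exceed the cap, so long utterances yield smaller batches. A simplified sketch of that packing, assuming durations are known up front:

```python
from typing import List


def pack_batches(durations: List[float], max_batch_length: float,
                 min_batch_size: int = 1) -> List[List[int]]:
    """Greedily pack utterance indices, sorted by length, into duration-capped batches."""
    order = sorted(range(len(durations)), key=lambda i: durations[i])
    batches, cur, cur_len = [], [], 0.0
    for i in order:
        if cur and cur_len + durations[i] > max_batch_length and len(cur) >= min_batch_size:
            batches.append(cur)
            cur, cur_len = [], 0.0
        cur.append(i)
        cur_len += durations[i]
    if cur:
        batches.append(cur)
    return batches


durs = [3.2, 7.5, 12.0, 4.1, 18.3, 6.6, 9.9]
print(pack_batches(durs, max_batch_length=30.0))
```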
+ min_batch_size: 2 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 2 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml new file mode 100644 index 00000000..8c62ac1b --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 85. 
+ min_batch_size: 2 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 2 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 420000 + hold_steps: 300000 + min_lr: 4e-5 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v3.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v3.0.yaml new file mode 100644 index 00000000..d409fb47 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v3.0.yaml @@ -0,0 +1,67 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + drop_last: false + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + drop_last: false + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 30000 + hold_steps: 16000 + min_lr: 4e-5 + warmup_steps: 3000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.0.yaml new file mode 100644 index 00000000..dc654278 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.0.yaml @@ -0,0 +1,68 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: 
"data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.2.yaml new file mode 100644 index 00000000..962af029 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.2.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + loss: weightedCE + loss_weight_exp: 0.5 # 0~1 + # focal_loss_gamma: 2.0 + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.3.yaml new file mode 100644 index 00000000..3918b04f --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.3.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + num_hard_prototypes: 8 + 
data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + num_hard_prototypes: 8 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + loss: weightedCE + loss_weight_exp: 0.5 # 0~1 + # focal_loss_gamma: 2.0 + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.4.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.4.yaml new file mode 100644 index 00000000..17a13388 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.4.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + num_hard_prototypes: 8 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + num_hard_prototypes: 8 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.0.yaml new file mode 100644 index 00000000..061014e0 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 96 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 4 + val: + dataset: 
+ aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 96 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_ecapatdnn512x3_1layer_feat6.yaml +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + loss: weightedCE + loss_weight_exp: 1.0 # 0~1 + # focal_loss_gamma: 2.0 + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.1.yaml new file mode 100644 index 00000000..4bd1ad28 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.1.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 96 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 96 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_ecapatdnn512x3_1layer_feat12.yaml +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + loss: weightedCE + loss_weight_exp: 1.0 # 0~1 + # focal_loss_gamma: 2.0 + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml new file mode 100644 index 00000000..a40db186 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50. + max_audio_length: 20. 
+ min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50. + max_audio_length: 20. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + languageid: + cos_scale: 32.0 +trainer: + optim: + opt_type: sgd + lr: 0.0001 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 60000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.1.yaml new file mode 100644 index 00000000..b03a0282 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.1.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'seg_chunk_sampler' + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # # weighted + # weight_mode: "data-prior" + # class_name: "language" + # weight_exponent: 1.0 + # num_chunks_per_seg_epoch: 0.3 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - speaker + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'seg_chunk_sampler' + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # # weighted + # weight_mode: "data-prior" + # class_name: "language" + # weight_exponent: 1.0 + # num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_speaker_ecapatdnn512x3_v1.3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.yaml new file mode 100644 index 00000000..523bf6fd --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'seg_chunk_sampler' + min_batch_size: 128 
+ max_chunk_length: 3.0 + min_chunk_length: 3.0 + # # weighted + # weight_mode: "data-prior" + # class_name: "language" + # weight_exponent: 1.0 + # num_chunks_per_seg_epoch: 0.3 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - speaker + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'seg_chunk_sampler' + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # # weighted + # weight_mode: "data-prior" + # class_name: "language" + # weight_exponent: 1.0 + # num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_speaker_ecapatdnn512x3_v1.3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 28000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage2_v1.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage2_v1.3.yaml new file mode 100644 index 00000000..39b94671 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage2_v1.3.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 0.3 + class_name: language + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - speaker + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 1.0 + class_name: language + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 4 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 28000 + hold_steps: 20000 + min_lr: 4e-4 + warmup_steps: 5000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 40 + eff_batch_size: 512 + train_mode: full \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml new file mode 100644 index 00000000..edc0af5e --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. 
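The x-vector and language-ID heads in these recipes use (sub-center) arc-softmax losses: embeddings and class weights are L2-normalized, the target class gets an additive angular margin, and the cosine logits are multiplied by `cos_scale` before cross-entropy. A minimal additive-angular-margin sketch, ignoring the sub-center and inter-top-k refinements used in some configs:

```python
import torch
import torch.nn.functional as F


def aam_softmax_loss(embed, weight, target, scale=32.0, margin=0.2):
    """Additive angular margin softmax: penalize the target-class cosine by margin m."""
    cos = F.normalize(embed, dim=1) @ F.normalize(weight, dim=1).t()  # (batch, classes)
    theta = torch.acos(cos.clamp(-1 + 1e-7, 1 - 1e-7))
    target_cos = torch.cos(theta.gather(1, target.unsqueeze(1)) + margin)
    logits = cos.scatter(1, target.unsqueeze(1), target_cos)
    return F.cross_entropy(scale * logits, target)


embed = torch.randn(8, 128)
weight = torch.randn(25, 128)          # e.g. 25 languages or speakers
target = torch.randint(0, 25, (8,))
print(aam_softmax_loss(embed, weight, target).item())
```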
+ min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml new file mode 100644 index 00000000..aefddc7e --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_enclast.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml new file mode 100644 index 00000000..49077fd6 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. 
+ min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml new file mode 100644 index 00000000..9f070bbe --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml new file mode 100644 index 00000000..d787a373 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. 
+ min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml new file mode 100644 index 00000000..e9fe0b05 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 2 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 2 +model: wav2vec2xlsr300m_transducer_do0.4.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml new file mode 100644 index 00000000..35b2b47c --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 80. 
+ min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 80 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.4.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml new file mode 100644 index 00000000..855bfc98 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 80. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 80 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.4.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 42000 + hold_steps: 15000 + min_lr: 4e-5 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 1200 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v1.0.yaml new file mode 100644 index 00000000..0f328e08 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v1.0.yaml @@ -0,0 +1,56 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: 'seg_sampler' + # sampler_type: 'bucketing_seg_sampler' + min_batch_size: 4 + batch_size: 4 + iters_per_epoch: 6 + drop_last: true + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: 'seg_sampler' + # sampler_type: 'bucketing_seg_sampler' + min_batch_size: 2 + batch_size: 2 + iters_per_epoch: 6 + drop_last: true + data_loader: + num_workers: 8 +model: {} +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: full + + \ No newline at end of file diff --git 
a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml new file mode 100644 index 00000000..2e5a9ea5 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml @@ -0,0 +1,58 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + transducer: + decoder: + override_dropouts: false +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml new file mode 100644 index 00000000..88073958 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml @@ -0,0 +1,61 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 1 +model: + transducer: + decoder: + override_dropouts: true + embedding_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v1.0.yaml new file mode 100644 index 00000000..da03a499 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v1.0.yaml @@ -0,0 +1,139 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15. + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15. + max_audio_length: 15. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + film_cond_type: lid_pred + reduction: mean + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 256 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm_residual + joiner: + hid_feats: 512 + languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + dropout_rate: 0.2 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: arc-softmax + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. + dropout_rate: 0.3 + hid_act: swish + + loss_lid_type: weightedCE + loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 0.1 + loss_weight_lid: 1.0 + lid_length: 3.0 + + feat_fusion_method_transducer: film-weighted-avg + feat_fusion_method_lid: weighted-avg + feat_fusion_start_transducer: 2 + feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0.yaml new file mode 100644 index 00000000..6c06c29b --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0.yaml @@ -0,0 +1,140 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. 
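The `film_*` options above condition the transducer branch on the language-ID branch via FiLM (feature-wise linear modulation): a small network maps the LID prediction or embedding to per-channel scale and shift parameters that modulate the acoustic features. A generic FiLM layer sketch, with illustrative dimensions rather than the recipe's exact ones:

```python
import torch
import torch.nn as nn


class FiLM(nn.Module):
    """Feature-wise linear modulation: y = gamma(cond) * x + beta(cond)."""

    def __init__(self, feat_dim: int, condition_size: int):
        super().__init__()
        self.to_gamma = nn.Linear(condition_size, feat_dim)
        self.to_beta = nn.Linear(condition_size, feat_dim)

    def forward(self, x, cond):
        # x: (batch, time, feat_dim); cond: (batch, condition_size), e.g. an LID embedding
        gamma = self.to_gamma(cond).unsqueeze(1)
        beta = self.to_beta(cond).unsqueeze(1)
        return gamma * x + beta


film = FiLM(feat_dim=1024, condition_size=256)
feats = torch.randn(4, 200, 1024)   # wav2vec2 frame features
lid_cond = torch.randn(4, 256)      # condition vector from the LID branch
print(film(feats, lid_cond).shape)  # torch.Size([4, 200, 1024])
```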
+ min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + film_cond_type: lid_pred_embed + reduction: mean + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + dropout_rate: 0.2 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: arc-softmax + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. + dropout_rate: 0.3 + hid_act: swish + + loss_lid_type: weightedCE + loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 1.0 + loss_weight_lid: 0.0 + loss_weight_embed: 0.05 + lid_length: 3.0 + + feat_fusion_method_transducer: film-fused-feature + feat_fusion_method_lid: weighted-avg + feat_fusion_start_transducer: 2 + feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 9000 + hold_steps: 6000 + min_lr: 4e-5 + warmup_steps: 3000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.1.yaml new file mode 100644 index 00000000..7347e8b4 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.1.yaml @@ -0,0 +1,140 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + film_cond_type: lid_pred_embed + reduction: mean + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + dropout_rate: 0.2 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: arc-softmax + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. + dropout_rate: 0.3 + hid_act: swish + + loss_lid_type: weightedCE + loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 1.0 + loss_weight_lid: 0.0 + loss_weight_embed: 10 + lid_length: 3.0 + + feat_fusion_method_transducer: film-fused-feature + feat_fusion_method_lid: weighted-avg + feat_fusion_start_transducer: 2 + feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 12000 + hold_steps: 10000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.2.yaml new file mode 100644 index 00000000..7347e8b4 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.2.yaml @@ -0,0 +1,140 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + film_cond_type: lid_pred_embed + reduction: mean + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + dropout_rate: 0.2 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: arc-softmax + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. + dropout_rate: 0.3 + hid_act: swish + + loss_lid_type: weightedCE + loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 1.0 + loss_weight_lid: 0.0 + loss_weight_embed: 10 + lid_length: 3.0 + + feat_fusion_method_transducer: film-fused-feature + feat_fusion_method_lid: weighted-avg + feat_fusion_start_transducer: 2 + feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 12000 + hold_steps: 10000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.3.yaml new file mode 100644 index 00000000..f7a430a7 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.3.yaml @@ -0,0 +1,140 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + film_cond_type: lid_pred_embed + reduction: mean + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + dropout_rate: 0.2 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: arc-softmax + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. + dropout_rate: 0.3 + hid_act: swish + + loss_lid_type: weightedCE + loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 1.0 + loss_weight_lid: 0.0 + loss_weight_embed: 20 + lid_length: 3.0 + + feat_fusion_method_transducer: film-fused-feature + feat_fusion_method_lid: weighted-avg + feat_fusion_start_transducer: 2 + feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 12000 + hold_steps: 10000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml new file mode 100644 index 00000000..4d6b8bed --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml @@ -0,0 +1,94 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + transducer: + decoder: + reduction: mean + prune_range: 15 + override_dropouts: false + languageid: + cos_scale: 32.0 + + # loss_lid_type: weightedCE + # loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 1.0 + loss_weight_lid: 20.0 + loss_weight_embed: 20 + loss_reg_weight_transducer: 0.0 + loss_reg_weight_lid: 10.0 + # lid_length: 3.0 + + # feat_fusion_method_transducer: film-fused-feature + # feat_fusion_method_lid: weighted-avg + # feat_fusion_start_transducer: 2 + # feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 12000 + hold_steps: 10000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: freeze-gt-film + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.2.yaml new file mode 100644 index 00000000..4197c653 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.2.yaml @@ -0,0 +1,94 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + transducer: + decoder: + reduction: mean + prune_range: 15 + override_dropouts: false + languageid: + cos_scale: 32.0 + + # loss_lid_type: weightedCE + # loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 1.0 + loss_weight_lid: 0.0 + loss_weight_embed: 10 + loss_reg_weight_transducer: 0.0 + loss_reg_weight_lid: 1.0 + # lid_length: 3.0 + + # feat_fusion_method_transducer: film-fused-feature + # feat_fusion_method_lid: weighted-avg + # feat_fusion_start_transducer: 2 + # feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 12000 + hold_steps: 10000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-transducer + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage3_v2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage3_v2.1.yaml new file mode 100644 index 00000000..a7be4925 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage3_v2.1.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + transducer: + decoder: + reduction: mean + prune_range: 15 + override_dropouts: false + languageid: + cos_scale: 32.0 + + # loss_lid_type: weightedCE + # loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 1.0 + loss_weight_lid: 0.0 + loss_weight_embed: 10 + # lid_length: 3.0 + + # feat_fusion_method_transducer: film-fused-feature + # feat_fusion_method_lid: weighted-avg + # feat_fusion_start_transducer: 2 + # feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 12000 + hold_steps: 10000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-transducer + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3.yaml new file mode 100644 index 00000000..08964a38 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3.yaml @@ -0,0 +1,43 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 1024 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: arc-softmax + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. + dropout_rate: 0.3 + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml new file mode 100644 index 00000000..a647c80b --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 1024 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + dropout_rate: 0.1 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. 
+ dropout_rate: 0.3 + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml new file mode 100644 index 00000000..803dc396 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 1024 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: arc-softmax + num_subcenters: 2 + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. + dropout_rate: 0.2 + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml new file mode 100644 index 00000000..86d1e7c0 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml @@ -0,0 +1,41 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + dropout_rate: 0.1 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.0 + margin_warmup_epochs: 5 + intertop_margin: 0.0 + dropout_rate: 0.0 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml new file mode 100644 index 00000000..c40bcb1f --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml @@ -0,0 +1,40 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + dropout_rate: 0.2 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: arc-softmax + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. 
+ dropout_rate: 0.3 + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml new file mode 100644 index 00000000..7d6d9473 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml @@ -0,0 +1,41 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.0 + margin_warmup_epochs: 5 + intertop_margin: 0.0 + dropout_rate: 0.2 + +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_speaker_ecapatdnn512x3_v1.3.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_speaker_ecapatdnn512x3_v1.3.yaml new file mode 100644 index 00000000..1abfea29 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_speaker_ecapatdnn512x3_v1.3.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer.yaml new file mode 100644 index 00000000..a7071b8c --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer.yaml @@ -0,0 +1,14 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + #embedding_dim: 128 + #num_layers: 1 + #hidden_dim: 64 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do.yaml new file mode 100644 index 00000000..19aaac2c --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do.yaml @@ -0,0 +1,14 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + #facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + embedding_dropout_rate: 0.1 + rnn_dropout_rate: 0.1 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git 
a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml new file mode 100644 index 00000000..baa6cde3 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml @@ -0,0 +1,14 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + #facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + embedding_dropout_rate: 0.2 + rnn_dropout_rate: 0.2 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml new file mode 100644 index 00000000..3a5ff1f5 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml @@ -0,0 +1,14 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + #facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + embedding_dropout_rate: 0.3 + rnn_dropout_rate: 0.3 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml new file mode 100644 index 00000000..9c07f5e7 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml @@ -0,0 +1,15 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + #pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus + #facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + embedding_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_enclast.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_enclast.yaml new file mode 100644 index 00000000..1d46c33c --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_enclast.yaml @@ -0,0 +1,11 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + joiner: + num_layers: 1 +feat_fusion_method: last + diff --git a/egs/commonvoice/v1/datapath.sh b/egs/commonvoice/v1/datapath.sh new file mode 100644 index 00000000..a1430c8b --- /dev/null +++ b/egs/commonvoice/v1/datapath.sh @@ -0,0 +1,23 @@ +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + + +if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then + commonvoice_root=/scratch4/jvillal7/ylu125/corpora/commonvoice + musan_root=/export/corpora5/JHU/musan + echo "Put your database paths here" + exit 1 +elif [ "$(hostname --domain)" == "cm.cluster" ];then + commonvoice_root=/data/jvillal7/corpora/commonvoice + musan_root=/data/jvillal7/corpora/musan +elif [ "$(hostname --domain)" == "cm.gemini" ];then + echo "Put your database paths here" + exit 1 +else + echo "Put your database paths here" + exit 1 +fi + + diff --git 
a/egs/commonvoice/v1/default_config.sh b/egs/commonvoice/v1/default_config.sh new file mode 120000 index 00000000..6f5a2dfb --- /dev/null +++ b/egs/commonvoice/v1/default_config.sh @@ -0,0 +1 @@ +global_conf/config_transducer_v3.3_it.sh \ No newline at end of file diff --git a/egs/commonvoice/v1/feats b/egs/commonvoice/v1/feats new file mode 120000 index 00000000..7b9d122a --- /dev/null +++ b/egs/commonvoice/v1/feats @@ -0,0 +1 @@ +hyp_utils/feats \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_lid_v2.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v2.0_13langs.sh new file mode 100644 index 00000000..851cbc18 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v2.0_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v2.0_13_langs +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0022.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v2.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v2.1_13langs.sh new file mode 100644 index 00000000..9d35d162 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v2.1_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v2.1_13_langs +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name 
+nnet_s1=$nnet_s1_dir/model_ep0002.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.1.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh new file mode 100644 index 00000000..4e24596c --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" +#sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v2.2_13_langs +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0013.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v2.2_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v3.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v3.0_13langs.sh new file mode 100644 index 00000000..40516709 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v3.0_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v3.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v3.0_13_langs +nnet_s1_name=$nnet_name.s1 + 
+nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0014.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v3.0.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v3.0_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v3.3_en_fr_it.sh b/egs/commonvoice/v1/global_conf/config_lid_v3.3_en_fr_it.sh new file mode 100644 index 00000000..08a9f950 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v3.3_en_fr_it.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=en_fr_it_train_proc_audio +dev_data=en_fr_it_dev_proc_audio +test_data="en_test_proc_audio fr_test_proc_audio it_test_proc_audio" + +lans="en fr it" +language=en_fr_it + +bpe_model=data/en_fr_it_lang_bpe_2000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v3.3_en_fr_it +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0022.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs.sh new file mode 100644 index 00000000..ba42ad38 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs.sh @@ -0,0 +1,43 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v4.0_13_langs +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0008.pth +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v4.0_13_langs.s2 
+nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs_v3.sh b/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs_v3.sh new file mode 100644 index 00000000..9a154499 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs_v3.sh @@ -0,0 +1,43 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v4.0_13_langs +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0003.pth +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v4.0_13_langs.s4 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v4.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v4.1_13langs.sh new file mode 100644 index 00000000..9b398388 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v4.1_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v4.1_13_langs +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0014.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v4.1.yaml +nnet_s2_args="" 
+nnet_s2_name=${hf_model_name}_resnet1d_v4.1_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v4.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v4.2_13langs.sh new file mode 100644 index 00000000..1989a904 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v4.2_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.2.yaml +nnet_s1_args="" +nnet_name=${hf_model_name}_resnet1d_v4.2_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0003.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.2.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v4.2_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v6.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v6.0_13langs.sh new file mode 100644 index 00000000..28404ba5 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v6.0_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="tt_test_proc_audio sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.0.yaml +nnet_s1_args="" +nnet_name=${hf_model_name}_resnet1d_v6.0_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0034.pth + 
+nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v6.0.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v6.0_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v6.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v6.2_13langs.sh new file mode 100644 index 00000000..f9d932e4 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v6.2_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="tt_test_proc_audio sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.2.yaml +nnet_s1_args="" +nnet_name=${hf_model_name}_resnet1d_v6.2_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0024.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v6.2.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v6.2_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v6.3_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v6.3_13langs.sh new file mode 100644 index 00000000..cedfb6e3 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v6.3_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="tt_test_proc_audio sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.3.yaml +nnet_s1_args="" +nnet_name=${hf_model_name}_resnet1d_v6.3_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name 
+nnet_s1=$nnet_s1_dir/model_ep0033.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v6.3.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v6.3_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v6.4_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v6.4_13langs.sh new file mode 100644 index 00000000..5124da23 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v6.4_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="tt_test_proc_audio sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.4.yaml +nnet_s1_args="" +nnet_name=${hf_model_name}_resnet1d_v6.4_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v6.4.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v6.4_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v7.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v7.0_13langs.sh new file mode 100644 index 00000000..13ef37b4 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v7.0_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.0.yaml +nnet_s1_args="" +nnet_name=${hf_model_name}_resnet1d_v7.0_13_langs +nnet_s1_name=$nnet_name.s1 
+nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0025.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v7.0.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v7.0_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v7.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v7.1_13langs.sh new file mode 100644 index 00000000..b00c7bb0 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v7.1_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.1.yaml +nnet_s1_args="" +nnet_name=${hf_model_name}_resnet1d_v7.1_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0025.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v7.1.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v7.1_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v1.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v1.0_13langs.sh new file mode 100644 index 00000000..69dcb809 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v1.0_13langs.sh @@ -0,0 +1,43 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data=" ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" +#sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + 
+nnet_type=hf_wav2vec2rnn_film_transducer_resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v1.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/transducer_resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0016.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.0_13langs.sh new file mode 100644 index 00000000..6fe79ec1 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.0_13langs.sh @@ -0,0 +1,43 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_film_transducer_resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/transducer_resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0007.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.1_13langs.sh new file mode 100644 index 00000000..4f2c95b4 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.1_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training 
+nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="ga-IE_test_proc_audio br_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio" +#sl_test_proc_audio +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_film_transducer_resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.1_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/transducer_resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0015.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0001.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage3_v2.1.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.2_13langs.sh new file mode 100644 index 00000000..f4ccf18e --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.2_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_film_transducer_resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.2_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/transducer_resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0006.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.2.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage3_v2.2.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 
+nnet_s3_dir=exp/transducer_resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh new file mode 100644 index 00000000..aca7859c --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh @@ -0,0 +1,50 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + +nnet_transducer_name=${hf_model_name}_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe +nnet_s2_transducer_name=$nnet_transducer_name.s2 +nnet_s2_transducer_dir=exp/transducer_nnets/$nnet_s2_transducer_name +nnet_rnn_transducer=$nnet_s2_transducer_dir/model_ep0010.pth + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v1.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0007.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh new file mode 100644 index 00000000..b3a07306 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh @@ -0,0 +1,50 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + +nnet_transducer_name=${hf_model_name}_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe 
+nnet_s2_transducer_name=$nnet_transducer_name.s2 +nnet_s2_transducer_dir=exp/transducer_nnets/$nnet_s2_transducer_name +nnet_rnn_transducer=$nnet_s2_transducer_dir/model_ep0010.pth + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v2.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0009.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0047.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.0_13langs.sh new file mode 100644 index 00000000..6391fc98 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.0_13langs.sh @@ -0,0 +1,45 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v4.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0003.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v4.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2.1_13langs.sh new file mode 100644 index 00000000..f7480a61 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2.1_13langs.sh @@ -0,0 +1,45 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio 
+dev_data=13_langs_dev_proc_audio +test_data="kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio" #ca_test_proc_audio +#sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v4.2.1_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0025.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0001.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v4.2.1.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2_13langs.sh new file mode 100644 index 00000000..5de2bb92 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2_13langs.sh @@ -0,0 +1,45 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v4.2_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0008.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v4.2.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.3_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.3_13langs.sh new file mode 100644 index 00000000..0134e84f --- /dev/null +++ 
b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.3_13langs.sh @@ -0,0 +1,45 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.3.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v4.3_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0008.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.3.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.4_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.4_13langs.sh new file mode 100644 index 00000000..99b5d16c --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.4_13langs.sh @@ -0,0 +1,45 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v4.4_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0008.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.4.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 
+nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh new file mode 100644 index 00000000..09a139ab --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh @@ -0,0 +1,45 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v5.1_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0042.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v5.1.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v5.1.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.6_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.6_13langs.sh new file mode 100644 index 00000000..f0db5fb6 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.6_13langs.sh @@ -0,0 +1,45 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.6.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v5.6_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0003.pth + 
+nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v5.6.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage5_v5.6.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh new file mode 100644 index 00000000..28f381ea --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="en_test_proc_audio ca_test_proc_audio" +#ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v6.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v6.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0005.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v6.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0005.pth + +nnet_s3_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage5_v6.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0011.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v7.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v7.0_13langs.sh new file mode 100644 index 00000000..b101854c --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v7.0_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="en_test_proc_audio ca_test_proc_audio" +#ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# 
bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v7.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v7.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0015.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v7.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0005.pth + +nnet_s3_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage5_v7.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0011.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh new file mode 100644 index 00000000..ffa2a057 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer_resnet1d + +# nnet_s1_transducer_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.1.yaml +# nnet_s1_transducer_args="" + +# nnet_transducer_name=${hf_model_name}_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2 +# nnet_transducer_dir=exp/transducer_nnets/$nnet_transducer_name +# nnet_transducer=$nnet_transducer_dir/model_ep0008.pth + +# nnet_lid_name=${hf_model_name}_resnet1d_v4.0_13_langs.s3 +# nnet_lid_dir=exp/resnet1d_nnets/$nnet_lid_name +# nnet_lid=$nnet_lid_dir/model_ep0003.pth + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_transducer_ecapadnn1024x3.v1.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/transducer_resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0003.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v1.3_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v1.3_13langs.sh new file mode 100644 index 00000000..fb6709db --- 
/dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v1.3_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +# bpe_model=data/13_langs_lang_bpe_8000/bpe.model +bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v1.3_13_langs_16000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0002.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.0_13langs.sh new file mode 100644 index 00000000..0f66c12a --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.0_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + +bpe_model=data/13_langs_lang_bpe_4000/bpe.model +# bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v3.0_13_langs_4000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0019.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff 
--git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.1_13langs.sh new file mode 100644 index 00000000..3fb2f93a --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.1_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v3.1_13_langs_8000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0010.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.1.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.2_13langs.sh new file mode 100644 index 00000000..4a990e2c --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.2_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +# bpe_model=data/13_langs_lang_bpe_8000/bpe.model +bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v3.2_13_langs_16000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0001.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + 
+nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh new file mode 100644 index 00000000..56d4e594 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0015.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0015.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_4langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_4langs.sh new file mode 100644 index 00000000..424c2649 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_4langs.sh @@ -0,0 +1,46 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=4_langs_train_proc_audio +dev_data=4_langs_dev_proc_audio + +test_data="tr_test_proc_audio fr_test_proc_audio de_test_proc_audio it_test_proc_audio" + + +lans="tr de fr it" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_4langs_stage1_v4.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v4.0_4_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0015.pth + 
+nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0015.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v6.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v6.0_13langs.sh new file mode 100644 index 00000000..cce21f4c --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v6.0_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v6.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v6.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0015.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v6.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0015.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_spk_v1.3.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_spk_v1.3.1_13langs.sh new file mode 100644 index 00000000..d820ac2d --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_spk_v1.3.1_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio_overlap_spk +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.1.yaml +nnet_s1_args="" 
+nnet_name=${hf_model_name}_resnet1d_v1.3.1_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/speaker_resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0040.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage2_v1.3.1.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v1.3.1_13_langs.s2 +nnet_s2_dir=exp/speaker_resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage3_v1.3.1.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/speaker_resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_spk_v1.3_13langs.sh b/egs/commonvoice/v1/global_conf/config_spk_v1.3_13langs.sh new file mode 100644 index 00000000..2e583f03 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_spk_v1.3_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio_overlap_spk +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.yaml +nnet_s1_args="" +nnet_name=${hf_model_name}_resnet1d_v1.3_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/speaker_resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0040.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage2_v1.3.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v1.3_13_langs.s2 +nnet_s2_dir=exp/speaker_resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage3_v1.3.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/speaker_resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_transducer_v3.3.sh b/egs/commonvoice/v1/global_conf/config_transducer_v3.3.sh new file mode 100644 index 00000000..4800e6fe --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_transducer_v3.3.sh @@ -0,0 +1,39 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=sv_train_proc_audio +dev_data=sv_dev_proc_audio +test_data=sv_test_proc_audio + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v3.3 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0040.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml 
+nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0120.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_transducer_v3.3_en_fr_it.sh b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_en_fr_it.sh new file mode 100644 index 00000000..3c8efca9 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_en_fr_it.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=en_fr_it_train_proc_audio +dev_data=en_fr_it_dev_proc_audio +test_data="en_test_proc_audio fr_test_proc_audio it_test_proc_audio" + +lans="en fr it" +language=en_fr_it + +bpe_model=data/en_fr_it_lang_bpe_2000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v3.3_en_fr_it +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0022.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh new file mode 100644 index 00000000..d62fcef4 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=it_train_proc_audio +dev_data=it_dev_proc_audio +test_data=it_test_proc_audio + +language=it + +bpe_model=data/it_lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v3.3_it +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0042.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth + diff --git a/egs/commonvoice/v1/hyp_utils b/egs/commonvoice/v1/hyp_utils new file mode 120000 index 00000000..f6d1eb7a --- /dev/null +++ b/egs/commonvoice/v1/hyp_utils @@ -0,0 +1 @@ +../../../hyp_utils \ No newline at end of file diff --git 
a/egs/commonvoice/v1/local/data_prep.sh b/egs/commonvoice/v1/local/data_prep.sh new file mode 100755 index 00000000..f21fea8d --- /dev/null +++ b/egs/commonvoice/v1/local/data_prep.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +. ./cmd.sh +. ./path.sh + +if [ "$#" -ne 3 ]; then + echo "Usage: $0 <language> <src-dir> <dst-dir>" + echo "e.g.: $0 ${language} /export/c06/ylu125/GSP/corpora/CommonVoice data/" + exit 1 +fi + +language=$1 +src=$2 +dst=$3 + +if [ ! -d $src/cv-corpus-12.0-2022-12-07/${language} ]; then + wget https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/cv-corpus-12.0-2022-12-07/cv-corpus-12.0-2022-12-07-${language}.tar.gz + tar -xvzf cv-corpus-12.0-2022-12-07-${language}.tar.gz -C $src + rm cv-corpus-12.0-2022-12-07-${language}.tar.gz +fi + + +lhotse prepare commonvoice -l ${language} $src/cv-corpus-12.0-2022-12-07/ ${dst}/${language} + + +for part in dev test train +do + lhotse kaldi export ${dst}/${language}/cv-${language}_recordings_${part}.jsonl.gz ${dst}/${language}/cv-${language}_supervisions_${part}.jsonl.gz ${dst}/${language}_${part} + utils/utt2spk_to_spk2utt.pl ${dst}/${language}_${part}/utt2spk > ${dst}/${language}_${part}/spk2utt + utils/fix_data_dir.sh ${dst}/${language}_${part} + # steps_xvec/audio_to_duration.sh --cmd "$train_cmd" ${dst}/${part//-/_} +done + diff --git a/egs/commonvoice/v1/local/initailize_film_model.py b/egs/commonvoice/v1/local/initailize_film_model.py new file mode 100644 index 00000000..2b15c236 --- /dev/null +++ b/egs/commonvoice/v1/local/initailize_film_model.py @@ -0,0 +1,56 @@ +import torch +import sys + +# arguments example +# pretrained_model = 'exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2/model_ep0010.pth' +# film_model = "exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned_film.v1.0_13_langs_weighted_8000_bpe.s1_initial/model_ep0000.pth" +# output_model = "model_initialized.pth" + +pretrained_model = torch.load(sys.argv[1]) +film_model = torch.load(sys.argv[2]) + +output_model = sys.argv[3] + + +def update_film_lstm_parameters(film_state_dict, pretrained_state_dict): + for i in range(2): + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." + str(i) + ".weight_ih_l0"] = pretrained_state_dict['module.transducer.decoder.predictor.rnn.weight_ih_l' + str(i)].clone() + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." + str(i) + ".weight_hh_l0"]= pretrained_state_dict['module.transducer.decoder.predictor.rnn.weight_hh_l' + str(i)].clone() + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." + str(i) + ".bias_ih_l0"]= pretrained_state_dict['module.transducer.decoder.predictor.rnn.bias_ih_l' + str(i)].clone() + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." 
+ str(i) + ".bias_hh_l0"]= pretrained_state_dict['module.transducer.decoder.predictor.rnn.bias_hh_l' + str(i)].clone() + return film_state_dict + + +def copy_model_parameters(pretrained_model, film_model): + pretrained_state_dict = pretrained_model["model_state_dict"] + film_state_dict = film_model["model_state_dict"] + + update_state_dict = {name: param for name, param in pretrained_state_dict.items() if name in film_state_dict and param.shape == film_state_dict[name].shape} + new_film_state_dict = film_state_dict.copy() + new_film_state_dict.update(update_state_dict) + + new_film_state_dict = update_film_lstm_parameters(new_film_state_dict, pretrained_state_dict) + + film_model["model_state_dict"] = new_film_state_dict + + unchanged_parameters = [] + changed_parameters = [] + unloaded_parameters = [] + for name, param in film_state_dict.items(): + if torch.all(torch.eq(param, new_film_state_dict[name])): + unchanged_parameters.append(name) + else: + changed_parameters.append(name) + + for name, param in pretrained_state_dict.items(): + if name not in changed_parameters: + unloaded_parameters.append(name) + + print(f"Unchanged parameters: {unchanged_parameters}") + print(f"Unloaded parameters: {unloaded_parameters}") + print(f"Changed parameters: {changed_parameters}") + film_model["epoch"] =1 + torch.save(film_model, output_model) + + +unchanged_parameters = copy_model_parameters(pretrained_model, film_model) \ No newline at end of file diff --git a/egs/commonvoice/v1/local/initailize_film_model_bias.py b/egs/commonvoice/v1/local/initailize_film_model_bias.py new file mode 100644 index 00000000..6abedf57 --- /dev/null +++ b/egs/commonvoice/v1/local/initailize_film_model_bias.py @@ -0,0 +1,67 @@ +import torch +import sys + +# arguments example +# pretrained_model = 'exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2/model_ep0010.pth' +# film_model = "exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned_film.v1.0_13_langs_weighted_8000_bpe.s1_initial/model_ep0000.pth" +# output_model = "model_initialized.pth" + +pretrained_model = torch.load(sys.argv[1]) +film_model = torch.load(sys.argv[2]) + +output_model = sys.argv[3] + + +def update_film_lstm_parameters(film_state_dict, pretrained_state_dict): + for i in range(2): + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." + str(i) + ".weight_ih_l0"] = pretrained_state_dict['module.transducer.decoder.predictor.rnn.weight_ih_l' + str(i)].clone() + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." + str(i) + ".weight_hh_l0"]= pretrained_state_dict['module.transducer.decoder.predictor.rnn.weight_hh_l' + str(i)].clone() + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." + str(i) + ".bias_ih_l0"]= pretrained_state_dict['module.transducer.decoder.predictor.rnn.bias_ih_l' + str(i)].clone() + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." 
+ str(i) + ".bias_hh_l0"]= pretrained_state_dict['module.transducer.decoder.predictor.rnn.bias_hh_l' + str(i)].clone() + return film_state_dict + +def copy_model_parameters(pretrained_model, film_model): + pretrained_state_dict = pretrained_model["model_state_dict"] + film_state_dict = film_model["model_state_dict"] + update_state_dict = {name: param for name, param in pretrained_state_dict.items() if name in film_state_dict and param.shape == film_state_dict[name].shape} + + film_update_state_dict = {} + for name, param in film_state_dict.items(): + if "linear_scale.weight" in name: + film_update_state_dict[name] = torch.zeros_like(param) + elif "linear_scale.bias" in name: + film_update_state_dict[name] = torch.ones_like(param) + elif "linear_shift.weight" in name or "linear_shift.bias" in name: + film_update_state_dict[name] = torch.zeros_like(param) + # import pdb; pdb.set_trace() + new_film_state_dict = film_state_dict.copy() + new_film_state_dict.update(update_state_dict) + new_film_state_dict.update(film_update_state_dict) + + + new_film_state_dict = update_film_lstm_parameters(new_film_state_dict, pretrained_state_dict) + + film_model["model_state_dict"] = new_film_state_dict + + unchanged_parameters = [] + changed_parameters = [] + unloaded_parameters = [] + for name, param in film_state_dict.items(): + if torch.all(torch.eq(param, new_film_state_dict[name])): + unchanged_parameters.append(name) + else: + changed_parameters.append(name) + + for name, param in pretrained_state_dict.items(): + if name not in changed_parameters: + unloaded_parameters.append(name) + + print(f"Unchanged parameters: {unchanged_parameters}") + print(f"Unloaded parameters: {unloaded_parameters}") + print(f"Changed parameters: {changed_parameters}") + film_model["epoch"] =1 + torch.save(film_model, output_model) + + + +unchanged_parameters = copy_model_parameters(pretrained_model, film_model) \ No newline at end of file diff --git a/egs/commonvoice/v1/local/initailize_joint_film_lid_model_bias.py b/egs/commonvoice/v1/local/initailize_joint_film_lid_model_bias.py new file mode 100644 index 00000000..3bc5148f --- /dev/null +++ b/egs/commonvoice/v1/local/initailize_joint_film_lid_model_bias.py @@ -0,0 +1,74 @@ +import torch +import sys +# arguments example +# + +ASR_model = torch.load(sys.argv[1]) +LID_model = torch.load(sys.argv[2]) +joint_model = torch.load(sys.argv[3]) + +output_model = sys.argv[4] + + +def check_update_parameters(joint_state_dict, new_joint_state_dict): + shape_changed_parameters = [] + unchanged_parameters = [] + changed_parameters = [] + unloaded_parameters = [] + for name, param in joint_state_dict.items(): + new_param = new_joint_state_dict[name].to(param.device) + if param.shape != new_param.shape: + shape_changed_parameters.append(name) + elif torch.all(torch.eq(param, new_param)): + unchanged_parameters.append(name) + else: + changed_parameters.append(name) + print("Shape changed parameters: {}".format(shape_changed_parameters)) + print("Unchanged parameters: {}".format(unchanged_parameters)) + print("Changed parameters: {}".format(changed_parameters)) + + + +def copy_model_parameters(ASR_model, LID_model, joint_model, output_model): + ASR_state_dict = ASR_model["model_state_dict"] + LID_state_dict = LID_model["model_state_dict"] + + LID_state_dict = {"module." 
+ name: param for name, param in LID_state_dict.items()} + + joint_state_dict = joint_model["model_state_dict"] + + hf_feats_update_state_dict = {name: param for name, param in ASR_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "hf_feats" in name} + transducer_update_state_dict = {name: param for name, param in ASR_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and ("transducer" in name or "film" in name)} + languageid_update_state_dict = {name: param for name, param in LID_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "languageid" in name} + + + film_update_state_dict = {} + for name, param in joint_state_dict.items(): + if "linear_scale.weight" in name and "lid_film" in name: + film_update_state_dict[name] = torch.zeros_like(param) + elif "linear_scale.bias" in name and "lid_film" in name: + film_update_state_dict[name] = torch.ones_like(param) + elif ("linear_shift.weight" in name or "linear_shift.bias" in name) and "lid_film" in name: + film_update_state_dict[name] = torch.zeros_like(param) + + new_joint_state_dict = joint_state_dict.copy() + new_joint_state_dict.update(hf_feats_update_state_dict) + new_joint_state_dict.update(transducer_update_state_dict) + new_joint_state_dict.update(languageid_update_state_dict) + new_joint_state_dict.update(film_update_state_dict) + + # import pdb;pdb.set_trace() + + new_joint_state_dict["module.transducer_fuser"] = ASR_state_dict["module.feat_fuser"] + new_joint_state_dict["module.languageid_fuser"] = LID_state_dict["module.feat_fuser"] + + + joint_model["model_state_dict"] = new_joint_state_dict + joint_model["epoch"] =1 + + check_update_parameters(joint_state_dict, new_joint_state_dict) + torch.save(joint_model, output_model) + + + +copy_model_parameters(ASR_model, LID_model, joint_model, output_model) \ No newline at end of file diff --git a/egs/commonvoice/v1/local/initailize_joint_model.py b/egs/commonvoice/v1/local/initailize_joint_model.py new file mode 100644 index 00000000..fd98d3f2 --- /dev/null +++ b/egs/commonvoice/v1/local/initailize_joint_model.py @@ -0,0 +1,56 @@ +import torch +import sys +# arguments example +# + +ASR_model = torch.load(sys.argv[1]) +LID_model = torch.load(sys.argv[2]) +joint_model = torch.load(sys.argv[3]) + +output_model = sys.argv[4] + + +def check_update_parameters(joint_state_dict, new_joint_state_dict): + unchanged_parameters = [] + changed_parameters = [] + unloaded_parameters = [] + for name, param in joint_state_dict.items(): + new_param = new_joint_state_dict[name].to(param.device) + if torch.all(torch.eq(param, new_param)): + unchanged_parameters.append(name) + else: + changed_parameters.append(name) + print("Unchanged parameters: {}".format(unchanged_parameters)) + print("Changed parameters: {}".format(changed_parameters)) + + + +def copy_model_parameters(ASR_model, LID_model, joint_model, output_model): + ASR_state_dict = ASR_model["model_state_dict"] + LID_state_dict = LID_model["model_state_dict"] + joint_state_dict = joint_model["model_state_dict"] + + hf_feats_update_state_dict = {name: param for name, param in ASR_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "hf_feats" in name} + transducer_update_state_dict = {name: param for name, param in ASR_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "transducer" in name} + languageid_update_state_dict 
= {name: param for name, param in LID_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "languageid" in name} + + + new_joint_state_dict = joint_state_dict.copy() + new_joint_state_dict.update(hf_feats_update_state_dict) + new_joint_state_dict.update(transducer_update_state_dict) + new_joint_state_dict.update(languageid_update_state_dict) + # import pdb;pdb.set_trace() + + new_joint_state_dict["module.transducer_fuser"] = ASR_state_dict["module.feat_fuser"] + new_joint_state_dict["module.languageid_fuser"] = LID_state_dict["module.feat_fuser"] + + + joint_model["model_state_dict"] = new_joint_state_dict + joint_model["epoch"] =1 + + check_update_parameters(joint_state_dict, new_joint_state_dict) + torch.save(joint_model, output_model) + + + +copy_model_parameters(ASR_model, LID_model, joint_model, output_model) \ No newline at end of file diff --git a/egs/commonvoice/v1/local/initailize_lid_model.py b/egs/commonvoice/v1/local/initailize_lid_model.py new file mode 100644 index 00000000..1862333c --- /dev/null +++ b/egs/commonvoice/v1/local/initailize_lid_model.py @@ -0,0 +1,53 @@ +import torch +import sys +# arguments example +# ASR_model = 'exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2/model_ep0010.pth' +# LID_model = "exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned_film.v1.0_13_langs_weighted_8000_bpe.s1_initial/model_ep0000.pth" +# output_model = "model_initialized.pth" + +# python local/initailize_lid_model.py /gspvolume/home/ec2-user/hyperion/egs/commonvoice/v1/exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2/model_ep0008.pth /gspvolume/home/ec2-user/hyperion/egs/commonvoice/v1/exp/resnet1d_nnets/wav2vec2xlsr300m_resnet1d_v6.0_13_langs.s1/model_ep0034.pth /gspvolume/home/ec2-user/hyperion/egs/commonvoice/v1/exp/resnet1d_nnets/wav2vec2xlsr300m_resnet1d_v6.0_13_langs.s3/model_ep0001.pth + +ASR_model = torch.load(sys.argv[1]) +LID_model = torch.load(sys.argv[2]) + +output_model = sys.argv[3] + + +def copy_model_parameters(ASR_model, LID_model): + ASR_state_dict = ASR_model["model_state_dict"] + LID_state_dict = LID_model["model_state_dict"] + + # LID_state_dict = {name.replace("module.", ""): param for name, param in LID_state_dict.items()} + + # ASR_state_dict = {name.replace("module.", ""): param for name, param in ASR_state_dict.items()} + + update_state_dict = {name: param for name, param in ASR_state_dict.items() if name in LID_state_dict and param.shape == LID_state_dict[name].shape and "hf_feats" in name} + # remove feature fuser + + new_LID_state_dict = LID_state_dict.copy() + new_LID_state_dict.update(update_state_dict) + + LID_model["model_state_dict"] = new_LID_state_dict + + unchanged_parameters = [] + changed_parameters = [] + unloaded_parameters = [] + for name, param in LID_state_dict.items(): + if torch.all(torch.eq(param, new_LID_state_dict[name])): + unchanged_parameters.append(name) + else: + changed_parameters.append(name) + + for name, param in ASR_state_dict.items(): + if name not in changed_parameters: + unloaded_parameters.append(name) + + print(f"Unchanged parameters: {unchanged_parameters}") + print(f"Unloaded parameters: {unloaded_parameters}") + print(f"Changed parameters: {changed_parameters}") + LID_model["epoch"] =1 + torch.save(LID_model, output_model) + + + +copy_model_parameters(ASR_model, LID_model) diff --git a/egs/voxceleb/v1/local/make_musan.py b/egs/commonvoice/v1/local/make_musan.py similarity index 100% rename from 
egs/voxceleb/v1/local/make_musan.py rename to egs/commonvoice/v1/local/make_musan.py diff --git a/egs/voxceleb/v1/local/make_musan.sh b/egs/commonvoice/v1/local/make_musan.sh similarity index 100% rename from egs/voxceleb/v1/local/make_musan.sh rename to egs/commonvoice/v1/local/make_musan.sh diff --git a/egs/voxceleb/v1/local/make_rirs_data.sh b/egs/commonvoice/v1/local/make_rirs_data.sh similarity index 100% rename from egs/voxceleb/v1/local/make_rirs_data.sh rename to egs/commonvoice/v1/local/make_rirs_data.sh diff --git a/egs/commonvoice/v1/local/prepare_lang.py b/egs/commonvoice/v1/local/prepare_lang.py new file mode 100755 index 00000000..39d76146 --- /dev/null +++ b/egs/commonvoice/v1/local/prepare_lang.py @@ -0,0 +1,410 @@ +#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This script takes as input a lexicon file "data/lang_phone/lexicon.txt" +consisting of words and tokens (i.e., phones) and does the following: + +1. Add disambiguation symbols to the lexicon and generate lexicon_disambig.txt + +2. Generate tokens.txt, the token table mapping a token to a unique integer. + +3. Generate words.txt, the word table mapping a word to a unique integer. + +4. Generate L.pt, in k2 format. It can be loaded by + + d = torch.load("L.pt") + lexicon = k2.Fsa.from_dict(d) + +5. Generate L_disambig.pt, in k2 format. +""" +import argparse +import math +from collections import defaultdict +from pathlib import Path +from typing import Any, Dict, List, Tuple + +import k2 +import torch + +from hyperion.utils.lexicon import read_lexicon, write_lexicon + +Lexicon = List[Tuple[str, List[str]]] + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--lang-dir", + type=str, + help="""Input and output directory. + It should contain a file lexicon.txt. + Generated files by this script are saved into this directory. + """, + ) + + parser.add_argument( + "--debug", + default=False, + action="store_true", + help="""True for debugging, which will generate + a visualization of the lexicon FST. + + Caution: If your lexicon contains hundreds of thousands + of lines, please set it to False! + """, + ) + + return parser.parse_args() + + +def write_mapping(filename: str, sym2id: Dict[str, int]) -> None: + """Write a symbol to ID mapping to a file. + + Note: + No need to implement `read_mapping` as it can be done + through :func:`k2.SymbolTable.from_file`. + + Args: + filename: + Filename to save the mapping. + sym2id: + A dict mapping symbols to IDs. + Returns: + Return None. + """ + with open(filename, "w", encoding="utf-8") as f: + for sym, i in sym2id.items(): + f.write(f"{sym} {i}\n") + + +def get_tokens(lexicon: Lexicon) -> List[str]: + """Get tokens from a lexicon. + + Args: + lexicon: + It is the return value of :func:`read_lexicon`. + Returns: + Return a list of unique tokens. 
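+ Example (illustrative): for a lexicon [("hi", ["h", "i"]), ("he", ["h", "e"])] this returns ["e", "h", "i"].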
+ """ + ans = set() + for _, tokens in lexicon: + ans.update(tokens) + sorted_ans = sorted(list(ans)) + return sorted_ans + + +def get_words(lexicon: Lexicon) -> List[str]: + """Get words from a lexicon. + + Args: + lexicon: + It is the return value of :func:`read_lexicon`. + Returns: + Return a list of unique words. + """ + ans = set() + for word, _ in lexicon: + ans.add(word) + sorted_ans = sorted(list(ans)) + return sorted_ans + + +def add_disambig_symbols(lexicon: Lexicon) -> Tuple[Lexicon, int]: + """It adds pseudo-token disambiguation symbols #1, #2 and so on + at the ends of tokens to ensure that all pronunciations are different, + and that none is a prefix of another. + + See also add_lex_disambig.pl from kaldi. + + Args: + lexicon: + It is returned by :func:`read_lexicon`. + Returns: + Return a tuple with two elements: + + - The output lexicon with disambiguation symbols + - The ID of the max disambiguation symbol that appears + in the lexicon + """ + + # (1) Work out the count of each token-sequence in the + # lexicon. + count = defaultdict(int) + for _, tokens in lexicon: + count[" ".join(tokens)] += 1 + + # (2) For each left sub-sequence of each token-sequence, note down + # that it exists (for identifying prefixes of longer strings). + issubseq = defaultdict(int) + for _, tokens in lexicon: + tokens = tokens.copy() + tokens.pop() + while tokens: + issubseq[" ".join(tokens)] = 1 + tokens.pop() + + # (3) For each entry in the lexicon: + # if the token sequence is unique and is not a + # prefix of another word, no disambig symbol. + # Else output #1, or #2, #3, ... if the same token-seq + # has already been assigned a disambig symbol. + ans = [] + + # We start with #1 since #0 has its own purpose + first_allowed_disambig = 1 + max_disambig = first_allowed_disambig - 1 + last_used_disambig_symbol_of = defaultdict(int) + + for word, tokens in lexicon: + tokenseq = " ".join(tokens) + assert tokenseq != "" + if issubseq[tokenseq] == 0 and count[tokenseq] == 1: + ans.append((word, tokens)) + continue + + cur_disambig = last_used_disambig_symbol_of[tokenseq] + if cur_disambig == 0: + cur_disambig = first_allowed_disambig + else: + cur_disambig += 1 + + if cur_disambig > max_disambig: + max_disambig = cur_disambig + last_used_disambig_symbol_of[tokenseq] = cur_disambig + tokenseq += f" #{cur_disambig}" + ans.append((word, tokenseq.split())) + return ans, max_disambig + + +def generate_id_map(symbols: List[str]) -> Dict[str, int]: + """Generate ID maps, i.e., map a symbol to a unique ID. + + Args: + symbols: + A list of unique symbols. + Returns: + A dict containing the mapping between symbols and IDs. + """ + return {sym: i for i, sym in enumerate(symbols)} + + +def add_self_loops(arcs: List[List[Any]], disambig_token: int, + disambig_word: int) -> List[List[Any]]: + """Adds self-loops to states of an FST to propagate disambiguation symbols + through it. They are added on each state with non-epsilon output symbols + on at least one arc out of the state. + + See also fstaddselfloops.pl from Kaldi. One difference is that + Kaldi uses OpenFst style FSTs and it has multiple final states. + This function uses k2 style FSTs and it does not need to add self-loops + to the final state. + + The input label of a self-loop is `disambig_token`, while the output + label is `disambig_word`. + + Args: + arcs: + A list-of-list. The sublist contains + `[src_state, dest_state, label, aux_label, score]` + disambig_token: + It is the token ID of the symbol `#0`. 
+ disambig_word: + It is the word ID of the symbol `#0`. + + Return: + Return new `arcs` containing self-loops. + """ + states_needs_self_loops = set() + for arc in arcs: + src, dst, ilabel, olabel, score = arc + if olabel != 0: + states_needs_self_loops.add(src) + + ans = [] + for s in states_needs_self_loops: + ans.append([s, s, disambig_token, disambig_word, 0]) + + return arcs + ans + + +def lexicon_to_fst( + lexicon: Lexicon, + token2id: Dict[str, int], + word2id: Dict[str, int], + sil_token: str = "SIL", + sil_prob: float = 0.5, + need_self_loops: bool = False, +) -> k2.Fsa: + """Convert a lexicon to an FST (in k2 format) with optional silence at + the beginning and end of each word. + + Args: + lexicon: + The input lexicon. See also :func:`read_lexicon` + token2id: + A dict mapping tokens to IDs. + word2id: + A dict mapping words to IDs. + sil_token: + The silence token. + sil_prob: + The probability for adding a silence at the beginning and end + of the word. + need_self_loops: + If True, add self-loop to states with non-epsilon output symbols + on at least one arc out of the state. The input label for this + self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. + Returns: + Return an instance of `k2.Fsa` representing the given lexicon. + """ + assert sil_prob > 0.0 and sil_prob < 1.0 + # CAUTION: we use score, i.e, negative cost. + sil_score = math.log(sil_prob) + no_sil_score = math.log(1.0 - sil_prob) + + start_state = 0 + loop_state = 1 # words enter and leave from here + sil_state = 2 # words terminate here when followed by silence; this state + # has a silence transition to loop_state. + next_state = 3 # the next un-allocated state, will be incremented as we go. + arcs = [] + + assert token2id[""] == 0 + assert word2id[""] == 0 + + eps = 0 + + sil_token = token2id[sil_token] + + arcs.append([start_state, loop_state, eps, eps, no_sil_score]) + arcs.append([start_state, sil_state, eps, eps, sil_score]) + arcs.append([sil_state, loop_state, sil_token, eps, 0]) + + for word, tokens in lexicon: + assert len(tokens) > 0, f"{word} has no pronunciations" + cur_state = loop_state + + word = word2id[word] + tokens = [token2id[i] for i in tokens] + + for i in range(len(tokens) - 1): + w = word if i == 0 else eps + arcs.append([cur_state, next_state, tokens[i], w, 0]) + + cur_state = next_state + next_state += 1 + + # now for the last token of this word + # It has two out-going arcs, one to the loop state, + # the other one to the sil_state. 
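+ # For example (illustrative): for a one-token word, both arcs below carry the word label, the one back to loop_state scored log(1 - sil_prob) and the one to sil_state scored log(sil_prob).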
+ i = len(tokens) - 1 + w = word if i == 0 else eps + arcs.append([cur_state, loop_state, tokens[i], w, no_sil_score]) + arcs.append([cur_state, sil_state, tokens[i], w, sil_score]) + + if need_self_loops: + disambig_token = token2id["#0"] + disambig_word = word2id["#0"] + arcs = add_self_loops( + arcs, + disambig_token=disambig_token, + disambig_word=disambig_word, + ) + + final_state = next_state + arcs.append([loop_state, final_state, -1, -1, 0]) + arcs.append([final_state]) + + arcs = sorted(arcs, key=lambda arc: arc[0]) + arcs = [[str(i) for i in arc] for arc in arcs] + arcs = [" ".join(arc) for arc in arcs] + arcs = "\n".join(arcs) + + fsa = k2.Fsa.from_str(arcs, acceptor=False) + return fsa + + +def main(): + args = get_args() + lang_dir = Path(args.lang_dir) + lexicon_filename = lang_dir / "lexicon.txt" + sil_token = "SIL" + sil_prob = 0.5 + + lexicon = read_lexicon(lexicon_filename) + tokens = get_tokens(lexicon) + words = get_words(lexicon) + + lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) + + for i in range(max_disambig + 1): + disambig = f"#{i}" + assert disambig not in tokens + tokens.append(f"#{i}") + + assert "" not in tokens + tokens = [""] + tokens + + assert "" not in words + assert "#0" not in words + assert "" not in words + assert "" not in words + + words = [""] + words + ["#0", "", ""] + + token2id = generate_id_map(tokens) + word2id = generate_id_map(words) + + write_mapping(lang_dir / "tokens.txt", token2id) + write_mapping(lang_dir / "words.txt", word2id) + write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig) + + L = lexicon_to_fst( + lexicon, + token2id=token2id, + word2id=word2id, + sil_token=sil_token, + sil_prob=sil_prob, + ) + + L_disambig = lexicon_to_fst( + lexicon_disambig, + token2id=token2id, + word2id=word2id, + sil_token=sil_token, + sil_prob=sil_prob, + need_self_loops=True, + ) + torch.save(L.as_dict(), lang_dir / "L.pt") + torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt") + + if args.debug: + labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt") + aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt") + + L.labels_sym = labels_sym + L.aux_labels_sym = aux_labels_sym + L.draw(f"{lang_dir / 'L.svg'}", title="L.pt") + + L_disambig.labels_sym = labels_sym + L_disambig.aux_labels_sym = aux_labels_sym + L_disambig.draw(f"{lang_dir / 'L_disambig.svg'}", + title="L_disambig.pt") + + +if __name__ == "__main__": + main() diff --git a/egs/commonvoice/v1/local/prepare_lang_bpe.py b/egs/commonvoice/v1/local/prepare_lang_bpe.py new file mode 100755 index 00000000..7838b6a0 --- /dev/null +++ b/egs/commonvoice/v1/local/prepare_lang_bpe.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
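+# Note: run_004_compute_bpe.sh invokes this script as ./local/prepare_lang_bpe.py --lang-dir data/${language}_lang_bpe_${vocab_size}, after the BPE model has been trained with local/train_bpe_model.py.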
+ +# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang) +""" + +This script takes as input `lang_dir`, which should contain:: + + - lang_dir/bpe.model, + - lang_dir/words.txt + +and generates the following files in the directory `lang_dir`: + + - lexicon.txt + - lexicon_disambig.txt + - L.pt + - L_disambig.pt + - tokens.txt +""" + +import argparse +from pathlib import Path +from typing import Dict, List, Tuple + +import k2 +import sentencepiece as spm +import torch +from prepare_lang import ( + Lexicon, + add_disambig_symbols, + add_self_loops, + write_lexicon, + write_mapping, +) + + +def lexicon_to_fst_no_sil( + lexicon: Lexicon, + token2id: Dict[str, int], + word2id: Dict[str, int], + need_self_loops: bool = False, +) -> k2.Fsa: + """Convert a lexicon to an FST (in k2 format). + + Args: + lexicon: + The input lexicon. See also :func:`read_lexicon` + token2id: + A dict mapping tokens to IDs. + word2id: + A dict mapping words to IDs. + need_self_loops: + If True, add self-loop to states with non-epsilon output symbols + on at least one arc out of the state. The input label for this + self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. + Returns: + Return an instance of `k2.Fsa` representing the given lexicon. + """ + loop_state = 0 # words enter and leave from here + next_state = 1 # the next un-allocated state, will be incremented as we go + + arcs = [] + + # The blank symbol <blk> is defined in local/train_bpe_model.py + assert token2id["<blk>"] == 0 + assert word2id["<eps>"] == 0 + + eps = 0 + + for word, pieces in lexicon: + assert len(pieces) > 0, f"{word} has no pronunciations" + cur_state = loop_state + + word = word2id[word] + pieces = [token2id[i] for i in pieces] + + for i in range(len(pieces) - 1): + w = word if i == 0 else eps + arcs.append([cur_state, next_state, pieces[i], w, 0]) + + cur_state = next_state + next_state += 1 + + # now for the last piece of this word + i = len(pieces) - 1 + w = word if i == 0 else eps + arcs.append([cur_state, loop_state, pieces[i], w, 0]) + + if need_self_loops: + disambig_token = token2id["#0"] + disambig_word = word2id["#0"] + arcs = add_self_loops( + arcs, + disambig_token=disambig_token, + disambig_word=disambig_word, + ) + + final_state = next_state + arcs.append([loop_state, final_state, -1, -1, 0]) + arcs.append([final_state]) + + arcs = sorted(arcs, key=lambda arc: arc[0]) + arcs = [[str(i) for i in arc] for arc in arcs] + arcs = [" ".join(arc) for arc in arcs] + arcs = "\n".join(arcs) + + fsa = k2.Fsa.from_str(arcs, acceptor=False) + return fsa + + +def generate_lexicon(model_file: str, + words: List[str]) -> Tuple[Lexicon, Dict[str, int]]: + """Generate a lexicon from a BPE model. + + Args: + model_file: + Path to a sentencepiece model. + words: + A list of strings representing words. + Returns: + Return a tuple with two elements: + - A lexicon, i.e., a list of (word, word pieces) tuples. + - A dict representing the token symbol, mapping from tokens to IDs. + """ + sp = spm.SentencePieceProcessor() + sp.load(str(model_file)) + + # Convert word to word piece IDs instead of word piece strings + # to avoid OOV tokens. + words_pieces_ids: List[List[int]] = sp.encode(words, out_type=int) + + # Now convert word piece IDs back to word piece strings.
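+ # e.g. (illustrative): "HELLO" might encode to ids such as [317, 562] and decode back to pieces like ["▁HE", "LLO"], depending on the trained BPE model.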
+ words_pieces: List[List[str]] = [ + sp.id_to_piece(ids) for ids in words_pieces_ids + ] + + lexicon = [] + for word, pieces in zip(words, words_pieces): + lexicon.append((word, pieces)) + + # The OOV word is <UNK> + lexicon.append(("<UNK>", [sp.id_to_piece(sp.unk_id())])) + + token2id: Dict[str, int] = dict() + for i in range(sp.vocab_size()): + token2id[sp.id_to_piece(i)] = i + + return lexicon, token2id + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--lang-dir", + type=str, + help="""Input and output directory. + It should contain the bpe.model and words.txt + """, + ) + + parser.add_argument( + "--debug", + default=False, + action="store_true", + help="""True for debugging, which will generate + a visualization of the lexicon FST. + + Caution: If your lexicon contains hundreds of thousands + of lines, please set it to False! + + See "test/test_bpe_lexicon.py" for usage. + """, + ) + + return parser.parse_args() + + +def main(): + args = get_args() + lang_dir = Path(args.lang_dir) + model_file = lang_dir / "bpe.model" + + word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt") + + words = word_sym_table.symbols + + excluded = [ + "<eps>", "!SIL", "<SPOKEN_NOISE>", "<UNK>", "#0", "<s>", "</s>" + ] + for w in excluded: + if w in words: + words.remove(w) + + lexicon, token_sym_table = generate_lexicon(model_file, words) + + lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) + + next_token_id = max(token_sym_table.values()) + 1 + for i in range(max_disambig + 1): + disambig = f"#{i}" + assert disambig not in token_sym_table + token_sym_table[disambig] = next_token_id + next_token_id += 1 + + word_sym_table.add("#0") + word_sym_table.add("<s>") + word_sym_table.add("</s>") + + write_mapping(lang_dir / "tokens.txt", token_sym_table) + + write_lexicon(lang_dir / "lexicon.txt", lexicon) + write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig) + + L = lexicon_to_fst_no_sil( + lexicon, + token2id=token_sym_table, + word2id=word_sym_table, + ) + + L_disambig = lexicon_to_fst_no_sil( + lexicon_disambig, + token2id=token_sym_table, + word2id=word_sym_table, + need_self_loops=True, + ) + torch.save(L.as_dict(), lang_dir / "L.pt") + torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt") + + if args.debug: + labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt") + aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt") + + L.labels_sym = labels_sym + L.aux_labels_sym = aux_labels_sym + L.draw(f"{lang_dir / 'L.svg'}", title="L.pt") + + L_disambig.labels_sym = labels_sym + L_disambig.aux_labels_sym = aux_labels_sym + L_disambig.draw(f"{lang_dir / 'L_disambig.svg'}", + title="L_disambig.pt") + + +if __name__ == "__main__": + main() diff --git a/egs/commonvoice/v1/local/train_bpe_model.py b/egs/commonvoice/v1/local/train_bpe_model.py new file mode 100755 index 00000000..42aba957 --- /dev/null +++ b/egs/commonvoice/v1/local/train_bpe_model.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + + +# You can install sentencepiece via: +# +# pip install sentencepiece +# +# Due to an issue reported in +# https://github.com/google/sentencepiece/pull/642#issuecomment-857972030 +# +# Please install a version >=0.1.96 + +import argparse +import shutil +from pathlib import Path + +import sentencepiece as spm + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--lang-dir", + type=str, + help="""Input and output directory. + The generated bpe.model is saved to this directory. + """, + ) + + parser.add_argument( + "--transcript", + type=str, + help="Training transcript.", + ) + + parser.add_argument( + "--vocab-size", + type=int, + help="Vocabulary size for BPE training", + ) + + return parser.parse_args() + + +def main(): + args = get_args() + vocab_size = args.vocab_size + lang_dir = Path(args.lang_dir) + + model_type = "unigram" + + model_prefix = f"{lang_dir}/{model_type}_{vocab_size}" + train_text = args.transcript + character_coverage = 1.0 + input_sentence_size = 100000000 + + user_defined_symbols = ["<blk>", "<sos/eos>"] + unk_id = len(user_defined_symbols) + # Note: unk_id is fixed to 2. + # If you change it, you should also change other + # places that are using it. + + model_file = Path(model_prefix + ".model") + if not model_file.is_file(): + spm.SentencePieceTrainer.train( + input=train_text, + vocab_size=vocab_size, + model_type=model_type, + model_prefix=model_prefix, + input_sentence_size=input_sentence_size, + character_coverage=character_coverage, + user_defined_symbols=user_defined_symbols, + unk_id=unk_id, + bos_id=-1, + eos_id=-1, + ) + + shutil.copyfile(model_file, f"{lang_dir}/bpe.model") + + +if __name__ == "__main__": + main() diff --git a/egs/commonvoice/v1/local/validate_bpe_lexicon.py b/egs/commonvoice/v1/local/validate_bpe_lexicon.py new file mode 100755 index 00000000..36962933 --- /dev/null +++ b/egs/commonvoice/v1/local/validate_bpe_lexicon.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This script checks that there are no OOV tokens in the BPE-based lexicon.
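+An OOV token here is a word piece that appears in lexicon.txt but is not among the pieces of the given sentencepiece model.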
+ +Usage example: + + python3 ./local/validate_bpe_lexicon.py \ + --lexicon /path/to/lexicon.txt \ + --bpe-model /path/to/bpe.model +""" + +import argparse +from pathlib import Path +from typing import List, Tuple + +import sentencepiece as spm + +from hyperion.utils.lexicon import read_lexicon + +# Map word to word pieces +Lexicon = List[Tuple[str, List[str]]] + + +def get_args(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--lexicon", + required=True, + type=Path, + help="Path to lexicon.txt", + ) + + parser.add_argument( + "--bpe-model", + required=True, + type=Path, + help="Path to bpe.model", + ) + + return parser.parse_args() + + +def main(): + args = get_args() + assert args.lexicon.is_file(), args.lexicon + assert args.bpe_model.is_file(), args.bpe_model + + lexicon = read_lexicon(args.lexicon) + + sp = spm.SentencePieceProcessor() + sp.load(str(args.bpe_model)) + + word_pieces = set(sp.id_to_piece(list(range(sp.vocab_size())))) + for word, pieces in lexicon: + for p in pieces: + if p not in word_pieces: + raise ValueError(f"The word {word} contains an OOV token {p}") + + +if __name__ == "__main__": + main() diff --git a/egs/commonvoice/v1/path.sh b/egs/commonvoice/v1/path.sh new file mode 100755 index 00000000..6994fdab --- /dev/null +++ b/egs/commonvoice/v1/path.sh @@ -0,0 +1,5 @@ + +export HYP_ROOT=$(readlink -f `pwd -P`/../../..) +export TOOLS_ROOT=$HYP_ROOT/tools + +. $TOOLS_ROOT/path.sh diff --git a/egs/commonvoice/v1/run_001_prepare_data.sh b/egs/commonvoice/v1/run_001_prepare_data.sh new file mode 100755 index 00000000..d4873f0f --- /dev/null +++ b/egs/commonvoice/v1/run_001_prepare_data.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. ./datapath.sh +. $config_file + + +nj=6 + +mkdir -p data + + + +if [ ${stage} -le 1 ]; then + ### Task dependent. You have to make data the following preparation part by yourself. + ### But you can utilize Kaldi recipes in most cases + echo "stage 1: Data preparation" + for lan in $lans + do + # use underscore-separated names in data directories. 
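+ # local/data_prep.sh downloads the CommonVoice 12.0 tarball for ${lan} if needed and, via lhotse, creates data/${lan}_{train,dev,test} (wav.scp, text, utt2spk, spk2utt).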
+ local/data_prep.sh ${lan} $commonvoice_root data/ + done +fi + +if [ ${stage} -le 2 ]; then + echo "stage 2: Data conversion" + # for part in $test_data $dev_data $nnet_data + for lan in $lans + do + for part in ${lan}_train # ${lan}_test ${lan}_dev + do + echo ${part} + steps_transducer/preprocess_audios_for_nnet_train.sh --nj 16 --cmd "$train_cmd" \ + --storage_name commonvoice-v1-$(date +'%m_%d_%H_%M') --use-bin-vad false \ + --osr 16000 data/${part} data/${part}_proc_audio exp/${part}_proc_audio + utils/fix_data_dir.sh data/${part}_proc_audio || true + done + done +fi + +if [ ${stage} -le 3 ]; then + echo "stage 3: Combine Multilingual Data" + + dev_folders="" + train_folders="" + for lan in $lans + do + dev_folders+="data/${lan}_dev_proc_audio " + train_folders+="data/${lan}_train_proc_audio " + done + + combine_data.sh data/${dev_data}/ $dev_folders + combine_data.sh data/${nnet_data}/ $train_folders + awk 'BEGIN {FS = " "} NR == FNR {a[$1]=$2; next} {print $1 "," a[$1] "," $2}' data/13_langs_dev_proc_audio/utt2lang data/13_langs_dev_proc_audio/utt2spk > data/13_langs_dev_proc_audio/data/13_langs_train_proc_audio/utt2seg.csv + awk 'BEGIN {FS = " "} NR == FNR {a[$1]=$2; next} {print $1 "," a[$1] "," $2}' data/13_langs_train_proc_audio/utt2lang data/13_langs_train_proc_audio/utt2spk > data/13_langs_train_proc_audio/data/13_langs_train_proc_audio/utt2seg.csv + + # cut -d' ' -f1 --complement data/${nnet_data}/text > data/lm/${lan}_transcript_words.txt +fi \ No newline at end of file diff --git a/egs/commonvoice/v1/run_003_prepare_noises_rirs.sh b/egs/commonvoice/v1/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..6bdcb4f2 --- /dev/null +++ b/egs/commonvoice/v1/run_003_prepare_noises_rirs.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +# We prepare the noise files and RIR for online speech augmentation + +if [ $stage -le 1 ]; then + + # Prepare the MUSAN corpus, which consists of music, speech, and noise + # suitable for augmentation. + local/make_musan.sh $musan_root 16 data + + for name in musan_noise musan_music + do + steps_xvec/preprocess_audios_for_nnet_train.sh --nj 10 --cmd "$train_cmd" \ + --storage_name librispeech-v1-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_proc_audio exp/${name}_proc_audio + utils/fix_data_dir.sh data/${name}_proc_audio + done + +fi + +if [ $stage -le 2 ]; then + + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + steps_xvec/make_babble_noise_for_nnet_train.sh --cmd "$train_cmd" \ + --storage_name librispeech-v1-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_babble exp/${name}_babble + # utils/fix_data_dir.sh data/${name}_babble + done +fi + +if [ $stage -le 3 ]; then + if [ ! 
-d "RIRS_NOISES" ]; then + if [ -d ../../sre19-cmn2/v1/RIRS_NOISES ];then + ln -s ../../sre19-cmn2/v1/RIRS_NOISES + else + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + fi + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/smallroom 16 data/rirs_smallroom + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/mediumroom 16 data/rirs_mediumroom + local/make_rirs_data.sh RIRS_NOISES/real_rirs_isotropic_noises 16 data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + #pack all rirs in h5 files + steps_xvec/pack_rirs_for_nnet_train.sh data/$rirs data/$rirs exp/rirs/$rirs + done + +fi + + diff --git a/egs/commonvoice/v1/run_004_compute_bpe.sh b/egs/commonvoice/v1/run_004_compute_bpe.sh new file mode 100755 index 00000000..ee14ca2b --- /dev/null +++ b/egs/commonvoice/v1/run_004_compute_bpe.sh @@ -0,0 +1,103 @@ +#!/bin/bash + + +. ./cmd.sh +. ./path.sh +set -e + +vocab_sizes=( + 8000 + 16000 +) + +dl_dir=$PWD/download + +stage=1 +stop_stage=4 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. ./datapath.sh +. $config_file + + +# if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then +# echo "Stage 1: Dump transcripts for LM training" +# mkdir -p data/lm +# gunzip -c data/${language}/cv-${language}_supervisions_train.jsonl.gz \ +# | jq '.text' \ +# | sed 's:"::g' \ +# > data/lm/${language}_transcript_words.txt +# fi + +if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then + echo "Stage 2: Prepare BPE based lang" + + for vocab_size in ${vocab_sizes[@]}; do + lang_dir=data/${language}_lang_bpe_${vocab_size} + mkdir -p $lang_dir + + # Add special words to words.txt + echo " 0" > $lang_dir/words.txt + echo "!SIL 1" >> $lang_dir/words.txt + echo " 2" >> $lang_dir/words.txt + + # # Add regular words to words.txt + # gunzip -c data/${language}/cv-${language}_supervisions_train.jsonl.gz \ + # | jq '.text' \ + # | sed 's:"::g' \ + # | sed 's: :\n:g' \ + # | sort \ + # | uniq \ + # | sed '/^$/d' \ + # | awk '{print $0,NR+2}' \ + # >> $lang_dir/words.txt + + # Add remaining special word symbols expected by LM scripts. + num_words=$(cat $lang_dir/words.txt | wc -l) + echo " ${num_words}" >> $lang_dir/words.txt + num_words=$(cat $lang_dir/words.txt | wc -l) + echo " ${num_words}" >> $lang_dir/words.txt + num_words=$(cat $lang_dir/words.txt | wc -l) + echo "#0 ${num_words}" >> $lang_dir/words.txt + + ./local/train_bpe_model.py \ + --lang-dir $lang_dir \ + --vocab-size $vocab_size \ + --transcript data/lm/${language}_transcript_words.txt + + if [ ! -f $lang_dir/L_disambig.pt ]; then + ./local/prepare_lang_bpe.py --lang-dir $lang_dir + fi + done +fi + +# if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then +# echo "Stage 3: Train LM" +# lm_dir=data/lm + +# if [ ! -f $lm_dir/G.arpa ]; then +# ./shared/make_kn_lm.py \ +# -ngram-order 3 \ +# -text $lm_dir/transcript_words.txt \ +# -lm $lm_dir/G.arpa +# fi + +# if [ ! 
-f $lm_dir/G_3_gram.fst.txt ]; then +# python3 -m kaldilm \ +# --read-symbol-table="data/lang_phone/words.txt" \ +# --disambig-symbol='#0' \ +# --max-order=3 \ +# $lm_dir/G.arpa > $lm_dir/G_3_gram.fst.txt +# fi +# fi + +# if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then +# echo "Stage 4: Compile HLG" +# ./local/compile_hlg.py --lang-dir data/lang_phone + +# for vocab_size in ${vocab_sizes[@]}; do +# lang_dir=data/lang_bpe_${vocab_size} +# ./local/compile_hlg.py --lang-dir $lang_dir +# done +# fi \ No newline at end of file diff --git a/egs/commonvoice/v1/run_011_train_asr.sh b/egs/commonvoice/v1/run_011_train_asr.sh new file mode 100755 index 00000000..b6a50e7f --- /dev/null +++ b/egs/commonvoice/v1/run_011_train_asr.sh @@ -0,0 +1,141 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +# export CUDA_VISIBLE_DEVICES=0 + +#ml purge +#module load namd/2.14-cuda-smp +#module load cuda/11.6.0 +#ml +#nvidia-smi +#export CUDA_VISIBLE_DEVICES=0,1,2,3 +#export CONV_RSH=ssh +#export LD_LIBRARY_PATH=/scratch4/jvillal7/ylu125/miniconda3/envs/gsp_hyp/lib/:$LD_LIBRARY_PATH + +# export CUDA_VISIBLE_DEVICES=0,1 +stage=1 +ngpu=2 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +train_dir=data/${nnet_data}/ +val_dir=data/${dev_data}/ + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" + extra_args="--data.val.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" +fi + + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ + train_wav2vec2rnn_transducer.py $nnet_type \ + --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s1_dir $args \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --master-port 1236 \ + --num-gpus $ngpu + +fi + +if [ $stage -le 2 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2rnn_transducer.py $nnet_type \ + --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + 
--data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s2_dir $args \ + --in-model-file $nnet_s1 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu + # --master-port 1236 \ + +fi + +if [ $stage -le 3 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + + + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2transducer.py $nnet_type \ + --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s3_dir $args \ + --in-model-file $nnet_s2 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu +fi + diff --git a/egs/commonvoice/v1/run_012_train_lid.sh b/egs/commonvoice/v1/run_012_train_lid.sh new file mode 100755 index 00000000..bf14500e --- /dev/null +++ b/egs/commonvoice/v1/run_012_train_lid.sh @@ -0,0 +1,131 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +# export CUDA_VISIBLE_DEVICES=3 + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +train_dir=data/${nnet_data}/ +val_dir=data/${dev_data}/ + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" + extra_args="--data.val.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" +fi + + +# Network Training +if [ $stage -le 1 ]; then + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ + train_wav2vec2languageid.py $nnet_type \ + --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s1_dir $args \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --master-port 1234 \ + --num-gpus $ngpu + +fi + +if [ $stage -le 2 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2languageid.py $nnet_type \ + --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s2_dir $args \ + --in-model-file $nnet_s1 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu + +fi + +if [ $stage -le 3 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + + + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2languageid.py $nnet_type \ + --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + 
--data.val.dataset.recordings-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $val_dir/langs \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s3_dir $args \ + --in-model-file $nnet_s2 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu +fi + diff --git a/egs/commonvoice/v1/run_015_train_film_asr.sh b/egs/commonvoice/v1/run_015_train_film_asr.sh new file mode 100755 index 00000000..e86cf62d --- /dev/null +++ b/egs/commonvoice/v1/run_015_train_film_asr.sh @@ -0,0 +1,145 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +# export CUDA_VISIBLE_DEVICES=0 + +#ml purge +#module load namd/2.14-cuda-smp +#module load cuda/11.6.0 +#ml +#nvidia-smi +# export CUDA_VISIBLE_DEVICES=0,1,2,3 +#export CONV_RSH=ssh +#export LD_LIBRARY_PATH=/scratch4/jvillal7/ylu125/miniconda3/envs/gsp_hyp/lib/:$LD_LIBRARY_PATH + +stage=1 +ngpu=2 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +train_dir=data/${nnet_data}/ +val_dir=data/${dev_data}/ + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" + extra_args="--data.val.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" +fi + + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ + train_wav2vec2rnn_film_transducer.py $nnet_type \ + --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s1_dir $args \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --master-port 1237 \ + --num-gpus $ngpu + +fi + +if [ $stage -le 2 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2rnn_film_transducer.py $nnet_type \ + --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ 
+ --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s2_dir $args \ + --in-model-file $nnet_s1 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --master-port 1237 \ + --num-gpus $ngpu + +fi + +if [ $stage -le 3 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2rnn_film_transducer.py $nnet_type \ + --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s3_dir $args \ + --in-model-file $nnet_s1 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --master-port 1237 \ + --num-gpus $ngpu + +fi + diff --git a/egs/commonvoice/v1/run_020_train_asr_lid.sh b/egs/commonvoice/v1/run_020_train_asr_lid.sh new file mode 100755 index 00000000..88c56fe2 --- /dev/null +++ b/egs/commonvoice/v1/run_020_train_asr_lid.sh @@ -0,0 +1,141 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +# export CUDA_VISIBLE_DEVICES=0 + +#ml purge +#module load namd/2.14-cuda-smp +#module load cuda/11.6.0 +#ml +#nvidia-smi +#export CUDA_VISIBLE_DEVICES=0,1,2,3 +#export CONV_RSH=ssh +#export LD_LIBRARY_PATH=/scratch4/jvillal7/ylu125/miniconda3/envs/gsp_hyp/lib/:$LD_LIBRARY_PATH + + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +train_dir=data/${nnet_data}/ +val_dir=data/${dev_data}/ + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" + extra_args="--data.val.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" +fi + + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ + train_wav2vec2rnn_transducer_languageid.py $nnet_type \ + --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s1_dir $args \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --master-port 1238 \ + --num-gpus $ngpu + +fi + +if [ $stage -le 2 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2transducer_languageid.py $nnet_type \ + --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s2_dir $args \ + --in-model-transducer $nnet_transducer \ + --in-model-lid $nnet_lid \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu + +fi + +if [ $stage -le 3 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + + + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2transducer.py $nnet_type \ + --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + 
--data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s3_dir $args \ + --in-model-file $nnet_s2 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu +fi + diff --git a/egs/commonvoice/v1/run_025_train_film_asr_lid.sh b/egs/commonvoice/v1/run_025_train_film_asr_lid.sh new file mode 100755 index 00000000..3f79a4cb --- /dev/null +++ b/egs/commonvoice/v1/run_025_train_film_asr_lid.sh @@ -0,0 +1,140 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +# export CUDA_VISIBLE_DEVICES=0 + +#ml purge +#module load namd/2.14-cuda-smp +#module load cuda/11.6.0 +#ml +#nvidia-smi +#export CUDA_VISIBLE_DEVICES=0,1,2,3 +#export CONV_RSH=ssh +#export LD_LIBRARY_PATH=/scratch4/jvillal7/ylu125/miniconda3/envs/gsp_hyp/lib/:$LD_LIBRARY_PATH + + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +train_dir=data/${nnet_data}/ +val_dir=data/${dev_data}/ + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" + extra_args="--data.val.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" +fi + + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ + train_wav2vec2rnn_film_transducer_languageid.py $nnet_type \ + --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s1_dir $args \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --master-port 1238 \ + --num-gpus $ngpu + +fi + +if [ $stage -le 2 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2rnn_film_transducer_languageid.py $nnet_type \ + --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ + 
--data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s2_dir $args \ + --in-model-file $nnet_s1 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu + +fi + +if [ $stage -le 3 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + + + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2rnn_film_transducer_languageid.py $nnet_type \ + --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s3_dir $args \ + --in-model-file $nnet_s2 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu +fi + diff --git a/egs/commonvoice/v1/run_030_inference.sh b/egs/commonvoice/v1/run_030_inference.sh new file mode 100755 index 00000000..64c2e39f --- /dev/null +++ b/egs/commonvoice/v1/run_030_inference.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +config_file=default_config.sh +use_gpu=false +nnet_stage=1 +. parse_options.sh || exit 1; +. $config_file + +if [ "$use_gpu" == "true" ];then + transducer_args="--use-gpu true" + transducer_cmd="$cuda_eval_cmd --mem 6G" +else + transducer_cmd="$train_cmd --mem 12G" +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +fi + +transducer_dir=exp/transducer/$nnet_name + + +rm -f $transducer_dir/overall_wer_char.txt + +# test_data=test_clean + + +# Extracts x-vectors for evaluation +for name in $test_data +do + nj=20 + steps_transducer/decode_wav2vec2rnn_transducer.sh \ + --cmd "$transducer_cmd --mem 12G" --nj $nj ${transducer_args} \ + $nnet data/$name \ + $transducer_dir/$name $bpe_model +done + diff --git a/egs/commonvoice/v1/run_031_inference_film.sh b/egs/commonvoice/v1/run_031_inference_film.sh new file mode 100755 index 00000000..0f64a008 --- /dev/null +++ b/egs/commonvoice/v1/run_031_inference_film.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +config_file=default_config.sh +use_gpu=false +nnet_stage=1 +. parse_options.sh || exit 1; +. 
$config_file + +if [ "$use_gpu" == "true" ];then + transducer_args="--use-gpu true" + transducer_cmd="$cuda_eval_cmd --mem 6G" +else + transducer_cmd="$train_cmd --mem 12G" +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +fi + +transducer_dir=exp/transducer/$nnet_name + + +rm -f $transducer_dir/overall_wer_char.txt + +# test_data=test_clean + + +# Extracts x-vectors for evaluation +for name in $test_data +do + nj=16 + steps_transducer/decode_wav2vec2rnn_film_transducer.sh \ + --cmd "$transducer_cmd --mem 12G" --nj $nj ${transducer_args} \ + $nnet data/$name \ + $transducer_dir/$name $bpe_model data/$nnet_data/langs +done + diff --git a/egs/commonvoice/v1/run_032_identificate.sh b/egs/commonvoice/v1/run_032_identificate.sh new file mode 100755 index 00000000..76b98c34 --- /dev/null +++ b/egs/commonvoice/v1/run_032_identificate.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=0 +config_file=default_config.sh +use_gpu=false +nnet_stage=1 +. parse_options.sh || exit 1; +. $config_file + +if [ "$use_gpu" == "true" ];then + lid_args="--use-gpu true" + lid_cmd="$cuda_eval_cmd --mem 6G" +else + lid_cmd="$train_cmd --mem 12G" +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +fi + +lid_dir=exp/resnet1d/$nnet_name + +rm -f $lid_dir/overall_lid_score.txt + +# Extracts x-vectors for evaluation +for name in $test_data # $dev_data $test_data + do + nj=40 + steps_lid/identificate_wav2vec2resnet1d.sh \ + --cmd "$lid_cmd" --nj $nj ${lid_args} \ + $nnet data/$name \ + $lid_dir/$name data/$nnet_data/langs + done + +exit diff --git a/egs/commonvoice/v1/steps b/egs/commonvoice/v1/steps new file mode 120000 index 00000000..aede39fe --- /dev/null +++ b/egs/commonvoice/v1/steps @@ -0,0 +1 @@ +hyp_utils/kaldi/steps \ No newline at end of file diff --git a/egs/commonvoice/v1/steps_be b/egs/commonvoice/v1/steps_be new file mode 120000 index 00000000..b2098c2a --- /dev/null +++ b/egs/commonvoice/v1/steps_be @@ -0,0 +1 @@ +../v1/steps_be \ No newline at end of file diff --git a/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh b/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh new file mode 100755 index 00000000..5a9a30c8 --- /dev/null +++ b/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh @@ -0,0 +1,90 @@ +#!/bin/bash +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +nj=30 +cmd="run.pl" + +use_gpu=false +write_utt2num_frames=true # If true writes utt2num_frames. +stage=0 +num_augs=0 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 3 ] && [ $# != 4 ]; then + echo "Usage: $0 [options] []" + echo " e.g.: $0 --feat-config conf/fbank_mvn.yml --aug-config conf/noise_aug.yml exp/xvector_nnet/model.pt data/train exp/xvectors_train [data/train_aug]" + echo "main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --use-gpu # If true, use GPU." 
+ echo " --nj # Number of jobs" + echo " --stage # To control partial reruns" + echo " --use-bin-vad # If true, uses binary VAD from vad.scp" + echo " --write-utt2num-frames # If true, write utt2num_frames file." + echo " --chunk-length # If provided, applies encoder with specified chunk-length and " + echo " # concatenates the chunks outputs before pooling" + echo " --feat-config # feature/mvn config file" + echo " --aug-config # augmentation config file" + echo " --random-utt-length # If true, extracts a random chunk from the utterance between " + echo " # min_utt_length and max_utt_length" + echo " --min-utt-length # " + echo " --max-utt-length # " + + +fi + +nnet_file=$1 +data_dir=$2 +output_dir=$3 +lang_file=$4 + +for f in $data_dir/wav.scp ; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +log_dir=$output_dir/log +mkdir -p $log_dir + +num_gpus=0 +args="" +if [ "$use_gpu" == "true" ];then + cmd="$cmd --gpu 1" + num_gpus=1 + args="--use-gpu" +fi + +if [ "$write_utt2num_frames" == "true" ];then + write_num_frames_opt="--write-num-frames $output_dir/utt2num_frames.JOB" +fi + +if [ $stage -le 0 ];then + set +e + $cmd JOB=1:$nj $output_dir/log/identificate_wav2languageid.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + identificate_wav2languageid.py \ + --part-idx JOB --num-parts $nj ${args} \ + --input $data_dir/wav.scp \ + --model-path $nnet_file \ + --lang-file $lang_file \ + --output $output_dir/languageid.JOB + set -e +fi + +if [ $stage -le 1 ];then + echo "compute error rate" + + cat $output_dir/languageid.* > $output_dir/langs + python steps_lid/cal_lid_score.py $output_dir/langs > $output_dir/lid_score + + echo $(basename "$output_dir") >> $output_dir/../overall_lid_score.txt + cat $output_dir/lid_score >> $output_dir/../overall_lid_score.txt + echo " " >> $output_dir/../overall_lid_score.txt + # python steps_transducer/word2char.py $data_dir/text $data_dir/text_char + + # compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text + # compute-wer --text --mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text + +fi diff --git a/egs/commonvoice/v1/steps_pyfe b/egs/commonvoice/v1/steps_pyfe new file mode 120000 index 00000000..7b9d122a --- /dev/null +++ b/egs/commonvoice/v1/steps_pyfe @@ -0,0 +1 @@ +hyp_utils/feats \ No newline at end of file diff --git a/egs/commonvoice/v1/steps_transducer b/egs/commonvoice/v1/steps_transducer new file mode 120000 index 00000000..c9fd1392 --- /dev/null +++ b/egs/commonvoice/v1/steps_transducer @@ -0,0 +1 @@ +hyp_utils/steps_transducer \ No newline at end of file diff --git a/egs/commonvoice/v1/steps_xvec b/egs/commonvoice/v1/steps_xvec new file mode 120000 index 00000000..289276b7 --- /dev/null +++ b/egs/commonvoice/v1/steps_xvec @@ -0,0 +1 @@ +hyp_utils/xvectors/ \ No newline at end of file diff --git a/egs/commonvoice/v1/utils b/egs/commonvoice/v1/utils new file mode 120000 index 00000000..3d590a1d --- /dev/null +++ b/egs/commonvoice/v1/utils @@ -0,0 +1 @@ +hyp_utils/kaldi/utils \ No newline at end of file diff --git a/egs/commonvoice/v1/xvectors b/egs/commonvoice/v1/xvectors new file mode 120000 index 00000000..af66a94d --- /dev/null +++ b/egs/commonvoice/v1/xvectors @@ -0,0 +1 @@ +hyp_utils/xvectors \ No newline at end of file diff --git a/egs/librispeech/v1/run_011_train_asr.sh b/egs/librispeech/v1/run_011_train_asr.sh index 99b0065e..81ebbeae 100755 --- a/egs/librispeech/v1/run_011_train_asr.sh +++ b/egs/librispeech/v1/run_011_train_asr.sh @@ -49,11 +49,11 @@ if [ $stage -le 
1 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ train_wav2vec2rnn_transducer.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s1_dir $args \ @@ -75,11 +75,11 @@ if [ $stage -le 2 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s2_dir $args \ @@ -103,11 +103,11 @@ if [ $stage -le 3 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s3_dir $args \ diff --git a/egs/librispeech/v1/run_011_train_asr_old.sh b/egs/librispeech/v1/run_011_train_asr_old.sh index 3d0e6eb1..3c9f4f5b 100755 --- a/egs/librispeech/v1/run_011_train_asr_old.sh +++ b/egs/librispeech/v1/run_011_train_asr_old.sh @@ -49,11 +49,11 @@ if [ $stage -le 1 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ train_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s1_dir $args \ @@ -75,11 +75,11 @@ if [ $stage -le 2 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file 
$train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s2_dir $args \ @@ -103,11 +103,11 @@ if [ $stage -le 3 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s3_dir $args \ diff --git a/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py b/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py index 14e3fc20..b6252df7 100644 --- a/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py +++ b/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py @@ -2,15 +2,11 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - import logging import numpy as np from hyperion.utils.utt2info import Utt2Info -from hyperion.utils.math import softmax +from hyperion.utils.math_funcs import softmax from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.np.transforms import LNorm from hyperion.np.clustering import AHC @@ -23,9 +19,6 @@ def lnorm(x): def cosine_scr(x1, x2): - # t = LNorm() - # x1 = t.predict(x1) - # x2 = t.predict(x2) x1 = lnorm(x1) x2 = lnorm(x2) return np.dot(x1, x2.T) diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py index 907509fd..c9657a66 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py @@ -16,7 +16,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores from hyperion.helpers import TrialDataReader as TDR -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.np.pdfs import PLDA from hyperion.np.transforms import TransformList from hyperion.np.score_norm import AdaptSNorm as SNorm diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py index b661cbde..24ef731b 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py @@ -15,7 +15,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.np.pdfs import PLDA from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py index 8e7715e0..bdef3fc3 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py @@ -17,7 +17,7 @@ from hyperion.np.transforms import TransformList, PCA, LDA, 
LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from numpy.linalg import matrix_rank diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py index 12f1725b..51795676 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py @@ -17,7 +17,7 @@ from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from numpy.linalg import matrix_rank, svd diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py index 234f966c..79c1cd6f 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py @@ -17,7 +17,7 @@ from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from numpy.linalg import matrix_rank, svd diff --git a/egs/sre21-av-a/v1.16k/README.md b/egs/sre21-av-a/v1.16k/README.md index e35577d7..d90dc0a4 100644 --- a/egs/sre21-av-a/v1.16k/README.md +++ b/egs/sre21-av-a/v1.16k/README.md @@ -7,6 +7,20 @@ The systems runs at 16 kHz, telephone data is upsampled to 16k using SoX This recipe is based on these works ``` +@inproceedings{Villalba2022, +author = {Jes\'us Villalba and Bengt J Borgstrom and Saurabh Kataria and Magdalena Rybicka and Carlos D Castillo and Jaejin Cho and L. Paola García-Perera and Pedro A. 
Torres-Carrasquillo and Najim Dehak}, +city = {ISCA}, +doi = {10.21437/Odyssey.2022-30}, +issue = {July}, +journal = {The Speaker and Language Recognition Workshop (Odyssey 2022)}, +month = {6}, +pages = {213-220}, +publisher = {ISCA}, +title = {Advances in Cross-Lingual and Cross-Source Audio-Visual Speaker Recognition: The JHU-MIT System for NIST SRE21}, +url = {https://www.isca-speech.org/archive/odyssey_2022/villalba22b_odyssey.html}, +year = {2022}, +} + @inproceedings{Villalba2020, address = {Tokyo, Japan}, author = {Villalba, Jes{\'{u}}s and Garcia-Romero, Daniel and Chen, Nanxin and Sell, Gregory and Borgstrom, Jonas and McCree, Alan and {Garcia Perera}, Leibny Paola and Kataria, Saurabh and Nidadavolu, Phani Sankar and Torres-Carrasquiilo, Pedro and Dehak, Najim}, @@ -88,8 +102,6 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs - `run_011_train_xvector.sh` - Trains the x-vector network on 4sec chunks - - - `run_012_finetune_xvector.sh` - Fine-tune x-vector network on 10-15 secs utts - `run_013_prepare_langid_train_data.sh` @@ -110,8 +122,8 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs - `run_040_eval_be_v1.sh, run_041_eval_be_v2.sh, run_042_eval_be_v3.sh, run_042b_eval_be_v3.sh` - Evals different back-end versions: - V1: Back-end trained on all data without adaptation - - V2: Centering + PCA + LNorm + PLDA (+S-Norm), Centering adapted to source and langauge, global PLDA adapted to SRE-Vox-CHN - - V3: Centering + PCA + LNorm + PLDA (+S-Norm), Centering adapted to source and langauge, source dependent PLDA adapted to SRE-CHN or Vox-CHN + - V2: Centering + PCA + LNorm + PLDA (+S-Norm), Centering adapted to source and language, global PLDA adapted to SRE-Vox-CHN + - V3: Centering + PCA + LNorm + PLDA (+S-Norm), Centering adapted to source and language, source dependent PLDA adapted to SRE-CHN or Vox-CHN - V3b: V3 with hyperparmeters tuned for x-vectors trained on VoxCeleb only - `run_fus*.sh` @@ -120,4 +132,39 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs ## Results -TODO +The back-end used for these results is: +- back-end V2 (run_041_eval_be_v2.sh) +- Without S-Norm +- Scores are calibrated as indicated in the paper. + +## SRE16 Eval40% YUE + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 1.57 | 0.135 | 0.237 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.23 | 0.136 | 0.187 | +| config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.38 | 0.147 | 0.189 | + +## SRE-CTS Superset dev set + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 1.37 | 0.076 | 0.106 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.19 | 0.64 | 0.089 | +| config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.15 | 0.61 | 0.102 | + +## SRE21 Audio Dev (official scoring tool) + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 5.91 | 0.393 | 0.409 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 5.22 | 0.370 | 0.377 | +| config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 4.79 | 0.309 | 0.325 | + +## SRE21 Audio Eval (official scoring tool) + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 5.68 | 0.395 | 0.401 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 4.92 | 0.405 | 0.412 | +| config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 4.80 | 0.357 | 0.360 | diff --git a/egs/sre21-av-a/v1.16k/conf/lresnet34_lid_v1.yaml b/egs/sre21-av-a/v1.16k/conf/lresnet34_lid_v1.yaml deleted file mode 100644 index 5451702f..00000000 --- a/egs/sre21-av-a/v1.16k/conf/lresnet34_lid_v1.yaml +++ /dev/null @@ -1,59 +0,0 @@ -min_chunk_length: 4.0 -max_chunk_length: 4.0 -return_fullseqs: false -wav_scale: 32767 -batch_size: 512 -var_batch_size: false -iters_per_epoch: 6.0 -train_aug_cfg: conf/reverb_noise_aug.yaml -val_aug_cfg: conf/reverb_noise_aug.yaml -feats: fbank64_stmn_nb_16k.yaml -pool_net: - pool_type: ch-wise-att-mean+stddev - inner_feats: 32 -embed_dim: 32 -num_embed_layers: 1 -hid_act: relu6 -loss_type: arc-softmax -s: 30.0 -margin: 0.3 -margin_warmup_epochs: 30.0 -dropout_rate: 0.0 -in_feats: 64 -resnet_type: lresnet34 -in_channels: 1 -conv_channels: 64 -base_channels: 64 -in_kernel_size: 3 -in_stride: 1 -in_norm: false -no_maxpool: true -optim: - opt_type: adam - lr: 0.02 - # lr: 0.01 - beta1: 0.9 - beta2: 0.95 - amsgrad: true - weight_decay: 1e-5 -lrsched: - lrsch_type: exp_lr - decay_rate: 0.5 - decay_steps: 8000 - hold_steps: 10000 - min_lr: 1.0e-05 - warmup_steps: 1000 - update_lr_on_opt_step: true -grad_acc_steps: 1 -epochs: 70 -log_interval: 100 -use_tensorboard: false -use_wandb: false -wandb: - mode: online -ddp_type: ddp -use_amp: true -swa_start: 0 -swa_lr: 0.001 -swa_anneal_epochs: 10 -num_gpus: 4 diff --git a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..d68ea26e --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml @@ -0,0 +1,104 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_enc: + in_feats: 80 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + multilayer: true + multilayer_concat: true + endpoint_channels: 8192 + dropout_rate: 0.0 + hid_act: relu6 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 35000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + 
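+  # Assumption (not stated in this config): eff_batch_size 512 is presumably reached from the
+  # sampler's per-GPU min_batch_size of 16 by gradient accumulation across GPUs and optimizer
+  # steps inside the trainer; swa_start 65 turns on stochastic weight averaging for the final
+  # epochs, which is what produces the swa_model_ep0076.pth checkpoint referenced in the
+  # corresponding global_conf file.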
log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..e7f9969b --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.16k/conf/train_lresnet34_lid_v1.yaml b/egs/sre21-av-a/v1.16k/conf/train_lresnet34_lid_v1.yaml new file mode 100644 index 00000000..c46365db --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_lresnet34_lid_v1.yaml @@ -0,0 +1,78 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + data_loader: + num_workers: 8 +feats: fbank64_stmn_nb_16k.yaml +model: + resnet_type: lresnet34 + in_channels: 1 + in_feats: 64 + conv_channels: 64 + in_kernel_size: 3 + in_stride: 1 + in_norm: false + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 32 + embed_dim: 32 + num_embed_layers: 1 + hid_act: relu6 + loss_type: arc-softmax + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 30.0 + dropout_rate: 0.0 +trainer: + optim: + opt_type: adam + lr: 0.02 + beta1: 0.9 + beta2: 0.95 + amsgrad: true + weight_decay: 1e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 8000 + hold_steps: 10000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + epochs: 70 + log_interval: 100 + use_amp: true + diff --git a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage1_v1.0.yaml new file mode 100644 
index 00000000..7a9234b6 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage1_v1.0.yaml @@ -0,0 +1,80 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 1.625 + res2net_scale: 4 + pool_net: + pool_type: mean+stddev + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +trainer: + optim: + opt_type: adam + lr: 0.05 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 40000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 50 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..9884bb4c --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 21 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..4c427202 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml @@ -0,0 +1,82 
@@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 40000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..f34b4896 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..10607607 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml @@ -0,0 +1,83 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - 
conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_type: tseres2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 1.625 + res2net_scale: 4 + se_r: 256 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 40000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..f34b4896 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh index c8732c36..1da68697 100644 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh +++ 
b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -9,53 +9,19 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 lr=0.02 nnet_type=resnet1d -block_type=seres2bn # squeeze-excitation res2net bottleneck -channels=2048 -ep_channels=8192 -width_factor=1 -scale=8 -se_r=16 dropout=0 -attstats_inner=128 embed_dim=256 s=30 margin_warmup=20 margin=0.3 -nnet_opt="--resnet_enc.in-feats 80 \ - --resnet_enc.in-conv-channels $channels \ - --resnet_enc.in-kernel-size 5 \ - --resnet_enc.in-stride 1 \ - --resnet_enc.resb-type $block_type \ - --resnet_enc.resb-repeats 1 1 1 1 \ - --resnet_enc.resb-channels $channels \ - --resnet_enc.resb-kernel-sizes 3 \ - --resnet_enc.resb-dilations 2 3 4 5 \ - --resnet_enc.resb-strides 1 \ - --resnet_enc.res2net-width-factor $width_factor \ - --resnet_enc.res2net-scale $scale \ - --resnet_enc.se-r $se_r \ - --resnet_enc.multilayer \ - --resnet_enc.multilayer-concat \ - --resnet_enc.endpoint-channels $ep_channels \ - --pool_net.pool-type ch-wise-att-mean+stddev \ - --pool_net.inner-feats $attstats_inner \ - --embed-dim $embed_dim" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 35000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - +nnet_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml nnet_name=${feat_type}_ecapatdnn2048x4_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 nnet_num_epochs=75 nnet_dir=exp/xvector_nnets/$nnet_name @@ -63,18 +29,14 @@ nnet=$nnet_dir/model_ep0070.pth nnet=$nnet_dir/swa_model_ep0076.pth # xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 ft_eff_batch_size=128 # effective batch size ft_min_chunk=10 ft_max_chunk=15 -ft_ipe=1 ft_lr=0.01 ft_nnet_num_epochs=15 ft_margin=0.5 -ft_margin_warmup=3 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0007.pth @@ -88,7 +50,4 @@ else plda_data=voxceleb2cat_train_augx${plda_num_augs} fi plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh index 1903369e..6d14f27d 100644 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh +++ 
b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh @@ -1,4 +1,4 @@ -# LResNet34 x-vector with mixed precision training +# Res2Net50 w26s4 x-vector with mixed precision training # acoustic features feat_config=conf/fbank80_stmn_16k.yaml @@ -9,50 +9,33 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxcelebcat -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=24 eff_batch_size=512 # effective batch size -ipe=1 min_chunk=4 max_chunk=4 lr=0.05 -nnet_type=res2net50 +nnet_type=resnet dropout=0 embed_dim=256 -width_factor=1.625 -scale=4 -ws_tag=w26s4 s=30 margin_warmup=20 margin=0.3 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 50 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1.$nnet_data -nnet_num_epochs=60 +nnet_base_cfg=conf/train_res2net50w26s4_xvec_stage1_v1.0.yaml +nnet_name=${feat_type}_res2net50w26s4_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1.$nnet_data nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0071.pth - +nnet=$nnet_dir/model_ep0061.pth # xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 ft_eff_batch_size=128 # effective batch size ft_min_chunk=10 ft_max_chunk=15 -ft_ipe=1 ft_lr=0.01 -ft_nnet_num_epochs=21 ft_margin=0.5 -ft_margin_warmup=5 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0021.pth @@ -61,7 +44,4 @@ ft_nnet=$ft_nnet_dir/model_ep0021.pth plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=0 plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh deleted file mode 100644 index 344e1288..00000000 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh +++ /dev/null @@ -1,67 +0,0 @@ -# LResNet34 x-vector with mixed precision training - -# acoustic features -feat_config=conf/fbank80_stmn_16k.yaml -feat_type=fbank80_stmn - -#vad -vad_config=conf/vad_16k.yaml - -# x-vector training -nnet_data=voxcelebcat -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=24 
-eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 -lr=0.02 - -nnet_type=res2net50 -dropout=0 -embed_dim=256 -width_factor=3.25 -scale=8 -ws_tag=w26s8 - -s=30 -margin_warmup=20 -margin=0.3 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 50 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1.$nnet_data -nnet_num_epochs=60 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth -#nnet=$nnet_dir/swa_model_ep0061.pth - -# xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 -ft_eff_batch_size=128 # effective batch size -ft_min_chunk=10 -ft_max_chunk=10 -ft_ipe=1 -ft_lr=0.01 -ft_nnet_num_epochs=15 -ft_margin=0.5 -ft_margin_warmup=3 - -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" -ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 -ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name -ft_nnet=$ft_nnet_dir/model_ep0007.pth - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=0 -plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 - diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh index cae32b57..0b62008e 100644 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh +++ b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -9,103 +9,40 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=1 min_chunk=4 max_chunk=4 lr=0.02 -nnet_type=res2net50 +nnet_type=resnet dropout=0 embed_dim=256 -width_factor=3.25 -scale=8 -ws_tag=w26s8 s=30 margin_warmup=20 margin=0.3 attstats_inner=128 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --pool_net.pool-type ch-wise-att-mean+stddev --pool_net.inner-feats $attstats_inner" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 
--lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=75 +nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml +nnet_name=${feat_type}_res2net50w26s8_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth nnet=$nnet_dir/swa_model_ep0076.pth # xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 ft_eff_batch_size=128 # effective batch size ft_min_chunk=10 ft_max_chunk=10 -ft_ipe=1 ft_lr=0.01 -ft_nnet_num_epochs=15 ft_margin=0.5 -ft_margin_warmup=3 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0007.pth - -# xvector last-layer finetuning in-domain -reg_layers_classif=0 -reg_layers_enc="0 1 2 3 4" -nnet_adapt_data=voxcelebcat_sre_alllangs_mixfs_chnspks - -# ft2_batch_size_1gpu=4 -# ft2_eff_batch_size=128 # effective batch size -# ft2_ipe=4 -# ft2_lr=0.01 -# ft2_nnet_num_epochs=12 -# ft2_margin_warmup=3 -# ft2_reg_weight_embed=0.1 -# ft2_min_chunk=10 -# ft2_max_chunk=60 - -# ft2_opt_opt="--optim.opt-type sgd --optim.lr $ft2_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -# ft2_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" -# ft2_nnet_name=${ft_nnet_name}.ft_eaffine_rege_w${ft2_reg_weigth_embed}_${ft2_min_chunk}_${ft2_max_chunk}_sgdcos_lr${ft2_lr}_b${ft2_eff_batch_size}_amp.v2 -# ft2_nnet_dir=exp/xvector_nnets/$ft2_nnet_name -# ft2_nnet=$ft2_nnet_dir/model_ep0010.pth - - -# xvector full nnet finetuning -ft2_batch_size_1gpu=6 -ft2_eff_batch_size=128 # effective batch size -ft2_ipe=1 -ft2_lr=0.01 -ft2_nnet_num_epochs=15 -ft2_margin=0.5 -ft2_margin_warmup=3 -ft2_reg_weight_embed=0.1 -ft2_reg_weight_enc=0.1 -ft2_min_chunk=10 -ft2_max_chunk=10 - -ft2_opt_opt="--optim.opt-type sgd --optim.lr $ft2_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft2_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" -ft2_nnet_name=${ft_nnet_name}.ft_reg_wenc${ft2_reg_weight_enc}_we${ft2_reg_weight_embed}_${ft2_min_chunk}_${ft2_max_chunk}_sgdcos_lr${ft2_lr}_b${ft2_eff_batch_size}_amp.v1 -ft2_nnet_dir=exp/xvector_nnets/$ft2_nnet_name -ft2_nnet=$ft2_nnet_dir/model_ep0012.pth - - # back-end plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=0 plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 - diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh 
b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh index 96475c53..a57f16d9 100644 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh +++ b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -9,21 +9,15 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=24 eff_batch_size=512 # effective batch size -ipe=1 min_chunk=4 max_chunk=4 lr=0.02 -nnet_type=tseres2net50 +nnet_type=resnet dropout=0 embed_dim=256 -width_factor=1.625 -scale=4 -ws_tag=w26s4 se_r=256 s=30 @@ -31,13 +25,8 @@ margin_warmup=20 margin=0.3 attstats_inner=128 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --se-r $se_r --pool_net.pool-type ch-wise-att-mean+stddev --pool_net.inner-feats $attstats_inner" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_r${se_r}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=75 +nnet_base_cfg=conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml +nnet_name=${feat_type}_tseres2net50w26s4_r${se_r}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0075.pth nnet=$nnet_dir/swa_model_ep0076.pth @@ -49,12 +38,9 @@ ft_min_chunk=10 ft_max_chunk=15 ft_ipe=1 ft_lr=0.01 -ft_nnet_num_epochs=15 ft_margin=0.5 -ft_margin_warmup=3 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0007.pth @@ -69,7 +55,4 @@ else plda_data=voxceleb2cat_train_augx${plda_num_augs} fi plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh similarity index 100% rename from egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh rename to egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh diff --git 
a/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh b/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh new file mode 100644 index 00000000..b5863308 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh @@ -0,0 +1,49 @@ +# Res2Net50 w26s8 x-vector with mixed precision training + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxcelebcat + +eff_batch_size=512 # effective batch size +min_chunk=4 +max_chunk=4 +lr=0.02 + +nnet_type=resnet +dropout=0 +embed_dim=256 + +s=30 +margin_warmup=20 +margin=0.3 +attstats_inner=128 + +nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml +nnet_args="--model.pool_net.pool-type mean+stddev" +nnet_name=${feat_type}_res2net50w26s8_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0070.pth +#nnet=$nnet_dir/swa_model_ep0076.pth + +# xvector full net finetuning with out-of-domain +ft_eff_batch_size=128 # effective batch size +ft_min_chunk=10 +ft_max_chunk=10 +ft_lr=0.01 +ft_margin=0.5 + +ft_nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml +ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 +ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name +ft_nnet=$ft_nnet_dir/model_ep0007.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +plda_type=splda diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh similarity index 100% rename from egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh rename to egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh diff --git a/egs/sre21-av-a/v1.16k/local/make_voxceleb1cat_v2.pl b/egs/sre21-av-a/v1.16k/local/make_voxceleb1cat_v2.pl index 27b1f152..18b6d40c 100755 --- a/egs/sre21-av-a/v1.16k/local/make_voxceleb1cat_v2.pl +++ b/egs/sre21-av-a/v1.16k/local/make_voxceleb1cat_v2.pl @@ -31,7 +31,7 @@ my $meta_path = "$data_base/vox1_meta.csv"; if (! -e "$meta_path") { $meta_path = "$out_dir/vox1_meta.csv"; - system("wget -O $meta_path $meta_url"); + system("wget --no-check-certificate -O $meta_path $meta_url"); } open(META_IN, "<", "$meta_path") or die "Could not open the meta data file $meta_path"; @@ -53,7 +53,7 @@ my $lid_path = "$data_base/lang_vox1_final.csv"; if (! 
-e "$lid_path") { $lid_path = "$out_dir/lang_vox1_final.csv"; - system("wget -O $lid_path $lid_url"); + system("wget --no-check-certificate -O $lid_path $lid_url"); } open(LID_IN, "<", "$lid_path") or die "Could not open the output file $lid_path"; my %utt2lang = (); diff --git a/egs/sre21-av-a/v1.16k/local/score_sre21_official.sh b/egs/sre21-av-a/v1.16k/local/score_sre21_official.sh index a5bc03eb..e56906f6 100755 --- a/egs/sre21-av-a/v1.16k/local/score_sre21_official.sh +++ b/egs/sre21-av-a/v1.16k/local/score_sre21_official.sh @@ -18,7 +18,7 @@ echo "Score SRE21 ${track} ${subset} for $score_dir" soft_dir=./sre21/scoring_software -if [ ! -f $s_dir/sre_scorer.py ];then +if [ ! -f $soft_dir/sre_scorer.py ];then echo "downloading scoring tool" local/download_sre21_scoring_tool.sh fi diff --git a/egs/sre21-av-a/v1.16k/run_002_compute_evad.sh b/egs/sre21-av-a/v1.16k/run_002_compute_evad.sh index f7aa7828..08f655ea 100755 --- a/egs/sre21-av-a/v1.16k/run_002_compute_evad.sh +++ b/egs/sre21-av-a/v1.16k/run_002_compute_evad.sh @@ -9,7 +9,6 @@ set -e nodes=fs01 storage_name=$(date +'%m_%d_%H_%M') vaddir=`pwd`/exp/vad_e -vad_config=conf/vad_16k.yaml stage=1 config_file=default_config.sh @@ -75,41 +74,3 @@ if [ $stage -le 3 ];then done fi -# #Enroll multi-speaker Datasets with time marks -# if [ $stage -le 3 ];then -# for name in sre18_dev_enroll_vast sre18_eval_enroll_vast sre19_av_a_dev_enroll sre19_av_a_eval_enroll -# do -# num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') -# nj=$(($num_spk < 40 ? $num_spk:40)) -# # we just run energy vad to get the utt2num_frames file -# hyp_utils/feats/make_evad.sh --write-utt2num-frames true \ -# --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ -# data/${name} exp/make_vad/$name $vaddir -# utils/fix_data_dir.sh data/${name} -# local/sre18_diar_to_vad.sh data/${name} exp/make_vad $vaddir -# utils/fix_data_dir.sh data/${name} -# done -# fi - -# #Dihard Datasets -# if [ $stage -le 4 ];then -# for name in dihard2_train_dev dihard2_train_eval -# do -# num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') -# nj=$(($num_spk < 40 ? $num_spk:40)) -# # we just run energy vad to get the utt2num_frames file -# hyp_utils/feats/make_evad.sh --write-utt2num-frames true \ -# --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ -# data/${name} exp/make_vad/$name $vaddir -# hyp_utils/rttm_to_bin_vad.sh --nj 5 data/$name/vad.rttm data/$name $vaddir -# utils/fix_data_dir.sh data/${name} -# done - -# fi - -# if [ $stage -le 5 ];then -# utils/combine_data.sh --extra-files "utt2num_frames" data/dihard2_train data/dihard2_train_dev data/dihard2_train_eval -# utils/fix_data_dir.sh data/dihard2_train -# fi - - diff --git a/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh b/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh index 0608929c..d7ea8ed0 100755 --- a/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh +++ b/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh @@ -10,28 +10,66 @@ set -e stage=1 ngpu=4 config_file=default_config.sh -resume=false interactive=false -num_workers=8 +num_workers="" . parse_options.sh || exit 1; . $config_file . 
datapath.sh -batch_size=$(($batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) list_dir=data/${nnet_data}_proc_audio_no_sil -args="" -if [ "$resume" == "true" ];then - args="--resume" +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" fi if [ "$interactive" == "true" ];then export cuda_cmd=run.pl fi +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_xvector_from_wav.py $nnet_type \ + --cfg $nnet_base_cfg $nnet_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --trainer.exp-path $nnet_dir \ + --num-gpus $ngpu \ + +fi + +# Large Margin Fine-tuning +if [ $stage -le 2 ]; then + mkdir -p $ft_nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $ft_nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_xvector_from_wav.py $nnet_type \ + --cfg $ft_nnet_base_cfg $ft_nnet_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --in-model-file $nnet \ + --trainer.exp-path $ft_nnet_dir \ + --num-gpus $ngpu \ + +fi +exit + # Network Training if [ $stage -le 1 ]; then diff --git a/egs/sre21-av-a/v1.16k/run_012_finetune_xvector.sh b/egs/sre21-av-a/v1.16k/run_012_finetune_xvector.sh deleted file mode 100755 index 58a3fdc9..00000000 --- a/egs/sre21-av-a/v1.16k/run_012_finetune_xvector.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash -# Copyright -# 2019 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -ngpu=4 -config_file=default_config.sh -resume=false -interactive=false -num_workers=3 - -. parse_options.sh || exit 1; -. $config_file -. 
datapath.sh - -batch_size=$(($ft_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $ft_eff_batch_size $ft_min_chunk $ft_max_chunk | awk '{ print int($2/($1*$4*2/($3+$4))+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) -list_dir=data/${nnet_data}_proc_audio_no_sil - -args="" -if [ "$resume" == "true" ];then - args="--resume" -fi - -if [ "$interactive" == "true" ];then - export cuda_cmd=run.pl -fi - -# Network Training -if [ $stage -le 1 ]; then - mkdir -p $ft_nnet_dir/log - $cuda_cmd --gpu $ngpu $ft_nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-finetune-xvec-from-wav.py --feats $feat_config $aug_opt \ - --audio-path $list_dir/wav.scp \ - --time-durs-file $list_dir/utt2dur \ - --train-list $list_dir/lists_xvec/train.scp \ - --val-list $list_dir/lists_xvec/val.scp \ - --class-file $list_dir/lists_xvec/class2int \ - --min-chunk-length $ft_min_chunk --max-chunk-length $ft_max_chunk \ - --iters-per-epoch $ft_ipe \ - --batch-size $batch_size \ - --num-workers $num_workers $ft_opt_opt $ft_lrs_opt \ - --grad-acc-steps $grad_acc_steps \ - --epochs $ft_nnet_num_epochs \ - --cos-scale $s --margin $ft_margin --margin-warmup-epochs $ft_margin_warmup \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --in-model-path $nnet \ - --train-mode ft-full \ - --exp-path $ft_nnet_dir $args - -fi - - diff --git a/egs/sre21-av-a/v1.16k/run_014_train_lid.sh b/egs/sre21-av-a/v1.16k/run_014_train_lid.sh index 6251de97..35d2c0bc 100755 --- a/egs/sre21-av-a/v1.16k/run_014_train_lid.sh +++ b/egs/sre21-av-a/v1.16k/run_014_train_lid.sh @@ -10,19 +10,17 @@ set -e stage=1 ngpu=4 config_file=default_config.sh -resume=false interactive=false -num_workers=8 -lid_ipe=1 +num_workers="" + . parse_options.sh || exit 1; . $config_file . 
datapath.sh list_dir=data/train_lid_proc_audio_no_sil -args="" -if [ "$resume" == "true" ];then - args="--resume" +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" fi if [ "$interactive" == "true" ];then @@ -33,22 +31,20 @@ lid_nnet_dir=exp/lid_nnets/lresnet34_lid_v1 # Network Training if [ $stage -le 1 ]; then - train_exec=torch-train-resnet-xvec-from-wav.py mkdir -p $lid_nnet_dir/log $cuda_cmd \ --gpu $ngpu $lid_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - $train_exec --cfg conf/lresnet34_lid_v1.yaml \ - --audio-path $list_dir/wav.scp \ - --time-durs-file $list_dir/utt2dur \ - --train-list $list_dir/lists_train_lid/train.scp \ - --val-list $list_dir/lists_train_lid/val.scp \ - --class-file $list_dir/lists_train_lid/class2int \ - --iters-per-epoch $lid_ipe \ - --num-workers $num_workers \ - --num-gpus $ngpu \ - --exp-path $lid_nnet_dir $args - + train_xvector_from_wav.py resnet \ + --cfg conf/train_lresnet34_lid_v1.yaml \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_train_lid/train.scp \ + --data.train.dataset.class-file $list_dir/lists_train_lid/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_train_lid/val.scp \ + --trainer.exp-path $lid_nnet_dir $extra_args \ + --num-gpus $ngpu fi -exit diff --git a/egs/sre21-av-a/v1.16k/run_041_eval_be_v2.sh b/egs/sre21-av-a/v1.16k/run_041_eval_be_v2.sh index 0941951f..73cb9a3d 100755 --- a/egs/sre21-av-a/v1.16k/run_041_eval_be_v2.sh +++ b/egs/sre21-av-a/v1.16k/run_041_eval_be_v2.sh @@ -195,7 +195,7 @@ if [ $stage -le 5 ]; then #SRE superset and 16 echo "SRE Superset Dev" steps_be/eval_be_plda_snorm_v2_cts.sh \ - --cmd "$train_cmd --mem 8G" \ + --cmd "$train_cmd --mem 12G" \ --plda_type $plda_type --ncoh $ncoh --num-parts 100 \ data/sre_cts_superset_16k_dev/trials \ data/sre_cts_superset_16k_dev/utt2enroll \ diff --git a/egs/sre21-av-a/v1.8k/README.md b/egs/sre21-av-a/v1.8k/README.md index a105128c..b55f9bf0 100644 --- a/egs/sre21-av-a/v1.8k/README.md +++ b/egs/sre21-av-a/v1.8k/README.md @@ -10,6 +10,20 @@ copy the utt2est_lang files from the 16k data dirs to the VoxCeleb and SRE21 dat This recipe is based on these works ``` +@inproceedings{Villalba2022, +author = {Jes\'us Villalba and Bengt J Borgstrom and Saurabh Kataria and Magdalena Rybicka and Carlos D Castillo and Jaejin Cho and L. Paola García-Perera and Pedro A. 
Torres-Carrasquillo and Najim Dehak}, +city = {ISCA}, +doi = {10.21437/Odyssey.2022-30}, +issue = {July}, +journal = {The Speaker and Language Recognition Workshop (Odyssey 2022)}, +month = {6}, +pages = {213-220}, +publisher = {ISCA}, +title = {Advances in Cross-Lingual and Cross-Source Audio-Visual Speaker Recognition: The JHU-MIT System for NIST SRE21}, +url = {https://www.isca-speech.org/archive/odyssey_2022/villalba22b_odyssey.html}, +year = {2022}, +} + @inproceedings{Villalba2020, address = {Tokyo, Japan}, author = {Villalba, Jes{\'{u}}s and Garcia-Romero, Daniel and Chen, Nanxin and Sell, Gregory and Borgstrom, Jonas and McCree, Alan and {Garcia Perera}, Leibny Paola and Kataria, Saurabh and Nidadavolu, Phani Sankar and Torres-Carrasquiilo, Pedro and Dehak, Najim}, @@ -91,8 +105,6 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs - `run_011_train_xvector.sh` - Trains the x-vector network on 4sec chunks - - - `run_012_finetune_xvector.sh` - Fine-tune x-vector network on 10-15 secs utts - `run_030_extract_xvectors.sh` @@ -111,4 +123,39 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs ## Results -TODO +The back-end used for these results is: +- back-end V2 (run_041_eval_be_v2.sh) +- Without S-Norm +- Scores are calibrated as indicated in the paper. + +## SRE16 Eval40% YUE + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 1.922 | 0.154 | 0.200 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26s8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.168 | 0.127 | 0.134 | + + +## SRE-CTS Superset dev set + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 1.39 | 0.072 | 0.095 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26s8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.175 | 0.057 | 0.069 | + + +## SRE21 Audio Dev (official scoring tool) + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 6.65 | 0.418 | 0.436 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26s8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 3.73 | 0.319 | 0.325 | + + +## SRE21 Audio Eval (official scoring tool) + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 5.44 | 0.388 | 0.390 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26s8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 4.21 | 0.356 | 0.377 | + diff --git a/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..bc311234 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml @@ -0,0 +1,104 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + multilayer: true + multilayer_concat: true + endpoint_channels: 8192 + dropout_rate: 0.0 + hid_act: relu6 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 30000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..031e9ca3 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + 
lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..416926d0 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml @@ -0,0 +1,82 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 64 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 35000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..16203033 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + 
min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 diff --git a/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..2d74799c --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml @@ -0,0 +1,83 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_type: tseres2net50 + in_channels: 1 + in_feats: 64 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 1.625 + res2net_scale: 4 + se_r: 256 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 40000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..f34b4896 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: 
true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.8k/default_config.sh b/egs/sre21-av-a/v1.8k/default_config.sh index 91a20745..74b76b0a 120000 --- a/egs/sre21-av-a/v1.8k/default_config.sh +++ b/egs/sre21-av-a/v1.8k/default_config.sh @@ -1 +1 @@ -global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh \ No newline at end of file +global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh \ No newline at end of file diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh index 69ad025b..65c2c924 100644 --- a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh +++ b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -9,53 +9,19 @@ vad_config=conf/vad_8k.yaml # x-vector training nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 lr=0.02 nnet_type=resnet1d -block_type=seres2bn # squeeze-excitation res2net bottleneck -channels=2048 -ep_channels=8192 -width_factor=1 -scale=8 -se_r=16 dropout=0 -attstats_inner=128 embed_dim=256 s=30 margin_warmup=20 margin=0.3 -nnet_opt="--resnet_enc.in-feats 64 \ - --resnet_enc.in-conv-channels $channels \ - --resnet_enc.in-kernel-size 5 \ - --resnet_enc.in-stride 1 \ - --resnet_enc.resb-type $block_type \ - --resnet_enc.resb-repeats 1 1 1 1 \ - --resnet_enc.resb-channels $channels \ - --resnet_enc.resb-kernel-sizes 3 \ - --resnet_enc.resb-dilations 2 3 4 5 \ - --resnet_enc.resb-strides 1 \ - --resnet_enc.res2net-width-factor $width_factor \ - --resnet_enc.res2net-scale $scale \ - --resnet_enc.se-r $se_r \ - --resnet_enc.multilayer \ - --resnet_enc.multilayer-concat \ - --resnet_enc.endpoint-channels $ep_channels \ - --pool_net.pool-type ch-wise-att-mean+stddev \ - --pool_net.inner-feats $attstats_inner \ - --embed-dim $embed_dim" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 30000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - +nnet_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml nnet_name=${feat_type}_ecapatdnn2048x4_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 nnet_num_epochs=75 nnet_dir=exp/xvector_nnets/$nnet_name @@ -63,18 +29,14 @@ nnet=$nnet_dir/model_ep0070.pth nnet=$nnet_dir/swa_model_ep0076.pth # xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 ft_eff_batch_size=128 # effective batch size ft_min_chunk=10 ft_max_chunk=15 -ft_ipe=1 ft_lr=0.01 ft_nnet_num_epochs=15 ft_margin=0.5 -ft_margin_warmup=3 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 
0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0007.pth @@ -82,8 +44,10 @@ ft_nnet=$ft_nnet_dir/model_ep0007.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh new file mode 100644 index 00000000..824361d0 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -0,0 +1,48 @@ +# Res2Net50 w26s8 x-vector with mixed precision training + +# acoustic features +feat_config=conf/fbank64_stmn_8k.yaml +feat_type=fbank64_stmn + +#vad +vad_config=conf/vad_8k.yaml + +# x-vector training +nnet_data=voxcelebcat_sre_alllangs_mixfs + +eff_batch_size=512 # effective batch size +min_chunk=4 +max_chunk=4 +lr=0.02 + +nnet_type=resnet +dropout=0 +embed_dim=256 + +s=30 +margin_warmup=20 +margin=0.3 +attstats_inner=128 + +nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml +nnet_name=${feat_type}_res2net50w26s8_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0070.pth +nnet=$nnet_dir/swa_model_ep0076.pth + +# xvector full net finetuning with out-of-domain +ft_eff_batch_size=128 # effective batch size +ft_min_chunk=10 +ft_max_chunk=10 +ft_lr=0.01 +ft_margin=0.5 + +ft_nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml +ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 +ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name +ft_nnet=$ft_nnet_dir/model_ep0007.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +plda_type=splda diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh deleted file mode 100644 index e1a923d7..00000000 --- a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ /dev/null @@ -1,68 +0,0 @@ -# LResNet34 x-vector with mixed precision training - -# acoustic features -feat_config=conf/fbank64_stmn_8k.yaml -feat_type=fbank64_stmn - -#vad -vad_config=conf/vad_8k.yaml - -# x-vector training -nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=16 -eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 -lr=0.02 - -nnet_type=res2net50 -dropout=0 -embed_dim=256 -width_factor=3.25 -scale=8 -ws_tag=w26s8 - -s=30 -margin_warmup=20 -margin=0.3 -attstats_inner=128 - -nnet_opt="--resnet-type $nnet_type --in-feats 64 --in-channels 1 --in-kernel-size 3 --in-stride 1 
--no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --pool_net.pool-type ch-wise-att-mean+stddev --pool_net.inner-feats $attstats_inner" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 35000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=75 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth -nnet=$nnet_dir/swa_model_ep0076.pth - -# xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 -ft_eff_batch_size=128 # effective batch size -ft_min_chunk=10 -ft_max_chunk=10 -ft_ipe=1 -ft_lr=0.01 -ft_nnet_num_epochs=15 -ft_margin=0.5 -ft_margin_warmup=3 - -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" -ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 -ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name -ft_nnet=$ft_nnet_dir/model_ep0007.pth - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=0 -plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 - diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh new file mode 100644 index 00000000..58010842 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -0,0 +1,58 @@ +# Time SE Res2Net50 w26s4 x-vector with mixed precision training + +# acoustic features +feat_config=conf/fbank64_stmn_8k.yaml +feat_type=fbank64_stmn + +#vad +vad_config=conf/vad_8k.yaml + +# x-vector training +nnet_data=voxcelebcat_sre_alllangs_mixfs + +eff_batch_size=512 # effective batch size +min_chunk=4 +max_chunk=4 +lr=0.02 + +nnet_type=resnet +dropout=0 +embed_dim=256 +se_r=256 + +s=30 +margin_warmup=20 +margin=0.3 +attstats_inner=128 + +nnet_base_cfg=conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml +nnet_name=${feat_type}_tseres2net50w26s4_r${se_r}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0075.pth +nnet=$nnet_dir/swa_model_ep0076.pth + +# xvector full net finetuning with out-of-domain +ft_batch_size_1gpu=8 +ft_eff_batch_size=128 # effective batch size +ft_min_chunk=10 +ft_max_chunk=10 +ft_ipe=1 +ft_lr=0.01 +ft_margin=0.5 + +ft_nnet_base_cfg=conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml +ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 +ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name +ft_nnet=$ft_nnet_dir/model_ep0007.pth + + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 
]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda + diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh deleted file mode 100644 index 9f5c8e70..00000000 --- a/egs/sre21-av-a/v1.8k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ /dev/null @@ -1,76 +0,0 @@ -# Time SE Res2Net50 w26s4 x-vector with mixed precision training - -# acoustic features -feat_config=conf/fbank80_stmn_8k.yaml -feat_type=fbank80_stmn - -#vad -vad_config=conf/vad_8k.yaml - -# x-vector training -nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=24 -eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 -lr=0.02 - -nnet_type=tseres2net50 -dropout=0 -embed_dim=256 -width_factor=1.625 -scale=4 -ws_tag=w26s4 -se_r=256 - -s=30 -margin_warmup=20 -margin=0.3 -attstats_inner=128 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --se-r $se_r --pool_net.pool-type ch-wise-att-mean+stddev --pool_net.inner-feats $attstats_inner" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_r${se_r}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=75 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0075.pth -nnet=$nnet_dir/swa_model_ep0076.pth - -# xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 -ft_eff_batch_size=128 # effective batch size -ft_min_chunk=10 -ft_max_chunk=15 -ft_ipe=1 -ft_lr=0.01 -ft_nnet_num_epochs=21 -ft_nnet_num_epochs=45 -ft_margin=0.5 -ft_margin_warmup=3 - -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" -ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 -ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name -ft_nnet=$ft_nnet_dir/model_ep0014.pth - - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=0 -if [ $plda_num_augs -eq 0 ]; then - plda_data=voxceleb2cat_train -else - plda_data=voxceleb2cat_train_augx${plda_num_augs} -fi -plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 - diff --git a/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh b/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh index 9891e812..d7ea8ed0 100755 --- a/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh +++ b/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh @@ -10,22 +10,17 @@ set -e stage=1 ngpu=4 config_file=default_config.sh -resume=false 
interactive=false -num_workers=8 +num_workers="" . parse_options.sh || exit 1; . $config_file . datapath.sh -batch_size=$(($batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) list_dir=data/${nnet_data}_proc_audio_no_sil -args="" -if [ "$resume" == "true" ];then - args="--resume" +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" fi if [ "$interactive" == "true" ];then @@ -35,6 +30,49 @@ fi # Network Training if [ $stage -le 1 ]; then + mkdir -p $nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_xvector_from_wav.py $nnet_type \ + --cfg $nnet_base_cfg $nnet_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --trainer.exp-path $nnet_dir \ + --num-gpus $ngpu \ + +fi + +# Large Margin Fine-tuning +if [ $stage -le 2 ]; then + mkdir -p $ft_nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $ft_nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_xvector_from_wav.py $nnet_type \ + --cfg $ft_nnet_base_cfg $ft_nnet_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --in-model-file $nnet \ + --trainer.exp-path $ft_nnet_dir \ + --num-gpus $ngpu \ + +fi +exit + +# Network Training +if [ $stage -le 1 ]; then + if [[ ${nnet_type} =~ resnet1d ]]; then train_exec=torch-train-resnet1d-xvec-from-wav.py elif [[ ${nnet_type} =~ resnet ]] || [[ ${nnet_type} =~ resnext ]] || [[ ${nnet_type} =~ res2net ]] || [[ ${nnet_type} =~ res2next ]]; then diff --git a/egs/sre21-av-a/v1.8k/run_012_finetune_xvector.sh b/egs/sre21-av-a/v1.8k/run_012_finetune_xvector.sh deleted file mode 100755 index 58a3fdc9..00000000 --- a/egs/sre21-av-a/v1.8k/run_012_finetune_xvector.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash -# Copyright -# 2019 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -ngpu=4 -config_file=default_config.sh -resume=false -interactive=false -num_workers=3 - -. parse_options.sh || exit 1; -. $config_file -. 
datapath.sh - -batch_size=$(($ft_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $ft_eff_batch_size $ft_min_chunk $ft_max_chunk | awk '{ print int($2/($1*$4*2/($3+$4))+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) -list_dir=data/${nnet_data}_proc_audio_no_sil - -args="" -if [ "$resume" == "true" ];then - args="--resume" -fi - -if [ "$interactive" == "true" ];then - export cuda_cmd=run.pl -fi - -# Network Training -if [ $stage -le 1 ]; then - mkdir -p $ft_nnet_dir/log - $cuda_cmd --gpu $ngpu $ft_nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-finetune-xvec-from-wav.py --feats $feat_config $aug_opt \ - --audio-path $list_dir/wav.scp \ - --time-durs-file $list_dir/utt2dur \ - --train-list $list_dir/lists_xvec/train.scp \ - --val-list $list_dir/lists_xvec/val.scp \ - --class-file $list_dir/lists_xvec/class2int \ - --min-chunk-length $ft_min_chunk --max-chunk-length $ft_max_chunk \ - --iters-per-epoch $ft_ipe \ - --batch-size $batch_size \ - --num-workers $num_workers $ft_opt_opt $ft_lrs_opt \ - --grad-acc-steps $grad_acc_steps \ - --epochs $ft_nnet_num_epochs \ - --cos-scale $s --margin $ft_margin --margin-warmup-epochs $ft_margin_warmup \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --in-model-path $nnet \ - --train-mode ft-full \ - --exp-path $ft_nnet_dir $args - -fi - - diff --git a/egs/sre21-av-a/v1.8k/run_040_eval_be_v1.sh b/egs/sre21-av-a/v1.8k/run_040_eval_be_v1.sh index a55761ae..92cbd887 100755 --- a/egs/sre21-av-a/v1.8k/run_040_eval_be_v1.sh +++ b/egs/sre21-av-a/v1.8k/run_040_eval_be_v1.sh @@ -153,7 +153,7 @@ fi if [ $stage -le 4 ];then local/calibrate_sre21av_v1.sh --cmd "$train_cmd" $score_plda_dir local/score_sre16.sh data/sre16_eval40_yue_test eval40_yue ${score_plda_dir}_cal_v1 - local/score_sre_cts_superset.sh data/sre_cts_superset_16k_dev ${score_plda_dir}_cal_v1 + local/score_sre_cts_superset.sh data/sre_cts_superset_8k_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_dev_test audio_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio-visual_dev_test audio-visual_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_eval_test audio_eval ${score_plda_dir}_cal_v1 diff --git a/egs/sre21-av-a/v1.8k/run_041_eval_be_v2.sh b/egs/sre21-av-a/v1.8k/run_041_eval_be_v2.sh index f8eae0a1..6890eba9 100755 --- a/egs/sre21-av-a/v1.8k/run_041_eval_be_v2.sh +++ b/egs/sre21-av-a/v1.8k/run_041_eval_be_v2.sh @@ -187,7 +187,7 @@ fi if [ $stage -le 4 ];then local/calibrate_sre21av_v1.sh --cmd "$train_cmd" $score_plda_dir local/score_sre16.sh data/sre16_eval40_yue_test eval40_yue ${score_plda_dir}_cal_v1 - local/score_sre_cts_superset.sh data/sre_cts_superset_16k_dev ${score_plda_dir}_cal_v1 + local/score_sre_cts_superset.sh data/sre_cts_superset_8k_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_dev_test audio_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio-visual_dev_test audio-visual_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_eval_test audio_eval ${score_plda_dir}_cal_v1 @@ -311,7 +311,7 @@ fi if [ $stage -le 7 ];then local/calibrate_sre21av_v1.sh --cmd "$train_cmd" $score_plda_dir local/score_sre16.sh data/sre16_eval40_yue_test eval40_yue ${score_plda_dir}_cal_v1 - local/score_sre_cts_superset.sh data/sre_cts_superset_16k_dev ${score_plda_dir}_cal_v1 + local/score_sre_cts_superset.sh data/sre_cts_superset_8k_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_dev_test audio_dev 
${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio-visual_dev_test audio-visual_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_eval_test audio_eval ${score_plda_dir}_cal_v1 diff --git a/egs/sre21-av-a/v1.8k/run_042_eval_be_v3.sh b/egs/sre21-av-a/v1.8k/run_042_eval_be_v3.sh index 263d7bbe..35afbb27 100755 --- a/egs/sre21-av-a/v1.8k/run_042_eval_be_v3.sh +++ b/egs/sre21-av-a/v1.8k/run_042_eval_be_v3.sh @@ -185,7 +185,7 @@ fi if [ $stage -le 4 ];then local/calibrate_sre21av_v1.sh --cmd "$train_cmd" $score_plda_dir local/score_sre16.sh data/sre16_eval40_yue_test eval40_yue ${score_plda_dir}_cal_v1 - local/score_sre_cts_superset.sh data/sre_cts_superset_16k_dev ${score_plda_dir}_cal_v1 + local/score_sre_cts_superset.sh data/sre_cts_superset_8k_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_dev_test audio_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio-visual_dev_test audio-visual_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_eval_test audio_eval ${score_plda_dir}_cal_v1 diff --git a/egs/voxceleb/adv.v1.1/run_005_train_victim_xvector.sh b/egs/voxceleb/adv.v1.1/run_005_train_victim_xvector.sh index 37a91211..aa779902 100755 --- a/egs/voxceleb/adv.v1.1/run_005_train_victim_xvector.sh +++ b/egs/voxceleb/adv.v1.1/run_005_train_victim_xvector.sh @@ -44,11 +44,11 @@ if [ $stage -le 1 ]; then --gpu $ngpu $nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $nnet_type --cfg $nnet_cfg $nnet_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_dir \ diff --git a/egs/voxceleb/adv.v1.1/run_007_train_transfer_xvector.sh b/egs/voxceleb/adv.v1.1/run_007_train_transfer_xvector.sh index 70bab280..420ac59d 100755 --- a/egs/voxceleb/adv.v1.1/run_007_train_transfer_xvector.sh +++ b/egs/voxceleb/adv.v1.1/run_007_train_transfer_xvector.sh @@ -54,11 +54,11 @@ if [ $stage -le 1 ]; then --gpu $ngpu $nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $nnet_type --cfg $nnet_cfg $nnet_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_dir \ diff --git a/egs/voxceleb/adv.v1.1/run_008_adv_finetune_victim_xvector.sh b/egs/voxceleb/adv.v1.1/run_008_adv_finetune_victim_xvector.sh index 12f1e5fd..4f2c137b 100755 --- a/egs/voxceleb/adv.v1.1/run_008_adv_finetune_victim_xvector.sh +++ b/egs/voxceleb/adv.v1.1/run_008_adv_finetune_victim_xvector.sh @@ -53,11 +53,11 @@ if [ $stage -le 1 ]; then 
--gpu $ngpu $nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ adv_finetune_xvector_from_wav.py $nnet_type --cfg $nnet_cfg $nnet_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_dir \ diff --git a/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh b/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh index 971b88a3..a1acb1f6 100755 --- a/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh +++ b/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh @@ -40,11 +40,11 @@ if [ $stage -le 1 ]; then --gpu $ngpu $nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $nnet_type --cfg $nnet_cfg \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_dir \ diff --git a/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh b/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh index 71c0c89f..b453260f 100755 --- a/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh +++ b/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh @@ -46,11 +46,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --data.train.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.train.dataset.recordings-file $list_dir/trainval_wav.scp \ --data.train.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.train.dataset.segments-file $list_dir/train_utt2attack \ --data.train.dataset.class-file $list_dir/class_file \ - --data.val.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ diff --git a/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh b/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh index a928ae29..de811505 100755 --- a/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh +++ b/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh @@ -46,11 +46,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --data.train.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.train.dataset.recordings-file $list_dir/trainval_wav.scp \ 
--data.train.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.train.dataset.segments-file $list_dir/train_utt2attack \ --data.train.dataset.class-file $list_dir/class_file \ - --data.val.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ diff --git a/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh b/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh index bed225a3..aa17a1ae 100755 --- a/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh +++ b/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh @@ -48,11 +48,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --data.train.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.train.dataset.recordings-file $list_dir/trainval_wav.scp \ --data.train.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.train.dataset.segments-file $list_dir/train_utt2attack \ --data.train.dataset.class-file $list_dir/class_file \ - --data.val.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ diff --git a/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh b/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh index 55cb8459..3b93fabd 100755 --- a/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh +++ b/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh @@ -49,11 +49,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --data.train.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.train.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.train.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.train.dataset.segments-file $list_someknown_dir/train_utt2attack \ --data.train.dataset.class-file $list_someknown_dir/class_file \ - --data.val.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_someknown_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ @@ -293,7 +293,7 @@ if [ $stage -le 13 ]; then awk '!/benign/' $list_someknown_dir/train/utt2spk > $list_someknown_dir/train_nobenign/utt2spk steps_backend/train_be_v1.sh --cmd "$train_cmd" \ --plda-type splda \ - --y-dim 6 \ + --y-dim 5 \ $sign_dir/train/xvector.scp \ $list_someknown_dir/train_nobenign \ $be_dir diff --git a/egs/voxceleb/adv.v2/run_032_snr_verif.sh b/egs/voxceleb/adv.v2/run_032_snr_verif.sh index 3886c339..12d42c99 100755 --- a/egs/voxceleb/adv.v2/run_032_snr_verif.sh +++ b/egs/voxceleb/adv.v2/run_032_snr_verif.sh @@ -52,11 +52,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV 
--num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --data.train.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.train.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.train.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.train.dataset.segments-file $list_someknown_dir/train_utt2attack \ --data.train.dataset.class-file $list_someknown_dir/class_file \ - --data.val.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_someknown_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ diff --git a/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh b/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh index 392bffb5..cbfaaa81 100755 --- a/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh +++ b/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh @@ -53,11 +53,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --data.train.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.train.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.train.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.train.dataset.segments-file $list_someknown_dir/train_utt2attack \ --data.train.dataset.class-file $list_someknown_dir/class_file \ - --data.val.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_someknown_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py index 85e82149..48094d0f 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py @@ -15,7 +15,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.np.pdfs import PLDA from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py index d5cd6a55..49720cb5 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py @@ -19,7 +19,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F from hyperion.np.transforms import TransformList diff --git a/egs/voxceleb/v1.1/README.md b/egs/voxceleb/v1.1/README.md index 23e0a26f..efdb77c1 100644 --- a/egs/voxceleb/v1.1/README.md +++ b/egs/voxceleb/v1.1/README.md @@ -1,5 +1,7 @@ # VoxCeleb V1.1 +This recipe will be deprecated, use V1.2 + Recipe for the VoxCeleb Speaker Verification Task ## Differences w.r.t 
VoxCeleb V1 recipe @@ -104,15 +106,18 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.77 | 0.48 | 0.077 | | | | | Cosine + AS-Norm | 0.68 | 0.040 | 0.062| | | | | Cosine + QMF | 0.62 | 0.036 | 0.063 | -| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | -| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.78 | 0.053 | 0.082 | +| | | | Cosine + AS-Norm | 0.70 | 0.043 | 0.076 | +| | | | Cosine + QMF | 0.63 | 0.042 | 0.071 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.78 | 0.051 | 0.095 | +| | | | Cosine + AS-Norm | 0.72 | 0.046 | 0.070 | +| | | | Cosine + QMF | 0.67 | 0.039 | 0.074 | | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.56 | 0.040 | 0.065 | -| | | | Cosine + AS-Norm | 0.52 | 0.33 | 0.045 | +| | | | Cosine + AS-Norm | 0.52 | 0.033 | 0.045 | | | | | Cosine + QMF | 0.45 | 0.027 | 0.043 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.60 | 0.043 | 0.071 | +| | | | Cosine + AS-Norm | 0.53 | 0.034 | 0.063 | +| | | | Cosine + QMF | 0.49 | 0.033 | 0.054 | ### VoxCeleb 1 Entire-Clean trial list @@ -134,15 +139,18 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.83 | 0.053 | 0.098 | | | | | Cosine + AS-Norm | 0.78 | 0.047| 0.085 | | | | | Cosine + QMF | 0.74 | 0.045 | 0.081 | -| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | -| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.91 | 0.057 | 0.100 | +| | | | Cosine + AS-Norm | 0.85 | 0.052 | 0.089 | +| | | | Cosine + QMF | 0.81 | 0.049 | 0.085 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.94 | 0.059 | 0.105 | +| | | | Cosine + AS-Norm | 0.88 | 0.053 | 0.093 | +| | | | Cosine + QMF | 0.84 | 0.051 | 0.088 | | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.71 | 0.044 | 0.076| | | | | Cosine + AS-Norm | 0.66 | 0.040 | 0.069 | | | | | Cosine + QMF | 0.63 | 0.037 | 0.067 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.75 | 0.047 | 0.077 | +| | | | Cosine + AS-Norm | 0.70 | 0.042 | 0.072 | +| | | | Cosine + QMF | 0.68 | 0.040 | 0.069 | ### VoxCeleb 1 Hard-Clean trial list @@ -163,15 +171,19 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.59 | 0.096 | 0.165 | | | | | Cosine + AS-Norm | 1.41 | 0.083 | 0.143 | | | | | Cosine + QMF | 1.34 | 0.079 | 0.136 | -| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | -| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.75 | 0.104 | 0.171 | +| | | | Cosine + AS-Norm | 1.56 | 0.091 | 0.152 | +| | | | Cosine + QMF | 1.50 | 0.087 | 0.145 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.76 | 0.104 | 0.174 | +| | | | Cosine + AS-Norm | 1.58 | 0.092 | 0.152 | +| | | | Cosine + QMF | 1.51 | 0.089 | 0.149 | | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.30 | 0.076 | 0.125 | | | | | Cosine + AS-Norm | 1.15 | 0.066 | 0.109 | | | | | Cosine + QMF | 1.11 | 0.065 | 0.103 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.41 | 0.081 | 0.132 | +| | | | Cosine + AS-Norm | 1.28 | 0.071 | 0.116 | +| | | | Cosine + QMF | 1.21 | 0.069 | 0.113 | + ### VoxSRC2022 dev @@ -192,15 +204,19 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.25 | 0.136 | 0.239 | | | | | Cosine + AS-Norm | 1.99 | 0.127 | 0.232 | | | | | Cosine + QMF | 1.87 | 0.119 | 0.216 | -| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | -| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.36 | 0.153 | 0.259 | +| | | | Cosine + AS-Norm | 2.18 | 0.139 | 0.249 | +| | | | Cosine + QMF | 2.08 | 0.128 | 0.222 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.49 | 0.158 | 0.265 | +| | | | Cosine + AS-Norm | 2.29 | 0.145 | 0.251 | +| | | | Cosine + QMF | 2.17 | 0.133 | 0.230 | | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.92 | 0.124 | 0.208 | | | | | Cosine + AS-Norm | 1.71 | 0.109 | 0.212 | | | | | Cosine + QMF | 1.62 | 0.103 | 0.192 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.96 | 0.124 | 0.211 | +| | | | Cosine + AS-Norm | 1.79 | 0.118 | 0239 | +| | | | Cosine + QMF | 1.68 | 0.114 | 0.216 | + ## Results before 2023 diff --git a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml index 9e302200..1016087d 100644 --- a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml @@ -68,5 +68,5 @@ trainer: grad_clip: 250 use_amp: true log_interval: 1000 - epochs: 35 + epochs: 30 eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..5dda7913 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 1.625 + res2net_scale: 4 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..469e166b --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + 
num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..e98d6c13 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 30 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..5c9af011 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + # dropout_rate: 0.0 + dropout_rate: 0.2 +trainer: + optim: + opt_type: sgd 
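+    # Stage-2 fine-tuning optimizer (descriptive note): plain SGD with momentum and exponential LR decay; SWA averaging starts at epoch 10 of 15 (see the swa_* keys below).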
+ lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage1_v3.0.yaml index 1d864080..31dcaf9a 100644 --- a/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage1_v3.0.yaml @@ -47,7 +47,7 @@ model: dropout_rate: 0.1 norm_before: false hid_act: swish - se_r: 128 + se_r: 256 trainer: optim: opt_type: adam @@ -67,5 +67,5 @@ trainer: grad_clip: 250 use_amp: true log_interval: 1000 - epochs: 35 + epochs: 25 eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/vad_16k.yaml b/egs/voxceleb/v1.1/conf/vad_16k.yaml index 5fb0111c..a8d7b4d4 100644 --- a/egs/voxceleb/v1.1/conf/vad_16k.yaml +++ b/egs/voxceleb/v1.1/conf/vad_16k.yaml @@ -6,3 +6,4 @@ vad_energy_threshold: 5.5 vad_energy_mean_scale: 0.5 vad_proportion_threshold: 0.12 vad_frames_context: 2 +wav_scale: 32767 diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh index 32c91da2..fdb3147f 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh @@ -26,8 +26,8 @@ nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth # back-end do_plda=false -do_snorm=false #true -do_qmf=false #true +do_snorm=true +do_qmf=true do_voxsrc22=true plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh index 62b02c28..7aa61f00 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh @@ -26,8 +26,8 @@ nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth # back-end do_plda=false -do_snorm=true -do_qmf=true +do_snorm=false #true +do_qmf=false #true do_voxsrc22=true plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34.v3.0.sh index c49936e0..b194d1bd 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34.v3.0.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34.v3.0.sh @@ -16,7 +16,7 @@ nnet_name=${feat_type}_resnet34.v3.0 nnet_s1_base_cfg=conf/train_resnet34_xvec_stage1_v3.0.yaml nnet_s1_name=$nnet_name.s1 -nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name.kk2 nnet_s1=$nnet_s1_dir/model_ep0035.pth nnet_s2_base_cfg=conf/train_resnet34_xvec_stage2_v3.0.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh index 42af2d52..00622772 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh @@ -17,7 +17,7 @@ nnet_name=${feat_type}_tseresnet34.v3.0 nnet_s1_base_cfg=conf/train_tseresnet34_xvec_stage1_v3.0.yaml nnet_s1_name=$nnet_name.s1 
nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0035.pth +nnet_s1=$nnet_s1_dir/model_ep0025.pth nnet_s2_base_cfg=conf/train_tseresnet34_xvec_stage2_v3.0.yaml nnet_s2_name=${nnet_name}.s2 diff --git a/egs/voxceleb/v1.1/local b/egs/voxceleb/v1.1/local deleted file mode 120000 index 740b697d..00000000 --- a/egs/voxceleb/v1.1/local +++ /dev/null @@ -1 +0,0 @@ -../v1/local/ \ No newline at end of file diff --git a/egs/voxceleb/v1/local/attack_analysis.py b/egs/voxceleb/v1.1/local/attack_analysis.py similarity index 100% rename from egs/voxceleb/v1/local/attack_analysis.py rename to egs/voxceleb/v1.1/local/attack_analysis.py diff --git a/egs/voxceleb/v1/local/attack_analysis.sh b/egs/voxceleb/v1.1/local/attack_analysis.sh similarity index 100% rename from egs/voxceleb/v1/local/attack_analysis.sh rename to egs/voxceleb/v1.1/local/attack_analysis.sh diff --git a/egs/voxceleb/v1/local/calibrate_voxceleb1_o_clean.sh b/egs/voxceleb/v1.1/local/calibrate_voxceleb1_o_clean.sh similarity index 100% rename from egs/voxceleb/v1/local/calibrate_voxceleb1_o_clean.sh rename to egs/voxceleb/v1.1/local/calibrate_voxceleb1_o_clean.sh diff --git a/egs/voxceleb/v1.1/local/make_musan.py b/egs/voxceleb/v1.1/local/make_musan.py new file mode 100755 index 00000000..b0ae6846 --- /dev/null +++ b/egs/voxceleb/v1.1/local/make_musan.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +# Copyright 2015 David Snyder +# Copyright 2019 Johns Hopkins University (Jesus Villalba) (added fs support) +# Apache 2.0. +# +# This file is meant to be invoked by make_musan.sh. + +import os, sys + + +def process_music_annotations(path): + utt2spk = {} + utt2vocals = {} + lines = open(path, "r").readlines() + for line in lines: + utt, genres, vocals, musician = line.rstrip().split()[:4] + # For this application, the musican ID isn't important + utt2spk[utt] = utt + utt2vocals[utt] = vocals == "Y" + return utt2spk, utt2vocals + + +def prepare_music(root_dir, fs, use_vocals): + utt2vocals = {} + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + music_dir = os.path.join(root_dir, "music") + for root, dirs, files in os.walk(music_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + elif str(file) == "ANNOTATIONS": + utt2spk_part, utt2vocals_part = process_music_annotations(file_path) + utt2spk.update(utt2spk_part) + utt2vocals.update(utt2vocals_part) + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2vocals: + if utt in utt2wav: + if use_vocals or not utt2vocals[utt]: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In music directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + + +def prepare_speech(root_dir, fs): + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + speech_dir = os.path.join(root_dir, "speech") + for root, dirs, files in os.walk(speech_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str 
= "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In speech directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + + +def prepare_noise(root_dir, fs): + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + noise_dir = os.path.join(root_dir, "noise") + for root, dirs, files in os.walk(noise_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In noise directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + + +def main(): + in_dir = sys.argv[1] + fs = int(sys.argv[2]) + out_dir = sys.argv[3] + use_vocals = sys.argv[4] == "Y" + utt2spk_music, utt2wav_music = prepare_music(in_dir, fs, use_vocals) + utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, fs) + utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, fs) + utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise + utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise + wav_fi = open(os.path.join(out_dir, "wav.scp"), "w") + wav_fi.write(utt2wav) + utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), "w") + utt2spk_fi.write(utt2spk) + + +if __name__ == "__main__": + main() diff --git a/egs/voxceleb/v1.1/local/make_musan.sh b/egs/voxceleb/v1.1/local/make_musan.sh new file mode 100755 index 00000000..4a6d30f9 --- /dev/null +++ b/egs/voxceleb/v1.1/local/make_musan.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# Copyright 2015 David Snyder +# Copyright 2019 Johns Hopkins University (Jesus Villalba) (added fs support) +# Apache 2.0. +# +# This script, called by ../run.sh, creates the MUSAN +# data directory. The required dataset is freely available at +# http://www.openslr.org/17/ + +set -e +use_vocals='Y' + +. parse_options.sh || exit 1; + +if [ $# -ne 3 ];then + echo "Usage: $0 [options] "; + echo "e.g.: $0 /export/corpora/JHU/musan 8 data" + exit 1; +fi + +in_dir=$1 +fs=$2 +data_dir=$3 + +mkdir -p $data_dir/musan.tmp + +echo "Preparing ${data_dir}/musan..." 
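+# Build a Kaldi-style data dir (wav.scp, utt2spk) for the whole MUSAN corpus with make_musan.py;
+# the combined dir is then split into music/speech/noise subsets further below.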
+mkdir -p ${data_dir}/musan +local/make_musan.py ${in_dir} $fs ${data_dir}/musan ${use_vocals} + +utils/fix_data_dir.sh ${data_dir}/musan + +grep "music" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_music +grep "speech" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_speech +grep "noise" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_noise +utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_music \ + ${data_dir}/musan ${data_dir}/musan_music +utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_speech \ + ${data_dir}/musan ${data_dir}/musan_speech +utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_noise \ + ${data_dir}/musan ${data_dir}/musan_noise + +utils/fix_data_dir.sh ${data_dir}/musan_music +utils/fix_data_dir.sh ${data_dir}/musan_speech +utils/fix_data_dir.sh ${data_dir}/musan_noise + +rm -rf $data_dir/musan.tmp + diff --git a/egs/voxceleb/v1.1/local/make_rirs_data.sh b/egs/voxceleb/v1.1/local/make_rirs_data.sh new file mode 100755 index 00000000..c6652eda --- /dev/null +++ b/egs/voxceleb/v1.1/local/make_rirs_data.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# +# Copyright 2020 Johns Hopkins University (Jesus Villalba) +# +# Apache 2.0. +set -e + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 RIRS_NOISES/simulated_rirs/smallroom 16 data/rirs_smallroom" +fi + +rir_dir=$1 +fs=$2 +data_dir=$3 + +mkdir -p $data_dir + +rir_list=$rir_dir/rir_list +if [ "$fs" -eq 16 ];then + awk '{ key=$5; sub(/.*\//,"",key); print key,$5 }' $rir_list > $data_dir/wav.scp +else + awk '{ +key=$5; sub(/.*\//,"",key); +print key,"sox "$5" -r 8000 -t wav -b 16 -e signed-integer - |" }' \ + $rir_list > $data_dir/wav.scp +fi +awk '{ key=$5; sub(/.*\//,"",key); print key,$4 }' $rir_list > $data_dir/rir2room + diff --git a/egs/voxceleb/v1/local/make_some_figs.py b/egs/voxceleb/v1.1/local/make_some_figs.py similarity index 100% rename from egs/voxceleb/v1/local/make_some_figs.py rename to egs/voxceleb/v1.1/local/make_some_figs.py diff --git a/egs/voxceleb/v1/local/make_train_lists_sup_embed_with_augm.sh b/egs/voxceleb/v1.1/local/make_train_lists_sup_embed_with_augm.sh similarity index 100% rename from egs/voxceleb/v1/local/make_train_lists_sup_embed_with_augm.sh rename to egs/voxceleb/v1.1/local/make_train_lists_sup_embed_with_augm.sh diff --git a/egs/voxceleb/v1/local/make_trials_subset.py b/egs/voxceleb/v1.1/local/make_trials_subset.py similarity index 100% rename from egs/voxceleb/v1/local/make_trials_subset.py rename to egs/voxceleb/v1.1/local/make_trials_subset.py diff --git a/egs/voxceleb/v1/local/make_vox2_trials.py b/egs/voxceleb/v1.1/local/make_vox2_trials.py similarity index 100% rename from egs/voxceleb/v1/local/make_vox2_trials.py rename to egs/voxceleb/v1.1/local/make_vox2_trials.py diff --git a/egs/voxceleb/v1/local/make_voxceleb1_o.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_o.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_o.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_o.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_oeh.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_oeh.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_oeh.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_oeh.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_old.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_old.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_old.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_old.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_orig.pl 
b/egs/voxceleb/v1.1/local/make_voxceleb1_orig.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_orig.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_orig.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_orig_v2.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_orig_v2.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_orig_v2.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_orig_v2.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_v2.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_v2.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_v2.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_v2.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_v2_o.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_v2_o.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_v2_o.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_v2_o.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_v2_oeh.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_v2_oeh.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_v2_oeh.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_v2_oeh.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1cat.pl b/egs/voxceleb/v1.1/local/make_voxceleb1cat.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1cat.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1cat.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1cat_v2.pl b/egs/voxceleb/v1.1/local/make_voxceleb1cat_v2.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1cat_v2.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1cat_v2.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb2.pl b/egs/voxceleb/v1.1/local/make_voxceleb2.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb2.pl rename to egs/voxceleb/v1.1/local/make_voxceleb2.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb2cat.pl b/egs/voxceleb/v1.1/local/make_voxceleb2cat.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb2cat.pl rename to egs/voxceleb/v1.1/local/make_voxceleb2cat.pl diff --git a/egs/voxceleb/v1/local/prepare_voxsrc22_dev.py b/egs/voxceleb/v1.1/local/prepare_voxsrc22_dev.py similarity index 100% rename from egs/voxceleb/v1/local/prepare_voxsrc22_dev.py rename to egs/voxceleb/v1.1/local/prepare_voxsrc22_dev.py diff --git a/egs/voxceleb/v1/local/prepare_voxsrc22_test.py b/egs/voxceleb/v1.1/local/prepare_voxsrc22_test.py similarity index 100% rename from egs/voxceleb/v1/local/prepare_voxsrc22_test.py rename to egs/voxceleb/v1.1/local/prepare_voxsrc22_test.py diff --git a/egs/voxceleb/v1/local/score_dcf.py b/egs/voxceleb/v1.1/local/score_dcf.py similarity index 100% rename from egs/voxceleb/v1/local/score_dcf.py rename to egs/voxceleb/v1.1/local/score_dcf.py diff --git a/egs/voxceleb/v1/local/score_voxceleb1.sh b/egs/voxceleb/v1.1/local/score_voxceleb1.sh similarity index 100% rename from egs/voxceleb/v1/local/score_voxceleb1.sh rename to egs/voxceleb/v1.1/local/score_voxceleb1.sh diff --git a/egs/voxceleb/v1/local/score_voxceleb1_o_clean.sh b/egs/voxceleb/v1.1/local/score_voxceleb1_o_clean.sh similarity index 100% rename from egs/voxceleb/v1/local/score_voxceleb1_o_clean.sh rename to egs/voxceleb/v1.1/local/score_voxceleb1_o_clean.sh diff --git a/egs/voxceleb/v1/local/score_voxceleb1_single_cond.sh b/egs/voxceleb/v1.1/local/score_voxceleb1_single_cond.sh similarity index 100% rename from egs/voxceleb/v1/local/score_voxceleb1_single_cond.sh rename to 
egs/voxceleb/v1.1/local/score_voxceleb1_single_cond.sh diff --git a/egs/voxceleb/v1/local/score_voxsrc22_dev.sh b/egs/voxceleb/v1.1/local/score_voxsrc22_dev.sh similarity index 100% rename from egs/voxceleb/v1/local/score_voxsrc22_dev.sh rename to egs/voxceleb/v1.1/local/score_voxsrc22_dev.sh diff --git a/egs/voxceleb/v1.1/run_002_compute_evad.sh b/egs/voxceleb/v1.1/run_002_compute_evad.sh index 7a2a9be5..27260be3 100755 --- a/egs/voxceleb/v1.1/run_002_compute_evad.sh +++ b/egs/voxceleb/v1.1/run_002_compute_evad.sh @@ -24,7 +24,6 @@ if [ $stage -le 1 ]; then dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage if [ "$nodes" == "b0" ];then utils/create_split_dir.pl \ - utils/create_split_dir.pl \ /export/b{04,05,06,07}/$dir_name $vaddir/storage elif [ "$nodes" == "b1" ];then utils/create_split_dir.pl \ @@ -41,7 +40,6 @@ if [ $stage -le 1 ]; then fi fi -#Train datasets if [ $stage -le 2 ];then if [ "$do_voxsrc22" == "true" ];then extra_data="voxsrc22_dev" diff --git a/egs/voxceleb/v1.1/run_011_train_xvector.sh b/egs/voxceleb/v1.1/run_011_train_xvector.sh index a051c136..c8ab552e 100755 --- a/egs/voxceleb/v1.1/run_011_train_xvector.sh +++ b/egs/voxceleb/v1.1/run_011_train_xvector.sh @@ -44,11 +44,11 @@ if [ $stage -le 1 ]; then --gpu $ngpu $nnet_s1_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_s1_dir \ @@ -67,11 +67,11 @@ if [ $stage -le 2 ]; then --gpu $ngpu $nnet_s2_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_xvector_from_wav.py $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --in-model-file $nnet_s1 \ diff --git a/egs/voxceleb/v1.1/run_030_extract_xvectors.sh b/egs/voxceleb/v1.1/run_030_extract_xvectors.sh index 8c0949f4..f933a7b2 100755 --- a/egs/voxceleb/v1.1/run_030_extract_xvectors.sh +++ b/egs/voxceleb/v1.1/run_030_extract_xvectors.sh @@ -8,7 +8,7 @@ set -e stage=1 -nnet_stage=1 +nnet_stage=2 config_file=default_config.sh use_gpu=false xvec_chunk_length=12800 @@ -85,4 +85,4 @@ if [ $stage -le 2 ]; then done fi -exit + diff --git a/egs/voxceleb/v1.1/run_040_eval_be.sh b/egs/voxceleb/v1.1/run_040_eval_be.sh index 0780584c..6bdbdf92 100755 --- a/egs/voxceleb/v1.1/run_040_eval_be.sh +++ b/egs/voxceleb/v1.1/run_040_eval_be.sh @@ -8,7 +8,7 @@ set -e stage=1 -nnet_stage=1 +nnet_stage=2 config_file=default_config.sh diff --git 
a/egs/voxceleb/v1.2/README.md b/egs/voxceleb/v1.2/README.md index 1ee9468f..6e8ba07a 100644 --- a/egs/voxceleb/v1.2/README.md +++ b/egs/voxceleb/v1.2/README.md @@ -1,4 +1,4 @@ -# VoxCeleb V1.1 +# VoxCeleb V1.2 Recipe for the VoxCeleb Speaker Verification Task @@ -9,7 +9,7 @@ In recipe version V1: - Augmentation is performed using Kaldi scripts and wav-reverbate tool - Babble noise is created on-the-fly when computing features by mixing 3-7 single speaker files. -In this recipe: +In V1.1: - We compute speech augmentations and acoustic features are computed always on-the-fly, we don't dump any features to disk. - Augmentation is performed using Hyperin SpeechAugment class. @@ -18,6 +18,11 @@ In this recipe: which mimics the proportions of noise and RIR types, and SNRs used in the V1 or the recipe. - Babble noise is created offline by mixing 3-10 single speaker files. +In V1.2: + - The feature extractor is embedded into the PyTorch model in classes derived from the Wav2XVector base class. + - The Kaldi format is replaced by a new format based on pandas tables + - Kaldi-style bash scripts are removed and replaced by Python scripts + - Most Python scripts are called using Hyperion entry points ## Citing @@ -30,13 +35,11 @@ In this recipe: ## Test data - Test data is VoxCeleb 1 - - We evaluate 6 conditions: + - We evaluate 3 conditions (with the cleaned-up lists): - VoxCeleb-O (Original): Original Voxceleb test set with 40 speakers - - Voxceleb-O-cleaned: VoxCeleb-O cleaned-up of some errors - VoxCeleb-E (Entire): List using all utterances of VoxCeleb1 - - Voxceleb-E-cleaned: VoxCeleb-E cleaned-up of some errors - VoxCeleb-H (Hard): List of hard trials between all utterances of VoxCeleb1, same gender and nationality trials. - - Voxceleb-H-cleaned: VoxCeleb-H cleaned-up of some errors + ## Usage @@ -44,9 +47,9 @@ In this recipe: - By default it will use Light ResNet (16 base channels) - For better performance use full ResNet (64 base channels) using `config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh` file as ```bash -run_011_train_xvector.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh -run_030_extract_xvectors.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh --use-gpu true -run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh +run_005_train_xvector.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh +run_006_extract_xvectors.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh --use-gpu true +run_007_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh ``` - To train with mixed precision training use config file `config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh` @@ -66,25 +69,26 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr - Creates Babble noise from MUSAN speech to be used by SpeechAugment class. - Prepares RIRs by compacting then into HDF5 files, to be used by SpeechAugment class. - - `run_010_prepare_xvec_train_data.sh` + - `run_004_prepare_xvec_train_data.sh` - Transforms all the audios that we are going to use to train the x-vector into a common format, e.g., .flac. - Removes silence from the audios - Removes utterances shorter than 4secs and speakers with less than 8 utterances. 
- Creates training and validation lists for x-vector training - - `run_011_train_xvector.sh` + - `run_005_train_xvector.sh` - Trains the x-vector network - - `run_030_extract_xvectors.sh` + - `run_006_extract_xvectors.sh` - Extracts x-vectors for VoxCeleb2 or VoxCeleb2+augmentation for PLDA training - Exctracts x-vectors for VoxCeleb1 test sets - - `run_040_eval_be.sh` + - `run_007_eval_be.sh` - Trains PLDA and evals PLDA and cosine scoring back-ends ## Results + ### VoxCeleb 1 Original-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | @@ -95,9 +99,28 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.68 | 0.052 | 0.088 | | | | | Cosine + AS-Norm | 0.63 | 0.049 | 0.083 | | | | | Cosine + QMF | 0.57 | 0.037 | 0.071 | -| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | || | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.77 | 0.048 | 0.071 | +| | | | Cosine + AS-Norm | 0.70 | 0.039 | 0.048 | +| | | | Cosine + QMF | 0.62 | 0.034 | 0.042 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.76 | 0.048 | 0.071 | +| | | | Cosine + AS-Norm | 0.70 | 0.041 | 0.061 | +| | | | Cosine + QMF | 0.62 | 0.037 | 0.056 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.77 | 0.48 | 0.077 | +| | | | Cosine + AS-Norm | 0.68 | 0.040 | 0.062| +| | | | Cosine + QMF | 0.62 | 0.036 | 0.063 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.78 | 0.053 | 0.082 | +| | | | Cosine + AS-Norm | 0.70 | 0.043 | 0.076 | +| | | | Cosine + QMF | 0.63 | 0.042 | 0.071 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.78 | 0.051 | 0.095 | +| | | | Cosine + AS-Norm | 0.72 | 0.046 | 0.070 | +| | | | Cosine + QMF | 0.67 | 0.039 | 0.074 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.56 | 0.040 | 0.065 | +| | | | Cosine + AS-Norm | 0.52 | 0.033 | 0.045 | +| | | | Cosine + QMF | 0.45 | 0.027 | 0.043 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.60 | 0.043 | 0.071 | +| | | | Cosine + AS-Norm | 0.53 | 0.034 | 0.063 | +| | | | Cosine + QMF | 0.49 | 0.033 | 0.054 | + ### VoxCeleb 1 Entire-Clean trial list @@ -109,9 +132,27 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.85 | 0.055 | 0.100 | | | | | Cosine + AS-Norm | 0.80 | 0.050 | 0.087 | | | | | Cosine + QMF | 0.76 | 0.047 | 0.083 | -| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.86 | 0.054 | 0.098 | +| | | | Cosine + AS-Norm | 0.81 | 0.049 | 0.087 | +| | | | Cosine + QMF | 0.77 | 0.046 | 0.082 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.89 | 0.058 | 0.098 | +| | | | Cosine + AS-Norm | 0.84 | 0.053 | 0.087| +| | | | Cosine + QMF | 0.80 | 0.050 | 0.081 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.83 | 0.053 | 0.098 | +| | | | Cosine + AS-Norm | 0.78 | 0.047| 0.085 | +| | | | Cosine + QMF | 0.74 | 0.045 | 0.081 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.91 | 0.057 | 0.100 | +| | | | Cosine + AS-Norm | 0.85 | 0.052 | 0.089 | +| | | | Cosine + QMF | 0.81 | 0.049 | 0.085 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.94 | 0.059 | 0.105 | +| | | | Cosine + AS-Norm | 0.88 | 0.053 | 0.093 | +| | | | Cosine + QMF | 0.84 | 0.051 | 0.088 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.71 | 0.044 | 0.076| +| | | | Cosine + AS-Norm | 0.66 | 0.040 | 0.069 | +| | | | Cosine + QMF | 0.63 | 0.037 | 0.067 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.75 | 0.047 | 0.077 | +| | | | Cosine + AS-Norm | 0.70 | 0.042 | 0.072 | +| | | | Cosine + QMF | 0.68 | 0.040 | 0.069 | ### VoxCeleb 1 Hard-Clean trial list @@ -123,9 +164,28 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 1.66 | 0.103 | 0.168 | | | | | Cosine + AS-Norm | 1.53 | 0.091 | 0.151 | | | | | Cosine + QMF | 1.44 | 0.087 | 0.145 | -| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.62 | 0.098 | 0.164 | +| | | | Cosine + AS-Norm | 1.45 | 0.085 | 0.142 | +| | | | Cosine + QMF | 1.36 | 0.082 | 0.137 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.70 | 0.1 | 0.165 | +| | | | Cosine + AS-Norm | 1.50 | 0.086 | 0.138 | +| | | | Cosine + QMF | 1.44 | 0.085 | 0.139 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.59 | 0.096 | 0.165 | +| | | | Cosine + AS-Norm | 1.41 | 0.083 | 0.143 | +| | | | Cosine + QMF | 1.34 | 0.079 | 0.136 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.75 | 0.104 | 0.171 | +| | | | Cosine + AS-Norm | 1.56 | 0.091 | 0.152 | +| | | | Cosine + QMF | 1.50 | 0.087 | 0.145 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.76 | 0.104 | 0.174 | +| | | | Cosine + AS-Norm | 1.58 | 0.092 | 0.152 | +| | | | Cosine + QMF | 1.51 | 0.089 | 0.149 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.30 | 0.076 | 0.125 | +| | | | Cosine + AS-Norm | 1.15 | 0.066 | 0.109 | +| | | | Cosine + QMF | 1.11 | 0.065 | 0.103 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.41 | 0.081 | 0.132 | +| | | | Cosine + AS-Norm | 1.28 | 0.071 | 0.116 | +| | | | Cosine + QMF | 1.21 | 0.069 | 0.113 | + ### VoxSRC2022 dev @@ -137,127 +197,24 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 2.33 | 0.156 | 0.260 | | | | | Cosine + AS-Norm | 2.19 | 0.144 | 0.263 | | | | | Cosine + QMF | 2.06 | 0.137 | 0.251 | -| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | || | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | - -## Results before 2023 - -### VoxCeleb 1 Original-Clean trial list - -| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | -| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.00 | 0.129 | 0.216 | -| | | | Cosine | 2.04 | 0.138 | 0.210 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.35 | 0.091 | 0.159 | -| | | | Cosine | 1.22 | 0.082 | 0.129 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 1.19 | 0.074 | 0.124 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 1.30 | 0.090 | 0.160 | -| | | | Cosine | 1.44 | 0.100 | 0.173 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.091 | 0.143 | -| | | | Cosine | 1.17 | 0.081 | 0.110 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 1.37 | 0.104 | 0.179 | -| | | | Cosine | 1.31 | 0.080 | 0.139 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 1.29 | 0.088 | 0.129 | -| | | | Cosine | 1.23 | 0.083 | 0.136 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 1.20 | 0.095 | 0.156 | -| | | | Cosine | 1.29 | 0.089 | 0.146 | -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.20 | 0.084 | 0.136 | -| | | | Cosine | 1.18 | 0.078 | 0.115 | -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.11 | 0.084 | 0.145 | -| | | | Cosine | 1.12 | 0.073 | 0.131 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 1.53 | 0.104 | 0.189 | -| | | | Cosine | 1.31 | 0.084 | 0.132 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 0.98 | 0.066 | 0.116 | -| | | | Cosine | 1.12 | 0.071 | 0.103 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 1.05 | 0.077 | 0.123 | -| | | | Cosine | 0.96 | 0.065 | 0.110 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 1.04 | 0.071 | 0.118 | -| | | | Cosine | 0.93 | 0.067 | 0.108 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1_swa.sh | Res2Net50 width=26x8 | + SWA | PLDA | 0.90 | 0.067 | 0.118 | -| | | | Cosine | 0.85 | 0.060 | 0.094 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 1.44 | 0.102 | 0.169 | -| | | | Cosine | 1.29 | 0.084 | 0.140 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 1.12 | 0.071 | 0.116 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.05 | 0.074 | 0.116 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.09 | 0.081 | 0.150 | - - -### VoxCeleb 1 Entire-Clean trial list - -| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | -| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.86 | 0.124 | 0.210 | -| | | | Cosine | 1.93 | 0.122 | 0.201 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.43 | 0.091 | 0.159 | -| | | | Cosine | 1.24 | 0.080 | 0.136 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 1.19 | 0.077 | 0.132 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 1.27 | 0.084 | 0.150 | -| | | | Cosine | 1.30 | 0.082 | 0.150 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.30 | 0.083 | 0.146 | -| | | | Cosine | 1.09 | 0.071 | 0.124 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 1.45 | 0.097 | 0.165 | -| | | | Cosine | 1.15 | 0.076 | 0.132 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 1.47 | 0.094 | 0.165 | -| | | | Cosine | 1.27 | 0.082 | 0.148 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 1.31 | 0.086 | 0.149 | -| | | | Cosine | 1.22 | 0.079 | 0.134 | -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.27 | 0.082 | 0.145 | -| | | | Cosine | 1.16 | 0.074 | 0.130 | -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.077 | 0.136 | -| | | | Cosine | 1.11 | 0.071 | 0.125 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 1.46 | 0.097 | 0.173 | -| | | | Cosine | 1.24 | 0.080 | 0.140 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 1.11 | 0.071 | 0.127 | -| | | | Cosine | 1.05 | 0.067 | 0.117 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.078 | 0.134 | -| | | | Cosine | 1.05 | 0.069 | 0.121 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 1.18 | 0.075 | 0.131 | -| | | | Cosine | 0.98 | 0.063 | 0.110 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | Res2Net50 width=26x8 | + SWA | PLDA | 1.17 | 0.072 | 0.123 | -| | | | Cosine | 0.94 | 0.061 | 0.107 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 1.56 | 0.095 | 0.166 | -| | | | Cosine | 1.27 | 0.079 | 0.142 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 1.19 | 0.077 | 0.137 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.12 | 0.073 | 0.129 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | TSE-Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.05 | 0.068 | 0.120 | - - -### VoxCeleb 1 Hard-Clean trial list - -| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | -| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 3.29 | 0.195 | 0.318 | -| | | | Cosine | 3.27 | 0.188 | 0.303 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.66 | 0.160 | 0.258 | -| | | | Cosine | 2.32 | 0.139 | 0.232 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 2.19 | 0.133 | 0.215 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 2.33 | 0.139 | 0.227 | -| | | | Cosine | 2.33 | 0.142 | 0.235 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.46 | 0.142 | 0.237 | -| | | | Cosine | 2.14 | 0.126 | 0.203 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 2.57 | 0.153 | 0.255 | -| | | | Cosine | 2.11 | 0.127 | 0.205 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 2.64 | 0.157 | 0.244 | -| | | | Cosine | 2.33 | 0.141 | 0.232 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 2.42 | 0.144 | 0.245 | -| | | | Cosine | 2.26 | 0.133 | 0.224 -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 2.39 | 0.141 | 0.235 | -| | | | Cosine | 2.17 | 0.128 | 0.215 -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 2.28 | 0.131 | 0.225 | -| | | | Cosine | 2.11 | 0.124 | 0.204 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 2.77 | 0.172 | 0.271 | -| | | | Cosine | 2.45 | 0.141 | 0.225 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 2.07 | 0.124 | 0.201 | -| | | | Cosine | 1.95 | 0.113 | 0.181 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 2.34 | 0.136 | 0.230 | -| | | | Cosine | 1.99 | 0.119 | 0.196 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 2.18 | 0.127 | 0.211 | -| | | | Cosine | 1.89 | 0.112 | 0.184 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1_swa.sh | Res2Net50 width=26x8 | + SWA | PLDA | 2.14 | 0.125 | 0.209 | -| | | | Cosine | 1.84 | 0.110 | 0.186 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 2.78 | 0.156 | 0.252 | -| | | | Cosine | 2.26 | 0.134 | 0.214 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 2.24 | 0.134 | 0.221 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 2.20 | 0.132 | 0.219 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 2.02 | 0.123 | 0.203 | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.19 | 0.142 | 0.242 | +| | | | Cosine + AS-Norm | 2.00 | 0.133 | 0.254 | +| | | | Cosine + QMF | 1.86 | 0.126 | 0.229 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.34 | 0.145 | 0.246 | +| | | | Cosine + AS-Norm | 2.10 | 0.135 | 0.248 | +| | | | Cosine + QMF | 2.01 | 0.127 | 0.218 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.25 | 0.136 | 0.239 | +| | | | Cosine + AS-Norm | 1.99 | 0.127 | 0.232 | +| | | | Cosine + QMF | 1.87 | 0.119 | 0.216 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.36 | 0.153 | 0.259 | +| | | | Cosine + AS-Norm | 2.18 | 0.139 | 0.249 | +| | | | Cosine + QMF | 2.08 | 0.128 | 0.222 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.49 | 0.158 | 0.265 | +| | | | Cosine + AS-Norm | 2.29 | 0.145 | 0.251 | +| | | | Cosine + QMF | 2.17 | 0.133 | 0.230 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.92 | 0.124 | 0.208 | +| | | | Cosine + AS-Norm | 1.71 | 0.109 | 0.212 | +| | | | Cosine + QMF | 1.62 | 0.103 | 0.192 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.96 | 0.124 | 0.211 | +| | | | Cosine + AS-Norm | 1.79 | 0.118 | 0239 | +| | | | Cosine + QMF | 1.68 | 0.114 | 0.216 | diff --git a/egs/voxceleb/v1.2/conf/reverb_noise_aug.yaml b/egs/voxceleb/v1.2/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..86f55073 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/reverb_noise_aug.yaml @@ -0,0 +1,34 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: csv:data/rirs_smallroom/rirs.csv + rir_norm: max + mediumroom: + weight: 1 + rir_path: csv:data/rirs_mediumroom/rirs.csv + rir_norm: max + realroom: + weight: 1 + rir_path: csv:data/rirs_real/rirs.csv + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/recordings.csv + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/recordings.csv + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/recordings.csv + min_snr: 3 + max_snr: 18 diff --git a/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..f4306e2e --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: cfwseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 32 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..0923a608 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + 
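+      # the next two keys configure the class-weighted sampler: data-prior weights
+      # speakers by how much data they have, and num_hard_prototypes groups each
+      # speaker with its most confusable prototypes in a batch (assumed semantics)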
seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..b5458f9d --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: seresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 32 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 25 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..0923a608 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + 
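+      # min and max chunk lengths match, so the sampler draws fixed 4 s chunks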
min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml index 1633f4a2..2cf31713 100644 --- a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml @@ -29,48 +29,50 @@ data: min_chunk_length: 2.0 data_loader: num_workers: 8 -feats: fbank80_specaug1_stmn_16k.yaml -model: - resnet_enc: - in_feats: 80 - in_conv_channels: 2048 - in_kernel_size: 5 - in_stride: 1 - resb_type: seres2bn - resb_repeats: - - 1 - - 1 - - 1 - - 1 - resb_channels: - - 2048 - resb_kernel_sizes: - - 3 - resb_dilations: - - 2 - - 3 - - 4 - - 5 - resb_strides: - - 1 - res2net_width_factor: 1 - res2net_scale: 8 - se_r: 4 - multilayer: true - multilayer_concat: true - endpoint_channels: 4096 - norm_before: false + +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + norm_before: false + dropout_rate: 0.2 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 dropout_rate: 0.2 - hid_act: swish - pool_net: - pool_type: ch-wise-att-mean+stddev - inner_feats: 128 - embed_dim: 192 - cos_scale: 30.0 - margin: 0.2 - margin_warmup_epochs: 5.0 - dropout_rate: 0.2 - norm_before: false + norm_before: false trainer: optim: opt_type: adam diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml index 877736b3..21f0db8b 100644 --- a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml @@ -37,15 +37,15 @@ data: num_hard_prototypes: 8 data_loader: num_workers: 8 -feats: fbank80_stmn_16k.yaml model: - cos_scale: 30.0 - margin: 0.3 - margin_warmup_epochs: 0 - intertop_margin: 0.1 - resnet_enc: - override_dropouts: true - dropout_rate: 0.25 + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + resnet_enc: + override_dropouts: true + dropout_rate: 0.25 trainer: optim: opt_type: sgd diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml index f15d453d..03a7f736 100644 --- a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml +++ 
b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: seg_chunk_sampler min_batch_size: 64 @@ -17,11 +17,11 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: seg_chunk_sampler min_batch_size: 64 @@ -29,47 +29,48 @@ data: min_chunk_length: 2.0 data_loader: num_workers: 8 -feats: fbank80_specaug1_stmn_16k.yaml -model: - resnet_enc: - in_feats: 80 - in_conv_channels: 512 - in_kernel_size: 5 - in_stride: 1 - resb_type: seres2bn - resb_repeats: - - 1 - - 1 - - 1 - resb_channels: - - 512 - resb_kernel_sizes: - - 3 - resb_dilations: - - 2 - - 3 - - 4 - resb_strides: - - 1 - res2net_width_factor: 1 - res2net_scale: 8 - se_r: 4 - multilayer: true - multilayer_concat: true - endpoint_channels: 1536 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + dropout_rate: 0.002 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.0 norm_before: false - dropout_rate: 0.002 hid_act: swish - pool_net: - pool_type: ch-wise-att-mean+stddev - inner_feats: 128 - embed_dim: 192 - cos_scale: 30.0 - margin: 0.2 - margin_warmup_epochs: 5.0 - dropout_rate: 0.0 - norm_before: false - hid_act: swish trainer: optim: opt_type: adam @@ -91,3 +92,5 @@ trainer: log_interval: 1000 epochs: 40 eff_batch_size: 256 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml index 45e55d97..9788bb7c 100644 --- a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml @@ -2,18 +2,18 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 64 max_chunk_length: 6.0 min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: @@ -21,31 +21,31 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 64 max_chunk_length: 6.0 min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: num_workers: 8 -feats: fbank80_stmn_16k.yaml model: - cos_scale: 30.0 - margin: 0.3 - margin_warmup_epochs: 0 - intertop_margin: 0.1 - resnet_enc: - override_dropouts: true - dropout_rate: 0. 
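+  # stage-2 fine-tuning only overrides the margins and dropouts below; the rest of
+  # the model is loaded from the stage-1 checkpoint passed via --in-model-file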
+ xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + resnet_enc: + override_dropouts: true + dropout_rate: 0. trainer: optim: opt_type: sgd @@ -67,3 +67,5 @@ trainer: swa_start: 31 swa_lr: 1e-4 swa_anneal_epochs: 2 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..01b2cc50 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: fwseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 4 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..0923a608 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git 
a/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..74553395 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: fwseidrndresnet100 + in_channels: 1 + in_feats: 80 + conv_channels: 128 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.05 + se_r: 4 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 30 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..11d33ae2 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..6659b2f6 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,71 @@ +data: + train: + 
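+    # stage 1 trains on 2 s chunks with on-the-fly reverb/noise augmentation
+    # (see aug_cfgs and the chunk-length settings below)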
dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: resnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..0923a608 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..58d22733 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + 
return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: tseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 256 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 25 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..0923a608 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/conf/vad_16k.yaml b/egs/voxceleb/v1.2/conf/vad_16k.yaml index 5fb0111c..e5a6bb82 100644 --- a/egs/voxceleb/v1.2/conf/vad_16k.yaml +++ b/egs/voxceleb/v1.2/conf/vad_16k.yaml @@ -2,7 +2,8 @@ sample_frequency: 16000 frame_shift: 10 frame_length: 25 snip_edges: false -vad_energy_threshold: 5.5 +vad_energy_threshold: -4.89 vad_energy_mean_scale: 0.5 vad_proportion_threshold: 0.12 vad_frames_context: 2 +wav_scale: 1 diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh new file mode 100644 index 00000000..56d18bd0 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh @@ -0,0 +1,44 @@ +# Channel-freq-wise-SE-ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg 
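+# nnet_s1_* variables point to the stage-1 (train-from-scratch) config and checkpoint,
+# nnet_s2_* to the stage-2 large-margin fine-tuning ones (yaml configs defined above)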
+nnet_type=resnet +nnet_name=${feat_type}_cfwseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh new file mode 100644 index 00000000..68849f78 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh @@ -0,0 +1,45 @@ +# Channel-wise ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_cwseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_cwseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0025.pth + + +nnet_s2_base_cfg=conf/train_cwseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh new file mode 100644 index 00000000..f962c2b3 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh @@ -0,0 +1,44 @@ +# Freq-wise-SE ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_fwseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_fwseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_fwseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=false #true +do_qmf=false #true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh new file mode 100644 index 00000000..6ea334b4 --- 
/dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh @@ -0,0 +1,44 @@ +# IdRnd ResNet100 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_idrnd_resnet100.v3.0 + +nnet_s1_base_cfg=conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0029.pth + +nnet_s2_base_cfg=conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.0.sh new file mode 100644 index 00000000..bb5d990c --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.0.sh @@ -0,0 +1,44 @@ +# ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_resnet34.v3.0 + +nnet_s1_base_cfg=conf/train_resnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name.kk2 +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_resnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh new file mode 100644 index 00000000..2528d13f --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh @@ -0,0 +1,44 @@ +# TSE-ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_tseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_tseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0025.pth + +nnet_s2_base_cfg=conf/train_tseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=false #true +do_qmf=false #true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi 
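+# back-end PLDA settings: splda with a 200-dim LDA projection; y/z dims are
+# the speaker and channel subspace sizes (assumed convention)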
+plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/hyp_utils b/egs/voxceleb/v1.2/hyp_utils new file mode 120000 index 00000000..f6d1eb7a --- /dev/null +++ b/egs/voxceleb/v1.2/hyp_utils @@ -0,0 +1 @@ +../../../hyp_utils \ No newline at end of file diff --git a/egs/voxceleb/v1.2/run_001_prepare_data.sh b/egs/voxceleb/v1.2/run_001_prepare_data.sh index 831eb1bc..563d3c2d 100755 --- a/egs/voxceleb/v1.2/run_001_prepare_data.sh +++ b/egs/voxceleb/v1.2/run_001_prepare_data.sh @@ -16,35 +16,31 @@ config_file=default_config.sh if [ $stage -le 1 ];then # Prepare the VoxCeleb2 dataset for training. - hyp_utils/conda_env.sh \ - prepare_data.py voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ - --cat-videos --use-kaldi-ids \ - --output-dir data/voxceleb2cat_train - #local/make_voxceleb2cat.pl $voxceleb2_root dev 16 data/voxceleb2cat_train + hyperion-prepare-data voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ + --cat-videos --use-kaldi-ids \ + --output-dir data/voxceleb2cat_train fi -exit + if [ $stage -le 2 ];then # prepare voxceleb1 for test - # This script is for the old version of the dataset - # local/make_voxceleb1_oeh.pl $voxceleb1_root data - # Use this for the newer version of voxceleb1: - local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data + hyperion-prepare-data voxceleb1 --task test --corpus-dir $voxceleb1_root \ + --use-kaldi-ids \ + --output-dir data/voxceleb1_test fi if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then - local/prepare_voxsrc22_dev.py \ - --vox1-corpus-dir $voxceleb1_root \ - --voxsrc22-corpus-dir $voxsrc22_root \ - --output-dir data/voxsrc22_dev + hyperion-prepare-data voxsrc22 --subset dev --corpus-dir $voxsrc22_root \ + --vox1-corpus-dir $voxceleb1_root \ + --output-dir data/voxsrc22_dev fi # if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then -# local/prepare_voxsrc22_test.py \ -# --corpus-dir $voxsrc22_root \ -# --output-dir data/voxsrc22_test + # hyperion-prepare-data voxsrc22 --subset test --corpus-dir $voxsrc22_root \ + # --vox1-corpus-dir $voxceleb1_root \ + # --output-dir data/voxsrc22_test # fi if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then - # # split vox2 into 2 parts, for cohort and qmf training - local/make_vox2_trials.py --data-dir data/voxceleb2cat_train + # split vox2 into 2 parts, for cohort and qmf training + hyperion-split-dataset-into-trials-and-cohort --data-dir data/voxceleb2cat_train fi diff --git a/egs/voxceleb/v1.2/run_002_compute_evad.sh b/egs/voxceleb/v1.2/run_002_compute_evad.sh new file mode 100755 index 00000000..acccace3 --- /dev/null +++ b/egs/voxceleb/v1.2/run_002_compute_evad.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e +nodes=fs01 +vad_dir=`pwd`/exp/vad_e +vad_config=conf/vad_16k.yaml +nj=40 + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. 
$config_file + +if [ -z "$vad_config" ];then + echo "We are not using VAD in this configuration" + exit 0 +fi + +if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" +fi + + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + # This only does something at CLSP grid + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + hyp_utils/create_data_split_dirs.sh \ + $vad_dir/$name \ + $USER/hyp-data/voxceleb/v1.2/vad $nodes + done +fi + +#Train datasets +if [ $stage -le 2 ];then + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + # This creates links to distribute data in CLSP grid + # If you are not at CLSP grid, it does nothing and can be deleted + hyp_utils/create_data_split_links.sh $vad_dir/$name/vad.JOB.ark $nj + echo "compute vad for $name" + $train_cmd JOB=1:$nj $vad_dir/$name/log/vad.JOB.log \ + hyp_utils/conda_env.sh \ + hyperion-compute-energy-vad --cfg $vad_config \ + --recordings-file data/$name/recordings.csv \ + --output-spec ark,csv:$vad_dir/$name/vad.JOB.ark,$vad_dir/$name/vad.JOB.csv \ + --part-idx JOB --num-parts $nj || exit 1 + + hyperion-tables cat \ + --table-type features \ + --output-file $vad_dir/$name/vad.csv --num-tables $nj + hyperion-dataset add_features \ + --dataset data/$name \ + --features-name vad \ + --features-file $vad_dir/$name/vad.csv + done +fi + + diff --git a/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh b/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..73c7ed82 --- /dev/null +++ b/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh @@ -0,0 +1,102 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nj=10 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +# We prepare the noise files and RIR for online speech augmentation +if [ $stage -le 1 ]; then + for name in noise music speech + do + hyperion-prepare-data musan \ + --corpus-dir $musan_root \ + --subset $name \ + --output-dir data/musan_$name + done +fi + +if [ $stage -le 2 ]; then + # # Prepare to distribute data over multiple machines + # # This only does something at CLSP grid + # hyp_utils/create_data_split_dirs.sh $vad_dir $USER/hyp-data/voxceleb/v1.2/vad $nodes + + for name in musan_noise musan_music + do + input_data_dir=data/$name + output_data_dir=data/${name}_proc_audio + output_dir=exp/proc_audio/$name + $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${name}.JOB.log \ + hyp_utils/conda_env.sh \ + hyperion-preprocess-audio-files \ + --audio-format flac \ + --part-idx JOB --num-parts $nj \ + --recordings-file $input_data_dir/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_dir/recordings.JOB.csv + + hyperion-tables cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj + hyperion-dataset set_recordings \ + --dataset $input_data_dir \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset $output_data_dir + + + done +fi + +if [ $stage -le 3 ]; then + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + input_data_dir=data/$name + output_data_dir=data/${name}_babble + output_dir=exp/proc_audio/${name}_babble + $train_cmd $output_dir/log/make_babble_noise_${name}.log \ + hyp_utils/conda_env.sh \ + hyperion-make-babble-noise-audio-files \ + --audio-format flac \ + --min-spks 3 --max-spks 10 --num-reuses 5 \ + --recordings-file $input_data_dir/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_data_dir/recordings.csv + hyperion-dataset make_from_recordings \ + --dataset $output_data_dir \ + --recordings-file $output_data_dir/recordings.csv + done +fi + +if [ $stage -le 4 ]; then + if [ ! -d "RIRS_NOISES" ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/smallroom --output-dir data/rirs_smallroom + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/mediumroom --output-dir data/rirs_mediumroom + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/real_rirs_isotropic_noises --output-dir data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + output_dir=exp/rirs/$rirs + data_dir=data/$rirs + $train_cmd $output_dir/log/pack_rirs_${name}.log \ + hyp_utils/conda_env.sh \ + hyperion-pack-wav-rirs ${args} --input $data_dir/recordings.csv \ + --output h5,csv:$output_dir/rirs.h5,$output_dir/rirs.csv || exit 1; + hyperion-dataset add_features --dataset $data_dir \ + --features-name rirs --features-file $output_dir/rirs.csv + + done +fi + diff --git a/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh b/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh new file mode 100755 index 00000000..4e0c5b19 --- /dev/null +++ b/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +nodes=b1 +nj=40 +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. 
$config_file + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + # This only does something at CLSP grid + hyp_utils/create_data_split_dirs.sh \ + exp/xvector_audios/$nnet_data \ + $USER/hyp-data/voxceleb/v1.2/xvector_audios/$nnet_data $nodes +fi + +if [ $stage -le 2 ];then + output_dir=exp/proc_audio/$nnet_data + # This creates links to distribute data in CLSP grid + # If you are not at CLSP grid, it does nothing and can be deleted + hyp_utils/create_audios_split_links.sh $output_dir data/$nnet_data/recordings.csv flac + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$nnet_data/vad.csv" + update_durs="--update-seg-durs" + fi + + $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${nnet_data}.JOB.log \ + hyp_utils/conda_env.sh \ + hyperion-preprocess-audio-files \ + --audio-format flac --remove-dc-offset $vad_args \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$nnet_data/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_dir/recordings.JOB.csv + + hyperion-tables cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj + + hyperion-dataset set_recordings $update_durs \ + --dataset data/$nnet_data \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset data/${nnet_data}_proc_audio \ + --remove-features vad +fi + +if [ $stage -le 3 ];then + hyperion-dataset remove_short_segments \ + --dataset data/${nnet_data}_proc_audio \ + --output-dataset data/${nnet_data}_filtered \ + --length-name duration --min-length 2.0 + + hyperion-dataset remove_classes_few_segments \ + --dataset data/${nnet_data}_filtered \ + --class-name speaker --min-segs 4 +fi + +if [ $stage -le 4 ];then + hyperion-dataset split_train_val \ + --dataset data/${nnet_data}_filtered \ + --val-prob 0.03 \ + --joint-classes speaker --min-train-samples 1 \ + --seed 1123581321 \ + --train-dataset data/${nnet_data}_xvector_train \ + --val-dataset data/${nnet_data}_xvector_val +fi + diff --git a/egs/voxceleb/v1.2/run_005_train_xvector.sh b/egs/voxceleb/v1.2/run_005_train_xvector.sh new file mode 100755 index 00000000..2479d565 --- /dev/null +++ b/egs/voxceleb/v1.2/run_005_train_xvector.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +train_data_dir=data/${nnet_data}_xvector_train +val_data_dir=data/${nnet_data}_xvector_val + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v1.1 --trainer.wandb.name $nnet_name.$(date -Iminutes)" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyperion-train-wav2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --trainer.exp-path $nnet_s1_dir \ + --num-gpus $ngpu \ + +fi + + +# Large Margin Fine-tuning +if [ $stage -le 2 ]; then + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyperion-finetune-wav2xvector $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --in-model-file $nnet_s1 \ + --trainer.exp-path $nnet_s2_dir \ + --num-gpus $ngpu \ + +fi diff --git a/egs/voxceleb/v1.2/run_006_extract_xvectors.sh b/egs/voxceleb/v1.2/run_006_extract_xvectors.sh new file mode 100755 index 00000000..0dc58048 --- /dev/null +++ b/egs/voxceleb/v1.2/run_006_extract_xvectors.sh @@ -0,0 +1,103 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh +use_gpu=false +xvec_chunk_length=120.0 +. parse_options.sh || exit 1; +. 
$config_file + +if [ "$use_gpu" == "true" ];then + xvec_args="--use-gpu --chunk-length $xvec_chunk_length" + xvec_cmd="$cuda_eval_cmd --gpu 1 --mem 6G" + num_gpus=1 +else + xvec_cmd="$train_cmd --mem 12G" + num_gpus=0 +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + +xvector_dir=exp/xvectors/$nnet_name + +if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qmf" == "true" || "$do_pca" == "true") ]]; then + # Extract xvectors for training LDA/PLDA + nj=100 + for name in voxceleb2cat_train + do + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$name/vad.csv" + fi + output_dir=$xvector_dir/$name + echo "Extracting x-vectors for $name" + $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$name/recordings.csv \ + --random-utt-length --min-utt-length 2 --max-utt-length 30 \ + --model-path $nnet \ + --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv + hyperion-tables cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj + + done +fi + +if [ $stage -le 2 ]; then + # Extracts x-vectors for evaluation + nj=100 + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb1_test $extra_data + do + num_segs=$(wc -l data/$name/segments.csv | awk '{ print $1-1}') + nj=$(($num_segs < 100 ? $num_segs:100)) + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$name/vad.csv" + fi + output_dir=$xvector_dir/$name + echo "Extracting x-vectors for $name" + $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$name/recordings.csv \ + --model-path $nnet \ + --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv + hyperion-tables cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj + + done +fi + + diff --git a/egs/voxceleb/v1.2/run_007_eval_be.sh b/egs/voxceleb/v1.2/run_007_eval_be.sh new file mode 100755 index 00000000..53621488 --- /dev/null +++ b/egs/voxceleb/v1.2/run_007_eval_be.sh @@ -0,0 +1,321 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + +plda_label=${plda_type}y${plda_y_dim}_v1 +be_name=lda${lda_dim}_${plda_label}_${plda_data} + +xvector_dir=exp/xvectors/$nnet_name +be_dir=exp/be/$nnet_name/$be_name +score_dir=exp/scores/$nnet_name +score_plda_dir=$score_dir/${be_name}/plda +score_cosine_dir=$score_dir/cosine +score_cosine_snorm_dir=$score_dir/cosine_snorm +score_cosine_qmf_dir=$score_dir/cosine_qmf + +if [ $stage -le 3 ];then + + echo "Eval Voxceleb 1 with Cosine scoring" + num_parts=8 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend \ + --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_cosine_dir/voxceleb1_scores.csv \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + done + wait + hyperion-merge-scores --output-file $score_cosine_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_dir/voxceleb1_scores.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_cosine_dir/voxceleb1_results.csv + + cat $score_cosine_dir/voxceleb1_results.csv +fi + +if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then + echo "Eval voxsrc2 with Cosine scoring" + $train_cmd $score_cosine_dir/log/voxsrc22_dev.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend \ + --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ + --ndx-file data/voxsrc22_dev/trials.csv \ + --enroll-map-file data/voxsrc22_dev/enrollment.csv \ + --score-file $score_cosine_dir/voxsrc22_dev_scores.csv + + # $train_cmd $score_cosine_dir/log/voxsrc22_eval.log \ + # hyp_utils/conda_env.sh \ + # hyperion-eval-cosine-scoring-backend \ + # --feats-file csv:$xvector_dir/voxsrc22_eval/xvector.csv \ + # --ndx-file data/voxsrc22_eval/trials.csv \ + # --enroll-map-file data/voxsrc22_eval/enrollment.csv \ + # --score-file $score_cosine_dir/voxsrc22_eval_scores.csv + + $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxsrc22_dev.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_dir/voxsrc22_dev_scores.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file $score_cosine_dir/voxsrc22_dev_results.csv + + cat $score_cosine_dir/voxsrc22_dev_results.csv + +fi + +if [ "$do_snorm" == "true" ];then + if [ $stage -le 5 ];then + echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd --mem 22G $score_cosine_snorm_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + 
hyperion-eval-cosine-scoring-backend \ + --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + sleep 5s + done + wait + hyperion-merge-scores --output-file $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxceleb1.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_cosine_snorm_dir/voxceleb1_results.csv + + cat $score_cosine_snorm_dir/voxceleb1_results.csv + fi + + if [ $stage -le 6 ] && [ "$do_voxsrc22" == "true" ];then + echo "Eval voxsrc2 with Cosine scoring + AS-Norm" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_snorm_dir/log/voxsrc22_dev_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend \ + --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ + --ndx-file data/voxsrc22_dev/trials.csv \ + --enroll-map-file data/voxsrc22_dev/enrollment.csv \ + --score-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + sleep 5s + done + sleep 10s + done + wait + hyperion-merge-scores --output-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxsrc22_dev.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file $score_cosine_snorm_dir/voxsrc22_dev_results.csv + + cat $score_cosine_snorm_dir/voxsrc22_dev_results.csv + + fi + +fi + +if [ "$do_qmf" == "true" ];then + if [ $stage -le 7 ];then + echo "Train QMF in Vox2" + echo "...Calculating quality measures for Vox2" + num_parts=8 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_qmf_dir/log/voxceleb2_trials_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend-with-qmf \ + --feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --ndx-file data/voxceleb2cat_train_trials/trials.csv \ + --enroll-map-file data/voxceleb2cat_train_trials/enrollments.csv \ + --score-file $score_cosine_qmf_dir/voxceleb2_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j 
--num-test-parts $num_parts & + done + sleep 5s + done + wait + hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + hyperion-train-qmf --score-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ + --key-file data/voxceleb2cat_train_trials/trials.csv \ + --model-file $score_cosine_qmf_dir/qmf.h5 + + fi + + if [ $stage -le 8 ];then + echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm + QMF" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd --mem 22G $score_cosine_qmf_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend-with-qmf \ + --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_cosine_qmf_dir/voxceleb1_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --qmf-file $score_cosine_qmf_dir/qmf.h5 \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + sleep 5s + done + wait + for suffix in "" .snorm .snorm.qmf + do + ( + hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1$suffix.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_cosine_qmf_dir/voxceleb1_results$suffix.csv + + echo "$score_cosine_qmf_dir/voxceleb1_results$suffix.csv:" + cat $score_cosine_qmf_dir/voxceleb1_results$suffix.csv + ) & + done + wait + fi + + if [ $stage -le 9 ] && [ "$do_voxsrc22" == "true" ];then + echo "Eval voxsrc2 with Cosine scoring + QMF" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_qmf_dir/log/voxsrc22_dev_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend-with-qmf \ + --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ + --ndx-file data/voxsrc22_dev/trials.csv \ + --enroll-map-file data/voxsrc22_dev/enrollment.csv \ + --score-file $score_cosine_qmf_dir/voxsrc22_dev_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --qmf-file $score_cosine_qmf_dir/qmf.h5 \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + sleep 5s + done + sleep 10s + done + wait + for suffix in "" .snorm .snorm.qmf + do + ( + hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxsrc22_dev$suffix.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file 
$score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv + + echo "$score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv:" + cat $score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv + ) & + done + wait + fi + +fi + diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos.py b/egs/voxceleb/v1/steps_be/eval_be_cos.py index 1f9978ee..a9bc03d1 100755 --- a/egs/voxceleb/v1/steps_be/eval_be_cos.py +++ b/egs/voxceleb/v1/steps_be/eval_be_cos.py @@ -20,7 +20,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.list_utils import ismember from hyperion.utils import TrialNdx, TrialScores -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F from hyperion.np.transforms import TransformList diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py index 7034126a..bf66d72b 100755 --- a/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py +++ b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py @@ -19,7 +19,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores, Utt2Info -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.np.pdfs import PLDA from hyperion.utils.list_utils import ismember from hyperion.helpers import TrialDataReader as TDR diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py b/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py index dad89ced..0eca769d 100755 --- a/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py +++ b/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py @@ -20,7 +20,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.list_utils import ismember from hyperion.utils import TrialNdx, TrialScores -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F from hyperion.np.transforms import TransformList diff --git a/egs/voxceleb/v2.1/cmd.sh b/egs/voxceleb/v2.1/cmd.sh new file mode 100755 index 00000000..040f458b --- /dev/null +++ b/egs/voxceleb/v2.1/cmd.sh @@ -0,0 +1,28 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
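For sites running Slurm rather than GridEngine, the comment above points at conf/queue.conf; as a hedged sketch (not part of this patch, and the exact sbatch flags depend on your cluster), a minimal Slurm-style queue.conf could look like the following. See utils/queue.pl for the full command/option/default syntax.

```
# Illustrative conf/queue.conf sketch for a Slurm site (assumes sbatch is available);
# adjust partitions and limits to match your cluster.
command sbatch --export=PATH
option mem=* --mem-per-cpu $0
option mem=0                     # do not add a memory flag when mem=0
option num_threads=* --cpus-per-task $0
option num_threads=1             # do not add a CPU flag for a single thread
default gpu=0
option gpu=* --gres=gpu:$0
```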
+ +if [ "$(hostname -d)" == "cm.gemini" ];then + #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +else + export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V" + export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V" + export cuda_eval_cmd="$train_cmd" +fi + + + diff --git a/egs/voxceleb/v2.1/conf/clsp.conf b/egs/voxceleb/v2.1/conf/clsp.conf new file mode 100644 index 00000000..4ed38246 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/clsp.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/voxceleb/v2.1/conf/coe_gpu_bigmem.conf b/egs/voxceleb/v2.1/conf/coe_gpu_bigmem.conf new file mode 100644 index 00000000..a7a2ce40 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_bigmem.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]* diff --git a/egs/voxceleb/v2.1/conf/coe_gpu_long.conf b/egs/voxceleb/v2.1/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/voxceleb/v2.1/conf/coe_gpu_rtx.conf b/egs/voxceleb/v2.1/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff --git a/egs/voxceleb/v2.1/conf/coe_gpu_short.conf b/egs/voxceleb/v2.1/conf/coe_gpu_short.conf new file mode 100644 index 00000000..81de5cb7 --- /dev/null +++ 
b/egs/voxceleb/v2.1/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/voxceleb/v2.1/conf/coe_gpu_v100.conf b/egs/voxceleb/v2.1/conf/coe_gpu_v100.conf new file mode 100644 index 00000000..69326b82 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_v100.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100 diff --git a/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml b/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..86f55073 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml @@ -0,0 +1,34 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: csv:data/rirs_smallroom/rirs.csv + rir_norm: max + mediumroom: + weight: 1 + rir_path: csv:data/rirs_mediumroom/rirs.csv + rir_norm: max + realroom: + weight: 1 + rir_path: csv:data/rirs_real/rirs.csv + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/recordings.csv + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/recordings.csv + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/recordings.csv + min_snr: 3 + max_snr: 18 diff --git a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..ffd2f374 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,60 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + target_key: speaker + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git 
a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml new file mode 100644 index 00000000..7dcc56ef --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml @@ -0,0 +1,60 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + target_key: speaker + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..3f5c46bc --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,60 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + target_key: speaker + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..9e1d0928 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,60 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + 
min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + target_key: speaker + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..0d0dc398 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,60 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: wavlmbaseplus_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + target_key: speaker + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..8504db9e --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,64 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + 
lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..dda0c632 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..46ee7d18 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,60 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: wavlmlarge12l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + target_key: speaker + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git 
a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..db36f8ee --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,64 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..dda0c632 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml new file mode 100644 index 00000000..8504db9e --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml @@ -0,0 +1,64 @@ 
+data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml new file mode 100644 index 00000000..ad56e80d --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..40341a27 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,60 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + 
data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: wavlmlarge_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + target_key: speaker + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..8504db9e --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,64 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..ad56e80d --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + 
data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..b5b9b6b6 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: + hf_feats: + override_lora: true + use_lora: true + lora_rank: 4 + lora_components: + - q_proj + - v_proj + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + target_key: speaker + train_mode: hf-lora diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..a39445ff --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: 
sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + target_key: speaker + train_mode: hf-lora diff --git a/egs/voxceleb/v2.1/conf/vad_16k.yaml b/egs/voxceleb/v2.1/conf/vad_16k.yaml new file mode 100644 index 00000000..5fb0111c --- /dev/null +++ b/egs/voxceleb/v2.1/conf/vad_16k.yaml @@ -0,0 +1,8 @@ +sample_frequency: 16000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: 5.5 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 diff --git a/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..c3466259 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + drop_layers_gt: 12 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml new file mode 100644 index 00000000..d9c9b782 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +xvector: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 1024 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..dc3737e3 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 
+ se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..d7e3388f --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus + drop_layers_gt: 9 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..b2430d97 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..5025f047 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-large + drop_layers_gt: 12 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish 
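+# feat_fusion_method/feat_fusion_start below choose how the transformer hidden layers are
+# fused before the x-vector encoder: weighted-avg learns one weight per layer, starting at
+# hidden layer 2.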
+feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..0a6303f5 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-large +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/datapath.sh b/egs/voxceleb/v2.1/datapath.sh new file mode 100644 index 00000000..a7eb575c --- /dev/null +++ b/egs/voxceleb/v2.1/datapath.sh @@ -0,0 +1,23 @@ +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + + +if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then + # voxceleb1_root=/export/corpora5/VoxCeleb1_v1 #voxceleb1 v1 + voxceleb1_root=/export/corpora5/VoxCeleb1_v2 #voxceleb1 v2 + voxceleb2_root=/export/corpora5/VoxCeleb2 + musan_root=/export/corpora5/JHU/musan +elif [ "$(hostname --domain)" == "cm.gemini" ];then + # voxceleb1_root=/expscratch/dsnyder/VoxCeleb1 #voxceleb1 v1 + voxceleb1_root=/exp/jvillalba/corpora/voxceleb1 #voxceleb1 v2 + voxceleb2_root=/expscratch/dgromero/corpora-open/vox2 + voxsrc22_root=/exp/jvillalba/corpora/voxsrc22 + musan_root=/expscratch/dgromero/corpora-open/musan +else + echo "Put your database paths here" + exit 1 +fi + + diff --git a/egs/voxceleb/v2.1/default_config.sh b/egs/voxceleb/v2.1/default_config.sh new file mode 120000 index 00000000..f2d8812d --- /dev/null +++ b/egs/voxceleb/v2.1/default_config.sh @@ -0,0 +1 @@ +global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh \ No newline at end of file diff --git a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..67a4665e --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params layers 2-12 + +# hugging face model +hf_model_name=wav2vec2xlsr300m12l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 
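+# Stage-3 (large-margin finetuning) experiment dir and checkpoint; run_006_extract_xvectors.sh
+# and run_007_eval_be.sh load them when nnet_stage=3.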
+nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.sh new file mode 100644 index 00000000..b4130fad --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn1024x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..80ee785b --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + 
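The variables defined in these global_conf scripts are picked up by the recipe's run_0*.sh stages through the --config-file option. A minimal usage sketch, relying only on the v2.1 scripts added later in this diff, for running the recipe with the XLSR-300M setup instead of the default WavLM-Base+ config:

```bash
# train the three stages, extract x-vectors with the stage-3 model, and score the cosine back-end
./run_005_train_xvector.sh    --config-file global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh
./run_006_extract_xvectors.sh --config-file global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh --use-gpu true
./run_007_eval_be.sh          --config-file global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh
```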
diff --git a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..1985b8e6 --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,55 @@ +# Wav2Vec2 Multilingual 300M params + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_name=${hf_model_name}_loraqv_ecapatdnn512x3_v2.0 +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +#do_snorm=true +#do_qmf=true +#do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..c2b30f68 --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus9l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..373535c2 --- /dev/null +++ 
b/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..530096cc --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmlarge12l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..1b276bcd --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmlarge + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + 
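+# hf_wavlm2resnet1d: HuggingFace WavLM front-end followed by the ResNet1d (ECAPA-TDNN 512x3) x-vector head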
+nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/hyp_utils b/egs/voxceleb/v2.1/hyp_utils new file mode 120000 index 00000000..f6d1eb7a --- /dev/null +++ b/egs/voxceleb/v2.1/hyp_utils @@ -0,0 +1 @@ +../../../hyp_utils \ No newline at end of file diff --git a/egs/voxceleb/v2.1/path.sh b/egs/voxceleb/v2.1/path.sh new file mode 100755 index 00000000..6994fdab --- /dev/null +++ b/egs/voxceleb/v2.1/path.sh @@ -0,0 +1,5 @@ + +export HYP_ROOT=$(readlink -f `pwd -P`/../../..) +export TOOLS_ROOT=$HYP_ROOT/tools + +. $TOOLS_ROOT/path.sh diff --git a/egs/voxceleb/v2.1/run_001_prepare_data.sh b/egs/voxceleb/v2.1/run_001_prepare_data.sh new file mode 100755 index 00000000..563d3c2d --- /dev/null +++ b/egs/voxceleb/v2.1/run_001_prepare_data.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. datapath.sh +. $config_file + +if [ $stage -le 1 ];then + # Prepare the VoxCeleb2 dataset for training. + hyperion-prepare-data voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ + --cat-videos --use-kaldi-ids \ + --output-dir data/voxceleb2cat_train +fi + +if [ $stage -le 2 ];then + # prepare voxceleb1 for test + hyperion-prepare-data voxceleb1 --task test --corpus-dir $voxceleb1_root \ + --use-kaldi-ids \ + --output-dir data/voxceleb1_test +fi + +if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then + hyperion-prepare-data voxsrc22 --subset dev --corpus-dir $voxsrc22_root \ + --vox1-corpus-dir $voxceleb1_root \ + --output-dir data/voxsrc22_dev +fi + +# if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then + # hyperion-prepare-data voxsrc22 --subset test --corpus-dir $voxsrc22_root \ + # --vox1-corpus-dir $voxceleb1_root \ + # --output-dir data/voxsrc22_test +# fi + +if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then + # split vox2 into 2 parts, for cohort and qmf training + hyperion-split-dataset-into-trials-and-cohort --data-dir data/voxceleb2cat_train +fi diff --git a/egs/voxceleb/v2.1/run_002_compute_evad.sh b/egs/voxceleb/v2.1/run_002_compute_evad.sh new file mode 100755 index 00000000..acccace3 --- /dev/null +++ b/egs/voxceleb/v2.1/run_002_compute_evad.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e +nodes=fs01 +vad_dir=`pwd`/exp/vad_e +vad_config=conf/vad_16k.yaml +nj=40 + +stage=1 +config_file=default_config.sh + +. 
parse_options.sh || exit 1; +. $config_file + +if [ -z "$vad_config" ];then + echo "We are not using VAD in this configuration" + exit 0 +fi + +if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" +fi + + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + # This only does something at CLSP grid + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + hyp_utils/create_data_split_dirs.sh \ + $vad_dir/$name \ + $USER/hyp-data/voxceleb/v1.2/vad $nodes + done +fi + +#Train datasets +if [ $stage -le 2 ];then + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + # This creates links to distribute data in CLSP grid + # If you are not at CLSP grid, it does nothing and can be deleted + hyp_utils/create_data_split_links.sh $vad_dir/$name/vad.JOB.ark $nj + echo "compute vad for $name" + $train_cmd JOB=1:$nj $vad_dir/$name/log/vad.JOB.log \ + hyp_utils/conda_env.sh \ + hyperion-compute-energy-vad --cfg $vad_config \ + --recordings-file data/$name/recordings.csv \ + --output-spec ark,csv:$vad_dir/$name/vad.JOB.ark,$vad_dir/$name/vad.JOB.csv \ + --part-idx JOB --num-parts $nj || exit 1 + + hyperion-tables cat \ + --table-type features \ + --output-file $vad_dir/$name/vad.csv --num-tables $nj + hyperion-dataset add_features \ + --dataset data/$name \ + --features-name vad \ + --features-file $vad_dir/$name/vad.csv + done +fi + + diff --git a/egs/voxceleb/v2.1/run_003_prepare_noises_rirs.sh b/egs/voxceleb/v2.1/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..73c7ed82 --- /dev/null +++ b/egs/voxceleb/v2.1/run_003_prepare_noises_rirs.sh @@ -0,0 +1,102 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nj=10 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +# We prepare the noise files and RIR for online speech augmentation +if [ $stage -le 1 ]; then + for name in noise music speech + do + hyperion-prepare-data musan \ + --corpus-dir $musan_root \ + --subset $name \ + --output-dir data/musan_$name + done +fi + +if [ $stage -le 2 ]; then + # # Prepare to distribute data over multiple machines + # # This only does something at CLSP grid + # hyp_utils/create_data_split_dirs.sh $vad_dir $USER/hyp-data/voxceleb/v1.2/vad $nodes + + for name in musan_noise musan_music + do + input_data_dir=data/$name + output_data_dir=data/${name}_proc_audio + output_dir=exp/proc_audio/$name + $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${name}.JOB.log \ + hyp_utils/conda_env.sh \ + hyperion-preprocess-audio-files \ + --audio-format flac \ + --part-idx JOB --num-parts $nj \ + --recordings-file $input_data_dir/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_dir/recordings.JOB.csv + + hyperion-tables cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj + hyperion-dataset set_recordings \ + --dataset $input_data_dir \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset $output_data_dir + + + done +fi + +if [ $stage -le 3 ]; then + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + input_data_dir=data/$name + output_data_dir=data/${name}_babble + output_dir=exp/proc_audio/${name}_babble + $train_cmd $output_dir/log/make_babble_noise_${name}.log \ + hyp_utils/conda_env.sh \ + hyperion-make-babble-noise-audio-files \ + --audio-format flac \ + --min-spks 3 --max-spks 10 --num-reuses 5 \ + --recordings-file $input_data_dir/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_data_dir/recordings.csv + hyperion-dataset make_from_recordings \ + --dataset $output_data_dir \ + --recordings-file $output_data_dir/recordings.csv + done +fi + +if [ $stage -le 4 ]; then + if [ ! -d "RIRS_NOISES" ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/smallroom --output-dir data/rirs_smallroom + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/mediumroom --output-dir data/rirs_mediumroom + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/real_rirs_isotropic_noises --output-dir data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + output_dir=exp/rirs/$rirs + data_dir=data/$rirs + $train_cmd $output_dir/log/pack_rirs_${name}.log \ + hyp_utils/conda_env.sh \ + hyperion-pack-wav-rirs ${args} --input $data_dir/recordings.csv \ + --output h5,csv:$output_dir/rirs.h5,$output_dir/rirs.csv || exit 1; + hyperion-dataset add_features --dataset $data_dir \ + --features-name rirs --features-file $output_dir/rirs.csv + + done +fi + diff --git a/egs/voxceleb/v2.1/run_004_prepare_xvec_train_data.sh b/egs/voxceleb/v2.1/run_004_prepare_xvec_train_data.sh new file mode 100755 index 00000000..4e0c5b19 --- /dev/null +++ b/egs/voxceleb/v2.1/run_004_prepare_xvec_train_data.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +nodes=b1 +nj=40 +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. 
$config_file + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + # This only does something at CLSP grid + hyp_utils/create_data_split_dirs.sh \ + exp/xvector_audios/$nnet_data \ + $USER/hyp-data/voxceleb/v1.2/xvector_audios/$nnet_data $nodes +fi + +if [ $stage -le 2 ];then + output_dir=exp/proc_audio/$nnet_data + # This creates links to distribute data in CLSP grid + # If you are not at CLSP grid, it does nothing and can be deleted + hyp_utils/create_audios_split_links.sh $output_dir data/$nnet_data/recordings.csv flac + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$nnet_data/vad.csv" + update_durs="--update-seg-durs" + fi + + $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${nnet_data}.JOB.log \ + hyp_utils/conda_env.sh \ + hyperion-preprocess-audio-files \ + --audio-format flac --remove-dc-offset $vad_args \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$nnet_data/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_dir/recordings.JOB.csv + + hyperion-tables cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj + + hyperion-dataset set_recordings $update_durs \ + --dataset data/$nnet_data \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset data/${nnet_data}_proc_audio \ + --remove-features vad +fi + +if [ $stage -le 3 ];then + hyperion-dataset remove_short_segments \ + --dataset data/${nnet_data}_proc_audio \ + --output-dataset data/${nnet_data}_filtered \ + --length-name duration --min-length 2.0 + + hyperion-dataset remove_classes_few_segments \ + --dataset data/${nnet_data}_filtered \ + --class-name speaker --min-segs 4 +fi + +if [ $stage -le 4 ];then + hyperion-dataset split_train_val \ + --dataset data/${nnet_data}_filtered \ + --val-prob 0.03 \ + --joint-classes speaker --min-train-samples 1 \ + --seed 1123581321 \ + --train-dataset data/${nnet_data}_xvector_train \ + --val-dataset data/${nnet_data}_xvector_val +fi + diff --git a/egs/voxceleb/v2.1/run_005_train_xvector.sh b/egs/voxceleb/v2.1/run_005_train_xvector.sh new file mode 100755 index 00000000..eb1c591e --- /dev/null +++ b/egs/voxceleb/v2.1/run_005_train_xvector.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +train_data_dir=data/${nnet_data}_xvector_train +val_data_dir=data/${nnet_data}_xvector_val + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v1.1 --trainer.wandb.name $nnet_name.$(date -Iminutes)" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyperion-train-wav2vec2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --trainer.exp-path $nnet_s1_dir \ + --num-gpus $ngpu \ + +fi + + +# Finetune full model +if [ $stage -le 2 ]; then + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyperion-finetune-wav2vec2xvector $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --in-model-file $nnet_s1 \ + --trainer.exp-path $nnet_s2_dir \ + --num-gpus $ngpu \ + +fi + +# Finetune full model +if [ $stage -le 3 ]; then + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyperion-finetune-wav2vec2xvector $nnet_type --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --in-model-file $nnet_s2 \ + --trainer.exp-path $nnet_s3_dir \ + --num-gpus $ngpu \ + +fi diff --git a/egs/voxceleb/v2.1/run_006_extract_xvectors.sh b/egs/voxceleb/v2.1/run_006_extract_xvectors.sh new file mode 100755 index 00000000..72b019cd --- /dev/null +++ b/egs/voxceleb/v2.1/run_006_extract_xvectors.sh @@ -0,0 +1,104 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=3 +config_file=default_config.sh +use_gpu=false +hf_chunk_length=120.0 #seconds +xvec_chunk_length=120.0 +. parse_options.sh || exit 1; +. 
$config_file + +if [ "$use_gpu" == "true" ];then + xvec_args="--use-gpu --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length" + xvec_cmd="$cuda_eval_cmd --gpu 1 --mem 6G" + num_gpus=1 +else + xvec_cmd="$train_cmd --mem 12G" + num_gpus=0 +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + +xvector_dir=exp/xvectors/$nnet_name + +if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qmf" == "true" || "$do_pca" == "true") ]]; then + # Extract xvectors for training LDA/PLDA + nj=100 + for name in voxceleb2cat_train + do + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$name/vad.csv" + fi + output_dir=$xvector_dir/$name + echo "Extracting x-vectors for $name" + $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + hyperion-extract-wav2vec2xvectors ${xvec_args} ${vad_args} \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$name/recordings.csv \ + --random-utt-length --min-utt-length 2 --max-utt-length 30 \ + --model-path $nnet \ + --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv + hyperion-tables cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj + + done +fi + +if [ $stage -le 2 ]; then + # Extracts x-vectors for evaluation + nj=100 + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb1_test $extra_data + do + num_segs=$(wc -l data/$name/segments.csv | awk '{ print $1-1}') + nj=$(($num_segs < 100 ? $num_segs:100)) + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$name/vad.csv" + fi + output_dir=$xvector_dir/$name + echo "Extracting x-vectors for $name" + $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + hyperion-extract-wav2vec2xvectors ${xvec_args} ${vad_args} \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$name/recordings.csv \ + --model-path $nnet \ + --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv + hyperion-tables cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj + + done +fi + + diff --git a/egs/voxceleb/v2.1/run_007_eval_be.sh b/egs/voxceleb/v2.1/run_007_eval_be.sh new file mode 100755 index 00000000..53621488 --- /dev/null +++ b/egs/voxceleb/v2.1/run_007_eval_be.sh @@ -0,0 +1,321 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + +plda_label=${plda_type}y${plda_y_dim}_v1 +be_name=lda${lda_dim}_${plda_label}_${plda_data} + +xvector_dir=exp/xvectors/$nnet_name +be_dir=exp/be/$nnet_name/$be_name +score_dir=exp/scores/$nnet_name +score_plda_dir=$score_dir/${be_name}/plda +score_cosine_dir=$score_dir/cosine +score_cosine_snorm_dir=$score_dir/cosine_snorm +score_cosine_qmf_dir=$score_dir/cosine_qmf + +if [ $stage -le 3 ];then + + echo "Eval Voxceleb 1 with Cosine scoring" + num_parts=8 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend \ + --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_cosine_dir/voxceleb1_scores.csv \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + done + wait + hyperion-merge-scores --output-file $score_cosine_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_dir/voxceleb1_scores.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_cosine_dir/voxceleb1_results.csv + + cat $score_cosine_dir/voxceleb1_results.csv +fi + +if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then + echo "Eval voxsrc2 with Cosine scoring" + $train_cmd $score_cosine_dir/log/voxsrc22_dev.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend \ + --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ + --ndx-file data/voxsrc22_dev/trials.csv \ + --enroll-map-file data/voxsrc22_dev/enrollment.csv \ + --score-file $score_cosine_dir/voxsrc22_dev_scores.csv + + # $train_cmd $score_cosine_dir/log/voxsrc22_eval.log \ + # hyp_utils/conda_env.sh \ + # hyperion-eval-cosine-scoring-backend \ + # --feats-file csv:$xvector_dir/voxsrc22_eval/xvector.csv \ + # --ndx-file data/voxsrc22_eval/trials.csv \ + # --enroll-map-file data/voxsrc22_eval/enrollment.csv \ + # --score-file $score_cosine_dir/voxsrc22_eval_scores.csv + + $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxsrc22_dev.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_dir/voxsrc22_dev_scores.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file $score_cosine_dir/voxsrc22_dev_results.csv + + cat $score_cosine_dir/voxsrc22_dev_results.csv + +fi + +if [ "$do_snorm" == "true" ];then + if [ $stage -le 5 ];then + echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd --mem 22G $score_cosine_snorm_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + 
hyperion-eval-cosine-scoring-backend \ + --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + sleep 5s + done + wait + hyperion-merge-scores --output-file $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxceleb1.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_cosine_snorm_dir/voxceleb1_results.csv + + cat $score_cosine_snorm_dir/voxceleb1_results.csv + fi + + if [ $stage -le 6 ] && [ "$do_voxsrc22" == "true" ];then + echo "Eval voxsrc2 with Cosine scoring + AS-Norm" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_snorm_dir/log/voxsrc22_dev_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend \ + --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ + --ndx-file data/voxsrc22_dev/trials.csv \ + --enroll-map-file data/voxsrc22_dev/enrollment.csv \ + --score-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + sleep 5s + done + sleep 10s + done + wait + hyperion-merge-scores --output-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxsrc22_dev.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file $score_cosine_snorm_dir/voxsrc22_dev_results.csv + + cat $score_cosine_snorm_dir/voxsrc22_dev_results.csv + + fi + +fi + +if [ "$do_qmf" == "true" ];then + if [ $stage -le 7 ];then + echo "Train QMF in Vox2" + echo "...Calculating quality measures for Vox2" + num_parts=8 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_qmf_dir/log/voxceleb2_trials_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend-with-qmf \ + --feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --ndx-file data/voxceleb2cat_train_trials/trials.csv \ + --enroll-map-file data/voxceleb2cat_train_trials/enrollments.csv \ + --score-file $score_cosine_qmf_dir/voxceleb2_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j 
--num-test-parts $num_parts & + done + sleep 5s + done + wait + hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + hyperion-train-qmf --score-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ + --key-file data/voxceleb2cat_train_trials/trials.csv \ + --model-file $score_cosine_qmf_dir/qmf.h5 + + fi + + if [ $stage -le 8 ];then + echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm + QMF" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd --mem 22G $score_cosine_qmf_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend-with-qmf \ + --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_cosine_qmf_dir/voxceleb1_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --qmf-file $score_cosine_qmf_dir/qmf.h5 \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + sleep 5s + done + wait + for suffix in "" .snorm .snorm.qmf + do + ( + hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1$suffix.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_cosine_qmf_dir/voxceleb1_results$suffix.csv + + echo "$score_cosine_qmf_dir/voxceleb1_results$suffix.csv:" + cat $score_cosine_qmf_dir/voxceleb1_results$suffix.csv + ) & + done + wait + fi + + if [ $stage -le 9 ] && [ "$do_voxsrc22" == "true" ];then + echo "Eval voxsrc2 with Cosine scoring + QMF" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_qmf_dir/log/voxsrc22_dev_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend-with-qmf \ + --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ + --ndx-file data/voxsrc22_dev/trials.csv \ + --enroll-map-file data/voxsrc22_dev/enrollment.csv \ + --score-file $score_cosine_qmf_dir/voxsrc22_dev_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --qmf-file $score_cosine_qmf_dir/qmf.h5 \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + sleep 5s + done + sleep 10s + done + wait + for suffix in "" .snorm .snorm.qmf + do + ( + hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxsrc22_dev$suffix.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file 
$score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv + + echo "$score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv:" + cat $score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv + ) & + done + wait + fi + +fi + diff --git a/egs/voxceleb/v2/README.md b/egs/voxceleb/v2/README.md index 5b5b93e5..0bafe85e 100644 --- a/egs/voxceleb/v2/README.md +++ b/egs/voxceleb/v2/README.md @@ -1,24 +1,9 @@ -# VoxCeleb V1.1 +# VoxCeleb V2 -Recipe for the VoxCeleb Speaker Verification Task +Recipe for the VoxCeleb Speaker Verification Task using Wav2Vec2, WavLM or Hubert models from HuggingFace as feature extractors ## Differences w.r.t VoxCeleb V1 recipe -In recipe version V1: - - We compute speech augmentations and acoustic features offline and dump them to disk. - - Augmentation is performed using Kaldi scripts and wav-reverbate tool - - Babble noise is created on-the-fly when computing features by mixing 3-7 single speaker files. - -In this recipe: - - We compute speech augmentations and acoustic features are computed always on-the-fly, - we don't dump any features to disk. - - Augmentation is performed using Hyperin SpeechAugment class. - - The behavior of this class is controlled - by the the configuration file `conf/reverb_noise_aug.yml`, - which mimics the proportions of noise and RIR types, and SNRs used in the V1 or the recipe. - - Babble noise is created offline by mixing 3-10 single speaker files. - - ## Citing ## Training Data @@ -41,15 +26,14 @@ In this recipe: ## Usage - Run the run_0*.sh scripts in sequence - - By default it will use Light ResNet (16 base channels) - - For better performance use full ResNet (64 base channels) using `config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh` file as + - By default it will use config global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh + - For better performance use ```bash -run_011_train_xvector.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh -run_030_extract_xvectors.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh --use-gpu true -run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh +run_011_train_xvector.sh --config-file global_conf/other_config.sh +run_030_extract_xvectors.sh --config-file global_conf/other_config.sh --use-gpu true +run_040_eval_be.sh --config-file global_conf/other_config.sh ``` - - To train with mixed precision training use config file `config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh` ## Recipe Steps: @@ -73,7 +57,9 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr - Creates training and validation lists for x-vector training - `run_011_train_xvector.sh` - - Trains the x-vector network + - Trains the x-vector model on frozen wav2vec features + - Finetunes wav2vec+x-vector model + - Large margin finetuning of wav2vec+x-vector model - `run_030_extract_xvectors.sh` - Extracts x-vectors for VoxCeleb2 or VoxCeleb2+augmentation for PLDA training @@ -89,117 +75,90 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | | ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.00 | 0.129 | 0.216 | -| | | | Cosine | 2.04 | 0.138 | 0.210 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | 
ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.35 | 0.091 | 0.159 | -| | | | Cosine | 1.22 | 0.082 | 0.129 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 1.19 | 0.074 | 0.124 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 1.30 | 0.090 | 0.160 | -| | | | Cosine | 1.44 | 0.100 | 0.173 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.091 | 0.143 | -| | | | Cosine | 1.17 | 0.081 | 0.110 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 1.37 | 0.104 | 0.179 | -| | | | Cosine | 1.31 | 0.080 | 0.139 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 1.29 | 0.088 | 0.129 | -| | | | Cosine | 1.23 | 0.083 | 0.136 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 1.20 | 0.095 | 0.156 | -| | | | Cosine | 1.29 | 0.089 | 0.146 | -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.20 | 0.084 | 0.136 | -| | | | Cosine | 1.18 | 0.078 | 0.115 | -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.11 | 0.084 | 0.145 | -| | | | Cosine | 1.12 | 0.073 | 0.131 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 1.53 | 0.104 | 0.189 | -| | | | Cosine | 1.31 | 0.084 | 0.132 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 0.98 | 0.066 | 0.116 | -| | | | Cosine | 1.12 | 0.071 | 0.103 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 1.05 | 0.077 | 0.123 | -| | | | Cosine | 0.96 | 0.065 | 0.110 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 1.04 | 0.071 | 0.118 | -| | | | Cosine | 0.93 | 0.067 | 0.108 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1_swa.sh | Res2Net50 width=26x8 | + SWA | PLDA | 0.90 | 0.067 | 0.118 | -| | | | Cosine | 0.85 | 0.060 | 0.094 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 1.44 | 0.102 | 0.169 | -| | | | Cosine | 1.29 | 0.084 | 0.140 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 1.12 | 0.071 | 0.116 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.05 | 0.074 | 0.116 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.09 | 0.081 | 0.150 | - +| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.84 | 0.060 | 0.116 | +| | | | Cosine + AS-Norm | 0.81 | 0.058 | 0.108 | +| | | | Cosine + QMF | 0.75 | 0.054 | 0.086 | +| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.89 | 0.069 | 0.108 | +| | | | Cosine + AS-Norm | 0.86 | 0.067 | 0.108 | +| | | | Cosine + QMF | 0.77 | 0.066 | 0.105 | +| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.74 | 0.057 | 0.085 | +| | | | Cosine + AS-Norm | 0.73 | 0.055 | 0.093 | +| | | | Cosine + QMF | 0.66 | 0.051 | 0.094 | +| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.74 | 0.053 | 0.080 | +| | | | Cosine + AS-Norm | 0.71 | 0.050 | 0.087 | +| | | | Cosine + QMF | 0.64 | 0.045 | 0.087 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.84 | 0.063 | 0.111 | +| | | | Cosine + AS-Norm | 0.68 | 0.053 | 0.090 | +| | | | Cosine + QMF | 0.63 | 0.048 | 0.071 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.14 | 0.074 | 0.107 | +| | | | Cosine + AS-Norm | 0.94 | 0.060 | 0.089 | +| | | | Cosine + QMF | 0.89 | 0.054 | 0.076 | ### VoxCeleb 1 Entire-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | | ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.86 | 0.124 | 0.210 | -| | | | Cosine | 1.93 | 0.122 | 0.201 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.43 | 0.091 | 0.159 | -| | | | Cosine | 1.24 | 0.080 | 0.136 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 1.19 | 0.077 | 0.132 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace 
s=30/m=0.3 | PLDA | 1.27 | 0.084 | 0.150 | -| | | | Cosine | 1.30 | 0.082 | 0.150 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.30 | 0.083 | 0.146 | -| | | | Cosine | 1.09 | 0.071 | 0.124 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 1.45 | 0.097 | 0.165 | -| | | | Cosine | 1.15 | 0.076 | 0.132 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 1.47 | 0.094 | 0.165 | -| | | | Cosine | 1.27 | 0.082 | 0.148 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 1.31 | 0.086 | 0.149 | -| | | | Cosine | 1.22 | 0.079 | 0.134 | -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.27 | 0.082 | 0.145 | -| | | | Cosine | 1.16 | 0.074 | 0.130 | -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.077 | 0.136 | -| | | | Cosine | 1.11 | 0.071 | 0.125 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 1.46 | 0.097 | 0.173 | -| | | | Cosine | 1.24 | 0.080 | 0.140 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 1.11 | 0.071 | 0.127 | -| | | | Cosine | 1.05 | 0.067 | 0.117 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.078 | 0.134 | -| | | | Cosine | 1.05 | 0.069 | 0.121 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 1.18 | 0.075 | 0.131 | -| | | | Cosine | 0.98 | 0.063 | 0.110 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | Res2Net50 width=26x8 | + SWA | PLDA | 1.17 | 0.072 | 0.123 | -| | | | Cosine | 0.94 | 0.061 | 0.107 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 1.56 | 0.095 | 0.166 | -| | | | Cosine | 1.27 | 0.079 | 0.142 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 1.19 | 0.077 | 0.137 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.12 | 0.073 | 0.129 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | TSE-Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.05 | 0.068 | 0.120 | - +| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.81 | 0.051 | 0.087 | +| | | | Cosine + AS-Norm | 0.78 | 0.047 | 0.083 | +| | | | Cosine + QMF | 0.75 | 0.046 | 0.076 | +| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.89 | 0.056 | 0.099 | +| | | | Cosine + AS-Norm | 0.86 | 0.053 | 0.090 | +| | | | Cosine + QMF | 0.82 | 0.050 | 0.085 | +| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.80 | 0.049 | 0.088 | +| | | | Cosine + AS-Norm | 0.76 | 0.045 | 0.080 | +| | | | Cosine + QMF | 0.73 | 0.043 | 0.078 | +| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.91 | 0.056 | 0.094 | +| | | | Cosine + AS-Norm | 0.87 | 0.053 | 0.090 | +| | | | Cosine + QMF | 0.83 | 0.050 | 0.086 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.80 | 0.050 | 0.086 | +| | | | Cosine + AS-Norm | 0.73 | 0.045 | 0.074 | +| | | | Cosine + QMF | 0.69 | 0.042 | 0.069 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.99 | 0.058 | 0.103 | +| | | | Cosine + AS-Norm | 0.87 | 0.052 | 0.090 | +| | | | Cosine + QMF | 0.83 | 0.050 | 0.085 | ### VoxCeleb 1 Hard-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | | ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 3.29 | 0.195 | 0.318 | -| | | | Cosine | 3.27 | 0.188 | 0.303 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.66 | 0.160 | 0.258 | -| | | | Cosine | 2.32 | 0.139 | 0.232 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 2.19 | 0.133 | 0.215 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | 
ArcFace s=30/m=0.3 | PLDA | 2.33 | 0.139 | 0.227 | -| | | | Cosine | 2.33 | 0.142 | 0.235 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.46 | 0.142 | 0.237 | -| | | | Cosine | 2.14 | 0.126 | 0.203 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 2.57 | 0.153 | 0.255 | -| | | | Cosine | 2.11 | 0.127 | 0.205 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 2.64 | 0.157 | 0.244 | -| | | | Cosine | 2.33 | 0.141 | 0.232 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 2.42 | 0.144 | 0.245 | -| | | | Cosine | 2.26 | 0.133 | 0.224 -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 2.39 | 0.141 | 0.235 | -| | | | Cosine | 2.17 | 0.128 | 0.215 -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 2.28 | 0.131 | 0.225 | -| | | | Cosine | 2.11 | 0.124 | 0.204 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 2.77 | 0.172 | 0.271 | -| | | | Cosine | 2.45 | 0.141 | 0.225 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 2.07 | 0.124 | 0.201 | -| | | | Cosine | 1.95 | 0.113 | 0.181 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 2.34 | 0.136 | 0.230 | -| | | | Cosine | 1.99 | 0.119 | 0.196 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 2.18 | 0.127 | 0.211 | -| | | | Cosine | 1.89 | 0.112 | 0.184 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1_swa.sh | Res2Net50 width=26x8 | + SWA | PLDA | 2.14 | 0.125 | 0.209 | -| | | | Cosine | 1.84 | 0.110 | 0.186 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 2.78 | 0.156 | 0.252 | -| | | | Cosine | 2.26 | 0.134 | 0.214 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 2.24 | 0.134 | 0.221 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 2.20 | 0.132 | 0.219 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 2.02 | 0.123 | 0.203 | +| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.73 | 0.113 | 0.182 | +| | | | Cosine + AS-Norm | 1.63 | 0.100 | 0.160 | +| | | | Cosine + QMF | 1.56 | 0.096 | 0.155 | +| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.88 | 0.122 | 0.200 | +| | | | Cosine + AS-Norm | 1.77 | 0.110 | 0.175 | +| | | | Cosine + QMF | 1.66 | 0.104 | 0.168 | +| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.67 | 0.103 | 0.165 | +| | | | Cosine + AS-Norm | 1.54 | 0.093 | 0.152 | +| | | | Cosine + QMF | 1.45 | 0.089 | 0.145 | +| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.78 | 0.106 | 0.174 | +| | | | Cosine + AS-Norm | 1.70 | 0.099 | 0.162 | +| | | | Cosine + QMF | 1.61 | 0.094 | 0.153 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.49 | 0.087 | 0.137 | +| | | | Cosine + AS-Norm | 1.29 | 0.074 | 0.117 | +| | | | Cosine + QMF | 1.22 | 0.069 | 0.111 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.84 | 0.107 | 0.172 | +| | | | Cosine + AS-Norm | 1.47 | 0.083 | 0.128 | +| | | | Cosine + QMF | 1.39 | 0.079 | 0.123 | + +### VoxSRC2022 dev + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.60 | 0.163 | 0.257 | +| | | | Cosine + AS-Norm | 2.43 | 0.150 | 0.244 | +| | | | Cosine + QMF | 2.31 | 0.143 | 0.232 | +| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.82 | 0.183 | 0.286 | +| | | | Cosine + AS-Norm | 2.69 | 0.168 | 0.265 | +| | | | Cosine + QMF | 2.52 | 0.158 | 0.252 | +| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | 
WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.65 | 0.176 | 0.289 | +| | | | Cosine + AS-Norm | 2.55 | 0.171 | 0.292 | +| | | | Cosine + QMF | 2.38 | 0.159 | 0.266 | +| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.62 | 0.153 | 0.251 | +| | | | Cosine + AS-Norm | 2.53 | 0.149 | 0.247 | +| | | | Cosine + QMF | 2.42 | 0.144 | 0.231 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.25 | 0.136 | 0.225 | +| | | | Cosine + AS-Norm | 2.01 | 0.125 | 0.209 | +| | | | Cosine + QMF | 1.92 | 0.117 | 0.200 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.83 | 0.175 | 0.276 | +| | | | Cosine + AS-Norm | 2.31 | 0.149 | 0.244 | +| | | | Cosine + QMF | 2.22 | 0.137 | 0.229 | diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..ad991124 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..254ff796 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd 
+ lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..52be6db5 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml deleted file mode 100644 index 8574a1cf..00000000 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml +++ /dev/null @@ -1,6 +0,0 @@ -data: - train: train_data_default.yaml - val: val_data_default.yaml -model: wavlmbaseplus_ecapatdnn512x3.yaml -trainer: trainer_phase1_sgd_default.yaml - \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase2_default.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase2_default.yaml deleted file mode 100644 index 87b01a1f..00000000 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase2_default.yaml +++ /dev/null @@ -1,12 +0,0 @@ -data: - train: train_data_default.yaml - val: val_data_default.yaml -model: - xvector: - cos_scale: 32.0 - margin: 0.2 - margin_warmup_epochs: 0 - intertop_k: 5 - intertop_margin: 0.1 -trainer: trainer_phase2_sgd_default.yaml - \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase3_default.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase3_default.yaml deleted file mode 100644 index d13931e0..00000000 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase3_default.yaml +++ /dev/null @@ -1,11 +0,0 @@ -data: - train: train_data_default.yaml - val: val_data_default.yaml -model: - xvector: - cos_scale: 32.0 - margin: 0.4 - margin_warmup_epochs: 0 - intertop_margin: 0. 
-trainer: trainer_phase3_sgd_default.yaml - \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml index 34c6e8dc..d4db70a7 100644 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml @@ -41,29 +41,6 @@ data: num_hard_prototypes: 0 data_loader: num_workers: 8 - -train: - dataset: - max_chunk_length: 3.0 - min_chunk_length: 3.0 - aug_cfg: conf/reverb_noise_aug.yaml - wav_scale: 1 - sampler: - batch_size: 32 - iters_per_epoch: 6 - data_loader: - num_workers: 8 - val: - dataset: - max_chunk_length: 4.0 - min_chunk_length: 4.0 - aug_cfg: conf/reverb_noise_aug.yaml - wav_scale: 1 - sampler: - batch_size: 32 - iters_per_epoch: 6 - data_loader: - num_workers: 8 model: wavlmbaseplus_ecapatdnn512x3.yaml trainer: optim: @@ -84,5 +61,4 @@ trainer: epochs: 60 eff_batch_size: 1024 train_mode: hf-feats-frozen-nograd - \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..bd3e7f86 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmbaseplus_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0_0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0_0.yaml new file mode 100644 index 00000000..ebeedde6 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0_0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: 
wavlmbaseplus_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4850 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-4 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 25 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..69a8322b --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..3443591a --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: 
true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..abe5da6e --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmlarge12l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..7287188c --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..3443591a --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + 
max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..2addaa1e --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmlarge_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..69a8322b --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: 
class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..5e1260ad --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..c3466259 --- /dev/null +++ b/egs/voxceleb/v2/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + drop_layers_gt: 12 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..dc3737e3 --- /dev/null +++ 
b/egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..d7e3388f --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus + drop_layers_gt: 9 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..b2430d97 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..5025f047 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-large + drop_layers_gt: 12 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + 
resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..0a6303f5 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-large +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/default_config.sh b/egs/voxceleb/v2/default_config.sh index abcc2a2e..f2d8812d 120000 --- a/egs/voxceleb/v2/default_config.sh +++ b/egs/voxceleb/v2/default_config.sh @@ -1 +1 @@ -global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.0.sh \ No newline at end of file +global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh \ No newline at end of file diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh deleted file mode 100644 index 942fb336..00000000 --- a/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh +++ /dev/null @@ -1,55 +0,0 @@ -# Wav2vec2 base trained on 960h LibriSpeech + ECAPA-TDNN 512x2 - -# hugging face model -hf_model_name=wav2vec2base - -#vad -vad_config=conf/vad_16k.yaml - -# x-vector training -nnet_data=voxceleb2cat_train - -# x-vector cfg - -nnet_type=hf_wav2vec2resnet1d - -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -dropout=0 -embed_dim=256 -lr=0.05 -s=30 -margin_warmup=20 -margin=0.3 -nnet_num_epochs=70 - - -lr=0.001 -#lr=0.005 -xvec_train_base_cfg=conf/train_wav2vec2base_ecapatdnn512x2_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.optim.lr $lr --trainer.lrsched.warmup-steps 20000 --trainer.lrsched.hold-steps 20000 --trainer.lrsched.min-lr 1e-6 --trainer.epochs 75 --model conf/wav2vec2base_specaug5_ecapatdnn512x2.yaml --data.train.dataset.max-chunk-length 2 --data.train.dataset.min-chunk-length 2" - -nnet_name=${hf_model_name}_ecapatdnn512x2_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v12 #v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0060.pth -nnet=$nnet_dir/swa_model_ep0076.pth -nnet=$nnet_dir/model_ep0060.pth -nnet=$nnet_dir/model_ep0030.pth -nnet=$nnet_dir/model_ep0040.pth -nnet=$nnet_dir/model_ep0020.pth - - -# 
back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=6 -if [ $plda_num_augs -eq 0 ]; then - plda_data=voxceleb2cat_train -else - plda_data=voxceleb2cat_train_augx${plda_num_augs} -fi -plda_type=splda -lda_dim=200 -plda_y_dim=150 -plda_z_dim=200 - diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..67a4665e --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params layers 2-12 + +# hugging face model +hf_model_name=wav2vec2xlsr300m12l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..80ee785b --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh 
b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..c2b30f68 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus9l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..373535c2 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..530096cc --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face 
model +hf_model_name=wavlmlarge12l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..1b276bcd --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmlarge + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/local b/egs/voxceleb/v2/local index 740b697d..2ac14857 120000 --- a/egs/voxceleb/v2/local +++ b/egs/voxceleb/v2/local @@ -1 +1 @@ -../v1/local/ \ No newline at end of file +../v1.1/local \ No newline at end of file diff --git a/egs/voxceleb/v2/run_001_prepare_data.sh b/egs/voxceleb/v2/run_001_prepare_data.sh index 7bf15448..44385610 100755 --- a/egs/voxceleb/v2/run_001_prepare_data.sh +++ b/egs/voxceleb/v2/run_001_prepare_data.sh @@ -12,7 +12,7 @@ config_file=default_config.sh . parse_options.sh || exit 1; . datapath.sh - +. $config_file if [ $stage -le 1 ];then # Prepare the VoxCeleb2 dataset for training. 
@@ -26,3 +26,21 @@ if [ $stage -le 2 ];then # Use this for the newer version of voxceleb1: local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data fi + +if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then + local/prepare_voxsrc22_dev.py \ + --vox1-corpus-dir $voxceleb1_root \ + --voxsrc22-corpus-dir $voxsrc22_root \ + --output-dir data/voxsrc22_dev +fi + +# if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then +# local/prepare_voxsrc22_test.py \ +# --corpus-dir $voxsrc22_root \ +# --output-dir data/voxsrc22_test +# fi + +if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then + # # split vox2 into 2 parts, for cohort and qmf training + local/make_vox2_trials.py --data-dir data/voxceleb2cat_train +fi diff --git a/egs/voxceleb/v2/run_002_compute_evad.sh b/egs/voxceleb/v2/run_002_compute_evad.sh index eeae00ac..1248ad39 100755 --- a/egs/voxceleb/v2/run_002_compute_evad.sh +++ b/egs/voxceleb/v2/run_002_compute_evad.sh @@ -19,39 +19,40 @@ config_file=default_config.sh if [ $stage -le 1 ]; then - # Prepare to distribute data over multiple machines - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $vaddir/storage ]; then - dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage - if [ "$nodes" == "b0" ];then - utils/create_split_dir.pl \ - utils/create_split_dir.pl \ - /export/b{04,05,06,07}/$dir_name $vaddir/storage - elif [ "$nodes" == "b1" ];then - utils/create_split_dir.pl \ - /export/b{14,15,16,17}/$dir_name $vaddir/storage - elif [ "$nodes" == "c0" ];then - utils/create_split_dir.pl \ - /export/c{06,07,08,09}/$dir_name $vaddir/storage - elif [ "$nodes" == "fs01" ];then - utils/create_split_dir.pl \ - /export/fs01/$dir_name $vaddir/storage - else - echo "we don't distribute data between multiple machines" - fi + # Prepare to distribute data over multiple machines + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $vaddir/storage ]; then + dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage + if [ "$nodes" == "b0" ];then + utils/create_split_dir.pl \ + /export/b{04,05,06,07}/$dir_name $vaddir/storage + elif [ "$nodes" == "b1" ];then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$dir_name $vaddir/storage + elif [ "$nodes" == "c0" ];then + utils/create_split_dir.pl \ + /export/c{06,07,08,09}/$dir_name $vaddir/storage + elif [ "$nodes" == "fs01" ];then + utils/create_split_dir.pl \ + /export/fs01/$dir_name $vaddir/storage + else + echo "we don't distribute data between multiple machines" fi + fi fi -#Train datasets -if [ $stage -le 2 ];then - for name in voxceleb2cat_train voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 40 ? $num_spk:40)) - hyp_utils/feats/make_evad.sh --write-utt2num-frames true \ - --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ - data/${name} exp/make_vad/$name $vaddir - utils/fix_data_dir.sh data/${name} - done +if [ $stage -le 2 ];then + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 40 ? 
$num_spk:40)) + hyp_utils/feats/make_evad.sh \ + --write-utt2num-frames true \ + --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ + data/${name} exp/make_vad/$name $vaddir + utils/fix_data_dir.sh data/${name} + done fi - diff --git a/egs/voxceleb/v2/run_003_prepare_noises_rirs.sh b/egs/voxceleb/v2/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..a448af9a --- /dev/null +++ b/egs/voxceleb/v2/run_003_prepare_noises_rirs.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +# We prepare the noise files and RIR for online speech augmentation + +if [ $stage -le 1 ]; then + + # Prepare the MUSAN corpus, which consists of music, speech, and noise + # suitable for augmentation. + local/make_musan.sh $musan_root 16 data + + for name in musan_noise musan_music + do + steps_xvec/preprocess_audios_for_nnet_train.sh --nj 10 --cmd "$train_cmd" \ + --storage_name voxceleb-v1.1-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_proc_audio exp/${name}_proc_audio + utils/fix_data_dir.sh data/${name}_proc_audio + done + +fi + +if [ $stage -le 2 ]; then + + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + steps_xvec/make_babble_noise_for_nnet_train.sh --cmd "$train_cmd" \ + --storage_name voxceleb-v1.1-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_babble exp/${name}_babble + # utils/fix_data_dir.sh data/${name}_babble + done +fi + +if [ $stage -le 3 ]; then + if [ ! -d "RIRS_NOISES" ]; then + if [ -d ../../sre19-cmn2/v1/RIRS_NOISES ];then + ln -s ../../sre19-cmn2/v1/RIRS_NOISES + else + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + fi + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/smallroom 16 data/rirs_smallroom + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/mediumroom 16 data/rirs_mediumroom + local/make_rirs_data.sh RIRS_NOISES/real_rirs_isotropic_noises 16 data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + #pack all rirs in h5 files + steps_xvec/pack_rirs_for_nnet_train.sh data/$rirs data/$rirs exp/rirs/$rirs + done + +fi + + diff --git a/egs/voxceleb/v2/run_011_train_xvector.sh b/egs/voxceleb/v2/run_011_train_xvector.sh index 0eddb1a6..bc3b5420 100755 --- a/egs/voxceleb/v2/run_011_train_xvector.sh +++ b/egs/voxceleb/v2/run_011_train_xvector.sh @@ -47,11 +47,11 @@ if [ $stage -le 1 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_wav2vec2xvector.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_s1_dir $args \ @@ -71,11 +71,11 @@ if [ $stage -le 2 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ 
finetune_wav2vec2xvector.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --in-model-file $nnet_s1 \ @@ -96,11 +96,11 @@ if [ $stage -le 3 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2xvector.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --in-model-file $nnet_s2 \ diff --git a/egs/voxceleb/v2/run_030_extract_xvectors.sh b/egs/voxceleb/v2/run_030_extract_xvectors.sh index 67122f85..16f29841 100755 --- a/egs/voxceleb/v2/run_030_extract_xvectors.sh +++ b/egs/voxceleb/v2/run_030_extract_xvectors.sh @@ -7,10 +7,10 @@ . ./path.sh set -e -stage=2 +stage=1 +nnet_stage=3 config_file=default_config.sh use_gpu=false -nnet_stage=3 hf_chunk_length=120 #seconds xvec_chunk_length=120 #seconds . parse_options.sh || exit 1; @@ -36,20 +36,20 @@ fi xvector_dir=exp/xvectors/$nnet_name -if [ $stage -le 1 ]; then +if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qmf" == "true" || "$do_pca" == "true") ]]; then # Extract xvectors for training LDA/PLDA for name in voxceleb2cat_train do if [ $plda_num_augs -eq 0 ]; then steps_xvec/extract_wav2vec2xvectors.sh \ --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ - --random-utt-length true --min-utt-length 4 --max-utt-length 140 \ + --random-utt-length true --min-utt-length 2 --max-utt-length 30 \ $nnet data/${name} \ $xvector_dir/${name} else steps_xvec/extract_wav2vec2xvectors.sh \ --cmd "$xvec_cmd" --nj 300 ${xvec_args} \ - --random-utt-length true --min-utt-length 4 --max-utt-length 140 \ + --random-utt-length true --min-utt-length 2 --max-utt-length 30 \ --aug-config $plda_aug_config --num-augs $plda_num_augs \ $nnet data/${name} \ $xvector_dir/${name}_augx${plda_num_augs} \ @@ -60,7 +60,10 @@ fi if [ $stage -le 2 ]; then # Extracts x-vectors for evaluation - for name in voxceleb1_test + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb1_test $extra_data do num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') nj=$(($num_spk < 100 ? $num_spk:100)) @@ -71,4 +74,3 @@ if [ $stage -le 2 ]; then done fi -exit diff --git a/egs/voxceleb/v2/run_040_eval_be.sh b/egs/voxceleb/v2/run_040_eval_be.sh index ac561344..0982abeb 100755 --- a/egs/voxceleb/v2/run_040_eval_be.sh +++ b/egs/voxceleb/v2/run_040_eval_be.sh @@ -7,10 +7,10 @@ . 
./path.sh set -e -# By default we evaluate the nnet after finetuning stage 3 and only with cosine scoring -stage=3 -config_file=default_config.sh +stage=1 nnet_stage=3 +config_file=default_config.sh + . parse_options.sh || exit 1; . $config_file @@ -25,6 +25,15 @@ elif [ $nnet_stage -eq 2 ];then elif [ $nnet_stage -eq 3 ];then nnet=$nnet_s3 nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name fi plda_label=${plda_type}y${plda_y_dim}_v1 @@ -35,8 +44,12 @@ be_dir=exp/be/$nnet_name/$be_name score_dir=exp/scores/$nnet_name/${be_name} score_plda_dir=$score_dir/plda score_cosine_dir=exp/scores/$nnet_name/cosine +score_cosine_snorm_dir=exp/scores/$nnet_name/cosine_snorm +score_cosine_qmf_dir=exp/scores/$nnet_name/cosine_qmf -if [ $stage -le 1 ]; then + +if [ "$do_plda" == "true" ];then + if [ $stage -le 1 ]; then echo "Train PLDA on Voxceleb2" steps_be/train_be_v1.sh \ --cmd "$train_cmd" \ @@ -45,14 +58,12 @@ if [ $stage -le 1 ]; then --y_dim $plda_y_dim --z_dim $plda_z_dim \ $xvector_dir/$plda_data/xvector.scp \ data/$plda_data \ - $be_dir & - - wait -fi - - -if [ $stage -le 2 ];then - + $be_dir + + fi + + + if [ $stage -le 2 ];then echo "Eval Voxceleb 1 with LDA+CentWhiten+LNorm+PLDA" steps_be/eval_be_v1.sh \ --cmd "$train_cmd" --plda_type $plda_type \ @@ -62,7 +73,7 @@ if [ $stage -le 2 ];then $be_dir/lda_lnorm.h5 \ $be_dir/plda.h5 \ $score_plda_dir/voxceleb1_scores - + $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir @@ -72,32 +83,267 @@ if [ $stage -le 2 ];then cat $f echo "" done - + fi fi -score_plda_dir=$score_cosine_dir + if [ $stage -le 3 ];then - echo "Eval Voxceleb 1 with Cosine scoring" + echo "Eval Voxceleb 1 with Cosine scoring" + steps_be/eval_be_cos.sh \ + --cmd "$train_cmd" \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $score_cosine_dir/voxceleb1_scores + + $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_dir + + for f in $(ls $score_cosine_dir/*_results); + do + echo $f + cat $f + echo "" + done + +fi + +if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then + + echo "Eval voxsrc2 with Cosine scoring" steps_be/eval_be_cos.sh --cmd "$train_cmd" \ - data/voxceleb1_test/trials \ - data/voxceleb1_test/utt2model \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $score_plda_dir/voxceleb1_scores + data/voxsrc22_dev/trials \ + data/voxsrc22_dev/utt2model \ + $xvector_dir/voxsrc22_dev/xvector.scp \ + $score_cosine_dir/voxsrc22_dev_scores & - $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir + # steps_be/eval_be_cos.sh --cmd "$train_cmd" \ + # data/voxsrc22_test/trials \ + # data/voxsrc22_test/utt2model \ + # $xvector_dir/voxsrc22_test/xvector.scp \ + # $score_cosine_dir/voxsrc22_test_scores - for f in $(ls $score_plda_dir/*_results); + wait + $train_cmd --mem 10G --num-threads 1 $score_cosine_dir/log/score_voxsrc22_dev.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_dir + + for f in $(ls $score_cosine_dir/voxsrc22_dev_results); + do + echo $f + cat $f + echo "" + done + +fi + + +if [ "$do_snorm" == "true" ];then + if [ $stage -le 5 ];then 
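+    # Cosine scoring followed by adaptive S-Norm. As a rough sketch of the
+    # usual AS-Norm formulation (the exact details live in eval_be_cos_snorm.sh),
+    # each raw score s(e,t) is renormalized as
+    #   s'(e,t) = 0.5*( (s - mu_e)/sigma_e + (s - mu_t)/sigma_t )
+    # where mu_e/sigma_e and mu_t/sigma_t are the mean/std of the top
+    # --coh-nbest cohort scores of the enrollment and test x-vectors against
+    # the voxceleb2cat_train cohort passed below.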
+ echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm" + steps_be/eval_be_cos_snorm.sh \ + --cmd "$train_cmd --mem 22G" --coh-nbest 1000 \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_snorm_dir/voxceleb1_scores + + $train_cmd --mem 10G --num-threads 6 $score_cosine_snorm_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_snorm_dir + + for f in $(ls $score_cosine_snorm_dir/*_results); + do + echo $f + cat $f + echo "" + done + fi + + if [ $stage -le 6 ];then + echo "Eval voxsrc2 with Cosine scoring" + steps_be/eval_be_cos_snorm.sh \ + --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + data/voxsrc22_dev/trials \ + data/voxsrc22_dev/utt2model \ + $xvector_dir/voxsrc22_dev/xvector.scp \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_snorm_dir/voxsrc22_dev_scores & + + # steps_be/eval_be_cos_snorm.sh --cmd "$train_cmd" \ + # data/voxsrc22_test/trials \ + # data/voxsrc22_test/utt2model \ + # $xvector_dir/voxsrc22_test/xvector.scp \ + # data/voxceleb2cat_train/utt2spk \ + # $xvector_dir/voxceleb2cat_train/xvector.scp \ + # $score_cosine_snorm_dir/voxsrc22_test_scores + + wait + $train_cmd --mem 10G --num-threads 1 $score_cosine_snorm_dir/log/score_voxsrc22_dev.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_snorm_dir + + for f in $(ls $score_cosine_snorm_dir/voxsrc22_dev_results); + do + echo $f + cat $f + echo "" + done + fi +fi + + +if [ "$do_qmf" == "true" ];then + if [ $stage -le 7 ];then + awk '{ print $1, $2*100}' \ + $xvector_dir/voxceleb2cat_train/utt2speech_dur \ + > $xvector_dir/voxceleb2cat_train/utt2num_frames + + echo "Train QMF in Vox2" + steps_be/train_be_cos_qmf.sh \ + --cmd "$train_cmd" --coh-nbest 1000 \ + data/voxceleb2cat_train/trials \ + data/voxceleb2cat_train/utt2model \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $xvector_dir/voxceleb2cat_train/utt2num_frames \ + data/voxceleb2cat_train/snorm_utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_qmf_dir/voxceleb2_qmf_scores + + fi + + if [ $stage -le 8 ];then + awk '{ print $1, $2*100}' \ + $xvector_dir/voxceleb1_test/utt2speech_dur \ + > $xvector_dir/voxceleb1_test/utt2num_frames + + echo "Eval Voxceleb 1 with Cosine scoring" + steps_be/eval_be_cos_qmf.sh \ + --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $xvector_dir/voxceleb1_test/utt2num_frames \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_qmf_dir/qmf.h5 \ + $score_cosine_qmf_dir/voxceleb1_scores + + $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir + $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1_snorm.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir _snorm + $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1_qmf.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir _qmf + + for f in $(ls $score_cosine_qmf_dir/voxceleb1{,_snorm,_qmf}_[oeh]_clean_results); + do + echo $f + cat $f + echo "" + done + + fi + + if [ $stage -le 9 ];then + awk '{ print $1, $2*100}' \ + 
$xvector_dir/voxsrc22_dev/utt2speech_dur \ + > $xvector_dir/voxsrc22_dev/utt2num_frames + + echo "Eval voxsrc2 with Cosine scoring" + steps_be/eval_be_cos_qmf.sh \ + --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + data/voxsrc22_dev/trials \ + data/voxsrc22_dev/utt2model \ + $xvector_dir/voxsrc22_dev/xvector.scp \ + $xvector_dir/voxsrc22_dev/utt2num_frames \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_qmf_dir/qmf.h5 \ + $score_cosine_qmf_dir/voxsrc22_dev_scores & + + # awk '{ print $1, $2*100}' \ + # $xvector_dir/voxsrc22_test/utt2speech_dur \ + # > $xvector_dir/voxsrc22_test/utt2num_frames + # steps_be/eval_be_cos_qmf.sh --cmd "$train_cmd" \ + # data/voxsrc22_test/trials \ + # data/voxsrc22_test/utt2model \ + # $xvector_dir/voxsrc22_test/xvector.scp \ + # $xvector_dir/voxsrc22_test/utt2num_frames \ + # data/voxceleb2cat_train/utt2spk \ + # $xvector_dir/voxceleb2cat_train/xvector.scp \ + # $score_cosine_qmf_dir/qmf.h5 \ + # $score_cosine_qmf_dir/voxsrc22_test_scores + + wait + $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir + $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev_snorm.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir _snorm + $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev_qmf.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir _qmf + + for f in $(ls $score_cosine_qmf_dir/voxsrc22_dev{,_snorm,_qmf}_results); do echo $f cat $f echo "" done + fi + +fi + +if [ "$do_pca" != "true" ];then + exit 0 +fi + + +be_name=pca_r${pca_var_r} + +xvector_dir=exp/xvectors/$nnet_name +be_dir=exp/be/$nnet_name/$be_name +score_dir=exp/scores/$nnet_name/${be_name} +score_cosine_dir=exp/scores/$nnet_name/$be_name/cosine +score_cosine_snorm_dir=exp/scores/$nnet_name/$be_name/cosine_snorm +score_cosine_qmf_dir=exp/scores/$nnet_name/$be_name/cosine_qmf + +be_dir=exp/be/$nnet_name/ +score_be_dir=$score_dir/pca_r${pca_var_r} + +if [ $stage -le 10 ]; then + echo "Train projection on Voxceleb2" + $train_cmd $be_dir/log/train_be.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_proj_v1.py \ + --v-file scp:$xvector_dir/$plda_data/xvector.scp \ + --train-list data/$plda_data/utt2spk \ + --output-path $be_dir \ + --pca.pca-var-r $pca_var_r fi -exit +if [ $stage -le 11 ];then + + echo "Eval Voxceleb 1 with Cosine scoring" + steps_be/eval_be_cos.sh \ + --cmd "$train_cmd" \ + --preproc-file $be_dir/preproc.h5 \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $score_cosine_dir/voxceleb1_scores + $train_cmd --mem 10G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_dir + + for f in $(ls $score_cosine_dir/*_results); + do + echo $f + cat $f + echo "" + done + +fi diff --git a/hyp_utils/conda_env.sh b/hyp_utils/conda_env.sh index ceee4e93..90ffa369 100755 --- a/hyp_utils/conda_env.sh +++ b/hyp_utils/conda_env.sh @@ -52,22 +52,24 @@ fi # echo "LRU_CACHE_CAPACITY=$LRU_CACHE_CAPACITY" conda activate $conda_env -command="python" +command="" if [ $num_gpus -gt 0 ];then - # set CUDA_VISIBLE_DEVICES - if [ ! 
-z "$SGE_HGR_gpu" ]; then - echo "SGE_HGR_gpu=$SGE_HGR_gpu" - export CUDA_VISIBLE_DEVICES=$(echo $SGE_HGR_gpu | sed 's@ @,@g') - else - # seach location of free-gpu program in the PATH or hyp_utils directory - free_gpu=$(which free-gpu) - if [ -z "$free_gpu" ];then - free_gpu=$(which hyp_utils/free-gpu) - fi - - if [ ! -z "$free_gpu" ];then - # if free-gpu found set env var, otherwise we assume that you can use any gpu - export CUDA_VISIBLE_DEVICES=$($free_gpu -n $num_gpus) + if [ -z "$CUDA_VISIBLE_DEVICES" ];then + # set CUDA_VISIBLE_DEVICES + if [ ! -z "$SGE_HGR_gpu" ]; then + echo "SGE_HGR_gpu=$SGE_HGR_gpu" + export CUDA_VISIBLE_DEVICES=$(echo $SGE_HGR_gpu | sed 's@ @,@g') + else + # seach location of free-gpu program in the PATH or hyp_utils directory + free_gpu=$(which free-gpu) + if [ -z "$free_gpu" ];then + free_gpu=$(which hyp_utils/free-gpu) + fi + + if [ ! -z "$free_gpu" ];then + # if free-gpu found set env var, otherwise we assume that you can use any gpu + export CUDA_VISIBLE_DEVICES=$($free_gpu -n $num_gpus) + fi fi fi echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" @@ -79,7 +81,7 @@ if [ $num_gpus -gt 0 ];then #export TORCH_DISTRIBUTED_DEBUG=DETAIL #variable to find unused parameters if [ $num_gpus -gt 1 ];then - [[ $(type -P "$torchrun") ]] && command="torchrun" \ + [[ $(type -P "torchrun") ]] && command="torchrun" \ || command="python -m torch.distributed.run" command="$command --nproc_per_node=$num_gpus --standalone --nnodes=1" fi diff --git a/hyp_utils/create_audios_split_links.sh b/hyp_utils/create_audios_split_links.sh new file mode 100755 index 00000000..7125a2c4 --- /dev/null +++ b/hyp_utils/create_audios_split_links.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright +# 2023 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# Creates links to distrubute data into multiple nodes in clsp grid + +if [ $# -ne 3 ]; then + echo "Usage: $0 " + echo "$0 exp/xvector_audios/voxceleb data/voxceleb/recordings.csv flac" +fi +echo "$0 $@" # Print the command line for logging +output_dir=$1 +rec_file=$2 +file_format=$3 + +if [[ $(hostname -f) != *.clsp.jhu.edu ]]; then + exit 0 +fi + +for f in $(awk -F "," '$1!="id" { print $1}' $rec_file); do + # the next command does nothing unless $output_dir/storage/ exists, see + # utils/create_data_link.pl for more info. + hyp_utils/create_data_link.pl $output_dir/$f.$file_format +done + + + diff --git a/hyp_utils/create_data_link.pl b/hyp_utils/create_data_link.pl new file mode 100755 index 00000000..850f29f0 --- /dev/null +++ b/hyp_utils/create_data_link.pl @@ -0,0 +1,132 @@ +#!/usr/bin/env perl + +# Copyright 2013 Guoguo Chen +# 2014 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0. +# +# This script distributes data onto different file systems by making symbolic +# links. It is supposed to use together with utils/create_split_dir.pl, which +# creates a "storage" directory that links to different file systems. +# +# If a sub-directory egs/storage does not exist, it does nothing. If it exists, +# then it selects pseudo-randomly a number from those available in egs/storage/* +# creates a link such as +# +# egs/egs.3.4.ark -> storage/4/egs.3.4.ark +# +use strict; +use warnings; +use File::Basename; +use File::Spec; +use Getopt::Long; + +sub GetGCD { + my ($a, $b) = @_; + while ($a != $b) { + if ($a > $b) { + $a = $a - $b; + } else { + $b = $b - $a; + } + } + return $a; +} + +my $Usage = < storage/4/egs.3.4.ark + +Usage: utils/create_data_link.pl [ ... 
] + e.g.: utils/create_data_link.pl foo/bar/egs.3.4.ark foo/bar/egs.3.5.ark + (note: the dirname, e.g. foo/bar/, must be the same in all cases). + +See also utils/remove_data_links.sh +EOU + +GetOptions(); + +if (@ARGV == 0) { + die $Usage; +} + +my $example_fullpath = $ARGV[0]; + +# Check if the storage has been created. If so, do nothing. +my $dirname = dirname($example_fullpath); +if (! -d "$dirname/storage") { + exit(0); +} + +# Storage exists, create symbolic links in the next few steps. + +# First, get a list of the available storage directories, and check if they are +# properly created. +opendir(my $dh, "$dirname/storage/") || die "$0: Fail to open $dirname/storage/\n"; +my @storage_dirs = grep(/^[0-9]*$/, readdir($dh)); +closedir($dh); +my $num_storage = scalar(@storage_dirs); +for (my $x = 1; $x <= $num_storage; $x++) { + (-d "$dirname/storage/$x") || die "$0: $dirname/storage/$x does not exist\n"; +} + +# Second, get the coprime list. +my @coprimes; +for (my $n = 1; $n <= $num_storage; $n++) { + if (GetGCD($n, $num_storage) == 1) { + push(@coprimes, $n); + } +} + +my $ret = 0; + +foreach my $fullpath (@ARGV) { + if ($dirname ne dirname($fullpath)) { + die "Mismatch in directory names of arguments: $example_fullpath versus $fullpath"; + } + + # Finally, work out the directory index where we should put the data to. + my $basename = basename($fullpath); + my $filename_numbers = $basename; + $filename_numbers =~ s/[^0-9]+/ /g; + my @filename_numbers = split(" ", $filename_numbers); + my $total = 0; + my $index = 0; + foreach my $x (@filename_numbers) { + if ($index >= scalar(@coprimes)) { + $index = 0; + } + $total += $x * $coprimes[$index]; + $index++; + } + my $dir_index = $total % $num_storage + 1; + + # Make the symbolic link. + if (-e $fullpath) { + unlink($fullpath); + } + if (symlink("storage/$dir_index/$basename", $fullpath) != 1) { # failure + $ret = 1; # will exit with error status. + } +} + +exit($ret); + +## testing: +# rm -rf foo bar +# mkdir -p bar/{1,2,3,4} +# mkdir -p foo/storage +# for x in 1 2 3 4; do ln -s ../../bar/$x foo/storage/$x; done +# utils/create_data_link.pl utils/create_data_link.pl foo/1.3.ark foo/2.3.ark +# ls -l foo +# total 0 +# lrwxrwxrwx 1 dpovey fax 17 Sep 2 17:41 1.3.ark -> storage/3/1.3.ark +# lrwxrwxrwx 1 dpovey fax 17 Sep 2 17:41 2.3.ark -> storage/4/2.3.ark +# drwxr-xr-x 2 dpovey fax 38 Sep 2 17:40 storage diff --git a/hyp_utils/create_data_split_dirs.sh b/hyp_utils/create_data_split_dirs.sh new file mode 100755 index 00000000..b8aad6c8 --- /dev/null +++ b/hyp_utils/create_data_split_dirs.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Copyright +# 2023 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# Creates links to distrubute data into multiple nodes in clsp grid + +storage_name=$(date +'%m_%d_%H_%M') + + + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 " + echo "$0 exp/vad_dir $USER/hyp-data/voxceleb/v1/vad/storage b0" +fi + +output_dir=$1 +storage_dir=$2 +nodes=$3 + +link_dir=$output_dir/storage + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $linkdir ]; then + echo "$0 $@" # Print the command line for logging + echo "Prepare to distribute data over multiple $nodes nodes" + dir_name=$storage_dir/$storage_name/storage + if [ "$nodes" == "b0" ];then + hyp_utils/create_split_dir.pl \ + /export/b{04,05,06,07}/$dir_name $link_dir + elif [ "$nodes" == "b1" ];then + hyp_utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$dir_name $link_dir + elif [ "$nodes" == "c0" ];then + hyp_utils/create_split_dir.pl \ + /export/c{06,07,08,09}/$dir_name $link_dir + elif [ "$nodes" == "fs01" ];then + hyp_utils/create_split_dir.pl \ + /export/fs01/$dir_name $link_dir + else + echo "we don't distribute data between multiple machines" + fi +fi + + + diff --git a/hyp_utils/create_data_split_links.sh b/hyp_utils/create_data_split_links.sh new file mode 100755 index 00000000..c7cfa3eb --- /dev/null +++ b/hyp_utils/create_data_split_links.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright +# 2023 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# Creates links to distrubute data into multiple nodes in clsp grid + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo "$0 exp/vad_dir/vad.JOB.ark 40" +fi +echo "$0 $@" # Print the command line for logging +output_file_pattern=$1 +nj=$2 + +for n in $(seq $nj); do + # the next command does nothing unless output_dir/storage exists, see + # utils/create_data_link.pl for more info. + output_file=$(echo $output_file_pattern | sed 's@\.JOB\.[^\.]*$@.'$n'.@') + hyp_utils/create_data_link.pl $output_file +done + diff --git a/hyp_utils/create_split_dir.pl b/hyp_utils/create_split_dir.pl new file mode 100755 index 00000000..ab952357 --- /dev/null +++ b/hyp_utils/create_split_dir.pl @@ -0,0 +1,92 @@ +#!/usr/bin/env perl + +# Copyright 2013 Guoguo Chen +# Apache 2.0. +# +# This script creates storage directories on different file systems, and creates +# symbolic links to those directories. For example, a command +# +# utils/create_split_dir.pl /export/gpu-0{3,4,5}/egs/storage egs/storage +# +# will mkdir -p all of those directories, and will create links +# +# egs/storage/1 -> /export/gpu-03/egs/storage +# egs/storage/2 -> /export/gpu-03/egs/storage +# ... +# +use strict; +use warnings; +use File::Spec; +use Getopt::Long; + +my $Usage = < + e.g.: utils/create_split_dir.pl /export/gpu-0{3,4,5}/egs/storage egs/storage + +Allowed options: + --suffix : Common suffix to (string, default = "") + +See also create_data_link.pl, which is intended to work with the resulting +directory structure, and remove_data_links.sh +EOU + +my $suffix=""; +GetOptions('suffix=s' => \$suffix); + +if (@ARGV < 2) { + die $Usage; +} + +my $ans = 1; + +my $dir = pop(@ARGV); +system("mkdir -p $dir 2>/dev/null"); + +my @all_actual_storage = (); +foreach my $file (@ARGV) { + push @all_actual_storage, File::Spec->rel2abs($file . "/" . $suffix); +} + +my $index = 1; +foreach my $actual_storage (@all_actual_storage) { + my $pseudo_storage = "$dir/$index"; + + # If the symbolic link already exists, delete it. + if (-l $pseudo_storage) { + print STDERR "$0: link $pseudo_storage already exists, not overwriting.\n"; + $index++; + next; + } + + # Create the destination directory and make the link. + system("mkdir -p $actual_storage 2>/dev/null"); + if ($? != 0) { + print STDERR "$0: error creating directory $actual_storage\n"; + exit(1); + } + { # create a README file for easier deletion. 
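+    # The README lists the link directory and every physical storage
+    # directory backing it, so the striped data can still be located and
+    # cleaned up later even if the symlinks are removed
+    # (see remove_data_links.sh).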
+ open(R, ">$actual_storage/README.txt"); + my $storage_dir = File::Spec->rel2abs($dir); + print R "# This directory is linked from $storage_dir, as part of Kaldi striped data\n"; + print R "# The full list of directories where this data resides is:\n"; + foreach my $d (@all_actual_storage) { + print R "$d\n"; + } + close(R); + } + my $ret = symlink($actual_storage, $pseudo_storage); + + # Process the returned values + $ans = $ans && $ret; + if (! $ret) { + print STDERR "Error linking $actual_storage to $pseudo_storage\n"; + } + + $index++; +} + +exit($ans == 1 ? 0 : 1); diff --git a/hyp_utils/feats/make_evad.sh b/hyp_utils/feats/make_evad.sh index 373fc4a6..16ddbf74 100755 --- a/hyp_utils/feats/make_evad.sh +++ b/hyp_utils/feats/make_evad.sh @@ -87,7 +87,7 @@ fi $cmd JOB=1:$nj $logdir/make_vad_${name}.JOB.log \ hyp_utils/conda_env.sh \ compute_energy_vad.py --cfg $vad_config $opt_args \ - --input $scp --output ark,scp:$vaddir/vad_$name.JOB.ark,$vaddir/vad_$name.JOB.scp \ + --recordings-file $scp --output-spec ark,scp:$vaddir/vad_$name.JOB.ark,$vaddir/vad_$name.JOB.scp \ --part-idx JOB --num-parts $nj || exit 1 # concatenate the .scp files together. diff --git a/hyp_utils/steps_transducer/decode_wav2vec2rnn_film_transducer.sh b/hyp_utils/steps_transducer/decode_wav2vec2rnn_film_transducer.sh new file mode 100755 index 00000000..17378c29 --- /dev/null +++ b/hyp_utils/steps_transducer/decode_wav2vec2rnn_film_transducer.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +nj=30 +cmd="run.pl" +set -e +use_gpu=false +#write_utt2num_frames=true # If true writes utt2num_frames. +stage=0 +extra_args="" +infer_cfg=conf/infer.yaml +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 5 ] && [ $# != 6 ]; then + echo "Usage: $0 [options] []" + echo " e.g.: $0 --feat-config conf/fbank_mvn.yml --aug-config conf/noise_aug.yml exp/xvector_nnet/model.pt data/train exp/xvectors_train [data/train_aug]" + echo "main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --infer-cfg # decoding configuration" + echo " --use-gpu # If true, use GPU." + echo " --nj # Number of jobs" + echo " --stage # To control partial reruns" + + +fi + +nnet_file=$1 +data_dir=$2 +output_dir=$3 +bpe_model=$4 +lang_file=$5 + +for f in $data_dir/wav.scp ; do + [ ! 
-f $f ] && echo "No such file $f" && exit 1; +done + +log_dir=$output_dir/log +mkdir -p $log_dir + +num_gpus=0 +if [ "$use_gpu" == "true" ];then + cmd="$cmd --gpu 1" + num_gpus=1 + extra_args="${extra_args} --use-gpu" +fi + +# if [ "$write_utt2num_frames" == "true" ];then +# write_num_frames_opt="--write-num-frames $output_dir/utt2num_frames.JOB" +# fi + +if [ $stage -le 0 ];then + $cmd JOB=1:$nj $output_dir/log/decode_transducer.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + decode_wav2vec2rnn_film_transducer.py \ + --infer-args $infer_cfg \ + --part-idx JOB --num-parts $nj \ + --input $data_dir/wav.scp \ + --lang_input $data_dir/utt2lang \ + --model-path $nnet_file \ + --bpe-model $bpe_model \ + --lang-file $lang_file \ + --output $output_dir/transducer.JOB.text $extra_args +fi + +if [ $stage -le 1 ];then + echo "compute wer" + cat $output_dir/transducer.*.text > $output_dir/transducer.text + + python steps_transducer/word2char.py $output_dir/transducer.text $output_dir/transducer_char.text + python steps_transducer/word2char.py $data_dir/text $data_dir/text_char + + python steps_transducer/word2bpe.py $output_dir/transducer.text $output_dir/transducer_bpe.text $bpe_model + python steps_transducer/word2bpe.py $data_dir/text $data_dir/text_bpe $bpe_model + + # compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text > $output_dir/wer + compute-wer --text --mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text > $output_dir/wer_char + # compute-wer --text --mode=present ark:$data_dir/text_bpe ark:$output_dir/transducer_bpe.text > $output_dir/wer_bpe + + echo $(basename "$output_dir") >> $output_dir/../overall_wer_char.txt + cat $output_dir/wer_char >> $output_dir/../overall_wer_char.txt + echo " " >> $output_dir/../overall_wer_char + +fi diff --git a/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer.sh b/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer.sh new file mode 100755 index 00000000..18d6ad4c --- /dev/null +++ b/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +nj=30 +cmd="run.pl" +set -e +use_gpu=false +#write_utt2num_frames=true # If true writes utt2num_frames. +stage=0 +extra_args="" +infer_cfg=conf/infer.yaml +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 4 ] && [ $# != 5 ]; then + echo "Usage: $0 [options] []" + echo " e.g.: $0 --feat-config conf/fbank_mvn.yml --aug-config conf/noise_aug.yml exp/xvector_nnet/model.pt data/train exp/xvectors_train [data/train_aug]" + echo "main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --infer-cfg # decoding configuration" + echo " --use-gpu # If true, use GPU." + echo " --nj # Number of jobs" + echo " --stage # To control partial reruns" + + +fi + +nnet_file=$1 +data_dir=$2 +output_dir=$3 +bpe_model=$4 + +for f in $data_dir/wav.scp ; do + [ ! 
-f $f ] && echo "No such file $f" && exit 1; +done + +log_dir=$output_dir/log +mkdir -p $log_dir + +num_gpus=0 +if [ "$use_gpu" == "true" ];then + cmd="$cmd --gpu 1" + num_gpus=1 + extra_args="${extra_args} --use-gpu" +fi + +# if [ "$write_utt2num_frames" == "true" ];then +# write_num_frames_opt="--write-num-frames $output_dir/utt2num_frames.JOB" +# fi + +if [ $stage -le 0 ];then + $cmd JOB=1:$nj $output_dir/log/decode_transducer.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + decode_wav2vec2rnn_transducer.py \ + --infer-args $infer_cfg \ + --part-idx JOB --num-parts $nj \ + --input $data_dir/wav.scp \ + --model-path $nnet_file \ + --bpe-model $bpe_model \ + --output $output_dir/transducer.JOB.text $extra_args +fi + +if [ $stage -le 1 ];then + echo "compute wer" + cat $output_dir/transducer.*.text > $output_dir/transducer.text + + python steps_transducer/word2char.py $output_dir/transducer.text $output_dir/transducer_char.text + python steps_transducer/word2char.py $data_dir/text $data_dir/text_char + + # python steps_transducer/word2bpe.py $output_dir/transducer.text $output_dir/transducer_bpe.text $bpe_model + # python steps_transducer/word2bpe.py $data_dir/text $data_dir/text_bpe $bpe_model + + # compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text > $output_dir/wer + compute-wer --text --mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text > $output_dir/wer_char + # compute-wer --text --mode=present ark:$data_dir/text_bpe ark:$output_dir/transducer_bpe.text > $output_dir/wer_bpe + + echo $(basename "$output_dir") >> $output_dir/../overall_wer_char.txt + cat $output_dir/wer_char >> $output_dir/../overall_wer_char.txt + echo " " >> $output_dir/../overall_wer_char +fi diff --git a/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer_lid.sh b/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer_lid.sh new file mode 100755 index 00000000..0363eaf1 --- /dev/null +++ b/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer_lid.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +nj=30 +cmd="run.pl" +set -e +use_gpu=false +#write_utt2num_frames=true # If true writes utt2num_frames. +stage=0 +extra_args="" +infer_cfg=conf/infer.yaml +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 5 ] && [ $# != 6 ]; then + echo "Usage: $0 [options] []" + echo " e.g.: $0 --feat-config conf/fbank_mvn.yml --aug-config conf/noise_aug.yml exp/xvector_nnet/model.pt data/train exp/xvectors_train [data/train_aug]" + echo "main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --infer-cfg # decoding configuration" + echo " --use-gpu # If true, use GPU." + echo " --nj # Number of jobs" + echo " --stage # To control partial reruns" + + +fi + +nnet_file=$1 +data_dir=$2 +output_dir=$3 +bpe_model=$4 +lang_file=$5 + +for f in $data_dir/wav.scp ; do + [ ! 
-f $f ] && echo "No such file $f" && exit 1; +done + +log_dir=$output_dir/log +mkdir -p $log_dir + +num_gpus=0 +if [ "$use_gpu" == "true" ];then + cmd="$cmd --gpu 1" + num_gpus=1 + extra_args="${extra_args} --use-gpu" +fi + +# if [ "$write_utt2num_frames" == "true" ];then +# write_num_frames_opt="--write-num-frames $output_dir/utt2num_frames.JOB" +# fi + +if [ $stage -le 0 ];then + $cmd JOB=1:$nj $output_dir/log/decode_transducer.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + decode_wav2vec2rnn_transducer_languageid.py \ + --infer-args $infer_cfg \ + --part-idx JOB --num-parts $nj \ + --input $data_dir/wav.scp \ + --model-path $nnet_file \ + --bpe-model $bpe_model \ + --lang-file $lang_file \ + --output_transducer $output_dir/transducer.JOB.text \ + --output_languageid $output_dir/languageid.JOB $extra_args +fi + +if [ $stage -le 1 ];then + echo "compute wer" + cat $output_dir/transducer.*.text > $output_dir/transducer.text + cat $output_dir/languageid.* > $output_dir/langs + python steps_lid/cal_lid_score.py $output_dir/langs > $output_dir/lid_score + + python steps_transducer/word2char.py $output_dir/transducer.text $output_dir/transducer_char.text + python steps_transducer/word2char.py $data_dir/text $data_dir/text_char + + # python steps_transducer/word2bpe.py $output_dir/transducer.text $output_dir/transducer_bpe.text $bpe_model + # python steps_transducer/word2bpe.py $data_dir/text $data_dir/text_bpe $bpe_model + + compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text > $output_dir/wer + compute-wer --text --mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text > $output_dir/wer_char + # compute-wer --text --mode=present ark:$data_dir/text_bpe ark:$output_dir/transducer_bpe.text > $output_dir/wer_bpe + echo $(basename "$output_dir") >> $output_dir/../overall_lid_score.txt + cat $output_dir/lid_score >> $output_dir/../overall_lid_score.txt + echo " " >> $output_dir/../overall_lid_score.txt + echo $(basename "$output_dir") >> $output_dir/../overall_wer_char.txt + cat $output_dir/wer_char >> $output_dir/../overall_wer_char.txt + echo " " >> $output_dir/../overall_wer_char.txt + + +fi diff --git a/hyp_utils/steps_transducer/decode_wav2vec2transducer.sh b/hyp_utils/steps_transducer/decode_wav2vec2transducer.sh new file mode 100755 index 00000000..4a23d9fa --- /dev/null +++ b/hyp_utils/steps_transducer/decode_wav2vec2transducer.sh @@ -0,0 +1,87 @@ +#!/bin/bash +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +nj=30 +cmd="run.pl" + +use_gpu=false +write_utt2num_frames=true # If true writes utt2num_frames. +stage=0 +num_augs=0 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 3 ] && [ $# != 4 ]; then + echo "Usage: $0 [options] []" + echo " e.g.: $0 --feat-config conf/fbank_mvn.yml --aug-config conf/noise_aug.yml exp/xvector_nnet/model.pt data/train exp/xvectors_train [data/train_aug]" + echo "main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --use-gpu # If true, use GPU." + echo " --nj # Number of jobs" + echo " --stage # To control partial reruns" + echo " --use-bin-vad # If true, uses binary VAD from vad.scp" + echo " --write-utt2num-frames # If true, write utt2num_frames file." 
+ echo " --chunk-length # If provided, applies encoder with specified chunk-length and " + echo " # concatenates the chunks outputs before pooling" + echo " --feat-config # feature/mvn config file" + echo " --aug-config # augmentation config file" + echo " --random-utt-length # If true, extracts a random chunk from the utterance between " + echo " # min_utt_length and max_utt_length" + echo " --min-utt-length # " + echo " --max-utt-length # " + + +fi + +nnet_file=$1 +data_dir=$2 +output_dir=$3 +bpe_model=$4 + +for f in $data_dir/wav.scp ; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +log_dir=$output_dir/log +mkdir -p $log_dir + +num_gpus=0 +args="" +if [ "$use_gpu" == "true" ];then + cmd="$cmd --gpu 1" + num_gpus=1 + args="--use-gpu" +fi + +if [ "$write_utt2num_frames" == "true" ];then + write_num_frames_opt="--write-num-frames $output_dir/utt2num_frames.JOB" +fi + +if [ $stage -le 0 ];then + set +e + $cmd JOB=1:$nj $output_dir/log/decode_transducer.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + decode_wav2transducer.py \ + --part-idx JOB --num-parts $nj \ + --input $data_dir/wav.scp \ + --model-path $nnet_file \ + --bpe-model $bpe_model \ + --output $output_dir/transducer.JOB.text + set -e +fi + +if [ $stage -le 1 ];then + echo "compute wer, cer" + + cat $output_dir/transducer.*.text > $output_dir/transducer.text + + python steps_transducer/word2char.py $output_dir/transducer.text $output_dir/transducer_char.text + python steps_transducer/word2char.py $data_dir/text $data_dir/text_char + + compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text + compute-wer --text --mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text + +fi diff --git a/hyp_utils/steps_transducer/preprocess_audios_for_nnet_train.sh b/hyp_utils/steps_transducer/preprocess_audios_for_nnet_train.sh new file mode 100755 index 00000000..ef54ceed --- /dev/null +++ b/hyp_utils/steps_transducer/preprocess_audios_for_nnet_train.sh @@ -0,0 +1,112 @@ +#!/bin/bash +# +# 2020 Johns Hopkins University (Jesus Villalba) +# Apache 2.0. +set -e +nj=40 +cmd="run.pl" +stage=0 +file_format=flac +nodes=b1 +storage_name=$(date +'%m_%d_%H_%M') +proc_opts="--remove-dc-offset" +use_bin_vad=false +osr=16000 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features" + echo "Options: " + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --file-format # Output format supported by soundfile (flac,ogg,wav,...)" + echo " --proc-opts # Extra arguments for proc-audio-files.py" + echo " --use-bin-vad # Removes silence using binary vad" + exit 1; +fi + +data_in=$1 +data_out=$2 +dir=$3 + +name=`basename $data_in` + +for f in $data_in/wav.scp ; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +# Set various variables. +mkdir -p $dir/log +mkdir -p $data_out +output_dir=$(utils/make_absolute.sh $dir) + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $output_dir/storage ]; then + dir_name=$USER/hyp-data/$storage_name/xvector_audio/storage + if [ "$nodes" == "b0" ];then + utils/create_split_dir.pl \ + utils/create_split_dir.pl \ + /export/b{04,05,06,07}/$dir_name $output_dir/storage + elif [ "$nodes" == "b1" ];then + utils/create_split_dir.pl \ + /export/b{14,15,16,17,18}/$dir_name $output_dir/storage + elif [ "$nodes" == "s01" ];then + utils/create_split_dir.pl \ + /export/s01/$dir_name $output_dir/storage + elif [ "$nodes" == "c0" ];then + utils/create_split_dir.pl \ + /export/c{01,06,07,08,09}/$dir_name $output_dir/storage + elif [ "$nodes" == "fs05" ];then + utils/create_split_dir.pl \ + utils/create_split_dir.pl \ + /export/fs05/$dir_name $output_dir/storage + fi + + for f in $(awk '{ print $1}' $data_in/wav.scp); do + # the next command does nothing unless $output_dir/storage/ exists, see + # utils/create_data_link.pl for more info. + utils/create_data_link.pl $output_dir/$f.$file_format + done +fi + + +for f in reco2dur segments spk2utt text utt2dur utt2gender utt2lang utt2spk wav.scp spk2gender +do + if [ -f $data_in/$f ];then + cp $data_in/$f $data_out/$f + fi +done + +args="" +if [ "$use_bin_vad" == "true" ];then + args="${args} --vad scp:$data_in/vad.scp" +else + f=vad.scp + if [ -f $data_in/$f ];then + cp $data_in/$f $data_out/$f + fi +fi + +$cmd JOB=1:$nj $dir/log/preproc_audios_${name}.JOB.log \ + hyp_utils/conda_env.sh \ + preprocess_audio_files.py ${args} --output-audio-format $file_format $args $proc_opts \ + --write-time-durs $output_dir/utt2dur.${name}.JOB \ + --part-idx JOB --num-parts $nj \ + --output-sampling-rate $osr \ + --input $data_in/wav.scp \ + --output-path $output_dir \ + --output-script $output_dir/wav.${name}.JOB.scp + +for n in $(seq $nj); do + cat $output_dir/wav.${name}.$n.scp || exit 1; +done > ${data_out}/wav.scp || exit 1 + +for n in $(seq $nj); do + cat $output_dir/utt2dur.${name}.$n || exit 1; +done > ${data_out}/utt2dur || exit 1 + +echo "$0: Succeeded processing audios for $name" diff --git a/hyp_utils/steps_transducer/word2char.py b/hyp_utils/steps_transducer/word2char.py new file mode 100644 index 00000000..062832c4 --- /dev/null +++ b/hyp_utils/steps_transducer/word2char.py @@ -0,0 +1,24 @@ +import os +import sys + +word_file = sys.argv[1] # "data/it_test_proc_audio/text" +char_file = sys.argv[2] # "data/it_test_proc_audio/text_char" + + +# word_file = "exp/transducer/wav2vec2xlsr300m_transducer_v3.3_it.s1/it_test_proc_audio/transducer.text" +# char_file = "exp/transducer/wav2vec2xlsr300m_transducer_v3.3_it.s1/it_test_proc_audio/transducer_char.text" + +output_chars = [] +with open(word_file, "r") as fi: + for line in fi.readlines(): + words = line.split(" ") + chars = [words[0]] + for wrd in words[1:]: + for c in wrd: + chars.append(c) + output_chars.append(chars) + +with open(char_file, "w") as fo: + for chars in output_chars: + fo.writelines(" ".join(chars)) + diff --git a/hyp_utils/xvectors/extract_wav2vec2xvectors.sh b/hyp_utils/xvectors/extract_wav2vec2xvectors.sh index 6c6f0fdf..d8ae2e55 100755 --- a/hyp_utils/xvectors/extract_wav2vec2xvectors.sh +++ b/hyp_utils/xvectors/extract_wav2vec2xvectors.sh @@ -87,9 +87,9 @@ if [ $stage -le 0 ];then extract_wav2vec2xvectors.py \ ${args} $write_speech_dur_opt \ --part-idx JOB --num-parts $nj \ - --input $data_dir/wav.scp \ + --recordings-file $data_dir/wav.scp \ --model-path $nnet_file --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length \ - --output ark,scp:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.scp + 
--output-spec ark,scp:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.scp set -e fi @@ -109,9 +109,9 @@ if [ $stage -le 1 ];then extract_wav2vec2xvectors.py \ ${args} $write_speech_dur_opt \ --part-idx $i --num-parts $nj \ - --input $data_dir/wav.scp \ + --recordings-file $data_dir/wav.scp \ --model-path $nnet_file --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length \ - --output ark,scp:$output_dir/xvector.$i.ark,$output_dir/xvector.$i.scp & + --output-spec ark,scp:$output_dir/xvector.$i.ark,$output_dir/xvector.$i.scp & fi done wait diff --git a/hyp_utils/xvectors/extract_xvectors_from_wav.sh b/hyp_utils/xvectors/extract_xvectors_from_wav.sh index 0b5227cc..b763a25c 100755 --- a/hyp_utils/xvectors/extract_xvectors_from_wav.sh +++ b/hyp_utils/xvectors/extract_xvectors_from_wav.sh @@ -87,10 +87,10 @@ if [ $stage -le 0 ];then hyp_utils/conda_env.sh --num-gpus $num_gpus \ extract_xvectors_from_wav.py \ --feats $feat_config ${args} $write_num_frames_opt \ - --part-idx JOB --num-parts $nj \ - --input $data_dir/wav.scp \ + --part-idx JOB --num-parts $nj \ + --recordings-file $data_dir/wav.scp \ --model-path $nnet_file --chunk-length $chunk_length \ - --output ark,scp:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.scp + --output-spec ark,scp:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.scp set -e fi @@ -110,9 +110,9 @@ if [ $stage -le 1 ];then extract_xvectors_from_wav.py \ --feats $feat_config ${args} $write_num_frames_opt \ --part-idx $i --num-parts $nj \ - --input $data_dir/wav.scp \ + --recordings-file $data_dir/wav.scp \ --model-path $nnet_file --chunk-length $chunk_length \ - --output ark,scp:$output_dir/xvector.$i.ark,$output_dir/xvector.$i.scp & + --output-spec ark,scp:$output_dir/xvector.$i.ark,$output_dir/xvector.$i.scp & fi done wait diff --git a/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh b/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh index 27c77454..4530ad3b 100755 --- a/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh +++ b/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh @@ -8,9 +8,7 @@ nj=1 cmd="run.pl" stage=0 file_format=flac -nodes=b1 storage_name=$(date +'%m_%d_%H_%M') -#proc_opts="--remove-dc-offset" min_spks=3 max_spks=10 num_reuses=5 @@ -23,10 +21,8 @@ if [ $# != 3 ]; then echo "Usage: $0 " echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features" echo "Options: " - #echo " --nj # number of parallel jobs" echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 
echo " --file-format # Output file_format supported by soundfile (flac,ogg,wav,...)" - #echo " --proc-opts # Extra arguments for proc-audio-files.py" echo " --min-spks # max number of spks per utterance" echo " --max-spks # max number of spks per utterance" echo " --num-reuses # number of times a signal is reused to create babble" @@ -51,22 +47,12 @@ output_dir=$(utils/make_absolute.sh $dir) args="" $cmd $dir/log/make_babble_noise_${name}.log \ hyp_utils/conda_env.sh \ - make_babble_noise_audio_files.py ${args} \ - --output-audio-format $file_format $args $proc_opts \ + make_babble_noise_audio_files.py \ + --audio-format $file_format $args $proc_opts \ --min-spks $min_spks --max-spks $max_spks --num-reuses $num_reuses \ --write-time-durs $data_out/utt2dur \ - --input $data_in/wav.scp \ + --recordings-file $data_in/wav.scp \ --output-path $output_dir \ - --output-script $data_out/wav.scp - - - -# for n in $(seq $nj); do -# cat $output_dir/wav.${name}.$n.scp || exit 1; -# done > ${data_out}/wav.scp || exit 1 - -# for n in $(seq $nj); do -# cat $output_dir/utt2dur.${name}.$n || exit 1; -# done > ${data_out}/utt2dur || exit 1 + --output-recordings-file $data_out/wav.scp echo "$0: Succeeded making babble noise for $name" diff --git a/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh b/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh index c6634135..437cd208 100755 --- a/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh +++ b/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh @@ -66,13 +66,4 @@ $cmd $dir/log/pack_rirs_${name}.log \ pack_wav_rirs.py ${args} --input $data_in/wav.scp \ --output ${file_format},scp:$output_dir/rirs_${name}.${file_format},$data_out/rirs.scp || exit 1; - -# for n in $(seq $nj); do -# cat $output_dir/wav.${name}.$n.scp || exit 1; -# done > ${data_out}/wav.scp || exit 1 - -# for n in $(seq $nj); do -# cat $output_dir/utt2dur.${name}.$n || exit 1; -# done > ${data_out}/utt2dur || exit 1 - echo "$0: Succeeded packing RIRs for $name" diff --git a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh index 8321169f..948841fa 100755 --- a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh +++ b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh @@ -11,6 +11,7 @@ nodes=b1 storage_name=$(date +'%m_%d_%H_%M') proc_opts="--remove-dc-offset" use_bin_vad=false +osr=0 echo "$0 $@" # Print the command line for logging @@ -89,15 +90,17 @@ else cp $data_in/$f $data_out/$f fi fi - +echo $cmd $cmd JOB=1:$nj $dir/log/preproc_audios_${name}.JOB.log \ hyp_utils/conda_env.sh \ - preprocess_audio_files.py ${args} --output-audio-format $file_format $args $proc_opts \ + preprocess_audio_files.py ${args} --audio-format $file_format $args $proc_opts \ --write-time-durs $output_dir/utt2dur.${name}.JOB \ --part-idx JOB --num-parts $nj \ - --input $data_in/wav.scp \ + # --input $data_in/wav.scp \ + --recordings-file $data_in/wav.scp \ --output-path $output_dir \ - --output-script $output_dir/wav.${name}.JOB.scp + --output-recordings-file $output_dir/wav.${name}.JOB.scp + #--output-script $output_dir/wav.${name}.JOB.scp for n in $(seq $nj); do cat $output_dir/wav.${name}.$n.scp || exit 1; diff --git a/hyperion/bin/__init__.py b/hyperion/bin/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hyperion/bin/adv_finetune_xvector_from_wav.py b/hyperion/bin/adv_finetune_xvector_from_wav.py index 7be882e0..ea3d3b80 100755 --- a/hyperion/bin/adv_finetune_xvector_from_wav.py +++ b/hyperion/bin/adv_finetune_xvector_from_wav.py @@ -11,11 
+11,15 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.adv_attacks import AttackFactory @@ -44,7 +48,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -139,7 +142,6 @@ def init_attack(feat_extractor, model, wav_scale, **kwargs): def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -232,8 +234,7 @@ def make_parser(xvec_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="""Fine-tune x-vector model from audio files with adversarial training""" @@ -267,6 +268,10 @@ def make_parser(xvec_class): train_xvec(gpu_id, args_sc) +if __name__ == "__main__": + main() + + # def init_data( # audio_path, # train_list, diff --git a/hyperion/bin/apply_mvn_select_frames.py b/hyperion/bin/apply_mvn_select_frames.py index a2456dc9..f8299edc 100755 --- a/hyperion/bin/apply_mvn_select_frames.py +++ b/hyperion/bin/apply_mvn_select_frames.py @@ -10,8 +10,12 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF @@ -28,7 +32,6 @@ def process_feats( output_spec, vad_spec, write_num_frames_spec, - scp_sep, path_prefix, vad_path_prefix, part_idx, @@ -37,7 +40,6 @@ def process_feats( compression_method, **kwargs ): - logging.info("initializing") mvn_args = MVN.filter_args(**kwargs) mvn = MVN(**mvn_args) @@ -54,21 +56,19 @@ def process_feats( output_spec, compress=compress, compression_method=compression_method, - scp_sep=scp_sep, ) as writer: - logging.info("opening input stream: %s" % (output_spec)) with DRF.create( input_spec, path_prefix=path_prefix, - scp_sep=scp_sep, part_idx=part_idx, num_parts=num_parts, ) as reader: if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) v_reader = RDRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep + vad_spec, + path_prefix=vad_path_prefix, ) while not reader.eof(): @@ -102,8 +102,7 @@ def process_feats( u2nf.save(write_num_frames_spec) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Apply CMVN and remove silence") parser.add_argument("--input", dest="input_spec", required=True) @@ -112,28 +111,22 @@ def process_feats( parser.add_argument( "--write-num-frames", dest="write_num_frames_spec", default=None ) - parser.add_argument( - "--scp-sep", dest="scp_sep", default=" ", help=("scp file field separator") - ) parser.add_argument( "--path-prefix", dest="path_prefix", default=None, help=("scp file_path prefix") ) parser.add_argument( "--vad-path-prefix", - dest="vad_path_prefix", default=None, help=("scp file_path prefix for vad"), ) parser.add_argument( "--part-idx", - dest="part_idx", type=int, default=1, help=("splits the list of files in num-parts and process part_idx"), ) parser.add_argument( "--num-parts", - dest="num_parts", type=int, default=1, 
help=("splits the list of files in num-parts and process part_idx"), @@ -141,14 +134,12 @@ def process_feats( parser.add_argument( "--compress", - dest="compress", default=False, action="store_true", help="Lossy compress the features", ) parser.add_argument( "--compression-method", - dest="compression_method", default="auto", choices=compression_methods, help=( @@ -171,3 +162,7 @@ def process_feats( logging.debug(args) process_feats(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/audio_to_duration.py b/hyperion/bin/audio_to_duration.py index 38e8dff2..8ef6b5c1 100755 --- a/hyperion/bin/audio_to_duration.py +++ b/hyperion/bin/audio_to_duration.py @@ -9,8 +9,12 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from hyperion.hyp_defs import config_logger from hyperion.io import SequentialAudioReader as AR @@ -18,7 +22,6 @@ def audio_to_duration(audio_file, output_file, **kwargs): - input_args = AR.filter_args(**kwargs) logging.info(f"input_args={input_args}") @@ -37,8 +40,7 @@ def audio_to_duration(audio_file, output_file, **kwargs): seg_set.save(output_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Writes audio file durations to table") parser.add_argument("--cfg", action=ActionConfigFile) @@ -60,3 +62,7 @@ def audio_to_duration(audio_file, output_file, **kwargs): logging.debug(args) audio_to_duration(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/compute_energy_vad.py b/hyperion/bin/compute_energy_vad.py index 15d74f3a..fe0b1d8e 100755 --- a/hyperion/bin/compute_energy_vad.py +++ b/hyperion/bin/compute_energy_vad.py @@ -9,8 +9,12 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF @@ -18,33 +22,53 @@ from hyperion.np.feats import EnergyVAD -def compute_vad(input_path, output_path, write_num_frames, **kwargs): - +def compute_vad(recordings_file, output_spec, write_num_frames, **kwargs): vad_args = EnergyVAD.filter_args(**kwargs) vad = EnergyVAD(**vad_args) input_args = AR.filter_args(**kwargs) - reader = AR(input_path, **input_args) + reader = AR(recordings_file, **input_args) + + metadata_columns = [ + "frame_shift", + "frame_length", + "num_frames", + "num_speech_frames", + "prob_speech", + ] - writer = DWF.create(output_path, scp_sep=" ") + writer = DWF.create(output_spec, metadata_columns=metadata_columns) if write_num_frames is not None: f_num_frames = open(write_num_frames, "w") for data in reader: key, x, fs = data - logging.info("Extracting VAD for %s" % (key)) + logging.info("Extracting VAD for %s", key) t1 = time.time() y = vad.compute(x) dt = (time.time() - t1) * 1000 rtf = vad.frame_shift * y.shape[0] / dt num_speech_frames = np.sum(y) prob_speech = num_speech_frames / y.shape[0] * 100 + logging.info( - "Extracted VAD for %s detected %d/%d (%f %%) speech frames, elapsed-time=%.2f ms. real-time-factor=%.2f" - % (key, num_speech_frames, y.shape[0], prob_speech, dt, rtf) + "Extracted VAD for %s detected %d/%d (%f %%) speech frames, elapsed-time=%.2f ms. 
real-time-factor=%.2f", + key, + num_speech_frames, + y.shape[0], + prob_speech, + dt, + rtf, ) - writer.write([key], [y]) + metadata = { + "frame_shift": vad.frame_shift, + "frame_length": vad.frame_length, + "num_frames": y.shape[0], + "num_speech_frames": num_speech_frames, + "prob_speech": prob_speech, + } + writer.write([key], [y], metadata) if write_num_frames is not None: f_num_frames.write("%s %d\n" % (key, y.shape[0])) @@ -54,14 +78,14 @@ def compute_vad(input_path, output_path, write_num_frames, **kwargs): f_num_frames.close() -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Compute Kaldi Energy VAD") parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_path", required=True) - parser.add_argument("--output", dest="output_path", required=True) + parser.add_argument("--recordings-file", required=True) + parser.add_argument("--output-spec", required=True) parser.add_argument("--write-num-frames", default=None) + parser.add_argument("--write-stats", default=None) AR.add_class_args(parser) EnergyVAD.add_class_args(parser) @@ -80,3 +104,7 @@ def compute_vad(input_path, output_path, write_num_frames, **kwargs): logging.debug(args) compute_vad(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/compute_mfcc_feats.py b/hyperion/bin/compute_mfcc_feats.py index a83f95d1..f42f260d 100755 --- a/hyperion/bin/compute_mfcc_feats.py +++ b/hyperion/bin/compute_mfcc_feats.py @@ -9,8 +9,12 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF @@ -23,7 +27,6 @@ def compute_mfcc_feats( input_path, output_path, compress, compression_method, write_num_frames, **kwargs ): - mfcc_args = MFCC.filter_args(**kwargs) mfcc = MFCC(**mfcc_args) @@ -36,7 +39,6 @@ def compute_mfcc_feats( writer = DWF.create( output_path, - scp_sep=" ", compress=compress, compression_method=compression_method, ) @@ -55,8 +57,11 @@ def compute_mfcc_feats( dt = (time.time() - t1) * 1000 rtf = dt / (mfcc.frame_shift * y.shape[0]) logging.info( - "Extracted MFCC for %s num-frames=%d elapsed-time=%.2f ms. real-time-factor=%.2f" - % (key, y.shape[0], dt, rtf) + "Extracted MFCC for %s num-frames=%d elapsed-time=%.2f ms. 
real-time-factor=%.2f", + key, + y.shape[0], + dt, + rtf, ) writer.write([key], [y]) @@ -69,8 +74,7 @@ def compute_mfcc_feats( f_num_frames.close() -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Compute MFCC features") parser.add_argument("--cfg", action=ActionConfigFile) @@ -110,3 +114,7 @@ def compute_mfcc_feats( logging.debug(args) compute_mfcc_feats(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/copy_feats.py b/hyperion/bin/copy_feats.py index 0385cc55..4ffc1a58 100755 --- a/hyperion/bin/copy_feats.py +++ b/hyperion/bin/copy_feats.py @@ -16,8 +16,8 @@ from hyperion.hyp_defs import config_logger from hyperion.io import CopyFeats as CF -if __name__ == "__main__": +def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, fromfile_prefix_chars="@", @@ -38,3 +38,7 @@ logging.debug(args) CF(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/decode_wav2transducer.py b/hyperion/bin/decode_wav2transducer.py index 81fa8803..bcf9e05c 100755 --- a/hyperion/bin/decode_wav2transducer.py +++ b/hyperion/bin/decode_wav2transducer.py @@ -13,18 +13,21 @@ import numpy as np import pandas as pd import sentencepiece as spm -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.np.augment import SpeechAugment from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.models.wav2transducer.beam_search import (beam_search, - greedy_search) +from hyperion.torch.models.wav2transducer.beam_search import beam_search, greedy_search from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info @@ -48,10 +51,11 @@ def load_model(model_path, device): def decode_one_batch( - model: nn.Module, - sp: spm.SentencePieceProcessor, - x: torch.Tensor, - decoding_method="beam_search") -> Dict[str, List[List[str]]]: + model: nn.Module, + sp: spm.SentencePieceProcessor, + x: torch.Tensor, + decoding_method="beam_search", +) -> Dict[str, List[List[str]]]: """Decode one batch and return the result in a dict. The dict has the following format: - key: It indicates the setting used for decoding. For example, @@ -77,7 +81,7 @@ def decode_one_batch( the returned dict. 
""" device = model.device - feature = x #batch["inputs"] + feature = x # batch["inputs"] assert x.shape[0] == 1 assert feature.ndim == 2 @@ -87,7 +91,8 @@ def decode_one_batch( feature_lens = torch.Tensor([x.shape[1]]).int() encoder_out, hid_feats, encoder_out_lens = model.forward_feats( - x=feature, x_lengths=feature_lens) + x=feature, x_lengths=feature_lens + ) hyps = [] batch_size = encoder_out.size(0) @@ -114,9 +119,9 @@ def decode_one_batch( return hyps[0] -def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, - use_gpu, **kwargs): - +def decode_transducer( + input_spec, output_spec, model_path, bpe_model, use_gpu, **kwargs +): device = init_device(use_gpu) model = load_model(model_path, device) @@ -129,10 +134,10 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, ar_args = AR.filter_args(**kwargs) logging.info("opening output: %s" % (output_spec)) - # with DWF.create(output_spec, scp_sep=scp_sep) as writer: with open(output_spec, "w") as writer: - logging.info("opening input stream: {} with args={}".format( - input_spec, ar_args)) + logging.info( + "opening input stream: {} with args={}".format(input_spec, ar_args) + ) with AR(input_spec, **ar_args) as reader: while not reader.eof(): t1 = time.time() @@ -147,65 +152,68 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, logging.info("processing utt %s" % (key0)) for aug_id in range(num_augs): t3 = time.time() - key, x = key0, x0 #augment(key0, x0, augmenter, aug_df, aug_id) + key, x = key0, x0 # augment(key0, x0, augmenter, aug_df, aug_id) t4 = time.time() with torch.no_grad(): x = torch.tensor( - x[None, :], - dtype=torch.get_default_dtype()).to(device) + x[None, :], dtype=torch.get_default_dtype() + ).to(device) t5 = time.time() tot_frames = x.shape[1] logging.info( - "utt %s detected %d/%d (%.2f %%) speech frames" % ( + "utt %s detected %d/%d (%.2f %%) speech frames" + % ( key, x.shape[1], tot_frames, x.shape[1] / tot_frames * 100, - )) + ) + ) t6 = time.time() if x.shape[1] == 0: - y = np.zeros((model.embed_dim, ), - dtype=float_cpu()) + y = np.zeros((model.embed_dim,), dtype=float_cpu()) else: y = decode_one_batch(model=model, sp=sp, x=x) t7 = time.time() - writer.write(key + ' ' + ' '.join(y) + "\n") + writer.write(key + " " + " ".join(y) + "\n") t8 = time.time() read_time = t2 - t1 tot_time = read_time + t8 - t3 logging.info( - ("utt %s total-time=%.3f read-time=%.3f " - "aug-time=%.3f feat-time=%.3f " - "vad-time=%.3f embed-time=%.3f write-time=%.3f " - "rt-factor=%.2f") % ( - key, - tot_time, - read_time, - t4 - t3, - t5 - t4, - t6 - t5, - t7 - t6, - t8 - t7, - x0.shape[0] / fs[0] / tot_time, - )) - - -if __name__ == "__main__": - + ( + "utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ) + % ( + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x0.shape[0] / fs[0] / tot_time, + ) + ) + + +def main(): parser = ArgumentParser( - description=("Extracts x-vectors from waveform computing " - "acoustic features on the fly")) + description=( + "Extracts x-vectors from waveform computing " "acoustic features on the fly" + ) + ) parser.add_argument("--cfg", action=ActionConfigFile) parser.add_argument("--input", dest="input_spec", required=True) - parser.add_argument("--scp-sep", - default=" ", - help=("scp file field separator")) AR.add_class_args(parser) @@ -216,16 +224,12 @@ def decode_transducer(input_spec, 
output_spec, scp_sep, model_path, bpe_model, parser.add_argument("--bpe-model", required=True) parser.add_argument("--output", dest="output_spec", required=True) - parser.add_argument("--use-gpu", - default=False, - action="store_true", - help="extract xvectors in gpu") - parser.add_argument("-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -233,3 +237,7 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, logging.debug(args) decode_transducer(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/decode_wav2vec2rnn_film_transducer.py b/hyperion/bin/decode_wav2vec2rnn_film_transducer.py new file mode 100755 index 00000000..17cb0c3f --- /dev/null +++ b/hyperion/bin/decode_wav2vec2rnn_film_transducer.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time +from typing import Dict, List, Tuple + +import numpy as np +import pandas as pd +import sentencepiece as spm +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.np.augment import SpeechAugment +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.models import HFWav2Vec2RNNFiLMTransducer +from hyperion.torch.models.wav2transducer.beam_search import (beam_search, + greedy_search) +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info +from hyperion.utils.class_info import ClassInfo +from hyperion.utils.segment_set import SegmentSet +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def load_model(model_path, device): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("transducer-film-model={}".format(model)) + model.to(device) + model.eval() + return model + + +def decode_transducer(input_spec, lang_input_spec, output_spec, scp_sep, model_path, bpe_model, lang_file, + infer_args, use_gpu, **kwargs): + + device = init_device(use_gpu) + model = load_model(model_path, device) + + # load language dict form langfile by row number + lang_info = ClassInfo.load(lang_file) + utt2lang = SegmentSet.load(lang_input_spec) + + + logging.info("bpe-model=%s", bpe_model) + sp = spm.SentencePieceProcessor() + sp.load(bpe_model) + + infer_args = HFWav2Vec2RNNFiLMTransducer.filter_infer_args(**infer_args) + logging.info(f"infer-args={infer_args}") + + ar_args = AR.filter_args(**kwargs) + logging.info("opening output: %s", output_spec) + with open(output_spec, "w") as writer: + logging.info(f"opening input stream: {input_spec} with args={ar_args}") + with AR(input_spec, **ar_args) as reader: + while not reader.eof(): + t1 = 
time.time() + key, x, fs = reader.read(1) + lang = utt2lang.loc[key, "class_id"] + lang_id = torch.tensor([lang_info.loc[lang, "class_idx"]]).to(torch.int64) + if len(key) == 0: + break + + x, key, fs = x[0], key[0], fs[0] + t2 = time.time() + logging.info("processing utt %s", key) + with torch.no_grad(): + x = torch.tensor( + x[None, :], dtype=torch.get_default_dtype()).to(device) + + tot_frames = x.shape[1] + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames", + key, + x.shape[1], + tot_frames, + x.shape[1] / tot_frames * 100, + ) + + if x.shape[1] == 0: + y = [""] + else: + #y = decode_one_batch(model=model, sp=sp, x=x) + x_lengths = torch.tensor((x.shape[1], ), + dtype=torch.long, + device=device) + + y = model.infer(x=x, x_lengths=x_lengths, languageid=lang_id, **infer_args) + + y = sp.decode(y[0]) + logging.info(f"utt: {key} hyps: {y}") + t3 = time.time() + writer.write(f"{key} {y}\n") + + t4 = time.time() + tot_time = t4 - t1 + infer_time = t3 - t2 + logging.info( + ("utt %s total-time=%.3f read-time=%.3f " + "infer-time=%.3f " + "write-time=%.3f " + "infer-rt-factor=%.2f tot-rt-factor=%.2f"), + key, + tot_time, + t2 - t1, + infer_time, + t4 - t3, + x.shape[1] / fs / infer_time, + x.shape[1] / fs / tot_time, + ) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description=("ASR decoding for RNN-T with Wav2vec features")) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--lang_input", dest="lang_input_spec", required=True) + parser.add_argument("--scp-sep", + default=" ", + help=("scp file field separator")) + + AR.add_class_args(parser) + parser.add_argument("--model-path", required=True) + parser.add_argument("--bpe-model", required=True) + parser.add_argument("--lang-file", required=True) + + HFWav2Vec2RNNFiLMTransducer.add_infer_args(parser, "infer-args") + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument("--use-gpu", + default=False, + action="store_true", + help="extract xvectors in gpu") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + decode_transducer(**namespace_to_dict(args)) diff --git a/hyperion/bin/decode_wav2vec2rnn_transducer.py b/hyperion/bin/decode_wav2vec2rnn_transducer.py index 8ef8d414..b1af102b 100755 --- a/hyperion/bin/decode_wav2vec2rnn_transducer.py +++ b/hyperion/bin/decode_wav2vec2rnn_transducer.py @@ -13,19 +13,23 @@ import numpy as np import pandas as pd import sentencepiece as spm -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.np.augment import SpeechAugment from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data.char_piece import CharPieceProcessor from hyperion.torch.models import HFWav2Vec2RNNTransducer -from hyperion.torch.models.wav2transducer.beam_search import (beam_search, - greedy_search) +from hyperion.torch.models.wav2transducer.beam_search import beam_search, greedy_search from hyperion.torch.narchs import AudioFeatsMVN as 
AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info @@ -49,10 +53,11 @@ def load_model(model_path, device): def decode_one_batch( - model: nn.Module, - sp: spm.SentencePieceProcessor, - x: torch.Tensor, - decoding_method="beam_search") -> Dict[str, List[List[str]]]: + model: nn.Module, + sp: spm.SentencePieceProcessor, + x: torch.Tensor, + decoding_method="beam_search", +) -> Dict[str, List[List[str]]]: """Decode one batch and return the result in a dict. The dict has the following format: - key: It indicates the setting used for decoding. For example, @@ -78,7 +83,7 @@ def decode_one_batch( the returned dict. """ device = model.device - feature = x #batch["inputs"] + feature = x # batch["inputs"] assert x.shape[0] == 1 assert feature.ndim == 2 @@ -88,7 +93,8 @@ def decode_one_batch( feature_lens = torch.Tensor([x.shape[1]]).int() encoder_out, hid_feats, encoder_out_lens = model.forward_feats( - x=feature, x_lengths=feature_lens) + x=feature, x_lengths=feature_lens + ) hyps = [] batch_size = encoder_out.size(0) @@ -115,15 +121,29 @@ def decode_one_batch( return hyps[0] -def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, - infer_args, use_gpu, **kwargs): - +def decode_transducer( + input_spec, + output_spec, + scp_sep, + model_path, + bpe_model, + infer_args, + use_gpu, + **kwargs, +): device = init_device(use_gpu) model = load_model(model_path, device) - logging.info("bpe-model=%s", bpe_model) - sp = spm.SentencePieceProcessor() - sp.load(bpe_model) + + + if bpe_model.endswith(".txt"): + logging.info("loading char piece file %s", bpe_model) + sp = CharPieceProcessor() + sp.load(open(bpe_model).read().split()) + else: + logging.info("bpe-model=%s", bpe_model) + sp = spm.SentencePieceProcessor() + sp.load(bpe_model) infer_args = HFWav2Vec2RNNTransducer.filter_infer_args(**infer_args) logging.info(f"infer-args={infer_args}") @@ -143,8 +163,9 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, t2 = time.time() logging.info("processing utt %s", key) with torch.no_grad(): - x = torch.tensor( - x[None, :], dtype=torch.get_default_dtype()).to(device) + x = torch.tensor(x[None, :], dtype=torch.get_default_dtype()).to( + device + ) tot_frames = x.shape[1] logging.info( @@ -158,10 +179,10 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, if x.shape[1] == 0: y = [""] else: - #y = decode_one_batch(model=model, sp=sp, x=x) - x_lengths = torch.tensor((x.shape[1], ), - dtype=torch.long, - device=device) + # y = decode_one_batch(model=model, sp=sp, x=x) + x_lengths = torch.tensor( + (x.shape[1],), dtype=torch.long, device=device + ) y = model.infer(x, x_lengths, **infer_args) y = sp.decode(y[0]) @@ -173,10 +194,12 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, tot_time = t4 - t1 infer_time = t3 - t2 logging.info( - ("utt %s total-time=%.3f read-time=%.3f " - "infer-time=%.3f " - "write-time=%.3f " - "infer-rt-factor=%.2f tot-rt-factor=%.2f"), + ( + "utt %s total-time=%.3f read-time=%.3f " + "infer-time=%.3f " + "write-time=%.3f " + "infer-rt-factor=%.2f tot-rt-factor=%.2f" + ), key, tot_time, t2 - t1, @@ -187,16 +210,14 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, ) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( - description=("ASR decoding for RNN-T with Wav2vec features")) + description=("ASR decoding for RNN-T with Wav2vec features") + ) parser.add_argument("--cfg", action=ActionConfigFile) 
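For reference, these command-line tools share one entry-point skeleton: argument definitions built with jsonargparse, `config_logger` driven by `--verbose`, and a `main()` wrapped by the `if __name__ == "__main__":` guard. A minimal sketch of that skeleton, with a hypothetical tool name and `do_work` body standing in for the real processing function:

```
#!/usr/bin/env python
# Sketch only: do_work() and the option names are illustrative placeholders;
# the parsing/logging flow mirrors the hyperion/bin scripts above.
import logging

from jsonargparse import ActionConfigFile, ArgumentParser, namespace_to_dict

from hyperion.hyp_defs import config_logger


def do_work(input_file, output_file, **kwargs):
    logging.info("processing %s -> %s", input_file, output_file)


def main():
    parser = ArgumentParser(description="Example hyperion tool")
    parser.add_argument("--cfg", action=ActionConfigFile)
    parser.add_argument("--input-file", required=True)
    parser.add_argument("--output-file", required=True)
    parser.add_argument(
        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
    )

    args = parser.parse_args()
    config_logger(args.verbose)
    del args.verbose
    logging.debug(args)
    do_work(**namespace_to_dict(args))


if __name__ == "__main__":
    main()
```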
parser.add_argument("--input", dest="input_spec", required=True) - parser.add_argument("--scp-sep", - default=" ", - help=("scp file field separator")) + parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) AR.add_class_args(parser) parser.add_argument("--model-path", required=True) @@ -204,16 +225,12 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, HFWav2Vec2RNNTransducer.add_infer_args(parser, "infer-args") parser.add_argument("--output", dest="output_spec", required=True) - parser.add_argument("--use-gpu", - default=False, - action="store_true", - help="extract xvectors in gpu") - parser.add_argument("-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -221,3 +238,7 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, logging.debug(args) decode_transducer(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_cosine_scoring_backend.py b/hyperion/bin/eval_cosine_scoring_backend.py new file mode 100755 index 00000000..835cae0b --- /dev/null +++ b/hyperion/bin/eval_cosine_scoring_backend.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" +import logging +import time +from pathlib import Path + +import numpy as np +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.score_norm import AdaptSNorm +from hyperion.np.transforms import TransformList +from hyperion.utils import EnrollmentMap, SegmentSet, TrialKey, TrialNdx, TrialScores +from hyperion.utils.math_funcs import cosine_scoring + + +def load_trial_data( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, +): + test_feats_reader = DRF.create(feats_file) + if enroll_feats_file is not None and enroll_feats_file != feats_file: + enroll_feats_reader = DRF.create(enroll_feats_file) + else: + enroll_feats_reader = test_feats_reader + + enroll_map = EnrollmentMap.load(enroll_map_file) + try: + ndx = TrialNdx.load(ndx_file) + except: + ndx = TrialKey.load(ndx_file).to_ndx() + + if num_enroll_parts > 1 or num_test_parts > 1: + ndx = ndx.split( + enroll_part_idx, num_enroll_parts, test_part_idx, num_test_parts + ) + + enroll_map = enroll_map.filter(items=ndx.model_set) + x_e = enroll_feats_reader.read(enroll_map["segmentid"], squeeze=True) + x_t = test_feats_reader.read(ndx.seg_set, squeeze=True) + return enroll_map, ndx, x_e, x_t + + +def load_cohort_data(segments_file, feats_file): + segments = SegmentSet.load(segments_file) + feats_reader = DRF.create(feats_file) + x = feats_reader.read(segments["id"], squeeze=True) + return segments, x + + +def eval_backend( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + preproc_file, + score_file, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + cohort_segments_file, + cohort_feats_file, + cohort_nbest, + avg_cohort_by, +): + 
logging.info("loading data") + enroll_map, ndx, x_e, x_t = load_trial_data( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + enroll_set, enroll_ids = np.unique(enroll_map["id"], return_inverse=True) + + t1 = time.time() + logging.info("computing score") + if preproc_file is not None: + preprocessor = TransformList.load(preproc_file) + x_e = preprocessor(x_e) + x_t = preprocessor(x_t) + + scores = cosine_scoring(x_e, x_t, ids1=enroll_ids) + dt = time.time() - t1 + num_trials = scores.shape[0] * scores.shape[1] + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.", + dt, + dt / num_trials * 1000, + ) + + if cohort_segments_file is not None: + t1 = time.time() + cohort_segments, x_coh = load_cohort_data( + cohort_segments_file, cohort_feats_file + ) + if preproc_file is not None: + x_coh = preprocessor(x_coh) + + if avg_cohort_by is not None: + cohort_class = cohort_segments[avg_cohort_by] + _, cohort_ids = np.unique(cohort_class, return_inverse=True) + else: + cohort_ids = None + + logging.info("computing enroll vs cohort") + scores_enr_coh = cosine_scoring(x_e, x_coh, ids2=cohort_ids) + logging.info("computing cohort vs test") + scores_coh_test = cosine_scoring(x_coh, x_t, ids1=cohort_ids) + snorm = AdaptSNorm(cohort_nbest) + scores = snorm(scores, scores_coh_test, scores_enr_coh) + dt = time.time() - t1 + logging.info( + "s-norm elapsed time: %.2f s. elapsed time per trial: %.2f ms.", + dt, + dt / num_trials * 1000, + ) + + if num_enroll_parts > 1 or num_test_parts > 1: + score_file = Path(score_file) + new_suffix = f".{enroll_part_idx}.{test_part_idx}{score_file.suffix}" + score_file = score_file.with_suffix(new_suffix) + + logging.info("saving scores to %s", score_file) + # sort scores rows to match the ndx model_set order + sort_idx = [np.nonzero(enroll_set == e)[0][0] for e in ndx.model_set] + scores = scores[sort_idx] + scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask) + scores.save(score_file) + + +def main(): + parser = ArgumentParser(description="Eval cosine-scoring with optional AS-Norm") + + parser.add_argument("--enroll-feats-file", default=None) + parser.add_argument("--feats-file", required=True) + parser.add_argument("--ndx-file", required=True) + parser.add_argument("--enroll-map-file", required=True) + parser.add_argument("--preproc-file", default=None) + parser.add_argument("--cohort-segments-file", default=None) + parser.add_argument("--cohort-feats-file", default=None) + parser.add_argument("--cohort-nbest", type=int, default=1000) + parser.add_argument( + "--avg-cohort-by", + default=None, + help="segments file column to average vectors from same class class", + ) + parser.add_argument("--score-file", required=True) + parser.add_argument( + "--enroll-part-idx", default=1, type=int, help="enroll part index" + ) + parser.add_argument( + "--num-enroll-parts", + default=1, + type=int, + help="""number of parts in which we divide the enroll + list to run evaluation in parallel""", + ) + parser.add_argument("--test-part-idx", default=1, type=int, help="test part index") + parser.add_argument( + "--num-test-parts", + default=1, + type=int, + help="""number of parts in which we divide the test list + to run evaluation in parallel""", + ) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + 
logging.debug(args) + + eval_backend(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py b/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py new file mode 100755 index 00000000..4fecf2f3 --- /dev/null +++ b/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py @@ -0,0 +1,617 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" +import logging +import time +from pathlib import Path + +import numpy as np +import pandas as pd +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.classifiers import BinaryLogisticRegression as LR +from hyperion.np.score_norm import AdaptSNorm +from hyperion.np.transforms import TransformList +from hyperion.utils import ( + EnrollmentMap, + InfoTable, + SegmentSet, + TrialKey, + TrialNdx, + TrialScores, +) +from hyperion.utils.math_funcs import average_vectors, cosine_scoring + + +def get_precomp_qm_names(quality_measures): + # snorm qm will be calculated later + return [q for q in quality_measures if q not in ["snorm-mu", "snorm-mu/s"]] + + +def normalize_duration(q, min_dur, max_dur, frame_rate): + q = q / frame_rate + q = np.log(np.clip(q / frame_rate, a_min=min_dur, a_max=max_dur)) + log_min_dur = np.log(min_dur) + log_max_dur = np.log(max_dur) + q = (q - log_min_dur) / (log_max_dur - log_min_dur) + return q + + +def load_trial_data( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_segments_file, + segments_file, + quality_measures, + min_dur, + max_dur, + frame_rate, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, +): + test_feats_reader = DRF.create(feats_file) + if enroll_feats_file is not None and enroll_feats_file != feats_file: + enroll_feats_reader = DRF.create(enroll_feats_file) + else: + enroll_feats_reader = test_feats_reader + + enroll_map = EnrollmentMap.load(enroll_map_file) + try: + ndx = TrialNdx.load(ndx_file) + except: + ndx = TrialKey.load(ndx_file).to_ndx() + + if num_enroll_parts > 1 or num_test_parts > 1: + ndx = ndx.split( + enroll_part_idx, num_enroll_parts, test_part_idx, num_test_parts + ) + + enroll_map = enroll_map.filter(items=ndx.model_set) + x_e = enroll_feats_reader.read(enroll_map["segmentid"], squeeze=True) + x_t = test_feats_reader.read(ndx.seg_set, squeeze=True) + + # quality measures may be in segments file or/and feature_set file + # so we combine both if both are given + if segments_file is not None: + test_segments = SegmentSet.load(segments_file) + if enroll_segments_file is not None and segments_file != enroll_segments_file: + enroll_segments = SegmentSet.load(enroll_segments_file) + else: + enroll_segments = test_segments + + test_feats_set = test_feats_reader.feature_set + enroll_feats_set = enroll_feats_reader.feature_set + if segments_file: + test_segments.add_columns(test_feats_set) + if enroll_feats_set != test_feats_set or enroll_segments != test_segments: + enroll_segments.add_columns(enroll_feats_set) + else: + test_segments = test_feats_set + enroll_segments = enroll_feats_set + + # now we retrive the quality measures + q_e = [] + q_t = [] + # snorm qm will be calculated later + retrieve_qm = get_precomp_qm_names(quality_measures) + q_e = enroll_segments.loc[enroll_map["segmentid"], 
retrieve_qm] + q_t = test_segments.loc[ndx.seg_set, retrieve_qm] + + # normalize durations + if "speech_duration" in retrieve_qm: + q_e["speech_duration"] = normalize_duration( + q_e["speech_duration"], min_dur, max_dur, 1 + ) + q_t["speech_duration"] = normalize_duration( + q_t["speech_duration"], min_dur, max_dur, 1 + ) + + if "num_speech_frames" in retrieve_qm: + q_e["num_speech_frames"] = normalize_duration( + q_e["num_speech_frames"], min_dur, max_dur, frame_rate + ) + q_t["num_speech_frames"] = normalize_duration( + q_t["num_speech_frames"], min_dur, max_dur, frame_rate + ) + + # q_e = np.asarray(q_e) + # q_t = np.asarray(q_t) + + return enroll_map, ndx, x_e, x_t, q_e, q_t + + +def load_cohort_data(segments_file, feats_file): + segments = SegmentSet.load(segments_file) + feats_reader = DRF.create(feats_file) + x = feats_reader.read(segments["id"], squeeze=True) + + # segments.add_columns(feats_reader.feature_set) + + # retrieve_qm = get_precomp_qm_names(quality_measures) + # q = np.asarray(segments[retrieve_qm]) + return segments, x # , q + + +def average_qm(q, model_set, ids): + q_avg = average_vectors(q.values, ids) + q_avg = pd.DataFrame(q, columns=q.columns) + q_avg["id"] = model_set + q_avg.set_index("id", drop=False, inplace=True) + return q_avg + + +def get_score_filepath( + score_file, + score_name, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, +): + score_file = Path(score_file) + new_suffix = "" + if score_name is not None: + new_suffix = f".{score_name}" + + if num_enroll_parts > 1 or num_test_parts > 1: + new_suffix = f"{new_suffix}.{enroll_part_idx}.{test_part_idx}" + + if new_suffix: + new_suffix = f"{new_suffix}{score_file.suffix}" + score_file = score_file.with_suffix(new_suffix) + + return score_file + + +def save_scores( + ndx, + scores, + score_file, + score_name, + q_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, +): + score_file = get_score_filepath( + score_file, + score_name, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + logging.info("saving scores with to %s", score_file) + scores = TrialScores( + ndx.model_set, ndx.seg_set, scores, ndx.trial_mask, q_measures=q_measures + ) + scores.save(score_file) + + +def save_empty_scores( + ndx, + score_file, + score_name, + q_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, +): + scores = np.zeros(ndx.trial_mask.shape, dtype="float32") + if q_measures is not None: + q_measures = {k: scores for k in q_measures} + + save_scores( + ndx, + scores, + score_file, + score_name, + q_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + +def segment_to_trial_qm(q_e, q_t): + q_trial = {} + for q_name in ["speech_duration", "num_speech_frames"]: + if q_name in q_e: + q_trial_name = f"max_{q_name}" + q_trial[q_trial_name] = np.maximum( + q_e[q_name].values[:, None], q_t[q_name].values[None, :] + ) + q_trial_name = f"min_{q_name}" + q_trial[q_trial_name] = np.minimum( + q_e[q_name].values[:, None], q_t[q_name].values[None, :] + ) + + return q_trial + + +def align_scores_to_ndx(enroll_set, ndx, scores, scores_norm, q_trial): + # sort scores rows to match the ndx model_set order + sort_idx = [np.nonzero(enroll_set == e)[0][0] for e in ndx.model_set] + scores = scores[sort_idx] + if scores_norm is not None: + scores_norm = scores_norm[sort_idx] + for qm in q_trial: + q_trial[qm] = q_trial[qm][sort_idx] + + return scores, scores_norm, q_trial + + +# def 
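Duration quality measures are clipped and mapped onto [0, 1] on a log scale before fusion. A minimal sketch of that mapping, assuming a single division by `frame_rate` is what is intended (`normalize_duration` above divides by `frame_rate` twice, which only matters when durations arrive as frame counts):

```
import numpy as np


def normalize_duration_sketch(q, min_dur=0.1, max_dur=30.0, frame_rate=100):
    # Convert frame counts to seconds once, clip to [min_dur, max_dur] seconds,
    # then map the log-duration linearly onto [0, 1].
    q = np.asarray(q, dtype=float) / frame_rate
    q = np.log(np.clip(q, a_min=min_dur, a_max=max_dur))
    log_min, log_max = np.log(min_dur), np.log(max_dur)
    return (q - log_min) / (log_max - log_min)
```

With these defaults, a 300-frame (3 s) segment maps to log(3/0.1)/log(30/0.1) ≈ 0.6.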
make_qm_table(ndx, scores, scores_norm, q_trial): +# if scores_norm is None: +# scores = scores[ndx.trial_mask] +# else: +# scores = scores_norm[ndx.trial_mask] + +# for qm in q_trial: +# q_trial[qm] = q_trial[qm][ndx.trial_mask] + +# I, J = np.nonzero(ndx.trial_mask) +# modelid = ndx.model_set[I] +# segmentid = ndx.seg_set[J] +# unique_id = [f"{a}-{b}" for a, b in zip(modelid, segmentid)] + +# q_dict = { +# "id": unique_id, +# "modelid": modelid, +# "segmentid": segmentid, +# "scores": scores, +# } +# q_dict.update(q_trial) +# df = pd.DataFrame(q_dict) +# return InfoTable(df) + + +def eval_backend( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_segments_file, + segments_file, + preproc_file, + qmf_file, + quality_measures, + min_dur, + max_dur, + frame_rate, + cohort_segments_file, + cohort_feats_file, + cohort_nbest, + avg_cohort_by, + score_file, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, +): + logging.info("loading data") + enroll_map, ndx, x_e, x_t, q_e, q_t = load_trial_data( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_segments_file, + segments_file, + quality_measures, + min_dur, + max_dur, + frame_rate, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + if not np.any(ndx.trial_mask): + # this part doesn't have any trials, save empty files + if qmf_file is not None: + quality_measures = None + save_empty_scores( + ndx, + score_file, + "snorm.qmf" if cohort_segments_file is not None else "qmf", + quality_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + save_empty_scores( + ndx, + score_file, + None, + quality_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + if cohort_segments_file is not None: + save_empty_scores( + ndx, + score_file, + "snorm", + quality_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + return + + enroll_set, enroll_ids = np.unique(enroll_map["id"], return_inverse=True) + q_e = average_qm(q_e, enroll_set, enroll_ids) + + t1 = time.time() + logging.info("computing score") + if preproc_file is not None: + preprocessor = TransformList.load(preproc_file) + x_e = preprocessor(x_e) + x_t = preprocessor(x_t) + + scores = cosine_scoring(x_e, x_t, ids1=enroll_ids) + dt = time.time() - t1 + num_trials = scores.shape[0] * scores.shape[1] + logging.info( + "scoring elapsed time: %.2f s. 
elapsed time per trial: %.2f ms.", + dt, + dt / num_trials * 1000, + ) + + q_trial = segment_to_trial_qm(q_e, q_t) + scores_norm = None + if cohort_segments_file is not None: + t1 = time.time() + cohort_segments, x_coh = load_cohort_data( + cohort_segments_file, cohort_feats_file + ) + if preproc_file is not None: + x_coh = preprocessor(x_coh) + + if avg_cohort_by is not None: + cohort_class = cohort_segments[avg_cohort_by] + _, cohort_ids = np.unique(cohort_class, return_inverse=True) + else: + cohort_ids = None + + logging.info("computing enroll vs cohort") + scores_enr_coh = cosine_scoring(x_e, x_coh, ids2=cohort_ids) + logging.info("computing cohort vs test") + scores_coh_test = cosine_scoring(x_coh, x_t, ids1=cohort_ids) + snorm = AdaptSNorm(cohort_nbest) + scores_norm, mu_z, s_z, mu_t, s_t = snorm( + scores, scores_coh_test, scores_enr_coh, return_stats=True + ) + if "snorm-mu" in quality_measures: + q_trial["max_snorm-mu"] = np.maximum(mu_z, mu_t) + q_trial["min_snorm-mu"] = np.minimum(mu_z, mu_t) + if "snorm-mu/s" in quality_measures: + mu_z = mu_z / s_z + mu_t = mu_t / s_t + q_trial["max_snorm-mu/s"] = np.maximum(mu_z, mu_t) + q_trial["min_snorm-mu/s"] = np.minimum(mu_z, mu_t) + + dt = time.time() - t1 + logging.info( + "s-norm elapsed time: %.2f s. elapsed time per trial: %.2f ms.", + dt, + dt / num_trials * 1000, + ) + + scores, scores_norm, q_trial = align_scores_to_ndx( + enroll_set, ndx, scores, scores_norm, q_trial + ) + if qmf_file is None: + save_scores( + ndx, + scores, + score_file, + None, + q_trial, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + if scores_norm is not None: + save_scores( + ndx, + scores_norm, + score_file, + "snorm", + q_trial, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + # qm_table = make_qm_table(ndx, scores, scores_norm, q_trial) + # qm_file = get_score_filepath( + # score_file, + # "qm", + # enroll_part_idx, + # num_enroll_parts, + # test_part_idx, + # num_test_parts, + # ) + # qm_table.save(qm_file) + return + + save_scores( + ndx, + scores, + score_file, + None, + None, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + if scores_norm is not None: + save_scores( + ndx, + scores_norm, + score_file, + "snorm", + None, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + logging.info("applying qmf") + if scores_norm is None: + score_name = "qmf" + scores_fus = [scores.ravel()] + else: + score_name = "snorm.qmf" + scores_fus = [scores_norm.ravel()] + + q_names = list(q_trial.keys()) + q_names.sort() + for q_name in q_names: + scores_fus.append(q_trial[q_name].ravel()) + + scores_fus = np.vstack(scores_fus).T + lr = LR.load(qmf_file) + scores_fus = lr.predict(scores_fus) + scores_fus = np.reshape(scores_fus, (ndx.num_models, ndx.num_tests)) + save_scores( + ndx, + scores_fus, + score_file, + score_name, + None, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + # score_file_nonorm = get_score_filepath( + # score_file, + # None, + # enroll_part_idx, + # num_enroll_parts, + # test_part_idx, + # num_test_parts, + # ) + # logging.info("saving scores to %s", score_file_nonorm) + # scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask) + # scores.save(score_file_nonorm) + + # if scores_norm is not None: + # score_file_snorm = get_score_filepath( + # score_file, + # "snorm", + # enroll_part_idx, + # num_enroll_parts, + # test_part_idx, + # num_test_parts, + # ) + # 
logging.info("saving scores with AS-Norm to %s", score_file_snorm) + # scores.scores = scores_norm + # scores.save(score_file_snorm) + + +def main(): + parser = ArgumentParser( + description="Eval cosine-scoring with optional AS-Norm and QMF" + ) + + parser.add_argument("--enroll-feats-file", default=None) + parser.add_argument("--feats-file", required=True) + parser.add_argument("--ndx-file", required=True) + parser.add_argument("--enroll-map-file", required=True) + parser.add_argument("--enroll-segments-file", default=None) + parser.add_argument("--segments-file", default=None) + parser.add_argument("--preproc-file", default=None) + parser.add_argument("--qmf-file", default=None) + parser.add_argument( + "--quality-measures", + default=["snorm-mu/s", "speech_duration"], + nargs="+", + choices=["snorm-mu/s", "snorm-mu", "speech_duration", "num_speech_frames"], + ) + parser.add_argument( + "--min-dur", default=0.1, type=float, help="lower bound to clip durations" + ) + parser.add_argument( + "--max-dur", default=30.0, type=float, help="upper bound to clip durations" + ) + parser.add_argument( + "--frame-rate", + default=100, + type=float, + help="frames/sec when durationa are expressed in frames", + ) + parser.add_argument("--cohort-segments-file", default=None) + parser.add_argument("--cohort-feats-file", default=None) + parser.add_argument("--cohort-nbest", type=int, default=1000) + parser.add_argument( + "--avg-cohort-by", + default=None, + help="segments file column to average vectors from same class class", + ) + parser.add_argument("--score-file", required=True) + parser.add_argument( + "--enroll-part-idx", default=1, type=int, help="enroll part index" + ) + parser.add_argument( + "--num-enroll-parts", + default=1, + type=int, + help="""number of parts in which we divide the enroll + list to run evaluation in parallel""", + ) + parser.add_argument("--test-part-idx", default=1, type=int, help="test part index") + parser.add_argument( + "--num-test-parts", + default=1, + type=int, + help="""number of parts in which we divide the test list + to run evaluation in parallel""", + ) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_backend(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_verification_metrics.py b/hyperion/bin/eval_verification_metrics.py new file mode 100755 index 00000000..98fd37e2 --- /dev/null +++ b/hyperion/bin/eval_verification_metrics.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from pathlib import Path + +import pandas as pd +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.np.metrics import VerificationEvaluator as VE + + +def eval_verification_metrics( + key_files, + score_files, + key_names, + score_names, + p_tar, + c_miss, + c_fa, + sparse, + output_file, +): + assert len(key_files) == len(key_names) + assert len(score_files) == len(score_names) + dfs = [] + for score_file, score_name in zip(score_files, score_names): + for key_file, key_name in zip(key_files, key_names): + logging.info("Evaluating %s - %s", score_name, key_name) + evaluator = VE( + key_file, + 
score_file, + p_tar, + c_miss, + c_fa, + key_name, + score_name, + sparse=sparse, + ) + df_ij = evaluator.compute_dcf_eer() + dfs.append(df_ij) + + df = pd.concat(dfs) + logging.info("saving results to %s", output_file) + output_file = Path(output_file) + output_file.parent.mkdir(exist_ok=True, parents=True) + sep = "\t" if output_file.suffix == ".tsv" else "," + df.to_csv(output_file, sep=sep, index=False, float_format="{:,.4f}".format) + + pd.options.display.float_format = "{:.4}".format + print(df.to_string(), flush=True) + + +def main(): + parser = ArgumentParser(description="Evaluate speaker verification metrics") + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--key-files", required=True, nargs="+") + parser.add_argument("--score-files", required=True, nargs="+") + parser.add_argument("--key-names", required=True, nargs="+") + parser.add_argument("--score-names", required=True, nargs="+") + parser.add_argument( + "--p-tar", + default=[0.05, 0.01, 0.005, 0.001], + nargs="+", + type=float, + help="target priors", + ) + parser.add_argument( + "--c-miss", default=None, nargs="+", type=float, help="cost of miss" + ) + parser.add_argument( + "--c-fa", default=None, nargs="+", type=float, help="cost of false alarm" + ) + parser.add_argument("--sparse", default=False, action=ActionYesNo) + parser.add_argument("--output-file", required=True) + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + ) + + args = parser.parse_args() + kwargs = namespace_to_dict(args) + config_logger(kwargs["verbose"]) + del kwargs["verbose"] + del kwargs["cfg"] + eval_verification_metrics(**kwargs) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py index bb01162f..1baad913 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py @@ -10,11 +10,15 @@ import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -45,11 +49,10 @@ def __init__( self.sigma = sigma def forward(self, s_t): - # print('sigma0=', self.sigma) if self.sigma > 0: s_t = s_t + self.sigma * torch.randn_like(s_t) - # print('sigma1=', self.sigma) - f_t = self.feat_extractor(s_t) + + f_t, _ = self.feat_extractor(s_t) if self.vad_t is not None: n_vad_frames = len(self.vad_t) n_feat_frames = f_t.shape[1] @@ -108,7 +111,6 @@ def load_calibrator(cal_file, threshold): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -144,7 +146,6 @@ def eval_cosine_scoring( num_seg_parts, **kwargs ): - device = init_device(use_gpu) feat_extractor = init_feats(**kwargs) xvector_model = load_model(model_path) @@ -188,7 +189,7 @@ def eval_cosine_scoring( attack = AttackFactory.create(model, **attack_args) if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, 
path_prefix=vad_path_prefix) scores = np.zeros((key.num_models, key.num_tests), dtype="float32") attack_stats = pd.DataFrame( @@ -316,20 +317,19 @@ def eval_cosine_scoring( ) s.save_txt(score_file) - logging.info("saving stats to %s" % (stats_file)) + logging.info("saving stats to %s", stats_file) attack_stats.to_csv(stats_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Eval cosine-scoring given enroll x-vector and test wave" ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--v-file", dest="v_file", required=True) - parser.add_argument("--key-file", dest="key_file", default=None) - parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", default=None) + parser.add_argument("--enroll-file", required=True) parser.add_argument("--test-wav-file", required=True) AR.add_class_args(parser) @@ -338,7 +338,6 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( "--vad-path-prefix", - dest="vad_path_prefix", default=None, help=("scp file_path prefix for vad"), ) @@ -419,3 +418,7 @@ def eval_cosine_scoring( logging.debug(args) eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py index c483ce39..3e4e9229 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py @@ -7,16 +7,21 @@ import os import sys import time + # [Added Sonal May21] from pathlib import Path import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -46,7 +51,7 @@ def __init__( sigma=0, smoothing_after_wavegan=None, wave_gan_defender=None, - wav_scale=2 ** 15 - 1, + wav_scale=2**15 - 1, ): super().__init__() self.feat_extractor = feat_extractor @@ -62,7 +67,6 @@ def __init__( self.apply_wavegan = False if wave_gan_defender is None else True def forward(self, s_t): - # Pre-proceessing defense, wavegan + smoothing [Added Sonal May21] s_t = s_t / self.wav_scale if self.smoothing_after_wavegan: @@ -79,7 +83,7 @@ def forward(self, s_t): s_t = self.wav_scale * s_t # End of pre-processing defense - f_t = self.feat_extractor(s_t) + f_t, _ = self.feat_extractor(s_t) if self.vad_t is not None: n_vad_frames = len(self.vad_t) n_feat_frames = f_t.shape[1] @@ -150,7 +154,6 @@ def load_calibrator(cal_file, threshold): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -189,7 +192,6 @@ def eval_cosine_scoring_wavegan( wave_gan_model_ckpt, **kwargs ): - device = init_device(use_gpu) feat_extractor = init_feats(**kwargs) @@ -243,7 +245,7 @@ def eval_cosine_scoring_wavegan( attack = AttackFactory.create(model, **attack_args) if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, 
path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((key.num_models, key.num_tests), dtype="float32") attack_stats = pd.DataFrame( @@ -284,13 +286,11 @@ def eval_cosine_scoring_wavegan( vad = torch.tensor(vad, dtype=torch.bool).to(device) model.vad_t = vad logging.info( - "utt %s detected %d/%d (%.2f %%) speech frames" - % ( - key.seg_set[j], - speech_frames, - tot_frames, - speech_frames / tot_frames * 100, - ) + "utt %s detected %d/%d (%.2f %%) speech frames", + key.seg_set[j], + speech_frames, + tot_frames, + speech_frames / tot_frames * 100, ) t2 = time.time() @@ -377,16 +377,15 @@ def eval_cosine_scoring_wavegan( attack_stats.to_csv(stats_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Eval cosine-scoring given enroll x-vector and test wave" ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--v-file", dest="v_file", required=True) - parser.add_argument("--key-file", dest="key_file", default=None) - parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", default=None) + parser.add_argument("--enroll-file", required=True) parser.add_argument("--test-wav-file", required=True) AR.add_class_args(parser) @@ -395,7 +394,6 @@ def eval_cosine_scoring_wavegan( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( "--vad-path-prefix", - dest="vad_path_prefix", default=None, help=("scp file_path prefix for vad"), ) @@ -494,3 +492,7 @@ def eval_cosine_scoring_wavegan( logging.debug(args) eval_cosine_scoring_wavegan(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py index fba182c4..781cdbdf 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py @@ -11,13 +11,17 @@ import numpy as np import pandas as pd +import torch +import torch.nn as nn from art.classifiers import PyTorchClassifier from art.estimators.classification import PyTorchClassifier -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) -import torch -import torch.nn as nn from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -25,8 +29,9 @@ from hyperion.io import VADReaderFactory as VRF from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.adv_attacks.art_attack_factory import \ - ARTAttackFactory as AttackFactory +from hyperion.torch.adv_attacks.art_attack_factory import ( + ARTAttackFactory as AttackFactory, +) from hyperion.torch.layers import LinBinCalibrator as Calibrator from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device @@ -70,7 +75,6 @@ def load_calibrator(cal_file): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -108,7 +112,7 @@ def forward(self, s_t): s_t = s_t[0, 0] f_t = s_t - f_t = self.feat_extractor(s_t) + f_t, _ = 
self.feat_extractor(s_t) if self.vad_t is not None: n_vad_frames = len(self.vad_t) n_feat_frames = f_t.shape[1] @@ -157,7 +161,6 @@ def eval_cosine_scoring( num_seg_parts, **kwargs ): - device_type = "gpu" if use_gpu else "cpu" device = init_device(use_gpu) feat_extractor = init_feats(**kwargs) @@ -195,7 +198,7 @@ def eval_cosine_scoring( if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((key.num_models, key.num_tests), dtype="float32") attack_stats = pd.DataFrame( @@ -344,8 +347,7 @@ def eval_cosine_scoring( attack_stats.to_csv(stats_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Eval cosine-scoring given enroll x-vector " @@ -354,9 +356,9 @@ def eval_cosine_scoring( ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--v-file", dest="v_file", required=True) - parser.add_argument("--key-file", dest="key_file", default=None) - parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", default=None) + parser.add_argument("--enroll-file", required=True) parser.add_argument("--test-wav-file", required=True) AR.add_class_args(parser) @@ -365,7 +367,6 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( "--vad-path-prefix", - dest="vad_path_prefix", default=None, help=("scp file_path prefix for vad"), ) @@ -435,3 +436,7 @@ def eval_cosine_scoring( logging.debug(args) eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py index 3cfde93e..2ebb7e3d 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py @@ -10,11 +10,15 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import RandomAccessAudioReader as AR from hyperion.io import RandomAccessDataReaderFactory as DRF @@ -67,7 +71,6 @@ def load_calibrator(cal_file, device): def read_data(v_file, ndx_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) try: @@ -105,7 +108,6 @@ def eval_cosine_scoring( num_seg_parts, **kwargs ): - device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) model = load_model(model_path, device) @@ -121,8 +123,8 @@ def eval_cosine_scoring( audio_reader = AR(test_wav_file, **audio_args) if vad_spec is not None: - logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((ndx.num_models, ndx.num_tests), dtype="float32") with torch.no_grad(): @@ -140,7 +142,7 @@ def eval_cosine_scoring( t2 = time.time() s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device) - x_t = feat_extractor(s) + x_t, _ = 
feat_extractor(s) t4 = time.time() tot_frames = x_t.shape[1] if vad_spec is not None: @@ -200,8 +202,7 @@ def eval_cosine_scoring( s.save_txt(score_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Eval cosine-scoring given enroll x-vector and test wave" ) @@ -218,7 +219,6 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( "--vad-path-prefix", - dest="vad_path_prefix", default=None, help=("scp file_path prefix for vad"), ) @@ -270,3 +270,7 @@ def eval_cosine_scoring( logging.debug(args) eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py index 44bdf59d..a6f8efa4 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py @@ -10,11 +10,15 @@ import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -45,7 +49,7 @@ def __init__( def forward(self, s_t): f_t = s_t - f_t = self.feat_extractor(s_t) + f_t, _ = self.feat_extractor(s_t) if self.vad_t is not None: n_vad_frames = len(self.vad_t) n_feat_frames = f_t.shape[1] @@ -105,7 +109,6 @@ def load_calibrator(cal_file, threshold): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -147,7 +150,6 @@ def eval_cosine_scoring( num_seg_parts, **kwargs ): - device = init_device(use_gpu) # load victim model feat_extractor = init_feats(**kwargs["feats"]) @@ -205,7 +207,7 @@ def eval_cosine_scoring( if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((key.num_models, key.num_tests), dtype="float32") attack_stats = pd.DataFrame( @@ -338,8 +340,7 @@ def eval_cosine_scoring( attack_stats.to_csv(stats_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Eval cosine-scoring given enroll x-vector and " @@ -361,7 +362,6 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( "--vad-path-prefix", - dest="vad_path_prefix", default=None, help=("scp file_path prefix for vad"), ) @@ -437,3 +437,7 @@ def eval_cosine_scoring( logging.debug(args) eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py index 676575fd..7b8bc245 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py @@ -11,13 +11,17 @@ import numpy as np import pandas as pd +import torch +import torch.nn as nn from art.classifiers import PyTorchClassifier from art.estimators.classification import 
PyTorchClassifier -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) -import torch -import torch.nn as nn from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -25,8 +29,9 @@ from hyperion.io import VADReaderFactory as VRF from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.adv_attacks.art_attack_factory import \ - ARTAttackFactory as AttackFactory +from hyperion.torch.adv_attacks.art_attack_factory import ( + ARTAttackFactory as AttackFactory, +) from hyperion.torch.layers import LinBinCalibrator as Calibrator from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device @@ -54,8 +59,7 @@ def __init__( self.threshold = threshold def forward(self, s_t): - f_t = s_t - f_t = self.feat_extractor(s_t) + f_t, _ = self.feat_extractor(s_t) if self.vad_t is not None: n_vad_frames = len(self.vad_t) n_feat_frames = f_t.shape[1] @@ -115,7 +119,6 @@ def load_calibrator(cal_file): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -157,7 +160,6 @@ def eval_cosine_scoring( num_seg_parts, **kwargs ): - device_type = "gpu" if use_gpu else "cpu" device = init_device(use_gpu) # load victim model @@ -213,7 +215,7 @@ def eval_cosine_scoring( if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((key.num_models, key.num_tests), dtype="float32") attack_stats = pd.DataFrame( @@ -363,8 +365,7 @@ def eval_cosine_scoring( attack_stats.to_csv(stats_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Eval cosine-scoring given enroll x-vector and " @@ -387,7 +388,6 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( "--vad-path-prefix", - dest="vad_path_prefix", default=None, help=("scp file_path prefix for vad"), ) @@ -461,3 +461,7 @@ def eval_cosine_scoring( logging.debug(args) eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_logits_from_wav.py b/hyperion/bin/eval_xvec_logits_from_wav.py index da6389fb..b2e6a665 100755 --- a/hyperion/bin/eval_xvec_logits_from_wav.py +++ b/hyperion/bin/eval_xvec_logits_from_wav.py @@ -11,10 +11,14 @@ import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -77,13 +81,15 @@ def augment(key0, x0, augmenter, aug_df, aug_id): def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): - utt_length = rng.randint(low=min_utt_length, high=max_utt_length + 1) + utt_length = rng.integers(low=min_utt_length, high=max_utt_length + 1) if utt_length < x.shape[1]: - first_frame = 
rng.randint(low=0, high=x.shape[1] - utt_length) + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) x = x[:, first_frame : first_frame + utt_length] logging.info( - "extract-random-utt %s of length=%d first-frame=%d" - % (key, x.shape[1], first_frame) + "extract-random-utt %s of length=%d first-frame=%d", + key, + x.shape[1], + first_frame, ) return x @@ -93,7 +99,6 @@ def eval_xvec( output_spec, vad_spec, write_num_frames_spec, - scp_sep, vad_path_prefix, model_path, chunk_length, @@ -106,8 +111,7 @@ def eval_xvec( use_gpu, **kwargs ): - - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) model = load_model(model_path, device) @@ -125,18 +129,17 @@ def eval_xvec( num_augs = 1 ar_args = AR.filter_args(**kwargs) - logging.info("opening output stream: %s" % (output_spec)) - with DWF.create(output_spec, scp_sep=scp_sep) as writer: - + logging.info("opening output stream: %s", output_spec) + with DWF.create(output_spec) as writer: logging.info( "opening input stream: {} with args={}".format(input_spec, ar_args) ) with AR(input_spec, **ar_args) as reader: - if vad_spec is not None: - logging.info("opening VAD stream: %s" % (vad_spec)) + logging.info("opening VAD stream: %s", vad_spec) v_reader = VRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep + vad_spec, + path_prefix=vad_path_prefix, ) while not reader.eof(): @@ -159,7 +162,7 @@ def eval_xvec( x[None, :], dtype=torch.get_default_dtype() ).to(device) - x = feat_extractor(x) + x, _ = feat_extractor(x) t5 = time.time() tot_frames = x.shape[1] if vad_spec is not None: @@ -168,13 +171,11 @@ def eval_xvec( x = x[:, vad] logging.info( - "utt %s detected %d/%d (%.2f %%) speech frames" - % ( - key, - x.shape[1], - tot_frames, - x.shape[1] / tot_frames * 100, - ) + "utt %s detected %d/%d (%.2f %%) speech frames", + key, + x.shape[1], + tot_frames, + x.shape[1] / tot_frames * 100, ) if random_utt_length: @@ -199,27 +200,23 @@ def eval_xvec( read_time = t2 - t1 tot_time = read_time + t8 - t3 logging.info( - ( - "utt %s total-time=%.3f read-time=%.3f " - "aug-time=%.3f feat-time=%.3f " - "vad-time=%.3f embed-time=%.3f write-time=%.3f " - "rt-factor=%.2f" - ) - % ( - key, - tot_time, - read_time, - t4 - t3, - t5 - t4, - t6 - t5, - t7 - t6, - t8 - t7, - x0.shape[0] / fs[0] / tot_time, - ) + "utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f", + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x0.shape[0] / fs[0] / tot_time, ) if write_num_frames_spec is not None: - logging.info("writing num-frames to %s" % (write_num_frames_spec)) + logging.info("writing num-frames to %s", write_num_frames_spec) u2nf = Utt2Info.create(keys, info) u2nf.save(write_num_frames_spec) @@ -228,8 +225,7 @@ def eval_xvec( aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Evaluates x-vectors logits from waveform computing " @@ -243,7 +239,7 @@ def eval_xvec( parser.add_argument( "--write-num-frames", dest="write_num_frames_spec", default=None ) - parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) + parser.add_argument( "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") ) @@ -303,3 +299,7 @@ def eval_xvec( logging.debug(args) 
eval_xvec(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_wav2vec2xvectors.py b/hyperion/bin/extract_wav2vec2xvectors.py index 37d6a2a6..f2df9581 100755 --- a/hyperion/bin/extract_wav2vec2xvectors.py +++ b/hyperion/bin/extract_wav2vec2xvectors.py @@ -11,11 +11,15 @@ import numpy as np import pandas as pd +import torch import torchaudio.transforms as tat -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) -import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -28,25 +32,6 @@ resamplers = {} -def get_resampler(source_fs, target_fs): - if source_fs in resamplers: - return resamplers[source_fs] - - resampler = tat.Resample( - int(source_fs), - int(target_fs), - lowpass_filter_width=64, - rolloff=0.9475937167399596, - resampling_method="kaiser_window", - beta=14.769656459379492, - ) - resampler_f = lambda x: resampler(torch.from_numpy(x)).numpy() - resamplers[source_fs] = resampler_f - return resampler_f - -resamplers = {} - - def get_resampler(source_fs, target_fs): if source_fs in resamplers: return resamplers[source_fs] @@ -104,9 +89,11 @@ def augment(key0, x0, augmenter, aug_df, aug_id): def select_random_chunk(key, x, fs, min_utt_length, max_utt_length, rng): - utt_length = rng.randint(low=fs * min_utt_length, high=fs * max_utt_length + 1) + utt_length = rng.integers( + low=int(fs * min_utt_length), high=int(fs * max_utt_length + 1) + ) if utt_length < x.shape[1]: - first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) x = x[:, first_frame : first_frame + utt_length] logging.info( "extract-random-utt %s of length=%d first-frame=%d", @@ -118,11 +105,10 @@ def select_random_chunk(key, x, fs, min_utt_length, max_utt_length, rng): def extract_xvectors( - input_spec, + recordings_file, output_spec, vad_spec, write_speech_dur, - scp_sep, vad_path_prefix, model_path, hf_chunk_length, @@ -137,8 +123,7 @@ def extract_xvectors( use_gpu, **kwargs, ): - - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) model = load_model(model_path, device) @@ -157,15 +142,14 @@ def extract_xvectors( ar_args = AR.filter_args(**kwargs) ar_args["wav_scale"] = 1.0 logging.info("opening output stream: %s", output_spec) - with DWF.create(output_spec, scp_sep=scp_sep) as writer: - - logging.info(f"opening input stream: {input_spec} with args={ar_args}") - with AR(input_spec, **ar_args) as reader: - + with DWF.create(output_spec) as writer: + logging.info(f"opening input stream: {recordings_file} with args={ar_args}") + with AR(recordings_file, **ar_args) as reader: if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) v_reader = VRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep + vad_spec, + path_prefix=vad_path_prefix, ) while not reader.eof(): @@ -180,9 +164,7 @@ def extract_xvectors( t2 = time.time() if fs != model.sample_frequency: resampler = get_resampler(fs, model.sample_frequency) - print(f"x01 {x0.shape} {np.max(x0)}") x0 = resampler(x0) - print(f"x01 {x0.shape} {np.max(x0)}") logging.info("processing utt %s", key0) for aug_id in range(num_augs): @@ -238,7 
+220,7 @@ def extract_xvectors( writer.write([key], [y]) if write_speech_dur is not None: keys.append(key) - info.append(str(x.shape[1] * fs)) + info.append(str(x.shape[1] / fs)) t8 = time.time() read_time = t2 - t1 @@ -271,8 +253,7 @@ def extract_xvectors( aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Extracts x-vectors from waveform computing " "acoustic features on the fly" @@ -280,10 +261,9 @@ def extract_xvectors( ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--recordings-file", required=True) parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument("--write-speech-dur", default=None) - parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) parser.add_argument( "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") ) @@ -299,7 +279,7 @@ def extract_xvectors( parser.add_argument("--model-path", required=True) parser.add_argument( "--hf-chunk-length", - type=int, + type=float, default=0, help=( "max. chunk length used in each forward pass " @@ -309,7 +289,7 @@ def extract_xvectors( ) parser.add_argument( "--xvec-chunk-length", - type=int, + type=float, default=0, help=( "max. chunk length used in each forward pass " @@ -335,18 +315,18 @@ def extract_xvectors( ) parser.add_argument( "--min-utt-length", - type=int, + type=float, default=5, help=("minimum utterance length in secs when using random utt length"), ) parser.add_argument( "--max-utt-length", - type=int, + type=float, default=120, help=("maximum utterance length in secs when using random utt length"), ) - parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument("--output-spec", required=True) parser.add_argument( "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" ) @@ -360,3 +340,7 @@ def extract_xvectors( logging.debug(args) extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_wav2xvectors.py b/hyperion/bin/extract_wav2xvectors.py new file mode 100755 index 00000000..763df3fc --- /dev/null +++ b/hyperion/bin/extract_wav2xvectors.py @@ -0,0 +1,334 @@ +#!/usr/bin/env python +""" + Copyright 2019 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time + +import numpy as np +import pandas as pd +import torch +import torchaudio.transforms as tat +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.augment import SpeechAugment +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info + +resamplers = {} + + +def get_resampler(source_fs, target_fs): + if source_fs in resamplers: + return resamplers[source_fs] + + resampler = tat.Resample( + int(source_fs), + int(target_fs), + lowpass_filter_width=64, + rolloff=0.9475937167399596, + resampling_method="kaiser_window", + beta=14.769656459379492, + ) + resampler_f = lambda x: resampler(torch.from_numpy(x)).numpy() + resamplers[source_fs] = 
resampler_f + return resampler_f + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus=%d", num_gpus) + device = open_device(num_gpus=num_gpus) + return device + + +def load_model(model_path, device): + logging.info("loading model %s", model_path) + model = TML.load(model_path) + logging.info(f"xvector-model={model}") + model.to(device) + model.eval() + return model + + +def augment(key0, x0, augmenter, aug_df, aug_id): + if augmenter is None: + x = x0 + key = key0 + else: + x, aug_info = augmenter(x0) + key = "%s-aug-%02d" % (key0, aug_id) + aug_df_row = { + "key_aug": key, + "key_orig": key0, + "noise_type": aug_info["noise"]["noise_type"], + "snr": aug_info["noise"]["snr"], + "rir_type": aug_info["reverb"]["rir_type"], + "srr": aug_info["reverb"]["srr"], + "sdr": aug_info["sdr"], + } + + aug_df.append(pd.DataFrame(aug_df_row, index=[0])) + + return key, x + + +def select_random_chunk(key, x, fs, min_utt_length, max_utt_length, rng): + utt_length = rng.integers( + low=int(fs * min_utt_length), high=int(fs * max_utt_length + 1) + ) + if utt_length < x.shape[1]: + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) + x = x[:, first_frame : first_frame + utt_length] + logging.info( + "extract-random-utt %s of length=%d first-frame=%d", + key, + x.shape[1], + first_frame, + ) + return x + + +def extract_xvectors( + recordings_file, + output_spec, + vad_spec, + write_speech_dur, + vad_path_prefix, + model_path, + chunk_length, + embed_layer, + random_utt_length, + min_utt_length, + max_utt_length, + aug_cfg, + num_augs, + aug_info_path, + use_gpu, + **kwargs, +): + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) + device = init_device(use_gpu) + model = load_model(model_path, device) + + if write_speech_dur is not None: + keys = [] + info = [] + + if aug_cfg is not None: + augmenter = SpeechAugment.create(aug_cfg, rng=rng) + aug_df = [] + else: + augmenter = None + aug_df = None + num_augs = 1 + + metadata_columns = ["speech_duration"] + + ar_args = AR.filter_args(**kwargs) + logging.info("opening output stream: %s with args=%s", output_spec, str(ar_args)) + with DWF.create(output_spec, metadata_columns=metadata_columns) as writer: + logging.info(f"opening input stream: {recordings_file} with args={ar_args}") + with AR(recordings_file, **ar_args) as reader: + if vad_spec is not None: + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) + + while not reader.eof(): + t1 = time.time() + key, x0, fs = reader.read(1) + if len(key) == 0: + break + + x0 = x0[0] + key0 = key[0] + fs = fs[0] + t2 = time.time() + if fs != model.sample_frequency: + resampler = get_resampler(fs, model.sample_frequency) + x0 = resampler(x0) + + logging.info("processing utt %s", key0) + for aug_id in range(num_augs): + metadata = {} + t3 = time.time() + key, x = augment(key0, x0, augmenter, aug_df, aug_id) + t4 = time.time() + with torch.no_grad(): + x = torch.tensor( + x[None, :], dtype=torch.get_default_dtype() + ).to(device) + t5 = time.time() + tot_samples = x.shape[1] + if vad_spec is not None: + vad = v_reader.read(key0)[0] + vad = torch.tensor( + vad[None, None, :], dtype=torch.float + ).to(device) + vad = torch.nn.functional.interpolate( + vad, size=x.size(-1), mode="nearest" + ).bool()[0, 0] + x = x[:, vad] + + logging.info( + "utt %s detected %d/%d (%.2f %%) speech samples", + key, + x.shape[1], + tot_samples, + x.shape[1] / tot_samples * 
100, + ) + + if random_utt_length: + x = select_random_chunk( + key, x, fs, min_utt_length, max_utt_length, rng + ) + + metadata["speech_duration"] = ( + x.shape[1] / model.sample_frequency + ) + + t6 = time.time() + if x.shape[1] == 0: + y = np.zeros((model.embed_dim,), dtype=float_cpu()) + else: + y = ( + model.extract_embed( + x, + chunk_length=chunk_length, + embed_layer=embed_layer, + ) + .cpu() + .numpy()[0] + ) + + t7 = time.time() + writer.write([key], [y], metadata=metadata) + if write_speech_dur is not None: + keys.append(key) + info.append(str(x.shape[1] / fs)) + + t8 = time.time() + read_time = t2 - t1 + tot_time = read_time + t8 - t3 + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ), + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x.shape[1] / fs / tot_time, + ) + + if write_speech_dur is not None: + logging.info("writing speech duration in secs to %s", write_speech_dur) + u2sd = Utt2Info.create(keys, info) + u2sd.save(write_speech_dur) + + if aug_info_path is not None: + aug_df = pd.concat(aug_df, ignore_index=True) + aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") + + +def main(): + parser = ArgumentParser( + description="""Extracts x-vectors from waveform computing acoustic features on the fly""" + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--recordings-file", required=True) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument("--write-speech-dur", default=None) + parser.add_argument( + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") + ) + + AR.add_class_args(parser) + + parser.add_argument("--aug-cfg", default=None) + parser.add_argument("--aug-info-path", default=None) + parser.add_argument( + "--num-augs", default=1, type=int, help="number of augmentations per utterance" + ) + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--chunk-length", + type=float, + default=0, + help=( + "max. 
chunk length used in each forward pass " + "of the x-vector encoder," + "if 0 the full utterance is used" + ), + ) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from, " + "if None, it uses layer set in training phase" + ), + ) + + parser.add_argument( + "--random-utt-length", + default=False, + action="store_true", + help="calculates x-vector from a random chunk", + ) + parser.add_argument( + "--min-utt-length", + type=float, + default=5, + help=("minimum utterance length in secs when using random utt length"), + ) + parser.add_argument( + "--max-utt-length", + type=float, + default=120, + help=("maximum utterance length in secs when using random utt length"), + ) + + parser.add_argument("--output-spec", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_xvectors_from_feats.py b/hyperion/bin/extract_xvectors_from_feats.py index 926e0bcc..e70225c2 100755 --- a/hyperion/bin/extract_xvectors_from_feats.py +++ b/hyperion/bin/extract_xvectors_from_feats.py @@ -10,10 +10,14 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialDataReaderFactory as DRF @@ -51,13 +55,15 @@ def load_model(model_path, device): def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): - utt_length = rng.randint(low=min_utt_length, high=max_utt_length + 1) + utt_length = rng.integers(low=min_utt_length, high=max_utt_length + 1) if utt_length < x.shape[1]: - first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) x = x[:, first_frame : first_frame + utt_length] logging.info( - "extract-random-utt %s of length=%d first-frame=%d" - % (key, x.shape[1], first_frame) + "extract-random-utt %s of length=%d first-frame=%d", + key, + x.shape[1], + first_frame, ) return x @@ -77,9 +83,8 @@ def extract_xvectors( use_gpu, **kwargs ): - logging.info("initializing") - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) mvn = init_mvn(device, **kwargs) model = load_model(model_path, device) @@ -91,7 +96,6 @@ def extract_xvectors( dr_args = DRF.filter_args(**kwargs) logging.info("opening output stream: %s" % (output_spec)) with DWF.create(output_spec) as writer: - logging.info("opening input stream: %s" % (input_spec)) with DRF.create(input_spec, **dr_args) as reader: if vad_spec is not None: @@ -169,8 +173,7 @@ def extract_xvectors( u2nf.save(write_num_frames_spec) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Extracts x-vectors from features") parser.add_argument("--cfg", action=ActionConfigFile) @@ -239,3 +242,7 @@ def extract_xvectors( logging.debug(args) 
extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_xvectors_from_wav.py b/hyperion/bin/extract_xvectors_from_wav.py index addabbcf..71a24bd4 100755 --- a/hyperion/bin/extract_xvectors_from_wav.py +++ b/hyperion/bin/extract_xvectors_from_wav.py @@ -11,10 +11,14 @@ import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -77,9 +81,9 @@ def augment(key0, x0, augmenter, aug_df, aug_id): def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): - utt_length = rng.randint(low=min_utt_length, high=max_utt_length + 1) + utt_length = rng.integers(low=min_utt_length, high=max_utt_length + 1) if utt_length < x.shape[1]: - first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) x = x[:, first_frame : first_frame + utt_length] logging.info( "extract-random-utt %s of length=%d first-frame=%d", @@ -91,11 +95,10 @@ def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): def extract_xvectors( - input_spec, + recordings_file, output_spec, vad_spec, write_num_frames_spec, - scp_sep, vad_path_prefix, model_path, chunk_length, @@ -109,8 +112,7 @@ def extract_xvectors( use_gpu, **kwargs ): - - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) model = load_model(model_path, device) @@ -129,18 +131,14 @@ def extract_xvectors( ar_args = AR.filter_args(**kwargs) logging.info("opening output stream: %s", output_spec) - with DWF.create(output_spec, scp_sep=scp_sep) as writer: - + with DWF.create(output_spec) as writer: logging.info( - "opening input stream: {} with args={}".format(input_spec, ar_args) + "opening input stream: {} with args={}".format(recordings_file, ar_args) ) - with AR(input_spec, **ar_args) as reader: - + with AR(recordings_file, **ar_args) as reader: if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep - ) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) while not reader.eof(): t1 = time.time() @@ -162,7 +160,7 @@ def extract_xvectors( x[None, :], dtype=torch.get_default_dtype() ).to(device) - x = feat_extractor(x) + x, _ = feat_extractor(x) t5 = time.time() tot_frames = x.shape[1] if vad_spec is not None: @@ -235,21 +233,19 @@ def extract_xvectors( aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( - "Extracts x-vectors from waveform computing " "acoustic features on the fly" + "Extracts x-vectors from waveform computing acoustic features on the fly" ) ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--recordings-file", required=True) parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( "--write-num-frames", dest="write_num_frames_spec", default=None ) - 
parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) parser.add_argument( "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") ) @@ -304,7 +300,7 @@ def extract_xvectors( help=("maximum utterance length when using random utt length"), ) - parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument("--output-spec", required=True) parser.add_argument( "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" ) @@ -318,3 +314,7 @@ def extract_xvectors( logging.debug(args) extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_xvectors_slidwin_from_feats.py b/hyperion/bin/extract_xvectors_slidwin_from_feats.py index e3d2fcbb..a1186ed2 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_feats.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_feats.py @@ -10,11 +10,15 @@ import time import numpy as np +import torch import yaml -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) -import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialDataReaderFactory as DRF @@ -70,20 +74,18 @@ def extract_xvectors( use_gpu, **kwargs ): - logging.info("initializing") - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) mvn = init_mvn(device, **kwargs) model = load_model(model_path, device) if write_timestamps_spec is not None: - time_writer = DWF.create(write_timestamps_spec, scp_sep=" ") + time_writer = DWF.create(write_timestamps_spec) dr_args = DRF.filter_args(**kwargs) logging.info("opening output stream: %s" % (output_spec)) with DWF.create(output_spec) as writer: - logging.info("opening input stream: %s" % (output_spec)) with DRF.create(input_spec, **dr_args) as reader: if vad_spec is not None: @@ -115,7 +117,13 @@ def extract_xvectors( t4 = time.time() if x.shape[0] == 0: - y = np.zeros((1, model.embed_dim,), dtype=float_cpu(),) + y = np.zeros( + ( + 1, + model.embed_dim, + ), + dtype=float_cpu(), + ) else: xx = torch.tensor(x.T[None, :], dtype=torch.get_default_dtype()) with torch.no_grad(): @@ -192,8 +200,7 @@ def extract_xvectors( yaml.dump(params, f) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Extract x-vectors over a sliding window") parser.add_argument("--cfg", action=ActionConfigFile) @@ -206,7 +213,6 @@ def extract_xvectors( parser.add_argument("--slidwin-params-path", default=None) parser.add_argument( "--vad-path-prefix", - dest="vad_path_prefix", default=None, help=("scp file_path prefix for vad"), ) @@ -298,3 +304,7 @@ def extract_xvectors( logging.debug(args) extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_xvectors_slidwin_from_wav.py b/hyperion/bin/extract_xvectors_slidwin_from_wav.py index 2b1bba3b..f973b566 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_wav.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_wav.py @@ -11,11 +11,15 @@ import numpy as np import pandas as pd +import torch import yaml -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + 
ActionParser, + ArgumentParser, + namespace_to_dict, +) -import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -83,7 +87,6 @@ def extract_xvectors( vad_spec, write_timestamps_spec, slidwin_params_path, - scp_sep, vad_path_prefix, model_path, chunk_length, @@ -97,8 +100,7 @@ def extract_xvectors( use_gpu, **kwargs ): - - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) model = load_model(model_path, device) @@ -109,7 +111,7 @@ def extract_xvectors( feat_snip_edges = feat_args["snip_edges"] if write_timestamps_spec is not None: - time_writer = DWF.create(write_timestamps_spec, scp_sep=scp_sep) + time_writer = DWF.create(write_timestamps_spec) if aug_cfg is not None: augmenter = SpeechAugment.create(aug_cfg, rng=rng) @@ -121,17 +123,16 @@ def extract_xvectors( ar_args = AR.filter_args(**kwargs) logging.info("opening output stream: %s", output_spec) - with DWF.create(output_spec, scp_sep=scp_sep) as writer: - + with DWF.create(output_spec) as writer: logging.info( "opening input stream: {} with args={}".format(input_spec, ar_args) ) with AR(input_spec, **ar_args) as reader: - if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) v_reader = VRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep + vad_spec, + path_prefix=vad_path_prefix, ) while not reader.eof(): @@ -154,7 +155,7 @@ def extract_xvectors( x[None, :], dtype=torch.get_default_dtype() ).to(device) - x = feat_extractor(x) + x, _ = feat_extractor(x) t5 = time.time() tot_frames = x.shape[1] if vad_spec is not None: @@ -163,18 +164,22 @@ def extract_xvectors( x = x[:, vad] logging.info( - "utt %s detected %d/%d (%.2f %%) speech frames" - % ( - key, - x.shape[1], - tot_frames, - x.shape[1] / tot_frames * 100, - ) + "utt %s detected %d/%d (%.2f %%) speech frames", + key, + x.shape[1], + tot_frames, + x.shape[1] / tot_frames * 100, ) t6 = time.time() if x.shape[1] == 0: - y = np.zeros((1, model.embed_dim,), dtype=float_cpu(),) + y = np.zeros( + ( + 1, + model.embed_dim, + ), + dtype=float_cpu(), + ) else: x = x.transpose(1, 2).contiguous() y = ( @@ -257,8 +262,7 @@ def extract_xvectors( yaml.dump(params, f) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Extract x-vectors over a sliding window" @@ -275,7 +279,6 @@ def extract_xvectors( ) parser.add_argument("--slidwin-params-path", default=None) - parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) parser.add_argument( "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") ) @@ -350,3 +353,7 @@ def extract_xvectors( logging.debug(args) extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_wav2vec2languageid.py b/hyperion/bin/finetune_wav2vec2languageid.py new file mode 100755 index 00000000..0403f84c --- /dev/null +++ b/hyperion/bin/finetune_wav2vec2languageid.py @@ -0,0 +1,290 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import sys +import os +from pathlib import Path +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import 
logging +import multiprocessing + +import numpy as np + +import torch +import torch.nn as nn + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.utils import ddp +from hyperion.torch.trainers import LanguageIDTrainer as Trainer +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import HFWav2Vec2ResNet1dLanguageID +from torch.nn.utils.rnn import pad_sequence + +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + + +model_dict = { + "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dLanguageID, + # "hf_hubert2resnet1d": HFHubert2ResNet1LanguageID, + # "hf_wavlm2resnet1d": HFWavLM2ResNet1dLanguageID, +} + + +def Language_collate(batch): + audio = [] + audio_length = [] + language = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + language.append(record["language"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + + language = [language[k] for k in sort_idx] + language = torch.as_tensor(language) + + batch = { + "x": audio, + "x_lengths": audio_length, + "language": language, + } + return batch + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ({ + "num_workers": num_workers_per_gpu, + "pin_memory": True + } if num_gpus > 0 else {}) + data_loader = torch.utils.data.DataLoader(dataset, + batch_sampler=sampler, + **largs) + # , + # collate_fn=Language_collate) + return data_loader + + +def init_model(num_classes, in_model_file, rank, model_class, **kwargs): + model_args = model_class.filter_finetune_args(**kwargs["model"]) + if rank == 0: + logging.info("model network ft args={}".format(model_args)) + + model_args["languageid"]["num_classes"] = num_classes + model = TML.load(in_model_file) + logging.info(model_args) + model.change_config(**model_args) + + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def init_hard_prototype_mining(model, train_loader, val_loader, rank): + if not train_loader.batch_sampler.hard_prototype_mining: + return + + if rank == 0: + logging.info("setting hard prototypes") + + affinity_matrix = model.compute_prototype_affinity() + train_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + if not val_loader.batch_sampler.hard_prototype_mining: + return + + val_loader.batch_sampler.set_hard_prototypes(affinity_matrix) 
+ +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + + model = init_model(list(train_loader.dataset.num_classes.values())[0], **kwargs) + init_hard_prototype_mining(model, train_loader, val_loader, rank) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + # loss_weight=train_loader.batch_sampler.class_info["weights"], + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", + action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + parser.add_argument("--data.val.dataset.text_file", type=str) + + + parser.link_arguments( + "data.train.dataset.class_files", "data.val.dataset.class_files" + ) + parser.add_argument( + "--data.train.dataset.class_names", + type=str, + ) + + parser.add_argument( + "--data.dev.dataset.class_names", + type=str, + ) + + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments("data.train.data_loader.num_workers", + "data.val.data_loader.num_workers") + + parser.add_argument("--in-model-file", required=True) + model_class.add_finetune_args(parser, prefix="model") + Trainer.add_class_args(parser, + prefix="trainer", + train_modes=model_class.valid_train_modes()) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", + type=int, + default=1123581321, + help="random seed") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser( + description="Train Wav2Vec2Language model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 
0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/bin/finetune_wav2vec2rnn_film_transducer.py b/hyperion/bin/finetune_wav2vec2rnn_film_transducer.py new file mode 100755 index 00000000..5ff51348 --- /dev/null +++ b/hyperion/bin/finetune_wav2vec2rnn_film_transducer.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import k2 +import numpy as np +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, + HFWav2Vec2RNNFiLMTransducer) +from hyperion.torch.trainers import TransducerLanguageIDTrainer as Trainer +from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from torch.nn.utils.rnn import pad_sequence + +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + +model_dict = { + "hf_wav2vec2rnn_transducer": HFWav2Vec2RNNTransducer, + "hf_wav2vec2rnn_filmed_transducer": HFWav2Vec2RNNFiLMTransducer, + "hf_wav2vec2rnn_rnn_transducer": HFWav2Vec2RNNRNNTransducer, + # "hf_hubert2rnn_transducer": HFWav2Vec2RNNTransducer, + # "hf_hubert2rnn_rnn_transducer": Hubert2RNNRNNTransducer, + # "hf_wavlm2rnn_transducer": HFHubert2RNNTransducer, + # "hf_wavlm2rnn_rnn_transducer": HFWavLM2RNNRNNTransducer, +} + + +def transducer_collate(batch): + audio = [] + audio_length = [] + target = [] + language = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record["text"]) + language.append(record["language"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + target = [target[k] for k in sort_idx] + target = k2.RaggedTensor(target) + + language = [language[k] for k in sort_idx] + language = torch.as_tensor(language) + + # FiLM: add language ID to the input + batch = { + "x": audio, + "x_lengths": audio_length, + "text": target, + "language": language, + } + return batch + + + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank 
== 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate) + return data_loader + + +def init_model(in_model_file, rank, model_class, **kwargs): + model_args = model_class.filter_finetune_args(**kwargs["model"]) + # model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network ft args={}".format(model_args)) + model = TML.load(in_model_file) + model.change_config(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + + + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_model(**kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + parser.link_arguments( + "data.train.dataset.bpe_model", "data.val.dataset.bpe_model" + ) + + + parser.add_argument("--in-model-file", required=True) + model_class.add_finetune_args(parser, prefix="model") + # model_class.add_class_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) + 
ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser(description="Fine-tune Wav2Vec2Transducer model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/bin/finetune_wav2vec2rnn_film_transducer_languageid.py b/hyperion/bin/finetune_wav2vec2rnn_film_transducer_languageid.py new file mode 100755 index 00000000..514fe4d1 --- /dev/null +++ b/hyperion/bin/finetune_wav2vec2rnn_film_transducer_languageid.py @@ -0,0 +1,310 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path +import gc +import k2 +import numpy as np +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, + HFWav2Vec2RNNFiLMTransducer, + HFWav2Vec2RNNTransducerResnet1D, + HFWav2Vec2RNNFiLMTransducerResnet1D) +from hyperion.torch.trainers import TransducerLanguageIDTrainer as Trainer +from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from torch.nn.utils.rnn import pad_sequence + + +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + +model_dict = { + "hf_wav2vec2rnn_transducer_resnet1d": HFWav2Vec2RNNTransducerResnet1D, + "hf_wav2vec2rnn_film_transducer_resnet1d": HFWav2Vec2RNNFiLMTransducerResnet1D, + +} + + +def transducer_language_collate(batch): + audio = [] + audio_length = [] + target = [] + language = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record["text"]) + language.append(record["language"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + target = [target[k] for k in sort_idx] + target = k2.RaggedTensor(target) + + language = [language[k] for k in sort_idx] + language = torch.as_tensor(language) + + # FiLM: add language ID to the input + batch = { + "x": audio, + "x_lengths": audio_length, + "text": 
target, + "language": language, + } + return batch + + + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs, collate_fn=transducer_language_collate) + return data_loader + +def init_model(num_classes, loss_class_weight, in_model_file, rank, model_class, **kwargs): + model_args = model_class.filter_finetune_args(**kwargs["model"]) + # model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network ft args={}".format(model_args)) + model_args["languageid"]["num_classes"] = num_classes + # model_args["loss_class_weight"] = loss_class_weight + model = TML.load(in_model_file) + logging.info(model_args) + model.change_config(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model +# def init_model(in_model_transducer, in_model_lid, rank, model_class, **kwargs): +# # load pretrained models +# model_wav2transducer = torch.load(in_model_transducer) +# model_wav2lid = torch.load(in_model_lid) +# if rank == 0: +# logging.info("init joint model") +# logging.info("hf_feats network ft args={}".format(model_wav2transducer["model_cfg"]["hf_feats"])) +# logging.info("transducer network ft args={}".format(model_wav2transducer["model_cfg"]["transducer"])) +# logging.info("languageid network ft args={}".format(model_wav2lid["model_cfg"]["languageid"])) +# logging.info("feat_fusion_start={}".format(model_wav2transducer["model_cfg"]["feat_fusion_start"])) +# logging.info("feat_fusion_method_transducer={}".format(model_wav2transducer["model_cfg"]["feat_fusion_method"])) +# logging.info("feat_fusion_method_languageid={}".format(model_wav2lid["model_cfg"]["feat_fusion_method"])) + +# # init joint model +# model = model_class(hf_feats=model_wav2transducer["model_cfg"]["hf_feats"], +# transducer=model_wav2transducer["model_cfg"]["transducer"], +# languageid=model_wav2lid["model_cfg"]["languageid"], +# feat_fusion_start=model_wav2transducer["model_cfg"]["feat_fusion_start"], +# feat_fusion_method_transducer=model_wav2transducer["model_cfg"]["feat_fusion_method"], +# feat_fusion_method_languageid=model_wav2lid["model_cfg"]["feat_fusion_method"], +# loss_weight_transducer=kwargs["model"]["loss_weight_transducer"], +# loss_weight_lid=kwargs["model"]["loss_weight_lid"], +# lid_length=kwargs["model"]["lid_length"], +# ) + +# copy_model_parameters(model, model_wav2transducer["model_state_dict"], model_wav2lid["model_state_dict"], rank) + + +# # add finetune args +# model_args = model_class.filter_finetune_args(**kwargs["model"]) + +# # model_args = model_class.filter_args(**kwargs["model"]) +# if rank == 
0: +# logging.info("model network ft args={}".format(model_args)) +# model_args["languageid"]["num_classes"] = model_wav2lid["model_cfg"]["languageid"]["num_classes"] +# model.change_config(**model_args) +# if rank == 0: +# logging.info("model={}".format(model)) + +# model_wav2transducer = None +# model_wav2lid = None +# gc.collect() +# torch.cuda.empty_cache() +# return model + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + # model = init_model(list(train_loader.dataset.num_classes.values())[0], **kwargs) + model = init_model(list(train_loader.dataset.num_classes.values())[0], + train_loader.batch_sampler.class_info["weights"], + **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {} #{"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + parser.link_arguments( + "data.train.dataset.bpe_model", "data.val.dataset.bpe_model" + ) + + + # parser.add_argument("--in-model-transducer", required=True) + # parser.add_argument("--in-model-lid", required=True) + parser.add_argument("--in-model-file", required=True) + + model_class.add_finetune_args(parser, prefix="model") + # model_class.add_class_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +if __name__ == 
"__main__": + parser = ArgumentParser(description="Fine-tune Wav2Vec2Transducer model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/bin/finetune_wav2vec2rnn_transducer.py b/hyperion/bin/finetune_wav2vec2rnn_transducer.py new file mode 100755 index 00000000..64d352e0 --- /dev/null +++ b/hyperion/bin/finetune_wav2vec2rnn_transducer.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import k2 +import numpy as np +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer) +from hyperion.torch.trainers import TransducerTrainer as Trainer +from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from torch.nn.utils.rnn import pad_sequence + +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + +model_dict = { + "hf_wav2vec2rnn_transducer": HFWav2Vec2RNNTransducer, + "hf_wav2vec2rnn_rnn_transducer": HFWav2Vec2RNNRNNTransducer, + # "hf_hubert2rnn_transducer": HFWav2Vec2RNNTransducer, + # "hf_hubert2rnn_rnn_transducer": Hubert2RNNRNNTransducer, + # "hf_wavlm2rnn_transducer": HFHubert2RNNTransducer, + # "hf_wavlm2rnn_rnn_transducer": HFWavLM2RNNRNNTransducer, +} + + +def transducer_collate(batch): + audio = [] + audio_length = [] + target = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record["text"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + target = [target[k] for k in sort_idx] + target = k2.RaggedTensor(target) + + batch = { + "x": audio, + "x_lengths": audio_length, + "text": target, + } + return batch + + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = 
is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate) + return data_loader + + +def init_model(in_model_file, rank, model_class, **kwargs): + model_args = model_class.filter_finetune_args(**kwargs["model"]) + # model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network ft args={}".format(model_args)) + model = TML.load(in_model_file) + model.change_config(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + + + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_model(**kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + parser.link_arguments( + "data.train.dataset.bpe_model", "data.val.dataset.bpe_model" + ) + + + parser.add_argument("--in-model-file", required=True) + model_class.add_finetune_args(parser, prefix="model") + # model_class.add_class_args(parser, prefix="model") + Trainer.add_class_args( + parser, 
prefix="trainer", train_modes=model_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser(description="Fine-tune Wav2Vec2Transducer model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/bin/finetune_wav2vec2transducer.py b/hyperion/bin/finetune_wav2vec2transducer.py index df267e72..138f18f7 100755 --- a/hyperion/bin/finetune_wav2vec2transducer.py +++ b/hyperion/bin/finetune_wav2vec2transducer.py @@ -12,11 +12,16 @@ import k2 import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) +from torch.nn.utils.rnn import pad_sequence + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import AudioDataset as AD @@ -25,7 +30,6 @@ from hyperion.torch.models import HFWav2Vec2Transducer from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp -from torch.nn.utils.rnn import pad_sequence model_dict = { "hf_wav2vec2transducer": HFWav2Vec2Transducer, @@ -44,8 +48,7 @@ def transducer_collate(batch): audio = pad_sequence(audio) audio_length = torch.as_tensor(audio_length) target = k2.RaggedTensor(target) - return torch.transpose(audio,0,1), audio_length, target - + return torch.transpose(audio, 0, 1), audio_length, target def init_data(partition, rank, num_gpus, **kwargs): @@ -74,7 +77,9 @@ def init_data(partition, rank, num_gpus, **kwargs): largs = ( {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} ) - data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate) + data_loader = torch.utils.data.DataLoader( + dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate + ) return data_loader @@ -90,11 +95,7 @@ def init_model(in_model_file, rank, model_class, **kwargs): return model - - - def train_model(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -120,7 +121,7 @@ def train_model(gpu_id, args): trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: logging.info("trainer args={}".format(trn_args)) - metrics = {} + metrics = {} trainer = Trainer( model, device=device, @@ -136,7 +137,7 @@ def train_model(gpu_id, args): def make_parser(model_class): parser = ArgumentParser() - + parser.add_argument("--cfg", action=ActionConfigFile) train_parser = ArgumentParser(prog="") 
AD.add_class_args(train_parser, prefix="dataset", skip={}) @@ -162,27 +163,23 @@ def make_parser(model_class): data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) parser.add_argument("--data", action=ActionParser(parser=data_parser)) - parser.add_argument( "--data.train.dataset.text_file", - type=str, + type=str, ) - - parser.add_argument("--data.val.dataset.text_file", type=str) - + + parser.add_argument("--data.val.dataset.text_file", type=str) + parser.add_argument( "--data.train.dataset.bpe_model", - type=str, + type=str, ) parser.link_arguments( "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" ) - parser.link_arguments( - "data.train.dataset.bpe_model", "data.val.dataset.bpe_model" - ) - + parser.link_arguments("data.train.dataset.bpe_model", "data.val.dataset.bpe_model") parser.add_argument("--in-model-file", required=True) model_class.add_finetune_args(parser, prefix="model") @@ -199,8 +196,10 @@ def make_parser(model_class): return parser -if __name__ == "__main__": - parser = ArgumentParser(description="Fine-tune Wav2Vec2Transducer model from audio files") +def main(): + parser = ArgumentParser( + description="Fine-tune Wav2Vec2Transducer model from audio files" + ) parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() @@ -229,3 +228,7 @@ def make_parser(model_class): # torch docs recommend using forkserver # multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_wav2vec2xvector.py b/hyperion/bin/finetune_wav2vec2xvector.py index b3edd9b5..31f500d8 100755 --- a/hyperion/bin/finetune_wav2vec2xvector.py +++ b/hyperion/bin/finetune_wav2vec2xvector.py @@ -11,22 +11,32 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory from hyperion.torch.metrics import CategoricalAccuracy -from hyperion.torch.models import (HFHubert2ResNet1dXVector, - HFWav2Vec2ResNet1dXVector, - HFWavLM2ResNet1dXVector) +from hyperion.torch.models import ( + HFHubert2ResNet1dXVector, + HFWav2Vec2ResNet1dXVector, + HFWavLM2ResNet1dXVector, +) from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, "hf_hubert2resnet1d": HFHubert2ResNet1dXVector, @@ -35,7 +45,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -79,7 +88,12 @@ def init_model(num_classes, in_model_file, rank, **kwargs): def init_hard_prototype_mining(model, train_loader, val_loader, rank): - if not train_loader.batch_sampler.hard_prototype_mining: + try: + hard_prototype_mining = train_loader.batch_sampler.hard_prototype_mining + except: + hard_prototype_mining = False + + if not hard_prototype_mining: return if rank == 0: @@ -95,7 +109,6 @@ def init_hard_prototype_mining(model, 
train_loader, val_loader, rank): def train_model(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -118,7 +131,12 @@ def train_model(gpu_id, args): logging.info("trainer args={}".format(trn_args)) metrics = {"acc": CategoricalAccuracy()} trainer = Trainer( - model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args, + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + # loss_weight=train_loader.batch_sampler.class_info["weights"], + **trn_args, ) trainer.load_last_checkpoint() trainer.fit(train_loader, val_loader) @@ -174,8 +192,7 @@ def make_parser(model_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Finetunes Wav2Vec2XVector model from audio files" ) @@ -207,3 +224,7 @@ def make_parser(model_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_wav2xvector.py b/hyperion/bin/finetune_wav2xvector.py new file mode 100755 index 00000000..97356c01 --- /dev/null +++ b/hyperion/bin/finetune_wav2xvector.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy + +# from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import Wav2ResNet1dXVector as R1dXVec +from hyperion.torch.models import Wav2ResNetXVector as RXVec + +# from hyperion.torch.models import SpineNetXVector as SpineXVec +# from hyperion.torch.models import TDNNXVector as TDXVec +# from hyperion.torch.models import TransformerXVectorV1 as TFXVec +# from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.utils import ddp + +xvec_dict = { + "resnet": RXVec, + "resnet1d": R1dXVec, + # "efficientnet": EXVec, + # "tdnn": TDXVec, + # "transformer": TFXVec, + # "spinenet": SpineXVec, +} + + +def init_data(partition, rank, num_gpus, **kwargs): + kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**kwargs["dataset"]) + sampler_args = kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, 
batch_sampler=sampler, **largs) + return data_loader + + +def init_xvector(num_classes, in_model_file, rank, xvec_class, **kwargs): + xvec_args = xvec_class.filter_finetune_args(**kwargs["model"]) + if rank == 0: + logging.info("xvector network ft args={}".format(xvec_args)) + xvec_args["xvector"]["num_classes"] = num_classes + model = TML.load(in_model_file) + model.change_config(**xvec_args) + if rank == 0: + logging.info("x-vector-model={}".format(model)) + return model + + +def init_hard_prototype_mining(model, train_loader, val_loader, rank): + try: + hard_prototype_mining = train_loader.batch_sampler.hard_prototype_mining + except: + hard_prototype_mining = False + + if not hard_prototype_mining: + return + + if rank == 0: + logging.info("setting hard prototypes") + + affinity_matrix = model.compute_prototype_affinity() + train_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + try: + hard_prototype_mining = val_loader.batch_sampler.hard_prototype_mining + except: + hard_prototype_mining = False + + if not hard_prototype_mining: + return + + val_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + +def train_xvec(gpu_id, args): + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_xvector(list(train_loader.dataset.num_classes.values())[0], **kwargs) + init_hard_prototype_mining(model, train_loader, val_loader, rank) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(xvec_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + + train_parser = ArgumentParser(prog="") + + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + parser.link_arguments( + "data.train.dataset.class_files", "data.val.dataset.class_files" + ) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + xvec_class.add_finetune_args(parser, prefix="model") + parser.add_argument("--in-model-file", required=True) + Trainer.add_class_args( + parser, prefix="trainer", train_modes=xvec_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, 
default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +def main(): + parser = ArgumentParser(description="Fine-tune x-vector model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for k, v in xvec_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + xvec_type = args.subcommand + args_sc = vars(args)[xvec_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.xvec_class = xvec_dict[xvec_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_xvector_dfr_from_feats.py b/hyperion/bin/finetune_xvector_dfr_from_feats.py index 2ac01025..140cc3a2 100755 --- a/hyperion/bin/finetune_xvector_dfr_from_feats.py +++ b/hyperion/bin/finetune_xvector_dfr_from_feats.py @@ -12,11 +12,15 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import ClassWeightedSeqSampler as Sampler @@ -61,7 +65,6 @@ def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **k def init_xvector( num_classes, in_model_path, prior_model_path, rank, train_mode, **kwargs ): - xvec_args = XVec.filter_finetune_args(**kwargs) if rank == 0: logging.info("xvector network ft args={}".format(xvec_args)) @@ -195,8 +198,7 @@ def train_xvec(gpu_id, args): # trainer.fit(train_loader, test_loader) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Fine-tune x-vector model with deep feature loss regularization" ) @@ -279,3 +281,7 @@ def train_xvec(gpu_id, args): # del args.seed # train_xvec(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_xvector_dfr_from_wav.py b/hyperion/bin/finetune_xvector_dfr_from_wav.py index ff97d3ca..9d745e67 100755 --- a/hyperion/bin/finetune_xvector_dfr_from_wav.py +++ b/hyperion/bin/finetune_xvector_dfr_from_wav.py @@ -8,13 +8,18 @@ import os import sys import time +from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import AudioDataset as AD @@ -37,7 +42,6 @@ def init_data( rank, **kwargs ): - ad_args = AD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: @@ -83,7 +87,6 @@ def init_feats(rank, **kwargs): def init_xvector( num_classes, in_model_path, prior_model_path, rank, train_mode, **kwargs ): - xvec_args = XVec.filter_finetune_args(**kwargs) if rank == 0: 
logging.info("xvector network ft args={}".format(xvec_args)) @@ -104,7 +107,6 @@ def init_xvector( def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -232,8 +234,7 @@ def train_xvec(gpu_id, args): # trainer.fit(train_loader, test_loader) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Fine-tune x-vector model with deep feature loss " @@ -328,3 +329,7 @@ def train_xvec(gpu_id, args): # del args.seed # train_xvec(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_xvector_from_feats.py b/hyperion/bin/finetune_xvector_from_feats.py index 7a1fb5a9..01e0c778 100755 --- a/hyperion/bin/finetune_xvector_from_feats.py +++ b/hyperion/bin/finetune_xvector_from_feats.py @@ -11,10 +11,14 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import ClassWeightedSeqSampler as Sampler @@ -162,8 +166,7 @@ def train_xvec(gpu_id, args): # trainer.fit(train_loader, test_loader) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Fine-tune x-vector model") parser.add_argument("--cfg", action=ActionConfigFile) @@ -231,3 +234,7 @@ def train_xvec(gpu_id, args): # del args.seed # train_xvec(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_xvector_from_wav.py b/hyperion/bin/finetune_xvector_from_wav.py index 227892ea..2c884d0b 100755 --- a/hyperion/bin/finetune_xvector_from_wav.py +++ b/hyperion/bin/finetune_xvector_from_wav.py @@ -10,10 +10,14 @@ import time from pathlib import Path -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import AudioDataset as AD @@ -40,7 +44,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -121,7 +124,6 @@ def init_hard_prototype_mining(model, train_loader, val_loader, rank): def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -209,8 +211,7 @@ def make_parser(xvec_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Fine-tune x-vector model from audio files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -241,70 +242,5 @@ def make_parser(xvec_class): train_xvec(gpu_id, args_sc) -# if __name__ == "__main__": - -# parser = ArgumentParser(description="Fine-tune x-vector model from audio files") -# parser.add_argument("--cfg", action=ActionConfigFile) - -# train_parser = ArgumentParser(prog="") -# AD.add_class_args(train_parser, prefix="dataset", skip={}) -# Sampler.add_class_args(train_parser, prefix="sampler") -# train_parser.add_argument( -# "--data_loader.num-workers", -# type=int, -# default=5, -# help="num_workers of data loader", -# ) - -# val_parser = ArgumentParser(prog="") -# AD.add_class_args(val_parser, prefix="dataset", 
skip={}) -# Sampler.add_class_args(val_parser, prefix="sampler") -# val_parser.add_argument( -# "--data_loader.num-workers", -# type=int, -# default=5, -# help="num_workers of data loader", -# ) -# data_parser = ArgumentParser(prog="") -# data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) -# data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) -# parser.add_argument("--data", action=ActionParser(parser=data_parser)) -# parser.link_arguments( -# "data.train.dataset.class_file", "data.val.dataset.class_file" -# ) -# parser.link_arguments( -# "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" -# ) -# parser.link_arguments( -# "data.train.sampler.batch_size", "data.val.sampler.batch_size" -# ) - -# AF.add_class_args(parser, prefix="feats") -# parser.add_argument("--in-model-path", required=True) - -# XVec.add_finetune_args(parser, prefix="model") -# Trainer.add_class_args( -# parser, prefix="trainer", train_modes=XVec.valid_train_modes() -# ) -# ddp.add_ddp_args(parser) - -# parser.add_argument("--seed", type=int, default=1123581321, help="random seed") -# parser.add_argument( -# "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int -# ) -# parser.add_argument("--local_rank", default=0, type=int) - -# args = parser.parse_args() -# gpu_id = args.local_rank -# del args.local_rank - -# if gpu_id == 0: -# try: -# config_file = Path(args.exp_path) / "config.yaml" -# parser.save(args, str(config_file), format="yaml", overwrite=True) -# except: -# pass - -# # torch docs recommend using forkserver -# multiprocessing.set_start_method("forkserver") -# train_xvec(gpu_id, args) +if __name__ == "__main__": + main() diff --git a/hyperion/bin/generate_adv_attacks_xvector_classif.py b/hyperion/bin/generate_adv_attacks_xvector_classif.py index a058893d..00452695 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_classif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_classif.py @@ -11,12 +11,16 @@ import numpy as np import pandas as pd -import yaml -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn +import yaml +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -153,14 +157,13 @@ def generate_attacks( num_parts, **kwargs ): - device = init_device(use_gpu) model = init_model(model_path, **kwargs) model.to(device) logging.info("opening audio read stream: %s" % (wav_file)) audio_args = AR.filter_args(**kwargs) - audio_reader = AR(wav_file ** audio_args) + audio_reader = AR(wav_file**audio_args) wav_scale = audio_reader.wav_scale logging.info("opening audio write stream: %s" % (output_wav_dir)) @@ -168,7 +171,7 @@ def generate_attacks( if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) keys, class_names, class_ids = read_utt_list( list_file, class2int_file, part_idx, num_parts @@ -204,7 +207,7 @@ def generate_attacks( s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device) target = torch.as_tensor([class_id], dtype=torch.long).to(device) if vad_spec is not None: - vad = v_reader.read([key.seg_set[j]])[0] + vad = 
v_reader.read([key])[0] tot_frames = len(vad) speech_frames = np.sum(vad) vad = torch.as_tensor(vad.astype(np.bool, copy=False), dtype=torch.bool).to( @@ -214,7 +217,7 @@ def generate_attacks( logging.info( "utt %s detected %d/%d (%.2f %%) speech frames" % ( - key.seg_set[j], + key, speech_frames, tot_frames, speech_frames / tot_frames * 100, @@ -312,8 +315,7 @@ def generate_attacks( yaml.dump(attacks_info, f, sort_keys=True) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Generate Attacks for speaker classification with x-vectors" ) @@ -330,7 +332,6 @@ def generate_attacks( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( "--vad-path-prefix", - dest="vad_path_prefix", default=None, help=("scp file_path prefix for vad"), ) @@ -413,3 +414,7 @@ def generate_attacks( logging.debug(args) generate_attacks(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/generate_adv_attacks_xvector_verif.py b/hyperion/bin/generate_adv_attacks_xvector_verif.py index 83375cb6..ab7d907b 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_verif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_verif.py @@ -11,12 +11,16 @@ import numpy as np import pandas as pd -import yaml -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn +import yaml +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -74,7 +78,6 @@ def forward(self, s_t): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -174,7 +177,6 @@ def generate_attacks( random_seed, **kwargs ): - device = init_device(use_gpu) model = init_model(model_path, embed_layer, cal_file, threshold, **kwargs) model.to(device) @@ -197,7 +199,7 @@ def generate_attacks( if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) attack_factory = init_attack_factory(**kwargs) attacks_info = {} @@ -347,8 +349,7 @@ def generate_attacks( yaml.dump(attacks_info, f, sort_keys=True) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Generate Attacks for speaker verification with x-vectors+cos+calibration" ) @@ -443,3 +444,7 @@ def generate_attacks( logging.debug(args) generate_attacks(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/hyperion_dataset.py b/hyperion/bin/hyperion_dataset.py new file mode 100755 index 00000000..17fff2ba --- /dev/null +++ b/hyperion/bin/hyperion_dataset.py @@ -0,0 +1,464 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from pathlib import Path +from typing import List, Optional, Union + +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.utils import ( + ClassInfo, + Dataset, + EnrollmentMap, + FeatureSet, + InfoTable, + PathLike, + RecordingSet, + 
SegmentSet, +) + +subcommand_list = [ + "add_features", + "set_recordings", + "make_from_recordings", + "remove_short_segments", + "rebuild_class_idx", + "remove_classes_few_segments", + "split_train_val", + "copy", + "add_cols_to_segments", +] + + +def add_common_args(parser): + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + ) + + +def make_add_features_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--features-name", required=True, help="""name of the feature""" + ) + parser.add_argument("--features-file", required=True, help="""feature set file""") + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def add_features( + dataset: PathLike, + features_name: str, + features_file: PathLike, + output_dataset: PathLike, +): + if output_dataset is None: + output_dataset = dataset + + dataset = Dataset.load(dataset, lazy=True) + dataset.add_features(features_name, features_file) + dataset.save(output_dataset) + + +def make_set_recordings_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--recordings-file", required=True, help="""recordings set file""" + ) + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + parser.add_argument( + "--remove-features", + default=None, + nargs="+", + help="""removes feature files from the dataset, + since they maybe obsolote after modifiying the recordings""", + ) + parser.add_argument( + "--update-seg-durs", + default=False, + action=ActionYesNo, + help="""updates the durations in the segment table""", + ) + + add_common_args(parser) + return parser + + +def set_recordings( + dataset: PathLike, + recordings_file: PathLike, + output_dataset: PathLike, + remove_features: List[str], + update_seg_durs: bool, +): + if output_dataset is None: + output_dataset = dataset + + dataset = Dataset.load(dataset, lazy=True) + dataset.set_recordings(recordings_file, update_seg_durs) + if remove_features is not None: + for features_name in remove_features: + dataset.remove_features(features_name) + + dataset.save(output_dataset) + + +def make_make_from_recordings_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--recordings-file", required=True, help="""recordings set file""" + ) + + add_common_args(parser) + return parser + + +def make_from_recordings( + dataset: PathLike, + recordings_file: PathLike, +): + output_dataset = dataset + import pandas as pd + + rec_df = pd.read_csv(recordings_file) + seg_df = rec_df[["id"]] + segments = SegmentSet(seg_df) + dataset = Dataset(segments, recordings=recordings_file) + dataset.save(output_dataset) + + +def make_remove_short_segments_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--min-length", + required=True, + 
type=float, + help="""minimum required length of the segment""", + ) + + parser.add_argument( + "--length-name", + default="duration", + help="""name of the column indicating the length of the segment""", + ) + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def remove_short_segments( + dataset: PathLike, + min_length: float, + length_name: str, + output_dataset: PathLike, +): + if output_dataset is None: + output_dataset = dataset + + dataset = Dataset.load(dataset, lazy=True) + dataset.remove_short_segments(min_length, length_name) + dataset.save(output_dataset) + + +def make_rebuild_class_idx_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--class-name", required=True, help="""name of the class type e.g.: speaker""" + ) + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def rebuild_class_idx( + dataset: PathLike, + class_name: str, + output_dataset: PathLike, +): + if output_dataset is None: + output_dataset = dataset + + dataset = Dataset.load(dataset, lazy=True) + dataset.rebuild_class_idx(class_name) + dataset.save(output_dataset) + + +def make_remove_classes_few_segments_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--class-name", required=True, help="""name of the class type e.g.: speaker""" + ) + parser.add_argument( + "--min-segs", default=1, type=int, help="""min. num. of segments/class""" + ) + parser.add_argument( + "--rebuild-idx", + default=False, + action=ActionYesNo, + help="""regenerate class indexes from 0 to new_num_classes-1""", + ) + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def remove_classes_few_segments( + dataset: PathLike, + class_name: str, + min_segs: int, + rebuild_idx: bool, + output_dataset: PathLike, +): + if output_dataset is None: + output_dataset = dataset + + dataset = Dataset.load(dataset, lazy=True) + dataset.remove_classes_few_segments(class_name, min_segs, rebuild_idx) + dataset.save(output_dataset) + + +def make_split_train_val_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""input dataset dir or .yaml file""" + ) + parser.add_argument( + "--val-prob", + default=0.05, + type=float, + help="""proportion of segments used for val""", + ) + parser.add_argument( + "--min-train-samples", + default=1, + type=int, + help="""min. 
number of training samples / class""", + ) + + parser.add_argument( + "--joint-classes", + default=None, + nargs="+", + help="""types of classes that need to have same classes in train and val""", + ) + parser.add_argument( + "--disjoint-classes", + default=None, + nargs="+", + help="""types of classes that need to have different classes in train and val""", + ) + parser.add_argument( + "--seed", + default=11235813, + type=int, + help="""random seed""", + ) + + parser.add_argument( + "--train-dataset", + required=True, + help="""output train dataset dir""", + ) + parser.add_argument( + "--val-dataset", + required=True, + help="""output val dataset dir""", + ) + + add_common_args(parser) + return parser + + +def split_train_val( + dataset: PathLike, + val_prob: float, + joint_classes: List[str], + disjoint_classes: List[str], + min_train_samples: int, + seed: int, + train_dataset: PathLike, + val_dataset: PathLike, +): + dataset = Dataset.load(dataset, lazy=True) + train_ds, val_ds = dataset.split_train_val( + val_prob, joint_classes, disjoint_classes, min_train_samples, seed + ) + train_ds.save(train_dataset) + val_ds.save(val_dataset) + + num_total = len(dataset) + num_train = len(train_ds) + num_val = len(val_ds) + logging.info( + "train: %d (%.2f%%) segments, val: %d (%.2f%%) segments", + num_train, + num_train / num_total * 100, + num_val, + num_val / num_total * 100, + ) + + +def make_copy_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--output-dataset", + required=True, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def copy( + dataset: PathLike, + output_dataset: PathLike, +): + dataset = Dataset.load(dataset, lazy=True) + dataset.save(output_dataset) + + +def make_add_cols_to_segments_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--right-table", required=True, help="table where the new data is" + ) + parser.add_argument( + "--columns", + required=True, + nargs="+", + help="""columns to copy to segments table""", + ) + parser.add_argument( + "--on", + default=["id"], + nargs="+", + help="""columns to match both tables rows""", + ) + parser.add_argument( + "--right-on", + default=None, + nargs="+", + help="""columns to match both tables rows""", + ) + + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def add_cols_to_segments( + dataset: PathLike, + right_table: PathLike, + column_names: List[str], + on: List[str], + right_on: List[str], + output_dataset: PathLike, +): + if output_dataset is None: + output_dataset = dataset + + dataset = Dataset.load(dataset, lazy=True) + dataset.add_cols_to_segments(right_table, column_names, on, right_on) + dataset.save(output_dataset) + + +def main(): + parser = ArgumentParser(description="Tool to manipulates the Hyperion dataset") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for subcommand in subcommand_list: + parser_func = f"make_{subcommand}_parser" + subparser = globals()[parser_func]() + subcommands.add_subcommand(subcommand, subparser) + + args = 
parser.parse_args() + subcommand = args.subcommand + kwargs = namespace_to_dict(args)[args.subcommand] + config_logger(kwargs["verbose"]) + del kwargs["verbose"] + del kwargs["cfg"] + globals()[subcommand](**kwargs) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/hyperion_tables.py b/hyperion/bin/hyperion_tables.py new file mode 100755 index 00000000..59472d83 --- /dev/null +++ b/hyperion/bin/hyperion_tables.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from pathlib import Path +from typing import List, Optional, Union + +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.utils import ( + ClassInfo, + EnrollmentMap, + FeatureSet, + InfoTable, + PathLike, + RecordingSet, + SegmentSet, +) + +subcommand_list = ["cat"] +table_dict = { + "segments": SegmentSet, + "recordings": RecordingSet, + "features": FeatureSet, + "classes": ClassInfo, + "enrollments": EnrollmentMap, + "generic": InfoTable, +} + + +def add_common_args(parser): + parser.add_argument( + "--table-type", + default="generic", + choices=list(table_dict.keys()), + help=f"Type of table in {list(table_dict.keys())}", + ) + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + ) + + +def make_cat_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--input-files", default=None, nargs="+", help="optional list of input files" + ) + parser.add_argument( + "--output-file", + required=True, + help="""output file, if input-files is None, input files names are derived from it""", + ) + parser.add_argument( + "--num-tables", + default=0, + type=int, + help="""number of jobs we used to create the individual tables""", + ) + parser.add_argument( + "--base-idx", + default=1, + type=int, + help="""index of the first job, typically 0 or 1""", + ) + + add_common_args(parser) + return parser + + +def cat( + table_type: str, + input_files: Union[List[PathLike], None], + output_file: PathLike, + num_tables: int, + base_idx: int = 1, +): + assert input_files is not None or num_tables != 0 + output_file = Path(output_file) + if input_files is None: + ext = output_file.suffix + input_file_base = output_file.with_suffix("") + input_files = [] + for i in range(num_tables): + idx = base_idx + i + input_file_i = input_file_base.with_suffix(f".{idx}{ext}") + input_files.append(input_file_i) + + table_class = table_dict[table_type] + tables = [] + for file_path in input_files: + tables.append(table_class.load(file_path)) + + output_table = table_class.cat(tables) + output_table.save(output_file) + + +def main(): + parser = ArgumentParser(description="Tool to manipulates the Hyperion data tables") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for subcommand in subcommand_list: + parser_func = f"make_{subcommand}_parser" + subparser = globals()[parser_func]() + subcommands.add_subcommand(subcommand, subparser) + + args = parser.parse_args() + subcommand = args.subcommand + kwargs = namespace_to_dict(args)[args.subcommand] + config_logger(kwargs["verbose"]) + del kwargs["verbose"] + del kwargs["cfg"] + globals()[subcommand](**kwargs) + + +if __name__ == "__main__": + main() diff --git 
a/hyperion/bin/identificate_wav2languageid.py b/hyperion/bin/identificate_wav2languageid.py new file mode 100755 index 00000000..37cf22e4 --- /dev/null +++ b/hyperion/bin/identificate_wav2languageid.py @@ -0,0 +1,239 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from typing import Dict, List, Tuple + +import sentencepiece as spm +import torch.nn as nn + +import sys +import os +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging + +import numpy as np +import pandas as pd + +import torch + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.utils import Utt2Info +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.np.augment import SpeechAugment + +from hyperion.torch.utils import open_device +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch import TorchModelLoader as TML + +from hyperion.torch.models.wav2transducer.beam_search import greedy_search, beam_search + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def load_model(model_path, device): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("lid-model={}".format(model)) + model.to(device) + model.eval() + return model + + +def decode_one_batch( + model: nn.Module, + lang_dict: Dict[int, str], + x: torch.Tensor) -> Dict[str, List[List[str]]]: + """Decode one batch and return the result in a dict. The dict has the + following format: + - key: It indicates the setting used for decoding. For example, + if greedy_search is used, it would be "greedy_search" + If beam search with a beam size of 7 is used, it would be + "beam_7" + - value: It contains the decoding result. `len(value)` equals to + batch size. `value[i]` is the decoding result for the i-th + utterance in the given batch. + Args: + params: + It's the return value of :func:`get_params`. + model: + The neural model. + batch: + It is the return value from iterating + `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation + for the format of the `batch`. + Returns: + Return the decoding result. See above description for the format of + the returned dict. 
+ """ + device = model.device + feature = x #batch["inputs"] + assert x.shape[0] == 1 + assert feature.ndim == 2 + + feature = feature.to(device) + # at entry, feature is (N, T, C) + + # feature_lens = torch.Tensor([x.shape[1]]).int() + + # encoder_out, hid_feats, encoder_out_lens = model.forward_feats( + # x=feature, x_lengths=feature_lens) + + predictions = [] + batch_size = feature.size(0) + + # encoder_out = encoder_out.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + + for i in range(batch_size): + # # fmt: off + # encoder_out_i = encoder_out[i:i + 1, :encoder_out_lens[i]] + # fmt: on + output = model(feature) + _, pred = torch.max(output, dim=-1) + # to integer + pred = pred.cpu().numpy().tolist()[0] + predictions.append(lang_dict[pred]) + + logging.info("hyps:{}".format(" ".join(predictions))) + + return predictions + + +def decode_languageid(input_spec, output_spec, scp_sep, model_path, lang_file, + use_gpu, **kwargs): + + device = init_device(use_gpu) + model = load_model(model_path, device) + + logging.info(nn.functional.softmax(model.feat_fuser, dim=-1)) + # load language dict form langfile by row number + lang_dict = {} + with open(lang_file, "r") as f: + for i, line in enumerate(f): + lang_dict[i] = line.strip() + + augmenter = None + aug_df = None + num_augs = 1 + + ar_args = AR.filter_args(**kwargs) + logging.info("opening output: %s" % (output_spec)) + # with DWF.create(output_spec, scp_sep=scp_sep) as writer: + with open(output_spec, "w") as writer: + logging.info("opening input stream: {} with args={}".format( + input_spec, ar_args)) + with AR(input_spec, **ar_args) as reader: + while not reader.eof(): + t1 = time.time() + key, x0, fs = reader.read(1) + if len(key) == 0: + break + + x0 = x0[0] + key0 = key[0] + t2 = time.time() + + logging.info("processing utt %s" % (key0)) + for aug_id in range(num_augs): + t3 = time.time() + key, x = key0, x0 #augment(key0, x0, augmenter, aug_df, aug_id) + t4 = time.time() + with torch.no_grad(): + x = torch.tensor( + x[None, :], + dtype=torch.get_default_dtype()).to(device) + + t5 = time.time() + tot_frames = x.shape[1] + + # logging.info( + # "utt %s detected %d/%d (%.2f %%) speech frames" % ( + # key, + # x.shape[1], + # tot_frames, + # x.shape[1] / tot_frames * 100, + # )) + + t6 = time.time() + if x.shape[1] == 0: + y = np.zeros((model.embed_dim, ), + dtype=float_cpu()) + else: + y = decode_one_batch(model=model, lang_dict=lang_dict, x=x) + + t7 = time.time() + + # writer.write([key], [y]) + writer.write(key + ' ' + ' '.join(y)+ "\n") + t8 = time.time() + read_time = t2 - t1 + tot_time = read_time + t8 - t3 + logging.info( + ("utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f") % ( + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x0.shape[0] / fs[0] / tot_time, + )) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description=("Extracts x-vectors from waveform computing " + "acoustic features on the fly")) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--scp-sep", + default=" ", + help=("scp file field separator")) + + AR.add_class_args(parser) + + AF.add_class_args(parser, prefix="feats") + + parser.add_argument("--model-path", required=True) + + parser.add_argument("--lang-file", required=True) + parser.add_argument("--output", dest="output_spec", required=True) + 
parser.add_argument("--use-gpu", + default=False, + action="store_true", + help="extract xvectors in gpu") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + decode_languageid(**namespace_to_dict(args)) diff --git a/hyperion/bin/make_babble_noise_audio_files.py b/hyperion/bin/make_babble_noise_audio_files.py index 972ff01f..43d6ab91 100755 --- a/hyperion/bin/make_babble_noise_audio_files.py +++ b/hyperion/bin/make_babble_noise_audio_files.py @@ -10,9 +10,12 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) -from scipy import ndimage, signal +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from hyperion.hyp_defs import config_logger from hyperion.io import AudioWriter as Writer @@ -21,92 +24,94 @@ from hyperion.utils import Utt2Info -def make_noise(xs): - +def make_noise(xs, max_value): lens = np.array([x.shape[0] for x in xs]) max_len = np.max(lens) num_tiles = np.ceil(max_len / lens) for i in range(len(xs)): xs[i] = np.tile(xs[i], int(num_tiles[i]))[:max_len] + xs[0] -= xs[0].mean() for i in range(1, len(xs)): xs[0] += xs[i] - xs[i].mean() + max_x = np.max(np.abs(xs[0])) + if max_x > max_value: + xs[0] *= max_value / max_x + return xs[0] def make_babble_noise_audio_files( - input_path, + recordings_file, output_path, - output_script, - write_time_durs_spec, + output_recordings_file, + write_time_durs, min_spks=3, max_spks=7, num_reuses=5, random_seed=112358, - **kwargs + **kwargs, ): - input_args = AR.filter_args(**kwargs) output_args = Writer.filter_args(**kwargs) - logging.info("input_args={}".format(input_args)) - logging.info("output_args={}".format(output_args)) + logging.info(f"input_args={input_args}") + logging.info(f"output_args={output_args}") - rng = np.random.RandomState(seed=random_seed) + rng = np.random.default_rng(seed=random_seed) - if write_time_durs_spec is not None: + if write_time_durs is not None: okeys = [] info = [] count = 0 t1 = time.time() - with AR(input_path, **input_args) as reader: + with AR(recordings_file, **input_args) as reader, Writer( + output_path, output_recordings_file, **output_args + ) as writer: keys = reader.keys - with Writer(output_path, output_script, **output_args) as writer: - - for iters in range(num_reuses): - keys = rng.permutation(keys) - - cur_spks = min_spks + for iters in range(num_reuses): + keys = rng.permutation(keys) + + cur_spks = min_spks + utt_list = [] + for utt_idx in range(len(keys)): + if len(utt_list) < cur_spks: + utt_list.append(keys[utt_idx]) + continue + + x, fs = reader.read(utt_list) + fs = fs[0] + y = make_noise(x, reader.wav_scale) + babble_id = "babble-%05d" % (count) + logging.info("writing file %s", babble_id) + writer.write([babble_id], [y], [fs]) + if write_time_durs is not None: + okeys.append(babble_id) + info.append(y.shape[0] / fs) + + count += 1 utt_list = [] - for utt_idx in range(len(keys)): - if len(utt_list) < cur_spks: - utt_list.append(keys[utt_idx]) - continue - - x, fs = reader.read(utt_list) - fs = fs[0] - y = make_noise(x) - babble_id = "babble-%05d" % (count) - logging.info("writing file % s" % (babble_id)) - writer.write([babble_id], [y], [fs]) - if write_time_durs_spec is not None: - okeys.append(babble_id) - info.append(y.shape[0] / fs) - - count += 1 - utt_list = [] - cur_spks += 1 - if 
cur_spks > max_spks: - cur_spks = min_spks - - if write_time_durs_spec is not None: - logging.info("writing time durations to %s" % (write_time_durs_spec)) - u2td = Utt2Info.create(okeys, info) - u2td.save(write_time_durs_spec) + cur_spks += 1 + if cur_spks > max_spks: + cur_spks = min_spks - logging.info("finished making babble files, elapsed-time=%f" % (time.time() - t1)) + if write_time_durs is not None: + logging.info("writing time durations to %s", write_time_durs) + u2td = Utt2Info.create(okeys, info) + u2td.save(write_time_durs) + logging.info("finished making babble files, elapsed-time=%f", time.time() - t1) -if __name__ == "__main__": +def main(): parser = ArgumentParser(description="Creates babble noise by adding speech files") parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_path", required=True) + parser.add_argument("--recordings-file", required=True) parser.add_argument("--output-path", required=True) - parser.add_argument("--output-script", required=True) - parser.add_argument("--write-time-durs", dest="write_time_durs_spec", default=None) + parser.add_argument("--output-recordings-file", required=True) + parser.add_argument("--write-time-durs", default=None) AR.add_class_args(parser) Writer.add_class_args(parser) @@ -130,3 +135,7 @@ def make_babble_noise_audio_files( logging.debug(args) make_babble_noise_audio_files(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/make_wav2xvector.py b/hyperion/bin/make_wav2xvector.py new file mode 100755 index 00000000..b3a1a2d5 --- /dev/null +++ b/hyperion/bin/make_wav2xvector.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python +""" + Copyright 2023 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time + +import numpy as np +import pandas as pd +import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger + +# from hyperion.torch import TorchModelLoader as TML +from hyperion.torch import TorchModel + +# from hyperion.torch.models import SpineNetXVector as SpineXVec +# from hyperion.torch.models import TDNNXVector as TDXVec +# from hyperion.torch.models import TransformerXVectorV1 as TFXVec +# from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import ResNet1dXVector as R1dXVec +from hyperion.torch.models import ResNetXVector as RXVec +from hyperion.torch.models import Wav2ResNet1dXVector as W2R1dXVec +from hyperion.torch.models import Wav2ResNetXVector as W2RXVec +from hyperion.torch.narchs import AudioFeatsMVN as AF + + +def init_feats(feats): + feat_args = AF.filter_args(**feats) + logging.info(f"feat args={feat_args}") + logging.info("initializing feature extractor") + feat_extractor = AF(trans=True, **feat_args) + logging.info(f"feat-extractor={feat_extractor}") + return feat_extractor + + +def load_model(model_path): + logging.info("loading model %s", model_path) + model = TorchModel.auto_load(model_path) + logging.info(f"xvector-model={model}") + return model + + +def make_wav2xvector(feats, xvector_path, output_path): + feats = init_feats(feats) + xvector_model = load_model(xvector_path) + if isinstance(xvector_model, RXVec): + model = W2RXVec(feats, xvector_model) + elif isinstance(xvector_model, R1dXVec): + model = W2R1dXVec(feats, xvector_model) + else: + TypeError( + "Conversion of xvector class=%s 
not available", xvector_model.__class__ + ) + + logging.info("saving model of class %s to %s", model.__class__, output_path) + model.save(output_path) + + +def main(): + parser = ArgumentParser( + description="""Combines the feature extractor config with XVector model + to produce a Wav2XVector model with integrated feature extraction""" + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + AF.add_class_args(parser, prefix="feats") + parser.add_argument("--xvector-path", required=True) + parser.add_argument("--output-path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + del args.cfg + logging.debug(args) + + make_wav2xvector(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/merge_scores.py b/hyperion/bin/merge_scores.py new file mode 100755 index 00000000..72ab6010 --- /dev/null +++ b/hyperion/bin/merge_scores.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from pathlib import Path + +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.utils import TrialScores + + +def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, base_idx): + output_file = Path(output_file) + output_file.parent.mkdir(exist_ok=True, parents=True) + + ext = output_file.suffix + + if input_files is None: + if ext in [".h5", ".csv", ".tsv"]: + input_file_base = output_file + else: + input_file_base = output_file.parent / (output_file.name + ".txt") + ext = "" + + logging.info("merging %s* -> %s", input_file_base.with_suffix(""), output_file) + input_files = [] + for i in range(num_enroll_parts): + idx_i = base_idx + i + for j in range(num_test_parts): + idx_j = base_idx + j + input_file_i = input_file_base.with_suffix(f".{idx_i}.{idx_j}{ext}") + input_files.append(input_file_i) + else: + logging.info("merging %s -> %s", " + ".join(input_files), output_file) + + if ext == ".h5": + # if files are h5 we need to load everything in RAM + score_list = [] + for score_file in input_files: + scores = TrialScores.load(score_file) + score_list.append(scores) + + scores = TrialScores.merge(score_list) + scores.save(output_file) + else: + has_header = ext in [".csv", ".tsv"] + write_header = True + with open(output_file, "w", encoding="utf-8") as f_out: + for score_file in input_files: + with open(score_file) as f_in: + for i, line in enumerate(f_in): + if i == 0 and has_header and not write_header: + continue + f_out.write(line) + write_header = False + + +def main(): + parser = ArgumentParser(description="Tool to manipulates the Hyperion data tables") + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--input-files", default=None, nargs="+", help="optional list of input files" + ) + parser.add_argument( + "--output-file", + required=True, + help="""output file, if input-files is None, input files names are derived from it""", + ) + parser.add_argument( + "--num-enroll-parts", + default=1, + type=int, + help="""number of parts we divided the enrollment set""", + ) + parser.add_argument( + "--num-test-parts", + default=1, + type=int, + help="""number of parts we divided the test set""", + ) + + parser.add_argument( + 
"--base-idx", + default=1, + type=int, + help="""index of the first job, typically 0 or 1""", + ) + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + ) + + args = parser.parse_args() + kwargs = namespace_to_dict(args) + config_logger(kwargs["verbose"]) + del kwargs["verbose"] + del kwargs["cfg"] + merge_scores(**kwargs) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/pack_wav_rirs.py b/hyperion/bin/pack_wav_rirs.py index dccf58da..bf88d674 100755 --- a/hyperion/bin/pack_wav_rirs.py +++ b/hyperion/bin/pack_wav_rirs.py @@ -10,8 +10,12 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF @@ -19,8 +23,7 @@ def pack_wav_rirs(input_path, output_spec, **kwargs): - - writer = DWF.create(output_spec, scp_sep=" ", compress=False) + writer = DWF.create(output_spec, compress=False) t1 = time.time() with AR(input_path, wav_scale=1) as reader: for data in reader: @@ -33,16 +36,18 @@ def pack_wav_rirs(input_path, output_spec, **kwargs): h[h < 1e-3] = 0 h = np.trim_zeros(h) logging.info( - "Packing rir %s h_max=%f h_delay=%d h-length=%d" - % (key, h_max, h_delay, len(h)) + "Packing rir %s h_max=%f h_delay=%d h-length=%d", + key, + h_max, + h_delay, + len(h), ) writer.write([key], [h]) - logging.info("Packed RIRS elapsed-time=%.f" % (time.time() - t1)) - + logging.info("Packed RIRS elapsed-time=%.f", time.time() - t1) -if __name__ == "__main__": +def main(): parser = ArgumentParser(description="Packs RIRs in wave format to h5/ark files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -63,3 +68,7 @@ def pack_wav_rirs(input_path, output_spec, **kwargs): logging.debug(args) pack_wav_rirs(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/plot_embedding_tsne.py b/hyperion/bin/plot_embedding_tsne.py index e011dfe8..60d7ac5c 100755 --- a/hyperion/bin/plot_embedding_tsne.py +++ b/hyperion/bin/plot_embedding_tsne.py @@ -13,8 +13,13 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo, - ArgumentParser, namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) from hyperion.hyp_defs import config_logger from hyperion.io import RandomAccessDataReaderFactory as DRF @@ -41,7 +46,6 @@ def plot_embedding_tsne( output_dir, **kwargs, ): - output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) logging.info("loading data") @@ -127,8 +131,7 @@ def plot_embedding_tsne( # plt.clf() -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Projects embeddings using TSNE") parser.add_argument("--train-v-file", required=True) @@ -163,6 +166,9 @@ def plot_embedding_tsne( plot_embedding_tsne(**namespace_to_dict(args)) +if __name__ == "__main__": + main() + # #!/usr/bin/env python # """ # Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) diff --git a/hyperion/bin/plot_embedding_tsne_per_class.py b/hyperion/bin/plot_embedding_tsne_per_class.py index 6f35f074..08e4ef70 100755 --- a/hyperion/bin/plot_embedding_tsne_per_class.py +++ b/hyperion/bin/plot_embedding_tsne_per_class.py @@ -13,15 +13,20 @@ import 
matplotlib.pyplot as plt import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo, - ArgumentParser, namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) from hyperion.hyp_defs import config_logger from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.np.clustering import AHC from hyperion.np.transforms import PCA, LNorm, SklTSNE from hyperion.utils import SegmentSet -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring matplotlib.use("Agg") colors = ["b", "g", "r", "c", "m", "y", "k"] @@ -46,7 +51,6 @@ def plot_embedding_tsne( output_dir, **kwargs, ): - output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) logging.info("loading data") @@ -88,7 +92,7 @@ def plot_embedding_tsne( if do_ahc: if cluster_tsne: # in the low dim space, we cannot use cosine scoring - x2 = np.sum(x_tsne ** 2, axis=1)[:, None] + x2 = np.sum(x_tsne**2, axis=1)[:, None] d2 = x2 - 2 * np.dot(x_tsne, x_tsne.T) + x2.T d2 = np.clip(d2, a_min=0, a_max=None) scores = -np.sqrt(d2) @@ -136,8 +140,7 @@ def plot_embedding_tsne( train_segs.save(output_dir / "segments.csv") -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Projects embeddings using TSNE, " @@ -190,3 +193,7 @@ def plot_embedding_tsne( logging.debug(args) plot_embedding_tsne(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/prepare_data.py b/hyperion/bin/prepare_data.py index 4105f482..dd1bde27 100755 --- a/hyperion/bin/prepare_data.py +++ b/hyperion/bin/prepare_data.py @@ -6,8 +6,12 @@ import logging from pathlib import Path -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from hyperion.data_prep import DataPrep from hyperion.hyp_defs import config_logger @@ -19,7 +23,7 @@ def make_parser(data_prep_class): return parser -if __name__ == "__main__": +def main(): parser = ArgumentParser( description="""Prepares a dataset into relational database tables""" ) @@ -34,6 +38,9 @@ def make_parser(data_prep_class): config_logger(1) data_prep_class = DataPrep.registry[args.subcommand] args = namespace_to_dict(args)[args.subcommand] - data_prep = data_prep_class(**args) data_prep.prepare() + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/preprocess_audio_files.py b/hyperion/bin/preprocess_audio_files.py index 2f4e5cbc..0bdace08 100755 --- a/hyperion/bin/preprocess_audio_files.py +++ b/hyperion/bin/preprocess_audio_files.py @@ -10,8 +10,12 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from scipy import ndimage, signal from hyperion.hyp_defs import config_logger @@ -21,8 +25,17 @@ from hyperion.utils import Utt2Info +def resample_vad(vad, length): + step = (len(vad) - 1) / length + assert step < 1 + idx = step * np.arange(length, dtype=float) + idx = np.round(idx).astype(int) + return vad[idx] + + def process_vad(vad, length, fs, dilation, erosion): - vad = signal.resample(vad, length) > 0.5 + # vad = signal.resample(vad, length) > 0.5 + vad = resample_vad(vad, length) if dilation > 0: iters = int(dilation * fs) vad = 
ndimage.binary_dilation(vad, iterations=iters) @@ -35,115 +48,121 @@ def process_vad(vad, length, fs, dilation, erosion): def process_audio_files( - input_path, + recordings_file, output_path, - output_script, + output_recordings_file, write_time_durs_spec, vad_spec, vad_path_prefix, + output_sampling_rate, vad_fs=100, vad_dilation=0, vad_erosion=0, remove_dc_offset=False, - **kwargs + **kwargs, ): - input_args = AR.filter_args(**kwargs) output_args = Writer.filter_args(**kwargs) - logging.info("input_args={}".format(input_args)) - logging.info("output_args={}".format(output_args)) + logging.info(f"input_args={input_args}") + logging.info(f"output_args={output_args}") if write_time_durs_spec is not None: keys = [] info = [] - with AR(input_path, **input_args) as reader: - with Writer(output_path, output_script, **output_args) as writer: - - if vad_spec is not None: - logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) + with AR(recordings_file, **input_args) as reader, Writer( + output_path, output_recordings_file, **output_args + ) as writer: + if vad_spec is not None: + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) - t1 = time.time() - for data in reader: - key, x, fs = data - logging.info("Processing audio %s" % (key)) - t2 = time.time() - - tot_samples = x.shape[0] - if vad_spec is not None: - num_vad_frames = int(round(tot_samples * vad_fs / fs)) - vad = v_reader.read(key, num_frames=num_vad_frames)[0].astype( - "bool", copy=False - ) - logging.info("vad=%d/%d" % (np.sum(vad == 1), len(vad))) - vad = process_vad(vad, tot_samples, fs, vad_dilation, vad_erosion) - logging.info("vad=%d/%d" % (np.sum(vad == 1), len(vad))) - x = x[vad] - - logging.info( - "utt %s detected %f/%f secs (%.2f %%) speech " - % ( - key[0], - x.shape[0] / fs, - tot_samples / fs, - x.shape[0] / tot_samples * 100, - ) - ) + t1 = time.time() + for data in reader: + key, x, fs = data + logging.info("Processing audio %s", key) + t2 = time.time() - if x.shape[0] > 0: - if remove_dc_offset: - x -= np.mean(x) - - writer.write([key], [x], [fs]) - if write_time_durs_spec is not None: - keys.append(key) - info.append(x.shape[0] / fs) - - xmax = np.max(x) - xmin = np.min(x) - else: - xmax = 0 - xmin = 0 - - t3 = time.time() - dt2 = (t2 - t1) * 1000 - dt3 = (t3 - t1) * 1000 - time_dur = len(x) / fs - rtf = (time_dur * 1000) / dt3 - logging.info( - ( - "Packed audio %s length=%0.3f secs " - "elapsed-time=%.2f ms. " - "read-time=%.2f ms. write-time=%.2f ms. 
" - "real-time-factor=%.2f" - "x-range=[%f-%f]" - ) - % (key, time_dur, dt3, dt2, dt3 - dt2, rtf, xmin, xmax) + tot_samples = x.shape[0] + if vad_spec is not None: + num_vad_frames = int(round(tot_samples * vad_fs / fs)) + vad = v_reader.read(key, num_frames=num_vad_frames)[0].astype( + "bool", copy=False ) - t1 = time.time() + logging.info("vad=%d/%d", np.sum(vad == 1), len(vad)) + vad = process_vad(vad, tot_samples, fs, vad_dilation, vad_erosion) + logging.info("vad=%d/%d", np.sum(vad == 1), len(vad)) + x = x[vad] + + logging.info( + "utt %s detected %f/%f secs (%.2f %%) speech ", + key[0], + x.shape[0] / fs, + tot_samples / fs, + x.shape[0] / tot_samples * 100, + ) + + if x.shape[0] > 0: + if remove_dc_offset: + x -= np.mean(x) + + writer.write([key], [x], [fs]) + if write_time_durs_spec is not None: + keys.append(key) + info.append(x.shape[0] / fs) + + xmax = np.max(x) + xmin = np.min(x) + else: + xmax = 0 + xmin = 0 + + t3 = time.time() + dt2 = (t2 - t1) * 1000 + dt3 = (t3 - t1) * 1000 + time_dur = len(x) / fs + rtf = (time_dur * 1000) / dt3 + logging.info( + ( + "Packed audio %s length=%0.3f secs " + "elapsed-time=%.2f ms. " + "read-time=%.2f ms. write-time=%.2f ms. " + "real-time-factor=%.2f " + "x-range=[%f - %f]" + ), + key, + time_dur, + dt3, + dt2, + dt3 - dt2, + rtf, + xmin, + xmax, + ) + t1 = time.time() if write_time_durs_spec is not None: - logging.info("writing time durations to %s" % (write_time_durs_spec)) + logging.info("writing time durations to %s", write_time_durs_spec) u2td = Utt2Info.create(keys, info) u2td.save(write_time_durs_spec) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Process pipes in wav.scp file, optionally applies vad and save all audios in the same format" ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_path", required=True) + parser.add_argument("--recordings-file", required=True) parser.add_argument("--output-path", required=True) - parser.add_argument("--output-script", required=True) + parser.add_argument("--output-recordings-file", required=True) parser.add_argument("--write-time-durs", dest="write_time_durs_spec", default=None) parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") ) + parser.add_argument( + "--output-sampling-rate", default=None, type=int, help=("resample output audio")) parser.add_argument( "--vad-fs", default=100, type=float, help=("vad sampling frequency") @@ -186,3 +205,7 @@ def process_audio_files( logging.debug(args) process_audio_files(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/split_dataset_into_trials_and_cohort.py b/hyperion/bin/split_dataset_into_trials_and_cohort.py new file mode 100755 index 00000000..50c2f1f2 --- /dev/null +++ b/hyperion/bin/split_dataset_into_trials_and_cohort.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from pathlib import Path + +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.utils import Dataset + + +def main(): + parser = ArgumentParser( + description=( + """Split speakers in dataset into test speaker to create ASV trials and + cohort speakers for S-Norm""" + ) + ) + + 
parser.add_argument("--data-dir", required=True, help="Path to dataset") + parser.add_argument( + "--num-1k-tar-trials", type=int, default=30, help="thousands of target trials" + ) + parser.add_argument( + "--num-trial-speakers", + type=int, + default=1000, + help="number of speakers to create trials", + ) + parser.add_argument( + "--intra-gender", + default=True, + action=ActionYesNo, + help="Whether we create intra gender trials or not", + ) + parser.add_argument("--seed", type=int, default=1123, help="random seed") + parser.add_argument( + "--trials-dir", default=None, help="Path to output trials dataset" + ) + parser.add_argument( + "--cohort-dir", default=None, help="Path to output cohort dataset" + ) + + args = parser.parse_args() + config_logger(1) + data_dir = args.data_dir + cohort_dir = args.cohort_dir + cohort_dir = f"{data_dir}_cohort" if cohort_dir is None else cohort_dir + trials_dir = args.trials_dir + trials_dir = f"{data_dir}_trials" if trials_dir is None else trials_dir + + del args.data_dir + del args.cohort_dir + del args.trials_dir + args = namespace_to_dict(args) + + dataset = Dataset.load(data_dir) + trials_dataset, cohort_dataset = dataset.split_into_trials_and_cohort(**args) + trials_dataset.save(trials_dir) + cohort_dataset.save(cohort_dir) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_qmf.py b/hyperion/bin/train_qmf.py new file mode 100755 index 00000000..42aabe0c --- /dev/null +++ b/hyperion/bin/train_qmf.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + + Trains calibration for SRE18 tel condition +""" + +import logging +import os +import sys +import time +from pathlib import Path + +import numpy as np +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, float_cpu +from hyperion.np.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.utils.trial_key import TrialKey +from hyperion.utils.trial_scores import TrialScores + + +def print_q_stats(scr, q_names): + for k in q_names: + q_vec = scr.q_measures[k][scr.score_mask] + s = f"{k} stats mean={np.mean(q_vec)} min={np.min(q_vec)} max={np.max(q_vec)} median={np.median(q_vec)}" + logging.info(s) + + +def train_qmf( + score_file, key_file, model_file, prior, lambda_reg, quality_measures, verbose +): + logging.info("load key: %s", key_file) + key = TrialKey.load(key_file) + logging.info("load scores: %s", score_file) + scr = TrialScores.load(score_file) + tar, non = scr.get_tar_non(key) + ntar = len(tar) + nnon = len(non) + + if quality_measures is None: + quality_measures = list(scr.q_measures.keys()) + quality_measures.sort() + + print_q_stats(scr, quality_measures) + q_tar, q_non = scr.get_tar_non_q_measures(key, quality_measures) + + min_dcf, p_miss, p_fa = compute_min_dcf(tar, non, prior) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f", + min_dcf, + p_miss * 100, + p_fa * 100, + n_miss, + n_fa, + ) + + logging.info("train calibration") + # tar = np.vstack((tar, maxnf_tar, minnf_tar, maxcohmu_tar, mincohmu_tar)).T + # non = np.vstack((non, maxnf_non, minnf_non, maxcohmu_non, mincohmu_non)).T + tar = np.hstack((tar[:, None], q_tar)) + non = np.hstack((non[:, None], q_non)) + + x = np.vstack((tar, non)) + y = 
np.concatenate( + (np.ones((ntar,), dtype="int32"), np.zeros((nnon,), dtype="int32")) + ) + lr = LR( + prior=prior, + lambda_reg=lambda_reg, + bias_scaling=1, + solver="liblinear", + verbose=verbose, + ) + lr.fit(x, y) + logging.info(f"A={lr.A} b={lr.b}") + logging.info("save calibration at %s", model_file) + lr.save(model_file) + + logging.info("calibrate scores") + tar_cal = lr.predict(tar) + non_cal = lr.predict(non) + act_dcf, p_miss, p_fa = compute_act_dcf(tar_cal, non_cal, prior) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f", + act_dcf, + p_miss * 100, + p_fa * 100, + n_miss, + n_fa, + ) + + score_file = Path(score_file) + output_file = score_file.with_suffix(f".qmf{score_file.suffix}") + scr_out = TrialScores(key.model_set, key.seg_set) + scr_out.scores[key.tar] = tar_cal + scr_out.scores[key.non] = non_cal + scr_out.score_mask = np.logical_or(key.tar, key.non) + scr_out.save(output_file) + + +def main(): + parser = ArgumentParser(description="Trains QMF calibration") + + parser.add_argument("--score-file", required=True) + parser.add_argument("--key-file", required=True) + parser.add_argument("--model-file", required=True) + parser.add_argument("--prior", type=float, default=0.01) + parser.add_argument("--lambda-reg", type=float, default=1e-5) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + parser.add_argument( + "--quality-measures", + default=None, + nargs="+", + choices=["snorm-mu/s", "snorm-mu", "speech_duration", "num_speech_frames"], + ) + + args = parser.parse_args() + config_logger(args.verbose) + logging.debug(args) + + train_qmf(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2rnn_transducer.py b/hyperion/bin/train_wav2rnn_transducer.py index 26fcf72c..c00c4633 100755 --- a/hyperion/bin/train_wav2rnn_transducer.py +++ b/hyperion/bin/train_wav2rnn_transducer.py @@ -12,18 +12,22 @@ import k2 import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) +from torch.nn.utils.rnn import pad_sequence + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory from hyperion.torch.models import Wav2RNNRNNTransducer from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp -from torch.nn.utils.rnn import pad_sequence model_dict = { "rnn_rnn_transducer": Wav2RNNRNNTransducer, @@ -73,14 +77,12 @@ def init_data(partition, rank, num_gpus, **kwargs): num_workers = data_kwargs["data_loader"]["num_workers"] num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = ({ - "num_workers": num_workers_per_gpu, - "pin_memory": True - } if num_gpus > 0 else {}) - data_loader = torch.utils.data.DataLoader(dataset, - batch_sampler=sampler, - **largs, - collate_fn=transducer_collate) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader( + dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate + ) return data_loader @@ -98,7 +100,6 @@ def init_model(blank_id, vocab_size, rank, model_class, **kwargs): def train_model(gpu_id, args): - 
config_logger(args.verbose) del args.verbose logging.debug(args) @@ -106,8 +107,8 @@ def train_model(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) set_float_cpu("float32") - #torch.backends.cudnn.deterministic = True - #torch.backends.cudnn.benchmark = False + # torch.backends.cudnn.deterministic = True + # torch.backends.cudnn.benchmark = False torch.backends.cudnn.enabled = False ddp_args = ddp.filter_ddp_args(**kwargs) @@ -116,8 +117,11 @@ def train_model(gpu_id, args): train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) - model = init_model(train_loader.dataset.sp.piece_to_id(""), - train_loader.dataset.sp.get_piece_size(), **kwargs) + model = init_model( + train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), + **kwargs, + ) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: @@ -160,8 +164,7 @@ def make_parser(model_class): help="num_workers of data loader", ) data_parser = ArgumentParser(prog="") - data_parser.add_argument("--train", - action=ActionParser(parser=train_parser)) + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) parser.add_argument("--data", action=ActionParser(parser=data_parser)) @@ -177,34 +180,27 @@ def make_parser(model_class): type=str, ) - parser.link_arguments("data.train.data_loader.num_workers", - "data.val.data_loader.num_workers") + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) - parser.link_arguments("data.train.dataset.bpe_model", - "data.val.dataset.bpe_model") + parser.link_arguments("data.train.dataset.bpe_model", "data.val.dataset.bpe_model") model_class.add_class_args(parser, prefix="model") - Trainer.add_class_args(parser, - prefix="trainer", - train_modes=model_class.valid_train_modes()) + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) ddp.add_ddp_args(parser) - parser.add_argument("--seed", - type=int, - default=1123581321, - help="random seed") - parser.add_argument("-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) return parser -if __name__ == "__main__": - parser = ArgumentParser( - description="Train RNN Transducer model from audio files") +def main(): + parser = ArgumentParser(description="Train RNN Transducer model from audio files") parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() @@ -233,3 +229,7 @@ def make_parser(model_class): # torch docs recommend using forkserver # multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2vec2languageid.py b/hyperion/bin/train_wav2vec2languageid.py new file mode 100755 index 00000000..680ddd61 --- /dev/null +++ b/hyperion/bin/train_wav2vec2languageid.py @@ -0,0 +1,289 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import sys +import os +from pathlib import Path +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import 
logging +import multiprocessing + +import numpy as np + +import torch +import torch.nn as nn + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.utils import ddp + +from hyperion.torch.trainers import LanguageIDTrainer as Trainer +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import HFWav2Vec2ResNet1dLanguageID +from torch.nn.utils.rnn import pad_sequence + +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + + +model_dict = { + "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dLanguageID, + # "hf_hubert2resnet1d": HFHubert2ResNet1LanguageID, + # "hf_wavlm2resnet1d": HFWavLM2ResNet1dLanguageID, +} + + +def Language_collate(batch): + audio = [] + audio_length = [] + language = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + language.append(record["language"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + + language = [language[k] for k in sort_idx] + language = torch.as_tensor(language) + + batch = { + "x": audio, + "x_lengths": audio_length, + "language": language, + } + return batch + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ({ + "num_workers": num_workers_per_gpu, + "pin_memory": True + } if num_gpus > 0 else {}) + data_loader = torch.utils.data.DataLoader(dataset, + batch_sampler=sampler, + **largs) + # , + # collate_fn=Language_collate) + return data_loader + + +def init_model(num_classes, rank, model_class, **kwargs): + model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network args={}".format(model_args)) + model_args["languageid"]["num_classes"] = num_classes + model = model_class(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + + +def init_hard_prototype_mining(model, train_loader, val_loader, rank): + if not train_loader.batch_sampler.hard_prototype_mining: + return + + if rank == 0: + logging.info("setting hard prototypes") + + affinity_matrix = model.compute_prototype_affinity() + train_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + if not val_loader.batch_sampler.hard_prototype_mining: + return + + val_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = 
namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 + + # import pdb; pdb.set_trace() + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + + model = init_model(list(train_loader.dataset.num_classes.values())[0], **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + # import pdb; pdb.set_trace() + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + loss_weight=train_loader.batch_sampler.class_info["weights"], + **trn_args, + ) + trainer.load_last_checkpoint() + init_hard_prototype_mining(trainer.model, train_loader, val_loader, rank) + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", + action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + parser.add_argument("--data.val.dataset.text_file", type=str) + + + parser.link_arguments( + "data.train.dataset.class_files", "data.val.dataset.class_files" + ) + parser.add_argument( + "--data.train.dataset.class_names", + type=str, + ) + + parser.add_argument( + "--data.dev.dataset.class_names", + type=str, + ) + + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments("data.train.data_loader.num_workers", + "data.val.data_loader.num_workers") + + model_class.add_class_args(parser, prefix="model") + Trainer.add_class_args(parser, + prefix="trainer", + train_modes=model_class.valid_train_modes()) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", + type=int, + default=1123581321, + help="random seed") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser( + description="Train Wav2Vec2Language model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + 
config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/bin/train_wav2vec2rnn_film_transducer.py b/hyperion/bin/train_wav2vec2rnn_film_transducer.py new file mode 100755 index 00000000..2306b467 --- /dev/null +++ b/hyperion/bin/train_wav2vec2rnn_film_transducer.py @@ -0,0 +1,280 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import k2 +import numpy as np +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, + HFWav2Vec2RNNFiLMTransducer) +from hyperion.torch.trainers import TransducerLanguageIDTrainer as Trainer +from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from torch.nn.utils.rnn import pad_sequence + + +model_dict = { + "hf_wav2vec2rnn_transducer": HFWav2Vec2RNNTransducer, + "hf_wav2vec2rnn_filmed_transducer": HFWav2Vec2RNNFiLMTransducer, + "hf_wav2vec2rnn_rnn_transducer": HFWav2Vec2RNNRNNTransducer, + # "hf_hubert2rnn_transducer": HFWav2Vec2RNNTransducer, + # "hf_hubert2rnn_rnn_transducer": Hubert2RNNRNNTransducer, + # "hf_wavlm2rnn_transducer": HFHubert2RNNTransducer, + # "hf_wavlm2rnn_rnn_transducer": HFWavLM2RNNRNNTransducer, +} + + +def transducer_collate(batch): + audio = [] + audio_length = [] + target = [] + language = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record["text"]) + language.append(record["language"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + target = [target[k] for k in sort_idx] + target = k2.RaggedTensor(target) + + language = [language[k] for k in sort_idx] + language = torch.as_tensor(language) + + # FiLM: add language ID to the input + batch = { + "x": audio, + "x_lengths": audio_length, + "text": target, + "language": language, + } + return batch + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = 
data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ({ + "num_workers": num_workers_per_gpu, + "pin_memory": True + } if num_gpus > 0 else {}) + data_loader = torch.utils.data.DataLoader(dataset, + batch_sampler=sampler, + **largs, + collate_fn=transducer_collate) + return data_loader + + +# def init_model_from_transducer(in_model_file, rank, model_class, **kwargs): +# model_args = model_class.filter_finetune_args(**kwargs["model"]) +# # model_args = model_class.filter_args(**kwargs["model"]) +# if rank == 0: +# logging.info("model network ft args={}".format(model_args)) +# model = TML.load(in_model_file) +# model.change_config(**model_args) +# if rank == 0: +# logging.info("model={}".format(model)) +# return model + + +def init_model(blank_id, vocab_size, rank, model_class, **kwargs): + model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network args={}".format(model_args)) + # TODO: check model_args + model_args["transducer"]["decoder"]["blank_id"] = blank_id + model_args["transducer"]["decoder"]["vocab_size"] = vocab_size + model = model_class(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + #torch.backends.cudnn.deterministic = True + #torch.backends.cudnn.benchmark = False + # torch.backends.cudnn.enabled = False + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = torch.device("cuda:{}".format(gpu_id)) + # world_size=1 + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + # model = init_model_from_transducer(**kwargs) + model = init_model(train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {} #{"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + # import pdb; pdb.set_trace() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", + action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + parser.add_argument( + 
"--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments("data.train.data_loader.num_workers", + "data.val.data_loader.num_workers") + + parser.link_arguments("data.train.dataset.bpe_model", + "data.val.dataset.bpe_model") + + # parser.add_argument("--in-model-file", required=True) + model_class.add_class_args(parser, prefix="model") + + Trainer.add_class_args(parser, + prefix="trainer", + train_modes=model_class.valid_train_modes()) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", + type=int, + default=1123581321, + help="random seed") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser( + description="Train Wav2Vec2Transducer model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/bin/train_wav2vec2rnn_film_transducer_languageid.py b/hyperion/bin/train_wav2vec2rnn_film_transducer_languageid.py new file mode 100755 index 00000000..d5a6ad6f --- /dev/null +++ b/hyperion/bin/train_wav2vec2rnn_film_transducer_languageid.py @@ -0,0 +1,274 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import k2 +import numpy as np +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, + HFWav2Vec2RNNFiLMTransducer, + HFWav2Vec2RNNTransducerResnet1D, + HFWav2Vec2RNNFiLMTransducerResnet1D) +from hyperion.torch.trainers import TransducerLanguageIDTrainer as Trainer +from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from torch.nn.utils.rnn import pad_sequence + +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + + +model_dict = { + "hf_wav2vec2rnn_film_transducer_resnet1d": HFWav2Vec2RNNFiLMTransducerResnet1D, +} + + +def transducer_language_collate(batch): + audio = [] + audio_length = [] + target = [] + language = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record["text"]) + 
language.append(record["language"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + target = [target[k] for k in sort_idx] + target = k2.RaggedTensor(target) + + language = [language[k] for k in sort_idx] + language = torch.as_tensor(language) + + # FiLM: add language ID to the input + batch = { + "x": audio, + "x_lengths": audio_length, + "text": target, + "language": language, + } + return batch + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ({ + "num_workers": num_workers_per_gpu, + "pin_memory": True + } if num_gpus > 0 else {}) + data_loader = torch.utils.data.DataLoader(dataset, + batch_sampler=sampler, + **largs, + collate_fn=transducer_language_collate) + return data_loader + +def init_model(blank_id, vocab_size, num_classes, loss_class_weight, rank, model_class, **kwargs): + model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network args={}".format(model_args)) + # TODO: check model_args + model_args["transducer"]["decoder"]["blank_id"] = blank_id + model_args["transducer"]["decoder"]["vocab_size"] = vocab_size + model_args["languageid"]["num_classes"] = num_classes + model_args["loss_class_weight"] = loss_class_weight + model = model_class(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + #torch.backends.cudnn.deterministic = True + #torch.backends.cudnn.benchmark = False + # torch.backends.cudnn.enabled = False + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = torch.device("cuda:{}".format(gpu_id)) + # world_size=1 + + # import pdb; pdb.set_trace() + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + # model = init_model_from_transducer(**kwargs) + model = init_model(train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), + list(train_loader.dataset.num_classes.values())[0], + train_loader.batch_sampler.class_info["weights"], + **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + 
**trn_args, + ) + trainer.load_last_checkpoint() + # import pdb; pdb.set_trace() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", + action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments("data.train.data_loader.num_workers", + "data.val.data_loader.num_workers") + + parser.link_arguments("data.train.dataset.bpe_model", + "data.val.dataset.bpe_model") + + # parser.add_argument("--in-model-file", required=True) + model_class.add_class_args(parser, prefix="model") + + Trainer.add_class_args(parser, + prefix="trainer", + train_modes=model_class.valid_train_modes()) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", + type=int, + default=1123581321, + help="random seed") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser( + description="Train Wav2Vec2Transducer model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/bin/train_wav2vec2rnn_transducer.py b/hyperion/bin/train_wav2vec2rnn_transducer.py index 5daffb6d..c01c17cd 100755 --- a/hyperion/bin/train_wav2vec2rnn_transducer.py +++ b/hyperion/bin/train_wav2vec2rnn_transducer.py @@ -12,26 +12,31 @@ import k2 import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) +from torch.nn.utils.rnn import pad_sequence + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import 
SegSamplerFactory -from hyperion.torch.models import (HFWav2Vec2ConformerV1RNNTransducer, - HFWav2Vec2RNNRNNTransducer, - HFWav2Vec2RNNTransducer) +from hyperion.torch.models import ( + HFWav2Vec2ConformerV1RNNTransducer, + HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, +) from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp -from torch.nn.utils.rnn import pad_sequence model_dict = { "hf_wav2vec2rnn_transducer": HFWav2Vec2RNNTransducer, "hf_wav2vec2rnn_rnn_transducer": HFWav2Vec2RNNRNNTransducer, - "hf_wav2vec2conformer_v1_rnn_transducer": - HFWav2Vec2ConformerV1RNNTransducer, + "hf_wav2vec2conformer_v1_rnn_transducer": HFWav2Vec2ConformerV1RNNTransducer, # "hf_hubert2rnn_transducer": HFWav2Vec2RNNTransducer, # "hf_hubert2rnn_rnn_transducer": Hubert2RNNRNNTransducer, # "hf_wavlm2rnn_transducer": HFHubert2RNNTransducer, @@ -89,14 +94,12 @@ def init_data(partition, rank, num_gpus, **kwargs): num_workers = data_kwargs["data_loader"]["num_workers"] num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = ({ - "num_workers": num_workers_per_gpu, - "pin_memory": True - } if num_gpus > 0 else {}) - data_loader = torch.utils.data.DataLoader(dataset, - batch_sampler=sampler, - **largs, - collate_fn=transducer_collate) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader( + dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate + ) return data_loader @@ -114,7 +117,6 @@ def init_model(blank_id, vocab_size, rank, model_class, **kwargs): def train_model(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -124,7 +126,7 @@ def train_model(gpu_id, args): set_float_cpu("float32") #torch.backends.cudnn.deterministic = True #torch.backends.cudnn.benchmark = False - torch.backends.cudnn.enabled = False + # torch.backends.cudnn.enabled = False ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) @@ -138,13 +140,16 @@ def train_model(gpu_id, args): train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) - model = init_model(train_loader.dataset.sp.piece_to_id(""), - train_loader.dataset.sp.get_piece_size(), **kwargs) + model = init_model( + train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), + **kwargs, + ) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: logging.info("trainer args={}".format(trn_args)) - metrics = {} #{"acc": CategoricalAccuracy()} + metrics = {} # {"acc": CategoricalAccuracy()} trainer = Trainer( model, device=device, @@ -182,8 +187,7 @@ def make_parser(model_class): help="num_workers of data loader", ) data_parser = ArgumentParser(prog="") - data_parser.add_argument("--train", - action=ActionParser(parser=train_parser)) + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) parser.add_argument("--data", action=ActionParser(parser=data_parser)) @@ -199,34 +203,29 @@ def make_parser(model_class): type=str, ) - parser.link_arguments("data.train.data_loader.num_workers", - "data.val.data_loader.num_workers") + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) - parser.link_arguments("data.train.dataset.bpe_model", - "data.val.dataset.bpe_model") + 
parser.link_arguments("data.train.dataset.bpe_model", "data.val.dataset.bpe_model") model_class.add_class_args(parser, prefix="model") - Trainer.add_class_args(parser, - prefix="trainer", - train_modes=model_class.valid_train_modes()) + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) ddp.add_ddp_args(parser) - parser.add_argument("--seed", - type=int, - default=1123581321, - help="random seed") - parser.add_argument("-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) return parser -if __name__ == "__main__": +def main(): parser = ArgumentParser( - description="Train Wav2Vec2Transducer model from audio files") + description="Train Wav2Vec2Transducer model from audio files" + ) parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() @@ -253,5 +252,9 @@ def make_parser(model_class): args_sc.model_class = model_dict[model_type] # torch docs recommend using forkserver - # multiprocessing.set_start_method("forkserver") + multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2vec2rnn_transducer_languageid.py b/hyperion/bin/train_wav2vec2rnn_transducer_languageid.py new file mode 100755 index 00000000..bafe8f66 --- /dev/null +++ b/hyperion/bin/train_wav2vec2rnn_transducer_languageid.py @@ -0,0 +1,272 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import k2 +import numpy as np +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, + HFWav2Vec2RNNTransducerResnet1D) +from hyperion.torch.trainers import TransducerLanguageIDTrainer as Trainer +from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from torch.nn.utils.rnn import pad_sequence + +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + + +model_dict = { + "hf_wav2vec2rnn_transducer_resnet1d": HFWav2Vec2RNNTransducerResnet1D, +} + + +def transducer_language_collate(batch): + audio = [] + audio_length = [] + target = [] + language = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record["text"]) + language.append(record["language"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + target = [target[k] for k in sort_idx] + target = k2.RaggedTensor(target) + + language = [language[k] for k in sort_idx] + language = torch.as_tensor(language) + + # FiLM: add 
language ID to the input + batch = { + "x": audio, + "x_lengths": audio_length, + "text": target, + "language": language, + } + return batch + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ({ + "num_workers": num_workers_per_gpu, + "pin_memory": True + } if num_gpus > 0 else {}) + data_loader = torch.utils.data.DataLoader(dataset, + batch_sampler=sampler, + **largs, + collate_fn=transducer_language_collate) + return data_loader + +def init_model(blank_id, vocab_size, num_classes, loss_class_weight, rank, model_class, **kwargs): + model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network args={}".format(model_args)) + # TODO: check model_args + model_args["transducer"]["decoder"]["blank_id"] = blank_id + model_args["transducer"]["decoder"]["vocab_size"] = vocab_size + model_args["languageid"]["num_classes"] = num_classes + model_args["loss_class_weight"] = loss_class_weight + model = model_class(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + #torch.backends.cudnn.deterministic = True + #torch.backends.cudnn.benchmark = False + # torch.backends.cudnn.enabled = False + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = torch.device("cuda:{}".format(gpu_id)) + # world_size=1 + + # import pdb; pdb.set_trace() + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + # model = init_model_from_transducer(**kwargs) + model = init_model(train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), + list(train_loader.dataset.num_classes.values())[0], + train_loader.batch_sampler.class_info["weights"], + **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + # import pdb; pdb.set_trace() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + 
train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", + action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments("data.train.data_loader.num_workers", + "data.val.data_loader.num_workers") + + parser.link_arguments("data.train.dataset.bpe_model", + "data.val.dataset.bpe_model") + + # parser.add_argument("--in-model-file", required=True) + model_class.add_class_args(parser, prefix="model") + + Trainer.add_class_args(parser, + prefix="trainer", + train_modes=model_class.valid_train_modes()) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", + type=int, + default=1123581321, + help="random seed") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser( + description="Train Wav2Vec2Transducer model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py index ce53be86..8c9cb8de 100755 --- a/hyperion/bin/train_wav2vec2transducer.py +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -12,11 +12,16 @@ import k2 import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) +from torch.nn.utils.rnn import pad_sequence + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory @@ -24,10 +29,10 @@ from hyperion.torch.models import HFWav2Vec2Transducer from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp -from torch.nn.utils.rnn import pad_sequence model_dict = { "hf_wav2vec2transducer": HFWav2Vec2Transducer, + "hf_wav2vec2rnn_transducer": HFWav2Vec2Transducer, } @@ -52,9 +57,12 @@ def transducer_collate(batch): def init_data(partition, rank, num_gpus, 
**kwargs): + logging.getLogger().setLevel(logging.INFO) data_kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**data_kwargs["dataset"]) sampler_args = data_kwargs["sampler"] + logging.info("rank={}".format(rank)) + logging.info("{} audio dataset args={}".format(partition, ad_args)) if rank == 0: logging.info("{} audio dataset args={}".format(partition, ad_args)) logging.info("{} sampler args={}".format(partition, sampler_args)) @@ -74,18 +82,17 @@ def init_data(partition, rank, num_gpus, **kwargs): num_workers = data_kwargs["data_loader"]["num_workers"] num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = ({ - "num_workers": num_workers_per_gpu, - "pin_memory": True - } if num_gpus > 0 else {}) - data_loader = torch.utils.data.DataLoader(dataset, - batch_sampler=sampler, - **largs, - collate_fn=transducer_collate) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader( + dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate + ) return data_loader def init_model(blank_id, vocab_size, rank, model_class, **kwargs): + logging.getLogger().setLevel(logging.INFO) model_args = model_class.filter_args(**kwargs["model"]) if rank == 0: logging.info("model network args={}".format(model_args)) @@ -99,7 +106,6 @@ def init_model(blank_id, vocab_size, rank, model_class, **kwargs): def train_model(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -107,9 +113,6 @@ def train_model(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) set_float_cpu("float32") - #torch.backends.cudnn.deterministic = True - #torch.backends.cudnn.benchmark = False - torch.backends.cudnn.enabled = False ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) @@ -123,13 +126,18 @@ def train_model(gpu_id, args): train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) - model = init_model(train_loader.dataset.sp.piece_to_id(""), - train_loader.dataset.sp.get_piece_size(), **kwargs) + model = init_model( + train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), + **kwargs, + ) trn_args = Trainer.filter_args(**kwargs["trainer"]) + logging.info("trainer args={}".format(trn_args)) + logging.info("rank={}".format(rank)) if rank == 0: logging.info("trainer args={}".format(trn_args)) - metrics = {} #{"acc": CategoricalAccuracy()} + metrics = {} # {"acc": CategoricalAccuracy()} trainer = Trainer( model, device=device, @@ -167,8 +175,7 @@ def make_parser(model_class): help="num_workers of data loader", ) data_parser = ArgumentParser(prog="") - data_parser.add_argument("--train", - action=ActionParser(parser=train_parser)) + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) parser.add_argument("--data", action=ActionParser(parser=data_parser)) @@ -184,38 +191,34 @@ def make_parser(model_class): type=str, ) - parser.link_arguments("data.train.data_loader.num_workers", - "data.val.data_loader.num_workers") + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) - parser.link_arguments("data.train.dataset.bpe_model", - "data.val.dataset.bpe_model") + parser.link_arguments("data.train.dataset.bpe_model", "data.val.dataset.bpe_model") model_class.add_class_args(parser, prefix="model") - 
Trainer.add_class_args(parser, - prefix="trainer", - train_modes=model_class.valid_train_modes()) + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) ddp.add_ddp_args(parser) - parser.add_argument("--seed", - type=int, - default=1123581321, - help="random seed") - parser.add_argument("-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) return parser -if __name__ == "__main__": +def main(): parser = ArgumentParser( - description="Train Wav2Vec2Transducer model from audio files") + description="Train Wav2Vec2Transducer model from audio files" + ) parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() - + print("cuda available:", torch.cuda.is_available()) + logging.info("cuda available: {}".format(torch.cuda.is_available())) for k, v in model_dict.items(): parser_k = make_parser(v) subcommands.add_subcommand(k, parser_k) @@ -240,3 +243,7 @@ def make_parser(model_class): # torch docs recommend using forkserver # multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py index 5e7ecafa..e25c2d88 100755 --- a/hyperion/bin/train_wav2vec2xvector.py +++ b/hyperion/bin/train_wav2vec2xvector.py @@ -5,27 +5,38 @@ """ import logging import multiprocessing + # import sys import os import time from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory from hyperion.torch.metrics import CategoricalAccuracy -from hyperion.torch.models import (HFHubert2ResNet1dXVector, - HFWav2Vec2ResNet1dXVector, - HFWavLM2ResNet1dXVector) +from hyperion.torch.models import ( + HFHubert2ResNet1dXVector, + HFWav2Vec2ResNet1dXVector, + HFWavLM2ResNet1dXVector, +) from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, "hf_hubert2resnet1d": HFHubert2ResNet1dXVector, @@ -34,7 +45,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -77,7 +87,6 @@ def init_model(num_classes, rank, model_class, **kwargs): def train_model(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -96,10 +105,14 @@ def train_model(gpu_id, args): trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: - logging.info("trainer args={}".format(trn_args)) + logging.info(f"trainer args={trn_args}") metrics = {"acc": CategoricalAccuracy()} trainer = Trainer( - model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args, + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, ) trainer.load_last_checkpoint() 
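+    # resume from the last saved checkpoint, if one exists, before fitting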
trainer.fit(train_loader, val_loader) @@ -156,8 +169,7 @@ def make_parser(model_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Train Wav2Vec2XVector model from audio files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -187,3 +199,7 @@ def make_parser(model_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2xvector.py b/hyperion/bin/train_wav2xvector.py new file mode 100755 index 00000000..7373a338 --- /dev/null +++ b/hyperion/bin/train_wav2xvector.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +from pathlib import Path + +import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy + +# from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import Wav2ResNet1dXVector as R1dXVec +from hyperion.torch.models import Wav2ResNetXVector as RXVec + +# from hyperion.torch.models import SpineNetXVector as SpineXVec +# from hyperion.torch.models import TDNNXVector as TDXVec +# from hyperion.torch.models import TransformerXVectorV1 as TFXVec +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.utils import ddp + +xvec_dict = { + "resnet": RXVec, + "resnet1d": R1dXVec, + # "efficientnet": EXVec, + # "tdnn": TDXVec, + # "transformer": TFXVec, + # "spinenet": SpineXVec, +} + + +def init_data(partition, rank, num_gpus, **kwargs): + kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**kwargs["dataset"]) + sampler_args = kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +def init_xvector(num_classes, rank, xvec_class, **kwargs): + xvec_args = xvec_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("xvector network args={}".format(xvec_args)) + xvec_args["xvector"]["num_classes"] = num_classes + model = xvec_class(**xvec_args) + if rank == 0: + logging.info("x-vector-model={}".format(model)) + return model + + +def train_xvec(gpu_id, args): + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = 
ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + + model = init_xvector(list(train_loader.dataset.num_classes.values())[0], **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(xvec_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + + train_parser = ArgumentParser(prog="") + + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + parser.link_arguments( + "data.train.dataset.class_files", "data.val.dataset.class_files" + ) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + xvec_class.add_class_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=xvec_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +def main(): + parser = ArgumentParser(description="Train Wav2XVector from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for k, v in xvec_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + xvec_type = args.subcommand + args_sc = vars(args)[xvec_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.xvec_class = xvec_dict[xvec_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_xvector_from_feats.py b/hyperion/bin/train_xvector_from_feats.py index 7f4ab0fa..a2acdf4c 100755 --- a/hyperion/bin/train_xvector_from_feats.py +++ b/hyperion/bin/train_xvector_from_feats.py @@ -11,11 +11,15 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn +from jsonargparse import ( + 
ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import ClassWeightedSeqSampler as Sampler from hyperion.torch.data import FeatSeqDataset as SD @@ -40,7 +44,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] sd_args = SD.filter_args(**kwargs["dataset"]) sampler_args = Sampler.filter_args(**kwargs["sampler"]) @@ -81,7 +84,6 @@ def init_xvector(num_classes, rank, xvec_class, **kwargs): def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -165,8 +167,7 @@ def make_parser(xvec_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Train XVector from audio files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -197,3 +198,7 @@ def make_parser(xvec_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_xvector_from_wav.py b/hyperion/bin/train_xvector_from_wav.py index 5c999dd1..c3f6170d 100755 --- a/hyperion/bin/train_xvector_from_wav.py +++ b/hyperion/bin/train_xvector_from_wav.py @@ -6,14 +6,16 @@ import logging import multiprocessing import os -import sys -import time from pathlib import Path -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory @@ -39,7 +41,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -93,7 +94,6 @@ def init_xvector(num_classes, rank, xvec_class, **kwargs): def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -179,8 +179,7 @@ def make_parser(xvec_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Train XVector from audio files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -209,3 +208,7 @@ def make_parser(xvec_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/data_prep/__init__.py b/hyperion/data_prep/__init__.py index 7caae8c4..9d885718 100644 --- a/hyperion/data_prep/__init__.py +++ b/hyperion/data_prep/__init__.py @@ -3,6 +3,9 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -# from .data_prep import data_prep_registry from .data_prep import DataPrep +from .musan import MusanDataPrep +from .rirs import RIRSDataPrep from .voxceleb2 import VoxCeleb2DataPrep +from .voxceleb1 import VoxCeleb1DataPrep +from .voxsrc22 import VoxSRC22DataPrep diff --git a/hyperion/data_prep/data_prep.py b/hyperion/data_prep/data_prep.py index d9f6b238..0f654676 100644 --- a/hyperion/data_prep/data_prep.py +++ b/hyperion/data_prep/data_prep.py @@ -2,6 +2,7 @@ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging from concurrent.futures import ThreadPoolExecutor from pathlib import Path @@ -49,12 +50,12 @@ def 
dataset_name(): raise NotImplementedError() @staticmethod - def _get_recording_duration(scp, i, n): + def _get_recording_duration(recordings, i, n): from ..io import SequentialAudioReader as AR durations = [] fss = [] - with AR(scp, part_idx=i, num_parts=n) as reader: + with AR(recordings, part_idx=i + 1, num_parts=n) as reader: for data in reader: key, x, fs = data duration = x.shape[0] / fs @@ -67,20 +68,21 @@ def get_recording_duration(self, recording_set): import itertools - from ..utils import SCPList + # from ..utils import SCPList #don't remember why I put this here - scp = SCPList(recording_set["id"].values, recording_set["storage_path"].values) futures = [] + logging.info("submitting threats...") with ThreadPoolExecutor(max_workers=self.num_threads) as pool: - for i in range(self.num_threads): + for i in tqdm(range(self.num_threads)): future = pool.submit( - DataPrep._get_recording_duration, scp, i, self.num_threads + DataPrep._get_recording_duration, recording_set, i, self.num_threads ) futures.append(future) + logging.info("waiting threats...") res = [f.result() for f in tqdm(futures)] fss = list(itertools.chain(*[r[0] for r in res])) - durations = list(itertools.chain(*[r[0] for r in res])) + durations = list(itertools.chain(*[r[1] for r in res])) recording_set["duration"] = durations recording_set["sample_freq"] = fss diff --git a/hyperion/data_prep/musan.py b/hyperion/data_prep/musan.py new file mode 100644 index 00000000..abf7a46c --- /dev/null +++ b/hyperion/data_prep/musan.py @@ -0,0 +1,107 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import glob +import re +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path + +import numpy as np +import pandas as pd +from jsonargparse import ActionYesNo +from tqdm import tqdm + +from ..utils import Dataset, RecordingSet, SegmentSet +from ..utils.misc import PathLike, urlretrieve_progress +from .data_prep import DataPrep + + +class MusanDataPrep(DataPrep): + """Class for preparing Musan database into tables + + Attributes: + corpus_dir: input data directory + subset: subset of the data noise, music, speech + output_dir: output data directory + target_sample_freq: target sampling frequency to convert the audios to. + """ + + def __init__( + self, + corpus_dir: PathLike, + subset: str, + output_dir: PathLike, + target_sample_freq: int, + num_threads: int = 10, + **kwargs, + ): + super().__init__(corpus_dir, output_dir, False, target_sample_freq, num_threads) + self.subset = subset + + @staticmethod + def dataset_name(): + return "musan" + + @staticmethod + def add_class_args(parser): + DataPrep.add_class_args(parser) + parser.add_argument( + "--subset", + choices=["noise", "music", "speech"], + help="""musan subset in [noise, music, speech]""", + required=True, + ) + + def prepare(self): + logging.info( + "Peparing Musan %s corpus_dir:%s -> data_dir:%s", + self.subset, + self.corpus_dir, + self.output_dir, + ) + rec_dir = self.corpus_dir / self.subset + logging.info("searching audio files in %s", str(rec_dir)) + rec_files = list(rec_dir.glob("**/*.wav")) + if not rec_files: + # symlinks? 
try glob + rec_files = [ + Path(f) for f in glob.iglob(f"{rec_dir}/**/*.wav", recursive=True) + ] + + assert len(rec_files) > 0, "recording files not found" + + rec_ids = [f.with_suffix("").name for f in rec_files] + storage_paths = [str(f) for f in rec_files] + logging.info("making RecordingSet") + recs = pd.DataFrame({"id": rec_ids, "storage_path": storage_paths}) + recs = RecordingSet(recs) + recs.sort() + + logging.info("getting recording durations") + self.get_recording_duration(recs) + if self.target_sample_freq: + recs["target_sample_freq"] = self.target_sample_freq + + logging.info("making SegmentsSet") + segments = pd.DataFrame( + { + "id": rec_ids, + "duration": recs.loc[rec_ids, "duration"].values, + "noise_type": self.subset, + } + ) + segments = SegmentSet(segments) + segments.sort() + logging.info("making dataset") + dataset = Dataset( + segments, + recordings=recs, + ) + logging.info("saving dataset at %s", self.output_dir) + dataset.save(self.output_dir) + logging.info( + "datasets containts %d segments", + len(segments), + ) diff --git a/hyperion/data_prep/rirs.py b/hyperion/data_prep/rirs.py new file mode 100644 index 00000000..066819a8 --- /dev/null +++ b/hyperion/data_prep/rirs.py @@ -0,0 +1,103 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import glob +import re +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path + +import numpy as np +import pandas as pd +from jsonargparse import ActionYesNo +from tqdm import tqdm + +from ..utils import Dataset, RecordingSet, SegmentSet +from ..utils.misc import PathLike, urlretrieve_progress +from .data_prep import DataPrep + + +class RIRSDataPrep(DataPrep): + """Class for preparing Musan database into tables + + Attributes: + corpus_dir: input data directory + output_dir: output data directory + target_sample_freq: target sampling frequency to convert the audios to. + """ + + def __init__( + self, + corpus_dir: PathLike, + output_dir: PathLike, + target_sample_freq: int, + num_threads: int = 10, + **kwargs, + ): + super().__init__(corpus_dir, output_dir, False, target_sample_freq, num_threads) + + @staticmethod + def dataset_name(): + return "rirs" + + @staticmethod + def add_class_args(parser): + DataPrep.add_class_args(parser) + + def prepare(self): + logging.info( + "Peparing RIRS corpus_dir:%s -> data_dir:%s", + self.corpus_dir, + self.output_dir, + ) + rec_dir = self.corpus_dir + rirs_file = self.corpus_dir / "rir_list" + if rirs_file.exists(): + rirs_table = pd.read_csv( + rirs_file, + sep=" ", + header=None, + names=["dummy1", "rir_id", "dummy2", "room_id", "rec_files"], + ) + rec_files = [Path(f) for f in rirs_table["rec_files"].values] + room_ids = rirs_table["room_id"].values + else: + logging.info("searching audio files in %s", str(rec_dir)) + rec_files = list(rec_dir.glob("**/*.wav")) + room_ids = None + if not rec_files: + # symlinks? 
try glob + rec_files = [ + Path(f) for f in glob.iglob(f"{rec_dir}/**/*.wav", recursive=True) + ] + + assert len(rec_files) > 0, "recording files not found" + + rec_ids = [f.with_suffix("").name for f in rec_files] + storage_paths = [str(f) for f in rec_files] + logging.info("making RecordingSet") + recs = pd.DataFrame({"id": rec_ids, "storage_path": storage_paths}) + recs = RecordingSet(recs) + recs.sort() + + logging.info("getting recording durations") + self.get_recording_duration(recs) + if self.target_sample_freq: + recs["target_sample_freq"] = self.target_sample_freq + + logging.info("making SegmentsSet") + segments = pd.DataFrame( + {"id": rec_ids, "duration": recs.loc[rec_ids, "duration"].values,} + ) + if room_ids is not None: + segments["room_id"] = room_ids + segments = SegmentSet(segments) + segments.sort() + logging.info("making dataset") + dataset = Dataset(segments, recordings=recs,) + logging.info("saving dataset at %s", self.output_dir) + dataset.save(self.output_dir) + logging.info( + "datasets containts %d segments", len(segments), + ) diff --git a/hyperion/data_prep/voxceleb1.py b/hyperion/data_prep/voxceleb1.py new file mode 100644 index 00000000..025fad37 --- /dev/null +++ b/hyperion/data_prep/voxceleb1.py @@ -0,0 +1,343 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import re +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path +import glob + +import numpy as np +import pandas as pd +from jsonargparse import ActionYesNo +from tqdm import tqdm + +from ..utils import ClassInfo, Dataset, RecordingSet, SegmentSet +from ..utils.misc import PathLike, urlretrieve_progress +from .data_prep import DataPrep + + +class VoxCeleb1DataPrep(DataPrep): + """Class for preparing VoxCeleb1 database into tables, + It prepares the full voxceleb either to train or test with + Original/Entire/Hard. + We don't consider preparing dev for train and test for test Original + + Attributes: + corpus_dir: input data directory + task: train/test + cat_videos: concatenate utterances from the same video. + output_dir: output data directory + use_kaldi_ids: puts speaker-id in front of segment id like kaldi + target_sample_freq: target sampling frequency to convert the audios to. 
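+        num_threads: number of parallel threads used to read the recording durations.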
+ """ + + def __init__( + self, + corpus_dir: PathLike, + task: str, + cat_videos: bool, + output_dir: PathLike, + use_kaldi_ids: bool, + target_sample_freq: int, + num_threads: int = 10, + ): + use_kaldi_ids = True + super().__init__( + corpus_dir, output_dir, use_kaldi_ids, target_sample_freq, num_threads + ) + + self.task = task + assert ( + cat_videos == False or task == "train" + ), "cat-videos is only available for train task" + + self.cat_videos = cat_videos + + @staticmethod + def dataset_name(): + return "voxceleb1" + + @staticmethod + def add_class_args(parser): + DataPrep.add_class_args(parser) + parser.add_argument( + "--task", + default="test", + choices=["test", "train"], + help="""if we prepare the data for [test, train]""", + ) + parser.add_argument( + "--cat-videos", + default=False, + action=ActionYesNo, + help="""concatenate utterances from the same video.""", + ) + + def _get_metadata(self): + file_name = "vox1_meta.csv" + file_path = self.corpus_dir / file_name + if not file_path.exists(): + file_path = self.output_dir / file_name + if not file_path.exists(): + url = "https://www.openslr.org/resources/49/vox1_meta.csv" + file_path, _ = urlretrieve_progress(url, file_path, desc=file_name) + + df_meta = pd.read_csv(file_path, sep="\t") + df_meta.rename(columns=str.strip, inplace=True) + df_meta = df_meta.applymap(lambda x: str.strip(x) if isinstance(x, str) else x) + df_meta.set_index("VoxCeleb1 ID", inplace=True) + return df_meta + + def _get_langs_est(self): + file_name = "lang_vox2_final.csv" + file_path = self.corpus_dir / file_name + if not file_path.exists(): + file_path = self.output_dir / file_name + if not file_path.exists(): + url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv" + file_path, _ = urlretrieve_progress(url, file_path, desc=file_name) + + df_lang = pd.read_csv(file_path, sep=",") + + if self.cat_videos: + + def get_video(x): + x = re.sub("/[^/]*.wav$", "", x) + return re.sub("/", "-", x) + + elif self.use_kaldi_ids: + + def get_video(x): + x = re.sub(".wav$", "", x) + return re.sub("/", "-", x) + + else: + + def get_video(x): + x = re.sub(".wav$", "", x) + x = re.sub("^[^/]*/", "", x) + return re.sub("/", "-", x) + + df_lang["id"] = df_lang["filename"].apply(get_video) + df_lang.drop(["filename"], axis=1, inplace=True) + df_lang.drop_duplicates(inplace=True) + df_lang.set_index("id", inplace=True) + df_lang["lang"] = df_lang["lang"].apply(str.lower) + return df_lang + + @staticmethod + def make_cat_list(lists_cat_dir, rec_id, rec_files, video_idx, i): + list_file = lists_cat_dir / f"{rec_id}.txt" + with open(list_file, "w") as fw: + rec_idx = (video_idx == i).nonzero()[0] + recs_i = [f"file {rec_files[j]}" for j in rec_idx] + recs_i.sort() + recs_i = "\n".join(recs_i) + fw.write(f"{recs_i}\n") + + file_path = ( + f"ffmpeg -v 8 -f concat -safe 0 -i {list_file} -f wav -acodec pcm_s16le -|" + ) + return file_path + + def make_trials(self): + url_base = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta" + trials_file_names = [ + "veri_test2.txt", + "list_test_hard2.txt", + "list_test_all2.txt", + ] + trials_names = ["trials_o", "trials_h", "trials_e"] + + trials = {} + dfs = [] + logging.info("making trials") + for trial_name, file_name in zip(trials_names, trials_file_names): + file_path = self.corpus_dir / file_name + if not file_path.exists(): + file_path = self.output_dir / file_name + if not file_path.exists(): + url = f"{url_base}/{file_name}" + file_path, _ = urlretrieve_progress(url, file_path, 
desc=file_name) + + df_in = pd.read_csv( + file_path, + header=None, + sep=" ", + names=["key", "enroll_file", "test_file"], + ) + key = ["target" if k == 1 else "nontarget" for k in df_in["key"]] + + def get_modelid(s): + s = re.sub(r"\.wav", "", s) + return re.sub(r"/", "-", s) + + if self.use_kaldi_ids: + get_segmentid = get_modelid + else: + + def get_segmentid(s): + s = get_modelid(s) + return re.sub(r"[^-]*-", "", s) + + modelid = [get_modelid(f) for f in df_in["enroll_file"]] + segmentid = [get_segmentid(f) for f in df_in["test_file"]] + df_out = pd.DataFrame( + {"modelid": modelid, "segmentid": segmentid, "targettype": key} + ) + df_out.sort_values(by=["modelid", "segmentid"], inplace=True) + file_path = self.output_dir / f"{trial_name}.csv" + df_out.to_csv(file_path, index=False) + dfs.append(df_out) + trials[trial_name] = file_path + + df_out = pd.concat(dfs, ignore_index=True) + df_out.sort_values(by=["modelid", "segmentid"], inplace=True) + file_path = self.output_dir / "trials.csv" + df_out.to_csv(file_path, index=False) + trials["trials"] = file_path + + logging.info("making enrollment map") + modelid = df_out["modelid"].sort_values().unique() + if self.use_kaldi_ids: + segmentid = modelid + else: + segmentid = [re.sub(r"[^-]*-", "", s) for s in modelid] + + df_out = pd.DataFrame({"modelid": modelid, "segmentid": segmentid}) + file_path = self.output_dir / "enrollment.csv" + df_out.to_csv(file_path, index=False) + enrollments = {"enrollment": file_path} + + return enrollments, trials + + def prepare(self): + logging.info( + "Peparing VoxCeleb1 for %s corpus_dir:%s -> data_dir:%s", + self.task, + self.corpus_dir, + self.output_dir, + ) + logging.info("getting audio meta-data") + df_meta = self._get_metadata() + logging.info("getting language estimations") + df_lang = self._get_langs_est() + rec_dir = self.corpus_dir + logging.info("searching audio files in %s", str(rec_dir)) + rec_files = list(rec_dir.glob("**/*.wav")) + if not rec_files: + # symlinks? 
try glob + rec_files = [ + Path(f) for f in glob.iglob(f"{rec_dir}/**/*.wav", recursive=True) + ] + + assert len(rec_files) > 0, "recording files not found" + + speakers = [f.parents[1].name for f in rec_files] + video_ids = [f.parent.name for f in rec_files] + if self.cat_videos: + rec_ids = [f"{s}-{v}" for s, v in zip(speakers, video_ids)] + lists_cat_dir = self.output_dir / "lists_cat" + lists_cat_dir.mkdir(exist_ok=True, parents=True) + rec_ids, uniq_rec_idx, rec_idx = np.unique( + rec_ids, return_index=True, return_inverse=True + ) + speakers = [speakers[i] for i in uniq_rec_idx] + video_ids = [video_ids[i] for i in uniq_rec_idx] + + file_paths = [] + futures = [] + logging.info("making video cat lists") + logging.info("submitting threats...") + with ThreadPoolExecutor(max_workers=self.num_threads) as pool: + for i, rec_id in tqdm(enumerate(rec_ids)): + future = pool.submit( + VoxCeleb1DataPrep.make_cat_list, + lists_cat_dir, + rec_id, + rec_files, + rec_idx, + i, + ) + futures.append(future) + + logging.info("waiting threats...") + file_paths = [f.result() for f in tqdm(futures)] + else: + file_names = [f.with_suffix("").name for f in rec_files] + if self.use_kaldi_ids: + rec_ids = [ + f"{s}-{v}-{f}" for s, v, f in zip(speakers, video_ids, file_names) + ] + else: + rec_ids = [f"{v}-{f}" for v, f in zip(video_ids, file_names)] + + file_paths = [str(r) for r in rec_files] + + logging.info("making RecordingSet") + recs = pd.DataFrame({"id": rec_ids, "storage_path": file_paths}) + recs = RecordingSet(recs) + recs.sort() + + logging.info("getting recording durations") + self.get_recording_duration(recs) + if self.target_sample_freq: + recs["target_sample_freq"] = self.target_sample_freq + + logging.info("making SegmentsSet") + segments = pd.DataFrame( + { + "id": rec_ids, + "video_ids": video_ids, + "speaker": speakers, + "gender": df_meta.loc[speakers, "Gender"], + "nationality": df_meta.loc[speakers, "Nationality"], + "language_est": [ + df_lang.loc[r, "lang"] if r in df_lang.index else "N/A" + for r in rec_ids + ], + "language_est_conf": [ + df_lang.loc[r, "confidence"] if r in df_lang.index else "N/A" + for r in rec_ids + ], + "duration": recs.loc[rec_ids, "duration"].values, + } + ) + segments = SegmentSet(segments) + segments.sort() + + logging.info("making speaker info file") + uniq_speakers = np.unique(speakers) + speakers = pd.DataFrame( + { + "id": uniq_speakers, + "vgg_id": df_meta.loc[uniq_speakers, "VGGFace1 ID"], + "gender": df_meta.loc[uniq_speakers, "Gender"], + "nationality": df_meta.loc[uniq_speakers, "Nationality"], + } + ) + speakers = ClassInfo(speakers) + + logging.info("making language info file") + languages = np.unique(df_lang["lang"]) + languages = ClassInfo(pd.DataFrame({"id": languages})) + + if self.task == "test": + enrollments, trials = self.make_trials() + + logging.info("making dataset") + dataset = Dataset( + segments, + classes={"speaker": speakers, "language_est": languages}, + recordings=recs, + enrollments=enrollments, + trials=trials, + sparse_trials=False, + ) + logging.info("saving dataset at %s", self.output_dir) + dataset.save(self.output_dir) + logging.info( + "datasets containts %d segments, %d speakers", len(segments), len(speakers) + ) diff --git a/hyperion/data_prep/voxceleb2.py b/hyperion/data_prep/voxceleb2.py index d8b9dd99..969f2228 100644 --- a/hyperion/data_prep/voxceleb2.py +++ b/hyperion/data_prep/voxceleb2.py @@ -3,6 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging +import glob import re from 
concurrent.futures import ThreadPoolExecutor from pathlib import Path @@ -39,8 +40,7 @@ def __init__( target_sample_freq: int, num_threads: int = 10, ): - if cat_videos: - use_kaldi_ids = True + use_kaldi_ids = True super().__init__( corpus_dir, output_dir, use_kaldi_ids, target_sample_freq, num_threads ) @@ -136,45 +136,60 @@ def make_cat_list(lists_cat_dir, rec_id, rec_files, video_idx, i): return file_path def prepare(self): + logging.info( + "Peparing VoxCeleb2 %s corpus_dir:%s -> data_dir:%s", + self.subset, + self.corpus_dir, + self.output_dir, + ) logging.info("getting audio meta-data") df_meta = self._get_metadata() logging.info("getting language estimations") df_lang = self._get_langs_est() rec_dir = self.corpus_dir / self.subset logging.info("searching audio files in %s", str(rec_dir)) - rec_files = list(rec_dir.glob("**/*.m4a")) + rec_files1 = list(rec_dir.glob("**/*.m4a")) + rec_files = [Path(f) for f in glob.iglob(f"{rec_dir}/**/*.m4a", recursive=True)] + if not rec_files: + # symlinks? try glob + rec_files = [ + Path(f) for f in glob.iglob(f"{rec_dir}/**/*.m4a", recursive=True) + ] + + assert len(rec_files) > 0, "recording files not found" + speakers = [f.parents[1].name for f in rec_files] video_ids = [f.parent.name for f in rec_files] if self.cat_videos: + rec_ids = [f"{s}-{v}" for s, v in zip(speakers, video_ids)] lists_cat_dir = self.output_dir / "lists_cat" lists_cat_dir.mkdir(exist_ok=True, parents=True) - uniq_video_ids, uniq_video_idx, video_idx = np.unique( - video_ids, return_index=True, return_inverse=True + rec_ids, uniq_rec_idx, rec_idx = np.unique( + rec_ids, return_index=True, return_inverse=True ) - rec_ids = uniq_video_ids - speakers = [speakers[i] for i in uniq_video_idx] - rec_ids = [f"{s}-{v}" for s, v in zip(speakers, uniq_video_ids)] + speakers = [speakers[i] for i in uniq_rec_idx] + video_ids = [video_ids[i] for i in uniq_rec_idx] file_paths = [] futures = [] logging.info("making video cat lists") + logging.info("submitting threats...") with ThreadPoolExecutor(max_workers=self.num_threads) as pool: - for i, rec_id in enumerate(rec_ids): + for i, rec_id in tqdm(enumerate(rec_ids)): future = pool.submit( VoxCeleb2DataPrep.make_cat_list, lists_cat_dir, rec_id, rec_files, - video_idx, + rec_idx, i, ) futures.append(future) + logging.info("waiting threats...") file_paths = [f.result() for f in tqdm(futures)] - video_ids = uniq_video_ids - else: - file_names = [f.name for f in rec_files] + file_names = [f.with_suffix("").name for f in rec_files] if self.use_kaldi_ids: rec_ids = [ f"{s}-{v}-{f}" for s, v, f in zip(speakers, video_ids, file_names) @@ -213,14 +228,9 @@ def prepare(self): df_lang.loc[r, "confidence"] if r in df_lang.index else "N/A" for r in rec_ids ], - # "duration": recs.loc[rec_ids, "duration"], + "duration": recs.loc[rec_ids, "duration"].values, } ) - print( - recs.loc[rec_ids, "duration"], - len(segments), - len(recs.loc[rec_ids, "duration"]), - ) segments = SegmentSet(segments) segments.sort() @@ -242,8 +252,8 @@ def prepare(self): logging.info("making dataset") dataset = Dataset( segments, - {"speaker": speakers, "languages": languages}, - {"recordings": recs}, + {"speaker": speakers, "language_est": languages}, + recs, ) logging.info("saving dataset at %s", self.output_dir) dataset.save(self.output_dir) diff --git a/hyperion/data_prep/voxsrc22.py b/hyperion/data_prep/voxsrc22.py new file mode 100644 index 00000000..f81f6eaf --- /dev/null +++ b/hyperion/data_prep/voxsrc22.py @@ -0,0 +1,178 @@ +""" + Copyright 2023 Johns Hopkins University 
(Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import glob +import re +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path + +import numpy as np +import pandas as pd +from jsonargparse import ActionYesNo +from tqdm import tqdm + +from ..utils import ClassInfo, Dataset, RecordingSet, SegmentSet +from ..utils.misc import PathLike, urlretrieve_progress +from .data_prep import DataPrep + + +class VoxSRC22DataPrep(DataPrep): + """Class to prepare VoxSRC22 dev/test data + Attributes: + corpus_dir: input data directory + vox1_corpus_dir: input data directory for VoxCeleb1 + subset: subset of the data dev or test + output_dir: output data directory + target_sample_freq: target sampling frequency to convert the audios to. + """ + + def __init__( + self, + corpus_dir: PathLike, + vox1_corpus_dir: PathLike, + subset: str, + output_dir: PathLike, + use_kaldi_ids: bool, + target_sample_freq: int, + num_threads: int = 10, + ): + use_kaldi_ids = False + super().__init__( + corpus_dir, output_dir, use_kaldi_ids, target_sample_freq, num_threads + ) + + assert ( + vox1_corpus_dir is not None or subset == "test" + ), "dev set needs the VoxCeleb1 corpus dir" + self.subset = subset + self.vox1_corpus_dir = ( + None if vox1_corpus_dir is None else Path(vox1_corpus_dir) + ) + + @staticmethod + def dataset_name(): + return "voxsrc22" + + @staticmethod + def add_class_args(parser): + DataPrep.add_class_args(parser) + parser.add_argument( + "--subset", + default="dev", + choices=["dev", "test"], + help="""vox2 subset in [dev, test]""", + ) + parser.add_argument( + "--vox1-corpus-dir", + default=None, + help="""corpus directory of voxceleb 1.""", + ) + + def prepare_track12_dev(self): + logging.info( + "Preparing VoxSRC22 %s corpus:%s + %s -> %s", + self.subset, + self.corpus_dir, + self.vox1_corpus_dir, + self.output_dir, + ) + logging.info("making trials") + trials_file = self.corpus_dir / "voxsrc2022_dev.txt" + df_in = pd.read_csv( + trials_file, + header=None, + sep=" ", + names=["key", "enroll_file", "test_file"], + ) + key = ["target" if k == 1 else "nontarget" for k in df_in["key"]] + + modelid = df_in["enroll_file"] + segmentid = df_in["test_file"] + df_trials = pd.DataFrame( + {"modelid": modelid, "segmentid": segmentid, "targettype": key} + ) + df_trials.sort_values(by=["modelid", "segmentid"], inplace=True) + file_path = self.output_dir / "trials.csv" + df_trials.to_csv(file_path, index=False) + trials = {"trials": file_path} + modelid = df_trials["modelid"].sort_values().unique() + uniq_segmentid = df_trials["segmentid"].sort_values().unique() + uniq_segmentid = np.unique(np.concatenate((uniq_segmentid, modelid), axis=0)) + + logging.info("making enrollment map") + df_enroll = pd.DataFrame({"modelid": modelid, "segmentid": modelid}) + file_path = self.output_dir / "enrollment.csv" + df_enroll.to_csv(file_path, index=False) + enrollments = {"enrollment": file_path} + + logging.info("making RecordingSet") + vox1_segmentid = [] + vox22_segmentid = [] + for s in uniq_segmentid: + if "VoxSRC2022_dev" in s: + vox22_segmentid.append(s) + else: + vox1_segmentid.append(s) + + vox1_rec_files = [ + glob.glob(f"{self.vox1_corpus_dir}/**/{s}")[0] for s in vox1_segmentid + ] + # vox22_rec_files = [ + # glob.glob(f"{self.corpus_dir}/**/{s}")[0] for s in vox22_segmentid + # ] + vox22_rec_files = [f"{self.corpus_dir}/{s}" for s in vox22_segmentid] + + rec_ids = vox22_segmentid + vox1_segmentid + rec_files = vox22_rec_files + vox1_rec_files 
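+        # the dev trials mix VoxSRC2022_dev audio (resolved directly under
+        # corpus_dir) with VoxCeleb1 audio (located by glob under vox1_corpus_dir);
+        # the asserts below guard against either source coming up empty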
+ + assert len(vox22_rec_files) > 0, "vox22 recording files not found" + assert len(vox1_rec_files) > 0, "vox1 recording files not found" + + recs = pd.DataFrame({"id": rec_ids, "storage_path": rec_files}) + recs = RecordingSet(recs) + recs.sort() + + logging.info("getting recording durations") + self.get_recording_duration(recs) + if self.target_sample_freq: + recs["target_sample_freq"] = self.target_sample_freq + + logging.info("making SegmentsSet") + segments = pd.DataFrame( + { + "id": rec_ids, + } + ) + segments = SegmentSet(segments) + segments.sort() + + logging.info("making dataset") + dataset = Dataset( + segments, + recordings=recs, + enrollments=enrollments, + trials=trials, + sparse_trials=False, + ) + logging.info("saving dataset at %s", self.output_dir) + dataset.save(self.output_dir) + logging.info( + "datasets containts %d segments", + len(segments), + ) + + def prepare_track12_test(self): + logging.info( + "Preparing VoxSRC22 %s corpus:%s -> %s", + self.subset, + self.corpus_dir, + self.output_dir, + ) + + def prepare(self): + if self.subset == "dev": + self.prepare_track12_dev() + else: + self.prepare_track12_test() diff --git a/hyperion/helpers/trial_data_reader.py b/hyperion/helpers/trial_data_reader.py index 4f33770b..85904eb2 100644 --- a/hyperion/helpers/trial_data_reader.py +++ b/hyperion/helpers/trial_data_reader.py @@ -16,7 +16,7 @@ from ..utils.utt2info import Utt2Info -class TrialDataReader(object): +class TrialDataReader: """ Loads Ndx, enroll file and x-vectors to evaluate PLDA. """ diff --git a/hyperion/helpers/vector_class_reader.py b/hyperion/helpers/vector_class_reader.py index c4c531ad..a9993768 100644 --- a/hyperion/helpers/vector_class_reader.py +++ b/hyperion/helpers/vector_class_reader.py @@ -49,7 +49,7 @@ def __init__( v[0]: int(v[1]) for v in [line.rstrip().split() for line in f] } - self.rng = np.random.RandomState(vcr_seed) + self.rng = np.random.default_rng(vcr_seed) self.csplit_max_spc = csplit_max_spc self.csplit_min_spc = csplit_min_spc self.csplit_mode = csplit_mode diff --git a/hyperion/io/__init__.py b/hyperion/io/__init__.py index 14b1b35f..aa5ac653 100644 --- a/hyperion/io/__init__.py +++ b/hyperion/io/__init__.py @@ -16,10 +16,10 @@ from .hyp_data_reader import * from .hyp_data_writer import * from .kaldi_data_reader import * -from .packed_audio_reader import (RandomAccessPackedAudioReader, - SequentialPackedAudioReader) +from .packed_audio_reader import ( + RandomAccessPackedAudioReader, + SequentialPackedAudioReader, +) from .packed_audio_writer import PackedAudioWriter from .segment_vad_reader import SegmentVADReader from .vad_rw_factory import VADReaderFactory - -# from .queues import * diff --git a/hyperion/io/ark_data_reader.py b/hyperion/io/ark_data_reader.py index 3919ddfa..eaf76d49 100644 --- a/hyperion/io/ark_data_reader.py +++ b/hyperion/io/ark_data_reader.py @@ -4,15 +4,15 @@ """ import multiprocessing as threading -import sys +from typing import Union, Optional, List, Callable, Tuple import numpy as np from ..hyp_defs import float_cpu -from ..utils.kaldi_io_funcs import (init_kaldi_input_stream, is_token, peek, - read_token) +from ..utils.kaldi_io_funcs import init_kaldi_input_stream, is_token, peek, read_token from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix -from ..utils.scp_list import SCPList + +from ..utils import FeatureSet, PathLike from .data_reader import RandomAccessDataReader, SequentialDataReader @@ -27,10 +27,9 @@ class SequentialArkDataReader(SequentialDataReader): part_idx: It splits the 
input into num_parts and writes only part part_idx, where part_idx=1,...,num_parts. num_parts: Number of parts to split the input data. - split_by_key: If True, all the elements with the same key go to the same part. """ - def __init__(self, file_path, **kwargs): + def __init__(self, file_path: PathLike, **kwargs): super().__init__(file_path, **kwargs) self.f = None self.lock = threading.Lock() @@ -42,7 +41,7 @@ def close(self): self.f.close() self.f = None - def _seek(self, offset): + def _seek(self, offset: int): """Moves the pointer of the input file. Args: @@ -52,7 +51,7 @@ def _seek(self, offset): delta = offset - cur_pos self.f.seek(delta, 1) - def _open_archive(self, file_path, offset=0): + def _open_archive(self, file_path: PathLike, offset: int = 0): """Opens the current file if it is not open and moves the file pointer to a given position. Closes previous open Ark files. @@ -69,7 +68,7 @@ def _open_archive(self, file_path, offset=0): if offset > 0: self._seek(offset) - def read_num_rows(self, num_records=0, assert_same_dim=True): + def read_num_rows(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -86,7 +85,7 @@ def read_num_rows(self, num_records=0, assert_same_dim=True): num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=int) return keys, num_rows - def read_dims(self, num_records=0, assert_same_dim=True): + def read_dims(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -120,10 +119,8 @@ class SequentialArkFileDataReader(SequentialArkDataReader): split_by_key: If True, all the elements with the same key go to the same part. """ - def __init__(self, file_path, **kwargs): - super(SequentialArkFileDataReader, self).__init__( - file_path, permissive=False, **kwargs - ) + def __init__(self, file_path: PathLike, **kwargs): + super().__init__(file_path, permissive=False, **kwargs) self._open_archive(self.file_path) self._eof = False self._keys = None @@ -151,7 +148,7 @@ def keys(self): return self._keys - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -188,7 +185,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): return keys, shapes - def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. Args: @@ -206,12 +209,8 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): key: List of recording names. data: List of feature matrices/vectors or 3D/2D numpy array. 
""" - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) keys = [] data = [] count = 0 @@ -224,8 +223,8 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): self._eof = True break - row_offset_i = row_offset[i] if row_offset_is_list else row_offset - num_rows_i = num_rows[i] if num_rows_is_list else num_rows + row_offset_i = row_offset[count] if row_offset_is_list else row_offset + num_rows_i = num_rows[count] if num_rows_is_list else num_rows binary = init_kaldi_input_stream(self.f) data_i = KaldiMatrix.read( @@ -264,28 +263,25 @@ class SequentialArkScriptDataReader(SequentialArkDataReader): part_idx: It splits the input into num_parts and writes only part part_idx, where part_idx=1,...,num_parts. num_parts: Number of parts to split the input data. - split_by_key: If True, all the elements with the same key go to the same part. """ - def __init__(self, file_path, path_prefix=None, scp_sep=" ", **kwargs): - super(SequentialArkScriptDataReader, self).__init__( - file_path, permissive=False, **kwargs - ) - self.scp = SCPList.load(self.file_path, sep=scp_sep) + def __init__( + self, file_path: PathLike, path_prefix: Optional[PathLike] = None, **kwargs + ): + super().__init__(file_path, permissive=False, **kwargs) + self.feature_set = FeatureSet.load(self.file_path) if self.num_parts > 1: - self.scp = self.scp.split( - self.part_idx, self.num_parts, group_by_key=self.split_by_key - ) + self.feature_set = self.feature_set.split(self.part_idx, self.num_parts) if path_prefix is not None: - self.scp.add_prefix_to_filepath(path_prefix) + self.feature_set.add_prefix_to_storage_path(path_prefix) self.cur_item = 0 @property def keys(self): - return self.scp.key + return self.feature_set["id"] def reset(self): """Closes all the open Ark files and puts the read pointer pointing @@ -295,9 +291,9 @@ def reset(self): def eof(self): """Returns True when all the elements in the scp have been read.""" - return self.cur_item == len(self.scp) + return self.cur_item == len(self.feature_set) - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. 
Args: @@ -318,15 +314,18 @@ def read_shapes(self, num_records=0, assert_same_dim=True): for i in range(num_records): if self.eof(): break - key, file_path, offset, range_spec = self.scp[self.cur_item] - - row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + feature_spec = self.feature_set.iloc[self.cur_item] + key = feature_spec["id"] + offset = feature_spec["storage_byte"] + file_path = feature_spec["storage_path"] self._open_archive(file_path, offset) binary = init_kaldi_input_stream(self.f) shape_i = KaldiMatrix.read_shape(self.f, binary, sequential_mode=True) - - shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) keys.append(key) shapes.append(shape_i) @@ -338,7 +337,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): return keys, shapes - def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. Args: @@ -359,12 +364,8 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): if num_records == 0: num_records = len(self.scp) - self.cur_item - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) keys = [] data = [] @@ -373,7 +374,14 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): if self.eof(): break - key, file_path, offset, range_spec = self.scp[self.cur_item] + feature_spec = self.feature_set.iloc[self.cur_item] + key = feature_spec["id"] + offset = feature_spec["storage_byte"] + file_path = feature_spec["storage_path"] + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + else: + range_spec = None row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows @@ -417,21 +425,24 @@ class RandomAccessArkDataReader(RandomAccessDataReader): features after reading them from disk. permissive: If True, if the data that we want to read is not in the file it returns an empty matrix, if False it raises an exception. - scp_sep: Separator for scp files (default ' '). 
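
A random-access counterpart, sketched under the assumption introduced by this patch that the script argument is now a FeatureSet table (with id, storage_path, storage_byte columns) rather than a Kaldi .scp list; the feats.csv path and the utterance ids are placeholders.

```
# Random-access sketch (assumed API; path and keys are placeholders).
from hyperion.io.ark_data_reader import RandomAccessArkDataReader

reader = RandomAccessArkDataReader("exp/feats/feats.csv")
x = reader.read(["utt-0001", "utt-0002"])  # list of 2D numpy feature matrices
print([xi.shape for xi in x])
reader.close()
```
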
""" def __init__( - self, file_path, path_prefix=None, transform=None, permissive=False, scp_sep=" " + self, + file_path: PathLike, + path_prefix: Optional[PathLike] = None, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, ): - super(RandomAccessArkDataReader, self).__init__( - file_path, transform, permissive - ) + super().__init__(file_path, transform, permissive) - self.scp = SCPList.load(self.file_path, sep=scp_sep) + self.feature_set = FeatureSet.load(self.file_path) if path_prefix is not None: - self.scp.add_prefix_to_filepath(path_prefix) + self.feature_set.add_prefix_to_storage_path(path_prefix) - archives, archive_idx = np.unique(self.scp.file_path, return_inverse=True) + archives, archive_idx = np.unique( + self.feature_set["storage_path"], return_inverse=True + ) self.archives = archives self.archive_idx = archive_idx self.f = [None] * len(self.archives) @@ -448,7 +459,7 @@ def close(self): f.close() self.f = [None] * len(self.f) - def _open_archive(self, key_idx, offset=0): + def _open_archive(self, key_idx: int, offset: int = 0): """Opens the Ark file correspoding to a given feature/matrix if it is not already open and moves the file pointer to the point where we can read that feature matrix. @@ -473,7 +484,9 @@ def _open_archive(self, key_idx, offset=0): return f, self.locks[archive_idx] - def read_num_rows(self, keys, assert_same_dim=True): + def read_num_rows( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -489,7 +502,9 @@ def read_num_rows(self, keys, assert_same_dim=True): num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=np.int) return num_rows - def read_dims(self, keys, assert_same_dim=True): + def read_dims( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -507,7 +522,9 @@ def read_dims(self, keys, assert_same_dim=True): assert np.all(dims == dims[0]) return dims - def read_shapes(self, keys, assert_same_dim=True): + def read_shapes( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the shapes in the feature matrices of the dataset. 
Args: @@ -525,25 +542,26 @@ def read_shapes(self, keys, assert_same_dim=True): shapes = [] for key in keys: - if not (key in self.scp): + if not (key in self.feature_set.index): if self.permissive: shapes.append((0,)) continue else: raise Exception("Key %s not found" % key) - index = self.scp.get_index(key) - _, file_path, offset, range_spec = self.scp[index] - - row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) - + index = self.feature_set.get_loc(key) + feature_spec = self.feature_set.loc[key] + offset = feature_spec["storage_byte"] f, lock = self._open_archive(index) with lock: f.seek(offset, 0) binary = init_kaldi_input_stream(f) shape_i = KaldiMatrix.read_shape(f, binary, sequential_mode=False) - shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) shapes.append(shape_i) @@ -553,7 +571,13 @@ def read_shapes(self, keys, assert_same_dim=True): return shapes - def read(self, keys, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + keys: Union[str, List[str], np.array], + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads the feature matrices/vectors for the recordings in keys. Args: @@ -574,12 +598,8 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): if isinstance(keys, str): keys = [keys] - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) if row_offset_is_list: assert len(row_offset) == len(keys) if num_rows_is_list: @@ -588,15 +608,20 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): data = [] for i, key in enumerate(keys): - if not (key in self.scp): + if not (key in self.feature_set.index): if self.permissive: data.append(np.array([], dtype=float_cpu())) continue else: raise Exception("Key %s not found" % key) - index = self.scp.get_index(key) - _, file_path, offset, range_spec = self.scp[index] + index = self.feature_set.get_loc(key) + feature_spec = self.feature_set.loc[key] + offset = feature_spec["storage_byte"] + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + else: + range_spec = None row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows diff --git a/hyperion/io/ark_data_writer.py b/hyperion/io/ark_data_writer.py index 58f5c0a1..26f77112 100644 --- a/hyperion/io/ark_data_writer.py +++ b/hyperion/io/ark_data_writer.py @@ -3,15 +3,14 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +from typing import Union, Optional, List, Dict import numpy as np - +import pandas as pd from ..hyp_defs import float_save -from ..utils.kaldi_io_funcs import (init_kaldi_output_stream, is_token, - write_token) +from ..utils.kaldi_io_funcs import init_kaldi_output_stream, is_token, write_token from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix -from ..utils.scp_list import SCPList +from ..utils import PathLike from .data_writer import DataWriter @@ -28,11 +27,17 @@ class 
ArkDataWriter(DataWriter): {auto (default), speech_feat, 2byte-auto, 2byte-signed-integer, 1byte-auto, 1byte-unsigned-integer, 1byte-0-1}. - scp_sep: Separator for scp files (default ' '). + """ - def __init__(self, archive_path, script_path=None, binary=True, **kwargs): - super(ArkDataWriter, self).__init__(archive_path, script_path, **kwargs) + def __init__( + self, + archive_path: PathLike, + script_path: Optional[PathLike] = None, + binary: bool = True, + **kwargs, + ): + super().__init__(archive_path, script_path, **kwargs) self.binary = binary if binary: @@ -40,10 +45,12 @@ def __init__(self, archive_path, script_path=None, binary=True, **kwargs): else: self.f = open(archive_path, "w") - if script_path is not None: - self.f_script = open(script_path, "w") - else: - self.f_script = None + if script_path is not None and not self.script_is_scp: + columns = ["id", "storage_path", "storage_byte"] + if self.metadata_columns is not None: + columns += self.metadata_columns + row = self.script_sep.join(columns) + self.f_script.write(f"{row}\n") def __exit__(self, exc_type, exc_value, traceback): """Function required when exiting from contructions of type @@ -67,7 +74,7 @@ def flush(self): if self.f_script is not None: self.f_script.flush() - def _convert_data(self, data): + def _convert_data(self, data: np.array): """Converts the feature matrix from numpy array to KaldiMatrix or KaldiCompressedMatrix. """ @@ -89,7 +96,12 @@ def _convert_data(self, data): raise ValueError("Data is not ndarray or KaldiMatrix") - def write(self, keys, data): + def write( + self, + keys: Union[str, List[str], np.array], + data: Union[np.array, List[np.array]], + metadata: Optional[Union[pd.DataFrame, Dict]] = None, + ): """Writes data to file. Args: @@ -99,9 +111,7 @@ def write(self, keys, data): it can be a 3D numpy array. If they are vectors, it can be a 2D numpy array. """ - if isinstance(keys, str): - keys = [keys] - data = [data] + keys, data, metadata = self.standardize_write_args(keys, data, metadata) for i, key_i in enumerate(keys): assert is_token(key_i), "Token %s not valid" % key_i @@ -114,9 +124,15 @@ def write(self, keys, data): data_i.write(self.f, self.binary) if self.f_script is not None: - self.f_script.write( - "%s%s%s:%d\n" % (key_i, self.scp_sep, self.archive_path, pos) - ) + if self.script_is_scp: + self.f_script.write(f"{key_i} {self.archive_path}:{pos}\n") + else: + columns = [key_i, str(self.archive_path), str(pos)] + if metadata is not None: + metadata_i = [str(m[i]) for m in metadata] + columns += metadata_i + row = self.script_sep.join(columns) + self.f_script.write(f"{row}\n") if self._flush: self.flush() diff --git a/hyperion/io/audio_reader.py b/hyperion/io/audio_reader.py index 69cfa65b..a1adaef0 100644 --- a/hyperion/io/audio_reader.py +++ b/hyperion/io/audio_reader.py @@ -10,11 +10,13 @@ import subprocess import numpy as np +import pandas as pd import soundfile as sf from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from typing import Union, Optional, List from ..hyp_defs import float_cpu -from ..utils import SCPList, SegmentList +from ..utils import RecordingSet, SegmentSet, PathLike valid_ext = [ ".wav", @@ -34,7 +36,7 @@ ".sds", ".sf", ".voc", - "w64", + ".w64", ".wve", ".xi", ] @@ -44,38 +46,36 @@ class AudioReader(object): """Class to read audio files from wav, flac or pipe Attributes: - file_path: scp file with formant file_key wavspecifier (audio_file/pipe) or SCPList object. 
- segments_path: segments file with format: segment_id file_id tbeg tend + recordings: RecordingSet or file path to RecordingSet + segments: SegmentSet or file path to SegmentSet wav_scale: multiplies signal by scale factor """ - def __init__(self, file_path, segments_path=None, wav_scale=2**15 - 1): - self.file_path = file_path - if isinstance(file_path, SCPList): - self.scp = file_path - else: - self.scp = SCPList.load(file_path, sep=" ", is_wav=True) - - self.segments_path = segments_path - if segments_path is None: - self.segments = None - self.with_segments = False - else: + def __init__( + self, + recordings: Union[RecordingSet, PathLike], + segments: Union[SegmentSet, PathLike, None] = None, + wav_scale: float = 1.0, + ): + if not isinstance(recordings, RecordingSet): + recordings = RecordingSet.load(recordings) + + self.recordings = recordings + + self.with_segments = False + if segments is not None: self.with_segments = True - if isinstance(file_path, SegmentList): - self.segments = segments_path - else: - self.segments = SegmentList.load(segments_path, - sep=" ", - index_by_file=False) + if not isinstance(segments, SegmentSet): + segments = SegmentSet.load(segments) + self.segments = segments self.wav_scale = wav_scale @property def keys(self): if self.with_segments: - return np.asarray(self.segments["segment_id"]) - return self.scp.key + return self.segments["id"].values + return self.recordings["id"].values def __enter__(self): """Function required when entering contructions of type @@ -94,10 +94,12 @@ def __exit__(self, exc_type, exc_value, traceback): pass @staticmethod - def read_wavspecifier(wavspecifier, - scale=2**15, - time_offset=0, - time_dur=0): + def read_wavspecifier( + wavspecifier: PathLike, + scale: float = 2 ** 15, + time_offset: float = 0.0, + time_dur: float = 0.0, + ): """Reads an audiospecifier (audio_file/pipe) It reads from pipe or from all the files that can be read by `libsndfile ` @@ -113,59 +115,123 @@ def read_wavspecifier(wavspecifier, wavspecifier = wavspecifier.strip() if wavspecifier[-1] == "|": wavspecifier = wavspecifier[:-1] - x, fs = AudioReader.read_pipe(wavspecifier, scale) - if time_offset == 0 and time_dur == 0: - return x, fs - - start_sample = int(math.floor(time_offset * fs)) - num_samples = int(math.floor(time_dur * fs)) - if num_samples == 0: - return x[start_sample:], fs - - end_sample = start_sample + num_samples - assert end_sample <= len(x) - return x[start_sample:end_sample], fs + return AudioReader.read_pipe(wavspecifier, scale, time_offset, time_dur) ext = os.path.splitext(wavspecifier)[1] if ext in valid_ext: - if time_offset == 0 and time_dur == 0: - x, fs = sf.read(wavspecifier, dtype=float_cpu()) - x *= scale - return x, fs - - with sf.SoundFile(wavspecifier, "r") as f: - fs = f.samplerate - start_sample = int(math.floor(time_offset * fs)) - num_samples = int(math.floor(time_dur * fs)) - f.seek(start_sample) - if num_samples > 0: - x = scale * f.read(num_samples, dtype=float_cpu()) - else: - x = scale * f.read(dtype=float_cpu()) - return x, fs + return AudioReader.read_file(wavspecifier, scale, time_offset, time_dur) raise Exception("Unknown format for %s" % (wavspecifier)) @staticmethod - def read_pipe(wavspecifier, scale=2**15): + def read_pipe( + wavspecifier: PathLike, + scale: float = 2 ** 15, + time_offset: float = 0, + time_dur: float = 0, + ): """Reads wave file from a pipe Args: wavspecifier: Shell command with pipe output scale: Multiplies signal by scale factor """ - # proc = subprocess.Popen(wavspecifier, 
shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - proc = subprocess.Popen(wavspecifier, - shell=True, - stdout=subprocess.PIPE) + if wavspecifier[-1] == "|": + wavspecifier = wavspecifier[:-1] + + proc = subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE) pipe = proc.communicate()[0] if proc.returncode != 0: - raise Exception("Wave read pipe command %s returned code %d" % - (wavspecifier, proc.returncode)) + raise Exception( + "Wave read pipe command %s returned code %d" + % (wavspecifier, proc.returncode) + ) x, fs = sf.read(io.BytesIO(pipe), dtype=float_cpu()) x *= scale - return x, fs + if time_offset == 0 and time_dur == 0: + return x, fs + + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + if num_samples == 0: + return x[start_sample:], fs + + end_sample = start_sample + num_samples + assert end_sample <= len(x) + return x[start_sample:end_sample], fs + + @staticmethod + def read_file_sf( + wavspecifier: PathLike, + scale: float = 2 ** 15, + time_offset: float = 0, + time_dur: float = 0, + ): + if time_offset == 0 and time_dur == 0: + x, fs = sf.read(wavspecifier, dtype=float_cpu()) + x *= scale + return x, fs + + with sf.SoundFile(wavspecifier, "r") as f: + fs = f.samplerate + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + f.seek(start_sample) + if num_samples > 0: + x = scale * f.read(num_samples, dtype=float_cpu()) + else: + x = scale * f.read(dtype=float_cpu()) + + return x, fs + + @staticmethod + def read_file( + wavspecifier: PathLike, + scale: float = 2 ** 15, + time_offset: float = 0, + time_dur: float = 0, + ): + try: + return AudioReader.read_file_sf(wavspecifier, scale, time_offset, time_dur) + except: + # some files produce error in the fseek after reading the data, + # this seems an issue from pysoundfile or soundfile lib itself + # we try to read from + # time-offset to the end of the file, and remove the extra frames later, + # this solves the problem in most cases + logging.info( + ( + "error-1 reading keys=%s offset=%f duration=%f" + "retrying reading until end-of-file ..." + ), + wavspecifier, + time_offset, + time_dur, + ) + try: + x, fs = AudioReader.read_file_sf(wavspecifier, scale, time_offset) + num_samples = int(math.floor(time_dur * fs)) + x = x[:num_samples] + return x, fs + except: + logging.info( + ( + "error-2 reading keys=%s offset=%f duration=%f" + "retrying reading full file ..." 
+ ), + wavspecifier, + time_offset, + time_dur, + ) + + x, fs = AudioReader.read_file_sf(wavspecifier, scale) + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + x = x[start_sample : start_sample + num_samples] + return x, fs - def _read_segment(self, segment, time_offset=0, time_dur=0): + def _read_segment( + self, segment: pd.Series, time_offset: float = 0, time_dur: float = 0 + ): """Reads a wave segment Args: @@ -173,28 +239,11 @@ def _read_segment(self, segment, time_offset=0, time_dur=0): Returns: Wave, sampling frequency """ - file_id = segment["file_id"] - t_beg = segment["tbeg"] + time_offset - t_end = segment["tend"] - if time_dur > 0: - t_end_new = t_beg + time_dur - assert t_end_new <= t_end - t_end = t_end_new - - file_path, _, _ = self.scp[file_id] - x_i, fs_i = self.read_wavspecifier(file_path, self.wav_scale) - num_samples_i = len(x_i) - s_beg = int(t_beg * fs_i) - if s_beg >= num_samples_i: - raise Exception( - "segment %s tbeg=%.2f (num_sample=%d) longer that wav file %s (num_samples=%d)" - % (file_id, t_beg, s_beg, file_id, num_samples_i)) - - s_end = int(t_end * fs_i) - if s_end > num_samples_i or t_end < 0: - s_end = num_samples_i - - x_i = x_i[s_beg:s_end] + recording_id = segment["recording_id"] + t_start = segment["start"] + time_offset + t_dur = segment["duration"] + storage_path = self.recordings.loc[recording_id, "storage_path"] + x_i, fs_i = self.read_wavspecifier(storage_path, self.wav_scale, t_start, t_dur) return x_i, fs_i def read(self): @@ -202,27 +251,23 @@ def read(self): class SequentialAudioReader(AudioReader): - def __init__( self, - file_path, - segments_path=None, - wav_scale=2**15 - 1, - part_idx=1, - num_parts=1, + recordings: Union[RecordingSet, PathLike], + segments: Union[SegmentSet, PathLike, None] = None, + wav_scale: float = 1.0, + part_idx: int = 1, + num_parts: int = 1, ): - super().__init__(file_path, segments_path, wav_scale=wav_scale) + super().__init__(recordings, segments, wav_scale=wav_scale) self.cur_item = 0 self.part_idx = part_idx self.num_parts = num_parts if self.num_parts > 1: if self.with_segments: - self.segments = self.segments.split(self.part_idx, - self.num_parts) + self.segments = self.segments.split(self.part_idx, self.num_parts) else: - self.scp = self.scp.split(self.part_idx, - self.num_parts, - group_by_key=False) + self.recordings = self.recordings.split(self.part_idx, self.num_parts) def __iter__(self): """Needed to build an iterator, e.g.: @@ -262,9 +307,9 @@ def eof(self): """ if self.with_segments: return self.cur_item == len(self.segments) - return self.cur_item == len(self.scp) + return self.cur_item == len(self.recordings) - def read(self, num_records=0, time_offset=0, time_durs=0): + def read(self, num_records: int = 0, time_offset: float = 0, time_durs: float = 0): """Reads next num_records audio files Args: @@ -281,7 +326,7 @@ def read(self, num_records=0, time_offset=0, time_durs=0): if self.with_segments: num_records = len(self.segments) - self.cur_item else: - num_records = len(self.scp) - self.cur_item + num_records = len(self.recordings) - self.cur_item offset_is_list = isinstance(time_offset, (list, np.ndarray)) dur_is_list = isinstance(time_durs, (list, np.ndarray)) @@ -297,13 +342,16 @@ def read(self, num_records=0, time_offset=0, time_durs=0): dur_i = time_durs[i] if dur_is_list else time_durs if self.with_segments: - segment = self.segments[self.cur_item] - key = segment["segment_id"] + segment = self.segments.iloc[self.cur_item] + key = 
segment["id"] x_i, fs_i = self._read_segment(segment, offset_i, dur_i) else: - key, file_path, _, _ = self.scp[self.cur_item] - x_i, fs_i = self.read_wavspecifier(file_path, self.wav_scale, - offset_i, dur_i) + segment = self.recordings.iloc[self.cur_item] + key = segment["id"] + file_path = segment["storage_path"] + x_i, fs_i = self.read_wavspecifier( + file_path, self.wav_scale, offset_i, dur_i + ) keys.append(key) data.append(x_i) @@ -318,14 +366,15 @@ def filter_args(**kwargs): return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser, prefix=None): + def add_class_args(parser, prefix: Optional[str] = None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") parser.add_argument( "--wav-scale", - default=2**15 - 1, + default=1.0, + # default=2 ** 15 - 1, type=float, help=("multiplicative factor for waveform"), ) @@ -334,38 +383,50 @@ def add_class_args(parser, prefix=None): "--part-idx", type=int, default=1, - help=("splits the list of files into num-parts and " - "processes part-idx"), + help=( + "splits the list of files into num-parts and " "processes part-idx" + ), ) parser.add_argument( "--num-parts", type=int, default=1, - help=("splits the list of files into num-parts and " - "processes part-idx"), + help=( + "splits the list of files into num-parts and " "processes part-idx" + ), ) except: pass if prefix is not None: outer_parser.add_argument( - "--" + prefix, - action=ActionParser(parser=parser), + "--" + prefix, action=ActionParser(parser=parser), ) add_argparse_args = add_class_args class RandomAccessAudioReader(AudioReader): + def __init__( + self, + recordings: Union[RecordingSet, PathLike], + segments: Union[SegmentSet, PathLike, None] = None, + wav_scale: float = 1.0, + ): + super().__init__(recordings, segments, wav_scale) - def __init__(self, file_path, segments_path=None, wav_scale=2**15 - 1): - super().__init__(file_path, segments_path, wav_scale) - - def _read(self, keys, time_offset=0, time_durs=0): + def read( + self, + keys: Union[str, List, np.array], + time_offset: float = 0, + time_durs: float = 0, + ): """Reads the waveforms for the recordings in keys. Args: keys: List of recording/segment_ids names. + time_offset: float or float list with time-offsets + time_durs: float or float list with durations Returns: data: List of waveforms @@ -384,93 +445,93 @@ def _read(self, keys, time_offset=0, time_durs=0): dur_i = time_durs[i] if dur_is_list else time_durs if self.with_segments: - if not (key in self.segments): + if not (key in self.segments.index): raise Exception("Key %s not found" % key) - segment = self.segments[key] + segment = self.segments.loc[key] x_i, fs_i = self._read_segment(segment, offset_i, dur_i) else: - if not (key in self.scp): + if not (key in self.recordings.index): raise Exception("Key %s not found" % key) - file_path, _, _ = self.scp[key] - x_i, fs_i = self.read_wavspecifier(file_path, self.wav_scale, - offset_i, dur_i) + file_path = self.recordings.loc[key, "storage_path"] + x_i, fs_i = self.read_wavspecifier( + file_path, self.wav_scale, offset_i, dur_i + ) data.append(x_i) fs.append(fs_i) return data, fs - def read(self, keys, time_offset=0, time_durs=0): - """Reads the waveforms for the recordings in keys. - - Args: - keys: List of recording/segment_ids names. - - Returns: - data: List of waveforms - fs: List of sampling freq. 
- """ - try: - x, fs = self._read(keys, - time_offset=time_offset, - time_durs=time_durs) - except: - if isinstance(keys, str): - keys = [keys] - - if not isinstance(time_offset, (list, np.ndarray)): - time_offset = [time_offset] * len(keys) - if not isinstance(time_durs, (list, np.ndarray)): - time_durs = [time_durs] * len(keys) - - try: - # some files produce error in the fseek after reading the data, - # this seems an issue from pysoundfile or soundfile lib itself - # we try to read from - # time-offset to the end of the file, and remove the extra frames later, - # this solves the problem in most cases - logging.info(("error-1 reading at keys={} offset={} " - "retrying reading until end-of-file ...").format( - keys, time_offset)) - x, fs = self._read(keys, time_offset=time_offset) - for i in range(len(x)): - end_sample = int(time_durs[i] * fs[i]) - x[i] = x[i][:end_sample] - except: - # try to read the full file - logging.info(("error-2 reading at key={}, " - "retrying reading full file ...").format(keys)) - x, fs = self._read(keys) - for i in range(len(x)): - start_sample = int(time_offset[i] * fs[i]) - end_sample = start_sample + int(time_durs[i] * fs[i]) - x[i] = x[i][start_sample:end_sample] - - return x, fs + # def read(self, keys, time_offset=0, time_durs=0): + # """Reads the waveforms for the recordings in keys. + + # Args: + # keys: List of recording/segment_ids names. + + # Returns: + # data: List of waveforms + # fs: List of sampling freq. + # """ + # try: + # x, fs = self._read(keys, time_offset=time_offset, time_durs=time_durs) + # except: + # if isinstance(keys, str): + # keys = [keys] + + # if not isinstance(time_offset, (list, np.ndarray)): + # time_offset = [time_offset] * len(keys) + # if not isinstance(time_durs, (list, np.ndarray)): + # time_durs = [time_durs] * len(keys) + + # try: + # logging.info( + # ( + # "error-1 reading at keys={} offset={} " + # "retrying reading until end-of-file ..." + # ).format(keys, time_offset) + # ) + # x, fs = self._read(keys, time_offset=time_offset) + # for i in range(len(x)): + # end_sample = int(time_durs[i] * fs[i]) + # x[i] = x[i][:end_sample] + # except: + # # try to read the full file + # logging.info( + # ( + # "error-2 reading at key={}, " "retrying reading full file ..." 
+ # ).format(keys) + # ) + # x, fs = self._read(keys) + # for i in range(len(x)): + # start_sample = int(time_offset[i] * fs[i]) + # end_sample = start_sample + int(time_durs[i] * fs[i]) + # x[i] = x[i][start_sample:end_sample] + + # return x, fs @staticmethod def filter_args(**kwargs): - valid_args = ("wav_scale", ) + valid_args = ("wav_scale",) return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser, prefix=None): + def add_class_args(parser, prefix: Optional[str] = None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") parser.add_argument( "--wav-scale", - default=2**15 - 1, + default=1.0, + # default=2 ** 15 - 1, type=float, help=("multiplicative factor for waveform"), ) if prefix is not None: outer_parser.add_argument( - "--" + prefix, - action=ActionParser(parser=parser), + "--" + prefix, action=ActionParser(parser=parser), ) add_argparse_args = add_class_args diff --git a/hyperion/io/audio_writer.py b/hyperion/io/audio_writer.py index f98a3251..ca0dde9f 100644 --- a/hyperion/io/audio_writer.py +++ b/hyperion/io/audio_writer.py @@ -8,12 +8,16 @@ import numpy as np import soundfile as sf +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from typing import Union, Optional, List +from pathlib import Path from ..hyp_defs import float_cpu from ..utils.kaldi_io_funcs import is_token -from ..utils.scp_list import SCPList +from ..utils import PathLike from .audio_reader import valid_ext + subtype_to_npdtype = { "PCM_32": "int32", "ALAW": "int16", @@ -23,12 +27,33 @@ "DOUBLE": "float64", "MS_ADPCM": "int16", "ULAW": "int16", - "PCM_U8": "uint8", - "PCM_S8": "int8", + "PCM_S8": "int16", "VORBIS": "float32", "GSM610": "int16", "G721_32": "int16", - "PCM_24": "int24", + "PCM_24": "int32", +} + +scale_32 = 2 ** 31 - 1 +scale_24 = 2 ** 23 - 1 +scale_16 = 2 ** 15 - 1 +scale_8 = 2 ** 7 - 1 + + +subtype_to_scale = { + "PCM_32": scale_32, + "ALAW": scale_16, + "IMA_ADPCM": scale_16, + "FLOAT": 1, + "PCM_16": scale_16, + "DOUBLE": 1, + "MS_ADPCM": scale_16, + "ULAW": scale_16, + "PCM_S8": scale_8, + "VORBIS": 1, + "GSM610": scale_16, + "G721_32": scale_16, + "PCM_24": scale_24, } @@ -37,43 +62,56 @@ class AudioWriter(object): Attributes: output_path: output data file path. - script_path: optional output scp file. + script_path: optional output kaldi .scp or pandas .csv file. audio_format: audio file format audio_subtype: subtype of audio in [PCM_16, PCM_32, FLOAT, DOUBLE, ...], if None, it uses soundfile defaults (recommended) - scp_sep: Separator for scp files (default ' '). + wav_scale: scale of the input waveform """ def __init__( self, - output_path, - script_path=None, - audio_format="wav", - audio_subtype=None, - scp_sep=" ", + output_path: PathLike, + script_path: Optional[PathLike] = None, + audio_format: str = "wav", + audio_subtype: Optional[str] = None, + wav_scale: float = 1.0, ): - self.output_path = output_path - self.script_path = script_path + self.output_path = Path(output_path) + self.script_path = Path(script_path) if script_path is not None else None self.audio_format = audio_format - self.scp_sep = scp_sep + self.output_path.mkdir(exist_ok=True, parents=True) assert "." 
+ self.audio_format in valid_ext if audio_subtype is None: self.subtype = sf.default_subtype(self.audio_format) else: - self.subtype = audio_subtype + self.subtype = audio_subtype.upper() assert sf.check_format(self.audio_format, self.subtype) - if not os.path.exists(output_path): - try: - os.makedirs(output_path) - except FileExistsError: - pass + self._dtype = subtype_to_npdtype[self.subtype] + + self.wav_scale = wav_scale + # we multiply the audio for this number before saving it. + self._output_wav_scale = subtype_to_scale[self.subtype] / wav_scale + self.script_is_scp = False + self.script_sep = None + self.f_script = None if script_path is not None: - self.f_script = open(script_path, "w") - else: - self.f_script = None + self.script_path.parent.mkdir(exist_ok=True, parents=True) + script_ext = self.script_path.suffix + self.script_is_scp = script_ext == ".scp" + + if self.script_is_scp: + self.f_script = open(self.script_path, "w") + else: + self.script_sep = "," if script_ext == ".csv" else "\t" + self.f_script = open(self.script_path, "w", encoding="utf-8") + row = self.script_sep.join( + ["id", "storage_path", "duration", "sample_freq"] + ) + self.f_script.write(f"{row}\n") def __enter__(self): """Function required when entering contructions of type @@ -96,7 +134,12 @@ def close(self): if self.f_script is not None: self.f_script.close() - def write(self, keys, data, fs): + def write( + self, + keys: Union[str, List[str], np.array], + data: Union[np.array, List[np.array]], + fs: Union[int, float, List[int], List[float], np.array], + ): """Writes waveform to audio file. Args: @@ -109,8 +152,7 @@ def write(self, keys, data, fs): data = [data] fs_is_list = isinstance(fs, (list, np.ndarray)) - assert self.subtype in subtype_to_npdtype - dtype = subtype_to_npdtype[self.subtype] + output_files = [] for i, key_i in enumerate(keys): assert is_token(key_i), "Token %s not valid" % key_i @@ -120,14 +162,21 @@ def write(self, keys, data, fs): file_basename, self.audio_format, ) - fs_i = fs[i] if fs_is_list else fs - data_i = data[i].astype(dtype, copy=False) + fs_i = int(fs[i]) if fs_is_list else fs + data_i = (self._output_wav_scale * data[i]).astype(self._dtype, copy=False) sf.write(output_file, data_i, fs_i, subtype=self.subtype) output_files.append(output_file) if self.f_script is not None: - self.f_script.write("%s%s%s\n" % (key_i, self.scp_sep, output_file)) + if self.script_is_scp: + self.f_script.write(f"{key_i} {output_file}\n") + else: + duration_i = data_i.shape[-1] / fs_i + row = self.script_sep.join( + [key_i, output_file, str(duration_i), str(fs_i)] + ) + self.f_script.write(f"{row}\n") self.f_script.flush() return output_files @@ -135,40 +184,42 @@ def write(self, keys, data, fs): @staticmethod def filter_args(**kwargs): valid_args = ( - "output_fs", - "output_wav_scale", - "output_audio_format", - "output_audio_subtype", - ) - return dict( - (re.sub("output_", "", k), kwargs[k]) for k in valid_args if k in kwargs + "wav_scale", + "audio_format", + "audio_subtype", ) + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod def add_class_args(parser, prefix=None): - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." 
- - # parser.add_argument(p1+'output-wav-scale', default=1, type=float, - # help=('scale to divide the waveform before writing')) + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") parser.add_argument( - p1 + "output-audio-format", + "--audio-format", default="flac", choices=["flac", "ogg", "wav"], help=("ouput audio format"), ) parser.add_argument( - p1 + "output-audio-subtype", + "--audio-subtype", default=None, - choices=["pcm_16", "pcm_24", "float", "double", "vorbis"], + choices=["pcm_16", "pcm_24", "pcm_32", "float", "double", "vorbis"], help=("coding format for audio file"), ) - # parser.add_argument(p1+'output-fs', default=16000, type=int, - # help=('output sample frequency')) + try: + parser.add_argument( + "--wav-scale", default="1.0", help=("input waveform scale wrt 1"), + ) + except: + pass + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) add_argparse_args = add_class_args diff --git a/hyperion/io/bin_vad_reader.py b/hyperion/io/bin_vad_reader.py index e4e64777..8ce91d15 100644 --- a/hyperion/io/bin_vad_reader.py +++ b/hyperion/io/bin_vad_reader.py @@ -18,13 +18,12 @@ def __init__( self, rspecifier, path_prefix=None, - scp_sep=" ", frame_length=25, frame_shift=10, snip_edges=False, ): - r = DRF.create(rspecifier, path_prefix, scp_sep=scp_sep) + r = DRF.create(rspecifier, path_prefix) super().__init__(r.file_path, r.permissive) self.r = r self.frame_shift = frame_shift @@ -60,7 +59,7 @@ def read( vad = self.r.read(keys) output_vad = [] for i in range(len(keys)): - vad_i = vad[i].astype(np.bool, copy=False) + vad_i = vad[i].astype(bool, copy=False) offset_i = offset[i] if offset_is_list else offset num_frames_i = num_frames[i] if num_frames_is_list else num_frames vad_i = self._get_bin_vad_slice(vad_i, offset_i, num_frames_i) @@ -78,7 +77,7 @@ def read_timestamps(self, keys, merge_tol=0.001): vad = self.r.read(keys) ts = [] for i in range(len(keys)): - vad_i = vad[i].astype(np.bool, copy=False) + vad_i = vad[i].astype(bool, copy=False) ts_i = bin_vad_to_timestamps( vad_i, self.frame_length / 1000, diff --git a/hyperion/io/data_reader.py b/hyperion/io/data_reader.py index bbefa62d..73c120b5 100644 --- a/hyperion/io/data_reader.py +++ b/hyperion/io/data_reader.py @@ -6,18 +6,24 @@ import logging import multiprocessing from abc import ABCMeta, abstractmethod +from typing import Union, Optional, List, Callable, Tuple import numpy as np from ..hyp_defs import float_cpu from ..np.transforms import TransformList -from ..utils.scp_list import SCPList +from ..utils import PathLike class DataReader(object): __metaclass__ = ABCMeta - def __init__(self, file_path, transform=None, permissive=False): + def __init__( + self, + file_path: PathLike, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, + ): """Abstract base class to read Ark or hdf5 feature files. Attributes: @@ -57,7 +63,7 @@ def close(self): pass @staticmethod - def _squeeze(data, permissive=False): + def _squeeze(data: np.array, permissive: bool = False): """Converts list of matrices to 3D numpy array or list of vectors to 2D numpy array. @@ -121,7 +127,7 @@ def _combine_ranges(read_range, row_offset, num_rows): return row_offset, num_rows @staticmethod - def _apply_range_to_shape(shape, row_offset, num_rows): + def _apply_range_to_shape(shape: Tuple[int, int], row_offset: int, num_rows: int): """Modifies shape given the user defined row_offset and num_rows to read. 
If we are reading a matrix of shape (100,4) and row_offset=10, num_rows=20, it returns (20,4). @@ -158,25 +164,22 @@ class SequentialDataReader(DataReader): part_idx: It splits the input into num_parts and writes only part part_idx, where part_idx=1,...,num_parts. num_parts: Number of parts to split the input data. - split_by_key: If True, all the elements with the same key go to the same part. """ __metaclass__ = ABCMeta def __init__( self, - file_path, - transform=None, - permissive=False, - part_idx=1, - num_parts=1, - split_by_key=False, + file_path: PathLike, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, + part_idx: int = 1, + num_parts: int = 1, ): super().__init__(file_path, transform, permissive) self.lock = multiprocessing.Lock() self.part_idx = part_idx self.num_parts = num_parts - self.split_by_key = split_by_key def __iter__(self): """Needed to build an iterator, e.g.: @@ -218,7 +221,7 @@ def eof(self): return False @abstractmethod - def read_num_rows(self, num_records=0, assert_same_dim=True): + def read_num_rows(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -234,7 +237,7 @@ def read_num_rows(self, num_records=0, assert_same_dim=True): pass @abstractmethod - def read_dims(self, num_records=0, assert_same_dim=True): + def read_dims(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -250,7 +253,7 @@ def read_dims(self, num_records=0, assert_same_dim=True): pass @abstractmethod - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -266,7 +269,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): pass @abstractmethod - def read(self, num_records=0, squeeze=False, offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. Args: @@ -290,7 +299,12 @@ def read(self, num_records=0, squeeze=False, offset=0, num_rows=0): class RandomAccessDataReader(DataReader): __metaclass__ = ABCMeta - def __init__(self, file_path, transform=None, permissive=False): + def __init__( + self, + file_path: PathLike, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, + ): """Abstract base class to read Ark or hdf5 feature files in random order. @@ -305,7 +319,7 @@ def __init__(self, file_path, transform=None, permissive=False): super().__init__(file_path, transform, permissive) @abstractmethod - def read_num_rows(self, keys=None, assert_same_dim=True): + def read_num_rows(self, keys: Union[str, List[str]], assert_same_dim: bool = True): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -320,7 +334,7 @@ def read_num_rows(self, keys=None, assert_same_dim=True): pass @abstractmethod - def read_dims(self, keys=None, assert_same_dim=True): + def read_dims(self, keys: Union[str, List[str]], assert_same_dim: bool = True): """Reads the number of columns in the feature matrices of the dataset. 
Args: @@ -335,7 +349,7 @@ def read_dims(self, keys=None, assert_same_dim=True): pass @abstractmethod - def read_shapes(self, keys=None, assert_same_dim=True): + def read_shapes(self, keys: Union[str, List[str]], assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -350,7 +364,13 @@ def read_shapes(self, keys=None, assert_same_dim=True): pass @abstractmethod - def read(self, keys, squeeze=False, offset=0, num_rows=0): + def read( + self, + keys: Union[str, List[str]], + squeeze: bool = False, + offset: int = 0, + num_rows: int = 0, + ): """Reads the feature matrices/vectors for the recordings in keys. Args: diff --git a/hyperion/io/data_rw_factory.py b/hyperion/io/data_rw_factory.py index 7868baae..092f5549 100644 --- a/hyperion/io/data_rw_factory.py +++ b/hyperion/io/data_rw_factory.py @@ -4,10 +4,13 @@ """ import logging +from typing import Union, Optional, List, Callable, Tuple from jsonargparse import ActionParser, ArgumentParser +import numpy as np from ..utils.kaldi_matrix import compression_methods +from ..utils import PathLike from .ark_data_reader import RandomAccessArkDataReader as RADR from .ark_data_reader import SequentialArkFileDataReader as SAFDR from .ark_data_reader import SequentialArkScriptDataReader as SASDR @@ -17,8 +20,7 @@ from .h5_data_reader import SequentialH5FileDataReader as SH5FDR from .h5_data_reader import SequentialH5ScriptDataReader as SH5SDR from .h5_data_writer import H5DataWriter as H5DW -from .rw_specifiers import (ArchiveType, RSpecifier, RSpecType, WSpecifier, - WSpecType) +from .rw_specifiers import ArchiveType, RSpecifier, RSpecType, WSpecifier, WSpecType class DataWriterFactory(object): @@ -27,7 +29,12 @@ class DataWriterFactory(object): """ @staticmethod - def create(wspecifier, compress=False, compression_method="auto", scp_sep=" "): + def create( + wspecifier: PathLike, + compress: bool = False, + compression_method: str = "auto", + metadata_columns: Optional[List[str]] = None, + ): if isinstance(wspecifier, str): wspecifier = WSpecifier.create(wspecifier) @@ -43,7 +50,7 @@ def create(wspecifier, compress=False, compression_method="auto", scp_sep=" "): flush=wspecifier.flush, compress=compress, compression_method=compression_method, - scp_sep=scp_sep, + metadata_columns=metadata_columns, ) else: return ADW( @@ -53,21 +60,20 @@ def create(wspecifier, compress=False, compression_method="auto", scp_sep=" "): flush=wspecifier.flush, compress=compress, compression_method=compression_method, - scp_sep=scp_sep, + metadata_columns=metadata_columns, ) @staticmethod def filter_args(**kwargs): - valid_args = ("scp_sep", "compress", "compression_method") + valid_args = ("compress", "compression_method") return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser, prefix=None): + def add_class_args(parser, prefix: Optional[PathLike] = None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") - parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) parser.add_argument("--compress", default=False, action="store_true") parser.add_argument( "--compression-method", default="auto", choices=compression_methods @@ -75,12 +81,11 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='data writer options') class SequentialDataReaderFactory(object): @staticmethod - def create(rspecifier, path_prefix=None, scp_sep=" ", **kwargs): + 
def create(rspecifier: PathLike, path_prefix: Optional[PathLike] = None, **kwargs): if isinstance(rspecifier, str): rspecifier = RSpecifier.create(rspecifier) @@ -92,27 +97,21 @@ def create(rspecifier, path_prefix=None, scp_sep=" ", **kwargs): return SAFDR(rspecifier.archive, **kwargs) else: if rspecifier.archive_type == ArchiveType.H5: - return SH5SDR(rspecifier.script, path_prefix, scp_sep=scp_sep, **kwargs) + return SH5SDR(rspecifier.script, path_prefix, **kwargs) else: - return SASDR(rspecifier.script, path_prefix, scp_sep=scp_sep, **kwargs) + return SASDR(rspecifier.script, path_prefix, **kwargs) @staticmethod def filter_args(**kwargs): - valid_args = ("scp_sep", "path_prefix", "part_idx", "num_parts") + valid_args = ("path_prefix", "part_idx", "num_parts") return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser, prefix=None): + def add_class_args(parser, prefix: Optional[PathLike] = None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") - try: - parser.add_argument( - "--scp-sep", default=" ", help=("scp file field separator") - ) - except: - pass parser.add_argument( "--path-prefix", default=None, help=("scp file_path prefix") ) @@ -139,7 +138,11 @@ def add_class_args(parser, prefix=None): class RandomAccessDataReaderFactory(object): @staticmethod - def create(rspecifier, path_prefix=None, transform=None, scp_sep=" "): + def create( + rspecifier: PathLike, + path_prefix: Optional[PathLike] = None, + transform: Optional[Callable[[np.array], np.array]] = None, + ): if isinstance(rspecifier, str): rspecifier = RSpecifier.create(rspecifier) logging.debug(rspecifier.__dict__) @@ -162,7 +165,6 @@ def create(rspecifier, path_prefix=None, transform=None, scp_sep=" "): path_prefix, transform=transform, permissive=rspecifier.permissive, - scp_sep=scp_sep, ) else: return RADR( @@ -170,26 +172,19 @@ def create(rspecifier, path_prefix=None, transform=None, scp_sep=" "): path_prefix, transform=transform, permissive=rspecifier.permissive, - scp_sep=scp_sep, ) @staticmethod def filter_args(**kwargs): - valid_args = ("scp_sep", "path_prefix") + valid_args = "path_prefix" return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser, prefix=None): + def add_class_args(parser, prefix: Optional[PathLike] = None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") - try: - parser.add_argument( - "--scp-sep", default=" ", help=("scp file field separator") - ) - except: - pass parser.add_argument( "--path-prefix", default=None, help=("scp file_path prefix") ) diff --git a/hyperion/io/data_writer.py b/hyperion/io/data_writer.py index cf2bb4f9..ff35ef2a 100644 --- a/hyperion/io/data_writer.py +++ b/hyperion/io/data_writer.py @@ -5,9 +5,14 @@ import os from abc import ABCMeta, abstractmethod +from typing import Union, Optional, List, Dict +from pathlib import Path +import numpy as np +import pandas as pd +from ..utils import PathLike -class DataWriter(object): +class DataWriter: """Abstract base class to write Ark or hdf5 feature files. Attributes: @@ -19,35 +24,42 @@ class DataWriter(object): {auto (default), speech_feat, 2byte-auto, 2byte-signed-integer, 1byte-auto, 1byte-unsigned-integer, 1byte-0-1}. - scp_sep: Separator for scp files (default ' '). 
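
To make the new script handling concrete, a writing sketch assuming the DataWriter/ArkDataWriter behavior in this patch, where a script path ending in .csv selects the table format and metadata_columns adds extra columns to it; the paths, key, and the num_frames column are placeholders.

```
# Writer sketch (assumed API; paths, key and metadata column are placeholders).
import numpy as np

from hyperion.io.ark_data_writer import ArkDataWriter

with ArkDataWriter(
    "exp/feats/feats.ark",
    script_path="exp/feats/feats.csv",
    metadata_columns=["num_frames"],
) as writer:
    x = np.random.randn(100, 80).astype("float32")
    # The .csv script row gets id, storage_path, storage_byte plus num_frames.
    writer.write(["utt-0001"], [x], metadata={"num_frames": [x.shape[0]]})
```
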
""" __metaclass__ = ABCMeta def __init__( self, - archive_path, - script_path=None, - flush=False, - compress=False, - compression_method="auto", - scp_sep=" ", + archive_path: PathLike, + script_path: Optional[PathLike] = None, + flush: bool = False, + compress: bool = False, + compression_method: str = "auto", + metadata_columns: Optional[List[str]] = None, ): - self.archive_path = archive_path - self.script_path = script_path + self.archive_path = Path(archive_path) + self.script_path = Path(script_path) if script_path is not None else None self._flush = flush self.compress = compress self.compression_method = compression_method - self.scp_sep = scp_sep + self.metadata_columns = metadata_columns - archive_dir = os.path.dirname(archive_path) - if not os.path.exists(archive_dir): - os.makedirs(archive_dir) + archive_dir = self.archive_path.parent + archive_dir.mkdir(exist_ok=True, parents=True) + self.script_is_scp = False + self.script_sep = None + self.f_script = None if script_path is not None: - script_dir = os.path.dirname(script_path) - if not os.path.exists(script_dir): - os.makedirs(script_dir) + self.script_path.parent.mkdir(exist_ok=True, parents=True) + script_ext = self.script_path.suffix + self.script_is_scp = script_ext == ".scp" + + if self.script_is_scp: + self.f_script = open(self.script_path, "w") + else: + self.script_sep = "," if script_ext == ".csv" else "\t" + self.f_script = open(self.script_path, "w", encoding="utf-8") def __enter__(self): """Function required when entering contructions of type @@ -76,8 +88,38 @@ def flush(self): """Flushes the file""" pass + def standardize_write_args( + self, + keys: Union[str, List[str], np.array], + data: Union[np.array, List[np.array]], + metadata: Optional[Union[pd.DataFrame, Dict]] = None, + ): + if isinstance(keys, str): + keys = [keys] + data = [data] + + if metadata is not None: + if isinstance(metadata, pd.DataFrame): + metadata = metadata.to_dict() + + metadata_list = [] + for c in self.metadata_columns: + m_c = metadata[c] + if not isinstance(m_c, (list, np.ndarray)): + m_c = [m_c] + metadata_list.append(m_c) + + metadata = metadata_list + + return keys, data, metadata + @abstractmethod - def write(self, key, data): + def write( + self, + keys: Union[str, List[str], np.array], + data: Union[np.array, List[np.array]], + metadata: Optional[Union[pd.DataFrame, Dict]] = None, + ): """Writes data to file. Args: @@ -86,5 +128,6 @@ def write(self, key, data): If all the matrices have the same dimension it can be a 3D numpy array. If they are vectors, it can be a 2D numpy array. 
+ metadata: dictionary/DataFrame with metadata """ pass diff --git a/hyperion/io/h5_data_reader.py b/hyperion/io/h5_data_reader.py index dfefbec3..d509504d 100644 --- a/hyperion/io/h5_data_reader.py +++ b/hyperion/io/h5_data_reader.py @@ -6,8 +6,8 @@ """ import multiprocessing -import sys import time +from typing import Union, Optional, List, Callable, Tuple import h5py import numpy as np @@ -16,11 +16,18 @@ from ..utils.kaldi_io_funcs import is_token from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix from ..utils.list_utils import split_list, split_list_group_by_key -from ..utils.scp_list import SCPList + +# from ..utils.scp_list import SCPList +from ..utils import FeatureSet, PathLike from .data_reader import RandomAccessDataReader, SequentialDataReader -def _read_h5_data(dset, row_offset=0, num_rows=0, transform=None): +def _read_h5_data( + dset, + row_offset: int = 0, + num_rows: int = 0, + transform: Optional[Callable[[np.array], np.array]] = None, +): """Auxiliary function to read the feature matrix from hdf5 dataset. It decompresses the data if it was compressed. @@ -74,7 +81,7 @@ class SequentialH5DataReader(SequentialDataReader): split_by_key: If True, all the elements with the same key go to the same part. """ - def __init__(self, file_path, **kwargs): + def __init__(self, file_path: PathLike, **kwargs): super().__init__(file_path, **kwargs) self.f = None self.cur_file = None @@ -86,7 +93,7 @@ def close(self): self.f.close() self.f = None - def _open_archive(self, file_path): + def _open_archive(self, file_path: PathLike): """Opens the hdf5 file where the next matrix/vector is if it is not open. If there was another hdf5 file open, it closes it. @@ -96,7 +103,7 @@ def _open_archive(self, file_path): self.cur_file = file_path self.f = h5py.File(file_path, "r") - def read_num_rows(self, num_records=0, assert_same_dim=True): + def read_num_rows(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -113,7 +120,7 @@ def read_num_rows(self, num_records=0, assert_same_dim=True): num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=int) return keys, num_rows - def read_dims(self, num_records=0, assert_same_dim=True): + def read_dims(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -147,7 +154,7 @@ class SequentialH5FileDataReader(SequentialH5DataReader): split_by_key: If True, all the elements with the same key go to the same part. """ - def __init__(self, file_path, **kwargs): + def __init__(self, file_path: PathLike, **kwargs): super().__init__(file_path, permissive=False, **kwargs) self._open_archive(self.file_path) self._keys = list(self.f.keys()) @@ -172,7 +179,7 @@ def eof(self): """Returns True when it reaches the end of the ark file.""" return self.cur_item == len(self._keys) - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -204,7 +211,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): return keys, shapes - def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. 
Args: @@ -225,12 +238,8 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): if num_records == 0: num_records = len(self._keys) - self.cur_item - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) keys = [] data = [] with self.lock: @@ -268,7 +277,6 @@ class SequentialH5ScriptDataReader(SequentialH5DataReader): the scp file. This is useful when data is read from a different directory of that it was created. - scp_sep: Separator for scp files (default ' '). transform: TransformList object, applies a transformation to the features after reading them from disk. part_idx: It splits the input into num_parts and writes only @@ -277,20 +285,20 @@ class SequentialH5ScriptDataReader(SequentialH5DataReader): split_by_key: If True, all the elements with the same key go to the same part. """ - def __init__(self, file_path, path_prefix=None, scp_sep=" ", **kwargs): + def __init__( + self, file_path: PathLike, path_prefix: Optional[PathLike] = None, **kwargs + ): super().__init__(file_path, permissive=False, **kwargs) - self.scp = SCPList.load(self.file_path, sep=scp_sep) + self.feature_set = FeatureSet.load(self.file_path) if self.num_parts > 1: - self.scp = self.scp.split( - self.part_idx, self.num_parts, group_by_key=self.split_by_key - ) + self.feature_set = self.feature_set.split(self.part_idx, self.num_parts) if path_prefix is not None: - self.scp.add_prefix_to_filepath(path_prefix) + self.feature_set.add_prefix_to_storage_path(path_prefix) @property def keys(self): - return self.scp.key + return self.feature_set["id"] def reset(self): """Closes all the open hdf5 files and puts the read pointer pointing @@ -300,9 +308,9 @@ def reset(self): def eof(self): """Returns True when all the elements in the scp have been read.""" - return self.cur_item == len(self.scp) + return self.cur_item == len(self.feature_set) - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -316,7 +324,7 @@ def read_shapes(self, num_records=0, assert_same_dim=True): List of tuples with num_records shapes. 
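
And a shape-inspection sketch for the script-based HDF5 reader, assuming the FeatureSet-backed constructor and read_shapes() behavior shown here; the xvector.csv path is a placeholder.

```
# Shape inspection sketch (assumed API; the script path is a placeholder).
from hyperion.io.h5_data_reader import SequentialH5ScriptDataReader

reader = SequentialH5ScriptDataReader("exp/xvectors/xvector.csv")
keys, shapes = reader.read_shapes()  # one shape tuple per matrix in the set
for key, shape in zip(keys, shapes):
    print(key, shape)
reader.close()
```
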
""" if num_records == 0: - num_records = len(self.scp) - self.cur_item + num_records = len(self.feature_set) - self.cur_item keys = [] shapes = [] @@ -324,14 +332,15 @@ def read_shapes(self, num_records=0, assert_same_dim=True): if self.eof(): break - key, file_path, offset, range_spec = self.scp[self.cur_item] - - row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) - - self._open_archive(file_path) + feature_spec = self.feature_set.iloc[self.cur_item] + key = feature_spec["id"] + self._open_archive(feature_spec["storage_path"]) shape_i = self.f[key].shape - shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) keys.append(key) shapes.append(shape_i) @@ -343,7 +352,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): return keys, shapes - def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. Args: @@ -362,14 +377,10 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): data: List of feature matrices/vectors or 3D/2D numpy array. """ if num_records == 0: - num_records = len(self.scp) - self.cur_item + num_records = len(self.feature_set) - self.cur_item - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) keys = [] data = [] @@ -378,7 +389,13 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): if self.eof(): break - key, file_path, offset, range_spec = self.scp[self.cur_item] + feature_spec = self.feature_set.iloc[self.cur_item] + key = feature_spec["id"] + file_path = feature_spec["storage_path"] + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + else: + range_spec = None row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows @@ -413,11 +430,18 @@ class RandomAccessH5DataReader(RandomAccessDataReader): it returns an empty matrix, if False it raises an exception. """ - def __init__(self, file_path, transform=None, permissive=False): + def __init__( + self, + file_path: PathLike, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, + ): super().__init__(file_path, transform, permissive) self.f = None - def read_num_rows(self, keys, assert_same_dim=True): + def read_num_rows( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -433,7 +457,9 @@ def read_num_rows(self, keys, assert_same_dim=True): num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=int) return num_rows - def read_dims(self, keys, assert_same_dim=True): + def read_dims( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the number of columns in the feature matrices of the dataset. 
Args: @@ -463,7 +489,7 @@ class RandomAccessH5FileDataReader(RandomAccessH5DataReader): it returns an empty matrix, if False it raises an exception. """ - def __init__(self, file_path, **kwargs): + def __init__(self, file_path: PathLike, **kwargs): super().__init__(file_path, **kwargs) self.lock = multiprocessing.Lock() self._open_archive(file_path) @@ -474,7 +500,7 @@ def close(self): self.f.close() self.f = None - def _open_archive(self, file_path): + def _open_archive(self, file_path: PathLike): """Open the hdf5 file it it is not open.""" if self.f is None: self.close() @@ -484,7 +510,9 @@ def _open_archive(self, file_path): def keys(self): return list(self.f.keys()) - def read_shapes(self, keys, assert_same_dim=True): + def read_shapes( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the shapes in the feature matrices of the dataset. Args: @@ -518,7 +546,13 @@ def read_shapes(self, keys, assert_same_dim=True): return shapes - def read(self, keys, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + keys: Union[str, List[str], np.array], + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads the feature matrices/vectors for the recordings in keys. Args: @@ -539,12 +573,8 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): if isinstance(keys, str): keys = [keys] - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) if row_offset_is_list: assert len(row_offset) == len(keys) if num_rows_is_list: @@ -589,17 +619,20 @@ class RandomAccessH5ScriptDataReader(RandomAccessH5DataReader): features after reading them from disk. permissive: If True, if the data that we want to read is not in the file it returns an empty matrix, if False it raises an exception. - scp_sep: Separator for scp files (default ' '). """ - def __init__(self, file_path, path_prefix=None, scp_sep=" ", **kwargs): + def __init__( + self, file_path: PathLike, path_prefix: Optional[PathLike] = None, **kwargs + ): super().__init__(file_path, **kwargs) - self.scp = SCPList.load(self.file_path, sep=scp_sep) + self.feature_set = FeatureSet.load(self.file_path) if path_prefix is not None: - self.scp.add_prefix_to_filepath(path_prefix) + self.feature_set.add_prefix_to_storage_path(path_prefix) - archives, archive_idx = np.unique(self.scp.file_path, return_inverse=True) + archives, archive_idx = np.unique( + self.feature_set["storage_path"], return_inverse=True + ) self.archives = archives self.archive_idx = archive_idx self.f = [None] * len(self.archives) @@ -614,9 +647,9 @@ def close(self): @property def keys(self): - return self.scp.key + return self.feature_set["id"] - def _open_archive(self, key_idx): + def _open_archive(self, key_idx: int): """Opens the hdf5 file correspoding to a given feature/matrix if it is not already open. @@ -633,7 +666,9 @@ def _open_archive(self, key_idx): return self.f[archive_idx], self.locks[archive_idx] - def read_shapes(self, keys, assert_same_dim=True): + def read_shapes( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the shapes in the feature matrices of the dataset. 
Args: @@ -651,18 +686,15 @@ def read_shapes(self, keys, assert_same_dim=True): shapes = [] for key in keys: - if not (key in self.scp): + if not (key in self.feature_set.index): if self.permissive: shapes.append((0,)) continue else: raise Exception("Key %s not found" % key) - index = self.scp.get_index(key) - _, file_path, offset, range_spec = self.scp[index] - - row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) - + index = self.feature_set.get_loc(key) + feature_spec = self.feature_set.loc[key] f, lock = self._open_archive(index) if not (key in f): if self.permissive: @@ -673,8 +705,12 @@ def read_shapes(self, keys, assert_same_dim=True): with lock: shape_i = f[key].shape - shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) - # print('%s %d %.2f' % (key,time.time()-t1, len(shapes)/len(keys)*100.)) + + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) + shapes.append(shape_i) if assert_same_dim: @@ -683,7 +719,13 @@ def read_shapes(self, keys, assert_same_dim=True): return shapes - def read(self, keys, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + keys: Union[str, List[str], np.array], + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads the feature matrices/vectors for the recordings in keys. Args: @@ -704,12 +746,8 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): if isinstance(keys, str): keys = [keys] - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) if row_offset_is_list: assert len(row_offset) == len(keys) if num_rows_is_list: @@ -718,15 +756,19 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): data = [] for i, key in enumerate(keys): - if not (key in self.scp): + if not (key in self.feature_set.index): if self.permissive: data.append(np.array([], dtype=float_cpu())) continue else: raise Exception("Key %s not found" % key) - index = self.scp.get_index(key) - _, file_path, offset, range_spec = self.scp[index] + index = self.feature_set.get_loc(key) + feature_spec = self.feature_set.loc[key] + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + else: + range_spec = None row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows diff --git a/hyperion/io/h5_data_writer.py b/hyperion/io/h5_data_writer.py index fed91d1e..4d05f963 100644 --- a/hyperion/io/h5_data_writer.py +++ b/hyperion/io/h5_data_writer.py @@ -3,15 +3,16 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +from typing import Union, Optional, List, Dict import h5py import numpy as np +import pandas as pd from ..hyp_defs import float_save from ..utils.kaldi_io_funcs import is_token from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix -from ..utils.scp_list import SCPList +from ..utils import PathLike from .data_writer import DataWriter @@ -27,18 +28,21 @@ class H5DataWriter(DataWriter): {auto (default), speech_feat, 2byte-auto, 2byte-signed-integer, 
1byte-auto, 1byte-unsigned-integer, 1byte-0-1}. - scp_sep: Separator for scp files (default ' '). """ - def __init__(self, archive_path, script_path=None, **kwargs): + def __init__( + self, archive_path: PathLike, script_path: Optional[PathLike] = None, **kwargs + ): super().__init__(archive_path, script_path, **kwargs) self.f = h5py.File(archive_path, "w") - if script_path is None: - self.f_script = None - else: - self.f_script = open(script_path, "w") + if script_path is not None and not self.script_is_scp: + columns = ["id", "storage_path"] + if self.metadata_columns is not None: + columns += self.metadata_columns + row = self.script_sep.join(columns) + self.f_script.write(f"{row}\n") def __exit__(self, exc_type, exc_value, traceback): """Function required when exiting from contructions of type @@ -64,7 +68,7 @@ def flush(self): if self.f_script is not None: self.f_script.flush() - def _convert_data(self, data): + def _convert_data(self, data: np.array): """Converts data to the format for saving. Compresses the data it needed. Args: @@ -85,7 +89,12 @@ def _convert_data(self, data): else: raise ValueError("Data is not ndarray") - def write(self, keys, data): + def write( + self, + keys: Union[str, List[str], np.array], + data: Union[np.array, List[np.array]], + metadata: Optional[Union[pd.DataFrame, Dict]] = None, + ): """Writes data to file. Args: @@ -95,9 +104,7 @@ def write(self, keys, data): it can be a 3D numpy array. If they are vectors, it can be a 2D numpy array. """ - if isinstance(keys, str): - keys = [keys] - data = [data] + keys, data, metadata = self.standardize_write_args(keys, data, metadata) for i, key_i in enumerate(keys): assert is_token(key_i), "Token %s not valid" % key_i @@ -108,9 +115,15 @@ def write(self, keys, data): dset.attrs[k] = v if self.f_script is not None: - self.f_script.write( - "%s%s%s\n" % (key_i, self.scp_sep, self.archive_path) - ) + if self.script_is_scp: + self.f_script.write(f"{key_i} {self.archive_path}\n") + else: + columns = [key_i, str(self.archive_path)] + if metadata is not None: + metadata_i = [str(m[i]) for m in metadata] + columns += metadata_i + row = self.script_sep.join(columns) + self.f_script.write(f"{row}\n") if self._flush: self.flush() diff --git a/hyperion/io/hyp_data_reader.py b/hyperion/io/hyp_data_reader.py index 575c3087..63d463fb 100644 --- a/hyperion/io/hyp_data_reader.py +++ b/hyperion/io/hyp_data_reader.py @@ -76,9 +76,8 @@ def read_random_slice(self, key, num_samples, rng, field=""): dataset = key + field assert dataset in self.f, "Dataset %s not found" % dataset num_rows = self.f[dataset].shape[0] - # print('hola',num_rows,num_samples,num_rows-num_samples) - # index = rng.random_integers(low=0, high=num_rows-num_samples, size=1)[0] - index = rng.randint(low=0, high=num_rows - num_samples + 1) + + index = rng.integers(low=0, high=num_rows - num_samples + 1) X = self.f[dataset][index : index + num_samples] return X, index diff --git a/hyperion/io/old_audio_reader.py b/hyperion/io/old_audio_reader.py new file mode 100644 index 00000000..341f04a4 --- /dev/null +++ b/hyperion/io/old_audio_reader.py @@ -0,0 +1,477 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import io +import logging +import math +import os +import subprocess + +import numpy as np +import soundfile as sf +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ..hyp_defs import float_cpu +from ..utils import SCPList, SegmentList + +valid_ext = [ 
+ ".wav", + ".flac", + ".ogg", + ".au", + ".avr", + ".caf", + ".htk", + ".iff", + ".mat", + ".mpc", + ".oga", + ".pvf", + ".rf64", + ".sd2", + ".sds", + ".sf", + ".voc", + "w64", + ".wve", + ".xi", +] + + +class AudioReader(object): + """Class to read audio files from wav, flac or pipe + + Attributes: + file_path: scp file with formant file_key wavspecifier (audio_file/pipe) or SCPList object. + segments_path: segments file with format: segment_id file_id tbeg tend + wav_scale: multiplies signal by scale factor + """ + + def __init__(self, file_path, segments_path=None, wav_scale=2 ** 15 - 1): + self.file_path = file_path + if isinstance(file_path, SCPList): + self.scp = file_path + else: + self.scp = SCPList.load(file_path, sep=" ", is_wav=True) + + self.segments_path = segments_path + if segments_path is None: + self.segments = None + self.with_segments = False + else: + self.with_segments = True + if isinstance(file_path, SegmentList): + self.segments = segments_path + else: + self.segments = SegmentList.load( + segments_path, sep=" ", index_by_file=False + ) + + self.wav_scale = wav_scale + + @property + def keys(self): + if self.with_segments: + return np.asarray(self.segments["segment_id"]) + return self.scp.key + + def __enter__(self): + """Function required when entering contructions of type + + with AudioReader('file.h5') as f: + keys, data = f.read() + """ + return self + + def __exit__(self, exc_type, exc_value, traceback): + """Function required when exiting from contructions of type + + with AudioReader('file.h5') as f: + keys, data = f.read() + """ + pass + + @staticmethod + def read_wavspecifier(wavspecifier, scale=2 ** 15, time_offset=0, time_dur=0): + """Reads an audiospecifier (audio_file/pipe) + It reads from pipe or from all the files that can be read + by `libsndfile ` + + Args: + wavspecifier: A pipe, wav, flac, ogg file etc. + scale: Multiplies signal by scale factor + time_offset: float indicating the start time to read in the utterance. 
+ time_durs: floats indicating the number of seconds to read from the utterance, + if 0 it reads untils the end + + """ + wavspecifier = wavspecifier.strip() + if wavspecifier[-1] == "|": + wavspecifier = wavspecifier[:-1] + x, fs = AudioReader.read_pipe(wavspecifier, scale) + if time_offset == 0 and time_dur == 0: + return x, fs + + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + if num_samples == 0: + return x[start_sample:], fs + + end_sample = start_sample + num_samples + assert end_sample <= len(x) + return x[start_sample:end_sample], fs + + ext = os.path.splitext(wavspecifier)[1] + if ext in valid_ext: + if time_offset == 0 and time_dur == 0: + x, fs = sf.read(wavspecifier, dtype=float_cpu()) + x *= scale + return x, fs + + with sf.SoundFile(wavspecifier, "r") as f: + fs = f.samplerate + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + f.seek(start_sample) + if num_samples > 0: + x = scale * f.read(num_samples, dtype=float_cpu()) + else: + x = scale * f.read(dtype=float_cpu()) + return x, fs + + raise Exception("Unknown format for %s" % (wavspecifier)) + + @staticmethod + def read_pipe(wavspecifier, scale=2 ** 15): + """Reads wave file from a pipe + Args: + wavspecifier: Shell command with pipe output + scale: Multiplies signal by scale factor + """ + # proc = subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + proc = subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE) + pipe = proc.communicate()[0] + if proc.returncode != 0: + raise Exception( + "Wave read pipe command %s returned code %d" + % (wavspecifier, proc.returncode) + ) + x, fs = sf.read(io.BytesIO(pipe), dtype=float_cpu()) + x *= scale + return x, fs + + def _read_segment(self, segment, time_offset=0, time_dur=0): + """Reads a wave segment + + Args: + segment: pandas DataFrame (segment_id , file_id, tbeg, tend) + Returns: + Wave, sampling frequency + """ + file_id = segment["file_id"] + t_beg = segment["tbeg"] + time_offset + t_end = segment["tend"] + if time_dur > 0: + t_end_new = t_beg + time_dur + assert t_end_new <= t_end + t_end = t_end_new + + file_path, _, _ = self.scp[file_id] + x_i, fs_i = self.read_wavspecifier(file_path, self.wav_scale) + num_samples_i = len(x_i) + s_beg = int(t_beg * fs_i) + if s_beg >= num_samples_i: + raise Exception( + "segment %s tbeg=%.2f (num_sample=%d) longer that wav file %s (num_samples=%d)" + % (file_id, t_beg, s_beg, file_id, num_samples_i) + ) + + s_end = int(t_end * fs_i) + if s_end > num_samples_i or t_end < 0: + s_end = num_samples_i + + x_i = x_i[s_beg:s_end] + return x_i, fs_i + + def read(self): + pass + + +class SequentialAudioReader(AudioReader): + def __init__( + self, + file_path, + segments_path=None, + wav_scale=2 ** 15 - 1, + part_idx=1, + num_parts=1, + ): + super().__init__(file_path, segments_path, wav_scale=wav_scale) + self.cur_item = 0 + self.part_idx = part_idx + self.num_parts = num_parts + if self.num_parts > 1: + if self.with_segments: + self.segments = self.segments.split(self.part_idx, self.num_parts) + else: + self.scp = self.scp.split( + self.part_idx, self.num_parts, group_by_key=False + ) + + def __iter__(self): + """Needed to build an iterator, e.g.: + r = SequentialAudioReader(...) + for key, s, fs in r: + print(key) + process(s) + """ + return self + + def __next__(self): + """Needed to build an iterator, e.g.: + r = SequentialAudioReader(...) 
+ for key , s, fs in r: + process(s) + """ + key, x, fs = self.read(1) + if len(key) == 0: + raise StopIteration + return key[0], x[0], fs[0] + + def next(self): + """__next__ for Python 2""" + return self.__next__() + + def reset(self): + """Returns the file pointer to the begining of the dataset, + then we can start reading the features again. + """ + self.cur_item = 0 + + def eof(self): + """End of file. + + Returns: + True, when we have read all the recordings in the dataset. + """ + if self.with_segments: + return self.cur_item == len(self.segments) + return self.cur_item == len(self.scp) + + def read(self, num_records=0, time_offset=0, time_durs=0): + """Reads next num_records audio files + + Args: + num_records: Number of audio files to read. + time_offset: List of floats indicating the start time to read in the utterance. + time_durs: List of floats indicating the number of seconds to read from each utterance + + Returns: + key: List of recording names. + data: List of waveforms + fs: list of sample freqs + """ + if num_records == 0: + if self.with_segments: + num_records = len(self.segments) - self.cur_item + else: + num_records = len(self.scp) - self.cur_item + + offset_is_list = isinstance(time_offset, (list, np.ndarray)) + dur_is_list = isinstance(time_durs, (list, np.ndarray)) + + keys = [] + data = [] + fs = [] + for i in range(num_records): + if self.eof(): + break + + offset_i = time_offset[i] if offset_is_list else time_offset + dur_i = time_durs[i] if dur_is_list else time_durs + + if self.with_segments: + segment = self.segments[self.cur_item] + key = segment["segment_id"] + x_i, fs_i = self._read_segment(segment, offset_i, dur_i) + else: + key, file_path, _, _ = self.scp[self.cur_item] + x_i, fs_i = self.read_wavspecifier( + file_path, self.wav_scale, offset_i, dur_i + ) + + keys.append(key) + data.append(x_i) + fs.append(fs_i) + self.cur_item += 1 + + return keys, data, fs + + @staticmethod + def filter_args(**kwargs): + valid_args = ("part_idx", "num_parts", "wav_scale") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--wav-scale", + default=2 ** 15 - 1, + type=float, + help=("multiplicative factor for waveform"), + ) + try: + parser.add_argument( + "--part-idx", + type=int, + default=1, + help=( + "splits the list of files into num-parts and " "processes part-idx" + ), + ) + parser.add_argument( + "--num-parts", + type=int, + default=1, + help=( + "splits the list of files into num-parts and " "processes part-idx" + ), + ) + except: + pass + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) + + add_argparse_args = add_class_args + + +class RandomAccessAudioReader(AudioReader): + def __init__(self, file_path, segments_path=None, wav_scale=2 ** 15 - 1): + super().__init__(file_path, segments_path, wav_scale) + + def _read(self, keys, time_offset=0, time_durs=0): + """Reads the waveforms for the recordings in keys. + + Args: + keys: List of recording/segment_ids names. 
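The iterator protocol and the part_idx/num_parts splitting defined above make it straightforward to walk a wav scp in shards. A usage sketch, assuming the module is importable from the path where this patch creates it and that a `data/wav.scp` list exists:

```
# Usage sketch for the sequential reader defined above. The scp path and the
# import location are assumptions based on where this patch places the file.
from hyperion.io.old_audio_reader import SequentialAudioReader

# process only the first of four shards of wav.scp
reader = SequentialAudioReader("data/wav.scp", part_idx=1, num_parts=4)
for key, x, fs in reader:
    print(key, len(x) / fs, "seconds at", fs, "Hz")
```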
+ + Returns: + data: List of waveforms + """ + if isinstance(keys, str): + keys = [keys] + + offset_is_list = isinstance(time_offset, (list, np.ndarray)) + dur_is_list = isinstance(time_durs, (list, np.ndarray)) + + data = [] + fs = [] + for i, key in enumerate(keys): + + offset_i = time_offset[i] if offset_is_list else time_offset + dur_i = time_durs[i] if dur_is_list else time_durs + + if self.with_segments: + if not (key in self.segments): + raise Exception("Key %s not found" % key) + + segment = self.segments[key] + x_i, fs_i = self._read_segment(segment, offset_i, dur_i) + else: + if not (key in self.scp): + raise Exception("Key %s not found" % key) + + file_path, _, _ = self.scp[key] + x_i, fs_i = self.read_wavspecifier( + file_path, self.wav_scale, offset_i, dur_i + ) + + data.append(x_i) + fs.append(fs_i) + + return data, fs + + def read(self, keys, time_offset=0, time_durs=0): + """Reads the waveforms for the recordings in keys. + + Args: + keys: List of recording/segment_ids names. + + Returns: + data: List of waveforms + fs: List of sampling freq. + """ + try: + x, fs = self._read(keys, time_offset=time_offset, time_durs=time_durs) + except: + if isinstance(keys, str): + keys = [keys] + + if not isinstance(time_offset, (list, np.ndarray)): + time_offset = [time_offset] * len(keys) + if not isinstance(time_durs, (list, np.ndarray)): + time_durs = [time_durs] * len(keys) + + try: + # some files produce error in the fseek after reading the data, + # this seems an issue from pysoundfile or soundfile lib itself + # we try to read from + # time-offset to the end of the file, and remove the extra frames later, + # this solves the problem in most cases + logging.info( + ( + "error-1 reading at keys={} offset={} " + "retrying reading until end-of-file ..." + ).format(keys, time_offset) + ) + x, fs = self._read(keys, time_offset=time_offset) + for i in range(len(x)): + end_sample = int(time_durs[i] * fs[i]) + x[i] = x[i][:end_sample] + except: + # try to read the full file + logging.info( + ( + "error-2 reading at key={}, " "retrying reading full file ..." 
+ ).format(keys) + ) + x, fs = self._read(keys) + for i in range(len(x)): + start_sample = int(time_offset[i] * fs[i]) + end_sample = start_sample + int(time_durs[i] * fs[i]) + x[i] = x[i][start_sample:end_sample] + + return x, fs + + @staticmethod + def filter_args(**kwargs): + valid_args = ("wav_scale",) + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--wav-scale", + default=2 ** 15 - 1, + type=float, + help=("multiplicative factor for waveform"), + ) + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) + + add_argparse_args = add_class_args diff --git a/hyperion/io/packed_audio_reader.py b/hyperion/io/packed_audio_reader.py index 17f78bc2..fb17cb18 100644 --- a/hyperion/io/packed_audio_reader.py +++ b/hyperion/io/packed_audio_reader.py @@ -378,7 +378,8 @@ def add_class_args(parser, prefix=None): parser.add_argument( p1 + "wav-scale", - default=2 ** 15 - 1, + default=1.0, + # default=2 ** 15 - 1, type=float, help=("multiplicative factor for waveform"), ) @@ -633,7 +634,8 @@ def add_class_args(parser, prefix=None): parser.add_argument( p1 + "wav-scale", - default=2 ** 15, + default=1.0, + # default=2 ** 15, type=float, help=("multiplicative factor for waveform"), ) diff --git a/hyperion/io/rw_specifiers.py b/hyperion/io/rw_specifiers.py index 37f579b4..60e01ef1 100644 --- a/hyperion/io/rw_specifiers.py +++ b/hyperion/io/rw_specifiers.py @@ -7,6 +7,8 @@ import re from enum import Enum +from pathlib import Path +import pandas as pd class ArchiveType(Enum): @@ -174,6 +176,11 @@ def create(cls, wspecifier): archive_type = ArchiveType.AUDIO archive = archives[cur_archive] cur_archive += 1 + elif option == "csv": + assert script is None, "Repeated csv in wspecifier %s" % script + assert len(archives) > cur_archive + script = archives[cur_archive] + cur_archive += 1 elif option == "scp": assert script is None, "Repeated scp in wspecifier %s" % script assert len(archives) > cur_archive @@ -332,7 +339,7 @@ def create(cls, rspecifier): assert len(archives) == 1 spec_type = None - archive = archives[0] + archive = Path(archives[0]) archive_type = None once = False is_sorted = False @@ -361,6 +368,9 @@ def create(cls, rspecifier): assert spec_type is None spec_type = RSpecType.ARCHIVE archive_type = ArchiveType.RTTM + elif option == "csv": + assert spec_type is None + spec_type = RSpecType.SCRIPT elif option == "scp": assert spec_type is None spec_type = RSpecType.SCRIPT @@ -374,24 +384,31 @@ def create(cls, rspecifier): assert spec_type is not None, "Wrong wspecifier options %s" % fields[0] if spec_type == RSpecType.SCRIPT: - with open(archive, "r") as f: - scp_f2 = f.readline().strip().split(" ")[1] - if re.match(r".*\.h5(?:.[0-9]+:[0-9]+.)?$", scp_f2) is not None: + if archive.suffix == ".csv": + df = pd.read_csv(archive, nrows=2) + storage_path = df["storage_path"].values[0] + if re.match(r".*\.h5$", storage_path) is not None: archive_type = ArchiveType.H5 - elif re.match(r".*\.ark:.*$", scp_f2) is not None: + elif re.match(r".*\.ark$", storage_path) is not None: archive_type = ArchiveType.ARK - elif ( - re.match(r".*[cvg]:[0-9]+.[0-9]+:[0-9]+.$", scp_f2) is not None - ): + elif re.match(r".*[cvg]$", storage_path) is not None: archive_type = ArchiveType.AUDIO else: - archive_type = ArchiveType.ARK - - # .split('[')[0].split(':') - # if len(scp) == 1: - 
# archive_type = ArchiveType.H5 - # else: - # archive_type = ArchiveType.ARK + raise ValueError(f"Unknown format for {storage_path}") + else: + with open(archive, "r") as f: + scp_f2 = f.readline().strip().split(" ")[1] + if re.match(r".*\.h5(?:.[0-9]+:[0-9]+.)?$", scp_f2) is not None: + archive_type = ArchiveType.H5 + elif re.match(r".*\.ark:.*$", scp_f2) is not None: + archive_type = ArchiveType.ARK + elif ( + re.match(r".*[cvg]:[0-9]+.[0-9]+:[0-9]+.$", scp_f2) + is not None + ): + archive_type = ArchiveType.AUDIO + else: + archive_type = ArchiveType.ARK if archive_type == ArchiveType.ARK: for option in options: diff --git a/hyperion/io/vad_rw_factory.py b/hyperion/io/vad_rw_factory.py index 32032d1d..fff1ab4a 100644 --- a/hyperion/io/vad_rw_factory.py +++ b/hyperion/io/vad_rw_factory.py @@ -6,8 +6,7 @@ import logging from .bin_vad_reader import BinVADReader as BVR -from .rw_specifiers import (ArchiveType, RSpecifier, RSpecType, WSpecifier, - WSpecType) +from .rw_specifiers import ArchiveType, RSpecifier, RSpecType, WSpecifier, WSpecType from .segment_vad_reader import SegmentVADReader as SVR @@ -16,7 +15,6 @@ class VADReaderFactory(object): def create( rspecifier, path_prefix=None, - scp_sep=" ", frame_length=25, frame_shift=10, snip_edges=False, @@ -33,7 +31,6 @@ def create( return BVR( rspecifier, path_prefix, - scp_sep, frame_length=frame_length, frame_shift=frame_shift, snip_edges=snip_edges, @@ -48,7 +45,6 @@ def create( return BVR( rspecifier, path_prefix, - scp_sep, frame_length=frame_length, frame_shift=frame_shift, snip_edges=snip_edges, @@ -57,7 +53,6 @@ def create( @staticmethod def filter_args(**kwargs): valid_args = ( - "scp_sep", "path_prefix", "frame_shift", "frame_length", @@ -72,9 +67,6 @@ def add_class_args(parser, prefix=None): else: p1 = "--" + prefix + "." - parser.add_argument( - p1 + "scp-sep", default=" ", help=("scp file field separator") - ) parser.add_argument( p1 + "path-prefix", default=None, help=("scp file_path prefix") ) diff --git a/hyperion/np/augment/noise_augment.py b/hyperion/np/augment/noise_augment.py index 799db930..92bd57dd 100644 --- a/hyperion/np/augment/noise_augment.py +++ b/hyperion/np/augment/noise_augment.py @@ -26,7 +26,7 @@ class SingleNoiseAugment(object): min_snr: mininimum SNR(dB) to sample from. max_snr: maximum SNR(dB) to sample from. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). """ def __init__( @@ -46,7 +46,7 @@ def __init__( self.cache = None self.lock = multiprocessing.Lock() if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -55,7 +55,7 @@ def __init__( @staticmethod def _power(x): """Computes power of x in dB.""" - return 10 * np.log10((x ** 2).sum()) + return 10 * np.log10((x**2).sum() + 1e-10) @staticmethod def snr(x, n): @@ -96,7 +96,7 @@ def forward(self, x): while noise is None or noise.shape[0] < num_samples: with self.lock: - noise_idx = self.rng.randint(len(self.noise_keys)) + noise_idx = self.rng.integers(len(self.noise_keys)) key = self.noise_keys[noise_idx] noise_k, fs_k = self.r.read([key]) noise_k = noise_k[0] @@ -112,12 +112,22 @@ def forward(self, x): with self.lock: self.cache = noise_k[need_samples:] + num_zeros = np.sum(noise == 0) with self.lock: + # add dither for noises files with many 0s. 
+ if num_zeros > len(noise) // 3: + noise += 0.0001 * self.rng.standard_normal( + noise.shape, dtype=noise.dtype + ) + target_snr = self.rng.uniform(self.min_snr, self.max_snr) + scale = self._compute_noise_scale(x, noise, target_snr) info = {"noise_type": self.noise_type, "snr": target_snr} - return x + scale * noise, info + y = x + scale * noise + + return y, info def __call__(self, x): return self.forward(x) @@ -136,7 +146,7 @@ class NoiseAugment(object): is proportional to how often we want to sample a given noise type. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). """ def __init__(self, noise_prob, noise_types, random_seed=112358, rng=None): @@ -166,7 +176,7 @@ def __init__(self, noise_prob, noise_types, random_seed=112358, rng=None): self.lock = multiprocessing.Lock() if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -177,7 +187,7 @@ def create(cls, cfg, random_seed=112358, rng=None): Args: cfg: YAML file path or dictionary with noise options. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). Returns: NoiseAugment object @@ -208,7 +218,7 @@ def forward(self, x): # decide whether to add noise or not with self.lock: - p = self.rng.random_sample() + p = self.rng.random() if p > self.noise_prob: # we don't add noise diff --git a/hyperion/np/augment/reverb_augment.py b/hyperion/np/augment/reverb_augment.py index cf4cc6cb..0b1f3596 100644 --- a/hyperion/np/augment/reverb_augment.py +++ b/hyperion/np/augment/reverb_augment.py @@ -39,7 +39,7 @@ class SingleReverbAugment(object): its first sample. preload_rirs: if True all RIRS are loaded into RAM. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). """ def __init__( @@ -80,7 +80,7 @@ def __init__( self.lock = multiprocessing.Lock() if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -129,7 +129,7 @@ def forward(self, x): num_samples = x.shape[0] with self.lock: - rir_idx = self.rng.randint(len(self.rir_keys)) + rir_idx = self.rng.integers(len(self.rir_keys)) if self.preload_rirs: h = self.rirs[rir_idx] @@ -155,6 +155,7 @@ def forward(self, x): "h_max": h_max, "h_delay": h_delay, } + return y, info def __call__(self, x): @@ -176,7 +177,7 @@ class ReverbAugment(object): max_reverb_context: number of samples required as left context for the convolution operation. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). """ def __init__( @@ -210,7 +211,7 @@ def __init__( self.lock = multiprocessing.Lock() if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -221,7 +222,7 @@ def create(cls, cfg, random_seed=112358, rng=None): Args: cfg: YAML file path or dictionary with reverb options. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). Returns: ReverbAugment object. 
@@ -267,7 +268,7 @@ def forward(self, x): # decide whether to add reverb or not with self.lock: - p = self.rng.random_sample() + p = self.rng.random() if p > self.reverb_prob: # we don't add reverb diff --git a/hyperion/np/augment/speech_augment.py b/hyperion/np/augment/speech_augment.py index 0b1233f1..c27ca321 100644 --- a/hyperion/np/augment/speech_augment.py +++ b/hyperion/np/augment/speech_augment.py @@ -37,7 +37,7 @@ def create(cls, cfg, random_seed=112358, rng=None): Args: cfg: YAML file path or dictionary with noise options. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). Returns: SpeechAugment object. diff --git a/hyperion/np/augment/speed_augment.py b/hyperion/np/augment/speed_augment.py index 18a15651..a648190d 100644 --- a/hyperion/np/augment/speed_augment.py +++ b/hyperion/np/augment/speed_augment.py @@ -22,7 +22,7 @@ class SpeedAugment(object): keep_length: applies padding or cropping to keep the lenght of the signal. random_seed: random seed for random number generator. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). """ def __init__( @@ -34,14 +34,16 @@ def __init__( rng=None, ): logging.info( - "init speed augment with prob={}, speed_ratios={}, keep_length={}". - format(speed_prob, speed_ratios, keep_length)) + "init speed augment with prob={}, speed_ratios={}, keep_length={}".format( + speed_prob, speed_ratios, keep_length + ) + ) self.speed_prob = speed_prob self.speed_ratios = speed_ratios self.keep_length = keep_length if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -52,7 +54,7 @@ def create(cls, cfg, random_seed=112358, rng=None): Args: cfg: YAML file path or dictionary with noise options. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). Returns: NoiseAugment object. 
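The augmentation classes in this patch switch from the legacy np.random.RandomState to the Generator returned by np.random.default_rng. The two APIs name some methods differently, which matters when porting call sites; a purely illustrative reference sketch:

```
# Legacy RandomState vs. the Generator from np.random.default_rng.
# Purely illustrative; not code from this repository.
import numpy as np

legacy = np.random.RandomState(seed=1234)
rng = np.random.default_rng(seed=1234)

legacy.randint(10)        # Generator equivalent: rng.integers(10)
legacy.random_sample()    # Generator equivalent: rng.random()
legacy.randn(5)           # Generator equivalent: rng.standard_normal(5)
legacy.uniform(0, 20)     # same name on both:    rng.uniform(0, 20)
```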
@@ -84,7 +86,7 @@ def forward(self, x): """ # decide whether to add noise or not - p = self.rng.random_sample() + p = self.rng.random() if p > self.speed_prob: # we don't add speed perturbation info = {"speed_ratio": 1} @@ -98,14 +100,12 @@ def forward(self, x): # print(f"1 r={r} {x.shape} {y.shape}", flush=True) if self.keep_length: if r > 1: - dither = np.max(x) / 2**15 # we add some dither in the padding - pad_y = dither * np.ones( - (x.shape[-1] - y.shape[-1], ), dtype=y.dtype) + dither = np.max(x) / 2 ** 15 # we add some dither in the padding + pad_y = dither * np.ones((x.shape[-1] - y.shape[-1],), dtype=y.dtype) y = np.concatenate((y, pad_y), axis=-1) elif r < 1: - y = y[:x.shape[-1]] + y = y[: x.shape[-1]] - # print(f"2 r={r} {x.shape} {y.shape}", flush=True) return y, info def __call__(self, x): diff --git a/hyperion/np/classifiers/__init__.py b/hyperion/np/classifiers/__init__.py index d9d02ed0..60582016 100644 --- a/hyperion/np/classifiers/__init__.py +++ b/hyperion/np/classifiers/__init__.py @@ -10,4 +10,4 @@ from .linear_svmc import LinearSVMC from .logistic_regression import LogisticRegression from .q_scoring_homo_gbe import QScoringHomoGBE -from .svmc import GaussianSVMC +from .svmc import SVMC diff --git a/hyperion/np/classifiers/binary_logistic_regression.py b/hyperion/np/classifiers/binary_logistic_regression.py index 82a84529..e77115cd 100644 --- a/hyperion/np/classifiers/binary_logistic_regression.py +++ b/hyperion/np/classifiers/binary_logistic_regression.py @@ -29,7 +29,7 @@ class BinaryLogisticRegression(LogisticRegression): In this case, x becomes [x, bias_scaling], i.e. a “synthetic” feature with constant value equal to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic_feature_weight. Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) bias_scaling has to be increased. priors: prior prob for having a positive sample. - random_state: RandomState instance or None, optional, default: None + random_state: default_rng instance or None, optional, default: None Used when solver == ‘sag’ or ‘liblinear’. solver: {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default: ‘liblinear’ Algorithm to use in the optimization problem. diff --git a/hyperion/np/classifiers/greedy_fusion.py b/hyperion/np/classifiers/greedy_fusion.py index 842b850e..f03a05a0 100644 --- a/hyperion/np/classifiers/greedy_fusion.py +++ b/hyperion/np/classifiers/greedy_fusion.py @@ -42,8 +42,8 @@ class GreedyFusionBinaryLR(NPModel): In this case, x becomes [x, bias_scaling], i.e. a “synthetic” feature with constant value equal to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic_feature_weight. Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) bias_scaling has to be increased. priors: prior prob for having a positive sample. - random_state: int, RandomState instance or None, optional, default: None - The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; . Used when solver == ‘sag’ or ‘liblinear’. 
+ random_state: int, default_rng instance or None, optional, default: None + The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If default_rng instance, random_state is the random number generator; . Used when solver == ‘sag’ or ‘liblinear’. solver: {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default: ‘liblinear’ Algorithm to use in the optimization problem. For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and diff --git a/hyperion/np/classifiers/linear_gbe.py b/hyperion/np/classifiers/linear_gbe.py index a6b8c7cc..f551af14 100644 --- a/hyperion/np/classifiers/linear_gbe.py +++ b/hyperion/np/classifiers/linear_gbe.py @@ -10,7 +10,7 @@ from scipy.special import gammaln from ...hyp_defs import float_cpu -from ...utils.math import int2onehot, invert_pdmat, logdet_pdmat, softmax +from ...utils.math_funcs import int2onehot, invert_pdmat, logdet_pdmat, softmax from ..np_model import NPModel @@ -426,7 +426,8 @@ def add_class_args(parser, prefix=None): parser.add_argument("--name", default="lgbe", help="model name") if prefix is not None: outer_parser.add_argument( - "--" + prefix, action=ActionParser(parser=parser), + "--" + prefix, + action=ActionParser(parser=parser), ) @staticmethod @@ -468,7 +469,8 @@ def add_eval_args(parser, prefix=None): ) if prefix is not None: outer_parser.add_argument( - "--" + prefix, action=ActionParser(parser=parser), + "--" + prefix, + action=ActionParser(parser=parser), ) add_argparse_args = add_class_args diff --git a/hyperion/np/classifiers/linear_gbe_up.py b/hyperion/np/classifiers/linear_gbe_up.py index 8566aeab..37ac9656 100644 --- a/hyperion/np/classifiers/linear_gbe_up.py +++ b/hyperion/np/classifiers/linear_gbe_up.py @@ -9,8 +9,13 @@ from scipy.special import gammaln from ...hyp_defs import float_cpu -from ...utils.math import (fullcov_varfloor, int2onehot, invert_pdmat, - logdet_pdmat, softmax) +from ...utils.math_funcs import ( + fullcov_varfloor, + int2onehot, + invert_pdmat, + logdet_pdmat, + softmax, +) from ..np_model import NPModel from .linear_gbe import LinearGBE diff --git a/hyperion/np/classifiers/linear_svmc.py b/hyperion/np/classifiers/linear_svmc.py index 5d743a46..6a977df9 100644 --- a/hyperion/np/classifiers/linear_svmc.py +++ b/hyperion/np/classifiers/linear_svmc.py @@ -10,7 +10,7 @@ from sklearn.svm import LinearSVC as SVC from ...hyp_defs import float_cpu -from ...utils.math import softmax +from ...utils.math_funcs import softmax from ..np_model import NPModel @@ -41,7 +41,7 @@ class LinearSVMC(NPModel): The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)). - random_state: RandomState instance or None, optional, default: None + random_state: default_rng instance or None, optional, default: None max_iter: int, default: 100 Useful only for the newton-cg, sag and lbfgs solvers. Maximum number of iterations taken for the solvers to converge. @@ -61,7 +61,7 @@ class LinearSVMC(NPModel): penalty and dual will be ignored. verbose: int, default: 0 balance_class_weight: if True and class_weight is None, it makes class_weight="balanced". - lr_seed: seed form RandomState, used when random_state is None. + lr_seed: seed form default_rng, used when random_state is None. 
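The "balanced" class_weight mode documented in these classifier docstrings weights each class as n_samples / (n_classes * np.bincount(y)). A short numeric sketch of that formula (the label vector is an arbitrary example):

```
# The "balanced" weighting named in the docstrings above, computed directly.
# The label vector is an arbitrary example.
import numpy as np

y = np.array([0, 0, 0, 0, 1, 1])            # 4 samples of class 0, 2 of class 1
weights = len(y) / (len(np.unique(y)) * np.bincount(y))
print(weights)                               # [0.75 1.5 ] -> minority class upweighted
```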
labels: list of class labels """ @@ -93,7 +93,7 @@ def __init__( class_weight = "balanced" if random_state is None: - random_state = np.random.RandomState(seed=lr_seed) + random_state = np.random.default_rng(seed=lr_seed) self.use_bias = use_bias self.bias_scaling = bias_scaling diff --git a/hyperion/np/classifiers/logistic_regression.py b/hyperion/np/classifiers/logistic_regression.py index 8e3d7e2e..03d9fd13 100644 --- a/hyperion/np/classifiers/logistic_regression.py +++ b/hyperion/np/classifiers/logistic_regression.py @@ -9,7 +9,7 @@ from sklearn.linear_model import LogisticRegression as LR from ...hyp_defs import float_cpu -from ...utils.math import softmax +from ...utils.math_funcs import softmax from ..np_model import NPModel @@ -36,7 +36,7 @@ class LogisticRegression(NPModel): Weights associated with classes in the form {class_label: weight}. If not given, all classes are supposed to have weight one. The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)). Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. - random_state: RandomState instance or None, optional, default: None + random_state: default_rng instance or None, optional, default: None Used when solver == ‘sag’ or ‘liblinear’. solver: {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default: ‘liblinear’ Algorithm to use in the optimization problem. @@ -93,6 +93,7 @@ def __init__( super().__init__(**kwargs) if random_state is None: + # random_state = np.random.default_rng(seed=lr_seed) random_state = np.random.RandomState(seed=lr_seed) if bias_scaling is None: diff --git a/hyperion/np/classifiers/q_scoring_homo_gbe.py b/hyperion/np/classifiers/q_scoring_homo_gbe.py index 9e54e0f4..3345dd72 100644 --- a/hyperion/np/classifiers/q_scoring_homo_gbe.py +++ b/hyperion/np/classifiers/q_scoring_homo_gbe.py @@ -9,7 +9,7 @@ from scipy.special import gammaln from ...hyp_defs import float_cpu -from ...utils.math import int2onehot, invert_pdmat, logdet_pdmat, softmax +from ...utils.math_funcs import int2onehot, invert_pdmat, logdet_pdmat, softmax from ..np_model import NPModel diff --git a/hyperion/np/classifiers/svmc.py b/hyperion/np/classifiers/svmc.py index 9311b8e8..ac5211ef 100644 --- a/hyperion/np/classifiers/svmc.py +++ b/hyperion/np/classifiers/svmc.py @@ -9,20 +9,24 @@ import numpy as np from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from sklearn.svm import SVC as SVC +from sklearn.svm import SVC from ...hyp_defs import float_cpu -from ...utils.math import softmax +from ...utils.math_funcs import softmax +from ...utils.misc import filter_func_args from ..np_model import NPModel -class GaussianSVMC(NPModel): +class SVMC(NPModel): """Gaussian Support Vector Machine for Classification.""" def __init__( self, C=1.0, + kernel="rbf", + degree=3, gamma="scale", + coef0=0.0, shrinking=True, probability=True, tol=0.0001, @@ -32,7 +36,6 @@ def __init__( class_weight=None, random_state=None, max_iter=100, - model=None, verbose=0, balance_class_weight=True, lr_seed=1024, @@ -46,27 +49,40 @@ def __init__( class_weight = "balanced" if random_state is None: - random_state = np.random.RandomState(seed=lr_seed) + random_state = np.random.default_rng(seed=lr_seed) + + self.C = C + self.kernel = kernel + self.degree = degree + self.gamma = gamma + self.coef0 = coef0 + self.shrinking = shrinking + self.probability = probability + 
self.tol = tol + self.cache_size = cache_size + self.multi_class = multi_class + self.break_ties = break_ties + self.class_weight = class_weight self.balance_class_weight = balance_class_weight - if model is None: - self.svm = SVC( - C=C, - kernel="rbf", - gamma=gamma, - shrinking=shrinking, - probability=probability, - tol=tol, - cache_size=cache_size, - class_weight=class_weight, - verbose=verbose, - max_iter=max_iter, - decision_function_shape=multi_class, - break_ties=break_ties, - random_state=random_state, - ) - else: - self.svm = model + self.svm = SVC( + C=C, + kernel=kernel, + gamma=gamma, + degree=degree, + coef0=coef0, + shrinking=shrinking, + probability=probability, + tol=tol, + cache_size=cache_size, + class_weight=class_weight, + verbose=verbose, + max_iter=max_iter, + decision_function_shape=multi_class, + break_ties=break_ties, + random_state=random_state, + ) + self.set_labels(labels) @property @@ -84,6 +100,18 @@ def get_config(self): Dictionary with config hyperparams. """ config = { + "C": self.C, + "kernel": self.kernel, + "gamma": self.gamma, + "degree": self.degree, + "coef0": self.coef0, + "shrinking": self.shrinking, + "probability": self.probability, + "tol": self.tol, + "cache_size": self.cache_size, + "multi_class": self.multi_class, + "break_ties": self.break_ties, + "class_weight": self.class_weight, "balance_class_weight": self.balance_class_weight, "labels": self.labels, } @@ -135,7 +163,6 @@ def fit(self, x, class_ids, sample_weight=None): class_ids: class integer [0, num_classes-1] identifier (num_samples,) sample_weight: weight of each sample in the estimation (num_samples,) """ - print("--------------", type(x[3, 2]), type(class_ids[20]), "--------------") self.svm.fit(x, class_ids) if self.svm.fit_status_: logging.warning("SVM did not converge") @@ -153,9 +180,6 @@ def save(self, file_path): if not split_path[-1] == "sav": file_path = "".join(split_path[0] + ".sav") with open(file_path, "wb") as f: - # with h5py.File(file_path, "w") as f: - # config = self.to_json() - # f.create_dataset("config", data=np.array(config, dtype="S")) self.save_params(f) @classmethod @@ -169,27 +193,17 @@ def load(cls, file_path): Model object. """ split_path = os.path.splitext(file_path) - if not split_path[-1] == "sav": - file_path = "".join(split_path[0] + ".sav") + if not split_path[-1] == "pkl": + file_path = "".join(split_path[0] + ".pkl") - # with h5py.File(file_path, "r") as f: with open(file_path, "rb") as f: - # json_str = str(np.asarray(f["config"]).astype("U")) - # config = cls.load_config_from_json(json_str) - config = None - return cls.load_params(f, config) + return pickle.load(f) def save_params(self, f): - # params = {"A": self.A, "b": self.b} - # self._save_params_from_dict(f, params) pickle.dump(self, f) @classmethod - def load_params(cls, f, config): - # param_list = ["A", "b"] - # params = cls._load_params_to_dict(f, config["name"], param_list) - # kwargs = dict(list(config.items()) + list(params.items())) - # return cls(**kwargs) + def load_params(cls, f): svmc = pickle.load(f) return svmc @@ -200,27 +214,7 @@ def filter_class_args(**kwargs): Returns: Hyperparamter dictionary to initialize the class. 
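filter_class_args below delegates to filter_func_args(SVMC.__init__, **kwargs) from hyperion.utils.misc, whose implementation is not part of this patch. A helper of that kind can be written in a few lines with inspect; the sketch below is an assumption about its behavior, not the repository's code.

```
# One simple way a filter_func_args-style helper can work: keep only the
# kwargs that appear in the target function's signature. A sketch, not
# necessarily how hyperion.utils.misc implements it.
import inspect

def filter_func_args(func, **kwargs):
    valid = inspect.signature(func).parameters
    return {k: v for k, v in kwargs.items() if k in valid and k != "self"}

# e.g. filter_func_args(SVMC.__init__, C=10.0, kernel="rbf", unrelated_flag=True)
# would drop unrelated_flag and keep C and kernel.
```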
""" - valid_args = ( - "nu", - "gamma", - "shrinking", - "probability", - "tol", - "cache_size", - "multi_class", - "break_ties", - "class_weight", - "random_state", - "max_iter", - "verbose", - "balance_class_weight", - "lr_seed", - "model", - "labels", - ) - return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - filter_train_args = filter_class_args + return filter_func_args(SVMC.__init__, **kwargs) @staticmethod def add_class_args(parser, prefix=None): @@ -240,17 +234,27 @@ def add_class_args(parser, prefix=None): type=float, help="inverse of regularization strength", ) - # parser.add_argument( - # "--class_weight", - # default=None, - # help="Class weights", - # ) + parser.add_argument( + "--kernel", + default="rbf", + choices=["linear", "poly", "rbf", "sigmoid", "precomputed"], + help="kernel for svm", + ) + parser.add_argument( + "--degree", defaut=3, type=int, help="degree of polynomial kernel" + ) parser.add_argument( "--gamma", default="scale", choices=["scale", "auto"], help="Kernel coefficient for ‘rbf’", ) + parser.add_argument( + "--coef0", + default=0.0, + type=float, + help="independent term of poly and sigmoid kernels", + ) parser.add_argument( "--shrinking", default=True, @@ -264,7 +268,7 @@ def add_class_args(parser, prefix=None): help="Whether to enable probability estimates", ) parser.add_argument( - "--break_ties", + "--break-ties", default=True, type=bool, help="If true, predict will break ties according to the confidence values of decision_function; otherwise \ @@ -293,7 +297,7 @@ def add_class_args(parser, prefix=None): ), ) parser.add_argument( - "--cache_size", + "--cache-size", default=600, type=int, help="Specify the size of the kernel cache (in MB)", diff --git a/hyperion/np/feats/energy_vad.py b/hyperion/np/feats/energy_vad.py index 5b9eb751..1d578c68 100644 --- a/hyperion/np/feats/energy_vad.py +++ b/hyperion/np/feats/energy_vad.py @@ -5,6 +5,7 @@ import logging import numpy as np +from jsonargparse import ActionParser, ArgumentParser from scipy.signal import lfilter from ...hyp_defs import float_cpu @@ -19,7 +20,7 @@ class EnergyVAD(object): sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (default = 16000) frame_length: Frame length in milliseconds (default = 25) frame_shift: Frame shift in milliseconds (default = 10) - dither: Dithering constant (0.0 means no dither) (default = 1) + dither: Dithering constant (0.0 means no dither) (default = 2^(-15)) snip_edges: If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. 
(default = True) vad_energy_mean_scale: If this is set to s, to get the actual threshold we let m be the mean log-energy of the file, and use s*m + vad-energy-threshold (float, default = 0.5) vad_energy_threshold: Constant term in energy threshold for MFCC0 for VAD (also see --vad-energy-mean-scale) (float, default = 5) @@ -32,7 +33,7 @@ def __init__( sample_frequency=16000, frame_length=25, frame_shift=10, - dither=1, + dither=1 / 2 ** 15, snip_edges=True, vad_energy_mean_scale=0.5, vad_energy_threshold=5, @@ -97,7 +98,7 @@ def compute(self, x, return_loge=False): # add dither if self.dither > 0: - n = self.dither * np.random.RandomState(seed=len(x)).randn( + n = self.dither * np.random.default_rng(seed=len(x)).randn( len(x) ).astype(float_cpu(), copy=False) x = x + n @@ -174,14 +175,12 @@ def add_class_args(parser, prefix=None): parser: Arguments parser prefix: Options prefix. """ - - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") parser.add_argument( - p1 + "sample-frequency", + "--sample-frequency", default=16000, type=int, help=( @@ -191,24 +190,21 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "frame-length", - type=int, - default=25, - help="Frame length in milliseconds", + "--frame-length", type=int, default=25, help="Frame length in milliseconds", ) parser.add_argument( - p1 + "frame-shift", type=int, default=10, help="Frame shift in milliseconds" + "--frame-shift", type=int, default=10, help="Frame shift in milliseconds" ) parser.add_argument( - p1 + "dither", + "--dither", type=float, - default=1, + default=1 / 2 ** 15, help="Dithering constant (0.0 means no dither)", ) parser.add_argument( - p1 + "snip-edges", + "--snip-edges", default=True, type=str2bool, help=( @@ -221,7 +217,7 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "vad-energy-mean-scale", + "--vad-energy-mean-scale", type=float, default=0.5, help=( @@ -231,13 +227,13 @@ def add_class_args(parser, prefix=None): ), ) parser.add_argument( - p1 + "vad-energy-threshold", + "--vad-energy-threshold", type=float, default=5, help="Constant term in energy threshold for MFCC0 for VAD", ) parser.add_argument( - p1 + "vad-frames-context", + "--vad-frames-context", type=int, default=0, help=( @@ -246,7 +242,7 @@ def add_class_args(parser, prefix=None): ), ) parser.add_argument( - p1 + "vad-proportion-threshold", + "--vad-proportion-threshold", type=float, default=0.6, help=( @@ -254,5 +250,7 @@ def add_class_args(parser, prefix=None): "the window that need to have more energy than the threshold" ), ) + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) add_argparse_args = add_class_args diff --git a/hyperion/np/feats/mfcc.py b/hyperion/np/feats/mfcc.py index cd98840d..b56728b8 100644 --- a/hyperion/np/feats/mfcc.py +++ b/hyperion/np/feats/mfcc.py @@ -6,6 +6,7 @@ from enum import Enum import numpy as np +from jsonargparse import ActionParser, ArgumentParser from scipy.fftpack import dct from scipy.signal import lfilter @@ -72,7 +73,7 @@ class MFCC(object): preemphasis_coeff: Coefficient for use in signal preemphasis (default = 0.97) window_type: Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"blackmann") (default = 'povey') use_fft2: If true, it uses |X(f)|^2, if false, it uses |X(f)|, (default = True) - dither: Dithering constant (0.0 means no dither) (default = 1) + dither: Dithering constant (0.0 means no dither) 
(default = 1/2**15) fb_type: Filter-bank type: mel_kaldi, mel_etsi, mel_librosa, mel_librosa_htk, linear (default = 'mel_kaldi') low_freq: Low cutoff frequency for mel bins (default = 20) high_freq: High cutoff frequency for mel bins (if < 0, offset from Nyquist) (default = 0) @@ -98,7 +99,7 @@ def __init__( preemphasis_coeff=0.97, window_type="povey", use_fft2=True, - dither=1, + dither=1 / 2 ** 15, fb_type="mel_kaldi", low_freq=20, high_freq=0, @@ -256,7 +257,7 @@ def compute(self, x, return_fft=False, return_spec=False, return_logfb=False): # add dither if self.dither > 0: - n = self.dither * np.random.RandomState(seed=len(x)).randn( + n = self.dither * np.random.default_rng(seed=len(x)).randn( len(x) ).astype(float_cpu(), copy=False) x = x + n @@ -400,14 +401,12 @@ def add_class_args(parser, prefix=None): parser: Arguments parser prefix: Options prefix. """ - - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") parser.add_argument( - p1 + "sample-frequency", + "--sample-frequency", default=16000, type=int, help="Waveform data sample frequency " @@ -415,27 +414,22 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "frame-length", - type=int, - default=25, - help="Frame length in milliseconds", - ) - parser.add_argument( - p1 + "frame-shift", type=int, default=10, help="Frame shift in milliseconds" + "--frame-length", type=int, default=25, help="Frame length in milliseconds", ) parser.add_argument( - p1 + "fft-length", type=int, default=512, help="Length of FFT" + "--frame-shift", type=int, default=10, help="Frame shift in milliseconds" ) + parser.add_argument("--fft-length", type=int, default=512, help="Length of FFT") parser.add_argument( - p1 + "remove-dc-offset", + "--remove-dc-offset", default=True, type=str2bool, help="Subtract mean from waveform on each frame", ) parser.add_argument( - p1 + "preemphasis-coeff", + "--preemphasis-coeff", type=float, default=0.97, help="Coefficient for use in signal preemphasis", @@ -444,30 +438,30 @@ def add_class_args(parser, prefix=None): FWF.add_class_args(parser, prefix) parser.add_argument( - p1 + "use-fft2", + "--use-fft2", default=True, type=str2bool, help="If true, it uses |X(f)|^2, if false, it uses |X(f)|", ) parser.add_argument( - p1 + "dither", + "--dither", type=float, - default=1, + default=1 / 2 ** 15, help="Dithering constant (0.0 means no dither)", ) FBF.add_class_args(parser, prefix) parser.add_argument( - p1 + "num-ceps", + "--num-ceps", type=int, default=13, help="Number of cepstra in MFCC computation (including C0)", ) parser.add_argument( - p1 + "snip-edges", + "--snip-edges", default=True, type=str2bool, help=( @@ -480,34 +474,34 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "energy-floor", + "--energy-floor", type=float, default=0, help="Floor on energy (absolute, not relative) in MFCC computation", ) parser.add_argument( - p1 + "raw-energy", + "--raw-energy", default=True, type=str2bool, help="If true, compute energy before preemphasis and windowing", ) parser.add_argument( - p1 + "use-energy", + "--use-energy", default=True, type=str2bool, help="Use energy (not C0) in MFCC computation", ) parser.add_argument( - p1 + "cepstral-lifter", + "--cepstral-lifter", type=float, default=22, help="Constant that controls scaling of MFCCs", ) parser.add_argument( - p1 + "input-step", + "--input-step", default="wave", choices=["wave", "fft", "spec", "log_spec", "logfb"], help=( @@ -516,7 
+510,7 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "output-step", + "--output-step", default="mfcc", choices=["fft", "spec", "log_spec", "logfb", "mfcc"], help=( @@ -524,4 +518,7 @@ def add_class_args(parser, prefix=None): ), ) + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + add_argparse_args = add_class_args diff --git a/hyperion/np/metrics/__init__.py b/hyperion/np/metrics/__init__.py index 36afdbf5..d45daba5 100644 --- a/hyperion/np/metrics/__init__.py +++ b/hyperion/np/metrics/__init__.py @@ -5,7 +5,10 @@ from .acc import compute_accuracy from .confusion_matrix import * -from .dcf import (compute_act_dcf, compute_dcf, compute_min_dcf, - fast_eval_dcf_eer) +from .dcf import compute_act_dcf, compute_dcf, compute_min_dcf, fast_eval_dcf_eer from .eer import compute_eer, compute_prbep from .utils import effective_prior +from .verification_evaluator import ( + VerificationEvaluator, + VerificationAdvAttackEvaluator, +) diff --git a/hyperion/np/metrics/cllr.py b/hyperion/np/metrics/cllr.py index ec816286..cd97a97c 100644 --- a/hyperion/np/metrics/cllr.py +++ b/hyperion/np/metrics/cllr.py @@ -5,7 +5,7 @@ import numpy as np -from ..utils.math import neglogsigmoid +from ..utils.math_funcs import neglogsigmoid from .utils import opt_loglr diff --git a/hyperion/np/metrics/utils.py b/hyperion/np/metrics/utils.py index 0715d809..e638fd1b 100644 --- a/hyperion/np/metrics/utils.py +++ b/hyperion/np/metrics/utils.py @@ -8,7 +8,7 @@ import numpy as np from ...hyp_defs import float_cpu -from ...utils.math import logsumexp, softmax +from ...utils.math_funcs import logsumexp, softmax def effective_prior(p_tar, c_miss, c_fa): diff --git a/hyperion/np/metrics/verification_evaluator.py b/hyperion/np/metrics/verification_evaluator.py index 2adf15cf..e35e7cf7 100644 --- a/hyperion/np/metrics/verification_evaluator.py +++ b/hyperion/np/metrics/verification_evaluator.py @@ -2,8 +2,6 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - - import copy import logging import re @@ -18,13 +16,13 @@ import matplotlib.pyplot as plt from ...hyp_defs import float_cpu -from ...utils import TrialKey, TrialScores +from ...utils import TrialKey, TrialScores, SparseTrialKey, SparseTrialScores from ...utils.trial_stats import TrialStats from .dcf import fast_eval_dcf_eer from .utils import effective_prior -class VerificationEvaluator(object): +class VerificationEvaluator: """Class computes performance metrics for verification problems. 
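For readers of the VerificationEvaluator changes below: the (p_tar, c_miss, c_fa) arguments are collapsed into a single operating point through the standard effective-prior identity, and costs are reported as a normalized detection cost. The sketch that follows is an illustrative, standalone restatement of those formulas, not the repository's implementation; function names are chosen for clarity only.

```
import numpy as np

def effective_prior(p_tar, c_miss, c_fa):
    # p_eff = p_tar*c_miss / (p_tar*c_miss + (1 - p_tar)*c_fa)
    num = p_tar * c_miss
    return num / (num + (1.0 - p_tar) * c_fa)

def norm_dcf(p_miss, p_fa, p_eff):
    # detection cost at an operating point, normalized by the best trivial system
    cost = p_eff * p_miss + (1.0 - p_eff) * p_fa
    return cost / min(p_eff, 1.0 - p_eff)

p_eff = effective_prior(p_tar=0.05, c_miss=1.0, c_fa=1.0)   # -> 0.05
print(norm_dcf(p_miss=0.02, p_fa=0.01, p_eff=p_eff))
```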
Same metrics can be obtained from fast_eval_dcf_eer functions @@ -34,21 +32,40 @@ class VerificationEvaluator(object): p_tar: target prior float or list/nparray sorted in ascending order c_miss: cost of miss c_fa: cost of false alarm - + key_name: name describing the key + score_name: name describing the score + sparse: use sparse versions of TrialScores and Keys """ - def __init__(self, key, scores, p_tar, c_miss=None, c_fa=None): - + def __init__( + self, + key, + scores, + p_tar, + c_miss=None, + c_fa=None, + key_name=None, + score_name=None, + sparse=False, + ): if isinstance(key, str): - logging.info("Load key: %s" % key) - key = TrialKey.load(key) + logging.info("Load key: %s", key) + if sparse: + key = SparseTrialKey.load(key) + else: + key = TrialKey.load(key) if isinstance(scores, str): - logging.info("Load scores: %s" % scores) - scores = TrialScores.load(scores) + logging.info("Load scores: %s", scores) + if sparse: + scores = SparseTrialScores.load(scores) + else: + scores = TrialScores.load(scores) self.key = key self.scores = scores.align_with_ndx(key) + self.key_name = key_name + self.score_name = score_name # compute effective prior is c_miss and c_fa are given if isinstance(p_tar, float): @@ -56,13 +73,16 @@ def __init__(self, key, scores, p_tar, c_miss=None, c_fa=None): p_tar = np.asarray(p_tar) if c_miss is not None and c_fa is not None: + assert len(c_miss) == len(p_tar) + assert len(c_fa) == len(p_tar) c_miss = np.asarray(c_miss) c_fa = np.asarray(c_fa) p_tar = effective_prior(p_tar, c_miss, c_fa) + self._p_tar_sort = np.argsort(p_tar) self.p_tar = p_tar - def compute_dcf_eer(self, return_df=False): + def compute_dcf_eer(self, return_df=True): """ Computes DCF/EER @@ -74,24 +94,38 @@ def compute_dcf_eer(self, return_df=False): """ logging.info("separating tar/non") tar, non = self.scores.get_tar_non(self.key) + ntar = len(tar) + nnon = len(non) logging.info("computing EER/DCF") - min_dcf, act_dcf, eer, _ = fast_eval_dcf_eer(tar, non, self.p_tar) + min_dcf, act_dcf, eer, _ = fast_eval_dcf_eer( + tar, non, self.p_tar[self._p_tar_sort] + ) + min_dcf[self._p_tar_sort] = min_dcf.copy() + act_dcf[self._p_tar_sort] = act_dcf.copy() if not return_df: - return min_dcf, act_dcf, eer + return min_dcf, act_dcf, eer, ntar, nnon if len(self.p_tar) == 1: eer = [eer] min_dcf = [min_dcf] act_dcf = [act_dcf] - df = pd.DataFrame({"eer": eer}) - + df = pd.DataFrame( + { + "scores": [self.score_name], + "key": [self.key_name], + "eer": eer, + "eer(%)": eer * 100, + } + ) for i in range(len(min_dcf)): pi = self.p_tar[i] df["min-dcf-%.3f" % (pi)] = min_dcf[i] df["act-dcf-%.3f" % (pi)] = act_dcf[i] + df["num_targets"] = ntar + df["num_nontargets"] = nnon return df @@ -116,9 +150,7 @@ class VerificationAdvAttackEvaluator(VerificationEvaluator): def __init__( self, key, scores, attack_scores, attack_stats, p_tar, c_miss=None, c_fa=None ): - super(VerificationAdvAttackEvaluator, self).__init__( - key, scores, p_tar, c_miss, c_fa - ) + super().__init__(key, scores, p_tar, c_miss, c_fa) if not isinstance(attack_scores, list): attack_scores = [attack_scores] if not isinstance(attack_stats, list): @@ -133,7 +165,7 @@ def __init__( if isinstance(attack_scores[0], str): l = [] for file_path in attack_scores: - logging.info("Load attack scores: %s" % file_path) + logging.info("Load attack scores: %s", file_path) scores = TrialScores.load(file_path) l.append(scores) attack_scores = l @@ -151,7 +183,7 @@ def __init__( if isinstance(attack_stats[0], str): l = [] for file_path in attack_stats: - 
logging.info("Load attack stats: %s" % file_path) + logging.info("Load attack stats: %s", file_path) scores = TrialStats.load(file_path) l.append(scores) attack_stats = l @@ -216,7 +248,7 @@ def compute_dcf_eer_vs_stats( stat_bins, attacked_trials="all", higher_better=False, - return_df=False, + return_df=True, ): """ Computes DCF/EER versus SNR/Linf/etc curves @@ -307,7 +339,7 @@ def find_best_attacks( threshold=None, prior_idx=0, higher_better=False, - return_df=False, + return_df=True, ): """ Find the best attacks from the point of view of some of the stats. E.g., diff --git a/hyperion/np/np_model.py b/hyperion/np/np_model.py index ee464161..aa635fc5 100644 --- a/hyperion/np/np_model.py +++ b/hyperion/np/np_model.py @@ -99,6 +99,8 @@ def _save_params_from_dict(self, f, params, dtypes=None): """ if dtypes is None: dtypes = dict((k, float_save()) for k in params) + elif isinstance(dtypes, type): + dtypes = dict((k, dtypes) for k in params) if self.name is None: prefix = "" @@ -174,6 +176,9 @@ def _load_params_to_dict(f, name, params, dtypes=None): """ if dtypes is None: dtypes = dict((k, float_cpu()) for k in params) + elif isinstance(dtypes, type): + dtypes = dict((k, dtypes) for k in params) + if name is None: prefix = "" else: diff --git a/hyperion/np/pdfs/core/normal.py b/hyperion/np/pdfs/core/normal.py index b8f8bb54..67872315 100644 --- a/hyperion/np/pdfs/core/normal.py +++ b/hyperion/np/pdfs/core/normal.py @@ -7,11 +7,20 @@ import scipy.linalg as la from ....hyp_defs import float_cpu -from ....utils.math import (fullcov_varfloor, invert_pdmat, invert_trimat, - logdet_pdmat, symmat2vec, vec2symmat) -from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D) +from ....utils.math_funcs import ( + fullcov_varfloor, + invert_pdmat, + invert_trimat, + logdet_pdmat, + symmat2vec, + vec2symmat, +) +from ....utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, +) from .exp_family import ExpFamily @@ -213,7 +222,7 @@ def sample(self, num_samples, rng=None, seed=1024): assert self.is_init if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) return rng.multivariate_normal(self.mu, self.Sigma, size=(num_samples,)).astype( float_cpu() ) diff --git a/hyperion/np/pdfs/core/normal_diag_cov.py b/hyperion/np/pdfs/core/normal_diag_cov.py index c9986f4c..23535112 100644 --- a/hyperion/np/pdfs/core/normal_diag_cov.py +++ b/hyperion/np/pdfs/core/normal_diag_cov.py @@ -7,9 +7,12 @@ from scipy.special import erf from ....hyp_defs import float_cpu -from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D) +from ....utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, +) from .exp_family import ExpFamily @@ -183,7 +186,7 @@ def sample(self, num_samples, rng=None, seed=1024): """ assert self.is_init if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) x = rng.normal(size=(num_samples, self.x_dim)).astype(float_cpu()) return self.mu + 1.0 / self.cholLambda * x diff --git a/hyperion/np/pdfs/hmm/hmm.py b/hyperion/np/pdfs/hmm/hmm.py index 80232e36..92d9c371 100644 --- a/hyperion/np/pdfs/hmm/hmm.py +++ b/hyperion/np/pdfs/hmm/hmm.py @@ -6,7 +6,7 @@ import numpy as np from ....hyp_defs import float_cpu -from ....utils.math import logsumexp, softmax +from 
....utils.math_funcs import logsumexp, softmax from ..core import PDF @@ -232,7 +232,7 @@ def viterbi_decode(self, x, nbest=1): def sample(self, num_seqs, num_steps, rng=None, seed=1024): if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) x = np.zeros((num_seqs, num_steps, self.num_states), dtype=float_cpu()) x[:, 0, :] = rng.multinomial(1, self.pi, size=(num_seqs,)) diff --git a/hyperion/np/pdfs/jfa/jfa_total.py b/hyperion/np/pdfs/jfa/jfa_total.py index 041431fb..6e2b79e3 100644 --- a/hyperion/np/pdfs/jfa/jfa_total.py +++ b/hyperion/np/pdfs/jfa/jfa_total.py @@ -7,8 +7,13 @@ from scipy import linalg as la from ....hyp_defs import float_cpu -from ....utils.math import (invert_pdmat, invert_trimat, logdet_pdmat, - symmat2vec, vec2symmat) +from ....utils.math_funcs import ( + invert_pdmat, + invert_trimat, + logdet_pdmat, + symmat2vec, + vec2symmat, +) from ..core.pdf import PDF diff --git a/hyperion/np/pdfs/mixtures/exp_family_mixture.py b/hyperion/np/pdfs/mixtures/exp_family_mixture.py index 5560882c..d1cf7f68 100644 --- a/hyperion/np/pdfs/mixtures/exp_family_mixture.py +++ b/hyperion/np/pdfs/mixtures/exp_family_mixture.py @@ -7,8 +7,7 @@ import numpy as np from ....hyp_defs import float_cpu -from ....utils.math import logsumexp, softmax -from ....utils.queues import GeneratorQueue +from ....utils.math_funcs import logsumexp, softmax from ..core import PDF @@ -110,86 +109,6 @@ def fit( else: return elbo, elbo / x.shape[0], elbo_val, elbo_val / x.shape[0] - def fit_generator( - self, - generator, - train_steps, - epochs=10, - val_data=None, - val_steps=0, - max_queue_size=10, - workers=1, - use_multiprocessing=False, - ): - """Trains the model from data read by a generator function. - This function is deprecated. - - Args: - generator: train data generator function returning a tuple - (x, u_x, sample_weight), (x, u_x), (x, sample_weight) or x. - train_steps: number of training steps / epoch - epochs: number of epochs. - val_data: val. data generator function returning a tuple - (x, u_x, sample_weight), (x, u_x), (x, sample_weight) or x. - val_steps: number of validation steps / epoch - max_queue_size: max. size of the generator queue. - workers: number of workers in the generator. - use_multiprocessing: use multi-processing in the generator queue. - - Returns: - log p(X) of the training data. - log p(x) per sample. - log p(X) of the val. data, if present. - log p(x) of the val. data per sample, if present. - """ - - do_validation = bool(val_data) - val_gen = hasattr(val_data, "next") or hasattr(val_data, "__next__") - if val_gen and not val_steps: - raise ValueError( - "When using a generator for validation data, " - "you must specify a value for " - "`val_steps`." 
- ) - - if do_validation and not val_gen: - x, u_x_val, sample_weight_val = self.tuple2data(val_data) - log_h_val = self.accum_log_h(x, sample_weight_val) - - elbo = np.zeros((epochs,), dtype=float_cpu()) - elbo_val = np.zeros((epochs,), dtype=float_cpu()) - for epoch in range(epochs): - N, u_x, log_h = self.Estep_generator( - generator, - train_steps, - return_log_h=True, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing, - ) - - self.Mstep(N, u_x) - elbo[epoch] = self.elbo(None, N=N, u_x=u_x, log_h=log_h) - - if val_data is not None: - if val_gen: - N, u_x, log_h_val = self.Estep_generator( - val_data, - train_steps, - return_log_h=True, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing, - ) - else: - N, u_x = self.Estep(val_data, u_x_val, sample_weight_val) - elbo_val[epoch] = self.elbo(None, N=N, u_x=u_x, log_h=log_h_val) - - if val_data is None: - return elbo, elbo / x.shape[0] - else: - return elbo, elbo / x.shape[0], elbo_val, elbo_val / x.shape[0] - def log_h(self, x): """Computes log h(x) of the exp. family.""" return 0 @@ -404,7 +323,6 @@ def _accum_suff_stats_segments_prob_1batch( def _accum_suff_stats_segments_prob_nbatches( self, x, prob, sample_weight, batch_size ): - sw_i = None for i1 in range(0, x.shape[0], batch_size): i2 = np.minimum(i1 + batch_size, x.shape[0]) @@ -458,7 +376,6 @@ def accum_suff_stats_sorttime( def _accum_suff_stats_sorttime_1batch( self, x, frame_length, frame_shift, u_x=None, sample_weight=None ): - K = len(self.pi) num_frames = x.shape[0] num_segments = int(np.floor((num_frames - frame_length) / frame_shift + 1)) @@ -494,7 +411,6 @@ def _accum_suff_stats_sorttime_1batch( def _accum_suff_stats_sorttime_nbatches( self, x, frame_length, frame_shift, sample_weight, batch_size ): - K = len(self.pi) num_frames = x.shape[0] num_segments = int(np.floor((num_frames - frame_length) / frame_shift + 1)) @@ -539,65 +455,6 @@ def Estep(self, x, u_x=None, sample_weight=None, batch_size=None): """ return self.accum_suff_stats(x, u_x, sample_weight, batch_size) - def Estep_generator( - self, - generator, - num_steps, - return_log_h, - max_queue_size=10, - workers=1, - use_multiprocessing=False, - ): - """Expectation step, where data is read from a generator function. - - Args: - generator: data generator function returning a tuple - (x, u_x, sample_weight), (x, u_x), (x, sample_weight) or x. - num_steps: number of steps / epoch - return_log_h: returns accumlated log h(x). - max_queue_size: max. size of the generator queue. - workers: number of workers in the generator. - use_multiprocessing: use multi-processing in the generator queue. - - Returns: - N zero order sufficient statistics (number of samples). - Accumlated sufficient statistics \sum u(x). - Accumlated log h(x) (optional). 
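The generator-based fit path removed here duplicated the in-memory EM loop that the class keeps. As a reminder of that control flow (accumulate sufficient statistics, M-step, ELBO), here is a minimal hedged sketch; it assumes a `model` exposing the Estep/Mstep/elbo/accum_log_h methods named in the surrounding code, and everything else is illustrative.

```
import numpy as np

def em_fit_sketch(model, x, epochs=10, sample_weight=None, batch_size=None):
    # constant log h(x) term of the ELBO, accumulated once
    log_h = model.accum_log_h(x, sample_weight)
    elbo = np.zeros((epochs,))
    for epoch in range(epochs):
        # E-step: zero-order stats N and accumulated sufficient stats u(x)
        N, u_x = model.Estep(x, sample_weight=sample_weight, batch_size=batch_size)
        # M-step: update the mixture parameters from the statistics
        model.Mstep(N, u_x)
        elbo[epoch] = model.elbo(None, N=N, u_x=u_x, log_h=log_h)
    return elbo, elbo / x.shape[0]
```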
- """ - wait_time = 0.01 # in secs - queue = None - N = None - acc_u_x = None - log_h = 0 - try: - queue = GeneratorQueue( - generator, use_multiprocessing=use_multiprocessing, wait_time=wait_time - ) - queue.start(workers=workers, max_queue_size=max_queue_size) - queue_generator = queue.get() - - cur_step = 0 - for cur_step in range(num_steps): - data = next(queue_generator) - x, u_x, sample_weight = self.tuple2data(data) - N_i, u_x_i = self.Estep(x, u_x, sample_weight) - if return_log_h: - log_h += self.accum_log_h(x) - if cur_step == 0: - N = N_i - acc_u_x = u_x_i - else: - N += N_i - acc_u_x += u_x_i - finally: - if queue is not None: - queue.stop() - - if return_log_h: - return N, acc_u_x, log_h - else: - return N, acc_u_x - def sum_suff_stats(self, N, u_x): """Sums suff. stats from muttiple sub-processes. @@ -754,28 +611,6 @@ def get_config(self): base_config = super(ExpFamilyMixture, self).get_config() return dict(list(base_config.items()) + list(config.items())) - @staticmethod - def tuple2data(data): - if isinstance(data, tuple): - if len(data) == 2: - x, u_x = data - if u_x.ndim == 2: - sample_weight = None - elif u_x.ndim == 1: - sample_weight = u_x - u_x = None - else: - raise ValueError("Generator output: " + str(data)) - elif len(data) == 3: - x, u_x, sample_weight = data - else: - raise ValueError("Generator output: " + str(data)) - else: - x = data - u_x = None - sample_weight = None - return x, u_x, sample_weight - @staticmethod def compute_A_nat(eta): """Computes A_theta from the natural param.""" diff --git a/hyperion/np/pdfs/mixtures/gmm.py b/hyperion/np/pdfs/mixtures/gmm.py index ca197142..7b080dae 100644 --- a/hyperion/np/pdfs/mixtures/gmm.py +++ b/hyperion/np/pdfs/mixtures/gmm.py @@ -8,12 +8,22 @@ from scipy.special import erf from ....hyp_defs import float_cpu -from ....utils.math import (fullcov_varfloor, invert_pdmat, invert_trimat, - logdet_pdmat, logsumexp, softmax, symmat2vec, - vec2symmat) -from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D) +from ....utils.math_funcs import ( + fullcov_varfloor, + invert_pdmat, + invert_trimat, + logdet_pdmat, + logsumexp, + softmax, + symmat2vec, + vec2symmat, +) +from ....utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, +) from ...clustering import KMeans from ..core import Normal from .exp_family_mixture import ExpFamilyMixture @@ -292,7 +302,7 @@ def sample(self, num_samples, rng=None, seed=1024, r=None): Generated samples with shape (num_samples, x_dim). 
""" if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) if r is None: r = rng.multinomial(1, self.pi, size=(num_samples,)) diff --git a/hyperion/np/pdfs/mixtures/gmm_diag_cov.py b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py index 90141573..7589243e 100644 --- a/hyperion/np/pdfs/mixtures/gmm_diag_cov.py +++ b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py @@ -8,10 +8,13 @@ from scipy.special import erf from ....hyp_defs import float_cpu -from ....utils.math import logsumexp, softmax -from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D) +from ....utils.math_funcs import logsumexp, softmax +from ....utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, +) from ...clustering import KMeans from .exp_family_mixture import ExpFamilyMixture @@ -262,7 +265,7 @@ def sample(self, num_samples=1, rng=None, seed=1024, r=None): Generated samples with shape (num_samples, x_dim). """ if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) if r is None: r = rng.multinomial(1, self.pi, size=(num_samples,)) diff --git a/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py b/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py index 4dc8f46e..6ef7c891 100644 --- a/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py +++ b/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py @@ -7,10 +7,13 @@ from scipy.special import erf from ....hyp_defs import float_cpu -from ....utils.math import logsumexp, softmax -from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D) +from ....utils.math_funcs import logsumexp, softmax +from ....utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, +) from ...clustering import KMeans from .gmm_diag_cov import GMMDiagCov @@ -193,7 +196,7 @@ def sample(self, num_samples=1, rng=None, seed=1024, r=None): Generated samples with shape (num_samples, x_dim). """ if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) if r is None: r = rng.multinomial(1, self.pi, size=(num_samples,)) diff --git a/hyperion/np/pdfs/plda/frplda.py b/hyperion/np/pdfs/plda/frplda.py index 183725a7..af8c5d8b 100644 --- a/hyperion/np/pdfs/plda/frplda.py +++ b/hyperion/np/pdfs/plda/frplda.py @@ -7,7 +7,7 @@ from scipy import linalg as sla from ....hyp_defs import float_cpu -from ....utils.math import invert_pdmat, invert_trimat, logdet_pdmat +from ....utils.math_funcs import invert_pdmat, invert_trimat, logdet_pdmat from .plda_base import PLDABase @@ -465,7 +465,7 @@ def sample( assert self.is_init if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) Sb = invert_pdmat(self.B, return_inv=True)[-1] chol_Sb = sla.cholesky(Sb, lower=False) diff --git a/hyperion/np/pdfs/plda/plda.py b/hyperion/np/pdfs/plda/plda.py index fd2eb9a9..76299970 100644 --- a/hyperion/np/pdfs/plda/plda.py +++ b/hyperion/np/pdfs/plda/plda.py @@ -7,7 +7,7 @@ from scipy import linalg as sla from ....hyp_defs import float_cpu -from ....utils.math import invert_pdmat, invert_trimat, logdet_pdmat +from ....utils.math_funcs import invert_pdmat, invert_trimat, logdet_pdmat from .plda_base import PLDABase @@ -674,7 +674,7 @@ def sample(self, num_classes, num_samples_per_class, rng=None, seed=1024): Generated samples with shape (num_samples, x_dim). 
""" if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) x_dim = self.mu.shape[0] diff --git a/hyperion/np/pdfs/plda/splda.py b/hyperion/np/pdfs/plda/splda.py index f9322d26..5d397183 100644 --- a/hyperion/np/pdfs/plda/splda.py +++ b/hyperion/np/pdfs/plda/splda.py @@ -6,7 +6,7 @@ from scipy import linalg as sla from ....hyp_defs import float_cpu -from ....utils.math import invert_pdmat, invert_trimat, logdet_pdmat +from ....utils.math_funcs import invert_pdmat, invert_trimat, logdet_pdmat from .plda_base import PLDABase @@ -502,7 +502,7 @@ def sample(self, num_classes, num_samples_per_class, rng=None, seed=1024): Generated samples with shape (num_samples, x_dim). """ if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) Sw = invert_pdmat(self.W, return_inv=True)[-1] chol_Sw = sla.cholesky(Sw, lower=False) diff --git a/hyperion/np/transforms/skl_tsne.py b/hyperion/np/transforms/skl_tsne.py index 3f60c4be..ebabc6ec 100644 --- a/hyperion/np/transforms/skl_tsne.py +++ b/hyperion/np/transforms/skl_tsne.py @@ -23,7 +23,7 @@ class SklTSNE(NPModel): metric: the metric to use when calculating distance between instances in ['cosine', 'euclidean', 'l1', 'l2', 'precomputed'] or callable function. init: initialization method in ['random', 'pca'] or embedding matrix of shape (num_samples, num_comp) verbose: verbosity level. - rng: RandomState instance + rng: default_rng instance rng_seed: seed for random number generator method: gradient calculation method in [‘barnes_hut’, 'exact'] angle: angle thetha in Barnes-Hut TSNE @@ -53,7 +53,7 @@ def __init__( super().__init__(**kwargs) self.rng_seed = rng_seed if rng is None: - rng = np.random.RandomState(seed=rng_seed) + rng = np.random.default_rng(seed=rng_seed) self._tsne = TSNE( n_components=tsne_dim, diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 1e42a1c3..5e604e6a 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -9,7 +9,8 @@ import numpy as np import pandas as pd -#import k2 + +# import k2 import sentencepiece as spm import torchaudio.transforms as tat from jsonargparse import ActionParser, ActionYesNo, ArgumentParser @@ -24,17 +25,12 @@ from ...utils.segment_set import SegmentSet from ...utils.text import read_text from ..torch_defs import floatstr_torch - -#from torch.nn.utils.rnn import pad_sequence - - - +from .char_piece import CharPieceProcessor class AudioDataset(Dataset): - def __init__( self, - audio_file, + recordings_file, segments_file, class_names=None, class_files=None, @@ -46,7 +42,7 @@ def __init__( return_segment_info=None, return_orig=False, target_sample_freq=None, - wav_scale=2**15 - 1, + wav_scale=2 ** 15 - 1, is_val=False, ): @@ -61,12 +57,6 @@ def __init__( self.rank = rank self.world_size = world_size self.epoch = 0 - - if rank == 0: - logging.info("opening audio reader %s", audio_file) - - self.r = AR(audio_file, wav_scale=wav_scale) - if rank == 0: logging.info("loading segments file %s", segments_file) @@ -74,19 +64,27 @@ def __init__( if rank == 0: logging.info("dataset contains %d seqs", len(self.seg_set)) + if rank == 0: + logging.info("opening audio reader %s", recordings_file) + + audio_seg_set = self.seg_set if self.seg_set.has_time_marks else None + self.r = AR(recordings_file, segments=audio_seg_set, wav_scale=wav_scale) + self.is_val = is_val if time_durs_file is not None: - if rank == 0: - logging.info("loading durations file %s", 
time_durs_file) + self._load_legacy_durations(time_durs_file) + + # time_durs = SegmentSet.load(time_durs_file) + # self.seg_set["duration"] = time_durs.loc[ + # self.seg_set["id"]].class_id.values.astype(float, + # copy=False) + # else: + # assert "duration" in self.seg_set + assert "duration" in self.seg_set - time_durs = SegmentSet.load(time_durs_file) - self.seg_set["duration"] = time_durs.loc[ - self.seg_set["id"]].class_id.values.astype(np.float, - copy=False) - else: - assert "duration" in self.seg_set logging.info("loading class-info files") + self._load_class_infos(class_names, class_files, is_val) if bpe_model is not None: @@ -96,8 +94,9 @@ def __init__( if text_file is not None: logging.info("loading text files") self._load_text_infos(text_file, is_val) - self.return_segment_info = ([] if return_segment_info is None else - return_segment_info) + self.return_segment_info = ( + [] if return_segment_info is None else return_segment_info + ) self.return_orig = return_orig self.num_augs = num_augs @@ -106,11 +105,31 @@ def __init__( self.target_sample_freq = target_sample_freq self.resamplers = {} - def _load_bpe_model(self, bpe_model, is_val): + def _load_legacy_durations(self, time_durs_file): if self.rank == 0: - logging.info("loading bpe file %s" % bpe_model) - self.sp = spm.SentencePieceProcessor() - self.sp.load(bpe_model) + logging.info("loading durations file %s", time_durs_file) + + time_durs = SegmentSet.load(time_durs_file) + self.seg_set["duration"] = time_durs.loc[ + self.seg_set["id"] + ].class_id.values.astype(float, copy=False) + + def _load_bpe_model(self, bpe_model, is_val): + # if bpe_model end with .txt, it is a char piece model + # if bpe_model end with .model, it is a sentence piece model + if bpe_model.endswith(".txt"): + if self.rank == 0: + logging.info("loading char piece file %s", bpe_model) + self.sp = CharPieceProcessor() + self.sp.load(open(bpe_model).read().split()) + else: + if self.rank == 0: + logging.info("loading bpe file %s", bpe_model) + self.sp = spm.SentencePieceProcessor() + self.sp.load(bpe_model) + + + blank_id = self.sp.piece_to_id("") vocab_size = self.sp.get_piece_size() @@ -118,7 +137,7 @@ def _load_text_infos(self, text_file, is_val): if text_file is None: return if self.rank == 0: - logging.info("loading text file %s" % text_file) + logging.info("loading text file %s", text_file) text = read_text(text_file) self.seg_set["text"] = text.loc[self.seg_set["id"]].text @@ -131,8 +150,9 @@ def _load_class_infos(self, class_names, class_files, is_val): assert len(class_names) == len(class_files) for name, file in zip(class_names, class_files): - assert (name in self.seg_set - ), f"class_name {name} not present in the segment set" + assert ( + name in self.seg_set + ), f"class_name {name} not present in the segment set" if self.rank == 0: logging.info("loading class-info file %s" % file) table = ClassInfo.load(file) @@ -143,8 +163,9 @@ def _load_class_infos(self, class_names, class_files, is_val): segment_class_ids = self.seg_set[name].unique() for c_id in class_ids: if c_id not in segment_class_ids: - logging.warning("%s class: %s not present in dataset", - name, c_id) + logging.warning( + "%s class: %s not present in dataset", name, c_id + ) def _create_augmenters(self, aug_cfgs): self.augmenters = [] @@ -154,12 +175,11 @@ def _create_augmenters(self, aug_cfgs): for aug_cfg in aug_cfgs: logging.info(f"loading augmentation={aug_cfg}") - augmenter = SpeechAugment.create(aug_cfg, - random_seed=112358 + - 1000 * self.rank) + augmenter = 
SpeechAugment.create( + aug_cfg, random_seed=112358 + 1000 * self.rank + ) self.augmenters.append(augmenter) - self.reverb_context = max(augmenter.max_reverb_context, - self.reverb_context) + self.reverb_context = max(augmenter.max_reverb_context, self.reverb_context) def set_epoch(self, epoch): self.epoch = epoch @@ -201,12 +221,13 @@ def _parse_segment_item(self, segment): assert duration <= self.seg_set.loc[seg_id].duration, ( f"{seg_id} with start={start} duration " f"({self.seg_set.loc[seg_id].duration}) < " - f"chunk duration ({duration})") + f"chunk duration ({duration})" + ) else: seg_id, start, duration = segment, 0, 0 - if "start" in self.seg_set: - start += self.seg_set.loc[seg_id].start + # if "start" in self.seg_set: + # start += self.seg_set.loc[seg_id].start return seg_id, start, duration @@ -217,14 +238,23 @@ def _read_audio(self, seg_id, start, duration): start -= reverb_context read_duration = duration + reverb_context + # read audio + x, fs = self.r.read([seg_id], time_offset=start, time_durs=read_duration) + return x[0].astype(floatstr_torch(), copy=False), fs[0] + + def _read_audio0(self, seg_id, start, duration): + # how much extra audio we need to load to + # calculate the reverb of the first part of the audio + reverb_context = min(self.reverb_context, start) + start -= reverb_context + read_duration = duration + reverb_context + # read audio recording_id = self.seg_set.recording_ids(seg_id) - x, fs = self.r.read([recording_id], - time_offset=start, - time_durs=read_duration) + x, fs = self.r.read([recording_id], time_offset=start, time_durs=read_duration) return x[0].astype(floatstr_torch(), copy=False), fs[0] - def _apply_augs(self, x, num_samples, reverb_context_samples): + def _apply_augs(self, x, reverb_context_samples): x_augs = {} # for each type of augmentation for i, augmenter in enumerate(self.augmenters): @@ -233,7 +263,7 @@ def _apply_augs(self, x, num_samples, reverb_context_samples): # augment x x_aug, aug_info = augmenter(x) # remove the extra left context used to compute the reverberation. 
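The reverberation-context handling in _read_audio and _apply_augs follows a simple pattern: read some extra audio to the left of the requested chunk so that the reverberation of its first samples is computed from real signal, then drop that extra context after augmentation. A small self-contained sketch of the bookkeeping, with made-up numbers:

```
fs = 16000
duration = 2.0                       # seconds actually requested
start = 5.0                          # requested start time in the recording
reverb_context = min(1.0, start)     # cannot read before the start of the file

read_start = start - reverb_context
read_duration = duration + reverb_context
x = [0.0] * int(read_duration * fs)  # placeholder for reader.read(..., time_offset=read_start)

# after augmentation, keep only the originally requested chunk
reverb_context_samples = int(reverb_context * fs)
x_aug = x[reverb_context_samples:len(x)]
assert len(x_aug) == int(duration * fs)
```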
- x_aug = x_aug[reverb_context_samples:len(x)] + x_aug = x_aug[reverb_context_samples : len(x)] x_aug = x_aug.astype(floatstr_torch(), copy=False) x_augs[f"x_aug_{i}_{j}"] = x_aug @@ -293,6 +323,7 @@ def __getitem__(self, segment): x, fs = self._read_audio(seg_id, start, duration) x, fs = self._resample(x, fs) data = {"seg_id": seg_id, "sample_freq": fs} + if self.augmenters: # augmentations if duration == 0: @@ -300,7 +331,7 @@ def __getitem__(self, segment): else: num_samples = int(duration * fs) reverb_context_samples = len(x) - num_samples - x_augs = self._apply_augs(x, num_samples, reverb_context_samples) + x_augs = self._apply_augs(x, reverb_context_samples) data.update(x_augs) # add original non augmented audio @@ -311,17 +342,19 @@ def __getitem__(self, segment): else: data["x"] = x - # try: - # import soundfile as sf - - # for i, z in enumerate(r): - # sf.write(f"file_{seg_id}.wav", z, fs, "PCM_16") - # except: - # print("soundfile failed", flush=True) - - # adds the segment labels seg_info = self._get_segment_info(seg_id) data.update(seg_info) + if np.any(~np.isfinite(data["x"])): + print( + "zzz", + x.max(), + x.min(), + x.mean(), + data["x"].max(), + data["x"].min(), + data["x"].mean(), + flush=True, + ) return data @staticmethod @@ -329,7 +362,7 @@ def filter_args(**kwargs): ar_args = AR.filter_args(**kwargs) valid_args = ( - "audio_file", + "recordings_file", "segments_file", "aug_cfgs", "num_augs", @@ -352,48 +385,44 @@ def add_class_args(parser, prefix=None, skip=set()): outer_parser = parser parser = ArgumentParser(prog="") - if "audio_file" not in skip: + if "recordings_file" not in skip: parser.add_argument( - "--audio-file", + "--recordings-file", required=True, - help=("audio manifest file"), + help=("recordings manifest file (kaldi .scp or pandas .csv)"), ) if "segments_file" not in skip: parser.add_argument( "--segments-file", required=True, - help=("segments manifest file"), + help=("segments manifest file (kaldi .scp or pandas .csv)"), ) parser.add_argument( "--class-names", default=None, nargs="+", - help= - ("list with the names of the types of classes in the datasets, e.g., speaker, language" - ), + help=( + "list with the names of the types of classes in the datasets, e.g., speaker, language" + ), ) parser.add_argument( - "--class-files", - default=None, - nargs="+", - help=("list of class info files"), + "--class-files", default=None, nargs="+", help=("list of class info files"), ) parser.add_argument( "--time-durs-file", default=None, - help= - ("segment to duration in secs file, if durations are not in segments_file" - ), + help=( + "(deprecated) segment to duration in secs file, if durations are not in segments_file" + ), ) + parser.add_argument( - "--bpe-model", - default=None, - help=("bpe model for the text label"), + "--bpe-model", default=None, help=("bpe model for the text label"), ) parser.add_argument( @@ -418,32 +447,31 @@ def add_class_args(parser, prefix=None, skip=set()): "--return-segment-info", default=None, nargs="+", - help= - ("list of columns of the segment file which should be returned as supervisions" - ), + help=( + "list of columns of the segment file which should be returned as supervisions" + ), ) parser.add_argument( "--return-orig", default=False, action=ActionYesNo, - help= - ("when using augmentation, whether or not to return also the original audio" - ), + help=( + "when using augmentation, whether or not to return also the original audio" + ), ) parser.add_argument( "--target-sample-freq", default=None, type=int, - help= - 
("target sampling frequencey, if not None all audios are converted to this sample freq" - ), + help=( + "target sampling frequencey, if not None all audios are converted to this sample freq" + ), ) AR.add_class_args(parser) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) # help='audio dataset options') add_argparse_args = add_class_args diff --git a/hyperion/torch/data/char_piece.py b/hyperion/torch/data/char_piece.py new file mode 100644 index 00000000..43c07619 --- /dev/null +++ b/hyperion/torch/data/char_piece.py @@ -0,0 +1,34 @@ +import logging + +class CharPieceProcessor: + def __init__(self): + self.token2id = {} + self.id2token = {} + + def load(self, token_list): + for idx, token in enumerate(token_list): + self.token2id[token] = idx + self.id2token[idx] = token + logging.info("Loaded {} tokens".format(len(self.token2id))) + logging.info("First 10 tokens: {}".format(list(self.token2id.keys())[:10])) + return True + + + def piece_to_id(self, token): + return self.token2id.get(token, self.token2id[""]) + + def id_to_piece(self, idx): + return self.id2token.get(idx, "") + + def encode_as_pieces(self, text): + return [char for char in text] + + def encode(self, text, out_type=int): + assert out_type in [int] + return [self.piece_to_id(char) for char in text] + + def decode(self, ids): + return ''.join([self.id_to_piece(idx) for idx in ids]) + + def get_piece_size(self): + return len(self.token2id) diff --git a/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py b/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py new file mode 100644 index 00000000..1509d446 --- /dev/null +++ b/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py @@ -0,0 +1,265 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import math +from jsonargparse import ArgumentParser, ActionParser +import logging + +import numpy as np +import pandas as pd +import torch +import torch.distributed as dist + +from .hyp_sampler import HypSampler +from .class_weighted_seg_sampler import ClassWeightedRandomSegSampler + + +class ClassWeightedRandomBucketingSegSampler(HypSampler): + def __init__(self, + seg_set, + class_info, + base_sampler=ClassWeightedRandomSegSampler, + num_buckets=10, + length_column="duration", + num_chunks_per_seg_epoch=1.0, + weight_exponent=1.0, + weight_mode="custom", + class_name="language", + max_audio_length=None, + seed=1234, + **base_kwargs): + super().__init__(shuffle=False, seed=seed) + self.class_name = class_name + self.seg_set = seg_set + self.class_info = class_info + self.base_sampler = base_sampler + self.base_kwargs = base_kwargs + self.base_kwargs["seed"] = seed + self.num_buckets = num_buckets + self.length_column = length_column + self.num_chunks_per_seg_epoch = num_chunks_per_seg_epoch + self.weight_exponent = weight_exponent + self.max_audio_length = max_audio_length + self.weight_mode = weight_mode + self._gather_class_info() + self._set_class_weights() + self._create_bucket_samplers() + self._compute_len() + self.depleted_buckets = torch.zeros((num_buckets, ), dtype=torch.bool) + + def create_buckets(self): + # class_ids = self._sample_classes() + sort_idx = np.argsort(self.seg_set[self.length_column].values) + sorted_seg_set = self.seg_set.iloc[sort_idx] + # import pdb; pdb.set_trace() + # remove audio length larger than max_audio_length + 
if self.max_audio_length is not None: + sorted_seg_set = sorted_seg_set.loc[sorted_seg_set[self.length_column] <= self.max_audio_length] + cum_lengths = np.cumsum(sorted_seg_set[self.length_column].values, + axis=0) + bucket_length = cum_lengths[-1] / self.num_buckets + buckets = [] + for i in range(self.num_buckets): + # logging.info("self.seg_set", self.seg_set.get_col_idx(self.length_column)) + # logging.info("sorted_seg_set", sorted_seg_set.get_col_idx(self.length_column)) + bucket_idx = (cum_lengths <= bucket_length) & (cum_lengths > 0) + bucket_i = sorted_seg_set.loc[bucket_idx] + # logging.info("bucket_i", bucket_i.get_col_idx(self.length_column)) + buckets.append(bucket_i) + cum_lengths -= bucket_length + + return buckets + + def _create_bucket_samplers(self): + buckets = self.create_buckets() + bucket_samplers = [] + for i in range(self.num_buckets): + sampler_i = self.base_sampler(buckets[i], + self.class_info, + class_name=self.class_name, + num_chunks_per_seg_epoch=self.num_chunks_per_seg_epoch, + **self.base_kwargs) + bucket_samplers.append(sampler_i) + + self.bucket_samplers = bucket_samplers + + def __len__(self): + return self._len + + def _gather_class_info(self): + # we get some extra info that we need for the classes. + + # we need the maximum/minimum segment duration for each class. + total_dur = np.zeros(len(self.class_info)) + for i, c in enumerate(self.class_info["id"]): + seg_idx = self.seg_set[self.class_name] == c + if seg_idx.sum() > 0: + durs_i = self.seg_set.loc[seg_idx, self.length_column] + total_dur[i] = durs_i.sum() + else: + total_dur[i] = 0 + + self.class_info["total_duration"] = total_dur + # logging.info("total_duration", self.class_info["total_duration"]) + + # we need the mapping from class index to id + self.map_class_idx_to_ids = self.class_info[["class_idx", "id"]] + self.map_class_idx_to_ids.set_index("class_idx", inplace=True) + + def _set_class_weights(self): + # logging.info("setting class weights") + # logging.info(f'weight mode:{self.weight_mode}') + # logging.info(f'weight exponent:{self.weight_exponent}') + # import pdb; pdb.set_trace() + if self.weight_mode == "uniform": + self.class_info.set_uniform_weights() + elif self.weight_mode == "data-prior": + weights = self.class_info["total_duration"].values + self.class_info.set_weights(weights) + logging.info(f'data-prior weight:{self.class_info["weights"]}') + + if self.weight_exponent != 1.0: + self.class_info.exp_weights(self.weight_exponent) + logging.info(f'weight_exponent weight:{self.class_info["weights"]}') + + + def _compute_len(self): + self._len = 0 + for i in range(self.num_buckets): + self._len += len(self.bucket_samplers[i]) + + def set_epoch(self, epoch): + for i in range(self.num_buckets): + self.bucket_samplers[i].set_epoch(epoch) + + def __iter__(self): + super().__iter__() + self.depleted_buckets[:] = False + for i in range(self.num_buckets): + self.bucket_samplers[i].__iter__() + + return self + + def all_buckets_depleted(self): + return torch.all(self.depleted_buckets).item() + + def __next__(self): + + if self.batch == self._len or self.all_buckets_depleted(): + raise StopIteration + + while True: + bucket_idx = torch.randint(low=0, + high=self.num_buckets, + size=(1, ), + generator=self.rng).item() + if self.depleted_buckets[bucket_idx]: + continue + + bucket = self.bucket_samplers[bucket_idx] + try: + batch = next(bucket) + break + except StopIteration: + self.depleted_buckets[bucket_idx] = True + if self.all_buckets_depleted(): + raise StopIteration() + + if self.batch 
== 0: + logging.info("batch 0 chunks=%s", str(batch[:10])) + + self.batch += 1 + return batch + + @property + def avg_batch_size(self): + avg_batch_size = 0 + for sampler in self.bucket_samplers: + avg_batch_size += sampler.avg_batch_size + + avg_batch_size /= self.num_buckets + return avg_batch_size + + @staticmethod + def filter_args(**kwargs): + + valid_args = ( + "num_buckets", + "length_column", + "num_chunks_per_seg_epoch", + "weight_exponent", + "weight_mode", + "max_audio_length", + "class_name", + "length_column", + "shuffle", + "seed", + ) + + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + + parser.add_argument( + "--num-chunks-per-seg-epoch", + default=1, + type=lambda x: x if x == "auto" else float(x), + help=("number of times we sample a segment in each epoch"), + ) + + parser.add_argument( + "--weight-exponent", + default=1.0, + type=float, + help=("exponent for class weights"), + ) + + + parser.add_argument( + "--max-audio-length", + default=None, + type=float, + help=("the maximum length of an audio segment in seconds"), + ) + + parser.add_argument( + "--weight-mode", + default="custom", + choices=["custom", "uniform", "data-prior"], + help=("method to get the class weights"), + ) + + parser.add_argument( + "--shuffle", + action=ActionYesNo, + help="shuffles the segments or chunks at the beginning of the epoch", + ) + + parser.add_argument( + "--seed", + type=int, + default=1234, + help=("seed for sampler random number generator"), + ) + + parser.add_argument( + "--length-column", + default="duration", + help="which column in the segment table indicates the duration of the segment", + ) + parser.add_argument( + "--class-name", + default="class_id", + help="which column in the segment table indicates the class of the segment", + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py index 7fbfbd71..5020c5bd 100644 --- a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py +++ b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py @@ -205,7 +205,8 @@ def _set_class_weights(self): self.class_info.set_uniform_weights() elif self.weight_mode == "data-prior": weights = self.class_info["total_duration"].values - self.class_info.set_weights(self, weights) + logging.info(weights) + self.class_info.set_weights(weights) if self.weight_exponent != 1.0: self.class_info.exp_weights(self.weight_exponent) @@ -217,6 +218,7 @@ def _set_class_weights(self): self.var_weights = np.any( self.seg_set[self.length_name] < self.max_chunk_length ) + logging.info(f'updated weight:{self.class_info["weights"]}') @property def hard_prototype_mining(self): @@ -244,7 +246,7 @@ def set_hard_prototypes(self, affinity_matrix): ).indices def get_hard_prototypes(self, class_idx): - return self.hard_prototypes[class_idx].flatten().numpy() + return self.hard_prototypes[class_idx].flatten().cpu().numpy() def _sample_chunk_length(self): if self.var_batch_size: diff --git a/hyperion/torch/data/class_weighted_seg_sampler.py b/hyperion/torch/data/class_weighted_seg_sampler.py new file mode 100644 index 00000000..5af8cdcc --- /dev/null +++ b/hyperion/torch/data/class_weighted_seg_sampler.py @@ -0,0 +1,364 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus 
Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import math +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo +import logging +import copy + + +import numpy as np + +import torch +from .hyp_sampler import HypSampler + + +def get_loc(seg_set, keys): + if isinstance(keys, (list, np.ndarray)): + return seg_set.index.get_indexer(keys) + + loc = seg_set.index.get_loc(keys) + if isinstance(loc, int): + return loc + elif isinstance(loc, np.ndarray) and loc.dtype == np.bool: + return np.nonzero(loc)[0] + else: + return list(range(loc.start, loc.stop, loc.step)) + +class ClassWeightedRandomSegSampler(HypSampler): + def __init__( + self, + seg_set, + class_info, + min_batch_size=1, + max_batch_size=None, + max_batch_length=None, + num_chunks_per_seg_epoch=1, + length_name="duration", + shuffle=False, + drop_last=False, + num_segs_per_class=1, + class_name="class_id", + seed=1234, + ): + super().__init__(shuffle=shuffle, seed=seed) + self.class_info = copy.deepcopy(class_info) + self.num_segs_per_class = num_segs_per_class + self.class_name=class_name + self.num_chunks_per_seg_epoch = num_chunks_per_seg_epoch + self.seg_set = seg_set + self.min_batch_size = min_batch_size + self.max_batch_size = max_batch_size + self.max_batch_length = max_batch_length + self.var_batch_size = max_batch_length is not None + self.length_name = length_name + if self.var_batch_size: + avg_batch_size = max_batch_length / np.mean( + self.seg_set[self.length_name]) + else: + avg_batch_size = min_batch_size + + self.avg_batch_size = avg_batch_size + + if drop_last: + self._len = int( + self.num_chunks_per_seg_epoch * len(self.seg_set) / (avg_batch_size * self.world_size)) + else: + self._len = int( + math.ceil( + (self.num_chunks_per_seg_epoch * len(self.seg_set) // self.world_size) / avg_batch_size)) + + self._gather_class_info() + self._permutation = None + + + def _gather_class_info(self): + # we get some extra info that we need for the classes. + + # we need the maximum/minimum segment duration for each class. 
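The class-weighted samplers in this file reduce to a two-stage draw: sample classes according to (renormalized, optionally exponentiated) weights, then sample segments uniformly inside each drawn class. The sketch below shows that two-stage draw on toy data; the tensors, the weight exponent and the per-class segment lists are all illustrative, not the sampler's internal state.

```
import torch

durations = torch.tensor([100.0, 10.0, 1.0])      # e.g. total duration per class
weights = durations.pow(0.5)                      # exponent < 1 flattens the prior
weights = weights / weights.sum()                 # renormalize after exponentiation

g = torch.Generator().manual_seed(1234)
class_idx = torch.multinomial(weights, num_samples=8, replacement=True, generator=g)

segs_per_class = {0: ["a1", "a2", "a3"], 1: ["b1", "b2"], 2: ["c1"]}
batch = []
for c in class_idx.tolist():
    segs = segs_per_class[c]
    j = torch.randint(len(segs), (1,), generator=g).item()
    batch.append(segs[j])
print(batch)
```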
+ max_dur = np.zeros(len(self.class_info)) + min_dur = np.zeros(len(self.class_info)) + total_dur = np.zeros(len(self.class_info)) + for i, c in enumerate(self.class_info["id"]): + seg_idx = self.seg_set[self.class_name] == c + if seg_idx.sum() > 0: + durs_i = self.seg_set.loc[seg_idx, self.length_name] + max_dur[i] = durs_i.max() + min_dur[i] = durs_i.min() + total_dur[i] = durs_i.sum() + else: + max_dur[i] = min_dur[i] = total_dur[i] = 0 + + self.class_info["max_seg_duration"] = max_dur + self.class_info["min_seg_duration"] = min_dur + self.class_info["total_duration"] = total_dur + # logging.info("total_duration", self.class_info["total_duration"]) + + # we need the mapping from class index to id + self.map_class_idx_to_ids = self.class_info[["class_idx", "id"]] + self.map_class_idx_to_ids.set_index("class_idx", inplace=True) + + # we need the list of segments from each class + # to speed up segment sampling + # searching then in each batch, it is too slow + map_class_to_segs = self.seg_set[["id", self.class_name]].set_index( + self.class_name + ) + self.map_class_to_segs_idx = {} + for class_id in self.class_info["id"].values: + if class_id in map_class_to_segs.index: + seg_ids = map_class_to_segs.loc[class_id, "id"] + if isinstance(seg_ids, str): + seg_ids = [seg_ids] + else: + seg_ids = seg_ids.values + + seg_idx = get_loc(self.seg_set,seg_ids) + else: + seg_idx = [] + logging.warning("no segments found with class=%s", class_id) + self.class_info.loc[class_id, "weights"] = 0.0 + self.class_info.renorm_weights() + + self.map_class_to_segs_idx[class_id] = seg_idx + logging.info(f'weight_exponent weight:{self.class_info["weights"]}') + + + def _get_class_weights(self): + # if not self.var_weights: + # return torch.as_tensor(self.class_info["weights"].values) + + class_weights = self.class_info["weights"].values.copy() + # renormalize weights + class_weights /= class_weights.sum() + return torch.as_tensor(class_weights) + + def _sample_classes(self, num_classes): + weights = self._get_class_weights() + # logging.info("weights: %s", weights) + + row_idx = torch.multinomial( + weights, num_samples=num_classes, replacement=True, generator=self.rng, + ).numpy() + + class_ids = self.class_info.iloc[row_idx].id.values + + return class_ids + + + def _sample_segs(self, class_ids): + + dur_col_idx = self.seg_set.columns.get_loc(self.length_name) + id_col_idx = self.seg_set.columns.get_loc("id") + + seg_ids = [] + for c in class_ids: + # for each class we sample segments longer than chunk length + # get segments belonging to c + # t1 = time.time() + seg_idx_c = self.map_class_to_segs_idx[c] + # seg_idx_c = self.map_class_to_segs_idx[c] + # t2 = time.time() + durs = self.seg_set.iloc[seg_idx_c, dur_col_idx].values + # if self.class_info.loc[c, "min_seg_duration"] < chunk_length: + # mask = durs >= chunk_length + # seg_idx_c = seg_idx_c[mask] + # durs = durs[mask] + + # t3 = time.time() + # sample num_segs_per_class random segments + if len(seg_idx_c) == 0: + logging.error("no segments found with class=%s dur=%d", c, chunk_length) + + sel_idx = torch.randint( + low=0, + high=len(seg_idx_c), + size=(self.num_segs_per_class,), + generator=self.rng, + ).numpy() + + sel_seg_idx_c = seg_idx_c[sel_idx] + sel_seg_ids_c = list(self.seg_set.iloc[sel_seg_idx_c, id_col_idx]) + # t5 = time.time() + seg_ids.extend(sel_seg_ids_c) + # t6 = time.time() + # logging.info( + # "stime %f %f %f %f %f", t2 - t1, t3 - t2, t4 - t3, t5 - t4, t6 - t5 + # ) + + return seg_ids + + def __len__(self): + return self._len + + def 
_shuffle_segs(self): + self._permutation = torch.randperm(len(self.seg_set), + generator=self.rng).numpy() + + def __iter__(self): + super().__iter__() + if self.shuffle: + self._shuffle_segs() + + self.start = self.rank + return self + + def __next__(self): + + if self.batch == self._len: + raise StopIteration + + + if self.var_batch_size: + column_idx = self.seg_set.columns.get_loc(self.length_name) + idxs = [] + max_length = 0 + batch_size = 0 + while True: + if self.shuffle: + idx = self._permutation[self.start] + else: + idx = self.start + + max_length = max(max_length, self.seg_set.iloc[idx, + column_idx]) + if max_length * (batch_size + 1) > self.max_batch_length: + break + + idxs.append(idx) + self.start = (self.start + self.world_size) % len(self.seg_set) + batch_size += 1 + if (self.max_batch_size is not None + and batch_size >= self.max_batch_size): + break + + assert len( + idxs + ) >= 1, f"increase max_batch_length {self.max_batch_length} >= {max_length}" + else: + stop = min(self.start + self.world_size * self.min_batch_size, + len(self.seg_set)) + if self.shuffle: + idxs = self._permutation[self.start:stop:self.world_size] + else: + idxs = slice(self.start, stop, self.world_size) + + self.start += self.world_size * self.min_batch_size + + + class_ids = self._sample_classes(batch_size) + seg_ids = self._sample_segs(class_ids) + + + # if "chunk_start" in self.seg_set: + # chunks = self.seg_set.iloc[idxs] + # seg_ids = [(id, s, d) for id, s, d in zip( + # chunks.seg_id, chunks.chunk_start, chunks[self.length_name])] + # else: + # seg_ids = self.seg_set.iloc[idxs].id.values + + if self.batch == 0: + logging.info("batch 0 seg_ids=%s", str(seg_ids[:10])) + + self.batch += 1 + return seg_ids + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "min_batch_size", + "max_batch_size", + "max_batch_length", + "length_name", + "num_segs_per_class", + "num_chunks_per_seg_epoch", + "class_name", + "shuffle", + "drop_last", + "seed", + ) + + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--min-batch-size", + type=int, + default=1, + help=("minimum batch size per gpu"), + ) + parser.add_argument( + "--max-batch-size", + type=int, + default=None, + help= + ("maximum batch size per gpu, if None, estimated from max_batch_length" + ), + ) + + parser.add_argument( + "--max-batch-duration", + type=float, + default=None, + help= + ("maximum accumlated duration of the batch, if None estimated from the min/max_batch_size and min/max_chunk_lengths" + ), + ) + + parser.add_argument( + "--drop-last", + action=ActionYesNo, + help="drops the last batch of the epoch", + ) + + parser.add_argument( + "--shuffle", + action=ActionYesNo, + help= + "shuffles the segments or chunks at the beginning of the epoch", + ) + + parser.add_argument( + "--seed", + type=int, + default=1234, + help=("seed for sampler random number generator"), + ) + + parser.add_argument( + "--length-name", + default="duration", + help= + "which column in the segment table indicates the duration of the file", + ) + + parser.add_argument( + "--num-chunks-per-seg-epoch", + default=1, + type=lambda x: x if x == "auto" else float(x), + help=("number of times we sample a segment in each epoch"), + ) + + parser.add_argument( + "--num-segs-per-class", + type=int, + default=1, + help=("number of segments per class in batch"), + ) + 
parser.add_argument( + "--class-name", + default="class_id", + help="which column in the segment table indicates the class of the segment", + ) + + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/data/seg_sampler_factory.py b/hyperion/torch/data/seg_sampler_factory.py index 35973f50..0a9a8a69 100644 --- a/hyperion/torch/data/seg_sampler_factory.py +++ b/hyperion/torch/data/seg_sampler_factory.py @@ -13,14 +13,20 @@ ClassWeightedRandomSegChunkSampler from .feat_seq_dataset import FeatSeqDataset from .seg_chunk_sampler import SegChunkSampler +from .bucketing_seg_sampler import BucketingSegSampler +from .class_weighted_bucketing_seg_sampler import ClassWeightedRandomBucketingSegSampler +from .class_weighted_seg_sampler import ClassWeightedRandomSegSampler + from .seg_sampler import SegSampler sampler_dict = { "class_weighted_random_seg_chunk_sampler": ClassWeightedRandomSegChunkSampler, "seg_sampler": SegSampler, + "class_weighted_seg_sampler": ClassWeightedRandomSegSampler, "seg_chunk_sampler": SegChunkSampler, "bucketing_seg_sampler": BucketingSegSampler, + "class_weighted_random_bucketing_seg_sampler": ClassWeightedRandomBucketingSegSampler, } @@ -46,7 +52,7 @@ def create( sampler_class = sampler_dict[sampler_type] sampler_kwargs = sampler_class.filter_args(**kwargs) - if sampler_type in ["bucketing_seg_sampler", "seg_chunk_sampler"]: + if sampler_type in ["bucketing_seg_sampler", "seg_chunk_sampler", "class_weighted_random_bucketing_seg_sampler"]: base_sampler_class = sampler_dict[base_sampler_type] base_sampler_kwargs = base_sampler_class.filter_args(**kwargs) sampler_kwargs.update(base_sampler_kwargs) @@ -56,7 +62,9 @@ def create( base_sampler_kwargs = base_sampler_class.filter_args(**kwargs) sampler_kwargs.update(base_sampler_kwargs) - if sampler_type in ["class_weighted_random_seg_chunk_sampler"]: + if sampler_type in ["class_weighted_random_seg_chunk_sampler", "class_weighted_random_bucketing_seg_sampler"]: + # import pdb; pdb.set_trace() + logging.info(f"sampler-args={sampler_kwargs}") try: class_name = sampler_kwargs["class_name"] except: @@ -78,6 +86,7 @@ def filter_args(**kwargs): "min_batch_size", "max_batch_size", "max_batch_length", + "max_audio_length", "num_chunks_per_seg_epoch", "num_segs_per_class", "num_chunks_per_seg", @@ -111,7 +120,7 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--base-sampler-type", - choices=["seg_sampler", "bucketing_seg_sampler"], + choices=["seg_sampler", "bucketing_seg_sampler", "bucketing_seg_sampler","class_weighted_seg_sampler"], default="seg_sampler", help= "base sampler used for seg_chunk_sampler or bucketing_seg_sampler", @@ -146,6 +155,15 @@ def add_class_args(parser, prefix=None): ), ) + + parser.add_argument( + "--max-audio-length", + default=None, + type=float, + help=("the maximum length of an audio segment in seconds"), + ) + + parser.add_argument( "--batch-size", default=None, diff --git a/hyperion/torch/layer_blocks/__init__.py b/hyperion/torch/layer_blocks/__init__.py index 22cc629d..61d97285 100644 --- a/hyperion/torch/layer_blocks/__init__.py +++ b/hyperion/torch/layer_blocks/__init__.py @@ -9,6 +9,7 @@ from .dc2d_blocks import DC2dDecBlock, DC2dEncBlock from .etdnn_blocks import ETDNNBlock from .fc_blocks import FCBlock +from .film_blocks import FiLM, RNNWithFiLM, RNNWithFiLMResidual from .mbconv_blocks import MBConvBlock, MBConvInOutBlock from .res2net1d_blocks import Res2Net1dBasicBlock, Res2Net1dBNBlock from 
.res2net2d_blocks import Res2Net2dBasicBlock, Res2Net2dBNBlock @@ -31,8 +32,11 @@ from .spine_blocks import BlockSpec, SpineConv, SpineEndpoints, SpineResample from .tdnn_blocks import TDNNBlock from .transducer_joiner import TransducerJoiner -from .transducer_predictor import (TransducerConvPredictor, - TransducerRNNPredictor) +from .transducer_predictor import TransducerRNNPredictor, TransducerConvPredictor + +from .transducer_film_joiner import TransducerFiLMJoiner +from .transducer_film_predictor import TransducerRNNFiLMPredictor + from .transformer_conv2d_subsampler import TransformerConv2dSubsampler from .transformer_encoder_v1 import TransformerEncoderBlockV1 from .transformer_feedforward import (Conv1dLinear, Conv1dx2, diff --git a/hyperion/torch/layer_blocks/film_blocks.py b/hyperion/torch/layer_blocks/film_blocks.py new file mode 100644 index 00000000..28871e9c --- /dev/null +++ b/hyperion/torch/layer_blocks/film_blocks.py @@ -0,0 +1,131 @@ +import torch +import torch.nn as nn + +class FiLM(nn.Module): + def __init__(self, input_size, condition_size, film_type="linear"): + # condition_size: the size of the language id vector + # input_size: the size of the RNN input to the FiLM layer + super(FiLM, self).__init__() + if film_type == "tanh": + self.linear_scale = nn.Sequential( + nn.Linear(condition_size, input_size), + nn.Tanh() + ) + self.linear_shift = nn.Sequential( + nn.Linear(condition_size, input_size), + nn.Tanh() + ) + elif film_type == "linear": + self.linear_scale = nn.Linear(condition_size, input_size) + self.linear_shift = nn.Linear(condition_size, input_size) + + def forward(self, x, lang_condition): + # import pdb; pdb.set_trace() + if x.ndim == 3: + gamma = self.linear_scale(lang_condition).unsqueeze(1).expand_as(x) + beta = self.linear_shift(lang_condition).unsqueeze(1).expand_as(x) + x = x * gamma + beta + elif x.ndim == 4: + gamma = self.linear_scale(lang_condition).unsqueeze(1).unsqueeze(2).expand_as(x) + beta = self.linear_shift(lang_condition).unsqueeze(1).unsqueeze(2).expand_as(x) + x = x * gamma + beta + return x + + + +class RNNWithFiLM(nn.Module): + def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, batch_first=True, rnn_type="lstm", film_type="tanh", film_cond_type="one-hot"): + super(RNNWithFiLM, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.dropout = dropout + self.batch_first = batch_first + self.rnn_type = rnn_type + self.film_type = film_type + self.film_cond_type = film_cond_type + + if self.rnn_type == "lstm": + self.lstms = nn.ModuleList([nn.LSTM(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) + elif self.rnn_type == "gru": + self.grus = nn.ModuleList([nn.GRU(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) + + self.films = nn.ModuleList([FiLM(hidden_size, condition_size, film_type) for _ in range(num_layers)]) + if self.film_cond_type == "lid_pred_embed": + self.lid_films = nn.ModuleList([FiLM(hidden_size, condition_size, film_type) for _ in range(num_layers)]) + + self.dropout_layer = nn.Dropout(dropout) + + def forward(self, x, states, lang_condition): + outputs = [] + new_h, new_c = [], [] + if self.rnn_type == "lstm": + rnns = self.lstms + elif self.rnn_type == "gru": + rnns = self.grus + + if self.film_cond_type in ["one-hot", "lid_pred"]: + films = self.films + else: + films = self.lid_films + + for i, (rnn, film) in 
enumerate(zip(rnns, films)): + if states: + x, (h_i, c_i) = rnn(x, (states[0][i].unsqueeze(0), states[1][i].unsqueeze(0))) + else: + x, (h_i, c_i) = rnn(x) + x = film(x, lang_condition) + new_h.append(h_i) + new_c.append(c_i) + if i != self.num_layers - 1: + x = self.dropout_layer(x) + outputs.append(x) + new_h = torch.cat(new_h, dim=0) + new_c = torch.cat(new_c, dim=0) + return x, (new_h, new_c) + + +class RNNWithFiLMResidual(nn.Module): + def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, batch_first=True, rnn_type="lstm_residual", film_type="linear"): + super(RNNWithFiLMResidual, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.dropout = dropout + self.batch_first = batch_first + self.rnn_type = rnn_type + if self.rnn_type == "lstm_residual": + self.lstms = nn.ModuleList([nn.LSTM(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) + elif self.rnn_type == "gru_residual": + self.grus = nn.ModuleList([nn.GRU(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) + self.film_type = film_type + self.films = nn.ModuleList([FiLM(hidden_size, condition_size, film_type) for _ in range(num_layers)]) + self.dropout_layer = nn.Dropout(dropout) + + def forward(self, x, states, lang_condition): + outputs = [] + new_h, new_c = [], [] + + if self.rnn_type == "lstm_residual": + rnns = self.lstms + elif self.rnn_type == "gru_residual": + rnns = self.grus + + for i, (rnn, film) in enumerate(zip(rnns, self.films)): + if states: + x, (h_i, c_i) = rnn(x, (states[0][i].unsqueeze(0), states[1][i].unsqueeze(0))) + else: + x, (h_i, c_i) = rnn(x) + x = film(x, lang_condition) + if i != 0: + x = x + residual + residual = x + new_h.append(h_i) + new_c.append(c_i) + if i != self.num_layers - 1: + x = self.dropout_layer(x) + outputs.append(x) + new_h = torch.cat(new_h, dim=0) + new_c = torch.cat(new_c, dim=0) + return x, (new_h, new_c) + diff --git a/hyperion/torch/layer_blocks/res2net_blocks.py b/hyperion/torch/layer_blocks/res2net_blocks.py index 73255a24..8de700c4 100644 --- a/hyperion/torch/layer_blocks/res2net_blocks.py +++ b/hyperion/torch/layer_blocks/res2net_blocks.py @@ -410,9 +410,6 @@ def forward(self, x, x_mask=None): x += residual - if not self.norm_before: - x = self.bn3(x) - if self.dropout_rate > 0: x = self.dropout(x) diff --git a/hyperion/torch/layer_blocks/transducer_film_joiner.py b/hyperion/torch/layer_blocks/transducer_film_joiner.py new file mode 100644 index 00000000..dde91778 --- /dev/null +++ b/hyperion/torch/layer_blocks/transducer_film_joiner.py @@ -0,0 +1,88 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba, Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from .film_blocks import FiLM + + +class TransducerFiLMJoiner(nn.Module): + """ RNN-T Joiner network. + Implementation based on + https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/transducer_stateless7/joiner.py + + Attributes: + in_feats: input feature dimension. 
+ vocab_size: vocabulary size + """ + + def __init__(self, enc_feats: int, pred_feats: int, hid_feats: int, vocab_size: int, condition_size: int, film_type: str = "linear", film_cond_type="one-hot"): + + super().__init__() + self.enc_feats = enc_feats + self.pred_feats = pred_feats + self.hid_feats = hid_feats + self.vocab_size = vocab_size + + self.enc_proj = nn.Linear(enc_feats, hid_feats) + self.pred_proj = nn.Linear(pred_feats, hid_feats) + self.output = nn.Linear(hid_feats, vocab_size) + + self.film_cond_type = film_cond_type + + self.film = FiLM(hid_feats, condition_size, film_type) + + if self.film_cond_type == "lid_pred_embed": + self.lid_film = FiLM(hid_feats, condition_size, film_type) + + # self.film = FiLM(hid_feats, condition_size, film_type) + + + def get_config(self): + config = { + "joiner_type": "basic", + "hid_feats": self.hid_feats, + } + return config + + def forward(self, + enc_out: torch.Tensor, + pred_out: torch.Tensor, + lang_condition: torch.Tensor, + project_input: bool = True) -> torch.Tensor: + + """ + Args: + enc_out: output from the encoder with shape = (N, T, C) or (N, T, s_range, C) + pred_out: output from the predictor with shape = (N, U, C) or (N, T, s_range, C) + project_input: if True projects the encoder and predictor features + in the forward founction, if False it expects them outside. + Returns: + Symbols' logits of shape (N, T, U, C). + """ + assert enc_out.ndim == pred_out.ndim + assert enc_out.ndim in (3, 4) + if enc_out.ndim == 3: + enc_out = enc_out.unsqueeze(2) # (N, T, 1, C) + pred_out = pred_out.unsqueeze(1) # (N, 1, U, C) + + # enc_out = self.FiLM_encoder(enc_out, lang_condition) + + if project_input: + x = self.enc_proj(enc_out) + self.pred_proj(pred_out) + else: + x = enc_out + pred_out + + if self.film_cond_type in ["one-hot", "lid_pred"]: + x = self.film(x, lang_condition) + else: + x = self.lid_film(x, lang_condition) + + x = torch.tanh(x) + logits = self.output(x) + return logits diff --git a/hyperion/torch/layer_blocks/transducer_film_predictor.py b/hyperion/torch/layer_blocks/transducer_film_predictor.py new file mode 100644 index 00000000..42272051 --- /dev/null +++ b/hyperion/torch/layer_blocks/transducer_film_predictor.py @@ -0,0 +1,282 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba, Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ...utils.misc import filter_func_args +from ..layers import ActivationFactory as AF +from .film_blocks import FiLM, RNNWithFiLM, RNNWithFiLMResidual + +class TransducerRNNFiLMPredictor(nn.Module): + """ RNN-T prediction network with LSTM or GRU + Attributes: + vocab_size: Number of tokens of the modeling unit including blank. + embed_dim: Dimension of the input embedding. + num_layers: Number of LSTM layers. + hid_feats: Hidden dimension of LSTM layers. + out_feats: Output dimension of the predictor. + embed_dropout_rate: Dropout rate for the embedding layer. + rnn_dropout_rate: Dropout for LSTM layers. + rnn_type: between lstm and gru + blank_id: The ID of the blank symbol. 
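+
+    Example (illustrative sketch only; the sizes and the random language
+    condition below are made-up values, not taken from any recipe):
+
+        pred = TransducerRNNFiLMPredictor(
+            vocab_size=500, embed_dim=256, num_layers=2,
+            hid_feats=512, condition_size=64)
+        y = torch.randint(1, 500, (8, 10))   # (N, U) previous tokens
+        lang = torch.randn(8, 64)            # (N, condition_size) language condition
+        out, (h, c) = pred(y, lang)          # out: (8, 10, 512), h and c: (2, 8, 512)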
+ """ + + def __init__(self, + vocab_size: int, + embed_dim: int, + num_layers: int, + hid_feats: int, + condition_size: int, + out_feats: Optional[int] = None, + embed_dropout_rate: float = 0.0, + rnn_dropout_rate: float = 0.0, + rnn_type: str = "lstm", + film_type: str = "linear", + film_cond_type: str = "one-hot", + blank_id: int = 0): + super().__init__() + self.embedding = nn.Embedding( + num_embeddings=vocab_size, + embedding_dim=embed_dim, + padding_idx=blank_id, + ) + self.embed_dropout = nn.Dropout(embed_dropout_rate) + if rnn_type in ["lstm","gru"]: + self.rnn = RNNWithFiLM( + input_size=embed_dim, + hidden_size=hid_feats, + num_layers=num_layers, + dropout=rnn_dropout_rate, + condition_size=condition_size, + batch_first=True, + rnn_type=rnn_type, + film_type=film_type, + film_cond_type=film_cond_type + ) + elif rnn_type in ["lstm_residual","gru_residual"]: + self.rnn = RNNWithFiLMResidual( + input_size=embed_dim, + hidden_size=hid_feats, + num_layers=num_layers, + dropout=rnn_dropout_rate, + condition_size=condition_size, + batch_first=True, + rnn_type=rnn_type, + film_type=film_type, + film_cond_type=film_cond_type + ) + else: + raise Exception(f"Unknown RNN type {rnn_type}") + + self.out_feats = out_feats + self.blank_id = blank_id + self.vocab_size = vocab_size + self.embed_dim = embed_dim + self.num_layers = num_layers + self.hid_feats = hid_feats + self.embed_dropout_rate = embed_dropout_rate + self.rnn_dropout_rate = rnn_dropout_rate + if out_feats is None: + out_feats = hid_feats + + self.out_feats = out_feats + if out_feats != hid_feats: + self.output_proj = nn.Linear(hid_feats, out_feats) + else: + self.output_proj = None + + def get_config(self): + config = { + "pred_type": "conv", + "vocab_size": self.vocab_size, + "embed_dim": self.embed_dim, + "num_layers": self.num_layers, + "hid_feats": self.hid_feats, + "out_feats": self.out_feats, + "embed_dropout_rate": self.embed_dropout_rate, + "rnn_dropout_rate": self.rnn_dropout_rate, + "rnn_type": self.rnn_type, + "film_type": self.film_type, + "film_cond_type": self.film_cond_type, + "blank_id": self.blank_id, + } + return config + + def forward( + self, + y: torch.Tensor, + lang_condition: torch.Tensor, + states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + """ + Args: + y: previous y_{ prepended. + states: tuple of tensors containing RNN layers states + Returns: + - rnn_output, a tensor of shape (N, U, C) + - (h, c), containing the states i for RNN layers with shape (num_layers, N, C). + """ + embed = self.embedding(y) + embed = self.embed_dropout(embed) + out, (h, c) = self.rnn(embed, states, lang_condition) + if self.output_proj: + out = self.output_proj(out) + + return out, (h, c) + + def change_config( + self, + override_dropouts=False, + embed_dropout_rate: float = 0.0, + rnn_dropout_rate: float = 0.0, + ): + logging.info("changing decoder config") + + if override_dropouts: + logging.info("overriding decoder dropouts") + self.rnn_dropout_rate = rnn_dropout_rate + self.rnn.p = self.rnn_dropout_rate + self.embed_dropout_rate = embed_dropout_rate + self.embed_dropout = nn.Dropout(self.embed_dropout_rate) + +class TransducerConvPredictor(nn.Module): + """ RNN-T prediction network based on Convolutions + Implmentation based on: + https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless7/decoder.py + + Attributes: + vocab_size: Number of tokens of the modeling unit including blank. 
+ embed_dim: Dimension of the input embedding. + blank_id: The ID of the blank symbol. + out_feats: Output dimension of the predictor. + embed_dropout_rate: Dropout rate for the embedding layer. + """ + + def __init__( + self, + vocab_size: int, + embed_dim: int, + condition_size: int, + out_feats: Optional[int] = None, + context_size: int = 2, + embed_dropout_rate: float = 0.0, + hid_act: str = "relu", + blank_id: int = 0, + ): + super().__init__() + self.embedding = nn.Embedding( + num_embeddings=vocab_size, + embedding_dim=embed_dim, + padding_idx=blank_id, + ) + self.embed_dropout = nn.Dropout(embed_dropout_rate) + assert context_size >= 1, context_size + if context_size > 1: + self.conv = nn.Conv1d( + in_channels=embed_dim, + out_channels=embed_dim, + kernel_size=context_size, + padding=0, + groups=out_feats // 4, + bias=False, + ) + + self.blank_id = blank_id + self.vocab_size = vocab_size + self.embed_dim = embed_dim + self.embed_dropout_rate = embed_dropout_rate + self.context_size = context_size + self.hid_act = AF.create(hid_act) + + if out_feats is None: + out_feats = embed_dim + + self.out_feats = out_feats + if out_feats != embed_feats: + self.output_proj = nn.Linear(embed_dim, out_feats) + else: + self.output_proj = None + + def get_config(self): + hid_act = AF.get_config(self.hid_act) + config = { + "pred_type": "conv", + "vocab_size": self.vocab_size, + "embed_dim": self.embed_dim, + "out_feats": self.out_feats, + "context_size": self.context_size, + "embed_dropout_rate": self.embed_dropout_rate, + "blank_id": self.blank_id, + "hid_act": hid_act, + } + return config + + def forward( + self, + y: torch.Tensor, + states: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, None]: + """ + Args: + y: + A 2-D tensor of shape (N, U). + # need_pad: + # True to left pad the input. Should be True during training. + # False to not pad the input. Should be False during inference. + Returns: + Return a tensor of shape (N, U, decoder_dim). 
+ """ + y = y.to(torch.int64) + embed = self.embedding(y) + if self.context > 1: + embed = embed.transpose(1, 2) + if states is None: + embed = F.pad(embedding_out, pad=(self.context_size - 1, 0)) + else: + raise NotImplementedError() + embed = self.conv(embed).transpose(1, 2) + + out = self.hid_act(embed) + if self.output_proj: + out = self.output_proj(out) + + return out, None + + # # this stuff about clamp() is a temporary fix for a mismatch + # # at utterance start, we use negative ids in beam_search.py + # if torch.jit.is_tracing(): + # # This is for exporting to PNNX via ONNX + # embedding_out = self.embedding(y) + # else: + # embedding_out = self.embedding(y.clamp(min=0)) * (y >= 0).unsqueeze(-1) + # if self.context_size > 1: + # embedding_out = embedding_out.permute(0, 2, 1) + # if need_pad is True: + # embedding_out = F.pad(embedding_out, pad=(self.context_size - 1, 0)) + # else: + # # During inference time, there is no need to do extra padding + # # as we only need one output + # assert embedding_out.size(-1) == self.context_size + # embedding_out = self.conv(embedding_out) + # embedding_out = embedding_out.permute(0, 2, 1) + # embedding_out = F.relu(embedding_out) + # return embedding_out + + def change_config( + self, + override_dropouts=False, + embed_dropout_rate: float = 0.0, + ): + logging.info("changing predictor config") + + if override_dropouts: + logging.info("overriding predictor dropouts") + self.embed_dropout_rate = embed_dropout_rate + self.embed_dropout = nn.Dropout(self.embed_dropout_rate) diff --git a/hyperion/torch/layers/__init__.py b/hyperion/torch/layers/__init__.py index 6b508b0e..bea52c95 100644 --- a/hyperion/torch/layers/__init__.py +++ b/hyperion/torch/layers/__init__.py @@ -4,20 +4,23 @@ """ from .activation_factory import ActivationFactory -from .attention import (LocalScaledDotProdAttRelPosEncV1, - LocalScaledDotProdAttV1, ScaledDotProdAttRelPosEncV1, - ScaledDotProdAttV1) +from .attention import ( + LocalScaledDotProdAttRelPosEncV1, + LocalScaledDotProdAttV1, + ScaledDotProdAttRelPosEncV1, + ScaledDotProdAttV1, +) from .audio_feats import * from .audio_feats_factory import AudioFeatsFactory from .calibrators import LinBinCalibrator from .dropout import DropConnect1d, DropConnect2d, Dropout1d from .global_pool import * from .interpolate import Interpolate +from .lora import LoRAFactory from .margin_losses import ArcLossOutput, CosLossOutput, SubCenterArcLossOutput from .mvn import MeanVarianceNorm from .norm_layer_factory import NormLayer1dFactory, NormLayer2dFactory from .pool_factory import GlobalPool1dFactory -from .pos_encoder import (ConvPosEncoder, NoPosEncoder, PosEncoder, - RelPosEncoder) +from .pos_encoder import ConvPosEncoder, NoPosEncoder, PosEncoder, RelPosEncoder from .spec_augment import AxisMasker, SpecAugment, SpecWarper from .subpixel_convs import ICNR1d, ICNR2d, SubPixelConv1d, SubPixelConv2d diff --git a/hyperion/torch/layers/audio_feats_factory.py b/hyperion/torch/layers/audio_feats_factory.py index a8398dac..6d0b4df4 100644 --- a/hyperion/torch/layers/audio_feats_factory.py +++ b/hyperion/torch/layers/audio_feats_factory.py @@ -315,7 +315,7 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--dither", type=float, - default=1, + default=1.0 / 2 ** 15, help="Dithering constant (0.0 means no dither)", ) diff --git a/hyperion/torch/layers/global_pool.py b/hyperion/torch/layers/global_pool.py index 5e38494f..4967a2c5 100644 --- a/hyperion/torch/layers/global_pool.py +++ b/hyperion/torch/layers/global_pool.py @@ -42,8 
+42,9 @@ def _standardize_weights(self, x, x_lengths=None, weights=None): multiplied by the input data. """ if weights is None: + time_dim = self.dim if self.dim >= 0 else x.dim() + self.dim return seq_lengths_to_mask( - x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=self.dim + x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=time_dim ) if weights.dim() == x.dim(): @@ -599,7 +600,7 @@ def _standardize_weights(self, x, x_lengths=None, weights=None): """standardizes the weights to have shape (batch, max_length).""" if weights is None: return seq_lengths_to_mask( - x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=1 + x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=2 ) if weights.dim() == x.dim(): @@ -780,9 +781,13 @@ def forward(self, x, x_lengths=None, weights=None): x = x.transpose(1, self.dim) # x = (batch, feat_dim, time) + # logging.info("x_lengths",x_lengths) + # logging.info("weights_bef",weights) weights = self._standardize_weights(x, x_lengths, weights) # (batch, 1, time) x_inner = self.conv1(x) # (batch, inner_dim, time) + # logging.info("weights_aft",weights) # logging.info('x_inner1={} {}'.format(torch.sum(torch.isnan(x_inner)), torch.sum(torch.isinf(x_inner)))) + # logging.info('weights shape={} {}'.format(weights.shape, weights.dtype)) if self.use_global_context: global_mus = self.stats_pool(x, weights=weights) x_inner = x_inner + self.lin_global(global_mus).unsqueeze(-1) @@ -797,8 +802,9 @@ def forward(self, x, x_lengths=None, weights=None): if attn.dtype == torch.half: min_value = -65504 else: - min_value = -1e200 + min_value = -1e20 mask = weights.eq(0) + # logging.info("attn", attn.shape, mask.shape) attn = attn.masked_fill(mask, min_value) attn = nnf.softmax(attn, dim=-1) diff --git a/hyperion/torch/layers/lora.py b/hyperion/torch/layers/lora.py new file mode 100644 index 00000000..91279119 --- /dev/null +++ b/hyperion/torch/layers/lora.py @@ -0,0 +1,120 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from typing import Union + +import loralib as lora +import torch.nn as nn +from loralib import mark_only_lora_as_trainable + + +def repr_lora(self, str_base): + if isinstance(self.lora_dropout, nn.Dropout): + lora_dropout = self.lora_dropout.p + else: + lora_dropout = 0 + + str_lora = f", r={self.r}, alpha={self.lora_alpha}, dropout={lora_dropout}, merge_weights={self.merge_weights})" + return str_base[:-1] + str_lora + + +class LinearLoRA(lora.Linear): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) + + +class EmbeddingLoRA(lora.Embedding): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) + + +class Conv1dLoRA(lora.Conv1d): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) + + +class Conv2dLoRA(lora.Conv2d): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) + + +class Conv3dLoRA(lora.Conv3d): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) + + +class LoRAFactory: + def create_from_pretrained( + layer: Union[nn.Embedding, nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d], + r: int = 8, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + merge_weights: bool = False, + ): + if isinstance(layer, nn.Embedding): + lora_layer = EmbeddingLoRA( + layer.num_embeddings, + layer.embedding_dim, + padding_idx=layer.padding_idx, + max_norm=layer.max_norm, + norm_type=layer.norm_type, + 
scale_grad_by_freq=layer.scale_grad_by_freq, + sparse=layer.sparse, + r=r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + merge_weights=merge_weights, + ) + lora_layer.weight.data = layer.weight.data + + elif isinstance(layer, nn.Linear): + bias = layer.bias is not None + lora_layer = LinearLoRA( + layer.in_features, + layer.out_features, + bias=bias, + r=r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + merge_weights=merge_weights, + ) + lora_layer.weight.data = layer.weight.data + if bias: + lora_layer.bias.data = layer.bias.data + + elif isinstance(layer, (nn.Conv1d, nn.Conv2d, nn.Conv3d)): + if isinstance(layer, nn.Conv1d): + lora_class = Conv1dLoRA + elif isinstance(layer, nn.Conv2d): + lora_class = Conv2dLoRA + elif isinstance(layer, nn.Conv3d): + lora_class = Conv3dLoRA + + bias = layer.bias is not None + lora_layer = lora_class( + layer.in_channels, + layer.out_channels, + layer.kernel_size, + stride=layer.stride, + padding=layer.padding, + dilation=layer.dilation, + groups=layer.groups, + bias=bias, + padding_mode=layer.padding_mode, + r=r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + merge_weights=merge_weights, + ) + lora_layer.weight.data = layer.weight.data + if bias: + lora_layer.bias.data = layer.bias.data + + return lora_layer diff --git a/hyperion/torch/losses/__init__.py b/hyperion/torch/losses/__init__.py index bf3ce279..55cc2f52 100644 --- a/hyperion/torch/losses/__init__.py +++ b/hyperion/torch/losses/__init__.py @@ -4,3 +4,4 @@ """ from .bce_with_llr import BCEWithLLR +from .focal_loss import FocalLoss \ No newline at end of file diff --git a/hyperion/torch/losses/focal_loss.py b/hyperion/torch/losses/focal_loss.py new file mode 100644 index 00000000..f2a0d32a --- /dev/null +++ b/hyperion/torch/losses/focal_loss.py @@ -0,0 +1,48 @@ +from torch import nn +import torch +from torch.nn import functional as F +import logging +class FocalLoss(nn.Module): + def __init__(self, alpha=0.25, gamma=2, size_average=True): + """ + Focal loss implementation: -alpha(1-yi)**gamma * ce_loss(xi,yi) + + :param alpha: scalar or list. Class weights. If scalar, the same weight applies for all classes. + :param gamma: scalar. Difficult-to-easy sample regulation parameter. + :param size_average: bool. Whether to average the loss over the batch. + :param device: str. Device to place the tensors. + """ + super(FocalLoss,self).__init__() + self.gamma = gamma + self.size_average = size_average + self.alpha = alpha + logging.info("FocalLoss: alpha={}, gamma={}, size_average={}".format(alpha, gamma, size_average)) + + def forward(self, preds, labels): + """ + Compute the focal loss. + + :param preds: Predicted classes. size:[B,N,C] or [B,C] + :param labels: Actual classes. size:[B,N] or [B] + :return: scalar. Loss value. 
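+
+        Example (illustrative; random logits and a scalar alpha):
+
+            loss_fn = FocalLoss(alpha=0.25, gamma=2)
+            preds = torch.randn(16, 10)             # (B, C) class logits
+            labels = torch.randint(0, 10, (16,))    # (B,) target class indices
+            loss = loss_fn(preds, labels)           # scalar, averaged over the batch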
+ """ + preds = preds.view(-1, preds.size(-1)) + preds_logsoft = F.log_softmax(preds, dim=1) + preds_softmax = torch.exp(preds_logsoft) + + preds_softmax = preds_softmax.gather(1, labels.view(-1, 1)) + preds_logsoft = preds_logsoft.gather(1, labels.view(-1, 1)) + + if isinstance(self.alpha, torch.Tensor): + alpha = self.alpha.gather(0, labels.view(-1)) + else: # if alpha is a scalar + alpha = self.alpha + + loss = -torch.mul(torch.pow((1 - preds_softmax), self.gamma), preds_logsoft) + + loss = torch.mul(alpha, loss.t()) + if self.size_average: + loss = loss.mean() + else: + loss = loss.sum() + return loss diff --git a/hyperion/torch/lr_schedulers/red_lr_on_plateau.py b/hyperion/torch/lr_schedulers/red_lr_on_plateau.py index 7a2e82f8..3f7b2ec7 100644 --- a/hyperion/torch/lr_schedulers/red_lr_on_plateau.py +++ b/hyperion/torch/lr_schedulers/red_lr_on_plateau.py @@ -7,7 +7,11 @@ from functools import partial import torch -from torch._six import inf + +try: + from torch import inf +except: + from torch._six import inf from .lr_scheduler import LRScheduler diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py index 06838ddd..74bb5ed2 100644 --- a/hyperion/torch/models/__init__.py +++ b/hyperion/torch/models/__init__.py @@ -7,11 +7,21 @@ from .transducer import RNNRNNTransducer, RNNTransducer from .vae.vae import VAE from .vae.vq_vae import VQVAE +from .transducer import RNNTransducer, RNNRNNTransducer +from .wav2languageid import HFWav2Vec2ResNet1dLanguageID from .wav2transducer import (HFWav2Vec2ConformerV1RNNTransducer, HFWav2Vec2RNNRNNTransducer, - HFWav2Vec2RNNTransducer, HFWav2Vec2Transducer) -from .wav2xvectors import (HFHubert2ResNet1dXVector, HFWav2Vec2ResNet1dXVector, - HFWavLM2ResNet1dXVector) + HFWav2Vec2RNNTransducer, HFWav2Vec2Transducer, + HFWav2Vec2RNNFiLMTransducer) +from .wav2xvectors import ( + HFHubert2ResNet1dXVector, + HFWav2Vec2ResNet1dXVector, + HFWavLM2ResNet1dXVector, + Wav2ResNetXVector, + Wav2ResNet1dXVector, +) +from .wav2transducer_languageid import HFWav2Vec2RNNTransducerResnet1D, HFWav2Vec2RNNFiLMTransducerResnet1D + from .xvectors.efficient_net_xvector import EfficientNetXVector from .xvectors.resnet1d_xvector import ResNet1dXVector from .xvectors.resnet_xvector import ResNetXVector diff --git a/hyperion/torch/models/plda/splda.py b/hyperion/torch/models/plda/splda.py index 2272793e..3a0f1dee 100644 --- a/hyperion/torch/models/plda/splda.py +++ b/hyperion/torch/models/plda/splda.py @@ -8,7 +8,7 @@ import torch import torch.nn as nn -from ...utils.math import invert_trimat +from ...utils.math_funcs import invert_trimat from .plda_base import PLDABase diff --git a/hyperion/torch/models/transducer/__init__.py b/hyperion/torch/models/transducer/__init__.py index 984e15ec..476d63b5 100644 --- a/hyperion/torch/models/transducer/__init__.py +++ b/hyperion/torch/models/transducer/__init__.py @@ -7,6 +7,7 @@ from .conformer_v1_rnn_transducer import ConformerV1RNNTransducer from .rnn_rnn_transducer import RNNRNNTransducer from .rnn_transducer import RNNTransducer, RNNTransducerOutput +from .rnn_film_transducer import RNNFiLMTransducer from .transducer import Transducer #from .conformer import Conformer diff --git a/hyperion/torch/models/transducer/rnn_film_transducer.py b/hyperion/torch/models/transducer/rnn_film_transducer.py new file mode 100644 index 00000000..6f82e101 --- /dev/null +++ b/hyperion/torch/models/transducer/rnn_film_transducer.py @@ -0,0 +1,268 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 
2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple, Union + +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +try: + import k2 +except ModuleNotFoundError: + from ...utils import dummy_k2 as k2 + +import torch + +from ....utils import HypDataClass +from ....utils.misc import filter_func_args +from ...narchs import RNNFiLMTransducerDecoder +from ...torch_model import TorchModel + + +@dataclass +class RNNTransducerOutput(HypDataClass): + + loss: torch.Tensor + loss_simple: Optional[torch.Tensor] = None + loss_pruned: Optional[torch.Tensor] = None + h_feats: Optional[List[torch.Tensor]] = None + + +class RNNFiLMTransducer(TorchModel): + """ Base-class for RNN-T in + "Sequence Transduction with Recurrent Neural Networks" + https://arxiv.org/pdf/1211.3711.pdf + + Attributes: + encoder: Encoder network module + decoder: RNN-T Decoder config. dictionary or module. + """ + + def __init__( + self, + encoder: Union[TorchModel, None], + decoder: Union[Dict, RNNFiLMTransducerDecoder], + ): + super().__init__() + if encoder is not None: + assert isinstance(encoder, TorchModel) + if isinstance(decoder, dict): + decoder = RNNFiLMTransducerDecoder(**decoder) + else: + assert isinstance(decoder, RNNFiLMTransducerDecoder) + + self.encoder = encoder + self.decoder = decoder + + def forward( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + y: Union[Dict, k2.RaggedTensor], + lang: torch.Tensor, + ) -> RNNTransducerOutput: + """ + Args: + x: input features with shape = (N, T, C) + x_lengths: feature number for frames with shape = (N,) + y: ragged tensor with 2 axes [utt][label]. It contains labels of each + utterance. + Returns: + - Token logits with shape = (N, vocab_size) + - RNN-T loss. + """ + assert x.ndim == 3, x.shape + assert x_lengths.ndim == 1, x_lengths.shape + assert y.num_axes == 2, y.num_axes + + assert x.size(0) == x_lengths.size(0) == y.dim0 + assert torch.all( + x_lengths[:-1] >= x_lengths[1:] + ), f"x_lengths={x_lengths}" # check x_lengths are sorted + assert lang.size(0) == y.dim0 + + if self.encoder is not None: + x, x_lengths = self.encoder(x, x_lengths) + assert torch.all(x_lengths > 0) + + dec_output = self.decoder(x, x_lengths, y, lang) + output = RNNTransducerOutput(*dec_output) + return output + + def infer(self, + x: torch.Tensor, + x_lengths: torch.Tensor, + lang: torch.Tensor, + decoding_method="time_sync_beam_search", + beam_width: int = 5, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000) -> List[List[int]]: + """ + ASR tokens inference + Args: + x: input features with shape = (N, T, C) + x_lengths: feature number for frames with shape = (N,) + lang: language id for each utterance with shape = (N,) + decoding_method: greedy, time_sync_beam_search or align_length_sync_beam_search + max_sym_per_frame: maximum number of symbols RNN-T can emit in 1 frame. + max_sym_per_utt: maximimum number of symbols in a single utterance. + Returns: + List of list of integer indexes of the recognizer's symbols. 
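+
+        Example (illustrative only; model stands for an already constructed
+        RNNFiLMTransducer and the shapes and ids below are made up):
+
+            x = torch.randn(2, 150, 512)           # (N, T, C) input features
+            x_lengths = torch.tensor([150, 120])   # (N,) valid frames per utterance
+            lang = torch.tensor([3, 3])            # (N,) language ids
+            tokens = model.infer(x, x_lengths, lang, decoding_method="greedy")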
+ """ + assert x.ndim == 3, x.shape + assert x_lengths.ndim == 1, x_lengths.shape + assert x.size(0) == x_lengths.size(0) + + if self.encoder is not None: + x, x_lengths = self.encoder(x, x_lengths) + assert torch.all(x_lengths > 0) + + batch_size = x.size(0) + y = [] + for i in range(batch_size): + x_i = x[i:i + 1, :x_lengths[i]] + y_i = self.decoder.decode(x_i, + lang, + method=decoding_method, + beam_width=beam_width, + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt) + y.append(y_i) + + return y + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + else: + raise ValueError(f"invalid train_mode={mode}") + + self._train_mode = mode + + def _train(self, train_mode: str): + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return ["full", "frozen"] + + def get_config(self): + if self.encoder is None: + enc_cfg = None + else: + enc_cfg = self.encoder.get_config() + del enc_cfg["class_name"] + + dec_cfg = self.decoder.get_config() + del dec_cfg["class_name"] + config = { + "encoder": enc_cfg, + "decoder": dec_cfg, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @staticmethod + def filter_args(**kwargs): + # get arguments for pooling + args = {} + decoder_args = RNNFiLMTransducerDecoder.filter_args(**kwargs["decoder"]) + args["decoder"] = decoder_args + return args + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNFiLMTransducerDecoder.add_class_args(parser, prefix="decoder") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + def get_regularization_loss(self): + reg_loss = 0.0 + total_params = 0 + + for param in self.parameters(): + reg_loss += torch.norm(param)**2 + total_params += torch.numel(param) + + reg_loss = (reg_loss) / total_params + + return reg_loss + + + def change_config( + self, + decoder: Dict, + ): + logging.info("changing decoder config") + self.decoder.change_config(**decoder) + + @staticmethod + def filter_finetune_args(**kwargs): + args = {} + decoder_args = RNNFiLMTransducerDecoder.filter_finetune_args(**kwargs["decoder"]) + args["decoder"] = decoder_args + return args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNFiLMTransducerDecoder.add_finetune_args(parser, prefix="decoder") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def add_infer_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument("--decoding-method", + default="time_sync_beam_search", + choices=[ + "greedy", "time_sync_beam_search", + "align_length_sync_beam_search" + ]) + + parser.add_argument("--beam-width", + default=5, + type=int, + help="beam width for beam search") + parser.add_argument("--max-sym-per-frame", + default=3, + type=int, + help="max symbols RNN-T can emit in 1 frame") + parser.add_argument("--max-sym-per-utt", + default=1000, + type=int, + help="max symbols RNN-T can emit in 1 frame") + + if prefix is not None: + 
outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_infer_args(**kwargs): + return filter_func_args(RNNFiLMTransducer.infer, kwargs) diff --git a/hyperion/torch/models/transducer/rnn_transducer.py b/hyperion/torch/models/transducer/rnn_transducer.py index 3326ef81..c9a65b5d 100644 --- a/hyperion/torch/models/transducer/rnn_transducer.py +++ b/hyperion/torch/models/transducer/rnn_transducer.py @@ -63,7 +63,7 @@ def forward( self, x: torch.Tensor, x_lengths: torch.Tensor, - y: k2.RaggedTensor, + y: Union[Dict, k2.RaggedTensor], ) -> RNNTransducerOutput: """ Args: @@ -201,7 +201,7 @@ def change_config( @staticmethod def filter_finetune_args(**kwargs): args = {} - decoder_args = Decoder.filter_finetune_args(**kwargs["decoder"]) + decoder_args = RNNTransducerDecoder.filter_finetune_args(**kwargs["decoder"]) args["decoder"] = decoder_args return args diff --git a/hyperion/torch/models/vae/__init__.py b/hyperion/torch/models/vae/__init__.py new file mode 100644 index 00000000..f4883a15 --- /dev/null +++ b/hyperion/torch/models/vae/__init__.py @@ -0,0 +1,5 @@ +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" diff --git a/hyperion/torch/models/wav2languageid/__init__.py b/hyperion/torch/models/wav2languageid/__init__.py new file mode 100644 index 00000000..849a30a6 --- /dev/null +++ b/hyperion/torch/models/wav2languageid/__init__.py @@ -0,0 +1,7 @@ +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" + +from .hf_wav2vec2resnet1d_languageid import HFWav2Vec2ResNet1dLanguageID \ No newline at end of file diff --git a/hyperion/torch/models/wav2languageid/hf_wav2languageid.py b/hyperion/torch/models/wav2languageid/hf_wav2languageid.py new file mode 100644 index 00000000..ff3a83a7 --- /dev/null +++ b/hyperion/torch/models/wav2languageid/hf_wav2languageid.py @@ -0,0 +1,412 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import contextlib +from jsonargparse import ArgumentParser, ActionParser + +import torch +import torch.nn as nn + +# import torch.nn.functional as nnf + +from ...torch_model import TorchModel +from ...utils import remove_silence + + +class HFWav2LanguageID(TorchModel): + """Abstract Base class for language identification models that use a Hugging Face Model as feature extractor. + + Attributes: + hf_feats: hugging face model wrapper object. + languageid: language identification model object. + feat_fusion_start: the input to language identification model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. 
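+
+    With feat_fusion_method="weighted-avg", the selected hidden states h_l are
+    combined as feats = sum_l softmax(w)_l * h_l, where w holds one learnable
+    scalar per fused layer; this restates _fuse_hid_feats below rather than
+    adding a new option.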
+ """ + + def __init__( + self, hf_feats, languageid, feat_fusion_start=0, feat_fusion_end=-1, feat_fusion_method="weighted-avg" + ): + + super().__init__() + self.hf_feats = hf_feats + self.languageid = languageid + self.feat_fusion_start = feat_fusion_start + if feat_fusion_end == -1: + feat_fusion_end = self.hf_feats.num_encoder_layers + self.feat_fusion_end = feat_fusion_end + self.feat_fusion_method = feat_fusion_method + self._hf_context = contextlib.nullcontext() + self._make_fuser() + + def _make_fuser(self): + if self.feat_fusion_method == "last": + self.feat_fuser = None + return + + num_layers = self.feat_fusion_end + 1 - self.feat_fusion_start + layer_dim = self.hf_feats.hidden_size + if self.feat_fusion_method == "weighted-avg": + self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif self.feat_fusion_method == "linear": + self.feat_fuser = nn.Linear(num_layers, 1, bias=False) + self.feat_fuser.weight.data = torch.ones(1, num_layers) / num_layers + elif self.feat_fusion_method == "cat": + self.feat_fuser = nn.Linear(num_layers * layer_dim, layer_dim, bias=False) + + def _fuse_hid_feats(self, hid_feats): + """Fuses the hidden features from the Wav2Vec model. + + Args: + hid_feats: list of hidden features Tensors from Wav2Vec model. + + Returns: + Tensor of fused features (batch, channels, time) + """ + if len(hid_feats) == 1: + # There is only one layer of features + return hid_feats[0] + + hid_feats = hid_feats[self.feat_fusion_start : self.feat_fusion_end + 1] + if self.feat_fusion_method == "weighted-avg": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) + # logging.info(torch.tensor(norm_weights.values).to(device)) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method == "linear": + hid_feats = torch.stack(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats).squeeze(dim=-1) + elif self.feat_fusion_method == "cat": + hid_feats = torch.cat(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats) + elif self.feat_fusion_method == "last": + feats = hid_feats[-1] + + return feats + + @property + def sample_frequency(self): + return self.hf_feats.sample_frequency + + def compute_prototype_affinity(self): + return self.languageid.compute_prototype_affinity() + + def update_loss_margin(self, epoch): + """Updates the value of the margin in AAM/AM-softmax losses + given the epoch number + + Args: + epoch: epoch which is about to start + """ + self.languageid.update_loss_margin(epoch) + + def rebuild_output_layer( + self, + num_classes=None, + loss_type="arc-softmax", + cos_scale=64, + margin=0.3, + margin_warmup_epochs=10, + intertop_k=5, + intertop_margin=0.0, + num_subcenters=2, + ): + self.languageid.rebuild_output_layer( + num_classes=num_classes, + loss_type=loss_type, + cos_scale=cos_scale, + margin=margin, + margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, + num_subcenters=num_subcenters, + ) + + def forward_feats( + self, x, x_lengths, return_feat_layers=None, chunk_length=0, detach_chunks=False + ): + return_hid_states = ( + False + if return_feat_layers is None and self.feat_fusion_method == "last" + else True + ) + with self._hf_context: + hf_output = self.hf_feats( + x, + x_lengths, + return_hid_states=return_hid_states, + chunk_length=chunk_length, + detach_chunks=detach_chunks, + ) + feat_lengths = hf_output["hidden_states_lengths"] + if return_hid_states: + hid_feats = hf_output["hidden_states"] + 
assert(len(hid_feats) == self.hf_feats.num_encoder_layers + 1) + feats = self._fuse_hid_feats(hid_feats) + else: + hid_feats = None + feats = hf_output["last_hidden_state"] + + feats = feats.transpose(1, 2) + if return_feat_layers is not None: + # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) + # as the hidden features of the language identification encoder. + hid_feats = [ + f.transpose(1, 2) + for i, f in enumerate(hid_feats) + if i in return_feat_layers + ] + else: + hid_feats = None + + return feats, hid_feats, feat_lengths + + def forward( + self, + x, + x_lengths=None, + y=None, + return_feat_layers=None, + return_enc_layers=None, + return_classif_layers=None, + return_logits=True, + ): + """Forward function. If returns the logits posteriors of the classes. + It can also returns the hidden representations in the wav2vec feature extractor, + the language identification encoder and the + classification head. In this case the ouput variable is a dictionary. + + Args: + x: input features tensor with shape=(batch, in_feats, time) + x_lengths: time lengths of the features with shape=(batch,) + y: target classes torch.long tensor with shape=(batch,) + return_feat_layers: list of integers indicating, which wav2vec layers + we should return. If None, no wav2vec layers are returned. + return_enc_layers: list of integers indicating, which encoder layers + we should return. If None, no encoder layers are returned. + return_enc_layers: list of integers indicating, which classification head layers + we should return. If None, no head layers are returned. + return_logits: if True, it adds the logits to the output dictionary. + Returns: + Tensor with class logits with shape=(batch, num_classes) or + Dictionary with "logits", "h_enc" (list of hidden encoder layers), + "h_classif" (list hidden classification head layers), "h_feats" (wav2vec features) + """ + feats, hid_feats, feat_lengths = self.forward_feats( + x, x_lengths, return_feat_layers + ) + output = self.languageid( + feats, + feat_lengths, + y, + return_enc_layers=return_enc_layers, + return_classif_layers=return_classif_layers, + return_logits=return_logits, + ) + + if not return_feat_layers: + return output + + if not isinstance(output, dict): + # if the languageid just returned the logits we put then into a dictionary + # to append the hid feats later. 
+ output["logits"] = output + + output["h_feats"] = hid_feats + return output + + def extract_embed( + self, + x, + x_lengths=None, + vad_samples=None, + hf_chunk_length=0, + xvec_chunk_length=0, + embed_layer=None, + detach_chunks=False, + ): + + if vad_samples is not None: + x, x_lengths = remove_silence(x, x_lengths) + + feats, _, feat_lengths = self.forward_feats( + x, x_lengths, chunk_length=hf_chunk_length, detach_chunks=detach_chunks + ) + xvec_chunk_length = int( + xvec_chunk_length + * self.hf_feats.sample_frequency + * feats.size(-1) + // x.size(-1) + ) + return self.languageid.extract_embed( + feats, feat_lengths, xvec_chunk_length, embed_layer, detach_chunks + ) + + def freeze_feat_fuser(self): + if self.feat_fuser is None: + return + + if self.feat_fusion_method == "weighted-avg": + self.feat_fuser.requires_grad = False + return + + for param in self.feat_fuser.parameters(): + param.requires_grad = False + + def freeze_hf_feats(self): + self.hf_feats.freeze() + + def freeze_hf_feature_encoder(self): + self.hf_feats.freeze_feature_encoder() + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + elif mode == "ft-embed-affine": + self.unfreeze() + self.freeze_feat_fuser() + self.freeze_hf_feats() + self.languageid.freeze_preembed_layers() + elif mode in ["ft-languageid", "ft-languageid-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + self.freeze_feat_fuser() + elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + elif mode == "hf-feat-extractor-frozen": + self.unfreeze() + self.freeze_hf_feature_encoder() + else: + raise ValueError(f"invalanguageid train_mode={mode}") + + logging.info("train mode set to %s", mode) + + if "nograd" in mode: + logging.info("using torch.no_grad for hf_feats") + self._hf_context = torch.no_grad() + else: + self._hf_context = contextlib.nullcontext() + + self._train_mode = mode + + def _train(self, train_mode: str): + + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + elif train_mode == "ft-embed-affine": + self.hf_feats.train() + self.languageid._train("ft-embed_affine") + elif train_mode in [ + "ft-languageid", + "hf-feats-frozen", + "ft-languageid-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ]: + self.hf_feats.train() + self.languageid._train("full") + else: + raise ValueError(f"invalanguageid train_mode={train_mode}") + + @staticmethod + def valanguageid_train_modes(): + return [ + "full", + "frozen", + "ft-embed-affine", + "ft-languageid", + "hf-feats-frozen", + "ft-languageid-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ] + + @staticmethod + def filter_args(**kwargs): + valanguageid_args = ( + "hf_feats", + "languageid", + "feat_fusion_start", + "feat_fusion_end", + "feat_fusion_method", + ) + args = dict((k, kwargs[k]) for k in valanguageid_args if k in kwargs) + return args + + def get_config(self): + + hf_cfg = self.hf_feats.get_config() + xvec_cfg = self.languageid.get_config() + del hf_cfg["class_name"] + del xvec_cfg["class_name"] + config = { + "hf_feats": hf_cfg, + "languageid": xvec_cfg, + "feat_fusion_start": self.feat_fusion_start, + "feat_fusion_end": self.feat_fusion_end, + "feat_fusion_method": self.feat_fusion_method, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def change_config(self, hf_feats, languageid): + logging.info("changing hf 
wav2xvector config") + self.hf_feats.change_config(**hf_feats) + self.languageid.change_config(**languageid) + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--feat-fusion-start", + default=0, + type=int, + help=( + "the input to language identification model will fuse the wav2vec layers from feat_fusion_start to" + "the feat_fusion_end" + ), + ) + + + parser.add_argument( + "--feat-fusion-end", + default=-1, + type=int, + help=( + "the input to language identification model will fuse the wav2vec layers from feat_fusion_start to" + "the feat_fusion_end" + ), + ) + + + + parser.add_argument( + "--feat-fusion-method", + default="weighted-avg", + choices=["weighted-avg", "linear", "cat", "last"], + help=( + "method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, cat]" + ), + ) + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + help="languageid options", + ) diff --git a/hyperion/torch/models/wav2languageid/hf_wav2vec2resnet1d_languageid.py b/hyperion/torch/models/wav2languageid/hf_wav2vec2resnet1d_languageid.py new file mode 100644 index 00000000..fb64f060 --- /dev/null +++ b/hyperion/torch/models/wav2languageid/hf_wav2vec2resnet1d_languageid.py @@ -0,0 +1,100 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from jsonargparse import ArgumentParser, ActionParser +from typing import Union, Dict, Optional + +import torch +import torch.nn as nn + +from ..xvectors import ResNet1dXVector as ResNet1dLanguageID +from ...tpm import HFWav2Vec2 +from .hf_wav2languageid import HFWav2LanguageID + + +class HFWav2Vec2ResNet1dLanguageID(HFWav2LanguageID): + """Class extracting Wav2Vec2 + ResNet1d language identifications from waveform. + + Attributes: + Attributes: + hf_feats: HFWav2Vec configuration dictionary or object. + This is a warpper over Hugging Face Wav2Vec model. + languageid: ResNet1dLanguageID configuration dictionary or object. + feat_fusion_start: the input to language identification model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. 
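+
+    Example (illustrative sketch; my_wav2vec2, my_resnet1d, waveforms and
+    wave_lengths are placeholders for objects built elsewhere, not names
+    defined in this file):
+
+        model = HFWav2Vec2ResNet1dLanguageID(
+            hf_feats=my_wav2vec2,       # HFWav2Vec2 wrapper object
+            languageid=my_resnet1d,     # ResNet1d classification head
+            feat_fusion_method="weighted-avg")
+        logits = model(waveforms, x_lengths=wave_lengths)  # (batch, num_classes)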
+ """ + + def __init__( + self, + hf_feats: Union[Dict, HFWav2Vec2], + languageid: Union[Dict, ResNet1dLanguageID], + feat_fusion_start: int = 0, + feat_fusion_end: int = -1, + feat_fusion_method: str = "weighted-avg", + ): + + if isinstance(hf_feats, dict): + if "class_name" in hf_feats: + del hf_feats["class_name"] + hf_feats = HFWav2Vec2(**hf_feats) + else: + assert isinstance(hf_feats, HFWav2Vec2) + + if isinstance(languageid, dict): + languageid["resnet_enc"]["in_feats"] = hf_feats.hidden_size + if "class_name" in languageid: + del languageid["class_name"] + languageid = ResNet1dLanguageID(**languageid) + else: + assert isinstance(languageid, ResNet1dLanguageID) + assert languageid.encoder_net.in_feats == hf_feats.hidden_size + + super().__init__(hf_feats, languageid, feat_fusion_start, feat_fusion_end, feat_fusion_method) + + @staticmethod + def filter_args(**kwargs): + + base_args = HFWav2LanguageID.filter_args(**kwargs) + child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ResNet1dLanguageID.filter_args(**kwargs["languageid"]) + base_args["languageid"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_class_args(parser, prefix="hf_feats") + ResNet1dLanguageID.add_class_args(parser, prefix="languageid") + HFWav2LanguageID.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ResNet1dLanguageID.filter_finetune_args(**kwargs["languageid"]) + base_args["languageid"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") + ResNet1dLanguageID.add_finetune_args(parser, prefix="languageid") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2transducer/__init__.py b/hyperion/torch/models/wav2transducer/__init__.py index 71e82b98..4d3c55b3 100644 --- a/hyperion/torch/models/wav2transducer/__init__.py +++ b/hyperion/torch/models/wav2transducer/__init__.py @@ -9,3 +9,4 @@ HFWav2Vec2ConformerV1RNNTransducer from .hf_wav2vec2rnn_rnn_transducer import HFWav2Vec2RNNRNNTransducer from .hf_wav2vec2rnn_transducer import HFWav2Vec2RNNTransducer +from .hf_wav2vec2rnn_film_transducer import HFWav2Vec2RNNFiLMTransducer diff --git a/hyperion/torch/models/wav2transducer/beam_search.py b/hyperion/torch/models/wav2transducer/beam_search.py index b23a0769..2550ab3c 100644 --- a/hyperion/torch/models/wav2transducer/beam_search.py +++ b/hyperion/torch/models/wav2transducer/beam_search.py @@ -227,6 +227,9 @@ def beam_search( B = B[:beam] break t += 1 - best_hyp = max(B, key=lambda hyp: hyp.log_prob / len(hyp.ys[1:])) + try: + best_hyp = max(B, key=lambda hyp: hyp.log_prob / len(hyp.ys[1:])) + except: + return "" ys = best_hyp.ys[1:] # [1:] to remove the blank return ys diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py new file mode 100644 index 00000000..b0a0bfea --- /dev/null 
+++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py @@ -0,0 +1,416 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import contextlib +import logging +from dataclasses import dataclass +from typing import Dict, List, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...torch_model import TorchModel +from ...utils import remove_silence +from ...layer_blocks import FiLM +from ..transducer import RNNFiLMTransducer + + +class HFWav2RNNFiLMTransducer(TorchModel): + """Abstract Base class for x-vector models that use a Hugging Face Model as feature extractor. + + Attributes: + hf_feats: hugging face model wrapper object. + transducer: transducer model object. + feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. + """ + + def __init__(self, + hf_feats: TorchModel, + transducer: Union[Dict, TorchModel], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg"): + + super().__init__() + self.hf_feats = hf_feats + if isinstance(transducer, dict): + transducer["decoder"]["in_feats"] = hf_feats.hidden_size + #transducer["joiner"]["in_feats"] = hf_feats.hidden_size + if "class_name" in transducer: + del transducer["class_name"] + + transducer["encoder"] = None + transducer = RNNFiLMTransducer(**transducer) + else: + assert isinstance(transducer, RNNFiLMTransducer) + if transducer.encoder is None: + assert transducer.decoder.in_feats == hf_feats.hidden_size + #assert transducer.joiner.in_feats == hf_feats.hidden_size + + self.transducer = transducer + self.feat_fusion_start = feat_fusion_start + self.feat_fusion_method = feat_fusion_method + self._hf_context = contextlib.nullcontext() + self._make_fuser() + + def _make_fuser(self): + if self.feat_fusion_method == "last": + self.feat_fuser = None + return + + num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start + layer_dim = self.hf_feats.hidden_size + if self.feat_fusion_method == "film-weighted-avg": + self.films = nn.ModuleList([FiLM(layer_dim, self.transducer.decoder.condition_size, self.transducer.decoder.film_type) for _ in range(num_layers)]) + self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif self.feat_fusion_method == "film-fused-feature": + self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) + self.film = FiLM(layer_dim, self.transducer.decoder.condition_size, self.transducer.decoder.film_type) + elif self.feat_fusion_method == "weighted-avg": + self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif self.feat_fusion_method == "linear": + self.feat_fuser = nn.Linear(num_layers, 1, bias=False) + self.feat_fuser.weight.data = torch.ones(1, + num_layers) / num_layers + elif self.feat_fusion_method == "cat": + self.feat_fuser = nn.Linear(num_layers * layer_dim, + layer_dim, + bias=False) + + def _fuse_hid_feats(self, hid_feats, lang_condition): + """Fuses the hidden features from the Wav2Vec model. + + Args: + hid_feats: list of hidden features Tensors from Wav2Vec model. + lang_condition: language condition Tensor. 
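+
+            Depending on self.feat_fusion_method, the layers from
+            feat_fusion_start onwards are combined as follows:
+                "film-weighted-avg": FiLM each layer with lang_condition, then
+                    average with softmax-normalized learned weights.
+                "film-fused-feature": softmax-weighted average of the layers,
+                    then a single FiLM conditioned on lang_condition.
+                "weighted-avg": softmax-weighted average of the layers.
+                "linear": learned linear combination of the layers.
+                "cat": concatenation of the layers followed by a linear projection.
+                "last": the last hidden layer only.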
+ + Returns: + Tensor of fused features (batch, channels, time) + """ + if len(hid_feats) == 1: + # There is only one layer of features + return hid_feats[0] + + hid_feats = hid_feats[self.feat_fusion_start:] + if self.feat_fusion_method == "film-weighted-avg": + film_hid_feats = tuple(self.films[i](hid_feats[i], lang_condition) for i in range(len(self.films))) + film_hid_feats = torch.stack(film_hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) + feats = torch.sum(film_hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method == "film-fused-feature": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + feats = self.film(feats, lang_condition) + elif self.feat_fusion_method == "weighted-avg": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method == "linear": + hid_feats = torch.stack(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats).squeeze(dim=-1) + elif self.feat_fusion_method == "cat": + hid_feats = torch.cat(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats) + elif self.feat_fusion_method == "last": + feats = hid_feats[-1] + + return feats + + def forward_feats(self, + x, + x_lengths, + lang: torch.Tensor, + return_feat_layers=None, + chunk_length=0, + detach_chunks=False): + + + lang_condition = self.transducer.decoder.lang_embedding(lang) + + return_hid_states = (False if return_feat_layers is None + and self.feat_fusion_method == "last" else True) + with self._hf_context: + hf_output = self.hf_feats( + x, + x_lengths, + condition_features=lang_condition, + return_hid_states=return_hid_states, + chunk_length=chunk_length, + detach_chunks=detach_chunks, + ) + feat_lengths = hf_output["hidden_states_lengths"] + if return_hid_states: + hid_feats = hf_output["hidden_states"] + feats = self._fuse_hid_feats(hid_feats, lang_condition) + else: + hid_feats = None + feats = hf_output["last_hidden_state"] + + feats = feats.transpose(1, 2) + if return_feat_layers is not None: + # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) + # as the hidden features of the x-vector encoder. + hid_feats = [ + f.transpose(1, 2) for i, f in enumerate(hid_feats) + if i in return_feat_layers + ] + else: + hid_feats = None + + return feats, hid_feats, feat_lengths + + def forward( + self, + x, + languageid, + x_lengths=None, + text=None, + return_feat_layers=None, + # return_enc_layers=None, + return_logits=True, + ): + """Forward function. If returns the logits posteriors of the classes. + It can also returns the hidden representations in the wav2vec feature extractor, + the x-vector encoder and the + classification head. In this case the ouput variable is a dictionary. + + Args: + x: input features tensor with shape=(batch, in_feats, time) + x_lengths: time lengths of the features with shape=(batch,) + y: target classes torch.long tensor with shape=(batch,) + languageid: language id torch.long tensor with shape=(batch,) + return_feat_layers: list of integers indicating, which wav2vec layers + we should return. If None, no wav2vec layers are returned. + return_enc_layers: list of integers indicating, which encoder layers + we should return. If None, no encoder layers are returned. + return_logits: if True, it adds the logits to the output dictionary. 
+ Returns: + Dataclass with losses, "h_enc" (list of hidden encoder layers), + "h_feats" (wav2vec features) + """ + + feats, hid_feats, feat_lengths = self.forward_feats( + x, x_lengths, languageid, return_feat_layers) + + feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + output = self.transducer( + feats, + feat_lengths, + text, + languageid, + ) + + if return_feat_layers: + output.h_feats = hid_feats + + return output + + def infer(self, + x: torch.Tensor, + x_lengths: torch.Tensor, + languageid: torch.Tensor, + decoding_method="time_sync_beam_search", + beam_width: int = 5, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000): + """ + ASR tokens inference + Args: + x: input features with shape = (N, T, C) + x_lengths: feature number for frames with shape = (N,) + languageid: language id torch.long tensor with shape=(batch,) + decoding_method: greedy, time_sync_beam_search or align_length_sync_beam_search + max_sym_per_frame: maximum number of symbols RNN-T can emit in 1 frame. + max_sym_per_utt: maximimum number of symbols in a single utterance. + Returns: + List of list of integer indexes of the recognizer's symbols. + """ + # import pdb; pdb.set_trace() + languageid = languageid[0] + feats, _, feat_lengths = self.forward_feats(x, x_lengths, languageid) + + feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + + y = self.transducer.infer(feats, + feat_lengths, + languageid, + decoding_method=decoding_method, + beam_width=beam_width, + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt) + return y + + def unfreeze_film(self): + for name, param in self.named_parameters(): + if "film" in name: + logging.info(f"unfreezing {name}") + param.requires_grad = True + if "lang_embedding" in name: + logging.info(f"unfreezing {name}") + param.requires_grad = True + + def freeze_feat_fuser(self): + if self.feat_fuser is None: + return + + if self.feat_fusion_method in ["weighted-avg", "film-weighted-avg", "film-fused-feature"]: + self.feat_fuser.requires_grad = False + return + + for param in self.feat_fuser.parameters(): + param.requires_grad = False + + def freeze_hf_feats(self): + self.hf_feats.freeze() + + def freeze_hf_feature_encoder(self): + self.hf_feats.freeze_feature_encoder() + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + elif mode in ["ft-film", "ft-film-grad"]: + self.freeze() + self.unfreeze_film() + elif mode in ["ft-transducer", "ft-transducer-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + self.freeze_feat_fuser() + elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + elif mode == "hf-feat-extractor-frozen": + self.unfreeze() + self.freeze_hf_feature_encoder() + else: + raise ValueError(f"invalid train_mode={mode}") + + logging.info("train mode set to %s", mode) + + if "nograd" in mode: + logging.info("using torch.no_grad for hf_feats") + self._hf_context = torch.no_grad() + else: + self._hf_context = contextlib.nullcontext() + + self._train_mode = mode + + def _train(self, train_mode: str): + + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + elif train_mode in [ + "ft-film", + "ft-transducer", + "hf-feats-frozen", + "ft-film-grad", + "ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ]: + self.hf_feats.train() + self.transducer._train("full") + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + 
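+    # Train-mode summary (see set_train_mode above): "ft-film" freezes the model
+    # and then unfreezes only the FiLM layers and the language embedding;
+    # "ft-transducer" trains the transducer with the wav2vec2 features and the
+    # feature fuser frozen; the "*-nograd" variants additionally run the
+    # wav2vec2 forward pass under torch.no_grad().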
def valid_train_modes(): + return [ + "full", + "frozen", + "ft-film", + "ft-embed-affine", + "ft-transducer", + "ft-film-grad", + "hf-feats-frozen", + "ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ] + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "hf_feats", + "transducer", + "feat_fusion_start", + "feat_fusion_method", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + def get_config(self): + hf_cfg = self.hf_feats.get_config() + tran_cfg = self.transducer.get_config() + del hf_cfg["class_name"] + del tran_cfg["class_name"] + config = { + "hf_feats": hf_cfg, + "transducer": tran_cfg, + "feat_fusion_start": self.feat_fusion_start, + "feat_fusion_method": self.feat_fusion_method, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def change_config(self, hf_feats, transducer): + logging.info("changing hf wav2transducer config") + self.hf_feats.change_config(**hf_feats) + self.transducer.change_config(**transducer) + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--feat-fusion-start", + default=0, + type=int, + help=""" + the input to x-vector model will fuse the wav2vec + layers from feat_fusion_start to + the wav2vec num_layers""", + ) + parser.add_argument( + "--feat-fusion-method", + default="weighted-avg", + choices=["weighted-avg", "linear", "cat", "last"], + help=("method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, linear, cat, last]"), + ) + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) + + @staticmethod + def add_infer_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNFiLMTransducer.add_infer_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_infer_args(**kwargs): + return RNNFiLMTransducer.filter_infer_args(**kwargs) diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py index 1d16675c..8fc59a3d 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py @@ -226,6 +226,9 @@ def freeze_feat_fuser(self): def freeze_hf_feats(self): self.hf_feats.freeze() + def freeze_hf_except_lora(self, bias=None): + self.hf_feats.freeze_except_lora(bias) + def freeze_hf_feature_encoder(self): self.hf_feats.freeze_feature_encoder() @@ -247,6 +250,15 @@ def set_train_mode(self, mode): elif mode == "hf-feat-extractor-frozen": self.unfreeze() self.freeze_hf_feature_encoder() + elif mode == "hf-lora": + self.unfreeze() + self.freeze_hf_except_lora() + elif mode == "hf-all-bias-lora": + self.unfreeze() + self.freeze_hf_except_lora(bias="all") + elif mode == "hf-lora-with-bias": + self.unfreeze() + self.freeze_hf_except_lora(bias="lora_only") else: raise ValueError(f"invalid train_mode={mode}") @@ -270,6 +282,9 @@ def _train(self, train_mode: str): "ft-transducer-nograd", "hf-feats-frozen-nograd", "hf-feat-extractor-frozen", + "hf-lora", + "hf-all-bias-lora", + "hf-lora-with-bias", ]: self.hf_feats.train() self.transducer._train("full") @@ -287,6 +302,9 @@ def valid_train_modes(): 
"ft-transducer-nograd", "hf-feats-frozen-nograd", "hf-feat-extractor-frozen", + "hf-lora", + "hf-all-bias-lora", + "hf-lora-with-bias", ] @staticmethod diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py new file mode 100644 index 00000000..9ee37287 --- /dev/null +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py @@ -0,0 +1,107 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Dict, Optional, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...tpm import HFWav2Vec2 +from .hf_wav2rnn_film_transducer import HFWav2RNNFiLMTransducer +from ..transducer import RNNFiLMTransducer +from ..xvectors import ResNet1dXVector as ResNet1dLanguageID + +class HFWav2Vec2RNNFiLMTransducer(HFWav2RNNFiLMTransducer): + """Class for RNN-T with Wav2Vec2 features + + Attributes: + Attributes: + hf_feats: HFWav2Vec configuration dictionary or object. + This is a warpper over Hugging Face Wav2Vec model. + transducer: Transducer configuration dictionary or object. + feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. + """ + + def __init__( + self, + hf_feats: Union[Dict, HFWav2Vec2], + transducer: Union[Dict, RNNFiLMTransducer], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", + ): + + if isinstance(hf_feats, dict): + if "class_name" in hf_feats: + del hf_feats["class_name"] + hf_feats = HFWav2Vec2(**hf_feats) + else: + assert isinstance(hf_feats, HFWav2Vec2) + + # if isinstance(transducer, dict): + # transducer["decoder"]["in_feats"] = hf_feats.hidden_size + # transducer["joiner"]["in_feats"] = hf_feats.hidden_size + # if "class_name" in transducer: + # del transducer["class_name"] + # transducer = Transducer(**transducer) + # else: + # assert isinstance(transducer, Transducer) + # assert transducer.decoder.in_feats == hf_feats.hidden_size + # assert transducer.joiner.in_feats == hf_feats.hidden_size + + super().__init__(hf_feats, transducer, feat_fusion_start, + feat_fusion_method) + + + + @staticmethod + def filter_args(**kwargs): + base_args = HFWav2RNNFiLMTransducer.filter_args(**kwargs) + child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNFiLMTransducer.filter_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_class_args(parser, prefix="hf_feats") + RNNFiLMTransducer.add_class_args(parser, prefix="transducer") + HFWav2RNNFiLMTransducer.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNFiLMTransducer.filter_finetune_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): 
+ if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") + RNNFiLMTransducer.add_finetune_args(parser, prefix="transducer") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + diff --git a/hyperion/torch/models/wav2transducer_languageid/__init__.py b/hyperion/torch/models/wav2transducer_languageid/__init__.py new file mode 100644 index 00000000..bc785608 --- /dev/null +++ b/hyperion/torch/models/wav2transducer_languageid/__init__.py @@ -0,0 +1,8 @@ +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" + +from .hf_wav2vec2rnn_transducer_languageid import HFWav2Vec2RNNTransducerResnet1D +from .hf_wav2vec2rnn_film_transducer_languageid import HFWav2Vec2RNNFiLMTransducerResnet1D \ No newline at end of file diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py new file mode 100644 index 00000000..551d34de --- /dev/null +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py @@ -0,0 +1,767 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import contextlib +import logging +from dataclasses import dataclass +from typing import Dict, List, Optional, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ....utils import HypDataClass +from ...torch_model import TorchModel +from ...utils import remove_silence +from ..transducer import RNNTransducer, RNNFiLMTransducer, RNNTransducerOutput +from .hf_wav2rnn_transducer_languageid import RNNTransducerLanguageIDOutput +from ..xvectors import ResNet1dXVector as ResNet1dLanguageID +from ...layer_blocks import FiLM + + +class HFWav2RNNFiLMTransducerLanguageID(TorchModel): + """Abstract Base class for combined transducer language identification models that use a Hugging Face Model as feature extractor. + + Attributes: + hf_feats: hugging face model wrapper object. + transducer: transducer model object. + languageid: language identification model object. + feat_fusion_start: the input to the combined model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. 
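+        loss_lid_type: language-ID loss type ("CE", "weightedCE" or "focal_loss").
+        loss_class_weight: optional per-class weights for the language-ID loss.
+        loss_class_weight_exp: exponent applied to the inverse class weights.
+        loss_weight_transducer: weight of the transducer loss in the total loss.
+        loss_weight_lid: weight of the language-ID loss in the total loss.
+        loss_weight_embed: weight of the FiLM-embedding consistency loss.
+        loss_reg_weight_transducer: weight of the transducer regularization term.
+        loss_reg_weight_lid: weight of the language-ID regularization term.
+        lid_length: length of the chunk used by the language-ID branch.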
+ """ + + def __init__(self, + hf_feats: TorchModel, + transducer: Union[Dict, TorchModel], + languageid: Union[Dict, TorchModel], + feat_fusion_start_transducer: int = 0, + feat_fusion_start_lid: int = 0, + feat_fusion_method_transducer: str = "film-weighted-avg", + feat_fusion_method_lid: str = "weighted-avg", + loss_lid_type: str = "weightedCE", + loss_class_weight: Optional[torch.Tensor] = None, + loss_class_weight_exp= 1.0, + loss_weight_transducer: float = 0.005, + loss_weight_lid: float = 1.0, + loss_weight_embed: float = 0.005, + loss_reg_weight_transducer: float = 0.0, + loss_reg_weight_lid: float = 0.0, + lid_length: float = 3.0, + ): + + super().__init__() + self.hf_feats = hf_feats + if isinstance(transducer, dict): + transducer["decoder"]["in_feats"] = hf_feats.hidden_size + #transducer["joiner"]["in_feats"] = hf_feats.hidden_size + if "class_name" in transducer: + del transducer["class_name"] + + transducer["encoder"] = None + transducer = RNNFiLMTransducer(**transducer) + else: + assert isinstance(transducer, RNNFiLMTransducer) + if transducer.encoder is None: + assert transducer.decoder.in_feats == hf_feats.hidden_size + #assert transducer.joiner.in_feats == hf_feats.hidden_size + + if isinstance(languageid, dict): + languageid["resnet_enc"]["in_feats"] = hf_feats.hidden_size + if "class_name" in languageid: + del languageid["class_name"] + languageid = ResNet1dLanguageID(**languageid) + else: + assert isinstance(languageid, ResNet1dLanguageID) + assert languageid.encoder_net.in_feats == hf_feats.hidden_size + + + self.transducer = transducer + self.languageid = languageid + self.feat_fusion_start_transducer = feat_fusion_start_transducer + self.feat_fusion_start_lid = feat_fusion_start_lid + self.feat_fusion_method_transducer = feat_fusion_method_transducer + self.feat_fusion_method_lid = feat_fusion_method_lid + self.loss_lid_type = loss_lid_type + self.loss_class_weight = loss_class_weight + self.loss_class_weight_exp = loss_class_weight_exp + + if loss_lid_type == "CE" or loss_lid_type is None: + self.loss_lid = nn.CrossEntropyLoss() + elif loss_lid_type == "weightedCE": + self.loss_lid = nn.CrossEntropyLoss(weight=torch.tensor(loss_class_weight.values, dtype=torch.float)**(-loss_class_weight_exp)) + logging.info(torch.tensor(loss_class_weight.values)**(-loss_class_weight_exp)) + elif loss_lid_type == "focal_loss": + self.loss_lid = FocalLoss(alpha=torch.tensor(loss_class_weight.values)**(-loss_class_weight_exp), gamma=2, size_average=True) + + self.loss_weight_transducer = loss_weight_transducer + self.loss_weight_lid = loss_weight_lid + self.loss_weight_embed = loss_weight_embed + self.loss_reg_weight_transducer = loss_reg_weight_transducer + self.loss_reg_weight_lid = loss_reg_weight_lid + self.lid_length = lid_length + self._hf_context = contextlib.nullcontext() + if self.transducer.decoder.film_cond_type in ["one-hot", "lid_pred"]: + self.transducer_fuser, self.film, _ = self._make_fuser(self.feat_fusion_method_transducer, self.feat_fusion_start_transducer) + elif self.transducer.decoder.film_cond_type == "lid_pred_embed": + self.transducer_fuser, self.film, self.lid_film = self._make_fuser(self.feat_fusion_method_transducer, self.feat_fusion_start_transducer) + self.languageid_fuser, _, _ = self._make_fuser(self.feat_fusion_method_lid, self.feat_fusion_start_lid) + + def _make_fuser(self, method, start): + feat_fuser = None + film = None + lid_film = None + if method == "last": + return feat_fuser, None, None + num_layers = self.hf_feats.num_encoder_layers + 1 - 
start + layer_dim = self.hf_feats.hidden_size + if method == "film-weighted-avg": + film = nn.ModuleList([FiLM(layer_dim, self.transducer.decoder.condition_size) for _ in range(num_layers)]) + lid_film = nn.ModuleList([FiLM(layer_dim, self.transducer.decoder.condition_size) for _ in range(num_layers)]) + feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif method == "film-fused-feature": + feat_fuser = nn.Parameter(torch.zeros(num_layers)) + film = FiLM(layer_dim, self.transducer.decoder.condition_size) + lid_film = FiLM(layer_dim, self.transducer.decoder.condition_size) + elif method == "weighted-avg": + feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif method == "linear": + feat_fuser = nn.Linear(num_layers, 1, bias=False) + feat_fuser.weight.data = torch.ones(1, + num_layers) / num_layers + elif method == "cat": + feat_fuser = nn.Linear(num_layers * layer_dim, + layer_dim, + bias=False) + + return feat_fuser, film, lid_film + + def _fuse_transducer_hid_feats(self, hid_feats, lang_condition): + """Fuses the hidden features from the Wav2Vec model. + + Args: + hid_feats: list of hidden features Tensors from Wav2Vec model. + lang: language id Tensor. + + Returns: + Tensor of fused features (batch, channels, time) + """ + if len(hid_feats) == 1: + # There is only one layer of features + return hid_feats[0] + + if self.transducer.decoder.film_cond_type in ["one-hot", "lid_pred"]: + lang_condition = self.transducer.decoder.lang_embedding(lang_condition) + film = self.film + else: + film = self.lid_film + hid_feats = hid_feats[self.feat_fusion_start_transducer:] + if self.feat_fusion_method_transducer == "film-weighted-avg": + film_hid_feats = tuple(film[i](hid_feats[i], lang_condition) for i in range(len(film))) + film_hid_feats = torch.stack(film_hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.transducer_fuser, dim=-1) + feats = torch.sum(film_hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method_transducer == "film-fused-feature": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.transducer_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + feats = film(feats, lang_condition) + elif self.feat_fusion_method_transducer == "weighted-avg": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.transducer_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method_transducer == "linear": + hid_feats = torch.stack(hid_feats, dim=-1) + feats = self.transducer_fuser(hid_feats).squeeze(dim=-1) + elif self.feat_fusion_method_transducer == "cat": + hid_feats = torch.cat(hid_feats, dim=-1) + feats = self.transducer_fuser(hid_feats) + elif self.feat_fusion_method_transducer == "last": + feats = hid_feats[-1] + + return feats + + + def _fuse_lid_hid_feats(self, hid_feats): + """Fuses the hidden features from the Wav2Vec model. + + Args: + hid_feats: list of hidden features Tensors from Wav2Vec model. 
+ + Returns: + Tensor of fused features (batch, channels, time) + """ + if len(hid_feats) == 1: + # There is only one layer of features + return hid_feats[0] + + hid_feats = hid_feats[self.feat_fusion_start_lid:] + if self.feat_fusion_method_lid == "weighted-avg": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.languageid_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method_lid == "linear": + hid_feats = torch.stack(hid_feats, dim=-1) + feats = self.languageid_fuser(hid_feats).squeeze(dim=-1) + elif self.feat_fusion_method_lid == "cat": + hid_feats = torch.cat(hid_feats, dim=-1) + feats = self.languageid_fuser(hid_feats) + elif self.feat_fusion_method_lid == "last": + feats = hid_feats[-1] + + return feats + + def forward_lid_feats(self, + x, + x_lengths, + lang=None, + return_feat_layers=None, + chunk_length=0, + detach_chunks=False): + with self._hf_context: + hf_output = self.hf_feats( + x, + x_lengths, + return_hid_states=True, + chunk_length=chunk_length, + detach_chunks=detach_chunks, + ) + feat_lengths = hf_output["hidden_states_lengths"] + + hid_feats = hf_output["hidden_states"] + feats = self._fuse_lid_hid_feats(hid_feats) + + + feats = feats.transpose(1, 2) + + return feats, hid_feats, feat_lengths + + def compute_embed_loss(self, lang_embed, languageid): + # comput the loss for the embeding between the film and lid_film + lang_condition = self.transducer.decoder.lang_embedding(languageid) + + # for the encoder + film_scale = self.film.linear_scale(lang_condition) + lid_film_scale = self.lid_film.linear_scale(lang_embed) + film_shift = self.film.linear_shift(lang_condition) + lid_film_shift = self.lid_film.linear_shift(lang_embed) + loss_embed_encode = torch.mean(torch.abs(film_scale - lid_film_scale)) + torch.mean(torch.abs(film_shift - lid_film_shift)) + + # for the predictor + loss_embed_predictor = 0 + for i in range(2): + film_scale = self.transducer.decoder.predictor.rnn.films[i].linear_scale(lang_condition) + lid_film_scale = self.transducer.decoder.predictor.rnn.lid_films[i].linear_scale(lang_embed) + film_shift = self.transducer.decoder.predictor.rnn.films[i].linear_shift(lang_condition) + lid_film_shift = self.transducer.decoder.predictor.rnn.lid_films[i].linear_shift(lang_embed) + loss_embed_predictor += torch.mean(torch.abs(film_scale - lid_film_scale)) + torch.mean(torch.abs(film_shift - lid_film_shift)) + + + # for the joiner + film_scale = self.transducer.decoder.joiner.film.linear_scale(lang_condition) + lid_film_scale = self.transducer.decoder.joiner.lid_film.linear_scale(lang_embed) + film_shift = self.transducer.decoder.joiner.film.linear_shift(lang_condition) + lid_film_shift = self.transducer.decoder.joiner.lid_film.linear_shift(lang_embed) + loss_embed_joiner = torch.mean(torch.abs(film_scale - lid_film_scale)) + torch.mean(torch.abs(film_shift - lid_film_shift)) + + + loss_embed = loss_embed_encode + loss_embed_predictor + loss_embed_joiner + + return loss_embed + + def forward( + self, + x, + x_lengths=None, + text=None, + languageid=None, + return_feat_layers=None, + return_enc_layers=None, + return_classif_layers=[0], + return_logits=True, + ): + """Forward function. If returns the logits posteriors of the classes. + It can also returns the hidden representations in the wav2vec feature extractor, + the x-vector encoder and the + classification head. In this case the ouput variable is a dictionary. 
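+
+        The total loss is the weighted sum
+            loss = loss_weight_transducer * loss_transducer
+                   + loss_weight_lid * loss_lid
+                   + loss_weight_embed * loss_embed
+                   + loss_reg_weight_lid * loss_reg_lid
+                   + loss_reg_weight_transducer * loss_reg_transducer,
+        where loss_embed is non-zero only when the decoder is conditioned on the
+        predicted language embedding (film_cond_type == "lid_pred_embed").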
+ + Args: + x: input features tensor with shape=(batch, in_feats, time) + x_lengths: time lengths of the features with shape=(batch,) + y: target classes torch.long tensor with shape=(batch,) + return_feat_layers: list of integers indicating, which wav2vec layers + we should return. If None, no wav2vec layers are returned. + return_enc_layers: list of integers indicating, which encoder layers + we should return. If None, no encoder layers are returned. + return_logits: if True, it adds the logits to the output dictionary. + Returns: + Dataclass with losses, "h_enc" (list of hidden encoder layers), + "h_feats" (wav2vec features) + """ + feats_languageid, hid_feats, feat_lengths = self.forward_lid_feats( + x, x_lengths, return_feat_layers) + + lid_len = int(self.lid_length * 50) + min_len = torch.min(feat_lengths).item() + if min_len > lid_len: + lid_start = torch.randint(0, min_len - lid_len + 1, (1,)).item() + feats_languageid = feats_languageid[:, :, lid_start: lid_start + lid_len] + + + output = self.languageid( + feats_languageid, + None, + languageid, + return_enc_layers=return_enc_layers, + return_classif_layers=return_classif_layers, + return_logits=return_logits, + ) + loss_lid = self.loss_lid(output["logits"], languageid) + + + loss_embed = torch.tensor(0) + if self.transducer.decoder.film_cond_type == "lid_pred_embed": + loss_embed = self.compute_embed_loss(output["h_classif"][0], languageid) + feats_transducer = self._fuse_transducer_hid_feats(hid_feats, output["h_classif"][0]) # (N, T, C) + trans_output = self.transducer( + feats_transducer, + feat_lengths, + text, + output["h_classif"][0] + # lid_logits + ) + elif self.transducer.decoder.film_cond_type == "one-hot": + feats_transducer = self._fuse_transducer_hid_feats(hid_feats, languageid) + output = self.transducer( + feats, + feat_lengths, + text, + languageid, + ) + elif self.transducer.decoder.film_cond_type == "lid_pred": + _, lid_pred = torch.max(output["logits"], dim=-1) + # logging.info("logits", output["logits"]) + # logging.info("lid_pred", lid_pred) + feats_transducer = self._fuse_transducer_hid_feats(hid_feats, lid_pred) + trans_output = self.transducer( + feats_transducer, + feat_lengths, + text, + lid_pred + # lid_logits + ) + + if return_feat_layers: + trans_output.h_feats = [ + f.transpose(1, 2) for i, f in enumerate(hid_feats) + if i in return_feat_layers + ] + + # loss_reg_lid = 0 + # if self.loss_reg_weight_lid > 0: + loss_reg_lid = self.languageid.get_regularization_loss() + + # loss_reg_transducer = 0 + # if self.loss_reg_weight_transducer > 0: + loss_reg_transducer = self.transducer.get_regularization_loss() + + + + output = RNNTransducerLanguageIDOutput(loss=self.loss_weight_transducer * trans_output.loss + self.loss_weight_lid * loss_lid + self.loss_weight_embed * loss_embed + self.loss_reg_weight_lid * loss_reg_lid + self.loss_reg_weight_transducer * loss_reg_transducer, + loss_transducer=trans_output.loss, + loss_lid=loss_lid, + loss_embed=loss_embed if self.transducer.decoder.film_cond_type == "lid_pred_embed" else None, + loss_reg_lid=loss_reg_lid, + loss_reg_transducer=loss_reg_transducer, + loss_transducer_simple=trans_output.loss_simple, + loss_transducer_pruned=trans_output.loss_pruned, + h_feats=trans_output.h_feats, + logits=output["logits"] if return_logits else None) + # logits=lid_logits if return_logits else None) + return output + + def infer(self, + x: torch.Tensor, + x_lengths: torch.Tensor, + decoding_method="time_sync_beam_search", + beam_width: int = 5, + max_sym_per_frame: int = 3, 
+ max_sym_per_utt: int = 1000): + """ + ASR tokens inference + Args: + x: input features with shape = (N, T, C) + x_lengths: feature number for frames with shape = (N,) + decoding_method: greedy, time_sync_beam_search or align_length_sync_beam_search + max_sym_per_frame: maximum number of symbols RNN-T can emit in 1 frame. + max_sym_per_utt: maximimum number of symbols in a single utterance. + Returns: + List of list of integer indexes of the recognizer's symbols. + """ + + + feats_languageid, hid_feats, feat_lengths = self.forward_lid_feats( + x, x_lengths, None) + # logging.info(f"feat_lengths: {feat_lengths}") + # logging.info(f"feats_languageid.shape: {feats_languageid.shape}") + # logging.info(f"feats_languageid: {feats_languageid}") + + + output = self.languageid( + feats_languageid, + None, + None, + return_enc_layers=None, + return_classif_layers=[0], + return_logits=True, + ) + + # output = self.languageid( + # feats_languageid, + # feat_lengths, + # None, + # return_enc_layers=None, + # return_classif_layers=[0], + # return_logits=True, + # ) + + feats_transducer = self._fuse_transducer_hid_feats(hid_feats, output["h_classif"][0]) # (N, T, C) + + + text = self.transducer.infer(feats_transducer, + feat_lengths, + lang=output["h_classif"][0], + decoding_method=decoding_method, + beam_width=beam_width, + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt) + + return text, output["logits"] + + def unfreeze_lid_film(self): + for name, param in self.named_parameters(): + if "lid_film" in name: + logging.info(f"unfreezing {name}") + param.requires_grad = True + + + def unfreeze_film(self): + for name, param in self.named_parameters(): + if "film" in name: + logging.info(f"unfreezing {name}") + param.requires_grad = True + + + def freeze_lid(self): + self.languageid.freeze() + + def freeze_film(self): + for name, param in self.named_parameters(): + # logging.info(f"parameter {name}") + if "film" in name and "lid_film" not in name: + logging.info(f"freezing {name}") + param.requires_grad = False + if "lang_embedding" in name: + logging.info(f"freezing {name}") + param.requires_grad = False + + def freeze_lid_feat_fuser(self): + if self.languageid_fuser is None: + return + + if self.feat_fusion_method_lid == "weighted-avg": + self.languageid_fuser.requires_grad = False + return + + for param in self.languageid_fuser.parameters(): + param.requires_grad = False + + def freeze_hf_feats(self): + self.hf_feats.freeze() + + def freeze_hf_feature_encoder(self): + self.hf_feats.freeze_feature_encoder() + + def set_train_mode(self, mode): + logging.info("setting train mode to %s", mode) + logging.info("train mode was %s", self._train_mode) + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + if mode == "freeze-gt-film": + self.unfreeze() + self.freeze_film() + elif mode == "frozen": + self.freeze() + elif mode in ["ft-gt-film", "ft-gt-film-grad"]: + self.freeze() + self.unfreeze_film() + elif mode in ["ft-film", "ft-film-grad"]: + self.freeze() + self.unfreeze_lid_film() + elif mode in ["ft-transducer", "ft-transducer-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + self.freeze_film() + self.freeze_lid_feat_fuser() + self.freeze_lid() + # self.unfreeze_lid_film() + elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + elif mode == "hf-feat-extractor-frozen": + self.unfreeze() + self.freeze_hf_feature_encoder() + else: + raise ValueError(f"invalid train_mode={mode}") + + logging.info("train 
mode set to %s", mode) + + if "nograd" in mode: + logging.info("using torch.no_grad for hf_feats") + self._hf_context = torch.no_grad() + else: + self._hf_context = contextlib.nullcontext() + + self._train_mode = mode + + def _train(self, train_mode: str): + + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + elif train_mode in [ + "ft-gt-film", + "ft-film", + "freeze-gt-film", + "ft-transducer", + "hf-feats-frozen", + "ft-gt-film-grad", + "ft-film-grad", + "ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ]: + self.hf_feats.train() + self.transducer._train("full") + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return [ + "full", + "frozen", + "freeze-gt-film", + "ft-gt-film", + "ft-film", + "ft-embed-affine", + "ft-transducer", + "ft-gt-film-grad", + "ft-film-grad", + "hf-feats-frozen", + "ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ] + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "hf_feats", + "transducer", + "feat_fusion_start_transducer", + "feat_fusion_start_lid", + "feat_fusion_method_transducer", + "feat_fusion_method_lid", + "loss_lid_type", + "loss_class_weight", + "loss_class_weight_exp", + "loss_weight_transducer", + "loss_weight_lid", + "loss_weight_embed", + "loss_reg_weight_transducer", + "loss_reg_weight_lid", + "languageid", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + def get_config(self): + hf_cfg = self.hf_feats.get_config() + tran_cfg = self.transducer.get_config() + lid_cfg = self.languageid.get_config() + del hf_cfg["class_name"] + del tran_cfg["class_name"] + del lid_cfg["class_name"] + config = { + "hf_feats": hf_cfg, + "transducer": tran_cfg, + "languageid": lid_cfg, + "feat_fusion_start_transducer": self.feat_fusion_start_transducer, + "feat_fusion_start_lid": self.feat_fusion_start_lid, + "feat_fusion_method_transducer": self.feat_fusion_method_transducer, + "feat_fusion_method_lid": self.feat_fusion_method_lid, + "loss_lid_type": self.loss_lid_type, + "loss_class_weight": self.loss_class_weight, + "loss_class_weight_exp": self.loss_class_weight_exp, + "loss_weight_transducer": self.loss_weight_transducer, + "loss_weight_lid": self.loss_weight_lid, + "loss_weight_embed": self.loss_weight_embed, + "loss_reg_weight_transducer": self.loss_reg_weight_transducer, + "loss_reg_weight_lid": self.loss_reg_weight_lid, + "lid_length": self.lid_length, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def change_config(self, loss_weight_transducer, loss_weight_lid, loss_weight_embed, loss_reg_weight_transducer, loss_reg_weight_lid, lid_length, hf_feats, transducer, languageid): + logging.info("changing hf wav2film_transducer_languageid config") + + self.loss_weight_transducer = loss_weight_transducer + self.loss_weight_lid = loss_weight_lid + self.loss_weight_embed = loss_weight_embed + self.lid_length = lid_length + self.loss_reg_weight_transducer = loss_reg_weight_transducer + self.loss_reg_weight_lid = loss_reg_weight_lid + + self.hf_feats.change_config(**hf_feats) + self.transducer.change_config(**transducer) + self.languageid.change_config(**languageid) + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--feat-fusion-start-transducer", + default=0, + type=int, + 
help=""" + the input to transducer model will fuse the wav2vec + layers from feat_fusion_start_transducer to + the wav2vec num_layers""", + ) + parser.add_argument( + "--feat-fusion-start-lid", + default=0, + type=int, + help=""" + the input to lid model will fuse the wav2vec + layers from feat_fusion_start_lid to + the wav2vec num_layers""", + ) + + parser.add_argument( + "--feat-fusion-method-transducer", + default="weighted-avg", + choices=["weighted-avg", "linear", "cat", "last"], + help=("method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, linear, cat, last]"), + ) + parser.add_argument( + "--feat-fusion-method-lid", + default="weighted-avg", + choices=["weighted-avg", "linear", "cat", "last"], + help=("method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, linear, cat, last]"), + ) + + parser.add_argument( + "--loss-lid-type", + default="weightedCE", + choices=["CE", "weightedCE", "focal_loss"], + help=("loss type for language identification"), + ) + parser.add_argument( + "--loss-class-weight", + default=None, + type=str, + help=("class weight for language identification"), + ) + parser.add_argument( + "--loss-class-weight-exp", + default=1.0, + type=float, + help=("class weight exponent for language identification"), + ) + parser.add_argument( + "--loss-weight-transducer", + default=0.005, + type=float, + help=""" + The weight of the transducer loss + """, + ) + + parser.add_argument( + "--loss-weight-lid", + default=1.0, + type=float, + help=""" + The weight of the lid loss + """, + ) + + parser.add_argument( + "--loss-weight-embed", + default=0.005, + type=float, + help=""" + The weight of the embedding loss + """, + ) + parser.add_argument( + "--loss-reg-weight-transducer", + default=0.0, + type=float, + help=""" + The weight of the transducer regularization loss + """, + ) + parser.add_argument( + "--loss-reg-weight-lid", + default=0.0, + type=float, + help=""" + The weight of the lid regularization loss + """, + ) + + parser.add_argument( + "--lid-length", + default=3.0, + type=float, + help=""" + The length of the chunks for language id + """, + ) + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) + + @staticmethod + def add_infer_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNFiLMTransducer.add_infer_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_infer_args(**kwargs): + return RNNFiLMTransducer.filter_infer_args(**kwargs) diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py new file mode 100644 index 00000000..b2da0920 --- /dev/null +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py @@ -0,0 +1,587 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import contextlib +import logging +from dataclasses import dataclass +from typing import Dict, List, Optional, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ....utils import HypDataClass +from ...torch_model import TorchModel +from ...utils import remove_silence +from ..transducer import RNNTransducer, RNNTransducerOutput +from ..xvectors 
import ResNet1dXVector as ResNet1dLanguageID + +@dataclass +class RNNTransducerLanguageIDOutput(HypDataClass): + loss: torch.Tensor # Total loss + loss_transducer: torch.Tensor # Loss from the transducer + loss_lid: torch.Tensor # Loss from the language ID + loss_embed: Optional[torch.Tensor] = None # Loss from the embedding + loss_reg_lid: Optional[torch.Tensor] = None # Regularization loss from the language ID + loss_reg_transducer: Optional[torch.Tensor] = None # Regularization loss from the transducer + loss_transducer_simple: Optional[torch.Tensor] = None # Simple loss from the transducer, if available + loss_transducer_pruned: Optional[torch.Tensor] = None # Pruned loss from the transducer, if available + h_feats: Optional[List[torch.Tensor]] = None # Hidden features, if available + logits: Optional[torch.Tensor] = None # Logits from languageid, if available + + +class HFWav2RNNTransducerLanguageID(TorchModel): + """Abstract Base class for combined transducer language identification models that use a Hugging Face Model as feature extractor. + + Attributes: + hf_feats: hugging face model wrapper object. + transducer: transducer model object. + languageid: language identification model object. + feat_fusion_start: the input to the combined model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. + """ + + def __init__(self, + hf_feats: TorchModel, + transducer: Union[Dict, TorchModel], + languageid: Union[Dict, TorchModel], + feat_fusion_start_transducer: int = 0, + feat_fusion_start_lid: int = 0, + feat_fusion_method_transducer: str = "weighted-avg", + feat_fusion_method_lid: str = "weighted-avg", + loss_lid_type: str = "weightedCE", + loss_class_weight: Optional[torch.Tensor] = None, + loss_class_weight_exp= 1.0, + loss_weight_transducer: float = 0.005, + loss_weight_lid: float = 1.0, + lid_length: float = 3.0, + ): + + super().__init__() + self.hf_feats = hf_feats + if isinstance(transducer, dict): + transducer["decoder"]["in_feats"] = hf_feats.hidden_size + #transducer["joiner"]["in_feats"] = hf_feats.hidden_size + if "class_name" in transducer: + del transducer["class_name"] + + transducer["encoder"] = None + transducer = RNNTransducer(**transducer) + else: + assert isinstance(transducer, RNNTransducer) + if transducer.encoder is None: + assert transducer.decoder.in_feats == hf_feats.hidden_size + #assert transducer.joiner.in_feats == hf_feats.hidden_size + + if isinstance(languageid, dict): + languageid["resnet_enc"]["in_feats"] = hf_feats.hidden_size + if "class_name" in languageid: + del languageid["class_name"] + languageid = ResNet1dLanguageID(**languageid) + else: + assert isinstance(languageid, ResNet1dLanguageID) + assert languageid.encoder_net.in_feats == hf_feats.hidden_size + + + self.transducer = transducer + self.languageid = languageid + self.feat_fusion_start_transducer = feat_fusion_start_transducer + self.feat_fusion_start_lid = feat_fusion_start_lid + self.feat_fusion_method_transducer = feat_fusion_method_transducer + self.feat_fusion_method_lid = feat_fusion_method_lid + self.loss_lid_type = loss_lid_type + self.loss_class_weight = loss_class_weight + self.loss_class_weight_exp = loss_class_weight_exp + + if loss_lid_type == "CE" or loss_lid_type is None: + self.loss_lid = nn.CrossEntropyLoss() + elif loss_lid_type == "weightedCE": + self.loss_lid = nn.CrossEntropyLoss(weight=torch.tensor(loss_class_weight.values, 
dtype=torch.float)**(-loss_class_weight_exp)) + logging.info(torch.tensor(loss_class_weight.values)**(-loss_class_weight_exp)) + elif loss_lid_type == "focal_loss": + self.loss_lid = FocalLoss(alpha=torch.tensor(loss_class_weight.values)**(-loss_class_weight_exp), gamma=2, size_average=True) + + self.loss_weight_transducer = loss_weight_transducer + self.loss_weight_lid = loss_weight_lid + self.lid_length = lid_length + self._hf_context = contextlib.nullcontext() + self.transducer_fuser = self._make_fuser(self.feat_fusion_method_transducer, self.feat_fusion_start_transducer) + self.languageid_fuser = self._make_fuser(self.feat_fusion_method_lid, self.feat_fusion_start_lid) + + def _make_fuser(self, method, start): + if method == "last": + feat_fuser = None + return feat_fuser + num_layers = self.hf_feats.num_encoder_layers + 1 - start + layer_dim = self.hf_feats.hidden_size + if method == "weighted-avg": + feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif method == "linear": + feat_fuser = nn.Linear(num_layers, 1, bias=False) + feat_fuser.weight.data = torch.ones(1, + num_layers) / num_layers + elif method == "cat": + feat_fuser = nn.Linear(num_layers * layer_dim, + layer_dim, + bias=False) + + return feat_fuser + + + def _fuse_hid_feats(self, hid_feats): + """Fuses the hidden features from the Wav2Vec model. + + Args: + hid_feats: list of hidden features Tensors from Wav2Vec model. + + Returns: + Tensor of fused features (batch, channels, time) + """ + if len(hid_feats) == 1: + # There is only one layer of features + return hid_feats[0] + + hid_feats = hid_feats[self.feat_fusion_start_transducer:] + if self.feat_fusion_method_transducer == "weighted-avg": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_transducer_weights = nn.functional.softmax(self.transducer_fuser, dim=-1) + norm_lid_weights = nn.functional.softmax(self.languageid_fuser, dim=-1) + feats_transducer = torch.sum(hid_feats * norm_transducer_weights, dim=-1) + feats_languageid = torch.sum(hid_feats * norm_lid_weights, dim=-1) + elif self.feat_fusion_method_transducer == "linear": + hid_feats = torch.stack(hid_feats, dim=-1) + feats_transducer = self.transducer_fuser(hid_feats).squeeze(dim=-1) + feats_languageid = self.languageid_fuser(hid_feats).squeeze(dim=-1) + elif self.feat_fusion_method_transducer == "cat": + hid_feats = torch.cat(hid_feats, dim=-1) + feats_transducer = self.transducer_fuser(hid_feats) + feats_languageid = self.languageid_fuser(hid_feats) + elif self.feat_fusion_method_transducer == "last": + feats = hid_feats[-1] + + return feats_transducer, feats_languageid + + def forward_feats(self, + x, + x_lengths, + return_feat_layers=None, + chunk_length=0, + detach_chunks=False): + return_hid_states = (False if return_feat_layers is None + and self.feat_fusion_method_transducer == "last" else True) + with self._hf_context: + hf_output = self.hf_feats( + x, + x_lengths, + return_hid_states=return_hid_states, + chunk_length=chunk_length, + detach_chunks=detach_chunks, + ) + feat_lengths = hf_output["hidden_states_lengths"] + if return_hid_states: + hid_feats = hf_output["hidden_states"] + feats_transducer, feats_languageid = self._fuse_hid_feats(hid_feats) + # feats_languageid = self._fuse_hid_feats(hid_feats, self.feat_fusion_method_lid, self.languageid_fuser) + else: + hid_feats = None + feats_transducer = hf_output["last_hidden_state"] + feats_languageid = hf_output["last_hidden_state"] + + feats_transducer = feats_transducer.transpose(1, 2) + feats_languageid = feats_languageid.transpose(1, 2) + 
if return_feat_layers is not None: + # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) + # as the hidden features of the x-vector encoder. + hid_feats = [ + f.transpose(1, 2) for i, f in enumerate(hid_feats) + if i in return_feat_layers + ] + else: + hid_feats = None + + return feats_transducer, feats_languageid, hid_feats, feat_lengths + + # def languageid_chunk(self, feats, lengths): + # sr = self.hf_feats.get_config()["sample_frequency"] + # strides = self.hf_feats.get_config()["conv_stride"] + + # total_stride = torch.prod(torch.tensor(strides, dtype=torch.float32)) + + # chunk_length = int(self.lid_length * sr / total_stride) + + # # Check if all samples are longer than chunk_length + # if any(len < chunk_length for len in lengths): + # return feats + + # start_indices = [torch.randint(0, len - chunk_length + 1, (1,)).item() for len in lengths] + + # chunks = torch.stack([feats[i, :, start:start + chunk_length] for i, start in enumerate(start_indices)]) + + # return chunks + + + def forward( + self, + x, + x_lengths=None, + text=None, + languageid=None, + return_feat_layers=None, + return_enc_layers=None, + return_classif_layers=None, + return_logits=True, + ): + """Forward function. If returns the logits posteriors of the classes. + It can also returns the hidden representations in the wav2vec feature extractor, + the x-vector encoder and the + classification head. In this case the ouput variable is a dictionary. + + Args: + x: input features tensor with shape=(batch, in_feats, time) + x_lengths: time lengths of the features with shape=(batch,) + y: target classes torch.long tensor with shape=(batch,) + return_feat_layers: list of integers indicating, which wav2vec layers + we should return. If None, no wav2vec layers are returned. + return_enc_layers: list of integers indicating, which encoder layers + we should return. If None, no encoder layers are returned. + return_logits: if True, it adds the logits to the output dictionary. 
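+
+        The total loss is the weighted sum
+            loss = loss_weight_transducer * loss_transducer + loss_weight_lid * loss_lid.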
+ Returns: + Dataclass with losses, "h_enc" (list of hidden encoder layers), + "h_feats" (wav2vec features) + """ + feats_transducer, feats_languageid, hid_feats, feat_lengths = self.forward_feats( + x, x_lengths, return_feat_layers) + + lid_len = int(self.lid_length * 50) + min_len = torch.min(feat_lengths).item() + if min_len > lid_len: + lid_start = torch.randint(0, min_len - lid_len + 1, (1,)).item() + feats_languageid = feats_languageid[:, :, lid_start: lid_start + lid_len] + + + # feats_languageid = self.languageid_chunk(feats_languageid, feat_lengths) + + feats_transducer = feats_transducer.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + + logits = self.languageid( + feats_languageid, + None, + languageid, + return_enc_layers=return_enc_layers, + return_classif_layers=return_classif_layers, + return_logits=return_logits, + ) + + # loss_lid = nn.CrossEntropyLoss()(logits, languageid) + loss_lid = self.loss_lid(logits, languageid) + + trans_output = self.transducer( + feats_transducer, + feat_lengths, + text, + ) + + + if return_feat_layers: + trans_output.h_feats = hid_feats + output = RNNTransducerLanguageIDOutput(loss=self.loss_weight_transducer * trans_output.loss + self.loss_weight_lid * loss_lid, + loss_transducer=trans_output.loss, + loss_lid=loss_lid, + loss_transducer_simple=trans_output.loss_simple, + loss_transducer_pruned=trans_output.loss_pruned, + h_feats=trans_output.h_feats, + logits=logits if return_logits else None) + return output + + def infer(self, + x: torch.Tensor, + x_lengths: torch.Tensor, + decoding_method="time_sync_beam_search", + beam_width: int = 5, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000): + """ + ASR tokens inference + Args: + x: input features with shape = (N, T, C) + x_lengths: feature number for frames with shape = (N,) + decoding_method: greedy, time_sync_beam_search or align_length_sync_beam_search + max_sym_per_frame: maximum number of symbols RNN-T can emit in 1 frame. + max_sym_per_utt: maximimum number of symbols in a single utterance. + Returns: + List of list of integer indexes of the recognizer's symbols. 
+ """ + + feats_transducer, feats_languageid, _, feat_lengths = self.forward_feats(x, x_lengths) + # logging.info(f"feat_lengths: {feat_lengths}") + # logging.info(f"feats_transducer.shape: {feats_transducer.shape}") + # logging.info(f"feats_languageid.shape: {feats_languageid.shape}") + # logging.info(f"feats_transducer: {feats_transducer}") + # logging.info(f"feats_languageid: {feats_languageid}") + lid = self.languageid( + feats_languageid.float(), + None, + None, + return_enc_layers=None, + return_classif_layers=None, + return_logits=True, + ) + + + feats_transducer = feats_transducer.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + + text = self.transducer.infer(feats_transducer, + feat_lengths, + decoding_method=decoding_method, + beam_width=beam_width, + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt) + + return text, lid + + # def freeze_feat_fuser(self): + # if self.feat_fuser is None: + # return + + # if self.feat_fusion_method_transducer == "weighted-avg": + # self.feat_fuser.requires_grad = False + # return + + # for param in self.feat_fuser.parameters(): + # param.requires_grad = False + + def freeze_hf_feats(self): + self.hf_feats.freeze() + + def freeze_hf_feature_encoder(self): + self.hf_feats.freeze_feature_encoder() + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + elif mode in ["ft-transducer", "ft-transducer-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + self.freeze_feat_fuser() + elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + elif mode == "hf-feat-extractor-frozen": + self.unfreeze() + self.freeze_hf_feature_encoder() + else: + raise ValueError(f"invalid train_mode={mode}") + + logging.info("train mode set to %s", mode) + + if "nograd" in mode: + logging.info("using torch.no_grad for hf_feats") + self._hf_context = torch.no_grad() + else: + self._hf_context = contextlib.nullcontext() + + self._train_mode = mode + + def _train(self, train_mode: str): + + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + elif train_mode in [ + "ft-transducer", + "hf-feats-frozen", + "ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ]: + self.hf_feats.train() + self.transducer._train("full") + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return [ + "full", + "frozen", + "ft-embed-affine", + "ft-transducer", + "hf-feats-frozen", + "ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ] + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "hf_feats", + "transducer", + "feat_fusion_start_transducer", + "feat_fusion_start_lid", + "feat_fusion_method_transducer", + "feat_fusion_method_lid", + "loss_lid_type", + "loss_class_weight", + "loss_class_weight_exp", + "loss_weight_transducer", + "loss_weight_lid", + "languageid", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + def get_config(self): + hf_cfg = self.hf_feats.get_config() + tran_cfg = self.transducer.get_config() + lid_cfg = self.languageid.get_config() + del hf_cfg["class_name"] + del tran_cfg["class_name"] + del lid_cfg["class_name"] + config = { + "hf_feats": hf_cfg, + "transducer": tran_cfg, + "languageid": lid_cfg, + "feat_fusion_start_transducer": self.feat_fusion_start_transducer, + "feat_fusion_start_lid": self.feat_fusion_start_lid, + 
"feat_fusion_method_transducer": self.feat_fusion_method_transducer, + "feat_fusion_method_lid": self.feat_fusion_method_lid, + "loss_lid_type": self.loss_lid_type, + "loss_class_weight": self.loss_class_weight, + "loss_class_weight_exp": self.loss_class_weight_exp, + "loss_weight_transducer": self.loss_weight_transducer, + "loss_weight_lid": self.loss_weight_lid, + "lid_length": self.lid_length, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + # def change_config(self, hf_feats, transducer, languageid): + def change_config(self, loss_weight_transducer, loss_weight_lid, lid_length, hf_feats, transducer, languageid): + logging.info("changing hf wav2transducer config") + + self.loss_weight_transducer = loss_weight_transducer + self.loss_weight_lid = loss_weight_lid + self.lid_length = lid_length + self.loss_reg_weight_transducer = loss_reg_weight_transducer + self.loss_reg_weight_lid = loss_reg_weight_lid + + self.hf_feats.change_config(**hf_feats) + self.transducer.change_config(**transducer) + self.languageid.change_config(**languageid) + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--feat-fusion-start-transducer", + default=0, + type=int, + help=""" + the input to transducer model will fuse the wav2vec + layers from feat_fusion_start_transducer to + the wav2vec num_layers""", + ) + parser.add_argument( + "--feat-fusion-start-lid", + default=0, + type=int, + help=""" + the input to lid model will fuse the wav2vec + layers from feat_fusion_start_lid to + the wav2vec num_layers""", + ) + + parser.add_argument( + "--feat-fusion-method-transducer", + default="weighted-avg", + choices=["weighted-avg", "linear", "cat", "last"], + help=("method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, linear, cat, last]"), + ) + parser.add_argument( + "--feat-fusion-method-lid", + default="weighted-avg", + choices=["weighted-avg", "linear", "cat", "last"], + help=("method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, linear, cat, last]"), + ) + + parser.add_argument( + "--loss-lid-type", + default="weightedCE", + choices=["CE", "weightedCE", "focal_loss"], + help=("loss type for language identification"), + ) + parser.add_argument( + "--loss-class-weight", + default=None, + type=str, + help=("class weight for language identification"), + ) + parser.add_argument( + "--loss-class-weight-exp", + default=1.0, + type=float, + help=("class weight exponent for language identification"), + ) + parser.add_argument( + "--loss-weight-transducer", + default=0.005, + type=float, + help=""" + The weight of the transducer loss + """, + ) + + parser.add_argument( + "--loss-weight-lid", + default=1.0, + type=float, + help=""" + The weight of the lid loss + """, + ) + + parser.add_argument( + "--lid-length", + default=3.0, + type=float, + help=""" + The length of the chunks for language id + """, + ) + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) + + @staticmethod + def add_infer_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNTransducer.add_infer_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_infer_args(**kwargs): + return 
RNNTransducer.filter_infer_args(**kwargs) diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py new file mode 100644 index 00000000..cad64e99 --- /dev/null +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py @@ -0,0 +1,227 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Dict, Optional, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...tpm import HFWav2Vec2 +from ..transducer import RNNFiLMTransducer +from ..xvectors import ResNet1dXVector as ResNet1dLanguageID +from ..wav2languageid import HFWav2Vec2ResNet1dLanguageID +from ..wav2transducer import HFWav2Vec2RNNFiLMTransducer + + +from .hf_wav2rnn_film_transducer_languageid import HFWav2RNNFiLMTransducerLanguageID + + +class HFWav2Vec2RNNFiLMTransducerResnet1D(HFWav2RNNFiLMTransducerLanguageID): + """Class for RNN-T with Wav2Vec2 features + + Attributes: + Attributes: + hf_feats: HFWav2Vec configuration dictionary or object. + This is a warpper over Hugging Face Wav2Vec model. + transducer: Transducer configuration dictionary or object. + feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. + """ + + def __init__( + self, + hf_feats: Union[Dict, HFWav2Vec2], + transducer: Union[Dict, RNNFiLMTransducer], + languageid: Union[Dict, ResNet1dLanguageID], + feat_fusion_start_transducer: int = 0, + feat_fusion_start_lid: int = 0, + feat_fusion_method_transducer: str = "weighted-avg", + feat_fusion_method_lid: str = "weighted-avg", + loss_lid_type: str = "weightedCE", + loss_class_weight: Optional[torch.Tensor] = None, + loss_class_weight_exp: float = 1.0, + loss_weight_transducer: float = 0.005, + loss_weight_lid: float = 1.0, + loss_weight_embed: float = 0.005, + loss_reg_weight_transducer: float = 0.0, + loss_reg_weight_lid: float = 0.0, + lid_length: float = 3.0, + ): + + if isinstance(hf_feats, dict): + if "class_name" in hf_feats: + del hf_feats["class_name"] + hf_feats = HFWav2Vec2(**hf_feats) + else: + assert isinstance(hf_feats, HFWav2Vec2) + + # if isinstance(languageid, dict): + # languageid["resnet_enc"]["in_feats"] = hf_feats.hidden_size + # if "class_name" in languageid: + # del languageid["class_name"] + # languageid = ResNet1dLanguageID(**languageid) + # else: + # assert isinstance(languageid, ResNet1dLanguageID) + # assert languageid.encoder_net.in_feats == hf_feats.hidden_size + + # hf_feats = wav2transducer.hf_feats + # transducer = wav2transducer.transducer + # languageid = wav2languageid.languageid + + + super().__init__(hf_feats, transducer, languageid, + feat_fusion_start_transducer=feat_fusion_start_transducer, + feat_fusion_start_lid=feat_fusion_start_lid, + feat_fusion_method_transducer=feat_fusion_method_transducer, + feat_fusion_method_lid=feat_fusion_method_lid, + loss_lid_type=loss_lid_type, + loss_class_weight=loss_class_weight, + loss_class_weight_exp=loss_class_weight_exp, + loss_weight_transducer=loss_weight_transducer, + loss_weight_lid=loss_weight_lid, + loss_reg_weight_transducer=loss_reg_weight_transducer, + loss_reg_weight_lid=loss_reg_weight_lid, + 
loss_weight_embed=loss_weight_embed, + lid_length=lid_length) + + + + @staticmethod + def filter_args(**kwargs): + base_args = HFWav2RNNFiLMTransducerLanguageID.filter_args(**kwargs) + child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNFiLMTransducer.filter_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + child_args = ResNet1dLanguageID.filter_args(**kwargs["languageid"]) + base_args["languageid"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_class_args(parser, prefix="hf_feats") + RNNFiLMTransducer.add_class_args(parser, prefix="transducer") + # HFWav2RNNFiLMTransducer.add_class_args(parser) + ResNet1dLanguageID.add_class_args(parser, prefix="languageid") + HFWav2RNNFiLMTransducerLanguageID.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + + valid_args = ( + # "loss_lid_type", + # "loss_class_weight_exp", + "loss_weight_transducer", + "loss_weight_lid", + "loss_weight_embed", + "loss_reg_weight_transducer", + "loss_reg_weight_lid", + "lid_length", + ) + + base_args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNFiLMTransducer.filter_finetune_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + child_args = ResNet1dLanguageID.filter_finetune_args(**kwargs["languageid"]) + base_args["languageid"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + parser.add_argument( + "--loss-lid-type", + default="weightedCE", + type=str, + help=""" + The type of the loss for language id + """, + ) + parser.add_argument( + "--loss-class-weight-exp", + default=1.0, + type=float, + help=""" + The exponent of the class weight for language id + """, + ) + + parser.add_argument( + "--loss-weight-transducer", + default=0.005, + type=float, + help=""" + The weight of the transducer loss + """, + ) + + parser.add_argument( + "--loss-weight-lid", + default=1.0, + type=float, + help=""" + The weight of the lid loss + """, + ) + + parser.add_argument( + "--loss-weight-embed", + default=0.005, + type=float, + help=""" + The weight of the embedding loss + """, + ) + + parser.add_argument( + "--loss-reg-weight-transducer", + default=0.0, + type=float, + help=""" + The weight of the transducer regularization loss + """, + ) + + parser.add_argument( + "--loss-reg-weight-lid", + default=0.0, + type=float, + help=""" + The weight of the lid regularization loss + """, + ) + + parser.add_argument( + "--lid-length", + default=3.0, + type=float, + help=""" + The length of the chunks for language id + """, + ) + + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") + RNNFiLMTransducer.add_finetune_args(parser, prefix="transducer") + ResNet1dLanguageID.add_finetune_args(parser, prefix="languageid") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py 
b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py new file mode 100644 index 00000000..28d51679 --- /dev/null +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py @@ -0,0 +1,187 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Dict, Optional, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...tpm import HFWav2Vec2 +from ..transducer import RNNTransducer +from ..xvectors import ResNet1dXVector as ResNet1dLanguageID +from ..wav2languageid import HFWav2Vec2ResNet1dLanguageID +from ..wav2transducer import HFWav2Vec2RNNTransducer + + +from .hf_wav2rnn_transducer_languageid import HFWav2RNNTransducerLanguageID + + +class HFWav2Vec2RNNTransducerResnet1D(HFWav2RNNTransducerLanguageID): + """Class for RNN-T with Wav2Vec2 features + + Attributes: + Attributes: + hf_feats: HFWav2Vec configuration dictionary or object. + This is a warpper over Hugging Face Wav2Vec model. + transducer: Transducer configuration dictionary or object. + feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. + """ + + def __init__( + self, + hf_feats: Union[Dict, HFWav2Vec2], + transducer: Union[Dict, RNNTransducer], + languageid: Union[Dict, ResNet1dLanguageID], + feat_fusion_start_transducer: int = 0, + feat_fusion_start_lid: int = 0, + feat_fusion_method_transducer: str = "weighted-avg", + feat_fusion_method_lid: str = "weighted-avg", + loss_lid_type: str = "weightedCE", + loss_class_weight: Optional[torch.Tensor] = None, + loss_class_weight_exp: float = 1.0, + loss_weight_transducer: float = 0.005, + loss_weight_lid: float = 1.0, + lid_length: float = 3.0, + ): + + if isinstance(hf_feats, dict): + if "class_name" in hf_feats: + del hf_feats["class_name"] + hf_feats = HFWav2Vec2(**hf_feats) + else: + assert isinstance(hf_feats, HFWav2Vec2) + + # if isinstance(languageid, dict): + # languageid["resnet_enc"]["in_feats"] = hf_feats.hidden_size + # if "class_name" in languageid: + # del languageid["class_name"] + # languageid = ResNet1dLanguageID(**languageid) + # else: + # assert isinstance(languageid, ResNet1dLanguageID) + # assert languageid.encoder_net.in_feats == hf_feats.hidden_size + + # hf_feats = wav2transducer.hf_feats + # transducer = wav2transducer.transducer + # languageid = wav2languageid.languageid + + + super().__init__(hf_feats, transducer, languageid, + feat_fusion_start_transducer=feat_fusion_start_transducer, + feat_fusion_start_lid=feat_fusion_start_lid, + feat_fusion_method_transducer=feat_fusion_method_transducer, + feat_fusion_method_lid=feat_fusion_method_lid, + loss_lid_type=loss_lid_type, + loss_class_weight=loss_class_weight, + loss_class_weight_exp=loss_class_weight_exp, + loss_weight_transducer=loss_weight_transducer, + loss_weight_lid=loss_weight_lid, + lid_length=lid_length) + + @staticmethod + def filter_args(**kwargs): + base_args = HFWav2RNNTransducerLanguageID.filter_args(**kwargs) + child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNTransducer.filter_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + child_args = 
ResNet1dLanguageID.filter_args(**kwargs["languageid"]) + base_args["languageid"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_class_args(parser, prefix="hf_feats") + RNNTransducer.add_class_args(parser, prefix="transducer") + # HFWav2RNNTransducer.add_class_args(parser) + ResNet1dLanguageID.add_class_args(parser, prefix="languageid") + HFWav2RNNTransducerLanguageID.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + + valid_args = ( + "loss_weight_transducer", + "loss_weight_lid", + "lid_length", + ) + + base_args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNTransducer.filter_finetune_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + child_args = ResNet1dLanguageID.filter_finetune_args(**kwargs["languageid"]) + base_args["languageid"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + parser.add_argument( + "--loss-lid-type", + default="weightedCE", + type=str, + help=""" + The type of the loss for language id + """, + ) + parser.add_argument( + "--loss-class-weight-exp", + default=1.0, + type=float, + help=""" + The exponent of the class weight for language id + """, + ) + + parser.add_argument( + "--loss-weight-transducer", + default=0.005, + type=float, + help=""" + The weight of the transducer loss + """, + ) + + parser.add_argument( + "--loss-weight-lid", + default=1.0, + type=float, + help=""" + The weight of the lid loss + """, + ) + + parser.add_argument( + "--lid-length", + default=3.0, + type=float, + help=""" + The length of the chunks for language id + """, + ) + + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") + RNNTransducer.add_finetune_args(parser, prefix="transducer") + ResNet1dLanguageID.add_finetune_args(parser, prefix="languageid") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py index 5599fa1e..925f1172 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -5,10 +5,9 @@ import contextlib import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...torch_model import TorchModel from ...utils import remove_silence @@ -26,12 +25,9 @@ class HFWav2XVector(TorchModel): than one layer is used. 
""" - def __init__(self, - hf_feats, - xvector, - feat_fusion_start=0, - feat_fusion_method="weighted-avg"): - + def __init__( + self, hf_feats, xvector, feat_fusion_start=0, feat_fusion_method="weighted-avg" + ): super().__init__() self.hf_feats = hf_feats self.xvector = xvector @@ -51,12 +47,9 @@ def _make_fuser(self): self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) elif self.feat_fusion_method == "linear": self.feat_fuser = nn.Linear(num_layers, 1, bias=False) - self.feat_fuser.weight.data = torch.ones(1, - num_layers) / num_layers + self.feat_fuser.weight.data = torch.ones(1, num_layers) / num_layers elif self.feat_fusion_method == "cat": - self.feat_fuser = nn.Linear(num_layers * layer_dim, - layer_dim, - bias=False) + self.feat_fuser = nn.Linear(num_layers * layer_dim, layer_dim, bias=False) def _fuse_hid_feats(self, hid_feats): """Fuses the hidden features from the Wav2Vec model. @@ -71,7 +64,7 @@ def _fuse_hid_feats(self, hid_feats): # There is only one layer of features return hid_feats[0] - hid_feats = hid_feats[self.feat_fusion_start:] + hid_feats = hid_feats[self.feat_fusion_start :] if self.feat_fusion_method == "weighted-avg": hid_feats = torch.stack(hid_feats, dim=-1) norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) @@ -125,14 +118,14 @@ def rebuild_output_layer( num_subcenters=num_subcenters, ) - def forward_feats(self, - x, - x_lengths, - return_feat_layers=None, - chunk_length=0, - detach_chunks=False): - return_hid_states = (False if return_feat_layers is None - and self.feat_fusion_method == "last" else True) + def forward_feats( + self, x, x_lengths, return_feat_layers=None, chunk_length=0, detach_chunks=False + ): + return_hid_states = ( + False + if return_feat_layers is None and self.feat_fusion_method == "last" + else True + ) with self._hf_context: hf_output = self.hf_feats( x, @@ -154,7 +147,8 @@ def forward_feats(self, # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) # as the hidden features of the x-vector encoder. 
hid_feats = [ - f.transpose(1, 2) for i, f in enumerate(hid_feats) + f.transpose(1, 2) + for i, f in enumerate(hid_feats) if i in return_feat_layers ] else: @@ -194,7 +188,8 @@ def forward( "h_classif" (list hidden classification head layers), "h_feats" (wav2vec features) """ feats, hid_feats, feat_lengths = self.forward_feats( - x, x_lengths, return_feat_layers) + x, x_lengths, return_feat_layers + ) output = self.xvector( feats, feat_lengths, @@ -225,21 +220,21 @@ def extract_embed( embed_layer=None, detach_chunks=False, ): - if vad_samples is not None: - x, x_lengths = remove_silence(x, x_lengths) + x, x_lengths = remove_silence(x, vad_samples, x_lengths) feats, _, feat_lengths = self.forward_feats( - x, - x_lengths, - chunk_length=hf_chunk_length, - detach_chunks=detach_chunks) - xvec_chunk_length = int(xvec_chunk_length * - self.hf_feats.sample_frequency * - feats.size(-1) // x.size(-1)) - return self.xvector.extract_embed(feats, feat_lengths, - xvec_chunk_length, embed_layer, - detach_chunks) + x, x_lengths, chunk_length=hf_chunk_length, detach_chunks=detach_chunks + ) + xvec_chunk_length = int( + xvec_chunk_length + * self.hf_feats.sample_frequency + * feats.size(-1) + // x.size(-1) + ) + return self.xvector.extract_embed( + feats, feat_lengths, xvec_chunk_length, embed_layer, detach_chunks + ) def freeze_feat_fuser(self): if self.feat_fuser is None: @@ -258,6 +253,26 @@ def freeze_hf_feats(self): def freeze_hf_feature_encoder(self): self.hf_feats.freeze_feature_encoder() + def freeze_hf_except_lora(self, bias=None): + self.hf_feats.freeze_except_lora(bias) + + def has_param_groups(self): + return self.hf_feats.has_param_groups() + + def trainable_param_groups(self): + if not self.has_param_groups(): + return self.trainable_parameters() + + param_groups = self.hf_feats.trainable_param_groups() + if self.feat_fusion_method == "weighted-avg": + if self.feat_fuser.requires_grad: + param_groups.append({"params": self.feat_fuser}) + else: + param_groups.append({"params": self.feat_fuser.parameters()}) + + param_groups.append({"params": self.xvector.trainable_parameters()}) + return param_groups + def set_train_mode(self, mode): if mode == self._train_mode: return @@ -281,12 +296,21 @@ def set_train_mode(self, mode): elif mode == "hf-feat-extractor-frozen": self.unfreeze() self.freeze_hf_feature_encoder() + elif mode == "hf-lora": + self.unfreeze() + self.freeze_hf_except_lora() + elif mode == "hf-all-bias-lora": + self.unfreeze() + self.freeze_hf_except_lora(bias="all") + elif mode == "hf-lora-with-bias": + self.unfreeze() + self.freeze_hf_except_lora(bias="lora_only") else: raise ValueError(f"invalid train_mode={mode}") logging.info("train mode set to %s", mode) - if "nograd" in mode: + if "nograd" in mode or mode == "ft-embed-affine": logging.info("using torch.no_grad for hf_feats") self._hf_context = torch.no_grad() else: @@ -295,18 +319,20 @@ def set_train_mode(self, mode): self._train_mode = mode def _train(self, train_mode: str): - if train_mode in ["full", "frozen"]: super()._train(train_mode) elif train_mode == "ft-embed-affine": self.hf_feats.train() self.xvector._train("ft-embed_affine") elif train_mode in [ - "ft-xvector", - "hf-feats-frozen", - "ft-xvector-nograd", - "hf-feats-frozen-nograd", - "hf-feat-extractor-frozen", + "ft-xvector", + "hf-feats-frozen", + "ft-xvector-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + "hf-lora", + "hf-all-bias-lora", + "hf-lora-with-bias", ]: self.hf_feats.train() self.xvector._train("full") @@ -324,6 +350,9 @@ def 
valid_train_modes(): "ft-xvector-nograd", "hf-feats-frozen-nograd", "hf-feat-extractor-frozen", + "hf-lora", + "hf-all-bias-lora", + "hf-lora-with-bias", ] @staticmethod @@ -338,7 +367,6 @@ def filter_args(**kwargs): return args def get_config(self): - hf_cfg = self.hf_feats.get_config() xvec_cfg = self.xvector.get_config() del hf_cfg["class_name"] @@ -360,7 +388,6 @@ def change_config(self, hf_feats, xvector): @staticmethod def add_class_args(parser, prefix=None, skip=set()): - if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") @@ -369,16 +396,19 @@ def add_class_args(parser, prefix=None, skip=set()): "--feat-fusion-start", default=0, type=int, - help= - ("the input to x-vector model will fuse the wav2vec layers from feat_fusion_start to" - "the wav2vec num_layers"), + help=( + "the input to x-vector model will fuse the wav2vec layers from feat_fusion_start to" + "the wav2vec num_layers" + ), ) parser.add_argument( "--feat-fusion-method", default="weighted-avg", choices=["weighted-avg", "linear", "cat", "last"], - help=("method to fuse the hidden layers from the wav2vec model " - "in [weighted-avg, cat]"), + help=( + "method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, cat]" + ), ) if prefix is not None: diff --git a/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py index 0d9f1bc4..0e4faded 100644 --- a/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py @@ -52,3 +52,21 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = ResNet1dXVector.filter_finetune_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + ResNet1dXVector.add_finetune_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py b/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py index 1f7283a0..11d643af 100644 --- a/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py @@ -52,3 +52,21 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = ResNetXVector.filter_finetune_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + ResNetXVector.add_finetune_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/wav2xvector.py b/hyperion/torch/models/wav2xvectors/wav2xvector.py index 4c21f478..4bbc0c4c 100644 --- a/hyperion/torch/models/wav2xvectors/wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2xvector.py @@ -2,6 +2,7 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) 
Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import contextlib import logging from jsonargparse import ActionParser, ArgumentParser @@ -35,6 +36,23 @@ def __init__(self, feats, xvector): self.feats = feats self.xvector = xvector + self._feats_context = contextlib.nullcontext() + + @property + def sample_frequency(self): + return self.feats.sample_frequency + + def compute_prototype_affinity(self): + return self.xvector.compute_prototype_affinity() + + def update_loss_margin(self, epoch): + """Updates the value of the margin in AAM/AM-softmax losses + given the epoch number + + Args: + epoch: epoch which is about to start + """ + self.xvector.update_loss_margin(epoch) def rebuild_output_layer( self, @@ -58,8 +76,9 @@ def rebuild_output_layer( num_subcenters=num_subcenters, ) - def compute_prototype_affinity(self): - return self.xvector.compute_prototype_affinity() + def change_config(self, xvector): + logging.info("changing wav2xvector config") + self.xvector.change_config(**xvector) def forward( self, @@ -73,15 +92,28 @@ def forward( return_output=True, ): - if vad_samples is not None: - x, x_lengths = remove_silence(x, x_lengths) - feats, feat_lengths = self.feats(x, x_lengths) - if vad_feats is not None: - feats, feat_lengths = remove_silence(feats, feat_lengths) - - # feat_lengths = torch.div(x_lengths * feats.size(-1), x.size(-1)) - return self.xvector(feats, feat_lengths, y, enc_layers, classif_layers, - return_output) + with self._feats_context: + if vad_samples is not None: + x, x_lengths = remove_silence(x, vad_samples, x_lengths) + + feats, feat_lengths = self.feats(x, x_lengths) + if vad_feats is not None: + feats, feat_lengths = remove_silence(feats, vad_feats, feat_lengths) + + n = torch.sum(~torch.isfinite(feats)) + if n > 0: + print( + "feats", + n, + torch.sum(torch.isnan(feats)), + torch.sum(torch.any(torch.isnan(x), dim=-1)), + x.dtype, + feats.dtype, + flush=True, + ) + return self.xvector( + feats, feat_lengths, y, enc_layers, classif_layers, return_output + ) def extract_embed( self, @@ -94,18 +126,54 @@ def extract_embed( detach_chunks=False, ): - if vad_samples is not None: - x, x_lengths = remove_silence(x, x_lengths) - feats, feat_lengths = self.feats(x, x_lengths) - if vad_feats is not None: - feats, feat_lengths = remove_silence(feats, feat_lengths) + with self._feats_context: + if vad_samples is not None: + x, x_lengths = remove_silence(x, vad_samples, x_lengths) - feats = feats.transpose(1, 2) - return self.xvector.extract_embed(feats, feat_lengths, chunk_length, - embed_layer, detach_chunks) + feats, feat_lengths = self.feats(x, x_lengths) + if vad_feats is not None: + feats, feat_lengths = remove_silence(feats, vad_feats, feat_lengths) + + chunk_length = int(chunk_length * feats.shape[1] / x.shape[-1]) + + return self.xvector.extract_embed( + feats, feat_lengths, chunk_length, embed_layer, detach_chunks + ) def set_train_mode(self, mode): - self.xvector.set_train_mode(mode) + if mode == self._train_mode: + return + + if mode == "full-feats-grad": + self._feats_context = contextlib.nullcontext() + xvector_mode = "full" + else: + logging.info("using torch.no_grad for feats") + self._feats_context = torch.no_grad() + + self.xvector.set_train_mode(xvector_mode) + self._train_mode = mode + + def _train(self, train_mode: str): + + self.feats.train() + if train_mode in ["frozen"]: + super()._train(train_mode) + elif train_mode in ["full-feats-grad", "full"]: + self.xvector._train("full") + elif train_mode == "ft-embed-affine": + 
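+ # in this mode the acoustic front-end runs under torch.no_grad()
+ # (set_train_mode only keeps feature gradients for "full-feats-grad"),
+ # so only the x-vector embedding/affine head is meant to keep training.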
self.xvector._train("ft-embed_affine") + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return [ + "full", + "frozen", + "ft-embed-affine", + "full-feats-grad", + ] def get_config(self): feat_cfg = self.feats.get_config() @@ -119,7 +187,7 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) @staticmethod - def filter_args(*kwargs): + def filter_args(**kwargs): """Filters Wav2XVector class arguments from arguments dictionary. Args: @@ -150,5 +218,4 @@ def add_class_args(parser, prefix=None): AudioFeatsMVN.add_class_args(parser, prefix="feats") if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index 8556104a..80203f5d 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -6,10 +6,9 @@ from enum import Enum from typing import Optional -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ....utils.misc import filter_func_args from ...layer_blocks import TDNNBlock @@ -52,7 +51,6 @@ def __init__( in_feats=None, proj_feats=None, ): - super().__init__() # encoder network @@ -355,15 +353,16 @@ def forward_hid_feats( max_in_length = x.size(-1) x = self._pre_enc(x) h_enc, x = self.encoder_net.forward_hid_feats( - x, return_enc_layers, return_logits=True + x, return_enc_layers, return_output=True ) output = {"h_enc": h_enc} if not return_logits and return_classif_layers is None: return output - + # logging.info(f"forward_hid_feats: x.shape={x.shape}") x, x_lengths = self._post_enc(x, x_lengths, max_in_length) + # logging.info(f"x_lengths: {x_lengths}") p = self.pool_net(x, x_lengths=x_lengths) - h_classif, y_pred = self.classif_net.forward_hid_feats( + h_classif = self.classif_net.forward_hid_feats( p, y, return_classif_layers, return_logits=return_logits ) if return_logits: @@ -407,7 +406,6 @@ def extract_embed_slidwin( embed_layer=None, detach_chunks=False, ): - if feat_frame_shift is not None: # assume win_length/shift are in secs, transform to frames # pass feat times from msecs to secs @@ -464,7 +462,6 @@ def compute_slidwin_timestamps( feat_frame_shift=10, feat_snip_edges=False, ): - P = self.compute_slidwin_left_padding( win_length, win_shift, @@ -495,7 +492,6 @@ def compute_slidwin_left_padding( feat_frame_shift=10, feat_snip_edges=False, ): - # pass feat times from msecs to secs feat_frame_shift = feat_frame_shift / 1000 feat_frame_length = feat_frame_length / 1000 @@ -526,7 +522,6 @@ def compute_slidwin_left_padding( return P1 + P2 def get_config(self): - enc_cfg = self.encoder_net.get_config() pool_cfg = PF.get_config(self.pool_net) @@ -558,6 +553,21 @@ def get_config(self): base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) + + def get_regularization_loss(self): + reg_loss = 0.0 + total_params = 0 + + for param in self.parameters(): + reg_loss += torch.norm(param)**2 + total_params += torch.numel(param) + + reg_loss = (reg_loss) / total_params + + return reg_loss + + + @classmethod def load(cls, file_path=None, cfg=None, state_dict=None): cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict) @@ -694,42 +704,14 @@ def 
valid_train_modes(): @staticmethod def filter_args(**kwargs): - # get arguments for pooling pool_args = PF.filter_args(**kwargs["pool_net"]) args = filter_func_args(ClassifHead.__init__, kwargs) args["pool_net"] = pool_args return args - # valid_args = ( - # "num_classes", - # "embed_dim", - # "num_embed_layers", - # "hid_act", - # "loss_type", - # "cos_scale", - # "margin", - # "margin_warmup_epochs", - # "intertop_k", - # "intertop_margin", - # "num_subcenters", - # "use_norm", - # "norm_before", - # "in_feats", - # "proj_feats", - # "dropout_rate", - # "norm_layer", - # "head_norm_layer", - # "head_use_in_norm", - # ) - # args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - # args["pool_net"] = pool_args - # return args - @staticmethod def add_class_args(parser, prefix=None, skip=set()): - if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") @@ -750,7 +732,7 @@ def add_class_args(parser, prefix=None, skip=set()): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/__init__.py b/hyperion/torch/narchs/__init__.py index 4fe8b4ed..049f5d23 100644 --- a/hyperion/torch/narchs/__init__.py +++ b/hyperion/torch/narchs/__init__.py @@ -22,6 +22,7 @@ from .resnet_factory import ResNetFactory from .rnn_encoder import RNNEncoder from .rnn_transducer_decoder import RNNTransducerDecoder +from .rnn_film_transducer_decoder import RNNFiLMTransducerDecoder from .spinenet import * from .spinenet_factory import SpineNetFactory from .tdnn import TDNNV1 diff --git a/hyperion/torch/narchs/audio_feats_mvn.py b/hyperion/torch/narchs/audio_feats_mvn.py index 160ee61b..440c22b6 100644 --- a/hyperion/torch/narchs/audio_feats_mvn.py +++ b/hyperion/torch/narchs/audio_feats_mvn.py @@ -32,7 +32,12 @@ def __init__( if mvn is not None: mvn = MVN.filter_args(**mvn) self.mvn_cfg = mvn - if mvn["norm_mean"] or mvn["norm_var"]: + if ( + ("norm_mean" in mvn) + and mvn["norm_mean"] + or ("norm_var" in mvn) + and mvn["norm_var"] + ): self.mvn = MVN(**mvn) self.spec_augment = None @@ -45,6 +50,10 @@ def __init__( self.trans = trans self.aug_after_mvn = aug_after_mvn + @property + def sample_frequency(self): + return self.audio_feats.fs + @property def fs(self): return self.audio_feats.fs @@ -79,7 +88,7 @@ def forward(self, x, x_lengths=None): if self.trans: f = f.transpose(1, 2).contiguous() - return f + return f, f_lengths def get_config(self): config = { diff --git a/hyperion/torch/narchs/classif_head.py b/hyperion/torch/narchs/classif_head.py index 9f9b280b..e5d90f4f 100644 --- a/hyperion/torch/narchs/classif_head.py +++ b/hyperion/torch/narchs/classif_head.py @@ -402,7 +402,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/dc1d_decoder.py b/hyperion/torch/narchs/dc1d_decoder.py index f5ab74d5..172a3d70 100644 --- a/hyperion/torch/narchs/dc1d_decoder.py +++ b/hyperion/torch/narchs/dc1d_decoder.py @@ -31,7 +31,7 @@ def __init__( conv_strides=2, conv_dilations=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, use_norm=True, @@ -389,7 +389,7 @@ def add_class_args(parser, prefix=None, head_channels=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + 
parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/dc1d_encoder.py b/hyperion/torch/narchs/dc1d_encoder.py index 0c331a5e..6cf7f4ca 100644 --- a/hyperion/torch/narchs/dc1d_encoder.py +++ b/hyperion/torch/narchs/dc1d_encoder.py @@ -28,7 +28,7 @@ def __init__( conv_strides=2, conv_dilations=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, use_norm=True, @@ -362,7 +362,7 @@ def add_class_args(parser, prefix=None, head_channels=False, in_feats=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/dc2d_decoder.py b/hyperion/torch/narchs/dc2d_decoder.py index 4106cbfd..68679e0b 100644 --- a/hyperion/torch/narchs/dc2d_decoder.py +++ b/hyperion/torch/narchs/dc2d_decoder.py @@ -31,7 +31,7 @@ def __init__( conv_strides=2, conv_dilations=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, use_norm=True, @@ -410,7 +410,7 @@ def add_class_args(parser, prefix=None, head_channels=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/dc2d_encoder.py b/hyperion/torch/narchs/dc2d_encoder.py index ce7b9677..bc7e4b33 100644 --- a/hyperion/torch/narchs/dc2d_encoder.py +++ b/hyperion/torch/narchs/dc2d_encoder.py @@ -29,7 +29,7 @@ def __init__( conv_strides=2, conv_dilations=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, use_norm=True, @@ -367,7 +367,7 @@ def add_class_args(parser, prefix=None, head_channels=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/fcnet.py b/hyperion/torch/narchs/fcnet.py index cdbf1940..a47f304e 100644 --- a/hyperion/torch/narchs/fcnet.py +++ b/hyperion/torch/narchs/fcnet.py @@ -125,7 +125,7 @@ def __init__( in_units, hid_units, out_units=0, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, dropout_rate=0, norm_layer=None, diff --git a/hyperion/torch/narchs/resnet.py b/hyperion/torch/narchs/resnet.py index 858cf4ea..5d3b9793 100644 --- a/hyperion/torch/narchs/resnet.py +++ b/hyperion/torch/narchs/resnet.py @@ -10,10 +10,16 @@ import torch.nn as nn from torch.nn import BatchNorm1d, Conv1d, Linear -from ..layer_blocks import (Res2NetBasicBlock, Res2NetBNBlock, - ResNetBasicBlock, ResNetBNBlock, - ResNetEndpointBlock, ResNetInputBlock, - SEResNetBasicBlock, SEResNetBNBlock) +from ..layer_blocks import ( + Res2NetBasicBlock, + Res2NetBNBlock, + ResNetBasicBlock, + ResNetBNBlock, + ResNetEndpointBlock, + ResNetInputBlock, + SEResNetBasicBlock, + SEResNetBNBlock, +) from ..layers import ActivationFactory as AF from ..layers import NormLayer2dFactory as NLF from ..utils import scale_seq_lengths, seq_lengths_to_mask @@ -69,7 +75,7 @@ def __init__( conv_channels=64, base_channels=64, out_units=0, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, in_kernel_size=7, in_stride=2, diff --git a/hyperion/torch/narchs/resnet1d_decoder.py b/hyperion/torch/narchs/resnet1d_decoder.py index 0c577174..9332724f 100644 --- 
a/hyperion/torch/narchs/resnet1d_decoder.py +++ b/hyperion/torch/narchs/resnet1d_decoder.py @@ -9,9 +9,13 @@ import torch import torch.nn as nn -from ..layer_blocks import (DC1dDecBlock, ResNet1dBasicDecBlock, - ResNet1dBNDecBlock, SEResNet1dBasicDecBlock, - SEResNet1dBNDecBlock) +from ..layer_blocks import ( + DC1dDecBlock, + ResNet1dBasicDecBlock, + ResNet1dBNDecBlock, + SEResNet1dBasicDecBlock, + SEResNet1dBNDecBlock, +) from ..layers import ActivationFactory as AF from ..layers import ICNR1d from ..layers import NormLayer1dFactory as NLF @@ -34,7 +38,7 @@ def __init__( resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, se_r=16, @@ -450,7 +454,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/resnet1d_encoder.py b/hyperion/torch/narchs/resnet1d_encoder.py index 5bdad186..97b244f3 100644 --- a/hyperion/torch/narchs/resnet1d_encoder.py +++ b/hyperion/torch/narchs/resnet1d_encoder.py @@ -12,10 +12,16 @@ import torch import torch.nn as nn -from ..layer_blocks import (DC1dEncBlock, Res2Net1dBasicBlock, - Res2Net1dBNBlock, ResNet1dBasicBlock, - ResNet1dBNBlock, ResNet1dEndpoint, - SEResNet1dBasicBlock, SEResNet1dBNBlock) +from ..layer_blocks import ( + DC1dEncBlock, + Res2Net1dBasicBlock, + Res2Net1dBNBlock, + ResNet1dBasicBlock, + ResNet1dBNBlock, + ResNet1dEndpoint, + SEResNet1dBasicBlock, + SEResNet1dBNBlock, +) from ..layers import ActivationFactory as AF from ..layers import NormLayer1dFactory as NLF from ..utils import seq_lengths_to_mask @@ -37,7 +43,7 @@ def __init__( resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, drop_connect_rate=0, @@ -472,7 +478,7 @@ def forward_hid_feats(self, x, x_lengths=None, layers=None, return_output=False) if self.head_channels > 0: x = self.head_block(x) - return x + return h, x def get_config(self): @@ -675,7 +681,7 @@ def add_class_args(parser, prefix=None, skip=set(["in_feats"])): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/resnet2d_decoder.py b/hyperion/torch/narchs/resnet2d_decoder.py index 426b37f5..0afa1acc 100644 --- a/hyperion/torch/narchs/resnet2d_decoder.py +++ b/hyperion/torch/narchs/resnet2d_decoder.py @@ -10,9 +10,13 @@ import torch import torch.nn as nn -from ..layer_blocks import (DC2dDecBlock, ResNet2dBasicDecBlock, - ResNet2dBNDecBlock, SEResNet2dBasicDecBlock, - SEResNet2dBNDecBlock) +from ..layer_blocks import ( + DC2dDecBlock, + ResNet2dBasicDecBlock, + ResNet2dBNDecBlock, + SEResNet2dBasicDecBlock, + SEResNet2dBNDecBlock, +) from ..layers import ActivationFactory as AF from ..layers import ICNR2d from ..layers import NormLayer2dFactory as NLF @@ -35,7 +39,7 @@ def __init__( resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, se_r=16, @@ -457,7 +461,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/resnet2d_encoder.py b/hyperion/torch/narchs/resnet2d_encoder.py index 
84e6599e..a7fd047e 100644 --- a/hyperion/torch/narchs/resnet2d_encoder.py +++ b/hyperion/torch/narchs/resnet2d_encoder.py @@ -11,10 +11,15 @@ import torch import torch.nn as nn -from ..layer_blocks import (DC2dEncBlock, Res2Net2dBasicBlock, - Res2Net2dBNBlock, ResNet2dBasicBlock, - ResNet2dBNBlock, SEResNet2dBasicBlock, - SEResNet2dBNBlock) +from ..layer_blocks import ( + DC2dEncBlock, + Res2Net2dBasicBlock, + Res2Net2dBNBlock, + ResNet2dBasicBlock, + ResNet2dBNBlock, + SEResNet2dBasicBlock, + SEResNet2dBNBlock, +) from ..layers import ActivationFactory as AF from ..layers import NormLayer2dFactory as NLF from ..utils import seq_lengths_to_mask @@ -38,7 +43,7 @@ class ResNet2dEncoder(NetArch): resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, se_r=16, @@ -65,7 +70,7 @@ def __init__( resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, se_r=16, @@ -511,7 +516,7 @@ def add_class_args(parser, prefix=None, skip=set()): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/resnet_factory.py b/hyperion/torch/narchs/resnet_factory.py index 2d17a6d7..35ed9af0 100644 --- a/hyperion/torch/narchs/resnet_factory.py +++ b/hyperion/torch/narchs/resnet_factory.py @@ -146,7 +146,7 @@ def create( conv_channels=64, base_channels=64, out_units=0, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, in_kernel_size=7, in_stride=2, @@ -341,7 +341,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py new file mode 100644 index 00000000..17bbe515 --- /dev/null +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -0,0 +1,952 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from dataclasses import dataclass +import logging +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torchaudio +import torchaudio.functional +from jsonargparse import ActionParser, ArgumentParser, ActionYesNo + +try: + import k2 +except ModuleNotFoundError: + from ...utils import dummy_k2 as k2 + +from ...utils.misc import filter_func_args +from ...utils.text import add_sos +from ..layer_blocks import TransducerFiLMJoiner as FiLMJoiner +from ..layer_blocks import TransducerJoiner as Joiner +from ..layer_blocks import TransducerRNNFiLMPredictor as FiLMRNNPredictor +from ..layer_blocks import TransducerRNNPredictor as RNNPredictor +from .net_arch import NetArch + + +@dataclass +class Hypothesis: + ys: List[int] # lid_pred sequences + log_prob: float # log prob of ys + + # Optional LSTM predictor state. 
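+ # Carried with each hypothesis so that beam search can resume the predictor
+ # from the last emitted token instead of re-running it over the whole ys
+ # prefix (see decode_time_sync_beam_search below); None means the state has
+ # not been computed yet.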
+ pred_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None + + +class RNNFiLMTransducerDecoder(NetArch): + """ RNN-T Decoder composed of Predictor and Joiner networks + Implementation based on + https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/transducer/transducer.py + + Attributes: + in_feats: input features dimension (encoder output) + vocab_size: Number of tokens of the modeling unit including blank. + predictor: Dictionary with the predictor options. + joiner: Dictionary with the joiner options. + blank_id: id of the null symbol. + rnnt_loss: type of rnn-t loss between torchaudio, k2 or k2_pruned. + rnnt_type: rnn-t variation between regular, modified or constrained. + delay_penalty: penalize symbol delay, which is used to make symbol + emit earlier. + reduction: type of reduction for rnn-t loss between sum or mean + prune_range: how many symbols to keep for each frame in k2 rnn-t + pruned loss. + lm_scale: language model scale in rnn-t smoothed loss. + am_scale: acoustic model scale in rnn-t smoothed loss. + simple_loss_scale: weight of rnn-t simple loss when using k2 pruned loss. + pruned_warmup_steps: number of steps to warm up the k2 rnn-t pruned loss + from 0.1 to 1. + """ + + def __init__( + self, + in_feats: int, + vocab_size: int, + predictor: Dict, + joiner: Dict, + blank_id: int = 0, + rnnt_loss: str = "k2_pruned", + rnnt_type: str = "regular", + delay_penalty: float = 0.0, + reduction: str = "sum", + prune_range: int = 5, + lm_scale: float = 0.25, + am_scale: float = 0.0, + simple_loss_scale: float = 0.5, + pruned_warmup_steps: int = 2000, + langs_size: int = 13, + condition_size: int = 64, + film_type: str = "linear", + film_cond_type: str = "one-hot", + ): + + super().__init__() + self.in_feats = in_feats + self.vocab_size = vocab_size + self.predictor_args = predictor + self.joiner_args = joiner + self.blank_id = blank_id + self.rnnt_loss = rnnt_loss + self.rnnt_type = rnnt_type + self.delay_penalty = delay_penalty + self.reduction = reduction + self.prune_range = prune_range + self.lm_scale = lm_scale + self.am_scale = am_scale + self.simple_loss_scale = simple_loss_scale + self.pruned_warmup_steps = pruned_warmup_steps + self.condition_size = condition_size + self.film_cond_type = film_cond_type + self.film_type = film_type + + + self._make_predictor() + self._make_joiner() + # make embedding layer for language id + self.lang_embedding = nn.Embedding(langs_size, condition_size) + if self.film_cond_type == "lid_pred": + self.lang_embedding = nn.Embedding(langs_size, condition_size) + # self.lid_lang_embedding = nn.Linear(langs_size, condition_size) + + if self.rnnt_loss == "k2_pruned": + self.simple_am_proj = nn.Linear(in_feats, vocab_size) + self.simple_lm_proj = nn.Linear(self.predictor.out_feats, + vocab_size) + self.register_buffer("cur_step", torch.as_tensor(0, + dtype=torch.int)) + + def _make_predictor(self): + pred_type = self.predictor_args["pred_type"] + self.predictor_args["in_feats"] = self.in_feats + self.predictor_args["vocab_size"] = self.vocab_size + self.predictor_args["blank_id"] = self.blank_id + self.predictor_args["condition_size"] = self.condition_size + # Add FiLM args to the predictor args + if pred_type == "rnn": + pred_args = filter_func_args(FiLMRNNPredictor.__init__, + self.predictor_args) + self.predictor = FiLMRNNPredictor(**pred_args, film_type=self.film_type, film_cond_type=self.film_cond_type) + elif pred_type == "rnn_original": + pred_args = filter_func_args(RNNPredictor.__init__, + self.predictor_args) + 
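+ # "rnn_original" keeps the baseline (non-FiLM) predictor: the language
+ # embedding is not used here, so conditioning is only applied in the
+ # joiner when a FiLM joiner is configured (see _make_joiner and forward).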
self.predictor = RNNPredictor(**pred_args) + # elif pred_type == "conv": + # pred_args = filter_func_args(ConvPredictor.__init__, + # self.predictor_args) + # self.predictor = ConvPredictor(**pred_args) + else: + raise ValueError(f"Unknown predictor type {pred_type}") + + def _make_joiner(self): + joiner_type = self.joiner_args["joiner_type"] + # Add FiLM args to the joiner args + + if joiner_type == "basic": + pred_feats = self.predictor_args["out_feats"] + hid_feats = self.joiner_args["hid_feats"] + self.joiner = FiLMJoiner(self.in_feats, pred_feats, hid_feats, + self.vocab_size, self.condition_size, film_type=self.film_type, film_cond_type=self.film_cond_type) + elif joiner_type == "original_joiner": + pred_feats = self.predictor_args["out_feats"] + hid_feats = self.joiner_args["hid_feats"] + self.joiner = Joiner(self.in_feats, pred_feats, hid_feats, + self.vocab_size) + else: + raise ValueError(f"Unknown joiner type {joiner_type}") + + def get_config(self): + config = { + "in_feats": self.in_feats, + "vocab_size": self.vocab_size, + "predictor": self.predictor_args, + "joiner": self.joiner_args, + "blank_id": self.blank_id, + "rnnt_loss": self.rnnt_loss, + "rnnt_type": self.rnnt_type, + "delay_penalty": self.delay_penalty, + "reduction": self.reduction, + "prune_range": self.prune_range, + "lm_scale": self.lm_scale, + "am_scale": self.am_scale, + "simple_loss_scale": self.simple_loss_scale, + "pruned_warmup_steps": self.pruned_warmup_steps, + "condition_size": self.condition_size, + "film_cond_type": self.film_cond_type, + "film_type": self.film_type, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def _rnnt_loss_torchaudio(self, x: torch.Tensor, x_lengths: torch.Tensor, + y: torch.Tensor, y_lengths: torch.Tensor, + pred_out: torch.Tensor, lang_embedding: torch.Tensor): + if self.joiner_args["joiner_type"] == "original_joiner": + logits = self.joiner(x, pred_out) + else: + logits = self.joiner(x, pred_out, lang_embedding) + # rnnt_loss requires 0 padded targets + # Note: y does not start with SOS + y_padded = y.pad(mode="constant", padding_value=0) + x_lengths = x_lengths.to(torch.int32) + loss = torchaudio.functional.rnnt_loss( + logits=logits, + targets=y_padded.to(torch.int32), + logit_lengths=x_lengths, + target_lengths=y_lengths, + blank=self.blank_id, + reduction=self.reduction, + ) + return loss + + def _rnnt_loss_k2(self, x: torch.Tensor, x_lengths: torch.Tensor, + y: torch.Tensor, y_lengths: torch.Tensor, + pred_out: torch.Tensor, lang_embedding: torch.Tensor): + y_padded = y.pad(mode="constant", padding_value=0) + y_padded = y_padded.to(torch.int64) + boundary = torch.zeros((x.size(0), 4), + dtype=torch.int64, + device=x.device) + boundary[:, 2] = y_lengths + boundary[:, 3] = x_lengths + + if self.joiner_args["joiner_type"] == "original_joiner": + logits = self.joiner(x, pred_out) + else: + logits = self.joiner(x, pred_out, lang_embedding) + + with torch.cuda.amp.autocast(enabled=False): + loss = k2.rnnt_loss( + logits=logits.float(), + symbols=y_padded, + termination_symbol=self.blank_id, + boundary=boundary, + rnnt_type=self.rnnt_type, + delay_penalty=self.delay_penalty, + reduction=self.reduction, + ) + return loss + + def _rnnt_loss_k2_pruned(self, x: torch.Tensor, x_lengths: torch.Tensor, + y: torch.Tensor, y_lengths: torch.Tensor, + pred_out: torch.Tensor, lang_embedding: torch.Tensor): + + y_padded = y.pad(mode="constant", padding_value=0) + y_padded = y_padded.to(torch.int64) + boundary = 
torch.zeros((x.size(0), 4), + dtype=torch.int64, + device=x.device) + boundary[:, 2] = y_lengths + boundary[:, 3] = x_lengths + + am_simple = self.simple_am_proj(x) + lm_simple = self.simple_lm_proj(pred_out) + with torch.cuda.amp.autocast(enabled=False): + loss_simple, (px_grad, py_grad) = k2.rnnt_loss_smoothed( + lm=lm_simple.float(), + am=am_simple.float(), + symbols=y_padded, + termination_symbol=self.blank_id, + lm_only_scale=self.lm_scale, + am_only_scale=self.am_scale, + boundary=boundary, + rnnt_type=self.rnnt_type, + delay_penalty=self.delay_penalty, + reduction=self.reduction, + return_grad=True, + ) + + # ranges : [B, T, prune_range] + ranges = k2.get_rnnt_prune_ranges( + px_grad=px_grad, + py_grad=py_grad, + boundary=boundary, + s_range=self.prune_range, + ) + + # am_pruned : [B, T, prune_range, encoder_dim] + # lm_pruned : [B, T, prune_range, decoder_dim] + am_pruned, lm_pruned = k2.do_rnnt_pruning( + am=self.joiner.enc_proj(x), + lm=self.joiner.pred_proj(pred_out), + ranges=ranges, + ) + + # logits : [B, T, prune_range, vocab_size] + + # project_input=False since we applied the decoder's input projections + # prior to do_rnnt_pruning (this is an optimization for speed). + + if self.joiner_args["joiner_type"] == "original_joiner": + logits = self.joiner(am_pruned, lm_pruned, project_input=False) + else: + logits = self.joiner(am_pruned, lm_pruned, lang_embedding, project_input=False) + + + with torch.cuda.amp.autocast(enabled=False): + loss_pruned = k2.rnnt_loss_pruned( + logits=logits.float(), + symbols=y_padded, + ranges=ranges, + termination_symbol=self.blank_id, + boundary=boundary, + rnnt_type=self.rnnt_type, + delay_penalty=self.delay_penalty, + reduction=self.reduction, + ) + + if self.cur_step > self.pruned_warmup_steps: + simple_loss_scale = self.simple_loss_scale + pruned_loss_scale = 1.0 + else: + r = self.cur_step / self.pruned_warmup_steps + simple_loss_scale = 1.0 - r * (1.0 - self.simple_loss_scale) + pruned_loss_scale = 0.1 + 0.9 * r + self.cur_step += 1 + # print(simple_loss_scale, pruned_loss_scale) + + loss = simple_loss_scale * loss_simple + pruned_loss_scale * loss_pruned + + return loss, loss_simple, loss_pruned + + def forward( + self, x: torch.Tensor, x_lengths: torch.Tensor, y: k2.RaggedTensor, lang_embedding: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + # embed lang + # logging.info(f"lang_embedding.shape: {lang_embedding.shape}") + # import pdb; pdb.set_trace() + if self.film_cond_type == "one-hot": + lang_embedding = self.lang_embedding(lang_embedding) + elif self.film_cond_type == "lid_pred": + lang_embedding = self.lang_embedding(lang_embedding) #self.lid_lang_embedding(lang_embedding) + # logging.info(f"lang_embedding.shape: {lang_embedding.shape}") + # logging.info(f"film_cond_type: {self.film_cond_type}") + # get y_lengths + row_splits = y.shape.row_splits(1) + y_lengths = row_splits[1:] - row_splits[:-1] + # shift y adding token + sos_y = add_sos(y, sos_id=self.blank_id) + sos_y_padded = sos_y.pad(mode="constant", padding_value=self.blank_id) + sos_y_padded = sos_y_padded.to(torch.int64) + # apply predictor and joiner + if self.predictor_args["pred_type"] == "rnn": + pred_out, _ = self.predictor(sos_y_padded, lang_embedding) + elif self.predictor_args["pred_type"] == "rnn_original": + pred_out, _ = self.predictor(sos_y_padded) + loss_simple = loss_pruned = None + if self.rnnt_loss == "k2_pruned": + loss, loss_simple, loss_pruned = self._rnnt_loss_k2_pruned( + x, x_lengths, y, y_lengths, pred_out, 
lang_embedding) + elif self.rnnt_loss == "k2": + loss = self._rnnt_loss_k2(x, x_lengths, y, y_lengths, pred_out, lang_embedding) + elif self.rnnt_loss == "torchaudio": + loss_simple = loss_pruned = None + loss = self._rnnt_loss_torchaudio(x, x_lengths, y, y_lengths, + pred_out, lang_embedding) + + return loss, loss_simple, loss_pruned + + def decode(self, + x: torch.Tensor, + lang: torch.Tensor, + x_lengths: torch.Tensor = None, + method="time_sync_beam_search", + beam_width: int = 5, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000, ) -> List[int]: + + # embed lang + # if self.film_cond_type in ["one-hot", "lid_pred"]: + # lang_embedding = self.lang_embedding(lang) + + if self.film_cond_type == "one-hot": + lang_embedding = self.lang_embedding(lang) + elif self.film_cond_type == "lid_pred": + lang_embedding = self.lang_embedding(lang) + + # lang_embedding = self.lid_lang_embedding(lang) + if method == "time_sync_beam_search": + return self.decode_time_sync_beam_search(x, + lang_embedding, + x_lengths, + beam_width=beam_width) + elif method == "align_length_sync_beam_search": + return self.decode_align_length_sync_beam_search( + x, + x_lengths, + lang_embedding, + beam_width=beam_width, + max_sym_per_utt=max_sym_per_utt) + elif method == "greedy": + return self.decode_greedy(x, + lang_embedding, + x_lengths, + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt) + + def decode_greedy(self, + x: torch.Tensor, + lang_embedding: torch.Tensor, + x_lengths: torch.Tensor = None, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000) -> List[int]: + """ + Args: + x: encoder embeddings with shape = (N, T, C) + Returns: + Decoded tokens + """ + assert x.ndim == 3 + + # support only batch_size == 1 for now + assert x.size(0) == 1, x.size(0) + blank_id = self.blank_id + device = x.device + + sos = torch.tensor([blank_id], device=device, + dtype=torch.int64).reshape(1, 1) + if self.predictor_args["pred_type"] == "rnn": + pred_out, (h, c) = self.predictor(sos, lang_embedding) + elif self.predictor_args["pred_type"] == "rnn_original": + pred_out, (h, c) = self.predictor(sos) + T = x.size(1) + t = 0 + hyp = [] + + sym_per_frame = 0 + sym_per_utt = 0 + + while t < T and sym_per_utt < max_sym_per_utt: + x_t = x[:, t:t + 1, :] + + if self.joiner_args["joiner_type"] == "original_joiner": + logits = self.joiner(x_t, pred_out) + else: + logits = self.joiner(x_t, pred_out, lang_embedding) # (1, 1, 1, vocab_size) + # logits is + + log_prob = logits.log_softmax(dim=-1) # (1, 1, 1, vocab_size) + # TODO: Use logits.argmax() + y = log_prob.argmax() + if y != blank_id: + hyp.append(y.item()) + y = y.reshape(1, 1) + if self.predictor_args["pred_type"] == "rnn": + pred_out, (h, c) = self.predictor(y, lang_embedding, (h, c)) + elif self.predictor_args["pred_type"] == "rnn_original": + pred_out, (h, c) = self.predictor(y, (h, c)) + + sym_per_utt += 1 + sym_per_frame += 1 + + if y == blank_id or sym_per_frame > max_sym_per_frame: + sym_per_frame = 0 + t += 1 + + return hyp + + def decode_time_sync_beam_search(self, + x: torch.Tensor, + lang_embedding: torch.Tensor, + x_lengths: torch.Tensor = None, + beam_width: int = 5) -> List[int]: + assert x.ndim == 3 + assert x.size(0) == 1, x.size(0) + + blank_id = self.blank_id + device = x.device + + sos = torch.tensor([blank_id], device=device).reshape(1, 1) + if self.predictor_args["pred_type"] == "rnn": + pred_out, (h, c) = self.predictor(sos, lang_embedding) + elif self.predictor_args["pred_type"] == "rnn_original": + pred_out, state = 
self.predictor(sos) + T = x.size(1) + t = 0 + B = [Hypothesis(ys=[blank_id], log_prob=0.0, pred_state=None)] + max_u = 20000 # terminate after this number of steps + u = 0 + + cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, + torch.Tensor]]] = {} + + while t < T and u < max_u: + x_t = x[:, t:t + 1, :] + A = B + B = [] + + while u < max_u: + y_star = max(A, key=lambda hyp: hyp.log_prob) + A.remove(y_star) + + # Note: y_star.ys is unhashable, i.e., cannot be used + # as a key into a dict + cached_key = "_".join(map(str, y_star.ys)) + + if cached_key not in cache: + pred_in = torch.tensor([y_star.ys[-1]], + device=device).reshape(1, 1) + + # pred_out, pred_state = self.predictor( + # pred_in, + # lang_embedding, + # y_star.pred_state, + # ) + if self.predictor_args["pred_type"] == "rnn": + pred_out, pred_state = self.predictor( + pred_in, + lang_embedding, + y_star.pred_state, + ) + elif self.predictor_args["pred_type"] == "rnn_original": + pred_out, pred_state = self.predictor(pred_in, y_star.pred_state,) + + cache[cached_key] = (pred_out, pred_state) + else: + pred_out, pred_state = cache[cached_key] + + if self.joiner_args["joiner_type"] == "original_joiner": + logits = self.joiner(x_t, pred_out) + else: + logits = self.joiner(x_t, pred_out, lang_embedding) + log_prob = logits.log_softmax(dim=-1) + # log_prob is (1, 1, 1, vocab_size) + log_prob = log_prob.squeeze() + # Now log_prob is (vocab_size,) + + # If we choose blank here, add the new hypothesis to B. + # Otherwise, add the new hypothesis to A + + # First, choose blank + skip_log_prob = log_prob[blank_id] + new_y_star_log_prob = y_star.log_prob + skip_log_prob.item() + # print("tuAB0", t, u, len(y_star.ys), y_star.log_prob, + # skip_log_prob.item(), new_y_star_log_prob) + # ys[:] returns a copy of ys + new_y_star = Hypothesis( + ys=y_star.ys[:], + log_prob=new_y_star_log_prob, + # Caution: Use y_star.decoder_state here + pred_state=y_star.pred_state, + ) + B.append(new_y_star) + + topk_log_prob = log_prob.topk(beam_width, dim=-1) + + # Second, choose other labels + #for i, v in enumerate(log_prob.tolist()): + for v, i in zip(*topk_log_prob): + v = v.item() + i = i.item() + if i == blank_id: + continue + new_ys = y_star.ys + [i] + new_log_prob = y_star.log_prob + v + new_hyp = Hypothesis( + ys=new_ys, + log_prob=new_log_prob, + pred_state=pred_state, + ) + A.append(new_hyp) + + u += 1 + # check whether B contains more than "beam" elements more probable + # than the most probable in A + A_most_probable = max(A, key=lambda hyp: hyp.log_prob) + #print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) + B = sorted( + [ + hyp + for hyp in B if hyp.log_prob > A_most_probable.log_prob + ], + key=lambda hyp: hyp.log_prob, + reverse=True, + ) + # print("tuAB2", + # t, + # u, + # len(A), + # A_most_probable.log_prob, + # len(B), + # flush=True) + if len(B) >= beam_width: + B = B[:beam_width] + break + t += 1 + + try: + best_hyp = max(B, + key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) + except: + return "" + ys = best_hyp.ys[1:] # [1:] to remove the blank + return ys + + def decode_align_length_sync_beam_search( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + lang_embedding: torch.Tensor, + beam_width: int = 5, + max_sym_per_utt: int = 1000) -> List[int]: + assert x.ndim == 3 + assert x.size(0) == 1, x.size(0) + + blank_id = self.blank_id + device = x.device + + sos = torch.tensor([blank_id], device=device).reshape(1, 1) + if self.predictor_args["pred_type"] == "rnn": + pred_out, (h, c) = self.predictor(sos, 
lang_embedding) + elif self.predictor_args["pred_type"] == "rnn_original": + pred_out, state = self.predictor(sos) + + T = x.size(1) + #t = 0 + B = [Hypothesis(ys=[blank_id], log_prob=0.0, pred_state=None)] + #max_u = 20000 # terminate after this number of steps + #u = 0 + + cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, + torch.Tensor]]] = {} + F = [] + #for t < T and u < max_u: + for i in range(T + max_sym_per_utt): + A = [] + for y_star in B: + #while u < max_u: + u = len(y_star.ys) - 1 + t = i - u + if t >= T: + continue + + #y_star = max(A, key=lambda hyp: hyp.log_prob) + #A.remove(y_star) + x_t = x[:, t:t + 1, :] + # Note: y_star.ys is unhashable, i.e., cannot be used + # as a key into a dict + cached_key = "_".join(map(str, y_star.ys)) + + if cached_key not in cache: + pred_in = torch.tensor([y_star.ys[-1]], + device=device).reshape(1, 1) + if self.predictor_args["pred_type"] == "rnn": + pred_out, pred_state = self.predictor( + pred_in, + lang_embedding, + y_star.pred_state, + ) + elif self.predictor_args["pred_type"] == "rnn_original": + pred_out, pred_state = self.predictor(pred_in, y_star.pred_state,) + cache[cached_key] = (pred_out, pred_state) + else: + pred_out, pred_state = cache[cached_key] + + if self.joiner_args["joiner_type"] == "original_joiner": + logits = self.joiner(x_t, pred_out) + else: + logits = self.joiner(x_t, pred_out, lang_embedding) + log_prob = logits.log_softmax(dim=-1) # (1, 1, 1, vocab_size) + log_prob = log_prob.squeeze() # (vocab_size,) + + # First, choose blank + skip_log_prob = log_prob[blank_id] + new_y_star_log_prob = y_star.log_prob + skip_log_prob.item() + # print("tuAB0", t, u, len(y_star.ys), y_star.log_prob, + # skip_log_prob.item(), new_y_star_log_prob) + # ys[:] returns a copy of ys + new_y_star = Hypothesis( + ys=y_star.ys[:], + log_prob=new_y_star_log_prob, + # Caution: Use y_star.pred_state here + pred_state=y_star.pred_state, + ) + A.append(new_y_star) + if t == T - 1: + F.append(y_star) + + topk_log_prob = log_prob.topk(beam_width, dim=-1) + + # Second, choose other labels + #for i, v in enumerate(log_prob.tolist()): + for v, i in zip(*topk_log_prob): + v = v.item() + i = i.item() + if i == blank_id: + continue + new_ys = y_star.ys + [i] + new_log_prob = y_star.log_prob + v + new_hyp = Hypothesis( + ys=new_ys, + log_prob=new_log_prob, + pred_state=pred_state, + ) + A.append(new_hyp) + + # check whether B contains more than "beam_width" elements more probable + # than the most probable in A + #A_most_probable = max(A, key=lambda hyp: hyp.log_prob) + #print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) + B0 = sorted( + [hyp for hyp in A], + key=lambda hyp: hyp.log_prob, + reverse=True, + ) + B = [] + B_ys = set() + for hyp in B0: + hyp_ys = tuple(hyp.ys) # to make ys hashable + if hyp_ys not in B_ys: + B.append(hyp) + B_ys.add(hyp_ys) + # print("tuAB2", + # t, + # u, + # len(A), + # A_most_probable.log_prob, + # len(B), + # flush=True) + if len(B) >= beam_width: + B = B[:beam_width] + break + + best_hyp = max(F, + key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) + ys = best_hyp.ys[1:] # [1:] to remove the blank + return ys + + def change_config( + self, + override_dropouts=False, + embed_dropout_rate: float = 0.0, + rnn_dropout_rate: float = 0.0, + prune_range: Optional[int] = None, + reduction: Optional[str] = None, + ): + logging.info("changing decoder config") + self.predictor.change_config(override_dropouts, embed_dropout_rate, + rnn_dropout_rate) + if prune_range is not None: + self.prune_range = 
prune_range + if reduction is not None: + self.reduction = reduction + + @staticmethod + def filter_args(**kwargs): + args = filter_func_args(RNNFiLMTransducerDecoder.__init__, kwargs) + return args + + @staticmethod + def filter_finetune_args(**kwargs): + args = filter_func_args(RNNFiLMTransducerDecoder.change_config, kwargs) + return args + + @staticmethod + def add_pred_args(parser): + + pred_parser = ArgumentParser(prog="") + pred_parser.add_argument( + "--pred-type", + default="rnn", + choices=["rnn", "conv"], + help= + """type of predictor between RNN and Convolutional [rnn, conv]""") + pred_parser.add_argument("--embed-dim", + default=1024, + type=int, + help=("token embedding dimension")) + pred_parser.add_argument( + "--embed-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for predictor input embeddings")) + pred_parser.add_argument("--rnn-dropout-rate", + default=0.0, + type=float, + help="""dropout prob for decoder RNN """) + pred_parser.add_argument( + "--rnn-type", + default="lstm", + choices=["lstm", "gru"], + help= + """type of recurrent network for thep predictor in [lstm, gru]""") + + + + + pred_parser.add_argument("--num-layers", + default=2, + type=int, + help="""number of layers of the predictor """) + + pred_parser.add_argument("--hid-feats", + default=512, + type=int, + help="""hidden features of the predictor""") + pred_parser.add_argument("--out-feats", + default=512, + type=int, + help="""output features of the predictor""") + pred_parser.add_argument("--context-size", + default=2, + type=int, + help="""context length of the convolutional + predictor, 1->bigram, 2-> trigram,...""") + + parser.add_argument("--predictor", + action=ActionParser(parser=pred_parser)) + + @staticmethod + def add_joiner_args(parser): + + pred_parser = ArgumentParser(prog="") + pred_parser.add_argument( + "--joiner-type", + default="basic", + choices=["basic"], + help= + """type of joiner network, there is only basic joiner for now""") + pred_parser.add_argument("--hid-feats", + default=512, + type=int, + help="""hidden features of the joiner""") + parser.add_argument("--joiner", + action=ActionParser(parser=pred_parser)) + + @staticmethod + def add_class_args(parser, + prefix=None, + skip=set(["in_feats", "blank_id", "vocab_size"])): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + if "in_feats" not in skip: + parser.add_argument("--in-feats", + type=int, + required=True, + help=("input feature dimension")) + if "blank_id" not in skip: + parser.add_argument("--blank-id", + type=int, + default=0, + help=("blank id from tokenizer model")) + if "vocab_size" not in skip: + parser.add_argument("--vocab-size", + type=int, + required=True, + help=("output prediction dimension")) + RNNFiLMTransducerDecoder.add_pred_args(parser) + RNNFiLMTransducerDecoder.add_joiner_args(parser) + parser.add_argument( + "--rnnt-loss", + default="k2_pruned", + choices=["torchaudio", "k2", "k2_pruned"], + help="""type of rnn-t loss between torchaudio, k2 or k2_pruned.""") + parser.add_argument( + "--rnnt-type", + default="regular", + choices=["regular", "modified", "constrained"], + help= + """type of rnn-t loss between regular, modified or constrained.""") + parser.add_argument( + "--delay-penalty", + default=0.0, + type=float, + help= + """penalize symbol delay, which is used to make symbol emit earlier + for streaming models.""") + parser.add_argument( + "--reduction", + default="sum", + choices=["sum", "mean"], + help="""type of reduction for rnn-t loss 
between sum or mean""") + parser.add_argument( + "--prune-range", + default=None, + type=Optional[int], + help="""how many symbols to keep for each frame in k2 rnn-t + pruned loss.""") + + parser.add_argument("--condition-size", + type=int, + required=True, + help=("condition vector dimension")) + + parser.add_argument("--film-cond-type", + default="one-hot", + choices=["one-hot", "lid_pred", "lid_pred_embed"], + help=("type of the condition of FiLM layer")) + + parser.add_argument("--film-type", + default="linear", + choices=["linear", "tanh"], + help=("type of the FiLM layer")) + parser.add_argument( + "--lm-scale", + default=0.25, + type=float, + help="""language model scale in rnn-t smoothed loss""") + parser.add_argument( + "--am-scale", + default=0.0, + type=float, + help="""acoustic model scale in rnn-t smoothed loss""") + parser.add_argument( + "--simple-loss-scale", + default=0.5, + type=float, + help="""weight of rnn-t simple loss when using k2 pruned loss""") + parser.add_argument( + "--pruned-warmup-steps", + default=2000, + type=int, + help="""number of steps to warm up the k2 rnn-t pruned loss + from 0.1 to 1""") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def add_finetune_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model.")) + parser.add_argument("--embed-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder input embeddings")) + parser.add_argument("--rnn-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder RNN ")) + + parser.add_argument( + "--prune-range", + default=5, + type=int, + help="""how many symbols to keep for each frame in k2 rnn-t + pruned loss.""") + + parser.add_argument( + "--reduction", + default="sum", + choices=["sum", "mean"], + help="""type of reduction for rnn-t loss between sum or mean""") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/rnn_transducer_decoder.py b/hyperion/torch/narchs/rnn_transducer_decoder.py index 8db6c23a..4dc5a5a4 100644 --- a/hyperion/torch/narchs/rnn_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_transducer_decoder.py @@ -4,11 +4,12 @@ """ from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +import logging +from typing import Dict, List, Optional, Tuple, Union import torchaudio import torchaudio.functional -from jsonargparse import ActionParser, ArgumentParser +from jsonargparse import ActionParser, ArgumentParser, ActionYesNo import torch import torch.nn as nn @@ -16,7 +17,7 @@ try: import k2 except ModuleNotFoundError: - from ...utils import dummy_k2 as k2 + from ..utils import dummy_k2 as k2 from ...utils.misc import filter_func_args from ...utils.text import add_sos @@ -76,6 +77,7 @@ def __init__( am_scale: float = 0.0, simple_loss_scale: float = 0.5, pruned_warmup_steps: int = 2000, + # film: bool=False, ): super().__init__() @@ -99,10 +101,8 @@ def __init__( if self.rnnt_loss == "k2_pruned": self.simple_am_proj = nn.Linear(in_feats, vocab_size) - self.simple_lm_proj = nn.Linear(self.predictor.out_feats, - vocab_size) - self.register_buffer("cur_step", torch.as_tensor(0, - 
dtype=torch.int)) + self.simple_lm_proj = nn.Linear(self.predictor.out_feats, vocab_size) + self.register_buffer("cur_step", torch.as_tensor(0, dtype=torch.int)) def _make_predictor(self): pred_type = self.predictor_args["pred_type"] @@ -110,12 +110,10 @@ def _make_predictor(self): self.predictor_args["vocab_size"] = self.vocab_size self.predictor_args["blank_id"] = self.blank_id if pred_type == "rnn": - pred_args = filter_func_args(RNNPredictor.__init__, - self.predictor_args) + pred_args = filter_func_args(RNNPredictor.__init__, self.predictor_args) self.predictor = RNNPredictor(**pred_args) elif pred_type == "conv": - pred_args = filter_func_args(ConvPredictor.__init__, - self.predictor_args) + pred_args = filter_func_args(ConvPredictor.__init__, self.predictor_args) self.predictor = ConvPredictor(**pred_args) self.predictor_args["out_feats"] = self.predictor.embed_dim else: @@ -127,8 +125,7 @@ def _make_joiner(self): if joiner_type == "basic": pred_feats = self.predictor_args["out_feats"] hid_feats = self.joiner_args["hid_feats"] - self.joiner = Joiner(self.in_feats, pred_feats, hid_feats, - self.vocab_size) + self.joiner = Joiner(self.in_feats, pred_feats, hid_feats, self.vocab_size) else: raise ValueError(f"Unknown joiner type {joiner_type}") @@ -152,9 +149,14 @@ def get_config(self): base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - def _rnnt_loss_torchaudio(self, x: torch.Tensor, x_lengths: torch.Tensor, - y: torch.Tensor, y_lengths: torch.Tensor, - pred_out: torch.Tensor): + def _rnnt_loss_torchaudio( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + y: torch.Tensor, + y_lengths: torch.Tensor, + pred_out: torch.Tensor, + ): logits = self.joiner(x, pred_out) # rnnt_loss requires 0 padded targets # Note: y does not start with SOS @@ -170,14 +172,17 @@ def _rnnt_loss_torchaudio(self, x: torch.Tensor, x_lengths: torch.Tensor, ) return loss - def _rnnt_loss_k2(self, x: torch.Tensor, x_lengths: torch.Tensor, - y: torch.Tensor, y_lengths: torch.Tensor, - pred_out: torch.Tensor): + def _rnnt_loss_k2( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + y: torch.Tensor, + y_lengths: torch.Tensor, + pred_out: torch.Tensor, + ): y_padded = y.pad(mode="constant", padding_value=0) y_padded = y_padded.to(torch.int64) - boundary = torch.zeros((x.size(0), 4), - dtype=torch.int64, - device=x.device) + boundary = torch.zeros((x.size(0), 4), dtype=torch.int64, device=x.device) boundary[:, 2] = y_lengths boundary[:, 3] = x_lengths @@ -195,15 +200,18 @@ def _rnnt_loss_k2(self, x: torch.Tensor, x_lengths: torch.Tensor, ) return loss - def _rnnt_loss_k2_pruned(self, x: torch.Tensor, x_lengths: torch.Tensor, - y: torch.Tensor, y_lengths: torch.Tensor, - pred_out: torch.Tensor): + def _rnnt_loss_k2_pruned( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + y: torch.Tensor, + y_lengths: torch.Tensor, + pred_out: torch.Tensor, + ): y_padded = y.pad(mode="constant", padding_value=0) y_padded = y_padded.to(torch.int64) - boundary = torch.zeros((x.size(0), 4), - dtype=torch.int64, - device=x.device) + boundary = torch.zeros((x.size(0), 4), dtype=torch.int64, device=x.device) boundary[:, 2] = y_lengths boundary[:, 3] = x_lengths @@ -266,7 +274,7 @@ def _rnnt_loss_k2_pruned(self, x: torch.Tensor, x_lengths: torch.Tensor, simple_loss_scale = 1.0 - r * (1.0 - self.simple_loss_scale) pruned_loss_scale = 0.1 + 0.9 * r self.cur_step += 1 - #print(simple_loss_scale, pruned_loss_scale) + # print(simple_loss_scale, pruned_loss_scale) loss = simple_loss_scale 
* loss_simple + pruned_loss_scale * loss_pruned @@ -288,44 +296,48 @@ def forward( loss_simple = loss_pruned = None if self.rnnt_loss == "k2_pruned": loss, loss_simple, loss_pruned = self._rnnt_loss_k2_pruned( - x, x_lengths, y, y_lengths, pred_out) + x, x_lengths, y, y_lengths, pred_out + ) elif self.rnnt_loss == "k2": loss = self._rnnt_loss_k2(x, x_lengths, y, y_lengths, pred_out) elif self.rnnt_loss == "torchaudio": loss_simple = loss_pruned = None - loss = self._rnnt_loss_torchaudio(x, x_lengths, y, y_lengths, - pred_out) + loss = self._rnnt_loss_torchaudio(x, x_lengths, y, y_lengths, pred_out) return loss, loss_simple, loss_pruned - def decode(self, - x: torch.Tensor, - x_lengths: torch.Tensor = None, - method="time_sync_beam_search", - beam_width: int = 5, - max_sym_per_frame: int = 3, - max_sym_per_utt: int = 1000) -> List[int]: + def decode( + self, + x: torch.Tensor, + x_lengths: torch.Tensor = None, + method="time_sync_beam_search", + beam_width: int = 5, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000, + ) -> List[int]: if method == "time_sync_beam_search": - return self.decode_time_sync_beam_search(x, - x_lengths, - beam_width=beam_width) + return self.decode_time_sync_beam_search( + x, x_lengths, beam_width=beam_width + ) elif method == "align_length_sync_beam_search": return self.decode_align_length_sync_beam_search( + x, x_lengths, beam_width=beam_width, max_sym_per_utt=max_sym_per_utt + ) + elif method == "greedy": + return self.decode_greedy( x, x_lengths, - beam_width=beam_width, - max_sym_per_utt=max_sym_per_utt) - elif method == "greedy": - return self.decode_greedy(x, - x_lengths, - max_sym_per_frame=max_sym_per_frame, - max_sym_per_utt=max_sym_per_utt) - - def decode_greedy(self, - x: torch.Tensor, - x_lengths: torch.Tensor = None, - max_sym_per_frame: int = 3, - max_sym_per_utt: int = 1000) -> List[int]: + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt, + ) + + def decode_greedy( + self, + x: torch.Tensor, + x_lengths: torch.Tensor = None, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000, + ) -> List[int]: """ Args: x: encoder embeddings with shape = (N, T, C) @@ -339,8 +351,7 @@ def decode_greedy(self, blank_id = self.blank_id device = x.device - sos = torch.tensor([blank_id], device=device, - dtype=torch.int64).reshape(1, 1) + sos = torch.tensor([blank_id], device=device, dtype=torch.int64).reshape(1, 1) pred_out, state = self.predictor(sos) T = x.size(1) t = 0 @@ -350,7 +361,7 @@ def decode_greedy(self, sym_per_utt = 0 while t < T and sym_per_utt < max_sym_per_utt: - x_t = x[:, t:t + 1, :] + x_t = x[:, t : t + 1, :] logits = self.joiner(x_t, pred_out) # (1, 1, 1, vocab_size) # logits is @@ -371,10 +382,9 @@ def decode_greedy(self, return hyp - def decode_time_sync_beam_search(self, - x: torch.Tensor, - x_lengths: torch.Tensor = None, - beam_width: int = 5) -> List[int]: + def decode_time_sync_beam_search( + self, x: torch.Tensor, x_lengths: torch.Tensor = None, beam_width: int = 5 + ) -> List[int]: assert x.ndim == 3 assert x.size(0) == 1, x.size(0) @@ -389,11 +399,10 @@ def decode_time_sync_beam_search(self, max_u = 20000 # terminate after this number of steps u = 0 - cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, - torch.Tensor]]] = {} + cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = {} while t < T and u < max_u: - x_t = x[:, t:t + 1, :] + x_t = x[:, t : t + 1, :] A = B B = [] @@ -406,13 +415,9 @@ def decode_time_sync_beam_search(self, cached_key = "_".join(map(str, y_star.ys)) 
if cached_key not in cache: - pred_in = torch.tensor([y_star.ys[-1]], - device=device).reshape(1, 1) + pred_in = torch.tensor([y_star.ys[-1]], device=device).reshape(1, 1) - pred_out, pred_state = self.predictor( - pred_in, - y_star.pred_state, - ) + pred_out, pred_state = self.predictor(pred_in, y_star.pred_state,) cache[cached_key] = (pred_out, pred_state) else: pred_out, pred_state = cache[cached_key] @@ -443,7 +448,7 @@ def decode_time_sync_beam_search(self, topk_log_prob = log_prob.topk(beam_width, dim=-1) # Second, choose other labels - #for i, v in enumerate(log_prob.tolist()): + # for i, v in enumerate(log_prob.tolist()): for v, i in zip(*topk_log_prob): v = v.item() i = i.item() @@ -452,9 +457,7 @@ def decode_time_sync_beam_search(self, new_ys = y_star.ys + [i] new_log_prob = y_star.log_prob + v new_hyp = Hypothesis( - ys=new_ys, - log_prob=new_log_prob, - pred_state=pred_state, + ys=new_ys, log_prob=new_log_prob, pred_state=pred_state, ) A.append(new_hyp) @@ -462,12 +465,9 @@ def decode_time_sync_beam_search(self, # check whether B contains more than "beam" elements more probable # than the most probable in A A_most_probable = max(A, key=lambda hyp: hyp.log_prob) - #print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) + # print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) B = sorted( - [ - hyp - for hyp in B if hyp.log_prob > A_most_probable.log_prob - ], + [hyp for hyp in B if hyp.log_prob > A_most_probable.log_prob], key=lambda hyp: hyp.log_prob, reverse=True, ) @@ -483,17 +483,21 @@ def decode_time_sync_beam_search(self, break t += 1 - best_hyp = max(B, - key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) + try: + best_hyp = max(B, + key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) + except: + return "" ys = best_hyp.ys[1:] # [1:] to remove the blank return ys def decode_align_length_sync_beam_search( - self, - x: torch.Tensor, - x_lengths: torch.Tensor, - beam_width: int = 5, - max_sym_per_utt: int = 1000) -> List[int]: + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + beam_width: int = 5, + max_sym_per_utt: int = 1000, + ) -> List[int]: assert x.ndim == 3 assert x.size(0) == 1, x.size(0) @@ -503,39 +507,34 @@ def decode_align_length_sync_beam_search( sos = torch.tensor([blank_id], device=device).reshape(1, 1) pred_out, state = self.predictor(sos) T = x.size(1) - #t = 0 + # t = 0 B = [Hypothesis(ys=[blank_id], log_prob=0.0, pred_state=None)] - #max_u = 20000 # terminate after this number of steps - #u = 0 + # max_u = 20000 # terminate after this number of steps + # u = 0 - cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, - torch.Tensor]]] = {} + cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = {} F = [] - #for t < T and u < max_u: + # for t < T and u < max_u: for i in range(T + max_sym_per_utt): A = [] for y_star in B: - #while u < max_u: + # while u < max_u: u = len(y_star.ys) - 1 t = i - u if t >= T: continue - #y_star = max(A, key=lambda hyp: hyp.log_prob) - #A.remove(y_star) - x_t = x[:, t:t + 1, :] + # y_star = max(A, key=lambda hyp: hyp.log_prob) + # A.remove(y_star) + x_t = x[:, t : t + 1, :] # Note: y_star.ys is unhashable, i.e., cannot be used # as a key into a dict cached_key = "_".join(map(str, y_star.ys)) if cached_key not in cache: - pred_in = torch.tensor([y_star.ys[-1]], - device=device).reshape(1, 1) + pred_in = torch.tensor([y_star.ys[-1]], device=device).reshape(1, 1) - pred_out, pred_state = self.predictor( - pred_in, - y_star.pred_state, - ) + pred_out, pred_state = 
self.predictor(pred_in, y_star.pred_state,) cache[cached_key] = (pred_out, pred_state) else: pred_out, pred_state = cache[cached_key] @@ -563,7 +562,7 @@ def decode_align_length_sync_beam_search( topk_log_prob = log_prob.topk(beam_width, dim=-1) # Second, choose other labels - #for i, v in enumerate(log_prob.tolist()): + # for i, v in enumerate(log_prob.tolist()): for v, i in zip(*topk_log_prob): v = v.item() i = i.item() @@ -572,20 +571,16 @@ def decode_align_length_sync_beam_search( new_ys = y_star.ys + [i] new_log_prob = y_star.log_prob + v new_hyp = Hypothesis( - ys=new_ys, - log_prob=new_log_prob, - pred_state=pred_state, + ys=new_ys, log_prob=new_log_prob, pred_state=pred_state, ) A.append(new_hyp) # check whether B contains more than "beam_width" elements more probable # than the most probable in A - #A_most_probable = max(A, key=lambda hyp: hyp.log_prob) - #print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) + # A_most_probable = max(A, key=lambda hyp: hyp.log_prob) + # print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) B0 = sorted( - [hyp for hyp in A], - key=lambda hyp: hyp.log_prob, - reverse=True, + [hyp for hyp in A], key=lambda hyp: hyp.log_prob, reverse=True, ) B = [] B_ys = set() @@ -605,8 +600,7 @@ def decode_align_length_sync_beam_search( B = B[:beam_width] break - best_hyp = max(F, - key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) + best_hyp = max(F, key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) ys = best_hyp.ys[1:] # [1:] to remove the blank return ys @@ -615,10 +609,16 @@ def change_config( override_dropouts=False, embed_dropout_rate: float = 0.0, rnn_dropout_rate: float = 0.0, + prune_range: Optional[int] = None, + reduction: Optional[str] = None, ): logging.info("changing decoder config") self.predictor.change_config(override_dropouts, embed_dropout_rate, rnn_dropout_rate) + if prune_range is not None: + self.prune_range = prune_range + if reduction is not None: + self.reduction = reduction @staticmethod def filter_args(**kwargs): @@ -638,49 +638,58 @@ def add_pred_args(parser): "--pred-type", default="rnn", choices=["rnn", "conv"], - help= - """type of predictor between RNN and Convolutional [rnn, conv]""") - pred_parser.add_argument("--embed-dim", - default=1024, - type=int, - help=("token embedding dimension")) + help="""type of predictor between RNN and Convolutional [rnn, conv]""", + ) + pred_parser.add_argument( + "--embed-dim", default=1024, type=int, help=("token embedding dimension") + ) pred_parser.add_argument( "--embed-dropout-rate", default=0.0, type=float, - help=("dropout prob for predictor input embeddings")) - pred_parser.add_argument("--rnn-dropout-rate", - default=0.0, - type=float, - help="""dropout prob for decoder RNN """) + help=("dropout prob for predictor input embeddings"), + ) + pred_parser.add_argument( + "--rnn-dropout-rate", + default=0.0, + type=float, + help="""dropout prob for decoder RNN """, + ) pred_parser.add_argument( "--rnn-type", default="lstm", choices=["lstm", "gru"], - help= - """type of recurrent network for thep predictor in [lstm, gru]""") - - pred_parser.add_argument("--num-layers", - default=2, - type=int, - help="""number of layers of the predictor """) - - pred_parser.add_argument("--hid-feats", - default=512, - type=int, - help="""hidden features of the predictor""") - pred_parser.add_argument("--out-feats", - default=512, - type=int, - help="""output features of the predictor""") - pred_parser.add_argument("--context-size", - default=2, - type=int, - help="""context length of 
the convolutional - predictor, 1->bigram, 2-> trigram,...""") - - parser.add_argument("--predictor", - action=ActionParser(parser=pred_parser)) + help="""type of recurrent network for thep predictor in [lstm, gru]""", + ) + + pred_parser.add_argument( + "--num-layers", + default=2, + type=int, + help="""number of layers of the predictor """, + ) + + pred_parser.add_argument( + "--hid-feats", + default=512, + type=int, + help="""hidden features of the predictor""", + ) + pred_parser.add_argument( + "--out-feats", + default=512, + type=int, + help="""output features of the predictor""", + ) + pred_parser.add_argument( + "--context-size", + default=2, + type=int, + help="""context length of the convolutional + predictor, 1->bigram, 2-> trigram,...""", + ) + + parser.add_argument("--predictor", action=ActionParser(parser=pred_parser)) @staticmethod def add_joiner_args(parser): @@ -690,39 +699,43 @@ def add_joiner_args(parser): "--joiner-type", default="basic", choices=["basic"], - help= - """type of joiner network, there is only basic joiner for now""") - pred_parser.add_argument("--hid-feats", - default=512, - type=int, - help="""hidden features of the joiner""") - parser.add_argument("--joiner", - action=ActionParser(parser=pred_parser)) + help="""type of joiner network, there is only basic joiner for now""", + ) + pred_parser.add_argument( + "--hid-feats", + default=512, + type=int, + help="""hidden features of the joiner""", + ) + parser.add_argument("--joiner", action=ActionParser(parser=pred_parser)) @staticmethod - def add_class_args(parser, - prefix=None, - skip=set(["in_feats", "blank_id", "vocab_size"])): + def add_class_args( + parser, prefix=None, skip=set(["in_feats", "blank_id", "vocab_size"]) + ): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") if "in_feats" not in skip: - parser.add_argument("--in-feats", - type=int, - required=True, - help=("input feature dimension")) + parser.add_argument( + "--in-feats", type=int, required=True, help=("input feature dimension") + ) if "blank_id" not in skip: - parser.add_argument("--blank-id", - type=int, - default=0, - help=("blank id from tokenizer model")) + parser.add_argument( + "--blank-id", + type=int, + default=0, + help=("blank id from tokenizer model"), + ) if "vocab_size" not in skip: - parser.add_argument("--vocab-size", - type=int, - required=True, - help=("output prediction dimension")) + parser.add_argument( + "--vocab-size", + type=int, + required=True, + help=("output prediction dimension"), + ) RNNTransducerDecoder.add_pred_args(parser) RNNTransducerDecoder.add_joiner_args(parser) @@ -730,56 +743,62 @@ def add_class_args(parser, "--rnnt-loss", default="k2_pruned", choices=["torchaudio", "k2", "k2_pruned"], - help="""type of rnn-t loss between torchaudio, k2 or k2_pruned.""") + help="""type of rnn-t loss between torchaudio, k2 or k2_pruned.""", + ) parser.add_argument( "--rnnt-type", default="regular", choices=["regular", "modified", "constrained"], - help= - """type of rnn-t loss between regular, modified or constrained.""") + help="""type of rnn-t loss between regular, modified or constrained.""", + ) parser.add_argument( "--delay-penalty", default=0.0, type=float, - help= - """penalize symbol delay, which is used to make symbol emit earlier - for streaming models.""") + help="""penalize symbol delay, which is used to make symbol emit earlier + for streaming models.""", + ) parser.add_argument( "--reduction", default="sum", choices=["sum", "mean"], - help="""type of reduction for rnn-t loss 
between sum or mean""") + help="""type of reduction for rnn-t loss between sum or mean""", + ) parser.add_argument( "--prune-range", - default=5, - type=int, + default=None, + type=Optional[int], help="""how many symbols to keep for each frame in k2 rnn-t - pruned loss.""") + pruned loss.""", + ) parser.add_argument( "--lm-scale", default=0.25, type=float, - help="""language model scale in rnn-t smoothed loss""") + help="""language model scale in rnn-t smoothed loss""", + ) parser.add_argument( "--am-scale", default=0.0, type=float, - help="""acoustic model scale in rnn-t smoothed loss""") + help="""acoustic model scale in rnn-t smoothed loss""", + ) parser.add_argument( "--simple-loss-scale", default=0.5, type=float, - help="""weight of rnn-t simple loss when using k2 pruned loss""") + help="""weight of rnn-t simple loss when using k2 pruned loss""", + ) parser.add_argument( "--pruned-warmup-steps", default=2000, type=int, help="""number of steps to warm up the k2 rnn-t pruned loss - from 0.1 to 1""") + from 0.1 to 1""", + ) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) @staticmethod def add_finetune_args(parser, prefix=None, skip=set()): @@ -794,16 +813,35 @@ def add_finetune_args(parser, prefix=None, skip=set()): action=ActionYesNo, help=( "whether to use the dropout probabilities passed in the " - "arguments instead of the defaults in the pretrained model.")) - parser.add_argument("--embed-dropout-rate", - default=0.0, - type=float, - help=("dropout prob for decoder input embeddings")) - parser.add_argument("--rnn-dropout-rate", - default=0.0, - type=float, - help=("dropout prob for decoder RNN ")) + "arguments instead of the defaults in the pretrained model." 
+ ), + ) + parser.add_argument( + "--embed-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder input embeddings"), + ) + parser.add_argument( + "--rnn-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder RNN "), + ) + + + parser.add_argument( + "--reduction", + default="sum", + choices=["sum", "mean"], + help="""type of reduction for rnn-t loss between sum or mean""") + + parser.add_argument( + "--prune-range", + default=5, + type=int, + help="""how many symbols to keep for each frame in k2 rnn-t + pruned loss.""") if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/spinenet.py b/hyperion/torch/narchs/spinenet.py index 117c0733..4349dbe1 100644 --- a/hyperion/torch/narchs/spinenet.py +++ b/hyperion/torch/narchs/spinenet.py @@ -11,9 +11,17 @@ import torch.nn as nn from torch.nn import BatchNorm1d, Conv1d, Linear -from ..layer_blocks import (BlockSpec, Res2NetBasicBlock, Res2NetBNBlock, - ResNetBasicBlock, ResNetBNBlock, ResNetInputBlock, - SpineConv, SpineEndpoints, SpineResample) +from ..layer_blocks import ( + BlockSpec, + Res2NetBasicBlock, + Res2NetBNBlock, + ResNetBasicBlock, + ResNetBNBlock, + ResNetInputBlock, + SpineConv, + SpineEndpoints, + SpineResample, +) from ..layers import ActivationFactory as AF from ..layers import NormLayer2dFactory as NLF from .net_arch import NetArch @@ -111,7 +119,7 @@ def __init__( do_endpoint_conv=True, concat_ax=3, upsampling_type="nearest", - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, in_kernel_size=7, in_stride=2, diff --git a/hyperion/torch/narchs/spinenet_factory.py b/hyperion/torch/narchs/spinenet_factory.py index 092cbd0e..871b37e9 100644 --- a/hyperion/torch/narchs/spinenet_factory.py +++ b/hyperion/torch/narchs/spinenet_factory.py @@ -44,7 +44,7 @@ def create( conv_channels=64, base_channels=64, out_units=0, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, in_kernel_size=7, in_stride=2, @@ -243,7 +243,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/tdnn_factory.py b/hyperion/torch/narchs/tdnn_factory.py index 901cc9d0..77f69b9c 100644 --- a/hyperion/torch/narchs/tdnn_factory.py +++ b/hyperion/torch/narchs/tdnn_factory.py @@ -21,7 +21,7 @@ def create( kernel_size=3, dilation=1, dilation_factor=1, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_units=0, out_act=None, dropout_rate=0, @@ -194,7 +194,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/transformer_encoder_v1.py b/hyperion/torch/narchs/transformer_encoder_v1.py index 4468185e..f8b50491 100644 --- a/hyperion/torch/narchs/transformer_encoder_v1.py +++ b/hyperion/torch/narchs/transformer_encoder_v1.py @@ -64,7 +64,7 @@ def __init__( in_layer_type="conv2d-sub", rel_pos_enc=False, causal_pos_enc=False, - hid_act="relu6", + hid_act="relu", norm_before=True, concat_after=False, padding_idx=-1, @@ -408,7 +408,7 @@ 
def add_class_args(parser, prefix=None, in_feats=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/torch_model.py b/hyperion/torch/torch_model.py index 65e5884d..e7020e1d 100644 --- a/hyperion/torch/torch_model.py +++ b/hyperion/torch/torch_model.py @@ -2,19 +2,19 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os from collections import OrderedDict as ODict from copy import deepcopy from enum import Enum from typing import Optional +from pathlib import Path import torch import torch.nn as nn class TorchModel(nn.Module): - """Base class for all Pytorch Models and NNet architectures - """ + """Base class for all Pytorch Models and NNet architectures""" + registry = {} def __init_subclass__(cls, **kwargs): @@ -45,6 +45,12 @@ def non_trainable_parameters(self, recurse: bool = True): if not param.requires_grad: yield param + def has_param_groups(self): + return False + + def trainable_param_groups(self): + return self.trainable_parameters() + def freeze(self): for param in self.parameters(): param.requires_grad = False @@ -104,15 +110,12 @@ def valid_train_modes(): return ["full", "frozen"] def save(self, file_path): - file_dir = os.path.dirname(file_path) - if not (os.path.isdir(file_dir)): - os.makedirs(file_dir, exist_ok=True) - - config = self.get_config() - torch.save({ - "model_cfg": self.get_config(), - "model_state_dict": self.state_dict() - }) + file_path = Path(file_path) + file_path.parent.mkdir(parents=True, exist_ok=True) + torch.save( + {"model_cfg": self.get_config(), "model_state_dict": self.state_dict()}, + file_path, + ) @staticmethod def _load_cfg_state_dict(file_path=None, cfg=None, state_dict=None): @@ -132,8 +135,7 @@ def _load_cfg_state_dict(file_path=None, cfg=None, state_dict=None): @classmethod def load(cls, file_path=None, cfg=None, state_dict=None): - cfg, state_dict = TorchModel._load_cfg_state_dict( - file_path, cfg, state_dict) + cfg, state_dict = TorchModel._load_cfg_state_dict(file_path, cfg, state_dict) model = cls(**cfg) if state_dict is not None: @@ -148,14 +150,15 @@ def get_loss(self): @property def device(self): - devices = {param.device - for param in self.parameters() - } | {buf.device - for buf in self.buffers()} + devices = {param.device for param in self.parameters()} | { + buf.device for buf in self.buffers() + } if len(devices) != 1: raise RuntimeError( "Cannot determine device: {} different devices found".format( - len(devices))) + len(devices) + ) + ) return next(iter(devices)) @@ -171,7 +174,7 @@ def _fix_cfg_compatibility(class_obj, cfg): Fixed configuration dictionary. 
""" # for compatibility with older x-vector models - XVector = torch_model_registry["xvector"] + XVector = TorchModel.registry["XVector"] if issubclass(class_obj, XVector): # We renamed AM-softmax scale parameer s to cos_scale if "s" in cfg: @@ -190,8 +193,9 @@ def auto_load(file_path, extra_objs={}, map_location=None): cfg = model_data["model_cfg"] class_name = cfg["class_name"] del cfg["class_name"] - if class_name in torch_model_registry: - class_obj = torch_model_registry[class_name] + print(TorchModel.registry) + if class_name in TorchModel.registry: + class_obj = TorchModel.registry[class_name] elif class_name in extra_objs: class_obj = extra_objs[class_name] else: @@ -217,5 +221,4 @@ def auto_load(file_path, extra_objs={}, map_location=None): # if it failed the 3 trials raise exception raise err # remove module prefix when is trained with dataparallel - state_dict = ODict( - (p.sub("", k), v) for k, v in state_dict.items()) + state_dict = ODict((p.sub("", k), v) for k, v in state_dict.items()) diff --git a/hyperion/torch/tpm/hf/hf_hubert.py b/hyperion/torch/tpm/hf/hf_hubert.py index b2198924..638bf561 100644 --- a/hyperion/torch/tpm/hf/hf_hubert.py +++ b/hyperion/torch/tpm/hf/hf_hubert.py @@ -6,11 +6,10 @@ import os from typing import Callable, List, Optional, Tuple, Union -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from transformers import HubertConfig, HubertModel - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import HubertConfig, HubertModel from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs from .hf_wav2vec_base import HFWav2VecBase @@ -135,6 +134,14 @@ class HFHubert(HFWav2VecBase): chunk by chunk, if it is too long to fit in GPU. right_encoder_context: (`int`): future context frames used by the transformer encoder. sample_frequency: (`int`) waveform sample frequency used to train the model. + feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. + encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. + use_lora: use low-rank adapters + lora_components: list of components where we apply LoRA, eg [Wq, Wv] + lora_rank: rank of LoRA + lora_alpha: scale for LoRA + lora_dropout: dropout rate for LoRA + lora_merge_weights: lora weights are merged with the pretrained weights at inference. 
""" def __init__( @@ -182,8 +189,15 @@ def __init__( left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, + feat_extract_lr: Optional[float] = None, + encoder_lr: Optional[float] = None, + use_lora: bool = False, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, ): - super().__init__( pretrained_model_path=pretrained_model_path, normalize_input=normalize_input, @@ -199,6 +213,14 @@ def __init__( left_encoder_context=left_encoder_context, right_encoder_context=right_encoder_context, sample_frequency=sample_frequency, + feat_extract_lr=feat_extract_lr, + encoder_lr=encoder_lr, + use_lora=use_lora, + lora_components=lora_components, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + lora_merge_weights=lora_merge_weights, ) if pretrained_model_path is not None and not ignore_pretrained: @@ -277,6 +299,15 @@ def __init__( if drop_layers_gt is not None: self.drop_upper_layers(drop_layers_gt) + if use_lora: + self._make_lora_layers( + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + self.ignore_pretrained = True @property @@ -287,6 +318,32 @@ def num_encoder_layers(self): def hidden_size(self): return self.hf_config.hidden_size + def change_dropouts( + self, + hidden_dropout: float = 0.1, + activation_dropout: float = 0.1, + attention_dropout: float = 0.1, + feat_proj_dropout: float = 0.1, + **kwargs, + ): + import transformers.models.hubert.modeling_hubert as t + + self.hf_model.config.hidden_dropout = hidden_dropout + self.hf_model.config.activation_dropout = activation_dropout + self.hf_model.config.attention_dropout = attention_dropout + self.hf_model.config.feat_proj_dropout = feat_proj_dropout + + self.hf_model.feature_projection.dropout.p = feat_proj_dropout + for module in self.hf_model.encoder.modules(): + if isinstance(module, nn.Dropout): + module.p = hidden_dropout + + for module in self.hf_model.encoder.modules(): + if isinstance(module, t.HubertAttention): + module.dropout = activation_dropout + if isinstance(module, t.HubertFeatureProjection): + module.intermediate_dropout.p = activation_dropout + def drop_upper_layers(self, max_layers: int): if max_layers >= self.hf_config.num_hidden_layers: return @@ -586,7 +643,7 @@ def add_class_args(parser, prefix=None, skip=set()): @staticmethod def filter_finetune_args(**kwargs): - args_base = HFWav2VecBase.filter_args(**kwargs) + args_base = HFWav2VecBase.filter_finetune_args(**kwargs) valid_args = ( "hidden_dropout", "activation_dropout", diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py index e1f21153..d2638acd 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec2.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py @@ -6,15 +6,14 @@ import os from typing import Callable, List, Optional, Tuple, Union -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from transformers import Wav2Vec2Config, Wav2Vec2Model - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import Wav2Vec2Config, Wav2Vec2Model from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs from .hf_wav2vec_base import HFWav2VecBase - +from .wav2vec2.modeling_wav2vec2 import Wav2Vec2CondModel class HFWav2Vec2(HFWav2VecBase): r"""This is wrapper over HuggingFace Wav2Vec2 model. 
@@ -148,6 +147,14 @@ class HFWav2Vec2(HFWav2VecBase): chunk by chunk, if it is too long to fit in GPU. right_encoder_context: (`int`): future context frames used by the transformer encoder. sample_frequency: (`int`) waveform sample frequency used to train the model. + feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. + encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. + use_lora: use low-rank adapters + lora_components: list of components where we apply LoRA, eg [Wq, Wv] + lora_rank: rank of LoRA + lora_alpha: scale for LoRA + lora_dropout: dropout rate for LoRA + lora_merge_weights: lora weights are merged with the pretrained weights at inference. """ def __init__( @@ -197,11 +204,24 @@ def __init__( ignore_pretrained: bool = False, override_dropouts: bool = False, override_spec_augment: bool = False, + override_lora: bool = False, + override_condition: bool = False, left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, + feat_extract_lr: Optional[float] = None, + encoder_lr: Optional[float] = None, + use_lora: bool = False, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = False, + use_condition: bool = False, + condition_size: int = 128, + condition_components: List[str] = ["attention"], + condition_type: str = "one-hot", ): - super().__init__( pretrained_model_path=pretrained_model_path, normalize_input=normalize_input, @@ -214,9 +234,23 @@ def __init__( ignore_pretrained=ignore_pretrained, override_dropouts=override_dropouts, override_spec_augment=override_spec_augment, + override_lora=override_lora, + override_condition=override_condition, left_encoder_context=left_encoder_context, right_encoder_context=right_encoder_context, sample_frequency=sample_frequency, + feat_extract_lr=feat_extract_lr, + encoder_lr=encoder_lr, + use_lora=use_lora, + lora_components=lora_components, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + lora_merge_weights=lora_merge_weights, + use_condition=use_condition, + condition_size=condition_size, + condition_components=condition_components, + condition_type=condition_type, ) if pretrained_model_path is not None and not ignore_pretrained: @@ -247,6 +281,8 @@ def __init__( self.change_config( override_dropouts=self.override_dropouts, override_spec_augment=self.override_spec_augment, + override_lora=self.override_lora, + override_condition=self.override_condition, hidden_dropout=hidden_dropout, activation_dropout=activation_dropout, attention_dropout=attention_dropout, @@ -300,6 +336,21 @@ def __init__( if drop_layers_gt is not None: self.drop_upper_layers(drop_layers_gt) + if use_lora: + self._make_lora_layers( + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + if use_condition: + self._make_condition_layers( + condition_size, + condition_components, + condition_type, + ) + self.ignore_pretrained = True @property @@ -681,7 +732,7 @@ def add_class_args(parser, prefix=None, skip=set()): @staticmethod def filter_finetune_args(**kwargs): - args_base = HFWav2VecBase.filter_args(**kwargs) + args_base = HFWav2VecBase.filter_finetune_args(**kwargs) valid_args = ( "hidden_dropout", "activation_dropout", diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py index b0a815c7..9f799ded 100644 --- 
a/hyperion/torch/tpm/hf/hf_wav2vec_base.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py @@ -8,15 +8,18 @@ from turtle import right from typing import List, Optional, Tuple, Union -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor +from ....utils.misc import filter_func_args +from ...layers import LoRAFactory +import loralib as lora from ...torch_model import TorchModel from ...utils import scale_seq_lengths, seq_lengths_to_mask from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs +from .wav2vec2.modeling_wav2vec2 import Wav2Vec2CondModel class HFWav2VecBase(TorchModel): @@ -53,6 +56,14 @@ class HFWav2VecBase(TorchModel): chunk by chunk, if it is too long to fit in GPU. right_encoder_context: (`int`): future context frames used by the transformer encoder. sample_frequency: (`int`) waveform sample frequency used to train the model. + feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. + encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. + use_lora: use low-rank adapters + lora_components: list of components where we apply LoRA, eg [Wq, Wv] + lora_rank: rank of LoRA + lora_alpha: scale for LoRA + lora_dropout: dropout rate for LoRA + lora_merge_weights: lora weights are merged with the pretrained weights at inference. """ def __init__( @@ -68,9 +79,23 @@ def __init__( ignore_pretrained: bool = False, override_dropouts: bool = False, override_spec_augment: bool = False, + override_lora: bool = False, + override_condition: bool = False, left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, + feat_extract_lr: Optional[float] = None, + encoder_lr: Optional[float] = None, + use_lora: bool = False, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = False, + use_condition: bool = False, + condition_size: int = 128, + condition_components: List[str] = ["attention"], + condition_type: str = "one-hot", ): super().__init__() self.pretrained_model_path = pretrained_model_path @@ -82,8 +107,22 @@ def __init__( self.ignore_pretrained = ignore_pretrained self.override_dropouts = override_dropouts self.override_spec_augment = override_spec_augment + self.override_lora = override_lora + self.override_condition = override_condition self.right_encoder_context = right_encoder_context self.left_encoder_context = left_encoder_context + self.feat_extract_lr = feat_extract_lr + self.encoder_lr = encoder_lr + self.use_lora = use_lora + self.lora_components = lora_components + self.lora_rank = lora_rank + self.lora_alpha = lora_alpha + self.lora_dropout = lora_dropout + self.lora_merge_weights = lora_merge_weights + self.use_condition = use_condition + self.condition_size = condition_size + self.condition_components = condition_components + self.condition_type = condition_type if pretrained_model_path is not None and not ignore_pretrained: rank = ddp_get_rank() @@ -147,6 +186,7 @@ def __init__( self._feature_encoder_context = None self._frame_shift = None + self.hf_model = None def __deepcopy__(self, memo): """Reimplementation of deepcopy for Hugging Face models. 
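
The `use_condition`, `condition_size`, `condition_components` and `condition_type` attributes stored above feed the FiLM-style conditioning added in the hunks that follow (`change_condition`, `_make_condition_layers`, `Wav2Vec2CondModel`). As a rough illustration of what feature-wise linear modulation does with a condition vector such as a language embedding, and assuming nothing about the actual `Wav2Vec2CondModel` internals:

```python
# Minimal sketch of FiLM (feature-wise linear modulation) conditioning.
# Illustrative only; the real conditioning layers live in Wav2Vec2CondModel
# and the FiLM joiner/predictor, not in this class.
import torch
import torch.nn as nn


class FiLMSketch(nn.Module):
    def __init__(self, feat_dim: int, condition_size: int):
        super().__init__()
        # project the condition vector to a per-channel scale (gamma) and shift (beta)
        self.to_gamma = nn.Linear(condition_size, feat_dim)
        self.to_beta = nn.Linear(condition_size, feat_dim)

    def forward(self, x: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
        # x: (batch, time, feat_dim), cond: (batch, condition_size)
        gamma = self.to_gamma(cond).unsqueeze(1)  # (batch, 1, feat_dim)
        beta = self.to_beta(cond).unsqueeze(1)
        return gamma * x + beta  # modulate every frame with the same condition
```
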
@@ -215,15 +255,57 @@ def out_shape(self, in_shape): C = self.hf_model.config.hidden_size return (in_shape[0], out_length, C) - def change_config(self, override_dropouts, override_spec_augment, **kwargs): + def change_config( + self, + override_dropouts: bool, + override_spec_augment: bool, + override_lora: bool, + override_condition: bool, + feat_extract_lr: Optional[float] = None, + encoder_lr: Optional[float] = None, + use_lora: bool = False, + use_condition: bool = False, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = False, + condition_size: int = 128, + condition_components: List[str] = ["attention"], + condition_type: str = "one-hot", + **kwargs, + ): if override_spec_augment: - logging.info("overriding speech augment") + logging.info(f"overriding speech augment with args={kwargs}") self.change_spec_augment(**kwargs) if override_dropouts: - logging.info("overriding hf model dropouts") + logging.info(f"overriding hf model dropouts with args={kwargs}") self.change_dropouts(**kwargs) + if override_lora: + logging.info("overriding LoRA config") + self.change_lora( + use_lora=use_lora, + lora_components=lora_components, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + lora_merge_weights=lora_merge_weights, + ) + + if override_condition: + logging.info(f"overriding Condition config") + self.change_condition( + use_condition=use_condition, + condition_size=condition_size, + condition_components=condition_components, + condition_type=condition_type, + ) + + self.feat_extract_lr = feat_extract_lr + self.encoder_lr = encoder_lr + def change_spec_augment( self, apply_spec_augment: bool = True, @@ -243,12 +325,185 @@ def change_spec_augment( self.hf_model.config.mask_feature_length = mask_feature_length self.hf_model.config.mask_feature_min_masks = mask_feature_min_masks + def change_lora( + self, + use_lora: bool = False, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = False, + ): + if not self.use_lora: + if use_lora: + self._make_lora_layers( + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + pass + else: + # TODO + pass + else: + if use_lora: + # TODO + pass + else: + # TODO + pass + + self.use_lora = use_lora + self.lora_components = lora_components + self.lora_rank = lora_rank + self.lora_alpha = lora_alpha + self.lora_dropout = lora_dropout + self.lora_merge_weights = lora_merge_weights + + def _make_lora_layers( + self, + lora_components: List[str], + lora_rank: int, + lora_alpha: int, + lora_dropout: float, + lora_merge_weights: bool, + ): + counts = {k: 0 for k in lora_components} + self._recursive_replace_layer_by_lora( + self.hf_model, + counts, + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + for k, v in counts.items(): + logging.info("count of LoRA layers for %s = %d", k, v) + assert v > 0, f"did not make any {k} LoRA" + + @staticmethod + def _recursive_replace_layer_by_lora( + model: nn.Module, + counts: dict, + lora_components: List[str], + lora_rank: int, + lora_alpha: int, + lora_dropout: float, + lora_merge_weights: bool, + ): + for name, module in model.named_children(): + if len(list(module.children())) > 0: + HFWav2VecBase._recursive_replace_layer_by_lora( + module, + counts, + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + 
lora_merge_weights, + ) + if isinstance(module, nn.Linear) and name in lora_components: + lora_layer = LoRAFactory.create_from_pretrained( + module, + r=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + merge_weights=lora_merge_weights, + ) + setattr(model, name, lora_layer) + counts[name] += 1 + + def change_condition(self, + use_condition: bool = False, + condition_size: int = 128, + condition_components: List[str] = ["attention"], + condition_type: str = "one-hot", + ): + if not self.use_condition: + if use_condition: + self._make_condition_layers( + condition_size, + condition_components, + condition_type, + ) + else: + pass + else: + if use_condition: + pass + else: + pass + self.use_condition = use_condition + self.condition_size = condition_size + self.condition_components = condition_components + self.condition_type = condition_type + + def _make_condition_layers(self, + condition_size: int, + condition_components: List[str], + condition_type: str, + ): + # TODO: copy weight from self.hf_model to self.hf_model_with_condition + config = self.hf_model.config + config.condition_size = condition_size + config.condition_components = condition_components + config.condition_type = condition_type + + hf_model_with_condition = Wav2Vec2CondModel(config) + self._copy_condition_weights(self.hf_model, hf_model_with_condition) + # TODO: make weight for the FiLM layers (0,1) + self.hf_model = hf_model_with_condition + + + def _copy_condition_weights(self, hf_model, hf_model_with_condition): + for name, param in hf_model.named_parameters(): + if name in hf_model_with_condition.state_dict(): + hf_model_with_condition.state_dict()[name].data.copy_(param.data) + def change_dropouts(self, **kwargs): pass # needs to be overloaded def freeze_feature_encoder(self): self.hf_model.freeze_feature_encoder() + def freeze_except_lora(self, bias=None): + bias = "none" if bias is None else bias + from ...layers.lora import mark_only_lora_as_trainable + + mark_only_lora_as_trainable(self.hf_model, bias=bias) + + def has_param_groups(self): + return self.feat_extract_lr is not None or self.encoder_lr is not None + + def trainable_param_groups(self): + if not self.has_param_groups(): + return self.trainable_parameters() + + if self.feat_extract_lr == self.encoder_lr: + return [{"params": self.trainable_parameters(), "lr": self.encoder_lr}] + + param_groups = [ + {"params": self.hf_model.feature_extractor.parameters()}, + {"params": self.hf_model.feature_projection.parameters()}, + {"params": self.hf_model.encoder.parameters()}, + ] + if self.hf_model.adapter is not None: + param_groups.append({"params": self.hf_model.adapter.parameters()}) + + if self.feat_extract_lr is not None: + param_groups[0]["lr"] = self.feat_extract_lr + param_groups[1]["lr"] = self.feat_extract_lr + + if self.encoder_lr is not None: + param_groups[2]["lr"] = self.encoder_lr + if len(param_groups) == 4: + param_groups[3]["lr"] = self.encoder_lr + + return param_groups + @property def hf_config(self): return self.hf_model.config @@ -257,14 +512,14 @@ def _normalize(self, x, x_mask=None): """Normalizes the audio to have zero mean and unit variance.""" if x_mask is None: x = x - x.mean(dim=1, keepdim=True) - std = torch.sqrt((x ** 2).mean(dim=1, keepdim=True) + 1e-7) + std = torch.sqrt((x**2).mean(dim=1, keepdim=True) + 1e-7) x = x / std else: x_mask = x_mask.to(dtype=x.dtype) x_samples = torch.mean(x_mask, dim=1, keepdim=True) x_mean = torch.mean(x * x_mask, dim=1, keepdim=True) / x_samples - x2_mean = torch.mean(x ** 2 * 
x_mask, dim=1, keepdim=True) / x_samples - std = torch.sqrt(x2_mean - x_mean ** 2 + 1e-7) + x2_mean = torch.mean(x**2 * x_mask, dim=1, keepdim=True) / x_samples + std = torch.sqrt(x2_mean - x_mean**2 + 1e-7) x = (x - x_mean) / std return x @@ -283,6 +538,7 @@ def forward( self, x: torch.Tensor, x_lengths: Optional[torch.LongTensor] = None, + condition_features: Optional[torch.Tensor] = None, return_attentions: bool = False, return_hid_states: bool = False, chunk_length: float = 0, @@ -313,11 +569,12 @@ def forward( (tuple(torch.FloatTensor)). """ if chunk_length == 0 or x.size(1) < chunk_length * self.sample_frequency: - return self.forward_impl(x, x_lengths, return_attentions, return_hid_states) + return self.forward_impl(x, x_lengths, condition_features, return_attentions, return_hid_states) else: return self.forward_long_impl( x, x_lengths, + condition_features, return_attentions, return_hid_states, chunk_length, @@ -328,6 +585,7 @@ def forward_impl( self, x: torch.Tensor, x_lengths: Optional[torch.LongTensor] = None, + condition_features: Optional[torch.Tensor] = None, return_attentions: bool = False, return_hid_states: bool = False, ): @@ -356,12 +614,41 @@ def forward_impl( """ max_in_length = x.size(-1) x, x_mask = self._preprocess(x, x_lengths) - output = self.hf_model( - x, - x_mask, - output_attentions=return_attentions, - output_hidden_states=return_hid_states, - ) + # if ddp_get_rank() == 0: + # lora_layer = self.hf_model.encoder.layers[0].attention.v_proj + # print( + # "lora\nw=", + # lora_layer.weight[:3, :3], + # "\na=", + # lora_layer.lora_A[:3, :3], + # "\nb=", + # lora_layer.lora_B[:3, :3], + # "\n", + # "merged=", + # lora_layer.merged, + # "training=", + # lora_layer.training, + # flush=True, + # ) + # assert self.training == lora_layer.training + # assert self.training == (not lora_layer.merged) + + if condition_features is not None: + output = self.hf_model( + x, + condition_features, + x_mask, + output_attentions=return_attentions, + output_hidden_states=return_hid_states, + ) + + else: + output = self.hf_model( + x, + x_mask, + output_attentions=return_attentions, + output_hidden_states=return_hid_states, + ) max_out_length = output.last_hidden_state.size(1) feat_lengths = ( None @@ -376,6 +663,7 @@ def forward_long_impl( self, x: torch.Tensor, x_lengths: Optional[torch.LongTensor] = None, + condition_features: Optional[torch.Tensor] = None, return_attentions: bool = False, return_hid_states: bool = False, chunk_length: float = 120.0, @@ -432,12 +720,21 @@ def forward_long_impl( stop_i = min(start + chunk_length + right_context, x.size(1)) x_i = x[:, start_i:stop_i] x_mask_i = None if x_mask is None else x_mask[start_i:stop_i] - output_i = self.hf_model( - x_i, - x_mask_i, - output_attentions=return_attentions, - output_hidden_states=return_hid_states, - ) + if condition_features is not None: + output_i = self.hf_model( + x_i, + x_mask_i, + condition_features=condition_features, + output_attentions=return_attentions, + output_hidden_states=return_hid_states, + ) + else: + output_i = self.hf_model( + x_i, + x_mask_i, + output_attentions=return_attentions, + output_hidden_states=return_hid_states, + ) if i < num_chunks - 1: start_out_i = max( @@ -499,14 +796,6 @@ def forward_long_impl( else scale_seq_lengths(x_lengths, max_out_length, max_in_length) ) output["hidden_states_lengths"] = feat_lengths - # print( - # "lens", - # mol0, - # max_out_length, - # output.last_hidden_state.size(1), - # output.hidden_states[0].size(1), - # flush=True, - # ) return output def 
get_config(self): @@ -524,9 +813,23 @@ def get_config(self): "ignore_pretrained": self.ignore_pretrained, "override_dropouts": self.override_dropouts, "override_spec_augment": self.override_spec_augment, + "override_lora": self.override_lora, + "override_condition": self.override_condition, "left_encoder_context": self.left_encoder_context, "right_encoder_context": self.right_encoder_context, "sample_frequency": self.sample_frequency, + "feat_extract_lr": self.feat_extract_lr, + "encoder_lr": self.encoder_lr, + "use_lora": self.use_lora, + "lora_components": self.lora_components, + "lora_rank": self.lora_rank, + "lora_alpha": self.lora_alpha, + "lora_dropout": self.lora_dropout, + "lora_merge_weights": self.lora_merge_weights, + "use_condition": self.use_condition, + "condition_size": self.condition_size, + "condition_components": self.condition_components, + "condition_type": self.condition_type, } base_config = super().get_config() @@ -539,24 +842,106 @@ def save(self, file_path: str): @staticmethod def filter_args(**kwargs): - valid_args = ( - "pretrained_model_path", - "normalize_input", - "use_input_attention_mask", - "cache_dir", - "force_download", - "resume_download", - "revision", - "drop_layers_gt", - "ignore_pretrained", - "override_dropouts", - "override_spec_augment", - "left_encoder_context", - "right_encoder_context", - "sample_frequency", + return filter_func_args(HFWav2VecBase.__init__, kwargs) + # valid_args = ( + # "pretrained_model_path", + # "normalize_input", + # "use_input_attention_mask", + # "cache_dir", + # "force_download", + # "resume_download", + # "revision", + # "drop_layers_gt", + # "ignore_pretrained", + # "override_dropouts", + # "override_spec_augment", + # "left_encoder_context", + # "right_encoder_context", + # "sample_frequency", + # ) + # args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + # return args + + @staticmethod + def _add_lr_args(parser): + parser.add_argument( + "--feat-extractor-lr", + default=None, + type=float, + help=( + "lr for conv feature extractor, it serves to set a lr " + "different than the global one." + ), + ) + parser.add_argument( + "--encoder-lr", + default=None, + type=float, + help=( + "lr for transformer encoder, it serves to set a lr " + "different than the global one." 
+ ), + ) + + @staticmethod + def _add_lora_args(parser): + parser.add_argument( + "--use-lora", + default=False, + action=ActionYesNo, + help="use low-rank adapters", + ) + parser.add_argument( + "--lora-components", + default=["q_proj", "v_proj"], + nargs="+", + choices=[ + "k_proj", + "q_proj", + "v_proj", + "out_proj", + "intermediate_dense", + "output_dense", + ], + help="list of components where we apply LoRA, eg [Wq, Wv]", + ) + parser.add_argument("--lora-rank", default=4, help="rank of LoRA") + parser.add_argument("--lora-alpha", default=1.0, help="scale for LoRA") + parser.add_argument("--lora-dropout", default=0.0, help="dropout rate for LoRA") + parser.add_argument( + "--lora-merge-weights", + default=True, + action=ActionYesNo, + help="lora weights are merged with the pretrained weights at inference.", + ) + + def _add_condition_args(parser): + parser.add_argument( + "--use-condition", + default=False, + action=ActionYesNo, + help="use condition", + ) + parser.add_argument( + "--condition-size", + default=128, + type=int, + help="size of the condition", + ) + parser.add_argument( + "--condition-components", + default=["attention"], + nargs="+", + choices=["attention"], + help="list of components where we apply condition, eg [attention]", + ) + parser.add_argument( + "--condition-type", + default="one-hot", + choices=["one-hot", "learned"], + help="type of condition", ) - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - return args + @staticmethod def add_class_args(parser, prefix=None, skip=set()): @@ -570,7 +955,6 @@ def add_class_args(parser, prefix=None, skip=set()): help=("file path or HuggingFace Hub path to pre-trained model"), ) - parser.add_argument( "--normalize-input", default=True, @@ -660,17 +1044,22 @@ def add_class_args(parser, prefix=None, skip=set()): ), ) + HFWav2VecBase._add_lr_args(parser) + HFWav2VecBase._add_lora_args(parser) + HFWav2VecBase._add_condition_args(parser) + if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) @staticmethod def filter_finetune_args(**kwargs): - valid_args = ( - "override_dropouts", - "override_spec_augment", - ) - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - return args + return filter_func_args(HFWav2VecBase.change_config, kwargs) + # valid_args = ( + # "override_dropouts", + # "override_spec_augment", + # ) + # args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + # return args @staticmethod def add_finetune_args(parser, prefix=None, skip=set()): @@ -696,6 +1085,22 @@ def add_finetune_args(parser, prefix=None, skip=set()): "arguments instead of the defaults in the pretrained model." 
), ) + parser.add_argument( + "--override-lora", + default=False, + action=ActionYesNo, + help=("whether to change the config of LoRA layers in the model."), + ) + + parser.add_argument( + "--override-condition", + default=False, + action=ActionYesNo, + help=("whether to change the config of condition layers in the model."), + ) + HFWav2VecBase._add_lr_args(parser) + HFWav2VecBase._add_lora_args(parser) + HFWav2VecBase._add_condition_args(parser) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/tpm/hf/hf_wavlm.py b/hyperion/torch/tpm/hf/hf_wavlm.py index 0d5c5ad3..1db5fa23 100644 --- a/hyperion/torch/tpm/hf/hf_wavlm.py +++ b/hyperion/torch/tpm/hf/hf_wavlm.py @@ -6,11 +6,10 @@ import os from typing import Callable, List, Optional, Tuple, Union -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from transformers import WavLMConfig, WavLMModel - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import WavLMConfig, WavLMModel from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs from .hf_wav2vec_base import HFWav2VecBase @@ -148,6 +147,14 @@ class HFWavLM(HFWav2VecBase): chunk by chunk, if it is too long to fit in GPU. right_encoder_context: (`int`): future context frames used by the transformer encoder. sample_frequency: (`int`) waveform sample frequency used to train the model. + feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. + encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. + use_lora: use low-rank adapters + lora_components: list of components where we apply LoRA, eg [Wq, Wv] + lora_rank: rank of LoRA + lora_alpha: scale for LoRA + lora_dropout: dropout rate for LoRA + lora_merge_weights: lora weights are merged with the pretrained weights at inference. 
""" def __init__( @@ -200,8 +207,15 @@ def __init__( left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, + feat_extract_lr: Optional[float] = None, + encoder_lr: Optional[float] = None, + use_lora: bool = False, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, ): - super().__init__( pretrained_model_path=pretrained_model_path, normalize_input=normalize_input, @@ -217,6 +231,14 @@ def __init__( left_encoder_context=left_encoder_context, right_encoder_context=right_encoder_context, sample_frequency=sample_frequency, + feat_extract_lr=feat_extract_lr, + encoder_lr=encoder_lr, + use_lora=use_lora, + lora_components=lora_components, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + lora_merge_weights=lora_merge_weights, ) if pretrained_model_path is not None and not ignore_pretrained: @@ -300,6 +322,15 @@ def __init__( if drop_layers_gt is not None: self.drop_upper_layers(drop_layers_gt) + if use_lora: + self._make_lora_layers( + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + self.ignore_pretrained = True @property @@ -310,6 +341,32 @@ def num_encoder_layers(self): def hidden_size(self): return self.hf_config.hidden_size + def change_dropouts( + self, + hidden_dropout: float = 0.1, + activation_dropout: float = 0.1, + attention_dropout: float = 0.1, + feat_proj_dropout: float = 0.1, + **kwargs, + ): + import transformers.models.wavlm.modeling_wavlm as t + + self.hf_model.config.hidden_dropout = hidden_dropout + self.hf_model.config.activation_dropout = activation_dropout + self.hf_model.config.attention_dropout = attention_dropout + self.hf_model.config.feat_proj_dropout = feat_proj_dropout + + self.hf_model.feature_projection.dropout.p = feat_proj_dropout + for module in self.hf_model.encoder.modules(): + if isinstance(module, nn.Dropout): + module.p = hidden_dropout + + for module in self.hf_model.encoder.modules(): + if isinstance(module, t.WavLMAttention): + module.dropout = activation_dropout + if isinstance(module, t.WavLMFeatureProjection): + module.intermediate_dropout.p = activation_dropout + def drop_upper_layers(self, max_layers: int): if max_layers >= self.hf_config.num_hidden_layers: return @@ -655,7 +712,7 @@ def add_class_args(parser, prefix=None, skip=set()): @staticmethod def filter_finetune_args(**kwargs): - args_base = HFWav2VecBase.filter_args(**kwargs) + args_base = HFWav2VecBase.filter_finetune_args(**kwargs) valid_args = ( "hidden_dropout", "activation_dropout", diff --git a/hyperion/torch/tpm/hf/wav2vec2/modeling_wav2vec2.py b/hyperion/torch/tpm/hf/wav2vec2/modeling_wav2vec2.py new file mode 100644 index 00000000..ceeda9a9 --- /dev/null +++ b/hyperion/torch/tpm/hf/wav2vec2/modeling_wav2vec2.py @@ -0,0 +1,2477 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Wav2Vec2 model.""" + +import math +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from transformers.activations import ACT2FN +# from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled +from transformers.models.wav2vec2.modeling_wav2vec2 import is_deepspeed_zero3_enabled +from transformers.modeling_outputs import ( + BaseModelOutput, + CausalLMOutput, + MaskedLMOutput, + SequenceClassifierOutput, + TokenClassifierOutput, + Wav2Vec2BaseModelOutput, + XVectorOutput, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + cached_file, + is_safetensors_available, + logging, + replace_return_docstrings, +) + +from transformers import Wav2Vec2Config + +from ....layer_blocks import FiLM + +WAV2VEC2_ADAPTER_PT_FILE = "adapter.{}.bin" +WAV2VEC2_ADAPTER_SAFE_FILE = "adapter.{}.safetensors" + +if is_safetensors_available(): + from safetensors.torch import load_file as safe_load_file + + +logger = logging.get_logger(__name__) + + +_HIDDEN_STATES_START_POSITION = 2 + +# General docstring +_CONFIG_FOR_DOC = "Wav2Vec2Config" + +# Base docstring +_CHECKPOINT_FOR_DOC = "facebook/wav2vec2-base-960h" +_EXPECTED_OUTPUT_SHAPE = [1, 292, 768] + +# CTC docstring +_CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'" +_CTC_EXPECTED_LOSS = 53.48 + +# Audio class docstring +_SEQ_CLASS_CHECKPOINT = "superb/wav2vec2-base-superb-ks" +_SEQ_CLASS_EXPECTED_OUTPUT = "'_unknown_'" +_SEQ_CLASS_EXPECTED_LOSS = 6.54 + +# Frame class docstring +_FRAME_CLASS_CHECKPOINT = "anton-l/wav2vec2-base-superb-sd" +_FRAME_EXPECTED_OUTPUT = [0, 0] + +# Speaker Verification docstring +_XVECTOR_CHECKPOINT = "anton-l/wav2vec2-base-superb-sv" +_XVECTOR_EXPECTED_OUTPUT = 0.98 + + +WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "facebook/wav2vec2-base-960h", + "facebook/wav2vec2-large-960h", + "facebook/wav2vec2-large-960h-lv60", + "facebook/wav2vec2-large-960h-lv60-self", + # See all Wav2Vec2 models at https://huggingface.co/models?filter=wav2vec2 +] + + +@dataclass +class Wav2Vec2ForPreTrainingOutput(ModelOutput): + """ + Output type of [`Wav2Vec2ForPreTraining`], with potential hidden states and attentions. + + Args: + loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official + paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss. + projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked + projected quantized states. + projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive + target vectors for contrastive loss. 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + The contrastive loss (L_m) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . + diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + The diversity loss (L_d) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . + """ + + loss: Optional[torch.FloatTensor] = None + projected_states: torch.FloatTensor = None + projected_quantized_states: torch.FloatTensor = None + codevector_perplexity: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + contrastive_loss: Optional[torch.FloatTensor] = None + diversity_loss: Optional[torch.FloatTensor] = None + + +def _compute_mask_indices( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + attention_mask: Optional[torch.LongTensor] = None, + min_masks: int = 0, +) -> np.ndarray: + """ + Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for + ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on + CPU as part of the preprocessing during training. + + Args: + shape: The shape for which to compute masks. This should be of a tuple of size 2 where + the first element is the batch size and the second element is the length of the axis to span. + mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of + independently generated mask spans of length `mask_length` is computed by + `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the + actual percentage will be smaller. + mask_length: size of the mask + min_masks: minimum number of masked spans + attention_mask: A (right-padded) attention mask which independently shortens the feature axis of + each batch dimension. 
+ """ + batch_size, sequence_length = shape + + if mask_length < 1: + raise ValueError("`mask_length` has to be bigger than 0.") + + if mask_length > sequence_length: + raise ValueError( + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}" + f" and `sequence_length`: {sequence_length}`" + ) + + # epsilon is used for probabilistic rounding + epsilon = np.random.rand(1).item() + + def compute_num_masked_span(input_length): + """Given input length, compute how many spans should be masked""" + num_masked_span = int(mask_prob * input_length / mask_length + epsilon) + num_masked_span = max(num_masked_span, min_masks) + + # make sure num masked span <= sequence_length + if num_masked_span * mask_length > sequence_length: + num_masked_span = sequence_length // mask_length + + # make sure num_masked span is also <= input_length - (mask_length - 1) + if input_length - (mask_length - 1) < num_masked_span: + num_masked_span = max(input_length - (mask_length - 1), 0) + + return num_masked_span + + # compute number of masked spans in batch + input_lengths = ( + attention_mask.sum(-1).detach().tolist() + if attention_mask is not None + else [sequence_length for _ in range(batch_size)] + ) + + # SpecAugment mask to fill + spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool) + spec_aug_mask_idxs = [] + + max_num_masked_span = compute_num_masked_span(sequence_length) + + if max_num_masked_span == 0: + return spec_aug_mask + + for input_length in input_lengths: + # compute num of masked spans for this input + num_masked_span = compute_num_masked_span(input_length) + + # get random indices to mask + spec_aug_mask_idx = np.random.choice( + np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False + ) + + # pick first sampled index that will serve as a dummy index to pad vector + # to ensure same dimension for all batches due to probabilistic rounding + # Picking first sample just pads those vectors twice. 
+ if len(spec_aug_mask_idx) == 0: + # this case can only happen if `input_length` is strictly smaller then + # `sequence_length` in which case the last token has to be a padding + # token which we can use as a dummy mask id + dummy_mask_idx = sequence_length - 1 + else: + dummy_mask_idx = spec_aug_mask_idx[0] + + spec_aug_mask_idx = np.concatenate( + [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx] + ) + spec_aug_mask_idxs.append(spec_aug_mask_idx) + + spec_aug_mask_idxs = np.array(spec_aug_mask_idxs) + + # expand masked indices to masked spans + spec_aug_mask_idxs = np.broadcast_to( + spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length) + ) + spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) + + # add offset to the starting indexes so that indexes now create a span + offsets = np.arange(mask_length)[None, None, :] + offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( + batch_size, max_num_masked_span * mask_length + ) + spec_aug_mask_idxs = spec_aug_mask_idxs + offsets + + # ensure that we cannot have indices larger than sequence_length + if spec_aug_mask_idxs.max() > sequence_length - 1: + spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1 + + # scatter indices to mask + np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1) + + return spec_aug_mask + + +def _sample_negative_indices( + features_shape: Tuple, num_negatives: int, mask_time_indices: Optional[np.ndarray] = None +): + """ + Sample `num_negatives` vectors from feature vectors. + """ + batch_size, sequence_length = features_shape + + # generate indices of the positive vectors themselves, repeat them `num_negatives` times + sequence_length_range = np.arange(sequence_length) + + # get `num_negatives` random vector indices from the same utterance + sampled_negative_indices = np.zeros(shape=(batch_size, sequence_length, num_negatives), dtype=np.int32) + + mask_time_indices = ( + mask_time_indices.astype(bool) if mask_time_indices is not None else np.ones(features_shape, dtype=bool) + ) + + for batch_idx in range(batch_size): + high = mask_time_indices[batch_idx].sum() - 1 + mapped_masked_indices = sequence_length_range[mask_time_indices[batch_idx]] + + feature_indices = np.broadcast_to(np.arange(high + 1)[:, None], (high + 1, num_negatives)) + sampled_indices = np.random.randint(0, high, size=(high + 1, num_negatives)) + # avoid sampling the same positive vector, but keep the distribution uniform + sampled_indices[sampled_indices >= feature_indices] += 1 + + # remap to actual indices + sampled_negative_indices[batch_idx][mask_time_indices[batch_idx]] = mapped_masked_indices[sampled_indices] + + # correct for batch size + sampled_negative_indices[batch_idx] += batch_idx * sequence_length + + return sampled_negative_indices + + +class Wav2Vec2NoLayerNormConvLayer(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1d( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias=config.conv_bias, + ) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.activation(hidden_states) + return 
hidden_states + + +class Wav2Vec2LayerNormConvLayer(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1d( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias=config.conv_bias, + ) + self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + + hidden_states = hidden_states.transpose(-2, -1) + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states.transpose(-2, -1) + + hidden_states = self.activation(hidden_states) + return hidden_states + + +class Wav2Vec2GroupNormConvLayer(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1d( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias=config.conv_bias, + ) + self.activation = ACT2FN[config.feat_extract_activation] + + self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + +class Wav2Vec2PositionalConvEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.conv = nn.Conv1d( + config.hidden_size, + config.hidden_size, + kernel_size=config.num_conv_pos_embeddings, + padding=config.num_conv_pos_embeddings // 2, + groups=config.num_conv_pos_embedding_groups, + ) + + weight_norm = nn.utils.weight_norm + if hasattr(nn.utils.parametrizations, "weight_norm"): + weight_norm = nn.utils.parametrizations.weight_norm + + if is_deepspeed_zero3_enabled(): + import deepspeed + + with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0): + self.conv = weight_norm(self.conv, name="weight", dim=2) + deepspeed.zero.register_external_parameter(self, self.conv.weight_v) + deepspeed.zero.register_external_parameter(self, self.conv.weight_g) + else: + self.conv = weight_norm(self.conv, name="weight", dim=2) + + self.padding = Wav2Vec2SamePadLayer(config.num_conv_pos_embeddings) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = hidden_states.transpose(1, 2) + + hidden_states = self.conv(hidden_states) + hidden_states = self.padding(hidden_states) + hidden_states = self.activation(hidden_states) + + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +class Wav2Vec2SamePadLayer(nn.Module): + def __init__(self, num_conv_pos_embeddings): + super().__init__() + self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0 + + def forward(self, hidden_states): + if self.num_pad_remove > 0: + hidden_states = hidden_states[:, :, : -self.num_pad_remove] + return hidden_states + + +class Wav2Vec2FeatureEncoder(nn.Module): + """Construct the features from raw audio waveform""" + + def __init__(self, config): + super().__init__() + + if config.feat_extract_norm == "group": + conv_layers = [Wav2Vec2GroupNormConvLayer(config, layer_id=0)] 
+ [ + Wav2Vec2NoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1) + ] + elif config.feat_extract_norm == "layer": + conv_layers = [ + Wav2Vec2LayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers) + ] + else: + raise ValueError( + f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']" + ) + self.conv_layers = nn.ModuleList(conv_layers) + self.gradient_checkpointing = False + self._requires_grad = True + + def _freeze_parameters(self): + for param in self.parameters(): + param.requires_grad = False + self._requires_grad = False + + def forward(self, input_values): + hidden_states = input_values[:, None] + + # make sure hidden_states require grad for gradient_checkpointing + if self._requires_grad and self.training: + hidden_states.requires_grad = True + + for conv_layer in self.conv_layers: + if self._requires_grad and self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(conv_layer), + hidden_states, + ) + else: + hidden_states = conv_layer(hidden_states) + + return hidden_states + + +class Wav2Vec2FeatureExtractor(Wav2Vec2FeatureEncoder): + def __init__(self, config): + super().__init__(config) + warnings.warn( + f"The class `{self.__class__.__name__}` has been depreciated " + "and will be removed in Transformers v5. " + f"Use `{self.__class__.__bases__[0].__name__}` instead.", + FutureWarning, + ) + + +class Wav2Vec2FeatureProjection(nn.Module): + def __init__(self, config): + super().__init__() + self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps) + self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size) + self.dropout = nn.Dropout(config.feat_proj_dropout) + + def forward(self, hidden_states): + # non-projected hidden states are needed for quantization + norm_hidden_states = self.layer_norm(hidden_states) + hidden_states = self.projection(norm_hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states, norm_hidden_states + + +# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Wav2Vec2 +class Wav2Vec2Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." 
+ ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.reshape(*proj_shape) + value_states = value_states.reshape(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. 
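(Editor's aside: the `--lora-components` choices earlier in this patch name the Linear sub-modules of this attention block, plus `intermediate_dense`/`output_dense` in the `Wav2Vec2FeedForward` defined below. A quick way to list the matchable names, with hypothetical sizes:)

```
attn = Wav2Vec2Attention(embed_dim=768, num_heads=12)
print([n for n, m in attn.named_children() if isinstance(m, nn.Linear)])
# ['k_proj', 'v_proj', 'q_proj', 'out_proj']
```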
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class Wav2Vec2FeedForward(nn.Module): + def __init__(self, config): + super().__init__() + self.intermediate_dropout = nn.Dropout(config.activation_dropout) + + self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.output_dropout = nn.Dropout(config.hidden_dropout) + + def forward(self, hidden_states): + hidden_states = self.intermediate_dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.intermediate_dropout(hidden_states) + + hidden_states = self.output_dense(hidden_states) + hidden_states = self.output_dropout(hidden_states) + return hidden_states + + +class Wav2Vec2CondEncoderLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.attention = Wav2Vec2Attention( + embed_dim=config.hidden_size, + num_heads=config.num_attention_heads, + dropout=config.attention_dropout, + is_decoder=False, + ) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.feed_forward = Wav2Vec2FeedForward(config) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.condition_type = config.condition_type + self.condition_layer = FiLM(config.hidden_size, config.condition_size, "linear") + + + def forward(self, hidden_states, condition_features, attention_mask=None, output_attentions=False): + attn_residual = hidden_states + hidden_states, attn_weights, _ = self.attention( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = self.dropout(hidden_states) + hidden_states = self.condition_layer(hidden_states, condition_features) + hidden_states = attn_residual + hidden_states + + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states + self.feed_forward(hidden_states) + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Wav2Vec2CondEncoderLayerStableLayerNorm(nn.Module): + def __init__(self, config): + super().__init__() + self.attention = Wav2Vec2Attention( + embed_dim=config.hidden_size, + num_heads=config.num_attention_heads, + dropout=config.attention_dropout, + is_decoder=False, + ) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.feed_forward = Wav2Vec2FeedForward(config) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if getattr(config, "adapter_attn_dim", None) is not None: + self.adapter_layer = Wav2Vec2AttnAdapterLayer(config) + else: + self.adapter_layer = None + + self.condition_type = config.condition_type + self.condition_layer = FiLM(config.hidden_size, config.condition_size, "linear") + + def forward( + self, + hidden_states: torch.Tensor, + condition_features: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ): + attn_residual = hidden_states + hidden_states = self.layer_norm(hidden_states) + 
hidden_states, attn_weights, _ = self.attention( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = self.dropout(hidden_states) + hidden_states = self.condition_layer(hidden_states, condition_features) + hidden_states = attn_residual + hidden_states + hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states)) + + if self.adapter_layer is not None: + hidden_states = hidden_states + self.adapter_layer(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Wav2Vec2CondEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layers = nn.ModuleList([Wav2Vec2CondEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.tensor, + condition_features: torch.tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if attention_mask is not None: + # make sure padded tokens output 0 + expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_attention_mask] = 0 + + # extend attention_mask + attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) + attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min + attention_mask = attention_mask.expand( + attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] + ) + + position_embeddings = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + position_embeddings + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + + for layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = torch.rand([]) + + skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False + if not skip_the_layer or deepspeed_zero3_is_enabled: + # under deepspeed zero3 all gpus must run in sync + if self.gradient_checkpointing and self.training: + # create gradient checkpointing function + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = layer( + hidden_states, condition_features, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not 
None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class Wav2Vec2CondEncoderStableLayerNorm(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layers = nn.ModuleList( + [Wav2Vec2CondEncoderLayerStableLayerNorm(config) for _ in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + condition_features, + attention_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if attention_mask is not None: + # make sure padded tokens are not attended to + expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_attention_mask] = 0 + + # extend attention_mask + attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) + attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min + attention_mask = attention_mask.expand( + attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] + ) + + position_embeddings = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + position_embeddings + hidden_states = self.dropout(hidden_states) + + deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + + for layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = torch.rand([]) + + skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False + if not skip_the_layer or deepspeed_zero3_is_enabled: + # under deepspeed zero3 all gpus must run in sync + # XXX: could optimize this like synced_gpus in generate_utils but not sure if it's worth the code complication + if self.gradient_checkpointing and self.training: + # create gradient checkpointing function + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = layer( + hidden_states, condition_features, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class Wav2Vec2GumbelVectorQuantizer(nn.Module): + """ + Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH + GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information. 
+ """ + + def __init__(self, config): + super().__init__() + self.num_groups = config.num_codevector_groups + self.num_vars = config.num_codevectors_per_group + + if config.codevector_dim % self.num_groups != 0: + raise ValueError( + f"`config.codevector_dim {config.codevector_dim} must be divisible " + f"by `config.num_codevector_groups` {self.num_groups} for concatenation" + ) + + # storage for codebook variables (codewords) + self.codevectors = nn.Parameter( + torch.FloatTensor(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups) + ) + self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars) + + # can be decayed for training + self.temperature = 2 + + @staticmethod + def _compute_perplexity(probs, mask=None): + if mask is not None: + mask_extended = mask.flatten()[:, None, None].expand(probs.shape) + probs = torch.where(mask_extended, probs, torch.zeros_like(probs)) + marginal_probs = probs.sum(dim=0) / mask.sum() + else: + marginal_probs = probs.mean(dim=0) + + perplexity = torch.exp(-torch.sum(marginal_probs * torch.log(marginal_probs + 1e-7), dim=-1)).sum() + return perplexity + + def forward(self, hidden_states, mask_time_indices=None): + batch_size, sequence_length, hidden_size = hidden_states.shape + + # project to codevector dim + hidden_states = self.weight_proj(hidden_states) + hidden_states = hidden_states.view(batch_size * sequence_length * self.num_groups, -1) + + if self.training: + # sample code vector probs via gumbel in differentiateable way + codevector_probs = nn.functional.gumbel_softmax( + hidden_states.float(), tau=self.temperature, hard=True + ).type_as(hidden_states) + + # compute perplexity + codevector_soft_dist = torch.softmax( + hidden_states.view(batch_size * sequence_length, self.num_groups, -1).float(), dim=-1 + ) + perplexity = self._compute_perplexity(codevector_soft_dist, mask_time_indices) + else: + # take argmax in non-differentiable way + # comptute hard codevector distribution (one hot) + codevector_idx = hidden_states.argmax(dim=-1) + codevector_probs = hidden_states.new_zeros(hidden_states.shape).scatter_( + -1, codevector_idx.view(-1, 1), 1.0 + ) + codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1) + + perplexity = self._compute_perplexity(codevector_probs, mask_time_indices) + + codevector_probs = codevector_probs.view(batch_size * sequence_length, -1) + # use probs to retrieve codevectors + codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors + codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1) + codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1) + + return codevectors, perplexity + + +class Wav2Vec2Adapter(nn.Module): + def __init__(self, config): + super().__init__() + + # feature dim might need to be down-projected + if config.output_hidden_size != config.hidden_size: + self.proj = nn.Linear(config.hidden_size, config.output_hidden_size) + self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size) + else: + self.proj = self.proj_layer_norm = None + + self.layers = nn.ModuleList(Wav2Vec2AdapterLayer(config) for _ in range(config.num_adapter_layers)) + self.layerdrop = config.layerdrop + + def forward(self, hidden_states): + # down project hidden_states if necessary + if self.proj is not None and self.proj_layer_norm is not None: + hidden_states = self.proj(hidden_states) + hidden_states = self.proj_layer_norm(hidden_states) + + hidden_states = 
hidden_states.transpose(1, 2) + + for layer in self.layers: + layerdrop_prob = np.random.random() + if not self.training or (layerdrop_prob > self.layerdrop): + hidden_states = layer(hidden_states) + + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +class Wav2Vec2AdapterLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.conv = nn.Conv1d( + config.output_hidden_size, + 2 * config.output_hidden_size, + config.adapter_kernel_size, + stride=config.adapter_stride, + padding=1, + ) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = nn.functional.glu(hidden_states, dim=1) + + return hidden_states + + +class Wav2Vec2AttnAdapterLayer(nn.Module): + def __init__(self, config): + """ + Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed + up training throughput. + """ + super().__init__() + self.input_dim = config.adapter_attn_dim + self.hidden_dim = config.hidden_size + + self.norm = nn.LayerNorm(self.hidden_dim) + self.linear_1 = nn.Linear(self.hidden_dim, self.input_dim) + self.act_fn = nn.ReLU() + self.linear_2 = nn.Linear(self.input_dim, self.hidden_dim) + + def forward(self, hidden_states: torch.FloatTensor): + hidden_states = self.norm(hidden_states) + + hidden_states = self.linear_1(hidden_states) + hidden_states = self.act_fn(hidden_states) + hidden_states = self.linear_2(hidden_states) + + return hidden_states + + +class Wav2Vec2PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = Wav2Vec2Config + base_model_prefix = "wav2vec2" + main_input_name = "input_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + # Wav2Vec2ForPreTraining last 2 linear layers need standard Linear init. 
+ if isinstance(module, Wav2Vec2ForPreTraining): + module.project_hid.reset_parameters() + module.project_q.reset_parameters() + module.project_hid._is_hf_initialized = True + module.project_q._is_hf_initialized = True + # gumbel softmax requires special init + elif isinstance(module, Wav2Vec2GumbelVectorQuantizer): + module.weight_proj.weight.data.normal_(mean=0.0, std=1) + module.weight_proj.bias.data.zero_() + nn.init.uniform_(module.codevectors) + elif isinstance(module, Wav2Vec2PositionalConvEmbedding): + nn.init.normal_( + module.conv.weight, + mean=0, + std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)), + ) + nn.init.constant_(module.conv.bias, 0) + elif isinstance(module, Wav2Vec2FeatureProjection): + k = math.sqrt(1 / module.projection.in_features) + nn.init.uniform_(module.projection.weight, a=-k, b=k) + nn.init.uniform_(module.projection.bias, a=-k, b=k) + elif isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Conv1d): + nn.init.kaiming_normal_(module.weight) + + if module.bias is not None: + k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) + nn.init.uniform_(module.bias, a=-k, b=k) + + def _get_feat_extract_output_lengths( + self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None + ): + """ + Computes the output length of the convolutional layers + """ + + add_adapter = self.config.add_adapter if add_adapter is None else add_adapter + + def _conv_out_length(input_length, kernel_size, stride): + # 1D convolutional layer output length formula taken + # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html + return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1 + + for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): + input_lengths = _conv_out_length(input_lengths, kernel_size, stride) + + if add_adapter: + for _ in range(self.config.num_adapter_layers): + input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride) + + return input_lengths + + def _get_feature_vector_attention_mask( + self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None + ): + # Effectively attention_mask.sum(-1), but not inplace to be able to run + # on inference mode. 
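+ # The per-example lengths are then mapped through the conv output-length formula and turned back
+ # into a boolean frame mask by writing a 1 at the last valid frame and back-filling everything
+ # before it with a flipped cumulative sum.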
+ non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1] + + output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter) + output_lengths = output_lengths.to(torch.long) + + batch_size = attention_mask.shape[0] + + attention_mask = torch.zeros( + (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device + ) + # these two operations makes sure that all values before the output lengths idxs are attended to + attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1 + attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool() + return attention_mask + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (Wav2Vec2CondEncoder, Wav2Vec2CondEncoderStableLayerNorm, Wav2Vec2FeatureEncoder)): + module.gradient_checkpointing = value + + def _get_adapters(self): + if self.config.adapter_attn_dim is None: + raise ValueError(f"{self.__class__} has no adapter layers. Make sure to define `config.adapter_attn_dim`.") + + adapter_weights = {} + for name, module in self.named_modules(): + if isinstance(module, Wav2Vec2AttnAdapterLayer): + for param_name, param in module.named_parameters(): + adapter_weights[".".join([name, param_name])] = param + + if isinstance(self, Wav2Vec2ForCTC): + for name, param in self.lm_head.named_parameters(): + adapter_weights[".".join(["lm_head", name])] = param + + return adapter_weights + + def init_adapter_layers(self): + """ + (Re-)initialize attention adapter layers and lm head for adapter-only fine-tuning + """ + # init attention adapters + for module in self.modules(): + if isinstance(module, Wav2Vec2AttnAdapterLayer): + self._init_weights(module) + + # init lm head + if isinstance(self, Wav2Vec2ForCTC): + self._init_weights(self.lm_head) + + def load_adapter(self, target_lang: str, force_load=True, **kwargs): + r""" + Load a language adapter model from a pre-trained adapter model. + + Parameters: + target_lang (`str`): + Has to be a language id of an existing adapter weight. Adapter weights are stored in the format + adapter..safetensors or adapter..bin + force_load (`bool`, defaults to `True`): + Whether the weights shall be loaded even if `target_lang` matches `self.target_lang`. + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received files. Will attempt to resume the download if such a + file exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + local_files_only(`bool`, *optional*, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). + token (`str` or `bool`, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use + the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). 
+ revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + + + + To test a pull request you made on the Hub, you can pass `revision="refs/pr/". + + + + mirror (`str`, *optional*): + Mirror source to accelerate downloads in China. If you are from China and have an accessibility + problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. + Please refer to the mirror site for more information. + + + + Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to + use this method in a firewalled environment. + + + + Examples: + + ```python + >>> from transformers import Wav2Vec2ForCTC, AutoProcessor + + >>> ckpt = "facebook/mms-1b-all" + >>> processor = AutoProcessor.from_pretrained(ckpt) + >>> model = Wav2Vec2ForCTC.from_pretrained(ckpt, target_lang="eng") + >>> # set specific language + >>> processor.tokenizer.set_target_lang("spa") + >>> model.load_adapter("spa") + ``` + """ + if self.config.adapter_attn_dim is None: + raise ValueError(f"Cannot load_adapter for {target_lang} if `config.adapter_attn_dim` is not defined.") + + if target_lang == self.target_lang and not force_load: + logger.warning(f"Adapter weights are already set to {target_lang}.") + return + + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", False) + token = kwargs.pop("token", None) + use_auth_token = kwargs.pop("use_auth_token", None) + revision = kwargs.pop("revision", None) + use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False) + + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning + ) + if token is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + token = use_auth_token + + model_path_or_id = self.config._name_or_path + state_dict = None + + # 1. Let's first try loading a safetensors adapter weight + if use_safetensors is not False: + filepath = WAV2VEC2_ADAPTER_SAFE_FILE.format(target_lang) + + try: + weight_path = cached_file( + model_path_or_id, + filename=filepath, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + cache_dir=cache_dir, + ) + + state_dict = safe_load_file(weight_path) + + except EnvironmentError: + if use_safetensors: + # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted + # to the original exception. + raise + + except Exception: + # For any other exception, we throw a generic error. + if use_safetensors: + raise EnvironmentError( + f"Can't load the model for '{model_path_or_id}'. If you were trying to load it" + " from 'https://huggingface.co/models', make sure you don't have a local directory with the" + f" same name. Otherwise, make sure '{model_path_or_id}' is the correct path to a" + f" directory containing a file named {filepath}." + ) + + # 2. 
If this didn't work let's try loading a PyTorch adapter weight + if state_dict is None: + filepath = WAV2VEC2_ADAPTER_PT_FILE.format(target_lang) + + try: + weight_path = cached_file( + model_path_or_id, + filename=filepath, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + cache_dir=cache_dir, + ) + + state_dict = torch.load(weight_path, map_location="cpu") + + except EnvironmentError: + # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted + # to the original exception. + raise + + except Exception: + # For any other exception, we throw a generic error. + raise EnvironmentError( + f"Can't load the model for '{model_path_or_id}'. If you were trying to load it" + " from 'https://huggingface.co/models', make sure you don't have a local directory with the" + f" same name. Otherwise, make sure '{model_path_or_id}' is the correct path to a" + f" directory containing a file named {filepath}." + ) + + adapter_weights = self._get_adapters() + unexpected_keys = set(state_dict.keys()) - set(adapter_weights.keys()) + missing_keys = set(adapter_weights.keys()) - set(state_dict.keys()) + + if len(unexpected_keys) > 0: + raise ValueError(f"The adapter weights {weight_path} has unexpected keys: {', '.join(unexpected_keys)}.") + elif len(missing_keys) > 0: + raise ValueError(f"The adapter weights {weight_path} has missing keys: {', '.join(missing_keys)}.") + + # make sure now vocab size is correct + target_vocab_size = state_dict["lm_head.weight"].shape[0] + if target_vocab_size != self.config.vocab_size: + self.lm_head = nn.Linear( + self.config.output_hidden_size, target_vocab_size, device=self.device, dtype=self.dtype + ) + self.config.vocab_size = target_vocab_size + + # make sure that adapter weights are put in exactly the same precision and device placement and overwritten adapter weights + state_dict = {k: v.to(adapter_weights[k]) for k, v in state_dict.items()} + self.load_state_dict(state_dict, strict=False) + + # set target language corectly + self.target_lang = target_lang + + +WAV_2_VEC_2_START_DOCSTRING = r""" + Wav2Vec2 was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech + Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael + Auli. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving etc.). + + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`Wav2Vec2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +WAV_2_VEC_2_INPUTS_DOCSTRING = r""" + Args: + input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file + into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install + soundfile`). 
To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and + conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details. + attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0, + 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + + + `attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask == + True`. For all models whose processor has `config.return_attention_mask == False`, such as + [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), `attention_mask` should **not** be + passed to avoid degraded performance when doing batched inference. For such models `input_values` should + simply be padded with 0 and passed without `attention_mask`. Be aware that these models also yield slightly + different results depending on whether `input_values` is padded or not. + + + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Wav2Vec2 Model transformer outputting raw hidden-states without any specific head on top.", + WAV_2_VEC_2_START_DOCSTRING, +) +class Wav2Vec2CondModel(Wav2Vec2PreTrainedModel): + def __init__(self, config: Wav2Vec2Config): + super().__init__(config) + self.config = config + self.feature_extractor = Wav2Vec2FeatureEncoder(config) + self.feature_projection = Wav2Vec2FeatureProjection(config) + + # model only needs masking vector if mask prob is > 0.0 + if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: + self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_()) + + if config.do_stable_layer_norm: + self.encoder = Wav2Vec2CondEncoderStableLayerNorm(config) + else: + self.encoder = Wav2Vec2CondEncoder(config) + + self.adapter = Wav2Vec2Adapter(config) if config.add_adapter else None + + # Initialize weights and apply final processing + self.post_init() + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. 
+ """ + self.feature_extractor._freeze_parameters() + + def _mask_hidden_states( + self, + hidden_states: torch.FloatTensor, + mask_time_indices: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + ): + """ + Masks extracted features along time axis and/or along feature axis according to + [SpecAugment](https://arxiv.org/abs/1904.08779). + """ + + # `config.apply_spec_augment` can set masking to False + if not getattr(self.config, "apply_spec_augment", True): + return hidden_states + + # generate indices & apply SpecAugment along time axis + batch_size, sequence_length, hidden_size = hidden_states.size() + + if mask_time_indices is not None: + # apply SpecAugment along time axis with given mask_time_indices + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) + elif self.config.mask_time_prob > 0 and self.training: + mask_time_indices = _compute_mask_indices( + (batch_size, sequence_length), + mask_prob=self.config.mask_time_prob, + mask_length=self.config.mask_time_length, + attention_mask=attention_mask, + min_masks=self.config.mask_time_min_masks, + ) + mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) + + if self.config.mask_feature_prob > 0 and self.training: + # generate indices & apply SpecAugment along feature axis + mask_feature_indices = _compute_mask_indices( + (batch_size, hidden_size), + mask_prob=self.config.mask_feature_prob, + mask_length=self.config.mask_feature_length, + min_masks=self.config.mask_feature_min_masks, + ) + mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) + mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) + hidden_states[mask_feature_indices] = 0 + + return hidden_states + + @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Wav2Vec2BaseModelOutput, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + input_values: Optional[torch.Tensor], + condition_features: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + mask_time_indices: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Wav2Vec2BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + extract_features = self.feature_extractor(input_values) + extract_features = extract_features.transpose(1, 2) + + if attention_mask is not None: + # compute reduced attention_mask corresponding to feature vectors + attention_mask = self._get_feature_vector_attention_mask( + extract_features.shape[1], attention_mask, add_adapter=False + ) + + hidden_states, extract_features = self.feature_projection(extract_features) + hidden_states = self._mask_hidden_states( + hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask + ) + + encoder_outputs = self.encoder( + hidden_states, + 
condition_features=condition_features, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = encoder_outputs[0] + + if self.adapter is not None: + hidden_states = self.adapter(hidden_states) + + if not return_dict: + return (hidden_states, extract_features) + encoder_outputs[1:] + + return Wav2Vec2BaseModelOutput( + last_hidden_state=hidden_states, + extract_features=extract_features, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings("""Wav2Vec2 Model with a quantizer and `VQ` head on top.""", WAV_2_VEC_2_START_DOCSTRING) +class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel): + def __init__(self, config: Wav2Vec2Config): + super().__init__(config) + self.wav2vec2 = Wav2Vec2Model(config) + self.dropout_features = nn.Dropout(config.feat_quantizer_dropout) + + self.quantizer = Wav2Vec2GumbelVectorQuantizer(config) + + self.project_hid = nn.Linear(config.hidden_size, config.proj_codevector_dim) + self.project_q = nn.Linear(config.codevector_dim, config.proj_codevector_dim) + + # Initialize weights and apply final processing + self.post_init() + + def set_gumbel_temperature(self, temperature: int): + """ + Set the Gumbel softmax temperature to a given value. Only necessary for training + """ + self.quantizer.temperature = temperature + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.wav2vec2.feature_extractor._freeze_parameters() + + @staticmethod + def compute_contrastive_logits( + target_features: torch.FloatTensor, + negative_features: torch.FloatTensor, + predicted_features: torch.FloatTensor, + temperature: int = 0.1, + ): + """ + Compute logits for contrastive loss based using cosine similarity as the distance measure between + `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied. 
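+ Concretely, the positive target is stacked on top of the negatives, so `logits[0]` holds the
+ temperature-scaled cosine similarity to the true quantized vector and `logits[1:]` the
+ similarities to the sampled distractors.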
+ """ + target_features = torch.cat([target_features, negative_features], dim=0) + + logits = torch.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1).type_as( + target_features + ) + + # apply temperature + logits = logits / temperature + return logits + + @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Wav2Vec2ForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + mask_time_indices: Optional[torch.BoolTensor] = None, + sampled_negative_indices: Optional[torch.BoolTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Wav2Vec2ForPreTrainingOutput]: + r""" + mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict + masked extracted features in *config.proj_codevector_dim* space. + sampled_negative_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*): + Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss. + Required input for pre-training. + + Returns: + + Example: + + ```python + >>> import torch + >>> from transformers import AutoFeatureExtractor, Wav2Vec2ForPreTraining + >>> from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices, _sample_negative_indices + >>> from datasets import load_dataset + + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") + >>> model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base") + + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values # Batch size 1 + + >>> # compute masked indices + >>> batch_size, raw_sequence_length = input_values.shape + >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length).item() + >>> mask_time_indices = _compute_mask_indices( + ... shape=(batch_size, sequence_length), mask_prob=0.2, mask_length=2 + ... ) + >>> sampled_negative_indices = _sample_negative_indices( + ... features_shape=(batch_size, sequence_length), + ... num_negatives=model.config.num_negatives, + ... mask_time_indices=mask_time_indices, + ... ) + >>> mask_time_indices = torch.tensor(data=mask_time_indices, device=input_values.device, dtype=torch.long) + >>> sampled_negative_indices = torch.tensor( + ... data=sampled_negative_indices, device=input_values.device, dtype=torch.long + ... ) + + >>> with torch.no_grad(): + ... outputs = model(input_values, mask_time_indices=mask_time_indices) + + >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states) + >>> cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) + + >>> # show that cosine similarity is much higher than random + >>> cosine_sim[mask_time_indices.to(torch.bool)].mean() > 0.5 + tensor(True) + + >>> # for contrastive loss training model should be put into train mode + >>> model = model.train() + >>> loss = model( + ... input_values, mask_time_indices=mask_time_indices, sampled_negative_indices=sampled_negative_indices + ... 
).loss + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if mask_time_indices is not None: + mask_time_indices = mask_time_indices.to(torch.bool) + + outputs = self.wav2vec2( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + mask_time_indices=mask_time_indices, + return_dict=return_dict, + ) + + # 1. project all transformed features (including masked) to final vq dim + transformer_features = self.project_hid(outputs[0]) + + # 2. quantize all (unmasked) extracted features and project to final vq dim + extract_features = self.dropout_features(outputs[1]) + + if attention_mask is not None: + # compute reduced attention_mask correponding to feature vectors + attention_mask = self._get_feature_vector_attention_mask( + extract_features.shape[1], attention_mask, add_adapter=False + ) + + quantized_features, codevector_perplexity = self.quantizer( + extract_features, mask_time_indices=mask_time_indices + ) + quantized_features = self.project_q(quantized_features) + + loss = contrastive_loss = diversity_loss = None + if sampled_negative_indices is not None: + batch_size, sequence_length, hidden_size = quantized_features.shape + + # for training, we sample negatives + # 3. sample K negatives (distractors) quantized states for contrastive loss + # if attention_mask is passed, make sure that padded feature vectors cannot be sampled + # sample negative quantized vectors BTC => (BxT)C + negative_quantized_features = quantized_features.view(-1, hidden_size)[ + sampled_negative_indices.long().view(-1) + ] + negative_quantized_features = negative_quantized_features.view( + batch_size, sequence_length, -1, hidden_size + ).permute(2, 0, 1, 3) + + # 4. compute logits, corresponding to `logs = sim(c_t, [q_t, \sim{q}_t]) / \kappa` + # of equation (3) in https://arxiv.org/pdf/2006.11477.pdf + logits = self.compute_contrastive_logits( + quantized_features[None, :], + negative_quantized_features, + transformer_features, + self.config.contrastive_logits_temperature, + ) + + # 5. if a negative vector is identical to the positive (i.e. when codebook utilization is low), + # its cosine similarity will be masked + neg_is_pos = (quantized_features == negative_quantized_features).all(-1) + + if neg_is_pos.any(): + logits[1:][neg_is_pos] = float("-inf") + + # 6. compute contrastive loss \mathbf{L}_m = cross_entropy(logs) = + # -log(exp(sim(c_t, q_t)/\kappa) / \sum_{\sim{q}} exp(sim(c_t, \sim{q})/\kappa)) + logits = logits.transpose(0, 2).reshape(-1, logits.size(0)) + target = ((1 - mask_time_indices.long()) * -100).transpose(0, 1).flatten() + + contrastive_loss = nn.functional.cross_entropy(logits.float(), target, reduction="sum") + # 7. compute diversity loss: \mathbf{L}_d + num_codevectors = self.config.num_codevectors_per_group * self.config.num_codevector_groups + diversity_loss = ((num_codevectors - codevector_perplexity) / num_codevectors) * mask_time_indices.sum() + + # 8. 
\mathbf{L} = \mathbf{L}_m + \alpha * \mathbf{L}_d + loss = contrastive_loss + self.config.diversity_loss_weight * diversity_loss + + if not return_dict: + if loss is not None: + return (loss, transformer_features, quantized_features, codevector_perplexity) + outputs[2:] + return (transformer_features, quantized_features, codevector_perplexity) + outputs[2:] + + return Wav2Vec2ForPreTrainingOutput( + loss=loss, + projected_states=transformer_features, + projected_quantized_states=quantized_features, + codevector_perplexity=codevector_perplexity, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + contrastive_loss=contrastive_loss, + diversity_loss=diversity_loss, + ) + + +@add_start_docstrings("""Wav2Vec2 Model with a `language modeling` head on top.""", WAV_2_VEC_2_START_DOCSTRING) +class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + warnings.warn( + "The class `Wav2Vec2ForMaskedLM` is deprecated. Please use `Wav2Vec2ForCTC` instead.", FutureWarning + ) + + self.wav2vec2 = Wav2Vec2Model(config) + self.dropout = nn.Dropout(config.final_dropout) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING) + def forward( + self, + input_values: torch.FloatTensor, + attention_mask: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + ) -> Union[Tuple, MaskedLMOutput]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.wav2vec2( + input_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.dropout(hidden_states) + logits = self.lm_head(hidden_states) + + if not return_dict: + output = (logits,) + outputs[2:] + return output + + return MaskedLMOutput(logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions) + + +@add_start_docstrings( + """Wav2Vec2 Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""", + WAV_2_VEC_2_START_DOCSTRING, +) +class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel): + def __init__(self, config, target_lang: Optional[str] = None): + super().__init__(config) + + self.wav2vec2 = Wav2Vec2Model(config) + self.dropout = nn.Dropout(config.final_dropout) + + self.target_lang = target_lang + + if config.vocab_size is None: + raise ValueError( + f"You are trying to instantiate {self.__class__} with a configuration that " + "does not define the vocabulary size of the language model head. Please " + "instantiate the model as follows: `Wav2Vec2ForCTC.from_pretrained(..., vocab_size=vocab_size)`. " + "or define `vocab_size` of your model's configuration." + ) + output_hidden_size = ( + config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size + ) + self.lm_head = nn.Linear(output_hidden_size, config.vocab_size) + + # Initialize weights and apply final processing + self.post_init() + + def tie_weights(self): + """ + This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when + passing `target_lang=...` to `from_pretrained(...)`. 
+ + This method is **not** supposed to be called by the user and is prone to be changed in the future. + """ + + # Note that `tie_weights` is usually used to tie input and output embedding weights. The method is re-purposed to + # correctly load adapter layers for Wav2Vec2 so that we do not have to introduce a new API to + # [`PreTrainedModel`]. While slightly hacky, Wav2Vec2 never has to tie input and output embeddings, so that it is + # ok to repurpose this function here. + target_lang = self.target_lang + + if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None: + raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.") + elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None: + logger.info("By default `target_lang` is set to 'eng'.") + elif target_lang is not None: + self.load_adapter(target_lang, force_load=True) + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.wav2vec2.feature_extractor._freeze_parameters() + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. + """ + for param in self.wav2vec2.parameters(): + param.requires_grad = False + + @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CausalLMOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_CTC_EXPECTED_OUTPUT, + expected_loss=_CTC_EXPECTED_LOSS, + ) + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + ) -> Union[Tuple, CausalLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*): + Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to + the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. + All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., + config.vocab_size - 1]`. 
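+ Internally, `attention_mask` (or an all-ones mask if it is not given) is converted to
+ feature-frame lengths with `_get_feat_extract_output_lengths`, label positions equal to `-100`
+ are dropped, and the loss is computed with `nn.functional.ctc_loss` on float32 log-softmax outputs.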
+ """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.wav2vec2( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.dropout(hidden_states) + + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + if labels.max() >= self.config.vocab_size: + raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") + + # retrieve loss input_lengths from attention_mask + attention_mask = ( + attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long) + ) + input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long) + + # assuming that padded tokens are filled with -100 + # when not being attended to + labels_mask = labels >= 0 + target_lengths = labels_mask.sum(-1) + flattened_targets = labels.masked_select(labels_mask) + + # ctc_loss doesn't support fp16 + log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1) + + with torch.backends.cudnn.flags(enabled=False): + loss = nn.functional.ctc_loss( + log_probs, + flattened_targets, + input_lengths, + target_lengths, + blank=self.config.pad_token_id, + reduction=self.config.ctc_loss_reduction, + zero_infinity=self.config.ctc_zero_infinity, + ) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + +@add_start_docstrings( + """ + Wav2Vec2 Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like + SUPERB Keyword Spotting. + """, + WAV_2_VEC_2_START_DOCSTRING, +) +class Wav2Vec2ForSequenceClassification(Wav2Vec2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + if hasattr(config, "add_adapter") and config.add_adapter: + raise ValueError( + "Sequence classification does not support the use of Wav2Vec2 adapters (config.add_adapter=True)" + ) + self.wav2vec2 = Wav2Vec2Model(config) + num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings + if config.use_weighted_layer_sum: + self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers) + self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) + self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. 
+ """ + self.wav2vec2.feature_extractor._freeze_parameters() + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. + """ + for param in self.wav2vec2.parameters(): + param.requires_grad = False + + @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_SEQ_CLASS_CHECKPOINT, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, + expected_loss=_SEQ_CLASS_EXPECTED_LOSS, + ) + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + ) -> Union[Tuple, SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states + + outputs = self.wav2vec2( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = outputs[_HIDDEN_STATES_START_POSITION] + hidden_states = torch.stack(hidden_states, dim=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) + else: + hidden_states = outputs[0] + + hidden_states = self.projector(hidden_states) + if attention_mask is None: + pooled_output = hidden_states.mean(dim=1) + else: + padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) + hidden_states[~padding_mask] = 0.0 + pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1) + + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Wav2Vec2 Model with a frame classification head on top for tasks like Speaker Diarization. 
+ """, + WAV_2_VEC_2_START_DOCSTRING, +) +class Wav2Vec2ForAudioFrameClassification(Wav2Vec2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + if hasattr(config, "add_adapter") and config.add_adapter: + raise ValueError( + "Audio frame classification does not support the use of Wav2Vec2 adapters (config.add_adapter=True)" + ) + self.wav2vec2 = Wav2Vec2Model(config) + num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings + if config.use_weighted_layer_sum: + self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.num_labels = config.num_labels + + self.init_weights() + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.wav2vec2.feature_extractor._freeze_parameters() + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. + """ + for param in self.wav2vec2.parameters(): + param.requires_grad = False + + @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_FRAME_CLASS_CHECKPOINT, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_FRAME_EXPECTED_OUTPUT, + ) + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states + + outputs = self.wav2vec2( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = outputs[_HIDDEN_STATES_START_POSITION] + hidden_states = torch.stack(hidden_states, dim=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) + else: + hidden_states = outputs[0] + + logits = self.classifier(hidden_states) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), axis=1)) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class AMSoftmaxLoss(nn.Module): + def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4): + super(AMSoftmaxLoss, self).__init__() + self.scale = scale + self.margin = margin + self.num_labels = num_labels + self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True) + self.loss = nn.CrossEntropyLoss() + + def forward(self, hidden_states, labels): + labels = labels.flatten() + weight = nn.functional.normalize(self.weight, dim=0) + hidden_states = nn.functional.normalize(hidden_states, dim=1) + cos_theta = torch.mm(hidden_states, weight) + psi = cos_theta - self.margin + + onehot = nn.functional.one_hot(labels, self.num_labels) + logits = self.scale * torch.where(onehot.bool(), psi, cos_theta) + loss = self.loss(logits, labels) + + return loss + + +class TDNNLayer(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.tdnn_dim[layer_id - 1] if layer_id > 0 else config.tdnn_dim[layer_id] + self.out_conv_dim = config.tdnn_dim[layer_id] + self.kernel_size = config.tdnn_kernel[layer_id] + self.dilation = config.tdnn_dilation[layer_id] + + self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim) + self.activation = nn.ReLU() + + def forward(self, hidden_states): + hidden_states = hidden_states.unsqueeze(1) + hidden_states = nn.functional.unfold( + hidden_states, + (self.kernel_size, self.in_conv_dim), + stride=(1, self.in_conv_dim), + dilation=(self.dilation, 1), + ) + hidden_states = hidden_states.transpose(1, 2) + hidden_states = self.kernel(hidden_states) + + hidden_states = self.activation(hidden_states) + return hidden_states + + +@add_start_docstrings( + """ + Wav2Vec2 Model with an XVector feature extraction head on top for tasks like Speaker Verification. 
+ """, + WAV_2_VEC_2_START_DOCSTRING, +) +class Wav2Vec2ForXVector(Wav2Vec2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.wav2vec2 = Wav2Vec2Model(config) + num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings + if config.use_weighted_layer_sum: + self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers) + self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0]) + + tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))] + self.tdnn = nn.ModuleList(tdnn_layers) + + self.feature_extractor = nn.Linear(config.tdnn_dim[-1] * 2, config.xvector_output_dim) + self.classifier = nn.Linear(config.xvector_output_dim, config.xvector_output_dim) + + self.objective = AMSoftmaxLoss(config.xvector_output_dim, config.num_labels) + + self.init_weights() + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.wav2vec2.feature_extractor._freeze_parameters() + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. + """ + for param in self.wav2vec2.parameters(): + param.requires_grad = False + + def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]): + """ + Computes the output length of the TDNN layers + """ + + def _conv_out_length(input_length, kernel_size, stride): + # 1D convolutional layer output length formula taken + # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html + return (input_length - kernel_size) // stride + 1 + + for kernel_size in self.config.tdnn_kernel: + input_lengths = _conv_out_length(input_lengths, kernel_size, 1) + + return input_lengths + + @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_XVECTOR_CHECKPOINT, + output_type=XVectorOutput, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_XVECTOR_EXPECTED_OUTPUT, + ) + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + ) -> Union[Tuple, XVectorOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states + + outputs = self.wav2vec2( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = outputs[_HIDDEN_STATES_START_POSITION] + hidden_states = torch.stack(hidden_states, dim=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) + else: + hidden_states = outputs[0] + + hidden_states = self.projector(hidden_states) + + for tdnn_layer in self.tdnn: + hidden_states = tdnn_layer(hidden_states) + + # Statistic Pooling + if attention_mask is None: + mean_features = hidden_states.mean(dim=1) + std_features = hidden_states.std(dim=1) + else: + feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1)) + tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths) + mean_features = [] + std_features = [] + for i, length in enumerate(tdnn_output_lengths): + mean_features.append(hidden_states[i, :length].mean(dim=0)) + std_features.append(hidden_states[i, :length].std(dim=0)) + mean_features = torch.stack(mean_features) + std_features = torch.stack(std_features) + statistic_pooling = torch.cat([mean_features, std_features], dim=-1) + + output_embeddings = self.feature_extractor(statistic_pooling) + logits = self.classifier(output_embeddings) + + loss = None + if labels is not None: + loss = self.objective(logits, labels) + + if not return_dict: + output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return XVectorOutput( + loss=loss, + logits=logits, + embeddings=output_embeddings, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) \ No newline at end of file diff --git a/hyperion/torch/trainers/__init__.py b/hyperion/torch/trainers/__init__.py index c1530608..3c96c84f 100644 --- a/hyperion/torch/trainers/__init__.py +++ b/hyperion/torch/trainers/__init__.py @@ -5,7 +5,12 @@ from .dvae_trainer import DVAETrainer from .torch_trainer import TorchTrainer + + + +from .languageid_trainer import LanguageIDTrainer from .transducer_trainer import TransducerTrainer +from .transducer_languageid_trainer import TransducerLanguageIDTrainer from .vae_trainer import VAETrainer from .vq_dvae_trainer import VQDVAETrainer from .vq_vae_trainer import VQVAETrainer diff --git a/hyperion/torch/trainers/languageid_trainer.py b/hyperion/torch/trainers/languageid_trainer.py new file mode 100644 index 00000000..ef252693 --- /dev/null +++ b/hyperion/torch/trainers/languageid_trainer.py @@ -0,0 +1,239 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import os +from collections import OrderedDict as ODict + +import torch +import torch.nn as nn +import torchaudio +from jsonargparse import ActionParser, ArgumentParser +from torch.distributed.elastic.multiprocessing.errors import record + +from ...utils.misc import filter_func_args +from ..utils import MetricAcc, tensors_subset +from .torch_trainer import TorchTrainer +# from ..losses.focal_loss import FocalLoss +# from torchvision.ops.focal_loss import sigmoid_focal_loss + + 
+class LanguageIDTrainer(TorchTrainer): + """Trainer to train Language identification style models. + + Attributes: + model: Language identification model object. + optim: pytorch optimizer object or options dict + epochs: max. number of epochs + exp_path: experiment output path + cur_epoch: current epoch + grad_acc_steps: gradient accumulation steps to simulate larger batch size. + device: cpu/gpu device + metrics: extra metrics to compute besides cxe. + lrsched: learning rate scheduler object or options dict + loggers: LoggerList object, loggers write training progress to std. output and file. + If None, it uses default loggers. + ddp: if True use distributed data parallel training + ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) + loss: if None, it uses cross-entropy + train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] + use_amp: uses mixed precision training. + log_interval: number of optim. steps between log outputs + use_tensorboard: use tensorboard logger + use_wandb: use wandb logger + wandb: wandb dictionary of options + grad_clip: norm to clip gradients, if 0 there is no clipping + grad_clip_norm: norm type to clip gradients + swa_start: epoch to start doing swa + swa_lr: SWA learning rate + swa_anneal_epochs: SWA learning rate anneal epochs + cpu_offload: CPU offload of gradients when using fully sharded ddp + """ + def __init__( + self, + model, + optim={}, + epochs=100, + exp_path="./train", + cur_epoch=0, + grad_acc_steps=1, + eff_batch_size=None, + device=None, + metrics=None, + lrsched=None, + loggers=None, + ddp=False, + ddp_type="ddp", + loss=None, + train_mode="full", + use_amp=False, + log_interval=10, + use_tensorboard=False, + use_wandb=False, + wandb={}, + grad_clip=0, + grad_clip_norm=2, + swa_start=0, + swa_lr=1e-3, + swa_anneal_epochs=10, + cpu_offload=False, + input_key="x", + target_key="language", + loss_weight=None, + loss_weight_exp=0.5, + ): + + if loss == "CE" or loss is None: + loss = nn.CrossEntropyLoss() + elif loss == "weightedCE": + loss = nn.CrossEntropyLoss(weight=torch.tensor(loss_weight.values, dtype=torch.float).to(device)**(-loss_weight_exp)) + logging.info(torch.tensor(loss_weight.values).to(device)**(-loss_weight_exp)) + elif loss == "focal_loss": + loss = FocalLoss(alpha=torch.tensor(focal_weight.values).to(device)**(-loss_weight_exp), gamma=2, size_average=True) + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) + + @record + def train_epoch(self, data_loader): + """Training epoch loop + + Args: + data_loader: pytorch data loader returning features and class labels. 
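+ Each batch is a dict; `tensors_subset` selects the tensors named by `input_key` and
+ `target_key` (handling device placement) before the forward pass.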
+ """ + batch_keys = [ + self.input_key, self.target_key + ] + + self.model.update_loss_margin(self.cur_epoch) + + metric_acc = MetricAcc(device=self.device) + batch_metrics = ODict() + self.model.train() + + for batch, data in enumerate(data_loader): + self.loggers.on_batch_begin(batch) + + if batch % self.grad_acc_steps == 0: + self.optimizer.zero_grad() + input_data, target = tensors_subset( + data, batch_keys, self.device) + # input_data, input_lengths, target = tensors_subset( + # data, batch_keys, self.device) + batch_size = input_data.shape[0] + + with self.amp_autocast(): + # TODO: Check and Modify output, loss from the model + # output, loss = self.model(data, + # x_lengths=audio_length, + # y=target) + # loss = loss.mean() / self.grad_acc_steps + output = self.model(input_data, y=target) + loss = self.loss(output, target).mean() / self.grad_acc_steps + + if self.use_amp: + self.grad_scaler.scale(loss).backward() + else: + loss.backward() + + if (batch + 1) % self.grad_acc_steps == 0: + if self.lr_scheduler is not None and not self.in_swa: + self.lr_scheduler.on_opt_step() + self.update_model() + + batch_metrics["loss"] = loss.item() * self.grad_acc_steps + for k, metric in self.metrics.items(): + batch_metrics[k] = metric(output, target) + + metric_acc.update(batch_metrics, batch_size) + logs = metric_acc.metrics + logs["lr"] = self._get_lr() + self.loggers.on_batch_end(logs=logs, batch_size=batch_size) + + logs = metric_acc.metrics + logs = ODict(("train_" + k, v) for k, v in logs.items()) + logs["lr"] = self._get_lr() + return logs + + def validation_epoch(self, data_loader, swa_update_bn=False): + """Validation epoch loop + + Args: + data_loader: PyTorch data loader return input/output pairs. + sw_update_bn: wheter or not, update batch-norm layers in SWA. + """ + batch_keys = [ + self.input_key, self.target_key + ] + metric_acc = MetricAcc(self.device) + batch_metrics = ODict() + with torch.no_grad(): + if swa_update_bn: + log_tag = "train_" + self.train() + else: + log_tag = "val_" + self.model.eval() + + for batch, data in enumerate(data_loader): + input_data, target = tensors_subset( + data, batch_keys, self.device) + # input_data, input_lengths, target = tensors_subset( + # data, batch_keys, self.device) + batch_size = input_data.shape[0] + # data, target = data.to(self.device), target.to(self.device) + # batch_size = data.shape[0] + + with self.amp_autocast(): + output = self.model(input_data, y=target) + loss = self.loss(output, target).mean() / self.grad_acc_steps + + # output, loss = self.model(data, + # x_lengths=audio_length, + # y=target) + # output = self.model(data) + # loss = self.loss(output, target) + + batch_metrics["loss"] = loss.mean().item() + for k, metric in self.metrics.items(): + batch_metrics[k] = metric(output, target) + + metric_acc.update(batch_metrics, batch_size) + + logs = metric_acc.metrics + logs = ODict((log_tag + k, v) for k, v in logs.items()) + return logs + + @staticmethod + def filter_args(**kwargs): + args = filter_func_args(LanguageIDTrainer.__init__, kwargs) + return args + + @staticmethod + def add_class_args(parser, prefix=None, train_modes=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + super_skip = skip.copy() + super_skip.add("target_key") + TorchTrainer.add_class_args(parser, + train_modes=train_modes, + skip=super_skip) + if "target_key" not in skip: + parser.add_argument("--target-key", + default="language", + help="dict. 
key for nnet targets") + if "loss" not in skip: + parser.add_argument("--loss", + default=None, + choices=["CE", "weightedCE", "focal_loss"], + help="loss function") + if "loss_weight_exp" not in skip: + parser.add_argument("--loss-weight-exp", + default=0.5, + type=float, + help="focal loss weight exponent") + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index a6f20a8e..4af86b5e 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -21,13 +21,17 @@ from torch.optim.swa_utils import SWALR, AveragedModel from ...utils.misc import filter_func_args -from ..loggers import (CSVLogger, LoggerList, ProgLogger, TensorBoardLogger, - WAndBLogger) +from ..loggers import CSVLogger, LoggerList, ProgLogger, TensorBoardLogger, WAndBLogger from ..lr_schedulers import LRScheduler as LRS from ..lr_schedulers import LRSchedulerFactory as LRSF from ..optim import OptimizerFactory as OF -from ..utils import (FairFullyShardedDDP, FairShardedDDP, MetricAcc, TorchDDP, - tensors_subset) +from ..utils import ( + FairFullyShardedDDP, + FairShardedDDP, + MetricAcc, + TorchDDP, + tensors_subset, +) class DDPType(str, Enum): @@ -72,6 +76,7 @@ class TorchTrainer(object): input_key: dict. key for nnet input. target_key: dict. key for nnet targets. """ + def __init__( self, model, @@ -113,8 +118,9 @@ def __init__( self.exp_path = Path(exp_path) if loggers is None: - self.loggers = self._default_loggers(log_interval, use_tensorboard, - use_wandb, wandb) + self.loggers = self._default_loggers( + log_interval, use_tensorboard, use_wandb, wandb + ) elif isinstance(loggers, list): self.loggers = LoggerList(loggers) else: @@ -149,29 +155,25 @@ def __init__( self.rank = dist.get_rank() self.world_size = dist.get_world_size() if ddp_type == DDPType.DDP or ddp_type == DDPType.OSS_DDP: - self.model = nn.SyncBatchNorm.convert_sync_batchnorm( - self.model) + self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) if self.rank == 0: logging.info( "training in multiple gpus with distributed-data-parallel" ) oss = False if ddp_type == DDPType.DDP else True - self.optimizer = self._make_optimizer(optim, - self.model, - oss=oss) + self.optimizer = self._make_optimizer(optim, self.model, oss=oss) self.model = TorchDDP( - self.model, device_ids=[device], output_device=device, + self.model, + device_ids=[device], + output_device=device, ) elif ddp_type == DDPType.OSS_SHARDED_DDP: - self.model = nn.SyncBatchNorm.convert_sync_batchnorm( - self.model) + self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) if self.rank == 0: logging.info( "training in multiple gpus with fair sharded-distributed-data-parallel" ) - self.optimizer = self._make_optimizer(optim, - self.model, - oss=True) + self.optimizer = self._make_optimizer(optim, self.model, oss=True) self.model = FairShardedDDP(self.model, self.optimizer) else: if self.rank == 0: @@ -184,9 +186,7 @@ def __init__( mixed_precision=self.use_amp, move_params_to_cpu=cpu_offload, ) - self.optimizer = self._make_optimizer(optim, - self.model, - oss=False) + self.optimizer = self._make_optimizer(optim, self.model, oss=False) else: self.optimizer = self._make_optimizer(optim, self.model) @@ -216,9 +216,9 @@ def __init__( if self.rank == 0: logging.info("init SWA model") self.swa_model = AveragedModel(self.model) - self.swa_scheduler = SWALR(self.optimizer, - swa_lr=self.swa_lr, - 
anneal_epochs=self.swa_anneal_epochs) + self.swa_scheduler = SWALR( + self.optimizer, swa_lr=self.swa_lr, anneal_epochs=self.swa_anneal_epochs + ) def set_epoch(self, data_loader): try: @@ -246,14 +246,16 @@ def fit(self, train_data, val_data=None): val_logs = {} self.loggers.on_train_begin(epochs=self.epochs) + if self.cur_epoch == 0: + self.save_checkpoint() + # exit() for epoch in range(self.cur_epoch, self.epochs): self.set_epoch(train_data) self.loggers.on_epoch_begin(epoch, batches=len(train_data)) if self.lr_scheduler is not None: # this is needed by cosine scheduler epoch_updates = int(len(train_data) / self.grad_acc_steps) - self.lr_scheduler.on_epoch_begin(epoch, - epoch_updates=epoch_updates) + self.lr_scheduler.on_epoch_begin(epoch, epoch_updates=epoch_updates) logs = self.train_epoch(train_data) if val_data is not None: @@ -275,8 +277,7 @@ def fit(self, train_data, val_data=None): self.save_checkpoint(logs) if self.in_swa: - self.loggers.on_epoch_begin(self.cur_epoch, - batches=len(train_data)) + self.loggers.on_epoch_begin(self.cur_epoch, batches=len(train_data)) self.model = self.swa_model.module logs = self.bn_update_epoch(train_data) @@ -351,16 +352,16 @@ def validation_epoch(self, data_loader, swa_update_bn=False): with torch.no_grad(): if swa_update_bn: log_tag = "train_" - self.train() + self.model.train() else: log_tag = "val_" self.model.eval() for batch, data in enumerate(data_loader): - input_data, target = tensors_subset(data, batch_keys, self.device) - batch_size = input_data.size(0) + x, target = tensors_subset(data, batch_keys, self.device) + batch_size = x.size(0) with amp.autocast(enabled=self.use_amp): - output = self.model(input_data) + output = self.model(x) loss = self.loss(output, target) batch_metrics["loss"] = loss.mean().item() @@ -381,9 +382,9 @@ def bn_update_epoch(self, data_loader): def _clip_grad_norm(self, model, optim, grad_clip, grad_clip_norm): if self.ddp: if self.ddp_type == DDPType.DDP: - nn.utils.clip_grad_norm_(model.parameters(), - grad_clip, - norm_type=grad_clip_norm) + nn.utils.clip_grad_norm_( + model.parameters(), grad_clip, norm_type=grad_clip_norm + ) return if self.ddp_type == DDPType.FULLY_SHARDED_DDP: # we have to use the member function in FullyShardedDDP class @@ -395,24 +396,26 @@ def _clip_grad_norm(self, model, optim, grad_clip, grad_clip_norm): optim.clip_grad_norm(grad_clip, norm_type=grad_clip_norm) # if no DDP clip normally - nn.utils.clip_grad_norm_(model.parameters(), - grad_clip, - norm_type=grad_clip_norm) + nn.utils.clip_grad_norm_( + model.parameters(), grad_clip, norm_type=grad_clip_norm + ) def update_model(self): """Updates the model and does gradding clipping.""" if self.use_amp: if self.grad_clip > 0: self.grad_scaler.unscale_(self.optimizer) - self._clip_grad_norm(self.model, self.optimizer, - self.grad_clip, self.grad_clip_norm) + self._clip_grad_norm( + self.model, self.optimizer, self.grad_clip, self.grad_clip_norm + ) self.grad_scaler.step(self.optimizer) self.grad_scaler.update() else: if self.grad_clip > 0: - self._clip_grad_norm(self.model, self.optimizer, - self.grad_clip, self.grad_clip_norm) + self._clip_grad_norm( + self.model, self.optimizer, self.grad_clip, self.grad_clip_norm + ) self.optimizer.step() @@ -426,7 +429,9 @@ def _make_optimizer(self, optim, model, oss=False): opt_args["oss"] = oss if self.rank == 0: logging.info("optimizer args={}".format(opt_args)) - optimizer = OF.create(model.parameters(), **opt_args) + + # optimizer = OF.create(model.parameters(), **opt_args) + optimizer = 
OF.create(model.trainable_param_groups(), **opt_args) return optimizer def _make_lr_sched(self, lr_sched, optim): @@ -441,26 +446,27 @@ def _make_lr_sched(self, lr_sched, optim): lr_sched = LRSF.create(optim, **args) return lr_sched - def _default_loggers(self, log_interval, use_tensorboard, use_wandb, - wandb): + def _default_loggers(self, log_interval, use_tensorboard, use_wandb, wandb): """Creates the default data loaders""" prog_log = ProgLogger(interval=log_interval) csv_log = CSVLogger(self.exp_path / "train.log", append=True) loggers = [prog_log, csv_log] if use_tensorboard: loggers.append( - TensorBoardLogger(self.exp_path / "tb", interval=log_interval)) + TensorBoardLogger(self.exp_path / "tb", interval=log_interval) + ) if use_wandb: loggers.append( - WAndBLogger(**wandb, - path=self.exp_path / "wandb", - interval=log_interval)) + WAndBLogger( + **wandb, path=self.exp_path / "wandb", interval=log_interval + ) + ) return LoggerList(loggers) def _get_lr(self): """Returns the current learning rate to show in the loggers""" - for param_group in self.optimizer.param_groups: - return param_group["lr"] + lrs = [param_group["lr"] for param_group in self.optimizer.param_groups] + return max(lrs) def _compute_grad_acc_steps(self, data_loader): if self.eff_batch_size is None: @@ -478,7 +484,8 @@ def _compute_grad_acc_steps(self, data_loader): return self.grad_acc_steps = int( - math.ceil(self.eff_batch_size / batch_size / self.world_size)) + math.ceil(self.eff_batch_size / batch_size / self.world_size) + ) logging.info( "Setting grad_acc_steps=%d for " "eff_batch_size=%d, avg_batch_size=%d, world_size=%d", @@ -502,30 +509,24 @@ def checkpoint(self, logs=None): logs: logs containing the current value of the metrics. """ checkpoint = { - "epoch": - self.cur_epoch, - "rng_state": - torch.get_rng_state(), - "model_cfg": - self.model.get_config(), - "model_state_dict": - self.model.state_dict(), - "optimizer_state_dict": - self.optimizer.state_dict(), - "loss_state_dict": - self.loss.state_dict() if self.loss is not None else None, + "epoch": self.cur_epoch, + "rng_state": torch.get_rng_state(), + "model_cfg": self.model.get_config(), + "model_state_dict": self.model.state_dict(), + "optimizer_state_dict": self.optimizer.state_dict(), + "loss_state_dict": self.loss.state_dict() + if self.loss is not None + else None, } if self.lr_scheduler is not None: - checkpoint[ - "lr_scheduler_state_dict"] = self.lr_scheduler.state_dict() + checkpoint["lr_scheduler_state_dict"] = self.lr_scheduler.state_dict() if logs is not None: checkpoint["logs"] = logs if self.in_swa: checkpoint["swa_model_state_dict"] = self.swa_model.state_dict() - checkpoint[ - "swa_scheduler_state_dict"] = self.swa_scheduler.state_dict() + checkpoint["swa_scheduler_state_dict"] = self.swa_scheduler.state_dict() return checkpoint @@ -535,8 +536,9 @@ def save_checkpoint(self, logs=None): Args: logs: logs containing the current value of the metrics. 
""" - if self.ddp and (self.ddp_type == DDPType.OSS_DDP - or self.ddp_type == DDPType.OSS_SHARDED_DDP): + if self.ddp and ( + self.ddp_type == DDPType.OSS_DDP or self.ddp_type == DDPType.OSS_SHARDED_DDP + ): # Not sure what this does, just copying from the example in # https://github.com/facebookresearch/fairscale/blob/master/benchmarks/oss.py # Check the checkpointing in the case of the OSS optimizer @@ -591,17 +593,16 @@ def load_checkpoint(self, file_path): if self.loss is not None: self.loss.load_state_dict(checkpoint["loss_state_dict"]) if self.lr_scheduler is not None: - self.lr_scheduler.load_state_dict( - checkpoint["lr_scheduler_state_dict"]) + self.lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state_dict"]) # if self.use_amp: # amp.load_state_dict(checkpoint['amp']) if self.do_swa: if "swa_model_state_dict" in checkpoint: - self.swa_model.load_state_dict( - checkpoint["swa_model_state_dict"]) + self.swa_model.load_state_dict(checkpoint["swa_model_state_dict"]) self.swa_scheduler.load_state_dict( - checkpoint["swa_scheduler_state_dict"]) + checkpoint["swa_scheduler_state_dict"] + ) else: self.swa_scheduler = SWALR( self.optimizer, @@ -627,6 +628,7 @@ def load_last_checkpoint(self): for epoch in range(self.epochs, 0, -1): file_path = "%s/model_ep%04d.pth" % (self.exp_path, epoch) if os.path.isfile(file_path): + logging.info("Loading checkpoint %s" % file_path) return self.load_checkpoint(file_path) return None @@ -681,13 +683,9 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): "--eff-batch-size", type=int, default=None, - help= - "effective total batch size, if given, it overrides grad_acc_steps", + help="effective total batch size, if given, it overrides grad_acc_steps", ) - parser.add_argument("--epochs", - type=int, - default=200, - help="number of epochs") + parser.add_argument("--epochs", type=int, default=200, help="number of epochs") if train_modes is not None: parser.add_argument( "--train-mode", @@ -707,19 +705,12 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): default=False, help="use tensorboard logger", ) - parser.add_argument("--use-wandb", - action="store_true", - default=False, - help="use wandb logger") - parser.add_argument("--wandb.project", - default=None, - help="wandb project name") - parser.add_argument("--wandb.group", - default=None, - help="wandb group name") - parser.add_argument("--wandb.name", - default=None, - help="wandb display name") + parser.add_argument( + "--use-wandb", action="store_true", default=False, help="use wandb logger" + ) + parser.add_argument("--wandb.project", default=None, help="wandb project name") + parser.add_argument("--wandb.group", default=None, help="wandb group name") + parser.add_argument("--wandb.name", default=None, help="wandb display name") # parser.add_argument( # '--wandb.path', default=None, # help='wandb directory') @@ -748,10 +739,9 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): default=False, help="CPU offload of gradients when using fully_sharded_ddp", ) - parser.add_argument("--grad-clip", - type=float, - default=0, - help="gradient clipping norm value") + parser.add_argument( + "--grad-clip", type=float, default=0, help="gradient clipping norm value" + ) parser.add_argument( "--grad-clip-norm", default=2, @@ -764,10 +754,9 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): default=0, help="start epoch for SWA, if 0 it does not use SWA", ) - parser.add_argument("--swa-lr", - type=float, - default=1e-3, - 
help="learning rate for SWA phase") + parser.add_argument( + "--swa-lr", type=float, default=1e-3, help="learning rate for SWA phase" + ) parser.add_argument( "--swa-anneal-epochs", type=int, @@ -786,7 +775,6 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): ) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) add_argparse_args = add_class_args diff --git a/hyperion/torch/trainers/transducer_languageid_trainer.py b/hyperion/torch/trainers/transducer_languageid_trainer.py new file mode 100644 index 00000000..d38ab9a9 --- /dev/null +++ b/hyperion/torch/trainers/transducer_languageid_trainer.py @@ -0,0 +1,221 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import os +from collections import OrderedDict as ODict + +import torch +import torch.nn as nn +import torchaudio +from jsonargparse import ActionParser, ArgumentParser +from torch.distributed.elastic.multiprocessing.errors import record + +from ...utils.misc import filter_func_args +from ..utils import MetricAcc, tensors_subset +from .torch_trainer import TorchTrainer + + +class TransducerLanguageIDTrainer(TorchTrainer): + """Trainer to train ASR style models. + + Attributes: + model: ASR model object. + optim: pytorch optimizer object or options dict + epochs: max. number of epochs + exp_path: experiment output path + cur_epoch: current epoch + grad_acc_steps: gradient accumulation steps to simulate larger batch size. + device: cpu/gpu device + metrics: extra metrics to compute besides cxe. + lrsched: learning rate scheduler object or options dict + loggers: LoggerList object, loggers write training progress to std. output and file. + If None, it uses default loggers. + ddp: if True use distributed data parallel training + ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) + loss: if None, it uses cross-entropy + train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] + use_amp: uses mixed precision training. + log_interval: number of optim. steps between log outputs + use_tensorboard: use tensorboard logger + use_wandb: use wandb logger + wandb: wandb dictionary of options + grad_clip: norm to clip gradients, if 0 there is no clipping + grad_clip_norm: norm type to clip gradients + swa_start: epoch to start doing swa + swa_lr: SWA learning rate + swa_anneal_epochs: SWA learning rate anneal epochs + cpu_offload: CPU offload of gradients when using fully sharded ddp + """ + + def __init__( + self, + model, + optim={}, + epochs=100, + exp_path="./train", + cur_epoch=0, + grad_acc_steps=1, + eff_batch_size=None, + device=None, + metrics=None, + lrsched=None, + loggers=None, + ddp=False, + ddp_type="ddp", + loss=None, + train_mode="full", + use_amp=False, + log_interval=10, + use_tensorboard=False, + use_wandb=False, + wandb={}, + grad_clip=0, + grad_clip_norm=2, + swa_start=0, + swa_lr=1e-3, + swa_anneal_epochs=10, + cpu_offload=False, + input_key="x", + target_key=["text", "language"], + ): + + loss = None + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) + + @record + def train_epoch(self, data_loader): + """Training epoch loop + + Args: + data_loader: pytorch data loader returning features and class labels. 
+ """ + batch_keys = [ + self.input_key, f"{self.input_key}_lengths", self.target_key[0], self.target_key[1] + ] + metric_acc = MetricAcc(device=self.device) + batch_metrics = ODict() + self.model.train() + self.sp = data_loader.dataset.sp + + for batch, data in enumerate(data_loader): + self.loggers.on_batch_begin(batch) + + if batch % self.grad_acc_steps == 0: + self.optimizer.zero_grad() + + # # TODO: Check and Modify data, target + # data, audio_length, target = data.to(self.device), audio_length.to( + # self.device), target.to(self.device) + #print(data.keys(), batch_keys, flush=True) + input_data, input_lengths, text, languageid = tensors_subset( + data, batch_keys, self.device) + batch_size = input_data.shape[0] + + with self.amp_autocast(): + output = self.model(input_data, + x_lengths=input_lengths, + text=text, + languageid=languageid) + loss = output.loss + loss = loss.mean() / self.grad_acc_steps + + if self.use_amp: + self.grad_scaler.scale(loss).backward() + else: + loss.backward() + + if (batch + 1) % self.grad_acc_steps == 0: + if self.lr_scheduler is not None and not self.in_swa: + self.lr_scheduler.on_opt_step() + self.update_model() + + for k, v in output.items(): + if "loss" in k and v is not None: + batch_metrics[k] = output[k].item() + for k, metric in self.metrics.items(): + batch_metrics[k] = metric(output["logits"], languageid) + + metric_acc.update(batch_metrics, batch_size) + logs = metric_acc.metrics + logs["lr"] = self._get_lr() + self.loggers.on_batch_end(logs=logs, batch_size=batch_size) + + logs = metric_acc.metrics + logs = ODict(("train_" + k, v) for k, v in logs.items()) + logs["lr"] = self._get_lr() + return logs + + def validation_epoch(self, data_loader, swa_update_bn=False): + """Validation epoch loop + + Args: + data_loader: PyTorch data loader return input/output pairs. + sw_update_bn: wheter or not, update batch-norm layers in SWA. 
+ """ + batch_keys = [ + self.input_key, f"{self.input_key}_lengths", self.target_key[0], self.target_key[1] + ] + metric_acc = MetricAcc(self.device) + batch_metrics = ODict() + with torch.no_grad(): + if swa_update_bn: + log_tag = "train_" + self.train() + else: + log_tag = "val_" + self.model.eval() + + for batch, data in enumerate(data_loader): + + input_data, input_lengths, text, languageid = tensors_subset( + data, batch_keys, self.device) + batch_size = input_data.shape[0] + + # data, audio_length, target = data.to( + # self.device), audio_length.to(self.device), target.to( + # self.device) + # batch_size = data.shape[0] + # data, target = data.to(self.device), target.to(self.device) + # batch_size = data.shape[0] + + with self.amp_autocast(): + output = self.model(input_data, + x_lengths=input_lengths, + text=text, + languageid=languageid) + + for k, v in output.items(): + if "loss" in k and v is not None: + batch_metrics[k] = output[k].item() + + for k, metric in self.metrics.items(): + batch_metrics[k] = metric(output["logits"], languageid) + + metric_acc.update(batch_metrics, batch_size) + + logs = metric_acc.metrics + logs = ODict((log_tag + k, v) for k, v in logs.items()) + return logs + + @staticmethod + def add_class_args(parser, prefix=None, train_modes=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + super_skip = skip.copy() + super_skip.add("target_key") + TorchTrainer.add_class_args(parser, + train_modes=train_modes, + skip=super_skip) + if "target_key" not in skip: + parser.add_argument("--target-keys", + default=["text", "language"], + help="list of dict. key for nnet targets") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index a9a9d98f..a59cbe14 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -88,35 +88,6 @@ def __init__( super_args = filter_func_args(super().__init__, locals()) super().__init__(**super_args) - # super().__init__( - # model, - # loss, - # optim, - # epochs, - # exp_path, - # cur_epoch=cur_epoch, - # grad_acc_steps=grad_acc_steps, - # eff_batch_size=eff_batch_size, - # device=device, - # metrics=metrics, - # lrsched=lrsched, - # loggers=loggers, - # ddp=ddp, - # ddp_type=ddp_type, - # train_mode=train_mode, - # use_amp=use_amp, - # log_interval=log_interval, - # use_tensorboard=use_tensorboard, - # use_wandb=use_wandb, - # wandb=wandb, - # grad_clip=grad_clip, - # grad_clip_norm=grad_clip_norm, - # swa_start=swa_start, - # swa_lr=swa_lr, - # swa_anneal_epochs=swa_anneal_epochs, - # cpu_offload=cpu_offload, - # ) - @record def train_epoch(self, data_loader): """Training epoch loop @@ -130,16 +101,16 @@ def train_epoch(self, data_loader): metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() self.model.train() - for batch, (data, target) in enumerate(data_loader): + for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - input_data, target = tensors_subset(data, batch_keys, self.device) - batch_size = input_data.size(0) + x, target = tensors_subset(data, batch_keys, self.device) + batch_size = x.size(0) with amp.autocast(enabled=self.use_amp): - output = self.model(input_data, y=target) + output = self.model(x, y=target) loss = self.loss(output, target).mean() / 
self.grad_acc_steps if self.use_amp: diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py index 9541d7b0..0f6ccd9b 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -106,13 +106,13 @@ def train_epoch(self, data_loader): if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - input_data, target = tensors_subset(data, batch_keys, self.device) - batch_size = input_data.size(0) + audio, target = tensors_subset(data, batch_keys, self.device) + batch_size = audio.size(0) with torch.no_grad(): - feats = self.feat_extractor(input_data) + feats, feats_lengths = self.feat_extractor(audio) with amp.autocast(enabled=self.use_amp): - output = self.model(feats, y=target) + output = self.model(feats, feats_lengths, y=target) loss = self.loss(output, target).mean() / self.grad_acc_steps if self.use_amp: @@ -159,12 +159,12 @@ def validation_epoch(self, data_loader, swa_update_bn=False): self.model.eval() for batch, data in enumerate(data_loader): - input_data, target = tensors_subset(data, batch_keys, self.device) - batch_size = input_data.size(0) + audio, target = tensors_subset(data, batch_keys, self.device) + batch_size = audio.size(0) - feats = self.feat_extractor(input_data) + feats, feats_lengths = self.feat_extractor(audio) with amp.autocast(enabled=self.use_amp): - output = self.model(feats) + output = self.model(feats, feats_lengths) loss = self.loss(output, target) batch_metrics["loss"] = loss.mean().item() diff --git a/hyperion/torch/utils/ddp.py b/hyperion/torch/utils/ddp.py index 1aefb3d4..4f006c0a 100644 --- a/hyperion/torch/utils/ddp.py +++ b/hyperion/torch/utils/ddp.py @@ -6,19 +6,16 @@ import logging import os -from fairscale.nn.data_parallel import \ - FullyShardedDataParallel as FullyShardedDDP -from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP - import torch import torch.distributed as dist import torch.nn as nn +from fairscale.nn.data_parallel import FullyShardedDataParallel as FullyShardedDDP +from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP from .devices import open_device def add_ddp_args(parser): - parser.add_argument( "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" ) @@ -50,7 +47,6 @@ def filter_ddp_args(**kwargs): def ddp_init( gpu_id, num_gpus, node_id=0, num_nodes=1, master_addr="localhost", master_port=None ): - rank = node_id * num_gpus + gpu_id world_size = num_nodes * num_gpus @@ -62,15 +58,16 @@ def ddp_init( os.environ["MASTER_PORT"] = master_port logging.info( - f"init ddp rank={rank} world_size={world_size} master={master_addr}:{master_port} gpu_id={gpu_id}" + f"init ddp rank={rank} world_size={world_size} master={master_addr}:{master_port} gpu_id={gpu_id}" ) dist.init_process_group( "nccl", rank=rank, world_size=world_size, ) + torch.cuda.set_device(rank) torch.tensor([0]).to(gpu_id) - device = torch.device('cuda', gpu_id) + device = torch.device("cuda", gpu_id) return device, rank, world_size # return gpu_id, rank, world_size diff --git a/hyperion/torch/utils/masking.py b/hyperion/torch/utils/masking.py index fb93b439..934b4b90 100644 --- a/hyperion/torch/utils/masking.py +++ b/hyperion/torch/utils/masking.py @@ -17,9 +17,7 @@ def scale_seq_lengths(lengths, max_out_length, max_in_length=None): if max_in_length == max_out_length: return lengths - return torch.div(lengths * max_out_length, - max_in_length, - rounding_mode="floor") + return 
torch.div(lengths * max_out_length, max_in_length, rounding_mode="floor") def seq_lengths_to_mask(lengths, max_length=None, dtype=None, time_dim=1): @@ -29,7 +27,7 @@ def seq_lengths_to_mask(lengths, max_length=None, dtype=None, time_dim=1): lengths: sequence lengths with shape=(batch,). If None, it returns None max_length: maximum length of the sequence. dtype: dtype for the mask. - time_dim: dimension corresponding to time in the mask. This will + time_dim: dimension > 0 corresponding to time in the mask. This will return a view of the mask which will adapt to the shape of the tensor where we want to apply the mask. This has to be a positive integer. @@ -40,6 +38,7 @@ def seq_lengths_to_mask(lengths, max_length=None, dtype=None, time_dim=1): if lengths is None: return None + assert time_dim > 0 assert lengths.dim() == 1 if max_length is None: diff --git a/hyperion/torch/utils/misc.py b/hyperion/torch/utils/misc.py index b2a3810f..46c09080 100644 --- a/hyperion/torch/utils/misc.py +++ b/hyperion/torch/utils/misc.py @@ -4,8 +4,8 @@ """ import torch -import torch.cuda.amp as amp import torch.nn as nn +import torch.cuda.amp as amp def l2_norm(x, dim=1, axis=None): @@ -104,3 +104,5 @@ def get_selfsim_tarnon(y, return_mask=False): mask = torch.triu(torch.ones_like(y_bin, dtype=torch.bool), diagonal=1) return y_bin, mask + + diff --git a/hyperion/utils/__init__.py b/hyperion/utils/__init__.py index db035987..e8ad5056 100644 --- a/hyperion/utils/__init__.py +++ b/hyperion/utils/__init__.py @@ -3,8 +3,10 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +from .info_table import InfoTable from .class_info import ClassInfo from .dataset import Dataset +from .enrollment_map import EnrollmentMap from .feature_set import FeatureSet from .hyp_dataclass import HypDataClass from .kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix @@ -12,6 +14,7 @@ from .recording_set import RecordingSet from .rttm import RTTM from .scp_list import SCPList + # from .ext_segment_list import ExtSegmentList from .segment_list import SegmentList from .segment_set import SegmentSet diff --git a/hyperion/utils/class_info.py b/hyperion/utils/class_info.py index 70ee82c8..a15e3099 100644 --- a/hyperion/utils/class_info.py +++ b/hyperion/utils/class_info.py @@ -70,8 +70,45 @@ def load(cls, file_path, sep=None): if ext == "": # if no extension we load as kaldi utt2spk file df = pd.read_csv( - file_path, sep=" ", header=None, names=["id"], dtype={"id": np.str}, + file_path, sep=" ", header=None, names=["id"], dtype={"id": str}, ) return cls(df) return super().load(file_path, sep) + + @classmethod + def cat(cls, tables): + """Concatenates several tables. + + Args: + info_lists: List of InfoTables + + Returns: + InfoTable object concatenation the info_lists. 
+ """ + df_list = [table.df for table in tables] + df = pd.concat(df_list) + assert df["id"].is_unique, """there are duplicated ids in original tables""" + if not df["class_idx"].is_unique: + logging.warning( + """class_idx in concat tables are not unique, + we will assign new class_idx""" + ) + df["class_idx"].drop(columns=["class_idx"], inplace=True) + return cls(df) + + def filter( + self, + predicate=None, + items=None, + iindex=None, + columns=None, + by="id", + keep=True, + rebuild_idx=False, + ): + new_class_info = super().filter(predicate, items, iindex, columns, by, keep) + if rebuild_idx: + new_class_info.add_class_idx() + + return new_class_info diff --git a/hyperion/utils/dataset.py b/hyperion/utils/dataset.py index efb7c114..51f0f37a 100644 --- a/hyperion/utils/dataset.py +++ b/hyperion/utils/dataset.py @@ -2,52 +2,323 @@ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - +import logging from pathlib import Path -from typing import Dict, Optional - +from typing import List, Dict, Optional, Union +from copy import deepcopy +import math +import numpy as np +import pandas as pd import yaml +from .info_table import InfoTable from .class_info import ClassInfo from .feature_set import FeatureSet from .misc import PathLike from .recording_set import RecordingSet from .segment_set import SegmentSet +from .enrollment_map import EnrollmentMap +from .trial_key import TrialKey +from .trial_ndx import TrialNdx +from .sparse_trial_key import SparseTrialKey class Dataset: - """ Class that contains all objects - (segments, recordings, features, class_infos) that - conform a dataset + """Class that contains all objects + (segments, recordings, features, class_infos) that + conform a dataset + + Attributes: + segments: SegmentSet object or path to it. + classes: Dictionary of ClassInfo objects or paths to then + recordings: RecordingSet object or paths to then + features: Dictionary of FeatureSet objects or paths to then + enrollments: Dictionary of EnrollmentMap objects or paths to then + trials: Dictionary of TrialKey/TrialNdx/SparseTrialKey objects + or paths to then + sparse_trials: load trial keys using the SparseTrialKey class instead + of TrialKey class. 
+ table_sep: Column separator when reading/writting tables + """ def __init__( self, - segments: SegmentSet, - classes: Optional[Dict[str, ClassInfo]] = None, - recordings: Optional[Dict[str, RecordingSet]] = None, - features: Optional[Dict[str, FeatureSet]] = None, + segments: Union[SegmentSet, PathLike], + classes: Optional[Dict[str, Union[ClassInfo, PathLike]]] = None, + recordings: Optional[Union[RecordingSet, PathLike]] = None, + features: Optional[Dict[str, Union[FeatureSet, PathLike]]] = None, + enrollments: Optional[Dict[str, Union[EnrollmentMap, PathLike]]] = None, + trials: Optional[ + Dict[str, Union[TrialKey, TrialNdx, SparseTrialKey, PathLike]] + ] = None, + sparse_trials: bool = False, + table_sep: Optional[str] = None, ): - self._segments = segments - self._classes = classes - self._recordings = recordings - self._features = features + if isinstance(segments, SegmentSet): + self._segments = segments + self._segments_path = None + else: + assert isinstance(segments, (str, Path)) + self._segments = None + self._segments_path = Path(segments) + + self._classes, self._classes_paths = self._parse_dict_args(classes, ClassInfo) + + if isinstance(recordings, RecordingSet): + self._recordings = recordings + self._recordings_path = None + else: + assert isinstance(recordings, (str, Path)) + self._recordings = None + self._recordings_path = Path(recordings) + + # self._recordings, self._recordings_paths = self._parse_dict_args( + # recordings, RecordingSet + # ) + + self._features, self._features_paths = self._parse_dict_args( + features, FeatureSet + ) + self._enrollments, self._enrollments_paths = self._parse_dict_args( + enrollments, + EnrollmentMap, + ) + self._trials, self._trials_paths = self._parse_dict_args( + trials, + (TrialKey, TrialNdx, SparseTrialKey), + ) + + self.sparse_trials = sparse_trials + self.table_sep = table_sep + self._files_to_delete = [] + + def get_dataset_files(self): + file_paths = [] + for file_path in [self._segments_path, self._recordings_path]: + if file_path is not None: + file_paths.append(file_path) + + for path_dict in [ + self._features_paths, + self._enrollments_paths, + self._trials_paths, + ]: + if path_dict is None: + continue + for k, v in path_dict.items(): + file_paths.append(v) + + return file_paths + + def _delete_files(self, dataset_dir): + if not self._files_to_delete: + return + + dataset_files = self.get_dataset_files() + for file_path in self._files_to_delete: + file_path = Path(file_path) + # if the file has been added again we don't delete + if file_path in dataset_files: + continue + + # if we are saving the dataset to another location + # we don't delete the one in the original + if file_path.parent == dataset_dir and file_path.is_file(): + file_path.unlink() + + def _parse_dict_args(self, data, types): + if data is None: + return None, None + + assert isinstance(data, dict) + objects = {k: (v if isinstance(v, types) else None) for k, v in data.items()} + paths = { + k: (v if isinstance(v, (str, Path)) else None) for k, v in data.items() + } + + return objects, paths + + def clone(self): + return deepcopy(self) + + def segments(self, keep_loaded: bool = True): + if self._segments is None: + assert self._segments_path is not None + segments = SegmentSet.load(self._segments_path, sep=self.table_sep) + if keep_loaded: + self._segments = segments + return segments - @property - def segments(self): return self._segments - @property - def recordings(self): + def __len__(self): + return len(self.segments()) + + def recordings(self, 
keep_loaded: bool = True): + if self._recordings is None: + assert self._recordings_path is not None + recordings = RecordingSet.load(self._recordings_path, sep=self.table_sep) + if keep_loaded: + self._recordings = recordings + return recordings + return self._recordings - @property - def features(self): - return self._features + # def recordings_value(self, key: str, keep_loaded: bool = True): + # if self._recordings[key] is None: + # assert self._recordings_paths[key] is not None + # recordings = RecordingSet.load( + # self._recordings_paths[key], sep=self.table_sep + # ) + # if keep_loaded: + # self._recordings[key] = recordings + # return recordings + + # return self._recordings[key] - @property - def classes(self): - return self._classes + def features_keys(self): + if self._features is not None: + return self._features.keys() + elif self._features_path is not None: + return self._features_path.keys() + else: + return {} + + def features_value(self, key: str, keep_loaded: bool = True): + if self._features[key] is None: + assert self._features_paths[key] is not None + features = FeatureSet.load(self._features_paths[key], sep=self.table_sep) + if keep_loaded: + self._features[key] = features + return features + + return self._features[key] + + def classes_keys(self): + if self._classes is not None: + return self._classes.keys() + elif self._classes_path is not None: + return self._classes_path.keys() + else: + return {} + + def classes_value(self, key: str, keep_loaded: bool = True): + if self._classes[key] is None: + assert self._classes_paths[key] is not None + classes = ClassInfo.load(self._classes_paths[key], self.table_sep) + if keep_loaded: + self._classes[key] = classes + return classes + + return self._classes[key] + + def enrollments_value(self, key: str, keep_loaded: bool = True): + if self._enrollments[key] is None: + assert self._enrollments_paths[key] is not None + enrollments = EnrollmentMap.load( + self._enrollments_paths[key], sep=self.table_sep + ) + if keep_loaded: + self._enrollments[key] = enrollments + return enrollments + + return self._enrollments[key] + + def trials_value(self, key: str, keep_loaded: bool = True): + if self._trials[key] is None: + assert self._trials_paths[key] is not None + try: + if self.sparse_trials: + trials = SparseTrialKey.load(self._trials_paths[key]) + else: + trials = TrialKey.load(self._trials_paths[key]) + except: + trials = TrialNdx.load(self._trials_paths[key]) + + if keep_loaded: + self._trials[key] = trials + return trials + + return self._trials[key] + + # def recordings(self, keep_loaded: bool = True): + # if self._recordings is None: + # yield from () + # else: + # for key in self._recordings.keys(): + # yield key, self.recordings_value(key, keep_loaded) + + def features(self, keep_loaded: bool = True): + if self._features is None: + yield from () + else: + for key in self._features.keys(): + yield key, self.features_value(key, keep_loaded) + + def classes(self, keep_loaded: bool = True): + if self._classes is None: + yield from () + else: + for key in self._classes.keys(): + yield key, self.classes_value(key, keep_loaded) + + def enrollments(self, keep_loaded: bool = True): + if self._enrollments is None: + yield from () + else: + for key in self._enrollments.keys(): + yield key, self.enrollments_value(key, keep_loaded) + + def trials(self, keep_loaded: bool = True): + if self._trials is None: + yield from () + else: + for key in self._trials.keys(): + yield key, self.trials_value(key, keep_loaded) + + # def 
add_recordings(self, recordings: Dict[str, Union[RecordingSet, PathLike]]): + # recordings, recordings_paths = self._parse_dict_args(recordings, RecordingSet) + # if self._recordings is None: + # self._recordings = self._recordings_paths = {} + # self._recordings.update(recordings) + # self._recordings_paths.update(recordings_paths) + + # def add_features(self, features: Dict[str, Union[FeatureSet, PathLike]]): + # features, features_paths = self._parse_dict_args(features, FeatureSet) + # if self._features is None: + # self._features = self._features_paths = {} + # self._features.update(features) + # self._features_paths.update(features_paths) + + # def add_classes(self, classes: Dict[str, Union[ClassInfo, PathLike]]): + # classes, classes_paths = self._parse_dict_args(classes, ClassInfo) + # if self._classes is None: + # self._classes = self._classes_paths = {} + # self._classes.update(classes) + # self._classes_paths.update(classes_paths) + + # def add_enrollments(self, enrollments: Dict[str, Union[EnrollmentMap, PathLike]]): + # enrollments, enrollments_paths = self._parse_dict_args( + # enrollments, + # EnrollmentMap, + # ) + # if self._enrollments is None: + # self._enrollments = self._enrollments_paths = {} + # self._enrollments.update(enrollments) + # self._enrollments_paths.update(enrollments_paths) + + # def add_trials( + # self, trials: Dict[str, Union[TrialKey, TrialNdx, SparseTrialKey, PathLike]] + # ): + # trials, trials_paths = self._parse_dict_args( + # trials, + # (TrialKey, TrialNdx, SparseTrialKey), + # ) + # if self._trials is None: + # self._trials = self._trials_paths = {} + # self._trials.update(trials) + # self._trials_paths.update(trials_paths) @staticmethod def resolve_dataset_path(dataset_path): @@ -64,97 +335,887 @@ def resolve_dataset_path(dataset_path): @staticmethod def resolve_file_path(dataset_dir, file_path): + dataset_dir = Path(dataset_dir) + file_path = Path(file_path) if file_path.is_file(): return file_path return dataset_dir / file_path - def save(self, dataset_path: PathLike): - """Saves all the dataset objects. + def save( + self, + dataset_path: PathLike, + update_paths: bool = True, + table_sep: Optional[str] = None, + force_save_all: bool = False, + ): + """Saves the dataset to disk. Args: - dataset_path: str/Path indicating directory - to save the dataset or .yaml file to save - the dataset info. + dataset_path: str/Path indicating directory + to save the dataset or .yaml file to save + the dataset info. + update_paths: whether to update the file_paths in the + data structures in the DataSet object + force_save_all: forces saving all tables even if they haven't changed, + otherwise, it only saves tables loaded in memory + and those that are not in the datadirectory + """ + if force_save_all: + self.save_all(dataset_path, update_paths, table_sep) + else: + self.save_changed(dataset_path, update_paths, table_sep) + + def save_changed( + self, + dataset_path: PathLike, + update_paths: bool = True, + table_sep: Optional[str] = None, + ): + """Saves the tables that change in disk or tables + that are not in the ouput directory. + Args: + dataset_path: str/Path indicating directory + to save the dataset or .yaml file to save + the dataset info. 
+ update_paths: whether to update the file_paths in the + data structures in the DataSet object """ + table_sep = self.table_sep if table_sep is None else table_sep + if update_paths: + self.table_sep = table_sep + + table_ext = ".tsv" if table_sep == "\t" else ".csv" dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path) dataset = {} - if self.segments is not None: - file_name = "segments.csv" - dataset["segments"] = file_name - file_path = dataset_dir / file_name - self.segments.save(file_path) + file_name = f"segments{table_ext}" + dataset["segments"] = file_name + file_path = dataset_dir / file_name + if ( + self._segments is not None + or file_path != self._segments_path + or not file_path.exists() + ): + self.segments(keep_loaded=False).save(file_path, sep=table_sep) + if update_paths: + self._segments_path = file_path + + file_name = f"recordings{table_ext}" + dataset["recordings"] = file_name + file_path = dataset_dir / file_name + if ( + self._recordings is not None + or file_path != self._recordings_path + or not file_path.exists() + ): + self.recordings(keep_loaded=False).save(file_path, sep=table_sep) + if update_paths: + self._recordings_path = file_path + + # if self._recordings is not None: + # file_names = {} + # for k in self._recordings.keys(): + # file_name = k + table_ext + # file_names[k] = file_name + # file_path = dataset_dir / file_name + # if ( + # self._recordings[k] is not None + # or file_path != self._recordings_paths[k] + # or not file_path.exists() + # ): + # v = self.recordings_value(k, keep_loaded=False) + # v.save(file_path, sep=table_sep) + # if update_paths: + # self._recordings_paths[k] = file_path + + # if file_names: + # dataset["recordings"] = file_names - if self.recordings is not None: + if self._features is not None: file_names = {} - for k, v in self.recordings.items(): - file_name = k + ".csv" + for k in self._features.keys(): + file_name = k + table_ext file_names[k] = file_name file_path = dataset_dir / file_name - v.save(file_path) + if ( + self._features[k] is not None + or file_path != self._features_paths[k] + or not file_path.exists() + ): + v = self.features_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._features_paths[k] = file_path - dataset["recordings"] = file_names + if file_names: + dataset["features"] = file_names - if self.features is not None: + if self._classes is not None: file_names = {} - for k, v in self.features.items(): - file_name = k + ".csv" + for k in self._classes.keys(): + file_name = k + table_ext file_names[k] = file_name file_path = dataset_dir / file_name - v.save(file_path) + if ( + self._classes[k] is not None + or file_path != self._classes_paths[k] + or not file_path.exists() + ): + v = self.classes_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._classes_paths[k] = file_path - dataset["features"] = file_names + if file_names: + dataset["classes"] = file_names - if self.classes is not None: + if self._enrollments is not None: file_names = {} - for k, v in self.classes.items(): - file_name = k + ".csv" + for k in self._enrollments.keys(): + file_name = k + table_ext file_names[k] = file_name file_path = dataset_dir / file_name - v.save(file_path) + if ( + self._enrollments[k] is not None + or file_path != self._enrollments_paths[k] + or not file_path.exists() + ): + v = self.enrollments_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._enrollments_paths[k] = file_path + + 
if file_names: + dataset["enrollments"] = file_names + + if self._trials is not None: + file_names = {} + for k in self._trials.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._trials[k] is not None + or file_path != self._trials_paths[k] + or not file_path.exists() + ): + v = self.trials_value(k, keep_loaded=False) + v.save(file_path) + if update_paths: + self._trials_paths[k] = file_path + + if file_names: + dataset["trials"] = file_names + + with open(dataset_file, "w") as f: + yaml.dump(dataset, f) + + self._delete_files(dataset_dir) + + def save_all( + self, + dataset_path: PathLike, + update_paths: bool = True, + table_sep: Optional[str] = None, + ): + """Saves all the dataset objects. + + Args: + dataset_path: str/Path indicating directory + to save the dataset or .yaml file to save + the dataset info. + update_paths: whether to update the file_paths in the + data structures in the DataSet object + """ + table_sep = self.table_sep if table_sep is None else table_sep + if update_paths: + self.table_sep = table_sep + + table_ext = ".tsv" if table_sep == "\t" else ".csv" + dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path) + dataset = {} + file_name = f"segments{table_ext}" + dataset["segments"] = file_name + file_path = dataset_dir / file_name + self.segments(keep_loaded=False).save(file_path, sep=table_sep) + if update_paths: + self._segments_path = file_path + file_name = f"recordings{table_ext}" + dataset["recordings"] = file_name + file_path = dataset_dir / file_name + self.recordings(keep_loaded=False).save(file_path, sep=table_sep) + if update_paths: + self._recordings_path = file_path + + # file_names = {} + # for k, v in self.recordings(keep_loaded=False): + # file_name = k + table_ext + # file_names[k] = file_name + # file_path = dataset_dir / file_name + # v.save(file_path, sep=table_sep) + # if update_paths: + # self._recordings_paths[k] = file_path + + # if file_names: + # dataset["recordings"] = file_names + + file_names = {} + for k, v in self.features(keep_loaded=False): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + v.save(file_path, sep=table_sep) + if update_paths: + self._features_paths[k] = file_path + + if file_names: + dataset["features"] = file_names + + file_names = {} + for k, v in self.classes(keep_loaded=False): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + v.save(file_path, sep=table_sep) + if update_paths: + self._classes_paths[k] = file_path + + if file_names: dataset["classes"] = file_names + file_names = {} + for k, v in self.enrollments(keep_loaded=False): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + v.save(file_path, sep=table_sep) + if update_paths: + self._enrollments_paths[k] = file_path + + if file_names: + dataset["enrollments"] = file_names + + file_names = {} + for k, v in self.trials(keep_loaded=False): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + v.save(file_path) + if update_paths: + self._trials_paths[k] = file_path + + if file_names: + dataset["trials"] = file_names + with open(dataset_file, "w") as f: yaml.dump(dataset, f) + self._delete_files(dataset_dir) + + def update_from_disk(self): + self.segments() + self.recordings() + # for k, v in self.recordings(): + # pass + + for k, v in self.features(): + pass + + for k, v in self.classes(): + pass + + for 
k, v in self.enrollments(): + pass + + for k, v in self.trials(): + pass + @classmethod - def load(cls, dataset_path: PathLike): + def load( + cls, dataset_path: PathLike, lazy: bool = True, sparse_trials: bool = False + ): """Loads all the dataset objects. Args: - dataset_path: str/Path indicating directory - to save the dataset or .yaml file to save + dataset_path: str/Path indicating directory + to save the dataset or .yaml file to save the dataset info. + lazy: load data structures lazily when they are needed. + sparse_trials: load trial keys using the SparseTrialKey class instead of TrialKey class """ dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path) - with open(dataset_file, "w") as f: + with open(dataset_file, "r") as f: dataset = yaml.safe_load(f) assert "segments" in dataset - segments = SegmentSet.load( - Dataset.resolve_file_path(dataset_dir, dataset["segments"]) - ) + segments = Dataset.resolve_file_path(dataset_dir, dataset["segments"]) classes = None recordings = None features = None + enrollments = None + trials = None if "classes" in dataset: classes = {} - for k, v in dataset["classes"]: - classes[k] = ClassInfo.load(Dataset.resolve_file_path(dataset_dir, v)) + for k, v in dataset["classes"].items(): + classes[k] = Dataset.resolve_file_path(dataset_dir, v) if "recordings" in dataset: - recordings = {} - for k, v in dataset["recordings"]: - recordings[k] = RecordingSet.load( - Dataset.resolve_file_path(dataset_dir, v) - ) + recordings = Dataset.resolve_file_path(dataset_dir, dataset["recordings"]) + # recordings = {} + # for k, v in dataset["recordings"].items(): + # recordings[k] = Dataset.resolve_file_path(dataset_dir, v) if "features" in dataset: features = {} - for k, v in dataset["features"]: - features[k] = FeatureSet.load(Dataset.resolve_file_path(dataset_dir, v)) + for k, v in dataset["features"].items(): + features[k] = Dataset.resolve_file_path(dataset_dir, v) + + if "enrollments" in dataset: + enrollments = {} + for k, v in dataset["enrollments"].items(): + enrollments[k] = Dataset.resolve_file_path(dataset_dir, v) + + if "trials" in dataset: + trials = {} + for k, v in dataset["trials"].items(): + trials[k] = Dataset.resolve_file_path(dataset_dir, v) + + dataset = cls( + segments, + classes, + recordings, + features, + enrollments, + trials, + sparse_trials=sparse_trials, + ) + if not lazy: + dataset.update_from_disk() + + return dataset + + def add_features(self, features_name: str, features: Union[PathLike, FeatureSet]): + if self._features is None: + self._features = {} + self._features_paths = {} + + if isinstance(features, (str, Path)): + self._features[features_name] = None + self._features_paths[features_name] = features + elif isinstance(features, FeatureSet): + self._features[features_name] = features + self._features_paths[features_name] = None + else: + raise ValueError() + + def set_segments( + self, + segments: Union[PathLike, SegmentSet], + ): + if isinstance(segments, (str, Path)): + self._segments = None + self._segments_path = segments + elif isinstance(segments, SegmentSet): + self._segments = segments + self._segments_path = None + else: + raise ValueError() + + def set_recordings( + self, + recordings: Union[PathLike, RecordingSet], + update_seg_durs: bool = False, + ): + if isinstance(recordings, (str, Path)): + self._recordings = None + self._recordings_path = Path(recordings) + elif isinstance(recordings, RecordingSet): + self._recordings = recordings + self._recordings_path = None + else: + raise ValueError() + + if 
update_seg_durs: + rec_ids = self.segments(keep_loaded=True).recordings() + self.segments()["duration"] = self.recordings().loc[rec_ids, "duration"] + + def add_classes(self, classes_name: str, classes: Union[PathLike, ClassInfo]): + if self._classes is None: + self._classes = {} + self._classes_paths = {} + + if isinstance(classes, (str, Path)): + self._classes[classes_name] = None + self._classes_paths[classes_name] = Path(classes) + elif isinstance(classes, ClassInfo): + self._classes[classes_name] = classes + self._classes_paths[classes_name] = None + else: + raise ValueError() + + def add_enrollments( + self, + enrollments_name: str, + enrollments: Union[PathLike, EnrollmentMap], + ): + if self._enrollments is None: + self._enrollments = {} + self._enrollments_paths = {} + + if isinstance(enrollments, (str, Path)): + self._enrollments[enrollments_name] = None + self._enrollments_paths[enrollments_name] = Path(enrollments) + elif isinstance(enrollments, EnrollmentMap): + self._enrollments[enrollments_name] = enrollments + self._enrollments_paths[enrollments_name] = None + else: + raise ValueError() + + def add_trials( + self, + trials_name: str, + trials: Union[PathLike, TrialKey, TrialNdx, SparseTrialKey], + ): + if self._trials is None: + self._trials = {} + self._trials_paths = {} + + if isinstance(trials, (str, Path)): + self._trials[trials_name] = None + self._trials_paths[trials_name] = Path(trials) + elif isinstance(trials, (TrialKey, TrialNdx, SparseTrialKey)): + self._trials[trials_name] = trials + self._trials_paths[trials_name] = None + else: + raise ValueError() + + def remove_features(self, features_name: str): + if self._features_paths[features_name] is not None: + self._files_to_delete.append(self._features_paths[features_name]) + + del self._features[features_name] + del self._features_paths[features_name] + + def remove_recordings( + self, + ): + if self._recordings_path is not None: + self._files_to_delete.append(self._recordings_path) + + self._recordings = None + self._recordings_path = None + + # def remove_recordings( + # self, + # recordings_name: str, + # ): + # if self._recordingsr_paths[recordings_name] is not None: + # file_path = Path(self._recordings_paths[recordings_name]) + # if file_path.is_file(): + # file_path.unlink() + + # del self._recordings[recordings_name] + # del self._recordings_paths[recordings_name] + + def remove_classes(self, classes_name: str): + if self._classes_paths[classes_name] is not None: + self._files_to_delete.append(self._class_paths[class_name]) + + del self._classes[classes_name] + del self._classes_paths[classes_name] + + def remove_enrollments( + self, + enrollments_name: str, + ): + if self._enrollments_paths[enrollments_name] is not None: + self._files_to_delete.append(self._enrollments_paths[enrollments_name]) + + del self._enrollments[enrollments_name] + del self._enrollments_paths[enrollments_name] + + def remove_trials( + self, + trials_name: str, + ): + if self._trials_paths[trials_name] is not None: + self._files_to_delete.append(self._trials_paths[trials_name]) + + del self._trials[trials_name] + del self._trials_paths[trials_name] + + def add_cols_to_segments( + self, + right_table: Union[InfoTable, pd.DataFrame, PathLike], + column_names: Union[None, str, List[str], np.ndarray] = None, + on: Union[str, List[str], np.ndarray] = "id", + right_on: Union[None, str, List[str], np.ndarray] = None, + ): + if isinstance(right_table, (str, Path)): + file_path = Path(right_table) + if file_path.is_file(): + right_table = 
InfoTable.load(file_path) + else: + if right_table == "recordings": + right_table = self.recordings() + elif right_table in self.features_keys(): + right_table = self.features_value(right_table) + elif right_table in self.classes_keys(): + right_table = self.classes_value + else: + raise ValueError("%s not found", right_table) + + segments = self.segments(keep_loaded=True) + segments.add_columns(right_table, column_names, on=on, right_on=right_on) + + def clean(self, rebuild_class_idx=False): + rec_ids = self.segments().recordings() + # for k, table in self.recordings(): + # # table = table.loc[table["id"].isin(rec_ids)].copy() + # # self._recordings[k] = RecordingSet(table) + self._recordings = self.recordings().filter(lambda df: df["id"].isin(rec_ids)) + + ids = self.segments()["id"].values + for k, table in self.features(): + self._features[k] = table.filter(lambda df: df["id"].isin(ids)) + # table = table.loc[table["id"].isin(ids)].copy() + # self._features[k] = FeatureSet(table) + + for k, table in self.classes(): + class_ids = self.segments()[k].unique() + self._classes[k] = table.filter(lambda df: df["id"].isin(class_ids)) + # table = table[table["id"].isin(class_ids)].copy() + # self._classes[k] = ClassInfo(table) + + remove_keys = [] + for k, table in self.enrollments(): + # table = table.loc[table["segmentid"].isin(ids)].copy() + table = table.filter(lambda df: df["segmentid"].isin(ids)) + if len(table) > 0: + self._enrollments[k] = table + else: + remove_keys.append(k) + + for k in remove_keys: + self.remove_enrollments(k) + + remove_keys = [] + for k, key in self.trials(): + keep_ids = [cur_id for cur_id in key.seg_set if cur_id in ids] + if keep_ids: + key = key.filter(key.model_set, keep_ids, keep=True) + self._trials[k] = key + else: + remove_keys.append(k) + + for k in remove_keys: + self.remove_trials(k) + + def _split_into_trials_and_cohort( + self, + segments: SegmentSet, + num_tar_trials: int, + num_trial_speakers: int, + seed: int, + ): + # select test speakers + rng = np.random.default_rng(seed=seed) + + spks = segments["speaker"].unique() + trial_spks = rng.choice(spks, size=(num_trial_speakers,), replace=False) + snorm_segments = SegmentSet(segments[~segments["speaker"].isin(trial_spks)]) + + trial_segments = segments[segments["speaker"].isin(trial_spks)] + # solution of 2nd degree eq. + # num_spks * n (n-1) /2 = num_trials + num_segs_per_spk = int( + math.ceil((1 + math.sqrt(1 + 8 * num_tar_trials // num_trial_speakers)) / 2) + ) + + n = num_trial_speakers * num_segs_per_spk + seg_ids = rng.choice(trial_segments["id"], size=(n,), replace=False) + trial_segments = SegmentSet(segments[segments["id"].isin(seg_ids)]) + seg_ids = trial_segments["id"].values + class_ids = trial_segments["speaker"].values + tar = np.zeros((n - 1, n), dtype=bool) + non = np.zeros((n - 1, n), dtype=bool) + + ntar = 0 + nnon = 0 + for i in range(n - 1): + for j in range(i + 1, n): + if class_ids[i] == class_ids[j]: + tar[i, j] = True + else: + non[i, j] = True + + logging.info("Got ntar=%d and nnon=%d", tar.sum(), non.sum()) + trials = TrialKey(seg_ids[:-1], seg_ids, tar, non) + df_enr = pd.DataFrame({"id": seg_ids[:-1], "segmentid": seg_ids[:-1]}) + enrollments = EnrollmentMap(df_enr) + return trials, enrollments, snorm_segments + + def split_into_trials_and_cohort( + self, + num_1k_tar_trials: int, + num_trial_speakers: int, + intra_gender: bool = True, + trials_name="trials_qmf", + seed=1123, + ): + """When training quality measure fusion in, e.g., VoxCeleb recipe. 
+ We split the data into 2 parts: + 1) used to calculate SV scores to train the fusion + 2) cohort used to calculate the S-Norm parameters used in the QMF. + + The trials_file will be stored in the current dataset + A new dataset is created with only the cohort speakers + + Args: + num_1k_tar_trials: num of 1000 target trials. + num_trial_speakers: number of spks used to create trials. + intra_gender: if True, no cross gender trials are done. + + Returns: + Dataset used for trials with trial list. + Dataset used for cohort. + """ + num_tar_trials = num_1k_tar_trials * 1000 + if intra_gender: + num_tar_trials = num_tar_trials // 2 + num_trial_speakers = num_trial_speakers // 2 + segments = self.segments() + segments_male = SegmentSet(segments[segments["gender"] == "m"]) + segments_female = SegmentSet(segments[segments["gender"] == "f"]) + trials_male, enroll_male, cohort_male = self._split_into_trials_and_cohort( + segments_male, + num_tar_trials, + num_trial_speakers, + seed, + ) + ( + trials_female, + enroll_female, + cohort_female, + ) = self._split_into_trials_and_cohort( + segments_female, + num_tar_trials, + num_trial_speakers, + seed, + ) + trials = TrialKey.merge([trials_male, trials_female]) + enroll = EnrollmentMap.cat([enroll_male, enroll_female]) + cohort = SegmentSet.cat([cohort_male, cohort_female]) + else: + segments = self.segments() + trials, enroll, cohort = self._split_into_trials_and_cohort( + segments, + num_tar_trials, + num_trial_speakers, + seed, + ) + + dataset_trials = self.clone() + segments = self.segments() + trials_segments = SegmentSet(segments.loc[segments["id"].isin(trials.seg_set)]) + dataset_trials.set_segments(trials_segments) + dataset_trials.add_trials("trials", trials) + dataset_trials.add_enrollments("enrollments", enroll) + dataset_trials.clean() + + dataset_cohort = self.clone() + dataset_cohort.set_segments(cohort) + dataset_cohort.clean() + + return dataset_trials, dataset_cohort + + def remove_short_segments(self, min_length: float, length_name: str = "duration"): + segments = self.segments() + self._segments = segments.filter(lambda df: df[length_name] >= min_length) + self.clean() + + def remove_classes_few_segments( + self, + class_name: str, + min_segs: int, + rebuild_idx: bool = False, + ): + segments = self.segments() + classes, counts = np.unique(segments[class_name], return_counts=True) + keep_classes = classes[counts >= min_segs] + self._segments = segments.filter(lambda df: df[class_name].isin(keep_classes)) + self.clean() + if rebuild_idx: + class_info = self.classes_value(class_name) + class_info.add_class_idx() + + def rebuild_class_idx(self, class_name: str): + class_info = self.classes_value(class_name) + class_info.add_class_idx() + + def _segments_split(self, val_prob: float, rng: np.random.Generator): + segments = self.segments() + p = rng.permutation(len(segments)) + num_train = int(round((1 - val_prob) * len(p))) + + train_idx = p[:num_train] + train_segs = segments.filter(iindex=train_idx) + train_segs.sort() + + val_idx = p[num_train:] + val_segs = segments.filter(iindex=val_idx) + val_segs.sort() + + return train_segs, val_segs + + def _segments_split_joint_classes( + self, + val_prob: float, + joint_classes: List[str], + min_train_samples: int, + rng: np.random.Generator, + ): + segments = self.segments() + classes = segments[joint_classes].apply("-".join, axis=1) + u_classes, class_ids = np.unique(classes, return_inverse=True) + train_mask = np.zeros(len(segments), dtype=bool) + kk = 0 + for c_id in range(len(u_classes)): 
+ idx = (class_ids == c_id).nonzero()[0] + count = len(idx) + p = rng.permutation(count) + num_train = max( + int(round((1 - val_prob) * count)), min(min_train_samples, count) + ) + kk += count - num_train + train_idx = idx[p[:num_train]] + train_mask[train_idx] = True + + train_idx = train_mask.nonzero()[0] + train_segs = segments.filter(iindex=train_idx) + train_segs.sort() + + val_segs = segments.filter(iindex=train_idx, keep=False) + val_segs.sort() + + return train_segs, val_segs + + def _segments_split_disjoint_classes( + self, + val_prob: float, + disjoint_classes: List[str], + rng: np.random.Generator, + ): + segments = self.segments() + classes = segments[disjoint_classes].apply("-".join, axis=1) + u_classes, class_ids = np.unique(classes, return_inverse=True) + p = rng.permutation(len(u_classes)) + class_ids = p[class_ids] + num_train = int(round((1 - val_prob) * len(segments))) + train_mask = np.zeros(len(segments), dtype=bool) + count_acc = 0 + for c_id in range(len(u_classes)): + idx = (class_ids == c_id).nonzero()[0] + train_mask[idx] = True + count = len(idx) + count_acc += count + if count_acc >= num_train: + break + + train_idx = train_mask.nonzero()[0] + train_segs = segments.filter(iindex=train_idx) + train_segs.sort() + + val_segs = segments.filter(iindex=train_idx, keep=False) + val_segs.sort() + + return train_segs, val_segs + + def _segments_split_joint_and_disjoint_classes( + self, + val_prob: float, + joint_classes: List[str], + disjoint_clases: List[str], + min_train_samples: int, + rng: np.random.Generator, + ): + raise NotImplementedError("I'll implement this when I need it") + segments = self.segments() + j_classes = segments[joint_classes].apply("-".join, axis=1) + ju_classes, j_class_ids = np.unique(j_classes, return_inverse=True) + d_classes = segments[disjoint_classes].apply("-".join, axis=1) + du_classes, d_class_ids = np.unique(d_classes, return_inverse=True) + d_p = rng.permutation(len(du_classes)) + d_class_ids = d_p[d_class_ids] + d_sort_idx = np.argsort(d_class_ids) + d_sort_j_class_ids = j_class_ids[d_sort_idx] + + train_d_classes = set() + for c_id in range(len(ju_classes)): + idx = (j_sort_class_ids == c_id).nonzero()[0] + count = len(idx) + num_train = max( + int(round((1 - val_prob) * count)), min(min_train_samples, count) + ) + sel_d_class_ids = set(d_sort_idx[:num_train]) + train_d_classes = train_d_classes.union(sel_d_class_ids) + + train_mask = np.zeros(len(segments), dtype=bool) + for c_id in train_d_classes: + mask = d_class_ids == c_id + train_mask[mask] = True + + train_idx = train_mask.nonzero()[0] + train_segs = segments.filter(iindex=train_idx) + train_segs.sort() + + val_segs = segments.filter(iindex=train_idx, keep=False) + val_segs.sort() + + return train_segs, val_segs + + def split_train_val( + self, + val_prob: float, + joint_classes: Optional[List[str]] = None, + disjoint_classes: Optional[List[str]] = None, + min_train_samples: int = 1, + seed: int = 11235813, + ): + rng = np.random.default_rng(seed) + if joint_classes is None and disjoint_classes is None: + train_segs, val_segs = self._segments_split(val_prob, rng) + elif joint_classes is not None and disjoint_classes is None: + train_segs, val_segs = self._segments_split_joint_classes( + val_prob, + joint_classes, + min_train_samples, + rng, + ) + elif joint_classes is None and disjoint_classes is not None: + train_segs, val_segs = self._segments_split_disjoint_classes( + val_prob, + disjoint_classes, + rng, + ) + else: + train_segs, val_segs = 
self._segments_split_joint_and_disjoint_classes( + val_prob, + joint_classes, + disjoint_classes, + min_train_samples, + rng, + ) + + train_ds = self.clone() + train_ds.set_segments(train_segs) + train_ds.clean() + + val_ds = self.clone() + val_ds.set_segments(val_segs) + val_ds.clean() - return cls(segments, classes, recordings, features) + return train_ds, val_ds diff --git a/hyperion/utils/enrollment_map.py b/hyperion/utils/enrollment_map.py new file mode 100644 index 00000000..4af69144 --- /dev/null +++ b/hyperion/utils/enrollment_map.py @@ -0,0 +1,101 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import re +from collections import OrderedDict +from copy import deepcopy +from pathlib import Path + +import numpy as np +import pandas as pd + +from .list_utils import split_list, split_list_group_by_key +from .info_table import InfoTable + + +class EnrollmentMap(InfoTable): + """Class to store the mapping between enrollment id + and segmentids + """ + + def __init__(self, df): + if "modelid" in df: + df.rename(columns={"modelid": "id"}, inplace=True) + assert "segmentid" in df + super().__init__(df) + + def split(self, idx, num_parts): + """Splits the mapping into num_parts and return part idx. + + Args: + idx: Part to return from 1 to num_parts. + num_parts: Number of parts to split the list. + group_by: All the lines with the same value in column + groub_by_field go to the same part + + Returns: + Sub InfoTable object + """ + _, idx1 = split_list_group_by_key(self.df["id"], idx, num_parts) + + df = self.df.iloc[idx1] + return EnrollmentMap(df) + + def save(self, file_path, sep=None, nist_compatible=True): + if nist_compatible: + # For compatibility with NIST SRE files the index column "id" + # is saved as modelid + self.df.rename(columns={"id": "modelid"}, inplace=True) + + super().save(file_path, sep) + if nist_compatible: + self.df.rename(columns={"modelid": "id"}, inplace=True) + + @classmethod + def load(cls, file_path, sep=None): + """Loads EnrollmentMap from file. + + Args: + file_path: File to read the list. + sep: Separator between the key and file_path in the text file. + dtype: Dictionary with the dtypes of each column. + name: name for the data to be loaded + Returns: + EnrollmentMap object + """ + file_path = Path(file_path) + ext = file_path.suffix + if ext in ["", ".scp"]: + # if no extension we load as kaldi utt2spk file + df = pd.read_csv( + file_path, + sep=" ", + header=None, + names=["segmentid", "modelid"], + dtype={"segmentid": np.str, "modelid": np.str}, + ) + df = df[["modelid", "segmentid"]] + else: + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + + return cls(df) + + @classmethod + def cat(cls, tables): + """Concatenates several tables. + + Args: + info_lists: List of InfoTables + + Returns: + InfoTable object concatenation the info_lists. 
+ """ + df_list = [table.df for table in tables] + df = pd.concat(df_list) + return cls(df) diff --git a/hyperion/utils/feature_set.py b/hyperion/utils/feature_set.py index 2b2f0aaf..7e40dfd6 100644 --- a/hyperion/utils/feature_set.py +++ b/hyperion/utils/feature_set.py @@ -9,6 +9,7 @@ import pandas as pd from .info_table import InfoTable +from .misc import PathLike class FeatureSet(InfoTable): @@ -16,6 +17,9 @@ def __init__(self, df): super().__init__(df) assert "storage_path" in df + def add_prefix_to_storage_path(self, prefix: PathLike): + self.df["storge_path"] = self.df["storage_path"].apply(lambda x: f"{prefix}{x}") + def save(self, file_path, sep=None): """Saves info table to file @@ -31,14 +35,14 @@ def save(self, file_path, sep=None): from .scp_list import SCPList offset = self.df["storage_byte"] if "storage_byte" in self.df else None - range = None + range_spec = None if "start" and "num_frames" in self.df: - range = [ + range_spec = [ np.array([s, n], dtype=np.int64) for s, n in self.df[["start", "num_frames"]] ] scp = SCPList( - self.df["id"].values, self.df["storage_path"].values, offset, range + self.df["id"].values, self.df["storage_path"].values, offset, range_spec ) scp.save(file_path) return @@ -67,9 +71,9 @@ def load(cls, file_path, sep=None): if scp.offset is not None: df["storage_byte"] = scp.offset - if scp.range is not None: - df["start"] = [r[0] for r in scp.range] - df["num_frames"] = [r[0] for r in scp.range] + if scp.range_spec is not None: + df["start"] = [r[0] for r in scp.range_spec] + df["num_frames"] = [r[1] for r in scp.range_spec] return cls(df) diff --git a/hyperion/utils/fold_list.py b/hyperion/utils/fold_list.py index f22263cf..80b818d6 100644 --- a/hyperion/utils/fold_list.py +++ b/hyperion/utils/fold_list.py @@ -176,7 +176,7 @@ def create( FoldList object. """ if shuffle: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) if group_by_key is None: group_by_key = segment_key diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index a3a1da27..eed973fb 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -8,6 +8,7 @@ from collections import OrderedDict from copy import deepcopy from pathlib import Path +from typing import Optional, Union, List import numpy as np import pandas as pd @@ -22,6 +23,7 @@ class InfoTable: Attributes: df: pandas dataframe. """ + def __init__(self, df): self.df = df assert "id" in df, f"info_table={df}" @@ -118,7 +120,7 @@ def from_dict(cls, df_dict): @classmethod def load(cls, file_path, sep=None, name="class_id"): - """Loads utt2info list from text file. + """Loads table from file. Args: file_path: File to read the list. @@ -126,7 +128,7 @@ def load(cls, file_path, sep=None, name="class_id"): dtype: Dictionary with the dtypes of each column. name: name for the data to be loaded Returns: - Utt2Info object + InfoTable object """ file_path = Path(file_path) ext = file_path.suffix @@ -138,8 +140,8 @@ def load(cls, file_path, sep=None, name="class_id"): header=None, names=["id", name], dtype={ - "id": np.str, - name: np.str + "id": str, + name: str }, ) else: @@ -158,52 +160,77 @@ def sort(self, column="id", ascending=True): self.df.sort_values(by=column, inplace=True, ascending=ascending) def split(self, idx, num_parts, group_by=None): - """Splits SCPList into num_parts and return part idx. + """Splits the table into num_parts and return part idx. Args: idx: Part to return from 1 to num_parts. num_parts: Number of parts to split the list. 
- group_by_field: All the lines with the same value in column + group_by: All the lines with the same value in column groub_by_field go to the same part Returns: - Sub Utt2Info object + Sub InfoTable object """ - if group_by is None: + if group_by is None or group_by == "id": _, idx1 = split_list(self.df["id"], idx, num_parts) else: - _, idx1 = split_list_group_by_key(self.df[group_by], idx, - num_parts) + _, idx1 = split_list_group_by_key(self.df[group_by], idx, num_parts) df = self.df.iloc[idx1] return self.__class__(df) @classmethod - def merge(cls, tables): - """Merges several Utt2Info tables. + def cat(cls, tables): + """Concatenates several tables. Args: - info_lists: List of Utt2Info + info_lists: List of InfoTables Returns: - Utt2Info object concatenation the info_lists. + InfoTable object concatenation the info_lists. """ df_list = [table.df for table in tables] df = pd.concat(df_list) + assert df[ + "id" + ].is_unique, """there are duplicated ids in the tables we are concatenating""" return cls(df) - def filter(self, - items=None, - iindex=None, - columns=None, - by="id", - keep=True): - assert (items is None or iindex is None - ), "items and iindex cannot be not None at the same time" + def filter( + self, predicate=None, items=None, iindex=None, columns=None, by="id", keep=True + ): + """Filters the table and produce a new table with the elements to keep + + Args: + predicate: callable function that defines the filtering criterion e.g.: + lambda df: df["duration"] > 1.0. + items: filters the table based in column value with pandas command: + df.loc[items, by], used only if predicate is None + iindex: filters the table based on integer index with pandas command: + df.iloc[iiindex], used if predicate and items are None + columns: columns to keep of remove. + by: column id to use with itmes criterion + keep: if True, the criterion is used to keep rows, if False it is used + to remove rows + + Returns + InfoTable of the same class as the input. + """ + assert ( + predicate is not None + or items is not None + or iindex is not None + or columns is not None + ), "predicate, items, iindex and columns cannot be not None at the same time" df = self.df + if predicate is not None: + mask = predicate(self.df) + if not keep: - if items is not None: + if predicate is not None: + mask = np.logical_not(mask) + elif items is not None: items = np.setdiff1d(df[by], items) elif iindex is not None: iindex = np.setdiff1d(np.arange(len(df)), iindex) @@ -211,7 +238,12 @@ def filter(self, if columns is not None: columns = np.setdiff1d(df.columns, columns) - if items is not None: + if predicate is not None: + if columns is None: + df = df.loc[mask] + else: + df = df.loc[mask, columns] + elif items is not None: if by != "id": missing = [False if v in df[by] else True for v in items] if any(missing): @@ -229,7 +261,7 @@ def filter(self, if columns is not None: df = df[columns] - return self.__class__(df) + return self.__class__(df.copy()) def __eq__(self, other): """Equal operator""" @@ -259,7 +291,7 @@ def shuffle(self, seed=1024, rng=None): Index used to shuffle the list. 
""" if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) index = np.arange(len(self.df)) rng.shuffle(index) self.df = self.df.iloc[index] @@ -283,14 +315,33 @@ def get_loc(self, keys): loc = self.df.index.get_loc(keys) if isinstance(loc, int): return loc - elif isinstance(loc, np.ndarray) and loc.dtype == np.bool: + + if isinstance(loc, np.ndarray) and loc.dtype == np.bool: return np.nonzero(loc)[0] - else: - return list(range(loc.start, loc.stop, loc.step)) + + return list(range(loc.start, loc.stop, loc.step)) def get_col_idx(self, keys): return self.df.columns.get_loc(keys) + def add_columns( + self, + right_table, + column_names: Union[None, str, List[str], np.ndarray] = None, + on: Union[str, List[str], np.ndarray] = "id", + right_on: Union[None, str, List[str], np.ndarray] = None, + ): + if isinstance(right_table, InfoTable): + right_table = right_table.df + + if column_names is not None: + right_table = right_table[column_names] + + if right_on is None: + right_on = on + + self.df = self.df.merge(right_table, how="left", left_on=on, right_on=right_on) + # def __len__(self): # """Returns the number of elements in the list.""" diff --git a/hyperion/utils/math.py b/hyperion/utils/math_funcs.py similarity index 93% rename from hyperion/utils/math.py rename to hyperion/utils/math_funcs.py index 84596f7d..5ee510b9 100644 --- a/hyperion/utils/math.py +++ b/hyperion/utils/math_funcs.py @@ -346,10 +346,26 @@ def int2onehot(class_ids, num_classes=None): return p -def cosine_scoring(x1, x2): +def average_vectors(x, ids): + assert x.shape[0] == len(ids) + num_ids = np.max(ids) + 1 + x_avg = np.zeros((num_ids, x.shape[1]), dtype=x.dtype) + for i in range(num_ids): + mask = ids == i + x_avg[i] = np.mean(x[mask], axis=0) - l2_1 = np.sqrt(np.sum(x1 ** 2, axis=-1, keepdims=True)) - l2_2 = np.sqrt(np.sum(x2 ** 2, axis=-1, keepdims=True)) + return x_avg + + +def cosine_scoring(x1, x2, ids1=None, ids2=None): + if ids1 is not None: + x1 = average_vectors(x1, ids1) + + if ids2 is not None: + x2 = average_vectors(x2, ids2) + + l2_1 = np.sqrt(np.sum(x1 ** 2, axis=-1, keepdims=True) + 1e-10) + l2_2 = np.sqrt(np.sum(x2 ** 2, axis=-1, keepdims=True) + 1e-10) x1 = x1 / l2_1 x2 = x2 / l2_2 diff --git a/hyperion/utils/plotting.py b/hyperion/utils/plotting.py index 2341beb4..ec617975 100644 --- a/hyperion/utils/plotting.py +++ b/hyperion/utils/plotting.py @@ -4,6 +4,7 @@ """ import matplotlib + # matplotlib.use('Agg') import matplotlib.pyplot as plt import numpy as np @@ -11,7 +12,7 @@ import scipy.stats as stats from mpl_toolkits.mplot3d import Axes3D as plt3d -from .math import invert_pdmat +from .math_funcs import invert_pdmat def plot_gaussian_1D(mu, C, num_sigmas=3, num_pts=100, weight=1, **kwargs): diff --git a/hyperion/utils/queues.py b/hyperion/utils/queues.py deleted file mode 100644 index 8bfd0166..00000000 --- a/hyperion/utils/queues.py +++ /dev/null @@ -1,287 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import copy -import multiprocessing -import threading -import time -import warnings -from abc import abstractmethod - -import numpy as np -import six - -try: - import queue -except ImportError: - import Queue as queue - - -class SequenceQueue(object): - """Base class to enqueue inputs. - - The task of an Queue is to use parallelism to speed up preprocessing. - This is done with processes or threads. - - # Examples - - ```python - enqueuer = SequenceQueue(...) 
- enqueuer.start() - datas = enqueuer.get() - for data in datas: - # Use the inputs; training, evaluating, predicting. - # ... stop sometime. - enqueuer.close() - ``` - - The `enqueuer.get()` should be an infinite stream of datas. - - """ - - @abstractmethod - def is_running(self): - raise NotImplemented - - @abstractmethod - def start(self, workers=1, max_queue_size=10): - """Starts the handler's workers. - - # Arguments - workers: number of worker threads - max_queue_size: queue size - (when full, threads could block on `put()`). - """ - raise NotImplemented - - @abstractmethod - def stop(self, timeout=None): - """Stop running threads and wait for them to exit, if necessary. - - Should be called by the same thread which called start(). - - # Arguments - timeout: maximum time to wait on thread.join() - """ - raise NotImplemented - - @abstractmethod - def get(self): - """Creates a generator to extract data from the queue. - - Skip the data if it is `None`. - - # Returns - Generator yielding tuples `(inputs, targets)` - or `(inputs, targets, sample_weights)`. - """ - raise NotImplemented - - -class OrderedQueue(SequenceQueue): - """Builds a Queue from a Sequence. - - Used in `fit_generator`, `evaluate_generator`, `predict_generator`. - - # Arguments - sequence: A `keras.utils.data_utils.Sequence` object. - use_multiprocessing: use multiprocessing if True, otherwise threading - scheduling: Sequential querying of datas if 'sequential', random otherwise. - """ - - def __init__(self, sequence, use_multiprocessing=False, scheduling="sequential"): - self.sequence = sequence - self.use_multiprocessing = use_multiprocessing - self.scheduling = scheduling - self.workers = 0 - self.executor = None - self.queue = None - self.run_thread = None - self.stop_signal = None - - def is_running(self): - return self.stop_signal is not None and not self.stop_signal.is_set() - - def start(self, workers=1, max_queue_size=10): - """Start the handler's workers. - - # Arguments - workers: number of worker threads - max_queue_size: queue size - (when full, workers could block on `put()`) - """ - if self.use_multiprocessing: - self.executor = multiprocessing.Pool(workers) - else: - self.executor = ThreadPool(workers) - self.queue = queue.Queue(max_queue_size) - self.stop_signal = threading.Event() - self.run_thread = threading.Thread(target=self._run) - self.run_thread.daemon = True - self.run_thread.start() - - def _run(self): - """Function to submit request to the executor and queue the `Future` objects.""" - sequence = list(range(len(self.sequence))) - while True: - if self.scheduling is not "sequential": - random.shuffle(sequence) - for i in sequence: - if self.stop_signal.is_set(): - return - self.queue.put( - self.executor.apply_async(get_index, (self.sequence, i)), block=True - ) - - def get(self): - """Creates a generator to extract data from the queue. - - Skip the data if it is `None`. - - # Returns - Generator yielding tuples (inputs, targets) - or (inputs, targets, sample_weights) - """ - try: - while self.is_running(): - inputs = self.queue.get(block=True).get() - if inputs is not None: - yield inputs - except Exception as e: - self.stop() - raise StopIteration(e) - - def stop(self, timeout=None): - """Stops running threads and wait for them to exit, if necessary. - - Should be called by the same thread which called `start()`. 
- - # Arguments - timeout: maximum time to wait on `thread.join()` - """ - self.stop_signal.set() - with self.queue.mutex: - self.queue.queue.clear() - self.queue.unfinished_tasks = 0 - self.queue.not_full.notify() - self.executor.close() - self.executor.join() - self.run_thread.join(timeout) - - -class GeneratorQueue(SequenceQueue): - """Builds a queue out of a data generator. - - Used in `fit_generator`, `evaluate_generator`, `predict_generator`. - - # Arguments - generator: a generator function which endlessly yields data - use_multiprocessing: use multiprocessing if True, otherwise threading - wait_time: time to sleep in-between calls to `put()` - random_seed: Initial seed for workers, - will be incremented by one for each workers. - """ - - def __init__( - self, generator, use_multiprocessing=False, wait_time=0.05, random_seed=None - ): - self.wait_time = wait_time - self._generator = generator - self._use_multiprocessing = use_multiprocessing - self._threads = [] - self._stop_event = None - self.queue = None - self.random_seed = random_seed - - def start(self, workers=1, max_queue_size=10): - """Kicks off threads which add data from the generator into the queue. - - # Arguments - workers: number of worker threads - max_queue_size: queue size - (when full, threads could block on `put()`) - """ - - def data_generator_task(): - while not self._stop_event.is_set(): - try: - if self._use_multiprocessing or self.queue.qsize() < max_queue_size: - generator_output = next(self._generator) - self.queue.put(generator_output) - else: - time.sleep(self.wait_time) - except Exception: - self._stop_event.set() - raise - - try: - if self._use_multiprocessing: - self.queue = multiprocessing.Queue(maxsize=max_queue_size) - self._stop_event = multiprocessing.Event() - else: - self.queue = queue.Queue() - self._stop_event = threading.Event() - - for _ in range(workers): - if self._use_multiprocessing: - # Reset random seed else all children processes - # share the same seed - np.random.seed(self.random_seed) - thread = multiprocessing.Process(target=data_generator_task) - thread.daemon = True - if self.random_seed is not None: - self.random_seed += 1 - else: - thread = threading.Thread(target=data_generator_task) - self._threads.append(thread) - thread.start() - except: - self.stop() - raise - - def is_running(self): - return self._stop_event is not None and not self._stop_event.is_set() - - def stop(self, timeout=None): - """Stops running threads and wait for them to exit, if necessary. - - Should be called by the same thread which called `start()`. - - # Arguments - timeout: maximum time to wait on `thread.join()`. - """ - if self.is_running(): - self._stop_event.set() - - for thread in self._threads: - if thread.is_alive(): - if self._use_multiprocessing: - thread.terminate() - else: - thread.join(timeout) - - if self._use_multiprocessing: - if self.queue is not None: - self.queue.close() - - self._threads = [] - self._stop_event = None - self.queue = None - - def get(self): - """Creates a generator to extract data from the queue. - - Skip the data if it is `None`. 
- - # Returns - A generator - """ - while self.is_running(): - if not self.queue.empty(): - inputs = self.queue.get() - if inputs is not None: - yield inputs - else: - time.sleep(self.wait_time) diff --git a/hyperion/utils/scp_list.py b/hyperion/utils/scp_list.py index 5abf76f2..3d8b5e9d 100644 --- a/hyperion/utils/scp_list.py +++ b/hyperion/utils/scp_list.py @@ -36,7 +36,7 @@ def __init__(self, key, file_path, offset=None, range_spec=None): def validate(self): """Validates the attributes of the SCPList object.""" self.key = list2ndarray(self.key) - self.file_path = list2ndarray(self.file_path, dtype=np.object) + self.file_path = list2ndarray(self.file_path, dtype=object) assert len(self.key) == len(self.file_path) if self.offset is not None: if isinstance(self.offset, list): @@ -384,7 +384,7 @@ def shuffle(self, seed=1024, rng=None): Index used to shuffle the list. """ if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) index = np.arange(len(self.key)) rng.shuffle(index) diff --git a/hyperion/utils/segment_set.py b/hyperion/utils/segment_set.py index f9da69fa..a99b4e1e 100644 --- a/hyperion/utils/segment_set.py +++ b/hyperion/utils/segment_set.py @@ -7,11 +7,54 @@ class SegmentSet(InfoTable): + """Class to store information about a speech segment + Internally, it uses a pandas table. + """ + def __init__(self, df): super().__init__(df) + if "start" in df and "recordings" not in df: + df["recordings"] = df["id"] + + if "start" not in df and "recordings" in df: + df["start"] = 0.0 + + @property + def has_time_marks(self): + return "recordings" in self.df and "start" in self.df and "duration" in self.df + + @property + def has_recording_ids(self): + return "recordings" in self.df + + @property + def has_recordings(self): + return "recordings" in self.df - def recording_ids(self, ids): - if "recording_id" in self.df: - return self.df.loc[ids, "recording_id"] + def recordings(self, ids=None): + if ids is None: + if "recordings" in self.df: + return self.df["recordings"] + else: + return self.df["id"] + + if "recordings" in self.df: + return self.df.loc[ids, "recordings"] return ids + + def recording_ids(self, ids=None): + return self.recordings(ids) + + def recording_time_marks(self, ids, recordings_name: str = "recordings"): + if recordings_name == "recordings": + if "recordings" in self.df: + recordings_name = "recordings" + else: + recordings_name = "id" + + assert "duration" in self.df + if "start" not in self.df: + self.df["start"] = 0.0 + + return self.df.loc[ids, [recordings_name, "start", "duration"]] diff --git a/hyperion/utils/sparse_trial_key.py b/hyperion/utils/sparse_trial_key.py index 5afc72a0..62fcd446 100644 --- a/hyperion/utils/sparse_trial_key.py +++ b/hyperion/utils/sparse_trial_key.py @@ -5,8 +5,10 @@ import copy import os.path as path +from pathlib import Path import numpy as np +import pandas as pd import scipy.sparse as sparse from .list_utils import * @@ -79,6 +81,28 @@ def save_txt(self, file_path): for r, c in zip(non.row, non.col): f.write("%s %s nontarget\n" % (self.model_set[r], self.seg_set[c])) + def save_table(self, file_path, sep=None): + """Saves object to txt file. + + Args: + file_path: File to write the list. 
+ """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + with open(file_path, "w", encoding="utf-8") as f: + f.write(f"modelid{sep}segmentid{sep}targettype\n") + self.tar.eliminate_zeros() + self.non.eliminate_zeros() + tar = self.tar.tocoo() + for r, c in zip(tar.row, tar.col): + f.write(f"{self.model_set[r]}{sep}{self.seg_set[c]}{sep}target\n") + non = self.non.tocoo() + for r, c in zip(non.row, non.col): + f.write(f"{self.model_set[r]}{sep}{self.seg_set[c]}{sep}nontarget\n") + @classmethod def load_h5(cls, file_path): raise NotImplementedError() @@ -113,6 +137,36 @@ def load_txt(cls, file_path): non[item[0], item[1]] = True return cls(model_set, seg_set, tar.tocsr(), non.tocsr()) + @classmethod + def load_table(cls, file_path, sep=None): + """Loads object from txt file + + Args: + file_path: File to read the list. + + Returns: + SparseTrialKey object. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + models = df["modelid"].values + segments = df["segmentid"].values + is_tar = (df["targettype"] == "target").values + model_set, model_idx = np.unique(models, return_inverse=True) + seg_set, seg_idx = np.unique(segments, return_inverse=True) + tar = sparse.lil_matrix((len(model_set), len(seg_set)), dtype="bool") + non = sparse.lil_matrix((len(model_set), len(seg_set)), dtype="bool") + for i, j, target_type in zip(model_idx, seg_idx, is_tar): + if target_type: + tar[i, j] = True + else: + non[i, j] = True + return cls(model_set, seg_set, tar.tocsr(), non.tocsr()) + @classmethod def merge(cls, key_list): raise NotImplementedError() diff --git a/hyperion/utils/sparse_trial_scores.py b/hyperion/utils/sparse_trial_scores.py index 7ed9a1d1..760bd1f1 100644 --- a/hyperion/utils/sparse_trial_scores.py +++ b/hyperion/utils/sparse_trial_scores.py @@ -3,12 +3,12 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - import copy import logging -import os.path as path +from pathlib import Path import numpy as np +import pandas as pd import scipy.sparse as sparse from ..hyp_defs import float_cpu @@ -18,9 +18,6 @@ from .trial_ndx import TrialNdx from .trial_scores import TrialScores -# import h5py - - class SparseTrialScores(TrialScores): @@ -55,6 +52,26 @@ def save_txt(self, file_path): % (self.model_set[r], self.seg_set[c], self.scores[r, c]) ) + def save_table(self, file_path, sep=None): + """Saves object to pandas tabnle file. + + Args: + file_path: File to write the list. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + self.score_mask.eliminate_zeros() + score_mask = self.score_mask.tocoo() + with open(file_path, "w", encoding="utf-8") as f: + f.write(f"modelid{sep}segmentid{sep}LLR\n") + for i, j in zip(score_mask.row, score_mask.col): + f.write( + f"{self.model_set[i]}{sep}{self.seg_set[j]}{sep}{self.scores[i,j]}\n" + ) + @classmethod def load_h5(cls, file_path): raise NotImplementedError() @@ -90,6 +107,35 @@ def load_txt(cls, file_path): scores[item[0], item[1]] = item[2] return cls(model_set, seg_set, scores.tocsr(), score_mask.tocsr()) + @classmethod + def load_table(cls, file_path, sep=None): + """Loads object from pandas table file + + Args: + file_path: File to read the list. + + Returns: + TrialScores object. 
+ """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + models = df["modelid"].values + segments = df["segmentid"].values + score_list = df["LLR"].values + model_set, model_idx = np.unique(models, return_inverse=True) + seg_set, seg_idx = np.unique(segments, return_inverse=True) + scores = sparse.lil_matrix((len(model_set), len(seg_set)), dtype=float_cpu()) + score_mask = sparse.lil_matrix(scores.shape, dtype="bool") + for i, j, score in zip(model_idx, seg_idx, score_list): + score_mask[i, j] = True + scores[i, j] = score + + return cls(model_set, seg_set, scores.tocsr(), score_mask.tocsr()) + @classmethod def merge(cls, scr_list): raise NotImplementedError() @@ -160,9 +206,9 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): if not (np.all(f_mod) and np.all(f_seg)): for i in (f_mod == 0).nonzero()[0]: - logging.info("model %s not found" % model_set[i]) + logging.info("model %s not found", model_set[i]) for i in (f_seg == 0).nonzero()[0]: - logging.info("segment %s not found" % seg_set[i]) + logging.info("segment %s not found", seg_set[i]) if raise_missing: raise Exception("some scores were not computed") @@ -172,18 +218,36 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): scores = self.scores.tocoo() new_data = scores.data new_row = scores.row.copy() + # for i, r in enumerate(mod_idx): + # if f_mod[i] and i != r: + # idx = scores.row == r + # new_row[idx] = i + + # new_col = scores.col.copy() + # for j, c in enumerate(seg_idx): + # if f_seg[j] and j != c: + # idx = scores.col == c + # new_col[idx] = j + + # idx = np.logical_and(new_row < num_mod, new_col < num_seg) + # if not np.all(idx): + # new_data = new_data[idx] + # new_row = new_row[idx] + # new_col = new_col[idx] + + new_row = -1 * np.ones_like(scores.row) for i, r in enumerate(mod_idx): - if f_mod[i] and i != r: + if f_mod[i]: idx = scores.row == r new_row[idx] = i - new_col = scores.col.copy() + new_col = -1 * np.ones_like(scores.col) for j, c in enumerate(seg_idx): - if f_seg[j] and j != c: + if f_seg[j]: idx = scores.col == c new_col[idx] = j - idx = np.logical_and(new_row < num_mod, new_col < num_seg) + idx = np.logical_and(new_row != -1, new_col != -1) if not np.all(idx): new_data = new_data[idx] new_row = new_row[idx] @@ -193,19 +257,37 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): score_mask = self.score_mask.tocoo() new_data = score_mask.data - new_row = score_mask.row.copy() + # new_row = score_mask.row.copy() + # for i, r in enumerate(mod_idx): + # if f_mod[i] and i != r: + # idx = score_mask.row == r + # new_row[idx] = i + + # new_col = score_mask.col.copy() + # for j, c in enumerate(seg_idx): + # if f_seg[j] and j != c: + # idx = score_mask.col == c + # new_col[idx] = j + + # idx = np.logical_and(new_row < num_mod, new_col < num_seg) + # if not np.all(idx): + # new_data = new_data[idx] + # new_row = new_row[idx] + # new_col = new_col[idx] + + new_row = -1 * np.ones_like(score_mask.row) for i, r in enumerate(mod_idx): - if f_mod[i] and i != r: + if f_mod[i]: idx = score_mask.row == r new_row[idx] = i - new_col = score_mask.col.copy() + new_col = -1 * np.ones_like(score_mask.col) for j, c in enumerate(seg_idx): - if f_seg[j] and j != c: + if f_seg[j]: idx = score_mask.col == c new_col[idx] = j - idx = np.logical_and(new_row < num_mod, new_col < num_seg) + idx = np.logical_and(new_row != -1, new_col != -1) if not np.all(idx): new_data = new_data[idx] 
new_row = new_row[idx] @@ -249,7 +331,7 @@ def align_with_ndx(self, ndx, raise_missing=True): if not scr.score_mask[r, c]: missing_scores = True logging.info( - "missing-scores for %s %s" % (scr.model_set[r], scr.seg_set[c]) + "missing-scores for %s %s", scr.model_set[r], scr.seg_set[c] ) if missing_scores and raise_missing: @@ -291,7 +373,7 @@ def set_valid_scores(self, scores, ndx=None): self.scores = scr.scores self.score_mat = scr.score_mat - self.scores[self.score_mask]=scores + self.scores[self.score_mask] = scores @classmethod def from_trial_scores(cls, scr): @@ -302,6 +384,12 @@ def from_trial_scores(cls, scr): score_mask.eliminate_zeros() return cls(scr.model_set, scr.seg_set, scores, score_mask) + def to_trial_scores(self): + scores = self.scores.toarray("C") + score_mask = self.score_mask.toarray("C") + # scores[~score_mask] = 0.0 + return TrialScores(self.model_set, self.seg_set, scores, score_mask) + def set_missing_to_value(self, ndx, val): """Aligns the scores with a TrialNdx and sets the trials with missing scores to the same value. diff --git a/hyperion/utils/train_val_eval_list.py b/hyperion/utils/train_val_eval_list.py index fd17e240..cbccf093 100644 --- a/hyperion/utils/train_val_eval_list.py +++ b/hyperion/utils/train_val_eval_list.py @@ -207,7 +207,7 @@ def create( part_names = ["train", "eval"] if shuffle: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) if group_by_key is None: group_by_key = segment_key diff --git a/hyperion/utils/trial_key.py b/hyperion/utils/trial_key.py index 9552d7c0..5d8019b6 100644 --- a/hyperion/utils/trial_key.py +++ b/hyperion/utils/trial_key.py @@ -5,11 +5,14 @@ import copy import os.path as path +from pathlib import Path import h5py import numpy as np +import pandas as pd -from .list_utils import * +# from .list_utils import * +from .list_utils import sort, intersect, ismember, split_list, list2ndarray from .trial_ndx import TrialNdx @@ -82,18 +85,20 @@ def sort(self): if self.trial_cond is not None: self.trial_cond = self.trial_cond[:, ix] - def save(self, file_path): + def save(self, file_path, sep=None): """Saves object to txt/h5 file. Args: file_path: File to write the list. """ - - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + ext = file_path.suffix + if ext in (".h5", ".hdf5"): self.save_h5(file_path) - else: + elif ext in ("", ".txt"): self.save_txt(file_path) + else: + self.save_table(file_path, sep) def save_h5(self, file_path): """Saves object to h5 file. @@ -132,20 +137,40 @@ def save_txt(self, file_path): file_path: File to write the list. """ with open(file_path, "w") as f: - idx = (self.tar.T == True).nonzero() + idx = (self.tar.T).nonzero() for item in zip(idx[0], idx[1]): f.write( "%s %s target\n" % (self.model_set[item[1]], self.seg_set[item[0]]) ) - idx = (self.non.T == True).nonzero() + idx = (self.non.T).nonzero() for item in zip(idx[0], idx[1]): f.write( "%s %s nontarget\n" % (self.model_set[item[1]], self.seg_set[item[0]]) ) + def save_table(self, file_path, sep=None): + """Saves object to txt file. + + Args: + file_path: File to write the list. 
+ """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + with open(file_path, "w", encoding="utf-8") as f: + f.write(f"modelid{sep}segmentid{sep}targettype\n") + I, J = np.logical_or(self.tar, self.non).nonzero() + for i, j in zip(I, J): + target_type = "target" if self.tar[i, j] else "nontarget" + f.write( + f"{self.model_set[i]}{sep}{self.seg_set[j]}{sep}{target_type}\n" + ) + @classmethod - def load(cls, file_path): + def load(cls, file_path, sep=None): """Loads object from txt/h5 file Args: @@ -154,11 +179,14 @@ def load(cls, file_path): Returns: TrialKey object. """ - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + file_ext = file_path.suffix + if file_ext in (".h5", ".hdf5"): return cls.load_h5(file_path) - else: + elif file_ext in ("", ".txt"): return cls.load_txt(file_path) + else: + return cls.load_table(file_path, sep) @classmethod def load_h5(cls, file_path): @@ -240,6 +268,36 @@ def load_txt(cls, file_path): non[item[0], item[1]] = True return cls(model_set, seg_set, tar, non) + @classmethod + def load_table(cls, file_path, sep=None): + """Loads object from pandas table file + + Args: + file_path: File to read the list. + + Returns: + TrialKey object. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + models = df["modelid"].values + segments = df["segmentid"].values + is_tar = (df["targettype"] == "target").values + model_set, model_idx = np.unique(models, return_inverse=True) + seg_set, seg_idx = np.unique(segments, return_inverse=True) + tar = np.zeros((len(model_set), len(seg_set)), dtype="bool") + non = np.zeros((len(model_set), len(seg_set)), dtype="bool") + for i, j, target_type in zip(model_idx, seg_idx, is_tar): + if target_type: + tar[i, j] = True + else: + non[i, j] = True + return cls(model_set, seg_set, tar, non) + @classmethod def merge(cls, key_list): """Merges several key objects. diff --git a/hyperion/utils/trial_ndx.py b/hyperion/utils/trial_ndx.py index e26d19e2..b7b873df 100644 --- a/hyperion/utils/trial_ndx.py +++ b/hyperion/utils/trial_ndx.py @@ -4,12 +4,14 @@ """ import copy -import os.path as path +from pathlib import Path import h5py import numpy as np +import pandas as pd -from .list_utils import * +# from .list_utils import * +from .list_utils import sort, intersect, ismember, split_list, list2ndarray class TrialNdx(object): @@ -46,17 +48,20 @@ def sort(self): self.seg_set, s_idx = sort(self.seg_set, return_index=True) self.trial_mask = self.trial_mask[np.ix_(m_idx, s_idx)] - def save(self, file_path): + def save(self, file_path, sep=None): """Saves object to txt/h5 file. Args: file_path: File to write the list. """ - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + file_ext = file_path.suffix + if file_ext in [".h5", ".hdf5"]: self.save_h5(file_path) - else: + elif file_ext in [".txt", ""]: self.save_txt(file_path) + else: + self.save_table(file_path, sep=sep) def save_h5(self, file_path): """Saves object to h5 file. 
@@ -71,15 +76,6 @@ def save_h5(self, file_path): f.create_dataset("ID/column_ids", data=seg_set) f.create_dataset("trial_mask", data=self.trial_mask.astype("uint8")) - # model_set = self.model_set.astype('S') - # f.create_dataset('ID/row_ids', self.model_set.shape, dtype=model_set.dtype) - # f['ID/row_ids'] = model_set - # seg_set = self.seg_set.astype('S') - # f.create_dataset('ID/column_ids', self.seg_set.shape, dtype=seg_set.dtype) - # f['ID/column_ids'] = seg_set - # f.create_dataset('trial_mask', self.trial_mask.shape, dtype='uint8') - # f['trial_mask'] = self.trial_mask.astype('uint8') - def save_txt(self, file_path): """Saves object to txt file. @@ -91,8 +87,25 @@ def save_txt(self, file_path): for item in zip(idx[0], idx[1]): f.write("%s %s\n" % (self.model_set[item[1]], self.seg_set[item[0]])) + def save_table(self, file_path, sep=None): + """Saves object to pandas tabnle file. + + Args: + file_path: File to write the list. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + with open(file_path, "w", encoding="utf-8") as f: + f.write(f"modelid{sep}segmentid{sep}\n") + I, J = self.trial_mask.nonzero() + for i, j in zip(I, J): + f.write(f"{self.model_set[i]}{sep}{self.seg_set[j]}\n") + @classmethod - def load(cls, file_path): + def load(cls, file_path, sep=None): """Loads object from txt/h5 file Args: @@ -101,11 +114,14 @@ def load(cls, file_path): Returns: TrialNdx object. """ - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + file_ext = file_path.suffix + if file_ext in (".h5", ".hdf5"): return cls.load_h5(file_path) - else: + elif file_ext in ("", ".txt"): return cls.load_txt(file_path) + else: + return cls.load_table(file_path, sep) @classmethod def load_h5(cls, file_path): @@ -148,6 +164,36 @@ def load_txt(cls, file_path): trial_mask[item[0], item[1]] = True return cls(model_set, seg_set, trial_mask) + @classmethod + def load_table(cls, file_path, sep=None): + """Loads object from pandas table file + + Args: + file_path: File to read the list. + + Returns: + TrialNdx object. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + models = df["modelid"].values + segments = df["segmentid"].values + model_set, _, model_idx = np.unique( + models, return_index=True, return_inverse=True + ) + seg_set, _, seg_idx = np.unique( + segments, return_index=True, return_inverse=True + ) + trial_mask = np.zeros((len(model_set), len(seg_set)), dtype="bool") + for i, j in zip(model_idx, seg_idx): + trial_mask[i, j] = True + + return cls(model_set, seg_set, trial_mask) + @classmethod def merge(cls, ndx_list): """Merges several index objects. diff --git a/hyperion/utils/trial_scores.py b/hyperion/utils/trial_scores.py index a486647d..4a5e59da 100644 --- a/hyperion/utils/trial_scores.py +++ b/hyperion/utils/trial_scores.py @@ -3,16 +3,18 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - import copy import logging -import os.path as path +from pathlib import Path import h5py import numpy as np +import pandas as pd from ..hyp_defs import float_cpu -from .list_utils import * + +# from .list_utils import * +from .list_utils import intersect, ismember, list2ndarray, sort, split_list from .trial_key import TrialKey from .trial_ndx import TrialNdx @@ -26,13 +28,22 @@ class TrialScores(object): seg_set: List of test segment names. 
scores: Matrix with the scores (num_models x num_segments). score_mask: Boolean matrix with the trials with valid scores to True (num_models x num_segments). + q_measures: optional dictionary of quality measure matrices """ - def __init__(self, model_set=None, seg_set=None, scores=None, score_mask=None): + def __init__( + self, + model_set=None, + seg_set=None, + scores=None, + score_mask=None, + q_measures=None, + ): self.model_set = model_set self.seg_set = seg_set self.scores = scores self.score_mask = score_mask + self.q_measures = q_measures if (model_set is not None) and (seg_set is not None): self.validate() @@ -55,18 +66,24 @@ def sort(self): ix = np.ix_(m_idx, s_idx) self.scores = self.scores[ix] self.score_mask = self.score_mask[ix] + if self.q_measures is not None: + for k in self.q_measures.keys(): + self.q_measures[k] = self.q_measures[k][ix] - def save(self, file_path): + def save(self, file_path, sep=None): """Saves object to txt/h5 file. Args: file_path: File to write the list. """ - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + file_ext = file_path.suffix + if file_ext in [".h5", ".hdf5"]: self.save_h5(file_path) - else: + elif file_ext in ["", ".txt"]: self.save_txt(file_path) + else: + self.save_table(file_path, sep=sep) def save_h5(self, file_path): """Saves object to h5 file. @@ -81,6 +98,10 @@ def save_h5(self, file_path): f.create_dataset("ID/column_ids", data=seg_set) f.create_dataset("scores", data=self.scores) f.create_dataset("score_mask", data=self.score_mask.astype("uint8")) + if self.q_measures is not None: + q_grp = f.create_group("q_measures") + for k, v in self.q_measures.items(): + q_grp.create_dataset(k, data=v) def save_txt(self, file_path): """Saves object to txt file. @@ -100,8 +121,38 @@ def save_txt(self, file_path): ) ) + if self.q_measures is not None: + logging.warning("q_measures cannot be saved to txt file") + + def save_table(self, file_path, sep=None): + """Saves object to pandas tabnle file. + + Args: + file_path: File to write the list. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + q_str = "" + if self.q_measures is not None: + q_str = sep + sep.join(self.q_measures.keys()) + + with open(file_path, "w", encoding="utf-8") as f: + f.write(f"modelid{sep}segmentid{sep}LLR{q_str}\n") + I, J = self.score_mask.nonzero() + for i, j in zip(I, J): + if self.q_measures is not None: + q_str = sep + sep.join( + [str(v[i, j]) for k, v in self.q_measures.items()] + ) + f.write( + f"{self.model_set[i]}{sep}{self.seg_set[j]}{sep}{self.scores[i,j]}{q_str}\n" + ) + @classmethod - def load(cls, file_path): + def load(cls, file_path, sep=None): """Loads object from txt/h5 file Args: @@ -110,11 +161,14 @@ def load(cls, file_path): Returns: TrialScores object. 
""" - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + file_ext = file_path.suffix + if file_ext in (".h5", ".hdf5"): return cls.load_h5(file_path) - else: + elif file_ext in ("", ".txt"): return cls.load_txt(file_path) + else: + return cls.load_table(file_path, sep) @classmethod def load_h5(cls, file_path): @@ -131,7 +185,12 @@ def load_h5(cls, file_path): seg_set = [t.decode("utf-8") for t in f["ID/column_ids"]] scores = np.asarray(f["scores"], dtype=float_cpu()) score_mask = np.asarray(f["score_mask"], dtype="bool") - return cls(model_set, seg_set, scores, score_mask) + if "q_measures" in f: + q_grp = f["q_measures"] + q_measures = {k: q_grp[k] for k in q_grp} + else: + q_measures = None + return cls(model_set, seg_set, scores, score_mask, q_measures) @classmethod def load_txt(cls, file_path): @@ -163,6 +222,49 @@ def load_txt(cls, file_path): scores[item[0], item[1]] = item[2] return cls(model_set, seg_set, scores, score_mask) + @classmethod + def load_table(cls, file_path, sep=None): + """Loads object from pandas table file + + Args: + file_path: File to read the list. + + Returns: + TrialScores object. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + models = df["modelid"].values + segments = df["segmentid"].values + score_list = df["LLR"].values + model_set, model_idx = np.unique(models, return_inverse=True) + seg_set, seg_idx = np.unique(segments, return_inverse=True) + score_mask = np.zeros((len(model_set), len(seg_set)), dtype="bool") + scores = np.zeros((len(model_set), len(seg_set)), dtype=float_cpu()) + for i, j, score in zip(model_idx, seg_idx, score_list): + score_mask[i, j] = True + scores[i, j] = score + + if len(df.columns) > 3: + q_names = df.columns[3:] + q_vals = df.iloc[:, 3:].values + q_measures = {} + for q_name in q_names: + q_measures[q_name] = np.zeros(scores.shape, dtype=float_cpu()) + + for i, j, q_row in zip(model_idx, seg_idx, q_vals): + for col, q_name in enumerate(q_names): + q_measures[q_name][i, j] = q_row[col] + + else: + q_measures = None + + return cls(model_set, seg_set, scores, score_mask, q_measures) + @classmethod def merge(cls, scr_list): """Merges several score objects. 
@@ -178,6 +280,7 @@ def merge(cls, scr_list):
         seg_set = scr_list[0].seg_set
         scores = scr_list[0].scores
         score_mask = scr_list[0].score_mask
+        q_measures = scr_list[0].q_measures
         for i in range(1, num_scr):
             scr_i = scr_list[i]
             new_model_set = np.union1d(model_set, scr_i.model_set)
@@ -196,6 +299,10 @@ def merge(cls, scr_list):
             scores_1[ix_a] = scores[ix_b]
             score_mask_1 = np.zeros(shape, dtype="bool")
             score_mask_1[ix_a] = score_mask[ix_b]
+            if q_measures is not None:
+                q_measures_1 = {k: np.zeros(shape) for k in q_measures.keys()}
+                for k in q_measures.keys():
+                    q_measures_1[k][ix_a] = q_measures[k][ix_b]
 
             trial_mask_2 = np.zeros(
                 (len(new_model_set), len(new_seg_set)), dtype="bool"
@@ -212,14 +319,21 @@ def merge(cls, scr_list):
             scores_2[ix_a] = scr_i.scores[ix_b]
             score_mask_2 = np.zeros(shape, dtype="bool")
             score_mask_2[ix_a] = scr_i.score_mask[ix_b]
+            if q_measures is not None:
+                q_measures_2 = {k: np.zeros(shape) for k in q_measures.keys()}
+                for k in q_measures.keys():
+                    q_measures_2[k][ix_a] = scr_i.q_measures[k][ix_b]
 
             model_set = new_model_set
             seg_set = new_seg_set
             scores = scores_1 + scores_2
             assert not (np.any(np.logical_and(score_mask_1, score_mask_2)))
             score_mask = np.logical_or(score_mask_1, score_mask_2)
+            if q_measures is not None:
+                for k in q_measures.keys():
+                    q_measures[k] = q_measures_1[k] + q_measures_2[k]
 
-        return cls(model_set, seg_set, scores, score_mask)
+        return cls(model_set, seg_set, scores, score_mask, q_measures)
 
     def filter(self, model_set, seg_set, keep=True, raise_missing=True):
         """Removes elements from TrialScores object.
@@ -235,24 +349,28 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True):
           Filtered TrialScores object.
         """
 
-        if not (keep):
+        if not keep:
             model_set = np.setdiff1d(self.model_set, model_set)
             seg_set = np.setdiff1d(self.model_set, seg_set)
 
         f_mod, mod_idx = ismember(model_set, self.model_set)
         f_seg, seg_idx = ismember(seg_set, self.seg_set)
-
+        q_measures = None
         if np.all(f_mod) and np.all(f_seg):
             model_set = self.model_set[mod_idx]
-            set_set = self.seg_set[seg_idx]
+            seg_set = self.seg_set[seg_idx]
             ix = np.ix_(mod_idx, seg_idx)
             scores = self.scores[ix]
             score_mask = self.score_mask[ix]
+            if self.q_measures is not None:
+                q_measures = {}
+                for k in self.q_measures.keys():
+                    q_measures[k] = self.q_measures[k][ix]
         else:
             for i in (f_mod == 0).nonzero()[0]:
-                logging.info("model %s not found" % model_set[i])
+                logging.info("model %s not found", model_set[i])
             for i in (f_seg == 0).nonzero()[0]:
-                logging.info("segment %s not found" % seg_set[i])
+                logging.info("segment %s not found", seg_set[i])
 
             if raise_missing:
                 raise Exception("some scores were not computed")
@@ -262,8 +380,13 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True):
             ix2 = np.ix_(mod_idx[f_mod], seg_idx[f_seg])
             scores[ix1] = self.scores[ix2]
             score_mask[ix1] = self.score_mask[ix2]
+            if self.q_measures is not None:
+                q_measures = {}
+                for k in self.q_measures.keys():
+                    q_measures[k] = np.zeros(scores.shape, dtype=float_cpu())
+                    q_measures[k][ix1] = self.q_measures[k][ix2]
 
-        return TrialScores(model_set, seg_set, scores, score_mask)
+        return TrialScores(model_set, seg_set, scores, score_mask, q_measures)
 
     def split(self, model_idx, num_model_parts, seg_idx, num_seg_parts):
         """Splits the TrialScores into num_model_parts x num_seg_parts and returns part
@@ -284,7 +407,13 @@ def split(self, model_idx, num_model_parts, seg_idx, num_seg_parts):
         ix = np.ix_(model_idx1, seg_idx1)
         scores = self.scores[ix]
         score_mask = self.score_mask[ix]
-        return TrialScores(model_set, seg_set, scores, score_mask)
+        q_measures = None
+        if self.q_measures is not None:
+            q_measures = {}
+            for k in self.q_measures.keys():
+                q_measures[k] = self.q_measures[k][ix]
+
+        return TrialScores(model_set, seg_set, scores, score_mask, q_measures)
 
     def validate(self):
         """Validates the attributes of the TrialScores object."""
@@ -306,6 +435,10 @@ def validate(self):
         else:
             assert self.score_mask.shape == (len(self.model_set), len(self.seg_set))
 
+        if self.q_measures is not None:
+            for k in self.q_measures.keys():
+                assert self.q_measures[k].shape == self.scores.shape
+
     def align_with_ndx(self, ndx, raise_missing=True):
         """Aligns scores, model_set and seg_set with TrialNdx or TrialKey.
 
@@ -356,6 +489,34 @@ def get_tar_non(self, key):
         non = scr.scores[non_mask]
         return tar, non
 
+    def get_tar_non_q_measures(self, key, q_names=None, return_dict=False):
+        """Returns the quality measures of the target and non-target trials.
+
+        Args:
+          key: TrialKey object.
+          q_names: names of quality measures to return, if None it will return all
+
+        Returns:
+          Numpy array with the quality measures of the target trials.
+          Numpy array with the quality measures of the non-target trials.
+        """
+        scr = self.align_with_ndx(key)
+        tar_mask = np.logical_and(scr.score_mask, key.tar)
+        if q_names is None:
+            q_names = self.q_measures.keys()
+        tar = {}
+        for k in q_names:
+            tar[k] = self.q_measures[k][tar_mask]
+        non_mask = np.logical_and(scr.score_mask, key.non)
+        non = {}
+        for k in q_names:
+            non[k] = self.q_measures[k][non_mask]
+
+        if not return_dict:
+            tar = np.vstack(tuple(tar[k] for k in q_names)).T
+            non = np.vstack(tuple(non[k] for k in q_names)).T
+        return tar, non
+
     def set_missing_to_value(self, ndx, val):
         """Aligns the scores with a TrialNdx and sets the trials with missing scores
         to the same value.
@@ -394,6 +555,18 @@ def __eq__(self, other):
         eq = eq and np.all(self.seg_set == other.seg_set)
         eq = eq and np.all(np.isclose(self.scores, other.scores, atol=1e-5))
         eq = eq and np.all(self.score_mask == other.score_mask)
+        if self.q_measures is not None:
+            eq = eq and other.q_measures is not None
+            if eq:
+                eq = self.q_measures.keys() == other.q_measures.keys()
+            if eq:
+                for k in self.q_measures.keys():
+                    eq = eq and np.all(
+                        np.isclose(
+                            self.q_measures[k], other.q_measures[k], atol=1e-5
+                        )
+                    )
+
         return eq
 
     def __ne__(self, other):
@@ -407,7 +580,6 @@ def __cmp__(self, other):
         return 1
 
 
 def test(key_file="core-core_det5_key.h5"):
-
     key = TrialKey.load(key_file)
     mask = np.logical_or(key.tar, key.non)
diff --git a/hyperion/utils/utt2info.py b/hyperion/utils/utt2info.py
index 9785d021..ae91aabf 100644
--- a/hyperion/utils/utt2info.py
+++ b/hyperion/utils/utt2info.py
@@ -142,7 +142,7 @@ def save(self, file_path, sep=" "):
         self.utt_info.to_csv(file_path, sep=sep, header=False, index=False)
 
     @classmethod
-    def load(cls, file_path, sep=" ", dtype={0: np.str, 1: np.str}):
+    def load(cls, file_path, sep=" ", dtype={0: str, 1: str}):
         """Loads utt2info list from text file.
 
         Args:
@@ -261,7 +261,7 @@ def shuffle(self, seed=1024, rng=None):
          Index used to shuffle the list.
""" if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) index = np.arange(len(self.key)) rng.shuffle(index) self.utt_info = self.utt_info.iloc[index] diff --git a/requirements.txt b/requirements.txt index c3410829..1e1aea9b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,6 @@ memory_profiler gdown fairscale==0.4.4 tensorboard>=2.5.0 -yapf jsonargparse>=3.5.0 wandb>=0.10.30 librosa>=0.8.1 @@ -22,3 +21,6 @@ twine wheel transformers>=4.16.2 sentencepiece>=0.1.97 +loralib +lhotse + diff --git a/setup.py b/setup.py index 9780586d..e1fb35cc 100644 --- a/setup.py +++ b/setup.py @@ -15,15 +15,26 @@ # limitations under the License. # -import setuptools from pathlib import Path +import setuptools + project_root = Path(__file__).parent -with open(project_root / "apps.txt") as f: - apps = f.read().splitlines() +# with open(project_root / "apps.txt") as f: +# apps = f.read().splitlines() -apps = [str(project_root / "hyperion" / "bin" / app) for app in apps] +# apps = [str(project_root / "hyperion" / "bin" / app) for app in apps] +binaries = (project_root / "hyperion" / "bin").glob("*.py") +console_scripts = [] +for binary in binaries: + stem = binary.stem + script_name = stem.replace("hyperion_", "").replace("_", "-") + if script_name[0] == "-": + continue + module = f"hyperion.bin.{stem}:main" + console_script = f"hyperion-{script_name} = {module}" + console_scripts.append(console_script) with open(project_root / "requirements.txt") as f: requirements = f.read().splitlines() @@ -77,10 +88,22 @@ def get_version(): "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", ], python_requires=">=3.7", install_requires=requirements, - scripts=apps, + entry_points={ + "console_scripts": console_scripts, + } + # entry_points={ + # "console_scripts": [ + # "hyperion-prepare-data = hyperion.bin.prepare_data:main", + # "hyperion-train-wav2xvector = hyperion.bin.train_wav2xvector:main", + # ] + # }, + # scripts=apps, )