kaldi-asr
diff --git a/‎egs/chime6/s5_track2/RESULTS‎
Lines changed: 15 additions & 10 deletions b/‎egs/chime6/s5_track2/RESULTS‎
Lines changed: 15 additions & 10 deletions
diff --git a/‎egs/chime6/s5_track2/local/decode.sh‎
Lines changed: 51 additions & 10 deletions b/‎egs/chime6/s5_track2/local/decode.sh‎
Lines changed: 51 additions & 10 deletions
diff --git a/‎egs/chime6/s5_track2/local/decode_diarized.sh‎
Lines changed: 5 additions & 2 deletions b/‎egs/chime6/s5_track2/local/decode_diarized.sh‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎egs/chime6/s5_track2/local/diarize.sh‎
Lines changed: 20 additions & 14 deletions b/‎egs/chime6/s5_track2/local/diarize.sh‎
Lines changed: 20 additions & 14 deletions
diff --git a/‎egs/chime6/s5_track2/local/install_dscore.sh‎
Lines changed: 8 additions & 0 deletions b/‎egs/chime6/s5_track2/local/install_dscore.sh‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎egs/chime6/s5_track2/local/truncate_rttm.py‎
Lines changed: 39 additions & 0 deletions b/‎egs/chime6/s5_track2/local/truncate_rttm.py‎
Lines changed: 39 additions & 0 deletions
diff --git a/‎egs/chime6/s5_track2/local/uem_file‎
Lines changed: 20 additions & 0 deletions b/‎egs/chime6/s5_track2/local/uem_file‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎egs/chime6/s5_track2/path.sh‎
Lines changed: 2 additions & 0 deletions b/‎egs/chime6/s5_track2/path.sh‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎egs/chime6/s5_track2/run.sh‎
Lines changed: 6 additions & 2 deletions b/‎egs/chime6/s5_track2/run.sh‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎egs/wsj/s5/local/nnet3/run_ivector_common.sh‎
Lines changed: 1 addition & 6 deletions b/‎egs/wsj/s5/local/nnet3/run_ivector_common.sh‎
Lines changed: 1 addition & 6 deletions
@@ -1,18 +1,23 @@
 # Results for Chime-6 track 2 for dev and eval, using pretrained models
 # available at http://kaldi-asr.org/models/m12.
 
-# Speech Activity Detection (SAD)
-          Missed speech   False alarm   Total error
-Dev         4.3             2.1           6.4                                                
-Eval        5.6             5.9           11.5
+# These results are reported only for array U06, which is the array
+# selected by default in the baseline system.
 
-# The results for the remaining pipeline are only for array U06.
+# Speech Activity Detection (SAD)
+                  Missed speech   False alarm   Total error
+Dev (old RTTM)        2.5             0.8           3.3                                                
+Dev (new RTTM)        1.9             0.7           2.6                                                
+Eval (old RTTM)       4.1             1.8           5.9           
+Eval (new RTTM)       4.3             1.5           5.8           
 
 # Diarization
-        DER       JER
-Dev     57.15     83.96
-Eval    54.12     80.33
+                    DER      JER
+Dev (old RTTM)    61.56     69.75
+Dev (new RTTM)    63.42     70.83
+Eval (old RTTM)   61.96     71.40
+Eval (new RTTM)   68.20     72.54
 
 # ASR nnet3 tdnn+chain
-Dev:  %WER 84.33 [ 49653 / 58881, 1529 ins, 35813 del, 12311 sub ]
-Eval: %WER 78.08 [ 43046 / 55132, 957 ins, 32045 del, 10044 sub ]
+Dev:  %WER 84.25 [ 49610 / 58881, 1937 ins, 34685 del, 12988 sub ]
+Eval: %WER 77.94 [ 42971 / 55132, 1086 ins, 30839 del, 11046 sub ]
@@ -4,19 +4,26 @@
 # Feature extraction -> SAD -> Diarization -> ASR
 #
 # Copyright  2017  Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal)
-#            2019  Desh Raj, David Snyder, Ashish Arora
+#            2019  Desh Raj, David Snyder, Ashish Arora, Zhaoheng Ni
 # Apache 2.0
 
 # Begin configuration section.
 nj=8
-decode_nj=10
 stage=0
 sad_stage=0
+score_sad=true
 diarizer_stage=0
 decode_diarize_stage=0
 score_stage=0
+
 enhancement=beamformit
 
+# option to use the new RTTM reference for sad and diarization
+use_new_rttm_reference=false
+if $use_new_rttm_reference == "true"; then
+  git clone https://github.com/nateanl/chime6_rttm
+fi
+
 # chime5 main directory path
 # please change the path accordingly
 chime5_corpus=/export/corpora4/CHiME5
@@ -93,14 +100,15 @@ if [ $stage -le 1 ]; then
       "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \
       ${json_dir}/${dset} data/${dset}_${enhancement}_dereverb
   done
+
 fi
 
 if [ $stage -le 2 ]; then
   # mfccdir should be some place with a largish disk where you
   # want to store MFCC features.
   mfccdir=mfcc
   for x in ${test_sets}; do
-    steps/make_mfcc.sh --nj $decode_nj --cmd "$train_cmd" \
+    steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" \
       --mfcc-config conf/mfcc_hires.conf \
       data/$x exp/make_mfcc/$x $mfccdir
   done
@@ -121,18 +129,44 @@ if [ $stage -le 3 ]; then
       exit 0
     fi
     # Perform segmentation
-    local/segmentation/detect_speech_activity.sh --nj $decode_nj --stage $sad_stage \
+    local/segmentation/detect_speech_activity.sh --nj $nj --stage $sad_stage \
       $test_set $sad_nnet_dir mfcc $sad_work_dir \
       data/${datadir} || exit 1
 
-    mv data/${datadir}_seg data/${datadir}_${nnet_type}_seg
-    mv data/${datadir}/{segments.bak,utt2spk.bak} data/${datadir}_${nnet_type}_seg
+    test_dir=data/${datadir}_${nnet_type}_seg
+    mv data/${datadir}_seg ${test_dir}/
+    cp data/${datadir}/{segments.bak,utt2spk.bak} ${test_dir}/
     # Generate RTTM file from segmentation performed by SAD. This can
     # be used to evaluate the performance of the SAD as an intermediate
     # step.
     steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \
-      data/${datadir}_${nnet_type}_seg/utt2spk data/${datadir}_${nnet_type}_seg/segments \
-      data/${datadir}_${nnet_type}_seg/rttm
+      ${test_dir}/utt2spk ${test_dir}/segments ${test_dir}/rttm
+
+    # Optionally score the SAD output: prints the MISSED SPEECH and
+    # FALARM SPEECH lines reported by md-eval.pl for array U06.
+    if [ "$score_sad" == "true" ]; then
+      echo "Scoring $datadir.."
+      # We first generate the reference RTTM from the backed up utt2spk and segments
+      # files.
+      ref_rttm=${test_dir}/ref_rttm
+      steps/segmentation/convert_utt2spk_and_segments_to_rttm.py ${test_dir}/utt2spk.bak \
+        ${test_dir}/segments.bak $ref_rttm
+
+      # To score, we select just U06 segments from the hypothesis RTTM.
+      hyp_rttm=${test_dir}/rttm.U06
+      grep 'U06' ${test_dir}/rttm > $hyp_rttm
+      echo "Array U06 selected for scoring.."
+
+      # String comparison (same form as the score_sad check) rather than
+      # running the variable's value as a command.
+      if [ "$use_new_rttm_reference" == "true" ]; then
+        echo "Use the new RTTM reference."
+        # First '_'-separated field of the data directory name.
+        mode=${datadir%%_*}
+        ref_rttm=./chime6_rttm/${mode}_rttm
+      fi
+
+      # Remove the "_U0x.ENH" suffix from both RTTMs and the "_U0x" suffix from
+      # the U06 entries of the UEM file (presumably so that the ids agree across
+      # the three files -- confirm). The dot is escaped to match a literal '.'.
+      sed 's/_U0[1-6]\.ENH//g' $ref_rttm > $ref_rttm.scoring
+      sed 's/_U0[1-6]\.ENH//g' $hyp_rttm > $hyp_rttm.scoring
+      grep 'U06' ./local/uem_file | sed 's/_U0[1-6]//g' > ./local/uem_file.tmp
+      # '||' between the two patterns works in any awk; or() is a gawk-only
+      # bitwise function.
+      md-eval.pl -1 -c 0.25 -u ./local/uem_file.tmp -r $ref_rttm.scoring -s $hyp_rttm.scoring |\
+        awk '/MISSED SPEECH/ || /FALARM SPEECH/'
+    fi
   done
 fi
 
@@ -141,7 +175,14 @@ fi
 #######################################################################
 if [ $stage -le 4 ]; then
   for datadir in ${test_sets}; do
-    local/diarize.sh --nj 10 --cmd "$train_cmd" --stage $diarizer_stage \
+    # Choose the reference RTTM passed to local/diarize.sh: the cloned
+    # chime6_rttm file for this set when the new reference is requested,
+    # otherwise the ref_rttm file in the SAD output directory.
+    # The flag is compared as a string rather than executed as a command.
+    if [ "$use_new_rttm_reference" == "true" ]; then
+      # First '_'-separated field of the data directory name.
+      mode=${datadir%%_*}
+      ref_rttm=./chime6_rttm/${mode}_rttm
+    else
+      ref_rttm=data/${datadir}_${nnet_type}_seg/ref_rttm
+    fi
+    local/diarize.sh --nj $nj --cmd "$train_cmd" --stage $diarizer_stage \
+      --ref-rttm $ref_rttm \
       exp/xvector_nnet_1a \
       data/${datadir}_${nnet_type}_seg \
       exp/${datadir}_${nnet_type}_seg_diarization
@@ -156,7 +197,7 @@ if [ $stage -le 5 ]; then
     local/decode_diarized.sh --nj $nj --cmd "$decode_cmd" --stage $decode_diarize_stage \
       exp/${datadir}_${nnet_type}_seg_diarization data/$datadir data/lang \
       exp/chain_${train_set}_cleaned_rvb exp/nnet3_${train_set}_cleaned_rvb \
-      data/${datadir}_diarized
+      data/${datadir}_diarized || exit 1
   done
 fi
 
 
@@ -38,15 +38,18 @@ if [ $stage -le 0 ]; then
   echo "$0 copying data files in output directory"
   cp $rttm_dir/rttm $rttm_dir/rttm_1
   sed -i 's/'.ENH'/''/g' $rttm_dir/rttm_1
+  # removing participant introduction from the hypothesis rttm
+  # UEM file contains the scoring durations for each recording
+  local/truncate_rttm.py $rttm_dir/rttm_1 local/uem_file $rttm_dir/rttm_introduction_removed
   mkdir -p ${out_dir}_hires
   cp ${data_in}/{wav.scp,utt2spk} ${out_dir}_hires
   utils/data/get_reco2dur.sh ${out_dir}_hires
 fi
 
 if [ $stage -le 1 ]; then
   echo "$0 creating segments file from rttm and utt2spk, reco2file_and_channel "
-  local/convert_rttm_to_utt2spk_and_segments.py --append-reco-id-to-spkr=true $rttm_dir/rttm_1 \
-    <(awk '{print $2".ENH "$2" "$3}' $rttm_dir/rttm_1 |sort -u) \
+  local/convert_rttm_to_utt2spk_and_segments.py --append-reco-id-to-spkr=true $rttm_dir/rttm_introduction_removed \
+    <(awk '{print $2".ENH "$2" "$3}' $rttm_dir/rttm_introduction_removed |sort -u) \
     ${out_dir}_hires/utt2spk ${out_dir}_hires/segments
 
   utils/utt2spk_to_spk2utt.pl ${out_dir}_hires/utt2spk > ${out_dir}_hires/spk2utt
 
@@ -1,5 +1,7 @@
-#!/usr/bin/env bash
-# Copyright   2019   David Snder
+#!/bin/bash
+# Copyright   2019   David Snyder
+#             2020   Desh Raj
+
 # Apache 2.0.
 #
 # This script takes an input directory that has a segments file (and
@@ -20,7 +22,7 @@ if [ $# != 3 ]; then
   echo "Options: "
   echo "  --nj <nj>                                        # number of parallel jobs."
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
-  echo "  --ref-rttm <path to reference RTTM>              # if present, used to score output RTTM."
+  echo "  --ref-rttm <path to reference RTTM>              # reference RTTM file used to score the output RTTM."
   exit 1;
 fi
 
@@ -85,29 +87,33 @@ if [ $stage -le 4 ]; then
   echo "$0: wrote RTTM to output directory ${out_dir}"
 fi
 
+hyp_rttm=${out_dir}/rttm
+
 # For scoring the diarization system, we use the same tool that was
 # used in the DIHARD II challenge. This is available at:
 # https://github.com/nryant/dscore
+# Note that the scoring takes a single reference RTTM and a single
+# hypothesis RTTM.
 if [ $stage -le 5 ]; then
   # If a reference RTTM file is not provided, we create one using the backed up
   # segments and utt2spk files in the original data directory.
-  if [ -z $ref_rttm ]; then
-    ref_rttm=data/$name/rttm
-    echo "$0: preparing ref RTTM file from segments and utt2spk"
+  if [ -z "$ref_rttm" ]; then
     steps/segmentation/convert_utt2spk_and_segments_to_rttm.py data/$name/utt2spk.bak \
-      data/$name/segments.bak $ref_rttm
+      data/$name/segments.bak data/$name/rttm
+    ref_rttm=data/$name/rttm
   fi
-  grep 'U06' $ref_rttm > ${ref_rttm}.U06
-  ref_rttm_path=$(readlink -f ${ref_rttm}.U06)
-  out_rttm_path=$(readlink -f $out_dir/rttm)
+  echo "Diarization results for "${name}
   if ! [ -d dscore ]; then
     git clone https://github.com/nryant/dscore.git || exit 1;
     cd dscore
     python -m pip install --user -r requirements.txt
     cd ..
   fi
-  cd dscore
-  python score.py -r $ref_rttm_path -s $out_rttm_path
-  cd ..
+  sed 's/_U0[1-6]\.ENH//g' $ref_rttm > $ref_rttm.scoring
+  sed 's/_U0[1-6]\.ENH//g' $hyp_rttm > $hyp_rttm.scoring
+  ref_rttm_path=$(readlink -f ${ref_rttm}.scoring)
+  hyp_rttm_path=$(readlink -f ${hyp_rttm}.scoring)
+  cat ./local/uem_file | grep 'U06' | sed 's/_U0[1-6]//g' > ./local/uem_file.scoring
+  cd dscore && python score.py -u ../local/uem_file.scoring -r $ref_rttm_path \
+    -s $hyp_rttm_path && cd .. || exit 1;
 fi
-
 
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+# Installs dscore
+git clone https://github.com/nryant/dscore.git
+pip3 install intervaltree --user
+pip3 install tabulate --user
+pip3 install munkres --user
+pip3 install pytest --user
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+# Apache 2.0
+# This script truncates the rttm file
+# using UEM file and writes it to a new rttm file
+#
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+from scorelib.turn import trim_turns
+import scorelib.rttm as rttm_func
+from scorelib.uem import load_uem
+
+def get_args():
+    parser = argparse.ArgumentParser(
+        description="""This script truncates the rttm file
+                       using UEM file""")
+    parser.add_argument("rttm_file", type=str,
+                        help="""Input RTTM file.
+                            The format of the RTTM file is
+                            <type> <file-id> <channel-id> <begin-time> """
+                             """<end-time> <NA> <NA> <speaker> <conf>""")
+    parser.add_argument("uem_file", type=str,
+                        help="""Input UEM file.
+                            The format of the UEM file is
+                            <file-id> <channel-id> <begin-time> <end-time>""")
+    parser.add_argument("rttm_file_write", type=str,
+                        help="""output RTTM file.""")
+    args = parser.parse_args()
+    return args
+
+
+if __name__ == '__main__':
+    args = get_args()
+    rttm_writer = open(args.rttm_file_write, 'w')
+    turns, speaker_ids, file_ids = rttm_func.load_rttm(args.rttm_file)
+    loaded_uem = load_uem(args.uem_file)
+    truncated_turns = trim_turns(turns, loaded_uem)
+    rttm_func.write_rttm(args.rttm_file_write,truncated_turns)
@@ -0,0 +1,20 @@
+S01_U01 1 0 12000
+S02_U01 1 75 12000
+S09_U01 1 64 12000
+S21_U01 1 59 12000
+S01_U02 1 0 12000
+S02_U02 1 75 12000
+S09_U02 1 64 12000
+S21_U02 1 59 12000
+S01_U03 1 0 12000
+S02_U03 1 75 12000
+S09_U03 1 64 12000
+S21_U03 1 59 12000
+S01_U04 1 0 12000
+S02_U04 1 75 12000
+S09_U04 1 64 12000
+S21_U04 1 59 12000
+S01_U06 1 0 12000
+S02_U06 1 75 12000
+S09_U06 1 64 12000
+S21_U06 1 59 12000
@@ -1,6 +1,8 @@
 export KALDI_ROOT=`pwd`/../../..
 [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
 export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH
+export PATH=$PWD/dscore:$PATH
+export PYTHONPATH="${PYTHONPATH}:$PWD/dscore"
 [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
 . $KALDI_ROOT/tools/config/common_path.sh
 export LC_ALL=C
 
@@ -16,7 +16,7 @@ stage=0
 nnet_stage=-10
 sad_stage=0
 diarizer_stage=0
-decode_stage=1
+decode_stage=0
 enhancement=beamformit # for a new enhancement method,
                        # change this variable and decode stage
 decode_only=false
@@ -111,8 +111,12 @@ if [ $stage -le 4 ]; then
   utils/copy_data_dir.sh data/train_worn data/train_worn_org # back up
   grep -v -e "^P11_S03" -e "^P52_S19" -e "^P53_S24" -e "^P54_S24" data/train_worn_org/text > data/train_worn/text
   utils/fix_data_dir.sh data/train_worn
-fi
 
+  # Remove S12_U05 from training data since it has known issues
+  utils/copy_data_dir.sh data/train_u05 data/train_u05_org # back up
+  grep -v -e "^S12_U05" data/train_u05_org/text > data/train_u05/text
+  utils/fix_data_dir.sh data/train_u05
+fi
 
 #########################################################################################
 # In stages 5 and 6, we augment and fix train data for our training purpose. point source
 
@@ -77,17 +77,12 @@ fi
 
 
 # high-resolution features and i-vector extractor,
-if [ $stage -le 5 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then
+if [ $stage -le 4 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then
   echo "$0: data/${train_set}_sp_hires/feats.scp already exists."
   echo " ... Please either remove it, or rerun this script with stage > 2."
   exit 1
 fi
 
-if [ $stage -le 4 ]; then
-  echo "$0: preparing directory for speed-perturbed data"
-  utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp
-fi
-
 if [ $stage -le 5 ]; then
   echo "$0: creating high-resolution MFCC features"