diff --git a/.gitignore b/.gitignore index 9f8c727d4d0..d0a03a5c13e 100644 --- a/.gitignore +++ b/.gitignore @@ -37,6 +37,9 @@ core .[#]* *~ +# vim autosave and backup files. +*.sw? + # [ecg]tag files. TAGS tags diff --git a/egs/wsj/s5/steps/nnet3/decode_compose.sh b/egs/wsj/s5/steps/nnet3/decode_compose.sh new file mode 100755 index 00000000000..5e3b80adf09 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/decode_compose.sh @@ -0,0 +1,181 @@ +#!/usr/bin/env bash + +# Copyright 2021 Brno University of Technology (Author: Karel Vesely). +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + +# This script does decoding with a neural-net. +# It calls 'nnet3-latgen-faster-compose', which does on-the-fly boosting +# of HCLG graph by composing it with per-utterance boosting graphs (pre-existing). + +# Begin configuration section. +stage=1 +nj=4 # number of decoding jobs. +acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. +post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the + # regular scoring script works. +cmd=run.pl +beam=15.0 +frames_per_chunk=50 +max_active=7000 +min_active=200 +ivector_scale=1.0 +lattice_beam=8.0 # Beam we use in lattice generation. +iter=final +#num_threads=1 # if >1, will use gmm-latgen-faster-parallel +#use_gpu=false # If true, will use a GPU, with nnet3-latgen-faster-batch. + # In that case it is recommended to set num-threads to a large + # number, e.g. 20 if you have that many free CPU slots on a GPU + # node, and to use a small number of jobs. +scoring_opts= +skip_diagnostics=false +skip_scoring=false +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +online_ivector_dir= +minimize=false + +boosting_graphs= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [options] " + echo "e.g.: steps/nnet3/decode.sh --nj 8 \\" + echo " --online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\" + echo " exp/tri4b/graph_bg data/test_eval92_hires $dir/decode_bg_eval92" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 15.0" + echo " --iter # Iteration of model to decode; default is final." + echo " --scoring-opts # options to local/score.sh" + echo " --num-threads # number of threads to use, default 1." + echo " --use-gpu # default: false. If true, we recommend" + echo " # to use large --num-threads as the graph" + echo " # search becomes the limiting factor." + exit 1; +fi + +graphdir=$1 +data=$2 +dir=$3 +srcdir=$(dirname $dir) # Assume model directory one level up from decoding directory. +model=$srcdir/$iter.mdl + +[ -z "$boosting_graphs" ] && echo "Error, \$boosting_graphs have to be set !" && exit 1 + +extra_files= +if [ ! -z "$online_ivector_dir" ]; then + steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1 + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +fi + +utils/lang/check_phones_compatible.sh {$srcdir,$graphdir}/phones.txt || exit 1 + +for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do + [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj; +if [ -f $srcdir/cmvn_opts ]; then + cmvn_opts=`cat $srcdir/cmvn_opts` +else + cmvn_opts="--norm-means=false --norm-vars=false" +fi + +#thread_string= +#if $use_gpu; then +# if [ $num_threads -eq 1 ]; then +# echo "$0: **Warning: we recommend to use --num-threads > 1 for GPU-based decoding." +# fi +# thread_string="-batch --num-threads=$num_threads" +# queue_opt="--num-threads $num_threads --gpu 1" +#elif [ $num_threads -gt 1 ]; then +# thread_string="-parallel --num-threads=$num_threads" +# queue_opt="--num-threads $num_threads" +#fi +queue_opt="--num-threads 1" # 1 thread, we do on-the-fly boosting, the binary has no multi-threading... + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +## Set up features. +if [ -f $srcdir/online_cmvn ]; then online_cmvn=true +else online_cmvn=false; fi + +if ! $online_cmvn; then + echo "$0: feature type is raw" + feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" +else + echo "$0: feature type is raw (apply-cmvn-online)" + feats="ark,s,cs:apply-cmvn-online $cmvn_opts --spk2utt=ark:$sdata/JOB/spk2utt $srcdir/global_cmvn.stats scp:$sdata/JOB/feats.scp ark:- |" +fi + +if [ ! -z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +fi + +if [ "$post_decode_acwt" == 1.0 ]; then + lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz" +else + lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + +frame_subsampling_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" +elif [ -f $srcdir/init/info.txt ]; then + frame_subsampling_factor=$(awk '/^frame_subsampling_factor/ {print $2}' <$srcdir/init/info.txt) + if [ ! -z $frame_subsampling_factor ]; then + frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor" + fi +fi + +if [ $stage -le 1 ]; then + $cmd $queue_opt JOB=1:$nj $dir/log/decode.JOB.log \ + nnet3-latgen-faster-compose $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt "$model" \ + $graphdir/HCLG.fst "$boosting_graphs" "$feats" "$lat_wspecifier" || exit 1; +fi + + +if [ $stage -le 2 ]; then + if ! $skip_diagnostics ; then + [ ! -z $iter ] && iter_opt="--iter $iter" + steps/diagnostic/analyze_lats.sh --cmd "$cmd" $iter_opt $graphdir $dir + fi +fi + + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. +if [ $stage -le 3 ]; then + if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." 
&& exit 1; + echo "score best paths" + [ "$iter" != "final" ] && iter_opt="--iter $iter" + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + fi +fi +echo "Decoding done." +exit 0; diff --git a/egs/wsj/s5/steps/nnet3/decode_compose_rho.sh b/egs/wsj/s5/steps/nnet3/decode_compose_rho.sh new file mode 100755 index 00000000000..362bec260ca --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/decode_compose_rho.sh @@ -0,0 +1,184 @@ +#!/usr/bin/env bash + +# Copyright 2021 Brno University of Technology (Author: Karel Vesely). +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + +# This script does decoding with a neural-net. +# It calls 'nnet3-latgen-faster-compose', which does on-the-fly boosting +# of HCLG graph by composing it with per-utterance boosting graphs (pre-existing). + +# Begin configuration section. +stage=1 +nj=4 # number of decoding jobs. +acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. +post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the + # regular scoring script works. +cmd=run.pl +beam=15.0 +frames_per_chunk=50 +max_active=7000 +min_active=200 +ivector_scale=1.0 +lattice_beam=8.0 # Beam we use in lattice generation. +iter=final +#num_threads=1 # if >1, will use gmm-latgen-faster-parallel +#use_gpu=false # If true, will use a GPU, with nnet3-latgen-faster-batch. + # In that case it is recommended to set num-threads to a large + # number, e.g. 20 if you have that many free CPU slots on a GPU + # node, and to use a small number of jobs. +scoring_opts= +skip_diagnostics=false +skip_scoring=false +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +online_ivector_dir= +minimize=false + +boosting_graphs= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [options] " + echo "e.g.: steps/nnet3/decode.sh --nj 8 \\" + echo " --online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\" + echo " exp/tri4b/graph_bg data/test_eval92_hires $dir/decode_bg_eval92" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 15.0" + echo " --iter # Iteration of model to decode; default is final." + echo " --scoring-opts # options to local/score.sh" + echo " --num-threads # number of threads to use, default 1." + echo " --use-gpu # default: false. If true, we recommend" + echo " # to use large --num-threads as the graph" + echo " # search becomes the limiting factor." + exit 1; +fi + +graphdir=$1 +data=$2 +dir=$3 +srcdir=$(dirname $dir) # Assume model directory one level up from decoding directory. +model=$srcdir/$iter.mdl + +[ -z "$boosting_graphs" ] && echo "Error, \$boosting_graphs have to be set !" && exit 1 + +extra_files= +if [ ! -z "$online_ivector_dir" ]; then + steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1 + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +fi + +utils/lang/check_phones_compatible.sh {$srcdir,$graphdir}/phones.txt || exit 1 + +for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do + [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj; +if [ -f $srcdir/cmvn_opts ]; then + cmvn_opts=`cat $srcdir/cmvn_opts` +else + cmvn_opts="--norm-means=false --norm-vars=false" +fi + +#thread_string= +#if $use_gpu; then +# if [ $num_threads -eq 1 ]; then +# echo "$0: **Warning: we recommend to use --num-threads > 1 for GPU-based decoding." +# fi +# thread_string="-batch --num-threads=$num_threads" +# queue_opt="--num-threads $num_threads --gpu 1" +#elif [ $num_threads -gt 1 ]; then +# thread_string="-parallel --num-threads=$num_threads" +# queue_opt="--num-threads $num_threads" +#fi +queue_opt="--num-threads 1" # 1 thread, we do on-the-fly boosting, the binary has no multi-threading... + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +## Set up features. +if [ -f $srcdir/online_cmvn ]; then online_cmvn=true +else online_cmvn=false; fi + +if ! $online_cmvn; then + echo "$0: feature type is raw" + feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" +else + echo "$0: feature type is raw (apply-cmvn-online)" + feats="ark,s,cs:apply-cmvn-online $cmvn_opts --spk2utt=ark:$sdata/JOB/spk2utt $srcdir/global_cmvn.stats scp:$sdata/JOB/feats.scp ark:- |" +fi + +if [ ! -z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +fi + +if [ "$post_decode_acwt" == 1.0 ]; then + lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz" +else + lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + +frame_subsampling_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" +elif [ -f $srcdir/init/info.txt ]; then + frame_subsampling_factor=$(awk '/^frame_subsampling_factor/ {print $2}' <$srcdir/init/info.txt) + if [ ! -z $frame_subsampling_factor ]; then + frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor" + fi +fi + +rho_label=$(grep '#0' $graphdir/words.txt | awk '{ print $2; }') + +if [ $stage -le 1 ]; then + $cmd $queue_opt JOB=1:$nj $dir/log/decode.JOB.log \ + nnet3-latgen-faster-compose $ivector_opts $frame_subsampling_opt \ + --rho-label=$rho_label \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt "$model" \ + $graphdir/HCLG.fst "$boosting_graphs" "$feats" "$lat_wspecifier" || exit 1; +fi + + +if [ $stage -le 2 ]; then + if ! $skip_diagnostics ; then + [ ! -z $iter ] && iter_opt="--iter $iter" + steps/diagnostic/analyze_lats.sh --cmd "$cmd" $iter_opt $graphdir $dir + fi +fi + + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. +if [ $stage -le 3 ]; then + if ! $skip_scoring ; then + [ ! 
-x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + echo "score best paths" + [ "$iter" != "final" ] && iter_opt="--iter $iter" + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + fi +fi +echo "Decoding done." +exit 0; diff --git a/src/configure_mkl b/src/configure_mkl new file mode 100755 index 00000000000..0f2936471f7 --- /dev/null +++ b/src/configure_mkl @@ -0,0 +1,21 @@ +#!/bin/bash + +# Get the MKL config, +#source /usr/local/share/intel/mkl/bin/mklvars.sh intel64 ilp64 # outdated... +export MKLROOT=/usr/local/share/intel/mkl/2021.4.0 + +# Use older compiler +# export CXX=g++-7.4 # CUDA 10.0 works well with version '7.4', +export CXX=g++-9.4 # CUDA 11.2, recommended gcc version 9.*, + +export CXXFLAGS="-march=x86-64" # compile for 'generic' 64bit CPU, +#export CXXFLAGS="-march=westmere" # oldest architecutre we have at BUT (X5675, Westmere, blade024), + +# Use different CUDA, +# CUDATK=/usr/local/share/cuda-10.2.89 # CUDA 10.0 supports our default gcc 7.4.0, +# CUDATK=/usr/local/share/cuda-11.0.194 +CUDATK=/usr/local/share/cuda-11.2 +# and add '--cudatk-dir=$CUDATK' to './configure' + +# Generate kaldi.mk, +./configure --mkl-root=$MKLROOT --cudatk-dir=$CUDATK --shared diff --git a/src/latbin/Makefile b/src/latbin/Makefile index d5cc4d035b9..592fca41e50 100644 --- a/src/latbin/Makefile +++ b/src/latbin/Makefile @@ -26,7 +26,8 @@ BINFILES = lattice-best-path lattice-prune lattice-equivalent lattice-to-nbest \ lattice-lmrescore-const-arpa lattice-lmrescore-rnnlm nbest-to-prons \ lattice-arc-post lattice-determinize-non-compact lattice-lmrescore-kaldi-rnnlm \ lattice-lmrescore-pruned lattice-lmrescore-kaldi-rnnlm-pruned lattice-reverse \ - lattice-expand lattice-path-cover lattice-add-nnlmscore + lattice-expand lattice-path-cover lattice-add-nnlmscore \ + lattice-compose-fsts OBJFILES = @@ -36,6 +37,6 @@ TESTFILES = ADDLIBS = ../rnnlm/kaldi-rnnlm.a ../nnet3/kaldi-nnet3.a \ ../cudamatrix/kaldi-cudamatrix.a ../lat/kaldi-lat.a ../lm/kaldi-lm.a \ ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a \ - ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/latbin/lattice-compose-fsts.cc b/src/latbin/lattice-compose-fsts.cc new file mode 100644 index 00000000000..8fd889e192f --- /dev/null +++ b/src/latbin/lattice-compose-fsts.cc @@ -0,0 +1,200 @@ +// latbin/lattice-compose-fsts.cc + +// Copyright 2020 Brno University of Technology; Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
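+
+// Illustration (a sketch; the integer word ids 1234 and 5678 below are
+// placeholders, not taken from any real words.txt): a per-utterance word FST
+// for the fst-rspecifier2 argument can be a single-state acceptor that boosts
+// selected words by a negative graph cost, written in OpenFst text format as
+//
+//   0 0 1234 1234 -1.0
+//   0 0 5678 5678 -1.0
+//   0
+//
+// compiled with OpenFst's fstcompile and stored in a Kaldi table keyed by
+// utterance id; this program then composes each input lattice with the FST
+// stored under the matching key.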
+
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "fstext/fstext-lib.h"
+#include "lat/kaldi-lattice.h"
+#include "lat/lattice-functions.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+    using fst::SymbolTable;
+    using fst::VectorFst;
+    using fst::StdArc;
+
+    const char *usage =
+        "Composes lattices (in transducer form, as type Lattice) with word-network FSTs.\n"
+        "Either with a single FST from rxfilename or with per-utterance FSTs from rspecifier.\n"
+        "The FST weights are interpreted as \"graph weights\" when converted into the Lattice format.\n"
+        "\n"
+        "Usage: lattice-compose-fsts [options] lattice-rspecifier1 "
+        "(fst-rspecifier2|fst-rxfilename2) lattice-wspecifier\n"
+        " e.g.: lattice-compose-fsts ark:1.lats ark:2.fsts ark:composed.lats\n"
+        " or: lattice-compose-fsts ark:1.lats G.fst ark:composed.lats\n";
+
+    ParseOptions po(usage);
+
+    bool write_compact = true;
+    int32 num_states_cache = 50000;
+    int32 phi_label = fst::kNoLabel;  // == -1
+    po.Register("write-compact", &write_compact,
+                "If true, write in normal (compact) form.");
+    po.Register("phi-label", &phi_label,
+                "If >0, the label on backoff arcs of the LM");
+    po.Register("num-states-cache", &num_states_cache,
+                "Number of states we cache when mapping LM FST to lattice type."
+                " More -> more memory but faster.");
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    KALDI_ASSERT(phi_label > 0 || phi_label == fst::kNoLabel);  // e.g. 0 not allowed.
+
+    std::string lats_rspecifier1 = po.GetArg(1),
+        arg2 = po.GetArg(2),
+        lats_wspecifier = po.GetArg(3);
+    int32 n_done = 0, n_fail = 0;
+
+    SequentialLatticeReader lattice_reader1(lats_rspecifier1);
+
+    CompactLatticeWriter compact_lattice_writer;
+    LatticeWriter lattice_writer;
+
+    if (write_compact) {
+      compact_lattice_writer.Open(lats_wspecifier);
+    } else {
+      lattice_writer.Open(lats_wspecifier);
+    }
+
+    if (ClassifyRspecifier(arg2, NULL, NULL) == kNoRspecifier) {
+      std::string fst_rxfilename = arg2;
+      VectorFst<StdArc> *fst2 = fst::ReadFstKaldi(fst_rxfilename);
+      // mapped_fst2 is fst2 interpreted using the LatticeWeight semiring,
+      // with all the cost on the first member of the pair (since we're
+      // assuming it's a graph weight).
+      if (fst2->Properties(fst::kILabelSorted, true) == 0) {
+        // Make sure fst2 is sorted on ilabel.
+        fst::ILabelCompare<StdArc> ilabel_comp;
+        ArcSort(fst2, ilabel_comp);
+      }
+      /* // THIS MAKES ALL STATES FINAL STATES! WHY?
+      if (phi_label > 0)
+        PropagateFinal(phi_label, fst2);
+      */
+
+      fst::CacheOptions cache_opts(true, num_states_cache);
+      fst::MapFstOptions mapfst_opts(cache_opts);
+      fst::StdToLatticeMapper<BaseFloat> mapper;
+      fst::MapFst<StdArc, LatticeArc, fst::StdToLatticeMapper<BaseFloat> >
+          mapped_fst2(*fst2, mapper, mapfst_opts);
+
+      for (; !lattice_reader1.Done(); lattice_reader1.Next()) {
+        std::string key = lattice_reader1.Key();
+        KALDI_VLOG(1) << "Processing lattice for key " << key;
+        Lattice lat1 = lattice_reader1.Value();
+        ArcSort(&lat1, fst::OLabelCompare<LatticeArc>());
+        Lattice composed_lat;
+        if (phi_label > 0) {
+          PhiCompose(lat1, mapped_fst2, phi_label, &composed_lat);
+        } else {
+          Compose(lat1, mapped_fst2, &composed_lat);
+        }
+        if (composed_lat.Start() == fst::kNoStateId) {
+          KALDI_WARN << "Empty lattice for utterance " << key << " (incompatible LM?)";
+          n_fail++;
+        } else {
+          if (write_compact) {
+            CompactLattice clat;
+            ConvertLattice(composed_lat, &clat);
+            compact_lattice_writer.Write(key, clat);
+          } else {
+            lattice_writer.Write(key, composed_lat);
+          }
+          n_done++;
+        }
+      }
+      delete fst2;
+    } else {
+      // Compose each utterance with its matching (by key) FST.
+      std::string fst_rspecifier2 = arg2;
+      RandomAccessTableReader<fst::VectorFstHolder> fst_reader2(fst_rspecifier2);
+
+      for (; !lattice_reader1.Done(); lattice_reader1.Next()) {
+        std::string key = lattice_reader1.Key();
+        KALDI_VLOG(1) << "Processing lattice for key " << key;
+        Lattice lat1 = lattice_reader1.Value();
+        lattice_reader1.FreeCurrent();
+
+        if (!fst_reader2.HasKey(key)) {
+          KALDI_WARN << "Not producing output for utterance " << key
+                     << " because it's not present in second table.";
+          n_fail++;
+          continue;
+        }
+
+        VectorFst<StdArc> fst2 = fst_reader2.Value(key);
+        if (fst2.Properties(fst::kILabelSorted, true) == 0) {
+          // Make sure fst2 is sorted on ilabel.
+          fst::ILabelCompare<StdArc> ilabel_comp;
+          fst::ArcSort(&fst2, ilabel_comp);
+        }
+        /* // THIS MAKES ALL STATES FINAL STATES! WHY?
+        if (phi_label > 0)
+          PropagateFinal(phi_label, &fst2);
+        */
+
+        // mapped_fst2 is fst2 interpreted using the LatticeWeight semiring,
+        // with all the cost on the first member of the pair (since we're
+        // assuming it's a graph weight).
+        fst::CacheOptions cache_opts(true, num_states_cache);
+        fst::MapFstOptions mapfst_opts(cache_opts);
+        fst::StdToLatticeMapper<BaseFloat> mapper;
+        fst::MapFst<StdArc, LatticeArc, fst::StdToLatticeMapper<BaseFloat> >
+            mapped_fst2(fst2, mapper, mapfst_opts);
+
+        // sort lat1 on olabel.
+        ArcSort(&lat1, fst::OLabelCompare<LatticeArc>());
+
+        Lattice composed_lat;
+        if (phi_label > 0) PhiCompose(lat1, mapped_fst2, phi_label, &composed_lat);
+        else Compose(lat1, mapped_fst2, &composed_lat);
+
+        if (composed_lat.Start() == fst::kNoStateId) {
+          KALDI_WARN << "Empty lattice for utterance " << key << " (incompatible LM?)";
+          n_fail++;
+        } else {
+          if (write_compact) {
+            CompactLattice clat;
+            ConvertLattice(composed_lat, &clat);
+            compact_lattice_writer.Write(key, clat);
+          } else {
+            lattice_writer.Write(key, composed_lat);
+          }
+          n_done++;
+        }
+      }
+    }
+
+    KALDI_LOG << "Done " << n_done << " lattices; failed for "
+              << n_fail;
+
+    return (n_done != 0 ?
0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/nnet3bin/Makefile b/src/nnet3bin/Makefile index 039fc258b13..2804e4b31fe 100644 --- a/src/nnet3bin/Makefile +++ b/src/nnet3bin/Makefile @@ -22,7 +22,8 @@ BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \ nnet3-xvector-compute-batched \ nnet3-latgen-grammar nnet3-compute-batch nnet3-latgen-faster-batch \ nnet3-latgen-faster-lookahead cuda-gpu-available cuda-compiled \ - nnet3-latgen-faster-looped-parallel + nnet3-latgen-faster-looped-parallel \ + nnet3-latgen-faster-compose OBJFILES = @@ -37,7 +38,7 @@ ADDLIBS = ../nnet3/kaldi-nnet3.a ../chain/kaldi-chain.a \ ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../feat/kaldi-feat.a \ ../transform/kaldi-transform.a ../ivector/kaldi-ivector.a ../gmm/kaldi-gmm.a \ ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/nnet3bin/nnet3-latgen-faster-compose.cc b/src/nnet3bin/nnet3-latgen-faster-compose.cc new file mode 100644 index 00000000000..d4e7c094d30 --- /dev/null +++ b/src/nnet3bin/nnet3-latgen-faster-compose.cc @@ -0,0 +1,308 @@ +// nnet3bin/nnet3-latgen-faster-compose.cc + +// Copyright 2020 Brno University of Technology (author: Karel Vesely) +// 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2014 Guoguo Chen + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include +#include + +#include + +#include "base/kaldi-common.h" +#include "base/timer.h" +#include "decoder/decoder-wrappers.h" +#include "fstext/fstext-lib.h" +#include "hmm/transition-model.h" +#include "nnet3/nnet-am-decodable-simple.h" +#include "nnet3/nnet-utils.h" +#include "tree/context-dep.h" +#include "util/common-utils.h" + + +int main(int argc, char *argv[]) { + // note: making this program work with GPUs is as simple as initializing the + // device, but it probably won't make a huge difference in speed for typical + // setups. You should use nnet3-latgen-faster-batch if you want to use a GPU. 
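+  //
+  // For illustration (the word ids below are placeholders): a typical
+  // utterance-specific boosting graph B is a single-state acceptor with one
+  // self-loop per word, where boosted words carry a negative (discount) cost,
+  // e.g. in OpenFst text format:
+  //
+  //   0 0 7 7 -2.0     <- boosted word
+  //   0 0 8 8 0.0      <- ordinary word (one such arc per word in words.txt)
+  //   0
+  //
+  // With --rho-label set to the integer id of '#0' (as
+  // steps/nnet3/decode_compose_rho.sh does), the per-word self-loops for
+  // non-boosted words can be replaced by a single self-loop on the rho label,
+  // which matches "all remaining words" during composition, so words missing
+  // from B are no longer dropped from HCLG o B.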
+ try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + using fst::SymbolTable; + using fst::Fst; + using fst::VectorFst; + using fst::StdArc; + + const char *usage = + "Generate lattices using nnet3 neural net model, with on-the-fly composition HCLG o B.\n" + "B is utterance-specific boosting graph, typically a single-state FST with\n" + "all words from words.txt on self loop arcs (then composition is not prohibitevly slow).\n" + "Some word-arcs will have score discounts as costs, to boost them in HMM beam-search.\n" + "Or, by not including words in B, we can remove them from HCLG network.\n" + "Usage: nnet3-latgen-faster-compose [options] " + " [ [] ]\n" + "See also: nnet3-latgen-faster-parallel, nnet3-latgen-faster-batch\n"; + + ParseOptions po(usage); + + Timer timer, timer_compose; + double elapsed_compose = 0.0; + + int32 rho_label = fst::kNoLabel; // == -1 + + bool allow_partial = false; + LatticeFasterDecoderConfig config; + NnetSimpleComputationOptions decodable_opts; + + std::string word_syms_filename; + std::string ivector_rspecifier, + online_ivector_rspecifier, + utt2spk_rspecifier; + int32 online_ivector_period = 0; + config.Register(&po); + decodable_opts.Register(&po); + + po.Register("rho-label", &rho_label, + "If >0, symbol for 'match the rest' in the biasing graph boosting_fst"); + + po.Register("word-symbol-table", &word_syms_filename, + "Symbol table for words [for debug output]"); + po.Register("allow-partial", &allow_partial, + "If true, produce output even if end state was not reached."); + po.Register("ivectors", &ivector_rspecifier, "Rspecifier for " + "iVectors as vectors (i.e. not estimated online); per utterance " + "by default, or per speaker if you provide the --utt2spk option."); + po.Register("utt2spk", &utt2spk_rspecifier, "Rspecifier for " + "utt2spk option used to get ivectors per speaker"); + po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier for " + "iVectors estimated online, as matrices. If you supply this," + " you must set the --online-ivector-period option."); + po.Register("online-ivector-period", &online_ivector_period, "Number of frames " + "between iVectors in matrices supplied to the --online-ivectors " + "option"); + + po.Read(argc, argv); + + if (po.NumArgs() < 4 || po.NumArgs() > 6) { + po.PrintUsage(); + exit(1); + } + + std::string model_in_filename = po.GetArg(1), + hclg_fst_rxfilename = po.GetArg(2), + boosting_fst_rspecifier = po.GetArg(3), + feature_rspecifier = po.GetArg(4), + lattice_wspecifier = po.GetArg(5), + words_wspecifier = po.GetOptArg(6), + alignment_wspecifier = po.GetOptArg(7); + + TransitionModel trans_model; + AmNnetSimple am_nnet; + { + bool binary; + Input ki(model_in_filename, &binary); + trans_model.Read(ki.Stream(), binary); + am_nnet.Read(ki.Stream(), binary); + SetBatchnormTestMode(true, &(am_nnet.GetNnet())); + SetDropoutTestMode(true, &(am_nnet.GetNnet())); + CollapseModel(CollapseModelConfig(), &(am_nnet.GetNnet())); + } + + bool determinize = config.determinize_lattice; + CompactLatticeWriter compact_lattice_writer; + LatticeWriter lattice_writer; + if (! (determinize ? 
compact_lattice_writer.Open(lattice_wspecifier) + : lattice_writer.Open(lattice_wspecifier))) { + KALDI_ERR << "Could not open table for writing lattices: " + << lattice_wspecifier; + } + + RandomAccessBaseFloatMatrixReader online_ivector_reader( + online_ivector_rspecifier); + RandomAccessBaseFloatVectorReaderMapped ivector_reader( + ivector_rspecifier, utt2spk_rspecifier); + + Int32VectorWriter words_writer(words_wspecifier); + Int32VectorWriter alignment_writer(alignment_wspecifier); + + std::unique_ptr word_syms = nullptr; + if (word_syms_filename != "") { + word_syms.reset(fst::SymbolTable::ReadText(word_syms_filename)); + if (!word_syms) + KALDI_ERR << "Could not read symbol table from file " + << word_syms_filename; + } + + double tot_like = 0.0; + kaldi::int64 frame_count = 0; + int num_success = 0, num_fail = 0; + // this compiler object allows caching of computations across + // different utterances. + CachingOptimizingCompiler compiler(am_nnet.GetNnet(), + decodable_opts.optimize_config); + + KALDI_ASSERT(ClassifyRspecifier(hclg_fst_rxfilename, NULL, NULL) == kNoRspecifier); + { + SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); + + RandomAccessTableReader boosting_fst_reader(boosting_fst_rspecifier); + + // 'hclg_fst' is a single FST. + VectorFst hclg_fst; + { + auto hclg_fst_tmp = std::unique_ptr>(fst::ReadFstKaldiGeneric(hclg_fst_rxfilename)); + hclg_fst = VectorFst(*hclg_fst_tmp); // Fst -> VectorFst, as it has to be MutableFst... + // 'hclg_fst_tmp' is deleted by 'going out of scope' ... + } + + // make sure hclg is sorted on olabel + if (hclg_fst.Properties(fst::kOLabelSorted, true) == 0) { + fst::OLabelCompare olabel_comp; + fst::ArcSort(&hclg_fst, olabel_comp); + } + + timer.Reset(); + + //// MAIN LOOP //// + for (; !feature_reader.Done(); feature_reader.Next()) { + std::string utt = feature_reader.Key(); + const Matrix &features (feature_reader.Value()); + if (features.NumRows() == 0) { + KALDI_WARN << "Zero-length utterance: " << utt; + num_fail++; + continue; + } + const Matrix *online_ivectors = NULL; + const Vector *ivector = NULL; + if (!ivector_rspecifier.empty()) { + if (!ivector_reader.HasKey(utt)) { + KALDI_WARN << "No iVector available for utterance " << utt; + num_fail++; + continue; + } else { + ivector = &ivector_reader.Value(utt); + } + } + if (!online_ivector_rspecifier.empty()) { + if (!online_ivector_reader.HasKey(utt)) { + KALDI_WARN << "No online iVector available for utterance " << utt; + num_fail++; + continue; + } else { + online_ivectors = &online_ivector_reader.Value(utt); + } + } + + // get the boosting graph, + VectorFst boosting_fst; + if (!boosting_fst_reader.HasKey(utt)) { + KALDI_WARN << "No boosting fst for utterance " << utt; + num_fail++; + continue; + } else { + boosting_fst = boosting_fst_reader.Value(utt); // copy, + } + + timer_compose.Reset(); + + // RmEpsilon saved 30% of composition runtime. + // - Note: we are loading 2-state graphs with eps back-link to the initial state. + if (boosting_fst.Properties(fst::kIEpsilons, true) != 0) { + fst::RmEpsilon(&boosting_fst); + } + + // Make sure boosting graph is sorted on ilabel. 
+ if (boosting_fst.Properties(fst::kILabelSorted, true) == 0) { + fst::ILabelCompare ilabel_comp; + fst::ArcSort(&boosting_fst, ilabel_comp); + } + + // run composition, + VectorFst decode_fst; + if (rho_label > 0) { + fst::RhoCompose(hclg_fst, boosting_fst, rho_label, &decode_fst); + } else { + fst::Compose(hclg_fst, boosting_fst, &decode_fst); + } + + // check that composed graph is non-empty, + if (decode_fst.Start() == fst::kNoStateId) { + KALDI_WARN << "Empty 'decode_fst' HCLG for utterance " + << utt << " (bad boosting graph?)"; + num_fail++; + continue; + } + + elapsed_compose += timer_compose.Elapsed(); + + DecodableAmNnetSimple nnet_decodable( + decodable_opts, trans_model, am_nnet, + features, ivector, online_ivectors, + online_ivector_period, &compiler); + + // Note: decode_fst is VectorFst, not ConstFst. + // + // OpenFst docs say that more specific iterators + // are faster than generic iterators. And in HCLG + // is usually loaded for decoding as ConstFst. + // + // auto decode_fst_ = ConstFst(decode_fst); + // + // In this way, I tried to cast VectorFst to ConstFst, + // but this made the decoding 20% slower. + // + LatticeFasterDecoder decoder(decode_fst, config); + + double like; + if (DecodeUtteranceLatticeFaster( + decoder, nnet_decodable, trans_model, word_syms.get(), utt, + decodable_opts.acoustic_scale, determinize, allow_partial, + &alignment_writer, &words_writer, &compact_lattice_writer, + &lattice_writer, + &like)) { + tot_like += like; + frame_count += nnet_decodable.NumFramesReady(); + num_success++; + } else { + ++num_fail; + } + } + } + + kaldi::int64 input_frame_count = + frame_count * decodable_opts.frame_subsampling_factor; + + double elapsed = timer.Elapsed(); + KALDI_LOG << "Time taken "<< elapsed + << "s: real-time factor assuming 100 frames/sec is " + << (elapsed * 100.0 / input_frame_count); + KALDI_LOG << "Composition time "<< elapsed_compose + << "s (" << (elapsed_compose * 100.0 / elapsed) << "%)"; + KALDI_LOG << "Done " << num_success << " utterances, failed for " + << num_fail; + KALDI_LOG << "Overall log-likelihood per frame is " + << (tot_like / frame_count) << " over " + << frame_count << " frames."; + + return num_success != 0 ? 0 : 1; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +}
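
A usage sketch of the new decoding scripts (the experiment directories, --nj and
acoustic-weight values below are illustrative placeholders, not fixed by this patch):

  # Per-utterance boosting graphs are expected as a Kaldi FST table keyed by
  # utterance id, here assumed to be exp/boosting/graphs.fsts:
  steps/nnet3/decode_compose.sh --nj 8 --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3/ivectors_test_hires \
    --boosting-graphs "ark:exp/boosting/graphs.fsts" \
    exp/chain/tdnn1a/graph data/test_hires exp/chain/tdnn1a/decode_test_boosted

  # The rho variant reads the '#0' label from the graph directory's words.txt
  # itself, so it is invoked the same way:
  steps/nnet3/decode_compose_rho.sh --nj 8 --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3/ivectors_test_hires \
    --boosting-graphs "ark:exp/boosting/graphs.fsts" \
    exp/chain/tdnn1a/graph data/test_hires exp/chain/tdnn1a/decode_test_boosted_rho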