diff --git a/.gitignore b/.gitignore index 9f8c727d4d0..d0a03a5c13e 100644 --- a/.gitignore +++ b/.gitignore @@ -37,6 +37,9 @@ core .[#]* *~ +# vim autosave and backup files. +*.sw? + # [ecg]tag files. TAGS tags diff --git a/egs/wsj/s5/steps/nnet3/decode_compose.sh b/egs/wsj/s5/steps/nnet3/decode_compose.sh new file mode 100755 index 00000000000..5e3b80adf09 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/decode_compose.sh @@ -0,0 +1,181 @@ +#!/usr/bin/env bash + +# Copyright 2021 Brno University of Technology (Author: Karel Vesely). +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + +# This script does decoding with a neural-net. +# It calls 'nnet3-latgen-faster-compose', which does on-the-fly boosting +# of HCLG graph by composing it with per-utterance boosting graphs (pre-existing). + +# Begin configuration section. +stage=1 +nj=4 # number of decoding jobs. +acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. +post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the + # regular scoring script works. +cmd=run.pl +beam=15.0 +frames_per_chunk=50 +max_active=7000 +min_active=200 +ivector_scale=1.0 +lattice_beam=8.0 # Beam we use in lattice generation. +iter=final +#num_threads=1 # if >1, will use gmm-latgen-faster-parallel +#use_gpu=false # If true, will use a GPU, with nnet3-latgen-faster-batch. + # In that case it is recommended to set num-threads to a large + # number, e.g. 20 if you have that many free CPU slots on a GPU + # node, and to use a small number of jobs. +scoring_opts= +skip_diagnostics=false +skip_scoring=false +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +online_ivector_dir= +minimize=false + +boosting_graphs= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [options] " + echo "e.g.: steps/nnet3/decode.sh --nj 8 \\" + echo " --online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\" + echo " exp/tri4b/graph_bg data/test_eval92_hires $dir/decode_bg_eval92" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 15.0" + echo " --iter # Iteration of model to decode; default is final." + echo " --scoring-opts # options to local/score.sh" + echo " --num-threads # number of threads to use, default 1." + echo " --use-gpu # default: false. If true, we recommend" + echo " # to use large --num-threads as the graph" + echo " # search becomes the limiting factor." + exit 1; +fi + +graphdir=$1 +data=$2 +dir=$3 +srcdir=$(dirname $dir) # Assume model directory one level up from decoding directory. +model=$srcdir/$iter.mdl + +[ -z "$boosting_graphs" ] && echo "Error, \$boosting_graphs have to be set !" && exit 1 + +extra_files= +if [ ! -z "$online_ivector_dir" ]; then + steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1 + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +fi + +utils/lang/check_phones_compatible.sh {$srcdir,$graphdir}/phones.txt || exit 1 + +for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do + [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj; +if [ -f $srcdir/cmvn_opts ]; then + cmvn_opts=`cat $srcdir/cmvn_opts` +else + cmvn_opts="--norm-means=false --norm-vars=false" +fi + +#thread_string= +#if $use_gpu; then +# if [ $num_threads -eq 1 ]; then +# echo "$0: **Warning: we recommend to use --num-threads > 1 for GPU-based decoding." +# fi +# thread_string="-batch --num-threads=$num_threads" +# queue_opt="--num-threads $num_threads --gpu 1" +#elif [ $num_threads -gt 1 ]; then +# thread_string="-parallel --num-threads=$num_threads" +# queue_opt="--num-threads $num_threads" +#fi +queue_opt="--num-threads 1" # 1 thread, we do on-the-fly boosting, the binary has no multi-threading... + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +## Set up features. +if [ -f $srcdir/online_cmvn ]; then online_cmvn=true +else online_cmvn=false; fi + +if ! $online_cmvn; then + echo "$0: feature type is raw" + feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" +else + echo "$0: feature type is raw (apply-cmvn-online)" + feats="ark,s,cs:apply-cmvn-online $cmvn_opts --spk2utt=ark:$sdata/JOB/spk2utt $srcdir/global_cmvn.stats scp:$sdata/JOB/feats.scp ark:- |" +fi + +if [ ! -z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +fi + +if [ "$post_decode_acwt" == 1.0 ]; then + lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz" +else + lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + +frame_subsampling_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" +elif [ -f $srcdir/init/info.txt ]; then + frame_subsampling_factor=$(awk '/^frame_subsampling_factor/ {print $2}' <$srcdir/init/info.txt) + if [ ! -z $frame_subsampling_factor ]; then + frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor" + fi +fi + +if [ $stage -le 1 ]; then + $cmd $queue_opt JOB=1:$nj $dir/log/decode.JOB.log \ + nnet3-latgen-faster-compose $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt "$model" \ + $graphdir/HCLG.fst "$boosting_graphs" "$feats" "$lat_wspecifier" || exit 1; +fi + + +if [ $stage -le 2 ]; then + if ! $skip_diagnostics ; then + [ ! -z $iter ] && iter_opt="--iter $iter" + steps/diagnostic/analyze_lats.sh --cmd "$cmd" $iter_opt $graphdir $dir + fi +fi + + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. +if [ $stage -le 3 ]; then + if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." 
&& exit 1; + echo "score best paths" + [ "$iter" != "final" ] && iter_opt="--iter $iter" + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + fi +fi +echo "Decoding done." +exit 0; diff --git a/egs/wsj/s5/steps/nnet3/decode_compose_rho.sh b/egs/wsj/s5/steps/nnet3/decode_compose_rho.sh new file mode 100755 index 00000000000..362bec260ca --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/decode_compose_rho.sh @@ -0,0 +1,184 @@ +#!/usr/bin/env bash + +# Copyright 2021 Brno University of Technology (Author: Karel Vesely). +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + +# This script does decoding with a neural-net. +# It calls 'nnet3-latgen-faster-compose', which does on-the-fly boosting +# of HCLG graph by composing it with per-utterance boosting graphs (pre-existing). + +# Begin configuration section. +stage=1 +nj=4 # number of decoding jobs. +acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. +post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the + # regular scoring script works. +cmd=run.pl +beam=15.0 +frames_per_chunk=50 +max_active=7000 +min_active=200 +ivector_scale=1.0 +lattice_beam=8.0 # Beam we use in lattice generation. +iter=final +#num_threads=1 # if >1, will use gmm-latgen-faster-parallel +#use_gpu=false # If true, will use a GPU, with nnet3-latgen-faster-batch. + # In that case it is recommended to set num-threads to a large + # number, e.g. 20 if you have that many free CPU slots on a GPU + # node, and to use a small number of jobs. +scoring_opts= +skip_diagnostics=false +skip_scoring=false +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +online_ivector_dir= +minimize=false + +boosting_graphs= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [options] " + echo "e.g.: steps/nnet3/decode.sh --nj 8 \\" + echo " --online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\" + echo " exp/tri4b/graph_bg data/test_eval92_hires $dir/decode_bg_eval92" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 15.0" + echo " --iter # Iteration of model to decode; default is final." + echo " --scoring-opts # options to local/score.sh" + echo " --num-threads # number of threads to use, default 1." + echo " --use-gpu # default: false. If true, we recommend" + echo " # to use large --num-threads as the graph" + echo " # search becomes the limiting factor." + exit 1; +fi + +graphdir=$1 +data=$2 +dir=$3 +srcdir=$(dirname $dir) # Assume model directory one level up from decoding directory. +model=$srcdir/$iter.mdl + +[ -z "$boosting_graphs" ] && echo "Error, \$boosting_graphs have to be set !" && exit 1 + +extra_files= +if [ ! -z "$online_ivector_dir" ]; then + steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1 + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +fi + +utils/lang/check_phones_compatible.sh {$srcdir,$graphdir}/phones.txt || exit 1 + +for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do + [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj; +if [ -f $srcdir/cmvn_opts ]; then + cmvn_opts=`cat $srcdir/cmvn_opts` +else + cmvn_opts="--norm-means=false --norm-vars=false" +fi + +#thread_string= +#if $use_gpu; then +# if [ $num_threads -eq 1 ]; then +# echo "$0: **Warning: we recommend to use --num-threads > 1 for GPU-based decoding." +# fi +# thread_string="-batch --num-threads=$num_threads" +# queue_opt="--num-threads $num_threads --gpu 1" +#elif [ $num_threads -gt 1 ]; then +# thread_string="-parallel --num-threads=$num_threads" +# queue_opt="--num-threads $num_threads" +#fi +queue_opt="--num-threads 1" # 1 thread, we do on-the-fly boosting, the binary has no multi-threading... + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +## Set up features. +if [ -f $srcdir/online_cmvn ]; then online_cmvn=true +else online_cmvn=false; fi + +if ! $online_cmvn; then + echo "$0: feature type is raw" + feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" +else + echo "$0: feature type is raw (apply-cmvn-online)" + feats="ark,s,cs:apply-cmvn-online $cmvn_opts --spk2utt=ark:$sdata/JOB/spk2utt $srcdir/global_cmvn.stats scp:$sdata/JOB/feats.scp ark:- |" +fi + +if [ ! -z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +fi + +if [ "$post_decode_acwt" == 1.0 ]; then + lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz" +else + lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + +frame_subsampling_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" +elif [ -f $srcdir/init/info.txt ]; then + frame_subsampling_factor=$(awk '/^frame_subsampling_factor/ {print $2}' <$srcdir/init/info.txt) + if [ ! -z $frame_subsampling_factor ]; then + frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor" + fi +fi + +rho_label=$(grep '#0' $graphdir/words.txt | awk '{ print $2; }') + +if [ $stage -le 1 ]; then + $cmd $queue_opt JOB=1:$nj $dir/log/decode.JOB.log \ + nnet3-latgen-faster-compose $ivector_opts $frame_subsampling_opt \ + --rho-label=$rho_label \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt "$model" \ + $graphdir/HCLG.fst "$boosting_graphs" "$feats" "$lat_wspecifier" || exit 1; +fi + + +if [ $stage -le 2 ]; then + if ! $skip_diagnostics ; then + [ ! -z $iter ] && iter_opt="--iter $iter" + steps/diagnostic/analyze_lats.sh --cmd "$cmd" $iter_opt $graphdir $dir + fi +fi + + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. +if [ $stage -le 3 ]; then + if ! $skip_scoring ; then + [ ! 
-x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + echo "score best paths" + [ "$iter" != "final" ] && iter_opt="--iter $iter" + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + fi +fi +echo "Decoding done." +exit 0; diff --git a/src/configure_mkl b/src/configure_mkl new file mode 100755 index 00000000000..0f2936471f7 --- /dev/null +++ b/src/configure_mkl @@ -0,0 +1,21 @@ +#!/bin/bash + +# Get the MKL config, +#source /usr/local/share/intel/mkl/bin/mklvars.sh intel64 ilp64 # outdated... +export MKLROOT=/usr/local/share/intel/mkl/2021.4.0 + +# Use older compiler +# export CXX=g++-7.4 # CUDA 10.0 works well with version '7.4', +export CXX=g++-9.4 # CUDA 11.2, recommended gcc version 9.*, + +export CXXFLAGS="-march=x86-64" # compile for 'generic' 64bit CPU, +#export CXXFLAGS="-march=westmere" # oldest architecutre we have at BUT (X5675, Westmere, blade024), + +# Use different CUDA, +# CUDATK=/usr/local/share/cuda-10.2.89 # CUDA 10.0 supports our default gcc 7.4.0, +# CUDATK=/usr/local/share/cuda-11.0.194 +CUDATK=/usr/local/share/cuda-11.2 +# and add '--cudatk-dir=$CUDATK' to './configure' + +# Generate kaldi.mk, +./configure --mkl-root=$MKLROOT --cudatk-dir=$CUDATK --shared diff --git a/src/latbin/Makefile b/src/latbin/Makefile index d5cc4d035b9..592fca41e50 100644 --- a/src/latbin/Makefile +++ b/src/latbin/Makefile @@ -26,7 +26,8 @@ BINFILES = lattice-best-path lattice-prune lattice-equivalent lattice-to-nbest \ lattice-lmrescore-const-arpa lattice-lmrescore-rnnlm nbest-to-prons \ lattice-arc-post lattice-determinize-non-compact lattice-lmrescore-kaldi-rnnlm \ lattice-lmrescore-pruned lattice-lmrescore-kaldi-rnnlm-pruned lattice-reverse \ - lattice-expand lattice-path-cover lattice-add-nnlmscore + lattice-expand lattice-path-cover lattice-add-nnlmscore \ + lattice-compose-fsts OBJFILES = @@ -36,6 +37,6 @@ TESTFILES = ADDLIBS = ../rnnlm/kaldi-rnnlm.a ../nnet3/kaldi-nnet3.a \ ../cudamatrix/kaldi-cudamatrix.a ../lat/kaldi-lat.a ../lm/kaldi-lm.a \ ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a \ - ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/latbin/lattice-compose-fsts.cc b/src/latbin/lattice-compose-fsts.cc new file mode 100644 index 00000000000..8fd889e192f --- /dev/null +++ b/src/latbin/lattice-compose-fsts.cc @@ -0,0 +1,200 @@ +// latbin/lattice-compose-fsts.cc + +// Copyright 2020 Brno University of Technology; Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
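+
+// Illustration (a sketch; the integer word ids 1234 and 5678 below are
+// placeholders, not taken from any real words.txt): a per-utterance word FST
+// for the fst-rspecifier2 argument can be a single-state acceptor that boosts
+// selected words by a negative graph cost, written in OpenFst text format as
+//
+//   0 0 1234 1234 -1.0
+//   0 0 5678 5678 -1.0
+//   0
+//
+// compiled with OpenFst's fstcompile and stored in a Kaldi table keyed by
+// utterance id; this program then composes each input lattice with the FST
+// stored under the matching key.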
+
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "fstext/fstext-lib.h"
+#include "lat/kaldi-lattice.h"
+#include "lat/lattice-functions.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+    using fst::SymbolTable;
+    using fst::VectorFst;
+    using fst::StdArc;
+
+    const char *usage =
+        "Composes lattices (in transducer form, as type Lattice) with word-network FSTs.\n"
+        "Either with a single FST from rxfilename or with per-utterance FSTs from rspecifier.\n"
+        "The FST weights are interpreted as \"graph weights\" when converted into the Lattice format.\n"
+        "\n"
+        "Usage: lattice-compose-fsts [options] lattice-rspecifier1 "
+        "(fst-rspecifier2|fst-rxfilename2) lattice-wspecifier\n"
+        " e.g.: lattice-compose-fsts ark:1.lats ark:2.fsts ark:composed.lats\n"
+        " or: lattice-compose-fsts ark:1.lats G.fst ark:composed.lats\n";
+
+    ParseOptions po(usage);
+
+    bool write_compact = true;
+    int32 num_states_cache = 50000;
+    int32 phi_label = fst::kNoLabel;  // == -1
+    po.Register("write-compact", &write_compact,
+                "If true, write in normal (compact) form.");
+    po.Register("phi-label", &phi_label,
+                "If >0, the label on backoff arcs of the LM");
+    po.Register("num-states-cache", &num_states_cache,
+                "Number of states we cache when mapping LM FST to lattice type."
+                " More -> more memory but faster.");
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    KALDI_ASSERT(phi_label > 0 || phi_label == fst::kNoLabel);  // e.g. 0 not allowed.
+
+    std::string lats_rspecifier1 = po.GetArg(1),
+        arg2 = po.GetArg(2),
+        lats_wspecifier = po.GetArg(3);
+    int32 n_done = 0, n_fail = 0;
+
+    SequentialLatticeReader lattice_reader1(lats_rspecifier1);
+
+    CompactLatticeWriter compact_lattice_writer;
+    LatticeWriter lattice_writer;
+
+    if (write_compact) {
+      compact_lattice_writer.Open(lats_wspecifier);
+    } else {
+      lattice_writer.Open(lats_wspecifier);
+    }
+
+    if (ClassifyRspecifier(arg2, NULL, NULL) == kNoRspecifier) {
+      std::string fst_rxfilename = arg2;
+      VectorFst<StdArc> *fst2 = fst::ReadFstKaldi(fst_rxfilename);
+      // mapped_fst2 is fst2 interpreted using the LatticeWeight semiring,
+      // with all the cost on the first member of the pair (since we're
+      // assuming it's a graph weight).
+      if (fst2->Properties(fst::kILabelSorted, true) == 0) {
+        // Make sure fst2 is sorted on ilabel.
+        fst::ILabelCompare<StdArc> ilabel_comp;
+        ArcSort(fst2, ilabel_comp);
+      }
+      /* // THIS MAKES ALL STATES FINAL STATES! WHY?
+      if (phi_label > 0)
+        PropagateFinal(phi_label, fst2);
+      */
+
+      fst::CacheOptions cache_opts(true, num_states_cache);
+      fst::MapFstOptions mapfst_opts(cache_opts);
+      fst::StdToLatticeMapper<BaseFloat> mapper;
+      fst::MapFst<StdArc, LatticeArc, fst::StdToLatticeMapper<BaseFloat> >
+          mapped_fst2(*fst2, mapper, mapfst_opts);
+
+      for (; !lattice_reader1.Done(); lattice_reader1.Next()) {
+        std::string key = lattice_reader1.Key();
+        KALDI_VLOG(1) << "Processing lattice for key " << key;
+        Lattice lat1 = lattice_reader1.Value();
+        ArcSort(&lat1, fst::OLabelCompare<LatticeArc>());
+        Lattice composed_lat;
+        if (phi_label > 0) {
+          PhiCompose(lat1, mapped_fst2, phi_label, &composed_lat);
+        } else {
+          Compose(lat1, mapped_fst2, &composed_lat);
+        }
+        if (composed_lat.Start() == fst::kNoStateId) {
+          KALDI_WARN << "Empty lattice for utterance " << key << " (incompatible LM?)";
+          n_fail++;
+        } else {
+          if (write_compact) {
+            CompactLattice clat;
+            ConvertLattice(composed_lat, &clat);
+            compact_lattice_writer.Write(key, clat);
+          } else {
+            lattice_writer.Write(key, composed_lat);
+          }
+          n_done++;
+        }
+      }
+      delete fst2;
+    } else {
+      // Compose each utterance with its matching (by key) FST.
+      std::string fst_rspecifier2 = arg2;
+      RandomAccessTableReader<fst::VectorFstHolder> fst_reader2(fst_rspecifier2);
+
+      for (; !lattice_reader1.Done(); lattice_reader1.Next()) {
+        std::string key = lattice_reader1.Key();
+        KALDI_VLOG(1) << "Processing lattice for key " << key;
+        Lattice lat1 = lattice_reader1.Value();
+        lattice_reader1.FreeCurrent();
+
+        if (!fst_reader2.HasKey(key)) {
+          KALDI_WARN << "Not producing output for utterance " << key
+                     << " because it's not present in second table.";
+          n_fail++;
+          continue;
+        }
+
+        VectorFst<StdArc> fst2 = fst_reader2.Value(key);
+        if (fst2.Properties(fst::kILabelSorted, true) == 0) {
+          // Make sure fst2 is sorted on ilabel.
+          fst::ILabelCompare<StdArc> ilabel_comp;
+          fst::ArcSort(&fst2, ilabel_comp);
+        }
+        /* // THIS MAKES ALL STATES FINAL STATES! WHY?
+        if (phi_label > 0)
+          PropagateFinal(phi_label, &fst2);
+        */
+
+        // mapped_fst2 is fst2 interpreted using the LatticeWeight semiring,
+        // with all the cost on the first member of the pair (since we're
+        // assuming it's a graph weight).
+        fst::CacheOptions cache_opts(true, num_states_cache);
+        fst::MapFstOptions mapfst_opts(cache_opts);
+        fst::StdToLatticeMapper<BaseFloat> mapper;
+        fst::MapFst<StdArc, LatticeArc, fst::StdToLatticeMapper<BaseFloat> >
+            mapped_fst2(fst2, mapper, mapfst_opts);
+
+        // sort lat1 on olabel.
+        ArcSort(&lat1, fst::OLabelCompare<LatticeArc>());
+
+        Lattice composed_lat;
+        if (phi_label > 0) PhiCompose(lat1, mapped_fst2, phi_label, &composed_lat);
+        else Compose(lat1, mapped_fst2, &composed_lat);
+
+        if (composed_lat.Start() == fst::kNoStateId) {
+          KALDI_WARN << "Empty lattice for utterance " << key << " (incompatible LM?)";
+          n_fail++;
+        } else {
+          if (write_compact) {
+            CompactLattice clat;
+            ConvertLattice(composed_lat, &clat);
+            compact_lattice_writer.Write(key, clat);
+          } else {
+            lattice_writer.Write(key, composed_lat);
+          }
+          n_done++;
+        }
+      }
+    }
+
+    KALDI_LOG << "Done " << n_done << " lattices; failed for "
+              << n_fail;
+
+    return (n_done != 0 ?
0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/nnet3bin/Makefile b/src/nnet3bin/Makefile index 039fc258b13..2804e4b31fe 100644 --- a/src/nnet3bin/Makefile +++ b/src/nnet3bin/Makefile @@ -22,7 +22,8 @@ BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \ nnet3-xvector-compute-batched \ nnet3-latgen-grammar nnet3-compute-batch nnet3-latgen-faster-batch \ nnet3-latgen-faster-lookahead cuda-gpu-available cuda-compiled \ - nnet3-latgen-faster-looped-parallel + nnet3-latgen-faster-looped-parallel \ + nnet3-latgen-faster-compose OBJFILES = @@ -37,7 +38,7 @@ ADDLIBS = ../nnet3/kaldi-nnet3.a ../chain/kaldi-chain.a \ ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../feat/kaldi-feat.a \ ../transform/kaldi-transform.a ../ivector/kaldi-ivector.a ../gmm/kaldi-gmm.a \ ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/nnet3bin/nnet3-latgen-faster-compose.cc b/src/nnet3bin/nnet3-latgen-faster-compose.cc new file mode 100644 index 00000000000..d4e7c094d30 --- /dev/null +++ b/src/nnet3bin/nnet3-latgen-faster-compose.cc @@ -0,0 +1,308 @@ +// nnet3bin/nnet3-latgen-faster-compose.cc + +// Copyright 2020 Brno University of Technology (author: Karel Vesely) +// 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2014 Guoguo Chen + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include +#include + +#include + +#include "base/kaldi-common.h" +#include "base/timer.h" +#include "decoder/decoder-wrappers.h" +#include "fstext/fstext-lib.h" +#include "hmm/transition-model.h" +#include "nnet3/nnet-am-decodable-simple.h" +#include "nnet3/nnet-utils.h" +#include "tree/context-dep.h" +#include "util/common-utils.h" + + +int main(int argc, char *argv[]) { + // note: making this program work with GPUs is as simple as initializing the + // device, but it probably won't make a huge difference in speed for typical + // setups. You should use nnet3-latgen-faster-batch if you want to use a GPU. 
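+  //
+  // For illustration (the word ids below are placeholders): a typical
+  // utterance-specific boosting graph B is a single-state acceptor with one
+  // self-loop per word, where boosted words carry a negative (discount) cost,
+  // e.g. in OpenFst text format:
+  //
+  //   0 0 7 7 -2.0     <- boosted word
+  //   0 0 8 8 0.0      <- ordinary word (one such arc per word in words.txt)
+  //   0
+  //
+  // With --rho-label set to the integer id of '#0' (as
+  // steps/nnet3/decode_compose_rho.sh does), the per-word self-loops for
+  // non-boosted words can be replaced by a single self-loop on the rho label,
+  // which matches "all remaining words" during composition, so words missing
+  // from B are no longer dropped from HCLG o B.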
+ try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + using fst::SymbolTable; + using fst::Fst; + using fst::VectorFst; + using fst::StdArc; + + const char *usage = + "Generate lattices using nnet3 neural net model, with on-the-fly composition HCLG o B.\n" + "B is utterance-specific boosting graph, typically a single-state FST with\n" + "all words from words.txt on self loop arcs (then composition is not prohibitevly slow).\n" + "Some word-arcs will have score discounts as costs, to boost them in HMM beam-search.\n" + "Or, by not including words in B, we can remove them from HCLG network.\n" + "Usage: nnet3-latgen-faster-compose [options] " + " [ [] ]\n" + "See also: nnet3-latgen-faster-parallel, nnet3-latgen-faster-batch\n"; + + ParseOptions po(usage); + + Timer timer, timer_compose; + double elapsed_compose = 0.0; + + int32 rho_label = fst::kNoLabel; // == -1 + + bool allow_partial = false; + LatticeFasterDecoderConfig config; + NnetSimpleComputationOptions decodable_opts; + + std::string word_syms_filename; + std::string ivector_rspecifier, + online_ivector_rspecifier, + utt2spk_rspecifier; + int32 online_ivector_period = 0; + config.Register(&po); + decodable_opts.Register(&po); + + po.Register("rho-label", &rho_label, + "If >0, symbol for 'match the rest' in the biasing graph boosting_fst"); + + po.Register("word-symbol-table", &word_syms_filename, + "Symbol table for words [for debug output]"); + po.Register("allow-partial", &allow_partial, + "If true, produce output even if end state was not reached."); + po.Register("ivectors", &ivector_rspecifier, "Rspecifier for " + "iVectors as vectors (i.e. not estimated online); per utterance " + "by default, or per speaker if you provide the --utt2spk option."); + po.Register("utt2spk", &utt2spk_rspecifier, "Rspecifier for " + "utt2spk option used to get ivectors per speaker"); + po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier for " + "iVectors estimated online, as matrices. If you supply this," + " you must set the --online-ivector-period option."); + po.Register("online-ivector-period", &online_ivector_period, "Number of frames " + "between iVectors in matrices supplied to the --online-ivectors " + "option"); + + po.Read(argc, argv); + + if (po.NumArgs() < 4 || po.NumArgs() > 6) { + po.PrintUsage(); + exit(1); + } + + std::string model_in_filename = po.GetArg(1), + hclg_fst_rxfilename = po.GetArg(2), + boosting_fst_rspecifier = po.GetArg(3), + feature_rspecifier = po.GetArg(4), + lattice_wspecifier = po.GetArg(5), + words_wspecifier = po.GetOptArg(6), + alignment_wspecifier = po.GetOptArg(7); + + TransitionModel trans_model; + AmNnetSimple am_nnet; + { + bool binary; + Input ki(model_in_filename, &binary); + trans_model.Read(ki.Stream(), binary); + am_nnet.Read(ki.Stream(), binary); + SetBatchnormTestMode(true, &(am_nnet.GetNnet())); + SetDropoutTestMode(true, &(am_nnet.GetNnet())); + CollapseModel(CollapseModelConfig(), &(am_nnet.GetNnet())); + } + + bool determinize = config.determinize_lattice; + CompactLatticeWriter compact_lattice_writer; + LatticeWriter lattice_writer; + if (! (determinize ? 
compact_lattice_writer.Open(lattice_wspecifier) + : lattice_writer.Open(lattice_wspecifier))) { + KALDI_ERR << "Could not open table for writing lattices: " + << lattice_wspecifier; + } + + RandomAccessBaseFloatMatrixReader online_ivector_reader( + online_ivector_rspecifier); + RandomAccessBaseFloatVectorReaderMapped ivector_reader( + ivector_rspecifier, utt2spk_rspecifier); + + Int32VectorWriter words_writer(words_wspecifier); + Int32VectorWriter alignment_writer(alignment_wspecifier); + + std::unique_ptr word_syms = nullptr; + if (word_syms_filename != "") { + word_syms.reset(fst::SymbolTable::ReadText(word_syms_filename)); + if (!word_syms) + KALDI_ERR << "Could not read symbol table from file " + << word_syms_filename; + } + + double tot_like = 0.0; + kaldi::int64 frame_count = 0; + int num_success = 0, num_fail = 0; + // this compiler object allows caching of computations across + // different utterances. + CachingOptimizingCompiler compiler(am_nnet.GetNnet(), + decodable_opts.optimize_config); + + KALDI_ASSERT(ClassifyRspecifier(hclg_fst_rxfilename, NULL, NULL) == kNoRspecifier); + { + SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); + + RandomAccessTableReader boosting_fst_reader(boosting_fst_rspecifier); + + // 'hclg_fst' is a single FST. + VectorFst hclg_fst; + { + auto hclg_fst_tmp = std::unique_ptr>(fst::ReadFstKaldiGeneric(hclg_fst_rxfilename)); + hclg_fst = VectorFst(*hclg_fst_tmp); // Fst -> VectorFst, as it has to be MutableFst... + // 'hclg_fst_tmp' is deleted by 'going out of scope' ... + } + + // make sure hclg is sorted on olabel + if (hclg_fst.Properties(fst::kOLabelSorted, true) == 0) { + fst::OLabelCompare olabel_comp; + fst::ArcSort(&hclg_fst, olabel_comp); + } + + timer.Reset(); + + //// MAIN LOOP //// + for (; !feature_reader.Done(); feature_reader.Next()) { + std::string utt = feature_reader.Key(); + const Matrix &features (feature_reader.Value()); + if (features.NumRows() == 0) { + KALDI_WARN << "Zero-length utterance: " << utt; + num_fail++; + continue; + } + const Matrix *online_ivectors = NULL; + const Vector *ivector = NULL; + if (!ivector_rspecifier.empty()) { + if (!ivector_reader.HasKey(utt)) { + KALDI_WARN << "No iVector available for utterance " << utt; + num_fail++; + continue; + } else { + ivector = &ivector_reader.Value(utt); + } + } + if (!online_ivector_rspecifier.empty()) { + if (!online_ivector_reader.HasKey(utt)) { + KALDI_WARN << "No online iVector available for utterance " << utt; + num_fail++; + continue; + } else { + online_ivectors = &online_ivector_reader.Value(utt); + } + } + + // get the boosting graph, + VectorFst boosting_fst; + if (!boosting_fst_reader.HasKey(utt)) { + KALDI_WARN << "No boosting fst for utterance " << utt; + num_fail++; + continue; + } else { + boosting_fst = boosting_fst_reader.Value(utt); // copy, + } + + timer_compose.Reset(); + + // RmEpsilon saved 30% of composition runtime. + // - Note: we are loading 2-state graphs with eps back-link to the initial state. + if (boosting_fst.Properties(fst::kIEpsilons, true) != 0) { + fst::RmEpsilon(&boosting_fst); + } + + // Make sure boosting graph is sorted on ilabel. 
+ if (boosting_fst.Properties(fst::kILabelSorted, true) == 0) { + fst::ILabelCompare ilabel_comp; + fst::ArcSort(&boosting_fst, ilabel_comp); + } + + // run composition, + VectorFst decode_fst; + if (rho_label > 0) { + fst::RhoCompose(hclg_fst, boosting_fst, rho_label, &decode_fst); + } else { + fst::Compose(hclg_fst, boosting_fst, &decode_fst); + } + + // check that composed graph is non-empty, + if (decode_fst.Start() == fst::kNoStateId) { + KALDI_WARN << "Empty 'decode_fst' HCLG for utterance " + << utt << " (bad boosting graph?)"; + num_fail++; + continue; + } + + elapsed_compose += timer_compose.Elapsed(); + + DecodableAmNnetSimple nnet_decodable( + decodable_opts, trans_model, am_nnet, + features, ivector, online_ivectors, + online_ivector_period, &compiler); + + // Note: decode_fst is VectorFst, not ConstFst. + // + // OpenFst docs say that more specific iterators + // are faster than generic iterators. And in HCLG + // is usually loaded for decoding as ConstFst. + // + // auto decode_fst_ = ConstFst(decode_fst); + // + // In this way, I tried to cast VectorFst to ConstFst, + // but this made the decoding 20% slower. + // + LatticeFasterDecoder decoder(decode_fst, config); + + double like; + if (DecodeUtteranceLatticeFaster( + decoder, nnet_decodable, trans_model, word_syms.get(), utt, + decodable_opts.acoustic_scale, determinize, allow_partial, + &alignment_writer, &words_writer, &compact_lattice_writer, + &lattice_writer, + &like)) { + tot_like += like; + frame_count += nnet_decodable.NumFramesReady(); + num_success++; + } else { + ++num_fail; + } + } + } + + kaldi::int64 input_frame_count = + frame_count * decodable_opts.frame_subsampling_factor; + + double elapsed = timer.Elapsed(); + KALDI_LOG << "Time taken "<< elapsed + << "s: real-time factor assuming 100 frames/sec is " + << (elapsed * 100.0 / input_frame_count); + KALDI_LOG << "Composition time "<< elapsed_compose + << "s (" << (elapsed_compose * 100.0 / elapsed) << "%)"; + KALDI_LOG << "Done " << num_success << " utterances, failed for " + << num_fail; + KALDI_LOG << "Overall log-likelihood per frame is " + << (tot_like / frame_count) << " over " + << frame_count << " frames."; + + return num_success != 0 ? 0 : 1; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +}
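
A usage sketch of the new decoding scripts (the experiment directories, --nj and
acoustic-weight values below are illustrative placeholders, not fixed by this patch):

  # Per-utterance boosting graphs are expected as a Kaldi FST table keyed by
  # utterance id, here assumed to be exp/boosting/graphs.fsts:
  steps/nnet3/decode_compose.sh --nj 8 --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3/ivectors_test_hires \
    --boosting-graphs "ark:exp/boosting/graphs.fsts" \
    exp/chain/tdnn1a/graph data/test_hires exp/chain/tdnn1a/decode_test_boosted

  # The rho variant reads the '#0' label from the graph directory's words.txt
  # itself, so it is invoked the same way:
  steps/nnet3/decode_compose_rho.sh --nj 8 --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3/ivectors_test_hires \
    --boosting-graphs "ark:exp/boosting/graphs.fsts" \
    exp/chain/tdnn1a/graph data/test_hires exp/chain/tdnn1a/decode_test_boosted_rho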