diff --git a/src/bin/latgen-faster-mapped-combine.cc b/src/bin/latgen-faster-mapped-combine.cc
new file mode 100644
index 00000000000..ae5946d9e8e
--- /dev/null
+++ b/src/bin/latgen-faster-mapped-combine.cc
@@ -0,0 +1,179 @@
+// bin/latgen-faster-mapped-combine.cc
+
+// Copyright 2009-2012  Microsoft Corporation, Karel Vesely
+//                2013  Johns Hopkins University (author: Daniel Povey)
+//                2014  Guoguo Chen
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "tree/context-dep.h"
+#include "hmm/transition-model.h"
+#include "fstext/fstext-lib.h"
+#include "decoder/decoder-wrappers.h"
+#include "decoder/decodable-matrix.h"
+#include "base/timer.h"
+
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    typedef kaldi::int32 int32;
+    using fst::SymbolTable;
+    using fst::Fst;
+    using fst::StdArc;
+
+    const char *usage =
+        "Generate lattices, reading log-likelihoods as matrices\n"
+        " (model is needed only for the integer mappings in its transition-model)\n"
+        "Usage: latgen-faster-mapped-combine [options] trans-model-in (fst-in|fsts-rspecifier) loglikes-rspecifier"
+        " lattice-wspecifier [ words-wspecifier [alignments-wspecifier] ]\n";
+    ParseOptions po(usage);
+    Timer timer;
+    bool allow_partial = false;
+    BaseFloat acoustic_scale = 0.1;
+    LatticeFasterDecoderCombineConfig config;
+
+    std::string word_syms_filename;
+    config.Register(&po);
+    po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods");
+
+    po.Register("word-symbol-table", &word_syms_filename, "Symbol table for words [for debug output]");
+    po.Register("allow-partial", &allow_partial, "If true, produce output even if end state was not reached.");
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() < 4 || po.NumArgs() > 6) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string model_in_filename = po.GetArg(1),
+        fst_in_str = po.GetArg(2),
+        feature_rspecifier = po.GetArg(3),
+        lattice_wspecifier = po.GetArg(4),
+        words_wspecifier = po.GetOptArg(5),
+        alignment_wspecifier = po.GetOptArg(6);
+
+    TransitionModel trans_model;
+    ReadKaldiObject(model_in_filename, &trans_model);
+
+    bool determinize = config.determinize_lattice;
+    CompactLatticeWriter compact_lattice_writer;
+    LatticeWriter lattice_writer;
+    if (! (determinize ? compact_lattice_writer.Open(lattice_wspecifier)
+           : lattice_writer.Open(lattice_wspecifier)))
+      KALDI_ERR << "Could not open table for writing lattices: "
+                << lattice_wspecifier;
+
+    Int32VectorWriter words_writer(words_wspecifier);
+
+    Int32VectorWriter alignment_writer(alignment_wspecifier);
+
+    fst::SymbolTable *word_syms = NULL;
+    if (word_syms_filename != "")
+      if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename)))
+        KALDI_ERR << "Could not read symbol table from file "
+                  << word_syms_filename;
+
+    double tot_like = 0.0;
+    kaldi::int64 frame_count = 0;
+    int num_success = 0, num_fail = 0;
+
+    if (ClassifyRspecifier(fst_in_str, NULL, NULL) == kNoRspecifier) {
+      SequentialBaseFloatMatrixReader loglike_reader(feature_rspecifier);
+      // Input FST is just one FST, not a table of FSTs.
+      Fst<StdArc> *decode_fst = fst::ReadFstKaldiGeneric(fst_in_str);
+      timer.Reset();
+
+      {
+        LatticeFasterDecoderCombine decoder(*decode_fst, config);
+
+        for (; !loglike_reader.Done(); loglike_reader.Next()) {
+          std::string utt = loglike_reader.Key();
+          Matrix<BaseFloat> loglikes (loglike_reader.Value());
+          loglike_reader.FreeCurrent();
+          if (loglikes.NumRows() == 0) {
+            KALDI_WARN << "Zero-length utterance: " << utt;
+            num_fail++;
+            continue;
+          }
+
+          DecodableMatrixScaledMapped decodable(trans_model, loglikes, acoustic_scale);
+
+          double like;
+          if (DecodeUtteranceLatticeFasterCombine(
+                  decoder, decodable, trans_model, word_syms, utt,
+                  acoustic_scale, determinize, allow_partial, &alignment_writer,
+                  &words_writer, &compact_lattice_writer, &lattice_writer,
+                  &like)) {
+            tot_like += like;
+            frame_count += loglikes.NumRows();
+            num_success++;
+          } else num_fail++;
+        }
+      }
+      delete decode_fst;  // delete this only after decoder goes out of scope.
+    } else {  // We have different FSTs for different utterances.
+      SequentialTableReader<fst::VectorFstHolder> fst_reader(fst_in_str);
+      RandomAccessBaseFloatMatrixReader loglike_reader(feature_rspecifier);
+      for (; !fst_reader.Done(); fst_reader.Next()) {
+        std::string utt = fst_reader.Key();
+        if (!loglike_reader.HasKey(utt)) {
+          KALDI_WARN << "Not decoding utterance " << utt
+                     << " because no loglikes available.";
+          num_fail++;
+          continue;
+        }
+        const Matrix<BaseFloat> &loglikes = loglike_reader.Value(utt);
+        if (loglikes.NumRows() == 0) {
+          KALDI_WARN << "Zero-length utterance: " << utt;
+          num_fail++;
+          continue;
+        }
+        LatticeFasterDecoderCombine decoder(fst_reader.Value(), config);
+        DecodableMatrixScaledMapped decodable(trans_model, loglikes, acoustic_scale);
+        double like;
+        if (DecodeUtteranceLatticeFasterCombine(
+                decoder, decodable, trans_model, word_syms, utt, acoustic_scale,
+                determinize, allow_partial, &alignment_writer, &words_writer,
+                &compact_lattice_writer, &lattice_writer, &like)) {
+          tot_like += like;
+          frame_count += loglikes.NumRows();
+          num_success++;
+        } else num_fail++;
+      }
+    }
+
+    double elapsed = timer.Elapsed();
+    KALDI_LOG << "Time taken "<< elapsed
+              << "s: real-time factor assuming 100 frames/sec is "
+              << (elapsed*100.0/frame_count);
+    KALDI_LOG << "Done " << num_success << " utterances, failed for "
+              << num_fail;
+    KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count) << " over "
+              << frame_count << " frames.";
+
+    delete word_syms;
+    if (num_success != 0) return 0;
+    else return 1;
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
diff --git a/src/decoder/lattice-faster-decoder-combine.cc b/src/decoder/lattice-faster-decoder-combine.cc
index 67c4bfe7e8e..63e7e31224b 100644
--- a/src/decoder/lattice-faster-decoder-combine.cc
+++ b/src/decoder/lattice-faster-decoder-combine.cc
@@ -892,6 +892,7 @@ void LatticeFasterDecoderCombineTpl<FST, Token>::ProcessForFrame(
   }  // end of while loop
   frame_processed_[frame] = true;
   frame_processed_[frame + 1] = false;
+  KALDI_VLOG(6) << "toks after: " << cur_toks_.size();
 }
diff --git a/src/decoder/lattice-faster-decoder-combine.h b/src/decoder/lattice-faster-decoder-combine.h
index f4d74a5acd8..1c27a4ecb84 100644
--- a/src/decoder/lattice-faster-decoder-combine.h
+++ b/src/decoder/lattice-faster-decoder-combine.h
@@ -32,6 +32,7 @@
 #include "lat/kaldi-lattice.h"
 #include "decoder/grammar-fst.h"
 #include "decoder/lattice-faster-decoder.h"
+#include "memory.h"
 
 namespace kaldi {
 
@@ -242,7 +243,8 @@ class LatticeFasterDecoderCombineTpl {
   using Weight = typename Arc::Weight;
   using ForwardLinkT = decodercombine::ForwardLink<Token>;
 
-  using StateIdToTokenMap = typename std::unordered_map<StateId, Token*>;
+  //using StateIdToTokenMap = typename std::unordered_map<StateId, Token*>;
+  using StateIdToTokenMap = typename std::unordered_map<StateId, Token*, std::hash<StateId>, std::equal_to<StateId>, fkaldi::PoolAllocator<std::pair<const StateId, Token*>>>;
   using IterType = typename StateIdToTokenMap::const_iterator;
 
   // Instantiate this class once for each thing you have to decode.
diff --git a/src/decoder/memory.h b/src/decoder/memory.h
new file mode 100644
index 00000000000..784712eb0c3
--- /dev/null
+++ b/src/decoder/memory.h
@@ -0,0 +1,421 @@
+// See www.openfst.org for extensive documentation on this weighted
+// finite-state transducer library.
+//
+// FST memory utilities.
+
+#ifndef FFST_MEMORY_H_
+#define FFST_MEMORY_H_
+
+#include <cstring>
+#include <list>
+#include <memory>
+#include <utility>
+
+#include <vector>
+#include <fst/types.h>
+#include <fst/log.h>
+
+namespace fkaldi {
+using namespace fst;
+
+// Default block allocation size.
+constexpr int kAllocSize = 64;
+
+// Minimum number of allocations per block.
+constexpr int kAllocFit = 4;
+
+// Base class for MemoryArena that allows (e.g.) MemoryArenaCollection to
+// easily manipulate collections of variously sized arenas.
+class MemoryArenaBase {
+ public:
+  virtual ~MemoryArenaBase() {}
+  virtual size_t Size() const = 0;
+};
+
+// Allocates 'size' uninitialized memory chunks of size sizeof(T) from underlying
+// blocks of (at least) size 'block_size * sizeof(T)'. All blocks are freed when
+// this class is deleted. Result of allocate() will be aligned to sizeof(T).
+template <typename T>
+class MemoryArena : public MemoryArenaBase {
+ public:
+  explicit MemoryArena(size_t block_size = kAllocSize)
+      : block_size_(block_size * sizeof(T)), block_pos_(0) {
+    blocks_.emplace_front(new char[block_size_]);
+  }
+
+  void *Allocate(size_t size) {
+    const auto byte_size = size * sizeof(T);
+    if (byte_size * kAllocFit > block_size_) {
+      // Large block; adds new large block.
+      auto *ptr = new char[byte_size];
+      blocks_.emplace_back(ptr);
+      return ptr;
+    }
+    if (block_pos_ + byte_size > block_size_) {
+      // Doesn't fit; adds new standard block.
+      auto *ptr = new char[block_size_];
+      block_pos_ = 0;
+      blocks_.emplace_front(ptr);
+    }
+    // Fits; uses current block.
+    auto *ptr = blocks_.front().get() + block_pos_;
+    block_pos_ += byte_size;
+    return ptr;
+  }
+
+  size_t Size() const override { return sizeof(T); }
+
+ private:
+  size_t block_size_;  // Default block size in bytes.
+  size_t block_pos_;   // Current position in block in bytes.
+  std::list<std::unique_ptr<char[]>> blocks_;  // List of allocated blocks.
+};
+
+// Base class for MemoryPool that allows (e.g.) MemoryPoolCollection to easily
+// manipulate collections of variously sized pools.
+class MemoryPoolBase {
+ public:
+  virtual ~MemoryPoolBase() {}
+  virtual size_t Size() const = 0;
+};
+
+// Allocates and frees initially uninitialized memory chunks of size sizeof(T).
+// Keeps an internal list of freed chunks that are reused (as is) on the next
+// allocation if available. Chunks are constructed in blocks of size
+// 'pool_size'. All memory is freed when the class is deleted. The result of
+// Allocate() will be suitably memory-aligned.
+//
+// Combined with placement operator new and destroy functions for the T class,
+// this can be used to improve allocation efficiency. See nlp/fst/lib/visit.h
+// (global new) and nlp/fst/lib/dfs-visit.h (class new) for examples.
+template <typename T>
+class MemoryPool : public MemoryPoolBase {
+ public:
+  struct Link {
+    char buf[sizeof(T)];
+    Link *next;
+  };
+
+  // 'pool_size' specifies the size of the initial pool and how it is extended.
+  explicit MemoryPool(size_t pool_size = kAllocSize)
+      : mem_arena_(pool_size), free_list_(nullptr) {}
+
+  void *Allocate() {
+    if (free_list_ == nullptr) {
+      auto *link = static_cast<Link *>(mem_arena_.Allocate(1));
+      link->next = nullptr;
+      return link;
+    } else {
+      auto *link = free_list_;
+      free_list_ = link->next;
+      return link;
+    }
+  }
+
+  void Free(void *ptr) {
+    if (ptr) {
+      auto *link = static_cast<Link *>(ptr);
+      link->next = free_list_;
+      free_list_ = link;
+    }
+  }
+
+  size_t Size() const override { return sizeof(T); }
+
+ private:
+  MemoryArena<Link> mem_arena_;
+  Link *free_list_;
+
+  MemoryPool(const MemoryPool &) = delete;
+  MemoryPool &operator=(const MemoryPool &) = delete;
+};
+
+// Stores a collection of memory arenas.
+class MemoryArenaCollection {
+ public:
+  // 'block_size' specifies the block size of the arenas.
+  explicit MemoryArenaCollection(size_t block_size = kAllocSize)
+      : block_size_(block_size), ref_count_(1) {}
+
+  template <typename T>
+  MemoryArena<T> *Arena() {
+    if (sizeof(T) >= arenas_.size()) arenas_.resize(sizeof(T) + 1);
+    MemoryArenaBase *arena = arenas_[sizeof(T)].get();
+    if (arena == nullptr) {
+      arena = new MemoryArena<T>(block_size_);
+      arenas_[sizeof(T)].reset(arena);
+    }
+    return static_cast<MemoryArena<T> *>(arena);
+  }
+
+  size_t BlockSize() const { return block_size_; }
+
+  size_t RefCount() const { return ref_count_; }
+
+  size_t IncrRefCount() { return ++ref_count_; }
+
+  size_t DecrRefCount() { return --ref_count_; }
+
+ private:
+  size_t block_size_;
+  size_t ref_count_;
+  std::vector<std::unique_ptr<MemoryArenaBase>> arenas_;
+};
+
+// Stores a collection of memory pools.
+class MemoryPoolCollection {
+ public:
+  // 'pool_size' specifies the size of initial pool and how it is extended.
+  explicit MemoryPoolCollection(size_t pool_size = kAllocSize)
+      : pool_size_(pool_size), ref_count_(1) {}
+
+  template <typename T>
+  MemoryPool<T> *Pool() {
+    if (sizeof(T) >= pools_.size()) pools_.resize(sizeof(T) + 1);
+    MemoryPoolBase *pool = pools_[sizeof(T)].get();
+    if (pool == nullptr) {
+      pool = new MemoryPool<T>(pool_size_);
+      pools_[sizeof(T)].reset(pool);
+    }
+    return static_cast<MemoryPool<T> *>(pool);
+  }
+
+  size_t PoolSize() const { return pool_size_; }
+
+  size_t RefCount() const { return ref_count_; }
+
+  size_t IncrRefCount() { return ++ref_count_; }
+
+  size_t DecrRefCount() { return --ref_count_; }
+
+ private:
+  size_t pool_size_;
+  size_t ref_count_;
+  std::vector<std::unique_ptr<MemoryPoolBase>> pools_;
+};
+
+// STL allocator using memory arenas. Memory is allocated from underlying
+// blocks of size 'block_size * sizeof(T)'. Memory is freed only when all
+// objects using this allocator are destroyed and there is otherwise no reuse
+// (unlike PoolAllocator).
+//
+// This allocator has object-local state so it should not be used with splicing
+// or swapping operations between objects created with different allocators nor
+// should it be used if copies must be thread-safe. The result of allocate()
+// will be suitably memory-aligned.
+template <typename T>
+class BlockAllocator {
+ public:
+  using Allocator = std::allocator<T>;
+  using size_type = typename Allocator::size_type;
+  using difference_type = typename Allocator::difference_type;
+  using pointer = typename Allocator::pointer;
+  using const_pointer = typename Allocator::const_pointer;
+  using reference = typename Allocator::reference;
+  using const_reference = typename Allocator::const_reference;
+  using value_type = typename Allocator::value_type;
+
+  template <typename U>
+  struct rebind {
+    using other = BlockAllocator<U>;
+  };
+
+  explicit BlockAllocator(size_t block_size = kAllocSize)
+      : arenas_(new MemoryArenaCollection(block_size)) {}
+
+  BlockAllocator(const BlockAllocator<T> &arena_alloc)
+      : arenas_(arena_alloc.Arenas()) {
+    Arenas()->IncrRefCount();
+  }
+
+  template <typename U>
+  explicit BlockAllocator(const BlockAllocator<U> &arena_alloc)
+      : arenas_(arena_alloc.Arenas()) {
+    Arenas()->IncrRefCount();
+  }
+
+  ~BlockAllocator() {
+    if (Arenas()->DecrRefCount() == 0) delete Arenas();
+  }
+
+  pointer address(reference ref) const { return Allocator().address(ref); }
+
+  const_pointer address(const_reference ref) const {
+    return Allocator().address(ref);
+  }
+
+  size_type max_size() const { return Allocator().max_size(); }
+
+  template <typename U, typename... Args>
+  void construct(U *p, Args &&... args) {
+    Allocator().construct(p, std::forward<Args>(args)...);
+  }
+
+  void destroy(pointer p) { Allocator().destroy(p); }
+
+  pointer allocate(size_type n, const void *hint = nullptr) {
+    if (n * kAllocFit <= kAllocSize) {
+      return static_cast<pointer>(Arena()->Allocate(n));
+    } else {
+      return Allocator().allocate(n, hint);
+    }
+  }
+
+  void deallocate(pointer p, size_type n) {
+    if (n * kAllocFit > kAllocSize) Allocator().deallocate(p, n);
+  }
+
+  MemoryArenaCollection *Arenas() const { return arenas_; }
+
+  //BlockAllocator<T> operator=(const BlockAllocator<T> &) { assert(0); }
+
+ private:
+  MemoryArena<T> *Arena() { return arenas_->Arena<T>(); }
+
+  MemoryArenaCollection *arenas_;
+};
+
+template <typename T, typename U>
+bool operator==(const BlockAllocator<T> &alloc1,
+                const BlockAllocator<U> &alloc2) {
+  return false;
+}
+
+template <typename T, typename U>
+bool operator!=(const BlockAllocator<T> &alloc1,
+                const BlockAllocator<U> &alloc2) {
+  return true;
+}
+
+// STL allocator using memory pools. Memory is allocated from underlying
+// blocks of size 'block_size * sizeof(T)'. Keeps an internal list of freed
+// chunks that are reused on the next allocation.
+//
+// This allocator has object-local state so it should not be used with splicing
+// or swapping operations between objects created with different allocators nor
+// should it be used if copies must be thread-safe. The result of allocate()
+// will be suitably memory-aligned.
+template <typename T>
+class PoolAllocator {
+ public:
+  using Allocator = std::allocator<T>;
+  using size_type = typename Allocator::size_type;
+  using difference_type = typename Allocator::difference_type;
+  using pointer = typename Allocator::pointer;
+  using const_pointer = typename Allocator::const_pointer;
+  using reference = typename Allocator::reference;
+  using const_reference = typename Allocator::const_reference;
+  using value_type = typename Allocator::value_type;
+
+  template <typename U>
+  struct rebind {
+    using other = PoolAllocator<U>;
+  };
+
+  explicit PoolAllocator(size_t pool_size = kAllocSize)
+      : pools_(new MemoryPoolCollection(pool_size)) {}
+
+  PoolAllocator(const PoolAllocator<T> &pool_alloc)
+      : pools_(pool_alloc.Pools()) {
+    Pools()->IncrRefCount();
+  }
+
+  template <typename U>
+  explicit PoolAllocator(const PoolAllocator<U> &pool_alloc)
+      : pools_(pool_alloc.Pools()) {
+    Pools()->IncrRefCount();
+  }
+
+  ~PoolAllocator() {
+    if (Pools()->DecrRefCount() == 0) delete Pools();
+  }
+
+  pointer address(reference ref) const { return Allocator().address(ref); }
+
+  const_pointer address(const_reference ref) const {
+    return Allocator().address(ref);
+  }
+
+  size_type max_size() const { return Allocator().max_size(); }
+
+  template <typename U, typename... Args>
+  void construct(U *p, Args &&... args) {
+    Allocator().construct(p, std::forward<Args>(args)...);
+  }
+
+  void destroy(pointer p) { Allocator().destroy(p); }
+
+  pointer allocate(size_type n, const void *hint = nullptr) {
+    if (n == 1) {
+      return static_cast<pointer>(Pool<1>()->Allocate());
+    } else if (n == 2) {
+      return static_cast<pointer>(Pool<2>()->Allocate());
+    } else if (n <= 4) {
+      return static_cast<pointer>(Pool<4>()->Allocate());
+    } else if (n <= 8) {
+      return static_cast<pointer>(Pool<8>()->Allocate());
+    } else if (n <= 16) {
+      return static_cast<pointer>(Pool<16>()->Allocate());
+    } else if (n <= 32) {
+      return static_cast<pointer>(Pool<32>()->Allocate());
+    } else if (n <= 64) {
+      return static_cast<pointer>(Pool<64>()->Allocate());
+    } else {
+      return Allocator().allocate(n, hint);
+    }
+  }
+
+  void deallocate(pointer p, size_type n) {
+    if (n == 1) {
+      Pool<1>()->Free(p);
+    } else if (n == 2) {
+      Pool<2>()->Free(p);
+    } else if (n <= 4) {
+      Pool<4>()->Free(p);
+    } else if (n <= 8) {
+      Pool<8>()->Free(p);
+    } else if (n <= 16) {
+      Pool<16>()->Free(p);
+    } else if (n <= 32) {
+      Pool<32>()->Free(p);
+    } else if (n <= 64) {
+      Pool<64>()->Free(p);
+    } else {
+      Allocator().deallocate(p, n);
+    }
+  }
+
+  MemoryPoolCollection *Pools() const { return pools_; }
+
+  //PoolAllocator<T> operator=(const PoolAllocator<T> &) { assert(0); }
+
+ private:
+  template <int n>
+  struct TN {
+    T buf[n];
+  };
+
+  template <int n>
+  MemoryPool<TN<n>> *Pool() {
+    return pools_->Pool<TN<n>>();
+  }
+
+  MemoryPoolCollection *pools_;
+};
+
+template <typename T, typename U>
+bool operator==(const PoolAllocator<T> &alloc1,
+                const PoolAllocator<U> &alloc2) {
+  return false;
+}
+
+template <typename T, typename U>
+bool operator!=(const PoolAllocator<T> &alloc1,
+                const PoolAllocator<U> &alloc2) {
+  return true;
+}
+
+}  // namespace fkaldi
+
+#endif  // FFST_MEMORY_H_
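
For reference, below is a minimal, self-contained sketch (not part of the patch) of how the fkaldi::PoolAllocator introduced above can back an std::unordered_map, mirroring the new StateIdToTokenMap alias in lattice-faster-decoder-combine.h. The StateId/Token typedefs are illustrative stand-ins for the decoder's real types, "decoder/memory.h" is an assumed include path, and a pre-C++20 toolchain is assumed since the allocator relies on the legacy std::allocator member typedefs.

// Usage sketch: a pool-allocated token map like the decoder's StateIdToTokenMap.
// StateId/Token are stand-ins (hypothetical); include path is an assumption.
#include <cstdint>
#include <functional>
#include <unordered_map>
#include <utility>

#include "decoder/memory.h"

int main() {
  typedef int32_t StateId;
  typedef int Token;   // the decoder stores Token* values keyed by FST state

  typedef std::unordered_map<
      StateId, Token*, std::hash<StateId>, std::equal_to<StateId>,
      fkaldi::PoolAllocator<std::pair<const StateId, Token*> > > TokenMap;

  TokenMap toks;
  Token a = 0, b = 1;
  toks[3] = &a;            // node memory comes from a size-bucketed pool
  toks[7] = &b;
  toks.erase(3);           // the freed node goes onto the pool's free list ...
  toks[11] = &a;           // ... and is reused by a later insertion
  return toks.size() == 2 ? 0 : 1;
}

The intended saving is that the per-frame token maps churn through many small node allocations; routing them through the pool's free list avoids repeated malloc/free of identically sized chunks.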