Skip to content

Commit 81219d8

Browse files
some fixes
1 parent ce6cc28 commit 81219d8

File tree

3 files changed

+45
-41
lines changed

3 files changed

+45
-41
lines changed

models/bert/bert.hpp

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@ namespace ann /** Artificial Neural Network. */ {
3030
* arma::sp_mat or arma::cube).
3131
*/
3232
template <
33-
typename OutputLayerType = NegativeLogLikelihood,
34-
typename InitType = XavierInitialization,
33+
typename OutputLayerType = NegativeLogLikelihood<>,
34+
typename InitializationRuleType = XavierInitialization,
3535
typename InputDataType = arma::mat,
3636
typename OutputDataType = arma::mat
3737
>
@@ -41,21 +41,23 @@ class BERT
4141
BERT();
4242

4343
/**
44-
* Create the TransformerDecoder object using the specified parameters.
44+
* Create the BERT object using the specified parameters.
4545
*
46-
* @param vocabSize The size of the vocabulary.
46+
* @param srcVocabSize The size of the vocabulary.
47+
* @param srcSeqLen The source sequence length.
48+
* @param numEncoderLayers The number of Transformer Encoder layers.
4749
* @param dModel The dimensionality of the model.
4850
* @param numHeads The number of attention heads.
49-
* @param numLayers The number of Transformer Encoder layers.
5051
* @param dropout The dropout rate.
51-
* @param maxSequenceLength The maximum sequence length in the given input.
52+
* @param attentionMask The attention mask used to black-out future sequences.
53+
* @param keyPaddingMask Blacks out specific tokens.
5254
*/
53-
BERT(const size_t vocabSize,
55+
BERT(const size_t srcVocabSize,
56+
const size_t srcSeqLen,
57+
const size_t numEncoderLayers = 12,
5458
const size_t dModel = 512,
5559
const size_t numHeads = 8,
56-
const size_t numLayers = 12,
5760
const double dropout = 0.1,
58-
const size_t maxSequenceLength = 5000,
5961
const InputDataType& attentionMask = InputDataType(),
6062
const InputDataType& keyPaddingMask = InputDataType());
6163

@@ -75,7 +77,13 @@ class BERT
7577

7678
private:
7779
//! Locally-stored size of the vocabulary.
78-
size_t vocabSize;
80+
size_t srcVocabSize;
81+
82+
//! Locally-stored source sequence length.
83+
size_t srcSeqLen;
84+
85+
//! Locally-stored number of Transformer Encoder blocks.
86+
size_t numEncoderLayers;
7987

8088
//! Locally-stored dimensionality of the model.
8189
size_t dModel;
@@ -86,26 +94,17 @@ class BERT
8694
//! Locally-stored number of hidden units in FFN.
8795
size_t dimFFN;
8896

89-
//! Locally-stored number of Transformer Encoder blocks.
90-
size_t numLayers;
91-
9297
//! Locally-stored dropout rate.
9398
double dropout;
9499

95-
//! Locally-stored maximum sequence length.
96-
size_t maxSequenceLength;
97-
98100
//! Locally-stored attention mask.
99101
InputDataType attentionMask;
100102

101103
//! Locally-stored key padding mask.
102104
InputDataType keyPaddingMask;
103105

104-
//! Locally-stored BERT embedding layer.
105-
LayerTypes<> embedding;
106-
107106
//! Locally-stored complete encoder network (BERT is encoder-only).
108-
FFN<OutputLayerType, InitType> bert;
107+
FFN<OutputLayerType, InitializationRuleType> bert;
109108
}; // class BERT
110109

111110
} // namespace ann

models/bert/bert_impl.hpp

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -22,52 +22,60 @@ namespace ann /** Artificial Neural Network. */ {
2222
template<typename OutputLayerType, typename InitType, typename InputDataType,
2323
typename OutputDataType>
2424
BERT<OutputLayerType, InitType, InputDataType, OutputDataType>::BERT() :
25-
vocabSize(0),
25+
srcVocabSize(0),
26+
srcSeqLen(0),
27+
numEncoderLayers(0),
2628
dModel(0),
2729
numHeads(0),
2830
dimFFN(4 * dModel),
29-
numLayers(0),
30-
dropout(0),
31-
maxSequenceLength(5000),
31+
dropout(0.0)
3232
{
3333
// Nothing to do here.
3434
}
3535

3636
template<typename OutputLayerType, typename InitType, typename InputDataType,
3737
typename OutputDataType>
3838
BERT<OutputLayerType, InitType, InputDataType, OutputDataType>::BERT(
39-
const size_t vocabSize,
39+
const size_t srcVocabSize,
40+
const size_t srcSeqLen,
41+
const size_t numEncoderLayers,
4042
const size_t dModel,
4143
const size_t numHeads,
42-
const size_t numLayers,
4344
const double dropout,
44-
const size_t maxSequenceLength,
4545
const InputDataType& attentionMask,
4646
const InputDataType& keyPaddingMask) :
47-
vocabSize(vocabSize)
47+
srcVocabSize(srcVocabSize),
48+
srcSeqLen(srcSeqLen),
49+
numEncoderLayers(numEncoderLayers),
4850
dModel(dModel),
4951
numHeads(numHeads),
5052
dimFFN(4 * dModel),
51-
numLayers(numLayers),
5253
dropout(dropout),
53-
maxSequenceLength(maxSequenceLength),
5454
attentionMask(attentionMask),
5555
keyPaddingMask(keyPaddingMask)
5656
{
57-
embedding = new AddMerge<>();
58-
embedding.Add<Lookup<>>(vocabSize, dModel);
59-
embedding.Add<Lookup<>>(3, dModel);
57+
AddMerge<>* embedding = new AddMerge<>();
58+
embedding->Add<Lookup<>>(srcVocabSize, dModel);
59+
embedding->Add<Lookup<>>(3, dModel);
6060

6161
bert.Add(embedding);
62-
bert.Add<PositionalEncoding<>>(dModel, maxSequenceLength);
62+
bert.Add<PositionalEncoding<>>(dModel, srcSeqLen);
6363
bert.Add<Dropout<>>(dropout);
6464

6565
for (size_t i = 0; i < numEncoderLayers; ++i)
6666
{
67-
TransformerEncoder<> enc(dModel, numHeads, dimFFN, dropout);
68-
enc.AttentionMask() = attentionMask;
69-
enc.KeyPaddingMask() = keyPaddingMask;
70-
bert.Add(enc);
67+
mlpack::ann::TransformerEncoder<> encoder(
68+
numEncoderLayers,
69+
srcSeqLen,
70+
dModel,
71+
numHeads,
72+
dimFFN,
73+
dropout,
74+
attentionMask,
75+
keyPaddingMask
76+
);
77+
78+
bert.Add(encoder.Model());
7179
}
7280
}
7381

models/bert/bert_tokenizer.hpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,4 @@ class BertTokenizer
127127
} // namespace ann
128128
} // namespace mlpack
129129

130-
// Include implementation.
131-
#include "bert_tokenizer_impl.hpp"
132-
133130
#endif

0 commit comments

Comments
 (0)