From 89deadc342fe51be0e452c182bc9d96b6f008f16 Mon Sep 17 00:00:00 2001 From: Zhuanghua Liu Date: Fri, 13 Dec 2019 10:08:37 +0000 Subject: [PATCH 01/32] add language model estimator --- .../word_language_model_estimator.py | 225 ++++++++++++++++++ src/gluonnlp/estimator/__init__.py | 28 +++ .../language_model_batch_processor.py | 81 +++++++ .../estimator/language_model_estimator.py | 69 ++++++ .../estimator/language_model_event_handler.py | 127 ++++++++++ src/gluonnlp/estimator/loss.py | 80 +++++++ 6 files changed, 610 insertions(+) create mode 100644 scripts/estimator/word_language_model_estimator.py create mode 100644 src/gluonnlp/estimator/__init__.py create mode 100644 src/gluonnlp/estimator/language_model_batch_processor.py create mode 100644 src/gluonnlp/estimator/language_model_estimator.py create mode 100644 src/gluonnlp/estimator/language_model_event_handler.py create mode 100644 src/gluonnlp/estimator/loss.py diff --git a/scripts/estimator/word_language_model_estimator.py b/scripts/estimator/word_language_model_estimator.py new file mode 100644 index 0000000000..92fbba8b42 --- /dev/null +++ b/scripts/estimator/word_language_model_estimator.py @@ -0,0 +1,225 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import argparse +import time +import math +import os +import sys +import mxnet as mx +from mxnet import gluon, autograd +import gluonnlp as nlp +from mxnet.gluon.contrib.estimator import LoggingHandler +from gluonnlp.estimator import JointActivationRegularizationLoss +from gluonnlp.estimator import LanguageModelEstimator +from gluonnlp.estimator import HiddenStateHandler, AvgParamHandler +from gluonnlp.estimator import LearningRateHandler, RNNGradientUpdateHandler +from gluonnlp.estimator import LanguageModelBatchProcessor +from mxnet.gluon.data.sampler import BatchSampler + +class BatchVariableLenTextSampler(BatchSampler): + def __init__(self, bptt, length): + self.bptt = bptt + self.length = length + self.index = 0 + + def __iter__(self): + while self.index < self.length - 2: + bptt = self.bptt if mx.nd.random.uniform().asscalar() < .95 else self.bptt / 2 + seq_len = max(5, int(mx.nd.random.normal(bptt, 5).asscalar())) + seq_len = min(seq_len, self.length - self.index - 1) + # batch_size = seq_len + 1 + batch = [] + for i in range(self.index, self.index + seq_len + 1): + batch.append(i) + self.index += seq_len + yield batch + + def __len__(self): + # you may never get real size of the data sampler beforehand. 
May need some + # postprocessing after fetching the data batch + return self.length / 5 + 1 + +curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +sys.path.append(os.path.join(curr_path, '..', '..')) + +nlp.utils.check_version('0.7.0') + +parser = argparse.ArgumentParser(description= + 'MXNet Autograd RNN/LSTM Language Model on Wikitext-2.') +parser.add_argument('--model', type=str, default='lstm', + help='type of recurrent net (rnn_tanh, rnn_relu, lstm, gru)') +parser.add_argument('--emsize', type=int, default=400, + help='size of word embeddings') +parser.add_argument('--nhid', type=int, default=1150, + help='number of hidden units per layer') +parser.add_argument('--nlayers', type=int, default=3, + help='number of layers') +parser.add_argument('--lr', type=float, default=30, + help='initial learning rate') +parser.add_argument('--clip', type=float, default=0.25, + help='gradient clipping') +parser.add_argument('--epochs', type=int, default=750, + help='upper epoch limit') +parser.add_argument('--batch_size', type=int, default=80, metavar='N', + help='batch size') +parser.add_argument('--bptt', type=int, default=70, + help='sequence length') +parser.add_argument('--dropout', type=float, default=0.4, + help='dropout applied to layers (0 = no dropout)') +parser.add_argument('--dropout_h', type=float, default=0.2, + help='dropout applied to hidden layer (0 = no dropout)') +parser.add_argument('--dropout_i', type=float, default=0.65, + help='dropout applied to input layer (0 = no dropout)') +parser.add_argument('--dropout_e', type=float, default=0.1, + help='dropout applied to embedding layer (0 = no dropout)') +parser.add_argument('--weight_dropout', type=float, default=0.5, + help='weight dropout applied to h2h weight matrix (0 = no weight dropout)') +parser.add_argument('--tied', action='store_true', + help='tie the word embedding and softmax weights') +parser.add_argument('--log-interval', type=int, default=200, metavar='N', + help='report interval') +parser.add_argument('--save', type=str, default='model.params', + help='path to save the final model') +parser.add_argument('--eval_only', action='store_true', + help='Whether to only evaluate the trained model') +parser.add_argument('--gpu', type=str, help='single gpu id') +parser.add_argument('--optimizer', type=str, default='sgd', + help='optimizer to use (sgd, adam)') +parser.add_argument('--wd', type=float, default=1.2e-6, + help='weight decay applied to all weights') +parser.add_argument('--alpha', type=float, default=2, + help='alpha L2 regularization on RNN activation ' + '(alpha = 0 means no regularization)') +parser.add_argument('--beta', type=float, default=1, + help='beta slowness regularization applied on RNN activation ' + '(beta = 0 means no regularization)') +parser.add_argument('--ntasgd', action='store_true', + help='Whether to apply ntasgd') +parser.add_argument('--test_mode', action='store_true', + help='Whether to run through the script with few examples') +parser.add_argument('--lr_update_interval', type=int, default=30, + help='lr udpate interval') +parser.add_argument('--lr_update_factor', type=float, default=0.1, + help='lr udpate factor') +args = parser.parse_args() + +############################################################################### +# Load data +############################################################################### + +context = [mx.cpu()] if not args.gpu else [mx.gpu(int(args.gpu))] + +assert args.batch_size % len(context) == 0, \ + 'Total batch size must be multiple 
of the number of devices' + +assert args.weight_dropout > 0 or (args.weight_dropout == 0 and args.alpha == 0), \ + 'The alpha L2 regularization cannot be used with standard RNN, please set alpha to 0' + +train_dataset, val_dataset, test_dataset = \ + [nlp.data.WikiText2(segment=segment, + skip_empty=False, bos=None, eos='') + for segment in ['train', 'val', 'test']] + +vocab = nlp.Vocab(counter=nlp.data.Counter(train_dataset), padding_token=None, bos_token=None) +train_batchify = nlp.data.batchify.CorpusBatchify(vocab, args.batch_size) +train_data = train_batchify(train_dataset) +val_batch_size = 10 +val_batchify = nlp.data.batchify.CorpusBatchify(vocab, val_batch_size) +val_data = val_batchify(val_dataset) +test_batch_size = 1 +test_batchify = nlp.data.batchify.CorpusBatchify(vocab, test_batch_size) +test_data = test_batchify(test_dataset) + +if args.test_mode: + args.emsize = 200 + args.nhid = 200 + args.nlayers = 1 + args.epochs = 3 + train_data = train_data[0:100] + val_data = val_data[0:100] + test_data = test_data[0:100] + +print(args) + +############################################################################### +# Build the model +############################################################################### + +ntokens = len(vocab) + +if args.weight_dropout > 0: + print('Use AWDRNN') + model = nlp.model.train.AWDRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers, + args.tied, args.dropout, args.weight_dropout, + args.dropout_h, args.dropout_i, args.dropout_e) + model_eval = nlp.model.AWDRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers, + args.tied, args.dropout, args.weight_dropout, + args.dropout_h, args.dropout_i, args.dropout_e, + params=model.collect_params()) +else: + model = nlp.model.train.StandardRNN(args.model, len(vocab), args.emsize, + args.nhid, args.nlayers, args.dropout, args.tied) + model_eval = nlp.model.StandardRNN(args.model, len(vocab), args.emsize, + args.nhid, args.nlayers, args.dropout, args.tied, + params=model.collect_params()) + +model.initialize(mx.init.Xavier(), ctx=context) + +model.hybridize(static_alloc=True) + +print(model) + + +if args.optimizer == 'sgd': + trainer_params = {'learning_rate': args.lr, + 'momentum': 0, + 'wd': args.wd} +elif args.optimizer == 'adam': + trainer_params = {'learning_rate': args.lr, + 'wd': args.wd, + 'beta1': 0, + 'beta2': 0.999, + 'epsilon': 1e-9} + +trainer = gluon.Trainer(model.collect_params(), args.optimizer, trainer_params, + update_on_kvstore=False) + +loss = gluon.loss.SoftmaxCrossEntropyLoss() +train_loss = JointActivationRegularizationLoss(loss, args.alpha, args.beta) + +sampler = BatchVariableLenTextSampler(bptt=70, length=len(train_data)) +train_data_loader = mx.gluon.data.DataLoader(train_data, + batch_sampler=sampler) + + +train_metric = mx.metric.Loss(train_loss) +val_metric = mx.metric.Loss(loss) +batch_processor = LanguageModelBatchProcessor() +est = LanguageModelEstimator(net=model, loss=train_loss, + train_metrics=train_metric, + val_metrics=val_metric, + trainer=trainer, context=context, + evaluation_loss=loss, + eval_net=model_eval, + batch_processor=batch_processor) +event_handlers = [HiddenStateHandler(), AvgParamHandler(), + LearningRateHandler(lr_update_interval=args.lr_update_interval, lr_update_factor=args.lr_update_factor), + RNNGradientUpdateHandler(clip=args.clip), + LoggingHandler(log_interval=20, metrics=est.train_metrics + est.val_metrics)] +est.fit(train_data=train_data_loader, epochs=args.epochs, event_handlers=event_handlers, + batch_axis=1) diff --git 
a/src/gluonnlp/estimator/__init__.py b/src/gluonnlp/estimator/__init__.py new file mode 100644 index 0000000000..69172adde6 --- /dev/null +++ b/src/gluonnlp/estimator/__init__.py @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable=wildcard-import, unused-variable + +""" Gluon NLP Estimator Module """ +from .language_model_estimator import * +from .language_model_event_handler import * +from .language_model_batch_processor import * +from .loss import * + +__all__ = (language_model_estimator.__all__ + language_model_event_handler.__all__ + + language_model_batch_processor.__all__ + loss.__all__) diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py new file mode 100644 index 0000000000..d62a6c38db --- /dev/null +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -0,0 +1,81 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# coding: utf-8 +# pylint: disable=wildcard-import, unused-variable +""" Gluon Languange Model Estimator """ + +import mxnet as mx +from mxnet.gluon.contrib.estimator import BatchProcessor +from mxnet.gluon.utils import split_and_load + +__all__ = ['LanguageModelBatchProcessor'] + +class LanguageModelBatchProcessor(BatchProcessor): + def __init__(self): + pass + + def fit_batch(self, estimator, train_batch, batch_axis=0): + data = train_batch[:-1] + target = train_batch[1:] + batch_size = train_batch.shape[batch_axis] + data = split_and_load(data, estimator.context, batch_axis=batch_axis, even_split=True) + target = split_and_load(target, estimator.context, batch_axis=batch_axis, even_split=True) + + Ls = [] + outputs = [] + data_size = 0 + if estimator.hiddens is None: + estimator.hiddens = [estimator.net.begin_state(batch_size // len(estimator.context), + func=mx.nd.zeros, + ctx=ctx) for ctx in estimator.context] + else: + estimator.hiddens = estimator.detach(estimator.hiddens) + with mx.autograd.record(): + for i, (X, y, h) in enumerate(zip(data, target, estimator.hiddens)): + output, h, encoder_hs, dropped_encoder_hs = estimator.net(X, h) + l = estimator.loss(output, y, encoder_hs, dropped_encoder_hs) + Ls.append(l / (len(estimator.context) * X.size)) + estimator.hiddens[i] = h + outputs.append(output) + + for L in Ls: + L.backward() + + return data, target, outputs, Ls + + def evaluate_batch(self, estimator, val_batch, batch_axis=0): + batch_size = val_batch.shape[batch_axis] + val_batch = [split_and_load(x, ctx_list=estimator.context, batch_axis=batch_axis) for x in val_batch] + data, target = val_batch + Ls = [] + outputs = [] + if estimator.eval_hiddens is None: + estimator.eval_hiddens = \ + [estimator.eval_net.begin_state(batch_size // + len(estimator.context), func=mx.nd.zeros, ctx=ctx) for ctx \ + in estimator.context] + else: + estimator.eval_hiddens = estimator.detach(estimator.eval_hiddens) + for i, (X, y, h) in enumerate(zip(data, target, estimator.eval_hiddens)): + output, h = estimator.eval_net(X, h) + L = estimator.evaluation_loss(output.reshape(-3, -1), y.reshape(-1,)) + self.eval_hiddens[i] = h + Ls.append(L) + outputs.append(output) + + return data, target, outputs, Ls diff --git a/src/gluonnlp/estimator/language_model_estimator.py b/src/gluonnlp/estimator/language_model_estimator.py new file mode 100644 index 0000000000..39b4970f20 --- /dev/null +++ b/src/gluonnlp/estimator/language_model_estimator.py @@ -0,0 +1,69 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# coding: utf-8 +# pylint: disable=wildcard-import, unused-variable +""" Gluon Languange Model Estimator """ + +import copy +import warnings + +import numpy as np +import mxnet as mx +from mxnet.gluon.contrib.estimator import Estimator +from mxnet.gluon.utils import split_and_load +from mxnet.gluon.utils import clip_global_norm +from mxnet.metric import Loss as metric_loss +from .language_model_batch_processor import LanguageModelBatchProcessor + +__all__ = ['LanguageModelEstimator'] + +class LanguageModelEstimator(Estimator): + def __init__(self, net, loss, train_metrics=None, + val_metrics = None, + initializer=None, + trainer=None, + context=None, + evaluation_loss=None, + eval_net=None, + batch_processor=LanguageModelBatchProcessor(), + bptt=70): + super().__init__(net=net, loss=loss, + train_metrics=train_metrics, + val_metrics=val_metrics, + initializer=initializer, + trainer=trainer, + context=context, + evaluation_loss=evaluation_loss, + eval_net=eval_net, + batch_processor=batch_processor) + self.hiddens = None + self.eval_hiddens = None + self.avg_param = None + self.bptt = bptt + + self.total_L = 0 + self.ntotal = 0 + + def detach(self, hidden): + if isinstance(hidden, (tuple, list)): + hidden = [self.detach(h) for h in hidden] + else: + hidden = hidden.detach() + return hidden + + diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py new file mode 100644 index 0000000000..ed958c5592 --- /dev/null +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -0,0 +1,127 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# coding: utf-8 +# pylint: disable=wildcard-import, unused-variable +""" Gluon Language Model Event Handler """ + +import copy +import warnings + +import mxnet as mx +from mxnet.gluon.contrib.estimator import TrainBegin, TrainEnd, EpochBegin +from mxnet.gluon.contrib.estimator import EpochEnd, BatchBegin, BatchEnd +from mxnet.gluon.contrib.estimator import GradientUpdateHandler +from mxnet.gluon.utils import clip_global_norm + +__all__ = ['HiddenStateHandler', 'AvgParamHandler', 'LearningRateHandler', + 'RNNGradientUpdateHandler'] + +class HiddenStateHandler(EpochBegin): + def __init__(self): + pass + + def epoch_begin(self, estimator, *args, **kwargs): + estimator.hiddens = None + estimator.eval_hiddens = None + +class AvgParamHandler(BatchEnd, EpochEnd): + def __init__(self): + self.ntasgd = False + self.epoch_id = 0 + self.batch_id = 0 + self.avg_trigger = 0 + # self.ntasgd is always False during the first epoch + self.batches_per_epoch = 0 + self.t = 0 + self.n = 5 + self.valid_losses = [] + + def batch_end(self, estimator, *args, **kwargs): + parameters = estimator.net.collect_params() + if self.ntasgd: + if estimator.avg_param is None: + estimator.avg_param = {k.split(estimator.net._prefix)[1]: v.data(estimator.context[0]).copy() + for k, v in parameters.items()} + else: + gamma = 1. / max(1, self.epoch_id * (self.batches_per_epoch // estimator.bptt) + + self.batch_index - avg_trigger + 2) + for key, val in estimator.avg_param.items(): + val[:] += gamma * (parameters['{}{}'.format(estimator.net.__prefix, key)] + .data(estimator.context[0]) - val) + self.batch_id += 1 + + def epoch_end(self, estimator, *args, **kwargs): + parameters = estimator.net.collect_params() + self.batches_per_epoch = self.batch_id + if self.ntasgd == False and self.avg_trigger == 0: + if self.t > self.n and estimator.val_metrics > min(self.valid_losses[-self.n:]): + if estimator.avg_param is None: + estimator.avg_param = {k.split(estimator.net._prefix)[1]: v.data(estimator.context[0]).copy() + for k, v in parameters.items()} + else: + for key, val in parameters.items(): + estimator.avg_param[key.split(estimator.net._prefix)[1]] \ + = val.data(estimator.context[0]).copy() + self.avg_trigger = (self.epoch_id + 1) * (self.batches_per_epoch // estimator.bptt) + print('Switching to NTASGD and avg_trigger is : %d' % self.avg_trigger) + self.ntasgd = True + self.valid_losses.append(estimator.val_metrics) + self.t += 1 + self.batch_id = 0 + self.epoch_id += 1 + +class LearningRateHandler(BatchBegin, BatchEnd, EpochEnd): + def __init__(self, lr_update_interval=30, lr_update_factor=0.1): + self.lr_batch_start = 0 + self.best_val = float('Inf') + self.update_lr_epoch = 0 + self.lr_update_interval = lr_update_interval + self.lr_update_factor = lr_update_factor + + def batch_begin(self, estimator, *args, **kwargs): + batch = kwargs['batch'] + self.lr_batch_start = estimator.trainer.learning_rate + seq_len = batch.shape[0] - 1 + estimator.trainer.set_learning_rate(self.lr_batch_start * seq_len / estimator.bptt) + + def batch_end(self, estimator, *args, **kwargs): + estimator.trainer.set_learning_rate(self.lr_batch_start) + + def epoch_end(self, estimator, *args, **kwargs): + if estimator.val_metrics < self.best_val: + self.update_lr_epoch = 0 + self.best_val = estimator.val_metrics + else: + self.update_lr_epoch += 1 + if self.update_lr_epoch % self.lr_update_interval == 0 and self.update_lr_epoch != 0: + lr_scale = estimator.trainer.learning_rate * self.lr_update_factor + estimator.trainer.set_learning_rate(lr_scale) + 
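+            # reset the counter so the learning rate is only decayed again after
+            # another lr_update_interval epochs without validation improvement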
self.update_lr_epoch = 0 + +class RNNGradientUpdateHandler(GradientUpdateHandler): + def __init__(self, clip=None, **kwargs): + super().__init__(**kwargs) + self.clip = clip + + def batch_end(self, estimator, *args, **kwargs): + parameters = estimator.net.collect_params() + grads = [p.grad(ctx) for p in parameters.values() for ctx in estimator.context] + if self.clip is not None: + clip_global_norm(grads, self.clip) + + estimator.trainer.step(1) diff --git a/src/gluonnlp/estimator/loss.py b/src/gluonnlp/estimator/loss.py new file mode 100644 index 0000000000..98febf217e --- /dev/null +++ b/src/gluonnlp/estimator/loss.py @@ -0,0 +1,80 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +from mxnet import gluon +from ..loss import ActivationRegularizationLoss, TemporalActivationRegularizationLoss + +__all__ = ['JointActivationRegularizationLoss'] + +class JointActivationRegularizationLoss(gluon.loss.Loss): + r"""Computes Joint Regularization Loss with standard loss. + + The activation regularization refer to + gluonnlp.loss.ActivationRegularizationLoss. + + The temporal activation regularization refer to + gluonnlp.loss.TemporalActivationRegularizationLoss. + + Parameters + ---------- + loss : gluon.loss.Loss + The standard loss + alpha: float + The activation regularization parameter in gluonnlp.loss.ActivationRegularizationLoss + beta: float + The temporal activation regularization parameter in + gluonnlp.loss.TemporalActivationRegularizationLoss + + Inputs: + - **out**: NDArray + output tensor with shape `(sequence_length, batch_size, input_size)` + when `layout` is "TNC". + - **target**: NDArray + target tensor with shape `(sequence_length, batch_size, input_size)` + when `layout` is "TNC". + - **states**: the stack outputs from RNN, + which consists of output from each time step (TNC). + - **dropped_states**: the stack outputs from RNN with dropout, + which consists of output from each time step (TNC). + + Outputs: + - **loss**: loss tensor with shape (batch_size,). Dimensions other than + batch_axis are averaged out. 
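+
+    Examples
+    --------
+    A minimal usage sketch. The variable names are illustrative; `encoder_hs` and
+    `dropped_encoder_hs` are the per-step encoder outputs returned by an
+    AWD-LSTM style training network, as in the accompanying batch processor::
+
+        ce_loss = gluon.loss.SoftmaxCrossEntropyLoss()
+        joint_loss = JointActivationRegularizationLoss(ce_loss, alpha=2, beta=1)
+        out, hidden, encoder_hs, dropped_encoder_hs = net(data, hidden)
+        L = joint_loss(out, target, encoder_hs, dropped_encoder_hs)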
+ """ + + def __init__(self, l, alpha, beta, weight=None, batch_axis=None, **kwargs): + super(JointActivationRegularizationLoss, self).__init__(weight, batch_axis, **kwargs) + self._loss = l + self._alpha, self._beta = alpha, beta + if alpha: + self._ar_loss = ActivationRegularizationLoss(alpha) + if beta: + self._tar_loss = TemporalActivationRegularizationLoss(beta) + + def __repr__(self): + s = 'JointActivationTemporalActivationRegularizationLoss' + return s + + def hybrid_forward(self, F, out, target, states, dropped_states): # pylint: disable=arguments-differ + # pylint: disable=unused-argument + l = self._loss(out.reshape(-3, -1), target.reshape(-1,)) + if self._alpha: + l = l + self._ar_loss(*dropped_states) + if self._beta: + l = l + self._tar_loss(*states) + return l From 90c5144e99d21961e3a1bd389f400c3ec183e7ef Mon Sep 17 00:00:00 2001 From: Zhuanghua Liu Date: Fri, 13 Dec 2019 10:09:26 +0000 Subject: [PATCH 02/32] modify init file --- src/gluonnlp/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gluonnlp/__init__.py b/src/gluonnlp/__init__.py index 7a588e8233..f9772b95fc 100644 --- a/src/gluonnlp/__init__.py +++ b/src/gluonnlp/__init__.py @@ -30,6 +30,7 @@ from . import vocab from . import optimizer from . import initializer +from . import estimator from .vocab import Vocab __version__ = '0.10.0.dev' @@ -43,7 +44,8 @@ 'initializer', 'optimizer', 'utils', - 'metric'] + 'metric', + 'estimator'] warnings.filterwarnings(module='gluonnlp', action='default', category=DeprecationWarning) utils.version.check_version('1.6.0', warning_only=True, library=mxnet) From f7c730fa1653459645cf5fa70bd690d7cfe1bce5 Mon Sep 17 00:00:00 2001 From: Zhuanghua Liu Date: Tue, 17 Dec 2019 10:49:57 +0000 Subject: [PATCH 03/32] update language model estimator metrics computation --- .../word_language_model_estimator.py | 4 +++- .../language_model_batch_processor.py | 1 + .../estimator/language_model_event_handler.py | 23 ++++++++++++++++++- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/scripts/estimator/word_language_model_estimator.py b/scripts/estimator/word_language_model_estimator.py index 92fbba8b42..f075cadfa1 100644 --- a/scripts/estimator/word_language_model_estimator.py +++ b/scripts/estimator/word_language_model_estimator.py @@ -29,6 +29,7 @@ from gluonnlp.estimator import HiddenStateHandler, AvgParamHandler from gluonnlp.estimator import LearningRateHandler, RNNGradientUpdateHandler from gluonnlp.estimator import LanguageModelBatchProcessor +from gluonnlp.estimator import MetricResetHandler from mxnet.gluon.data.sampler import BatchSampler class BatchVariableLenTextSampler(BatchSampler): @@ -220,6 +221,7 @@ def __len__(self): event_handlers = [HiddenStateHandler(), AvgParamHandler(), LearningRateHandler(lr_update_interval=args.lr_update_interval, lr_update_factor=args.lr_update_factor), RNNGradientUpdateHandler(clip=args.clip), - LoggingHandler(log_interval=20, metrics=est.train_metrics + est.val_metrics)] + LoggingHandler(log_interval=args.log_interval, metrics=est.train_metrics + est.val_metrics), + MetricResetHandler(metrics=est.train_metrics, log_interval=args.log_interval)] est.fit(train_data=train_data_loader, epochs=args.epochs, event_handlers=event_handlers, batch_axis=1) diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index d62a6c38db..bbf1772cee 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ 
b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -56,6 +56,7 @@ def fit_batch(self, estimator, train_batch, batch_axis=0): for L in Ls: L.backward() + Ls = [l * (len(estimator.context) * X.size) for l in Ls] return data, target, outputs, Ls def evaluate_batch(self, estimator, val_batch, batch_axis=0): diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index ed958c5592..d0140506e6 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -26,10 +26,11 @@ from mxnet.gluon.contrib.estimator import TrainBegin, TrainEnd, EpochBegin from mxnet.gluon.contrib.estimator import EpochEnd, BatchBegin, BatchEnd from mxnet.gluon.contrib.estimator import GradientUpdateHandler +from mxnet.gluon.contrib.estimator import MetricHandler from mxnet.gluon.utils import clip_global_norm __all__ = ['HiddenStateHandler', 'AvgParamHandler', 'LearningRateHandler', - 'RNNGradientUpdateHandler'] + 'RNNGradientUpdateHandler', 'MetricResetHandler'] class HiddenStateHandler(EpochBegin): def __init__(self): @@ -119,9 +120,29 @@ def __init__(self, clip=None, **kwargs): self.clip = clip def batch_end(self, estimator, *args, **kwargs): + loss = kwargs['loss'] + loss_size = sum([l.size for l in loss]) parameters = estimator.net.collect_params() grads = [p.grad(ctx) for p in parameters.values() for ctx in estimator.context] if self.clip is not None: + # use multi context clipping later clip_global_norm(grads, self.clip) estimator.trainer.step(1) + +class MetricResetHandler(BatchBegin, MetricHandler): + def __init__(self, metrics, log_interval=1): + super().__init__(metrics=metrics) + self.batch_id = 0 + self.log_interval = log_interval + + def epoch_begin(self, estimator, *args, **kwargs): + self.batch_id = 0 + for metric in self.metrics: + metric.reset() + + def batch_begin(self, estimator, *args, **kwargs): + if self.batch_id % self.log_interval == 1: + for metric in self.metrics: + metric.reset_local() + self.batch_id += 1 From c90509a7a80b88d1fb40b71163a806a0ac14044e Mon Sep 17 00:00:00 2001 From: Zhuanghua Liu Date: Wed, 18 Dec 2019 10:29:21 +0000 Subject: [PATCH 04/32] fix and update language model estimator --- .../word_language_model_estimator.py | 27 ++++++++++----- .../language_model_batch_processor.py | 10 ++++-- .../estimator/language_model_estimator.py | 7 ++-- .../estimator/language_model_event_handler.py | 33 +++++++++++-------- 4 files changed, 48 insertions(+), 29 deletions(-) diff --git a/scripts/estimator/word_language_model_estimator.py b/scripts/estimator/word_language_model_estimator.py index f075cadfa1..9ced6697fb 100644 --- a/scripts/estimator/word_language_model_estimator.py +++ b/scripts/estimator/word_language_model_estimator.py @@ -33,15 +33,20 @@ from mxnet.gluon.data.sampler import BatchSampler class BatchVariableLenTextSampler(BatchSampler): - def __init__(self, bptt, length): + def __init__(self, bptt, length, use_variable_length=True): self.bptt = bptt self.length = length self.index = 0 + self.use_variable_length = use_variable_length def __iter__(self): + self.index = 0 while self.index < self.length - 2: - bptt = self.bptt if mx.nd.random.uniform().asscalar() < .95 else self.bptt / 2 - seq_len = max(5, int(mx.nd.random.normal(bptt, 5).asscalar())) + if self.use_variable_length: + bptt = self.bptt if mx.nd.random.uniform().asscalar() < .95 else self.bptt / 2 + seq_len = max(5, int(mx.nd.random.normal(bptt, 5).asscalar())) + else: + 
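+                # fall back to the fixed bptt length; the validation sampler below is
+                # constructed with use_variable_length=False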
seq_len = self.bptt seq_len = min(seq_len, self.length - self.index - 1) # batch_size = seq_len + 1 batch = [] @@ -53,7 +58,7 @@ def __iter__(self): def __len__(self): # you may never get real size of the data sampler beforehand. May need some # postprocessing after fetching the data batch - return self.length / 5 + 1 + return int(self.length / 5) + 1 curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.append(os.path.join(curr_path, '..', '..')) @@ -168,6 +173,7 @@ def __len__(self): model = nlp.model.train.AWDRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers, args.tied, args.dropout, args.weight_dropout, args.dropout_h, args.dropout_i, args.dropout_e) + model.initialize(mx.init.Xavier(), ctx=context) model_eval = nlp.model.AWDRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers, args.tied, args.dropout, args.weight_dropout, args.dropout_h, args.dropout_i, args.dropout_e, @@ -175,17 +181,16 @@ def __len__(self): else: model = nlp.model.train.StandardRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers, args.dropout, args.tied) + model.initialize(mx.init.Xavier(), ctx=context) model_eval = nlp.model.StandardRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers, args.dropout, args.tied, params=model.collect_params()) -model.initialize(mx.init.Xavier(), ctx=context) model.hybridize(static_alloc=True) print(model) - if args.optimizer == 'sgd': trainer_params = {'learning_rate': args.lr, 'momentum': 0, @@ -204,9 +209,11 @@ def __len__(self): train_loss = JointActivationRegularizationLoss(loss, args.alpha, args.beta) sampler = BatchVariableLenTextSampler(bptt=70, length=len(train_data)) +val_sampler = BatchVariableLenTextSampler(bptt=70, length=len(val_data), use_variable_length=False) train_data_loader = mx.gluon.data.DataLoader(train_data, batch_sampler=sampler) - +val_data_loader = mx.gluon.data.DataLoader(val_data, + batch_sampler=val_sampler) train_metric = mx.metric.Loss(train_loss) val_metric = mx.metric.Loss(loss) @@ -218,10 +225,12 @@ def __len__(self): evaluation_loss=loss, eval_net=model_eval, batch_processor=batch_processor) -event_handlers = [HiddenStateHandler(), AvgParamHandler(), +event_handlers = [HiddenStateHandler(), AvgParamHandler(data_length=len(train_data)), LearningRateHandler(lr_update_interval=args.lr_update_interval, lr_update_factor=args.lr_update_factor), RNNGradientUpdateHandler(clip=args.clip), LoggingHandler(log_interval=args.log_interval, metrics=est.train_metrics + est.val_metrics), MetricResetHandler(metrics=est.train_metrics, log_interval=args.log_interval)] -est.fit(train_data=train_data_loader, epochs=args.epochs, event_handlers=event_handlers, +est.fit(train_data=train_data_loader, val_data=val_data_loader, + epochs=args.epochs, + event_handlers=event_handlers, batch_axis=1) diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index bbf1772cee..4655dd620e 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -60,9 +60,13 @@ def fit_batch(self, estimator, train_batch, batch_axis=0): return data, target, outputs, Ls def evaluate_batch(self, estimator, val_batch, batch_axis=0): + batch_axis = 1 #temporary work around, removed after estimator is fixed + data = val_batch[:-1] + target = val_batch[1:] batch_size = val_batch.shape[batch_axis] - val_batch = [split_and_load(x, ctx_list=estimator.context, batch_axis=batch_axis) for 
x in val_batch] - data, target = val_batch + data = split_and_load(data, estimator.context, batch_axis=batch_axis, even_split=True) + target = split_and_load(target, estimator.context, batch_axis=batch_axis, even_split=True) + Ls = [] outputs = [] if estimator.eval_hiddens is None: @@ -75,7 +79,7 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): for i, (X, y, h) in enumerate(zip(data, target, estimator.eval_hiddens)): output, h = estimator.eval_net(X, h) L = estimator.evaluation_loss(output.reshape(-3, -1), y.reshape(-1,)) - self.eval_hiddens[i] = h + estimator.eval_hiddens[i] = h Ls.append(L) outputs.append(output) diff --git a/src/gluonnlp/estimator/language_model_estimator.py b/src/gluonnlp/estimator/language_model_estimator.py index 39b4970f20..4cae6b9166 100644 --- a/src/gluonnlp/estimator/language_model_estimator.py +++ b/src/gluonnlp/estimator/language_model_estimator.py @@ -41,7 +41,8 @@ def __init__(self, net, loss, train_metrics=None, evaluation_loss=None, eval_net=None, batch_processor=LanguageModelBatchProcessor(), - bptt=70): + bptt=70, + ntasgd=True): super().__init__(net=net, loss=loss, train_metrics=train_metrics, val_metrics=val_metrics, @@ -55,9 +56,7 @@ def __init__(self, net, loss, train_metrics=None, self.eval_hiddens = None self.avg_param = None self.bptt = bptt - - self.total_L = 0 - self.ntotal = 0 + self.ntasgd = ntasgd def detach(self, hidden): if isinstance(hidden, (tuple, list)): diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index d0140506e6..1401136749 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -41,16 +41,15 @@ def epoch_begin(self, estimator, *args, **kwargs): estimator.eval_hiddens = None class AvgParamHandler(BatchEnd, EpochEnd): - def __init__(self): + def __init__(self, data_length): self.ntasgd = False self.epoch_id = 0 self.batch_id = 0 self.avg_trigger = 0 - # self.ntasgd is always False during the first epoch - self.batches_per_epoch = 0 self.t = 0 self.n = 5 self.valid_losses = [] + self.data_length = data_length def batch_end(self, estimator, *args, **kwargs): parameters = estimator.net.collect_params() @@ -59,18 +58,21 @@ def batch_end(self, estimator, *args, **kwargs): estimator.avg_param = {k.split(estimator.net._prefix)[1]: v.data(estimator.context[0]).copy() for k, v in parameters.items()} else: - gamma = 1. / max(1, self.epoch_id * (self.batches_per_epoch // estimator.bptt) + - self.batch_index - avg_trigger + 2) + gamma = 1. 
/ max(1, self.epoch_id * (self.data_length // estimator.bptt) + + self.batch_id - self.avg_trigger + 2) for key, val in estimator.avg_param.items(): - val[:] += gamma * (parameters['{}{}'.format(estimator.net.__prefix, key)] + val[:] += gamma * (parameters['{}{}'.format(estimator.net._prefix, key)] .data(estimator.context[0]) - val) self.batch_id += 1 def epoch_end(self, estimator, *args, **kwargs): + if not isinstance(estimator.val_metrics, list): + val_metrics = [estimator.val_metrics] + else: + val_metrics = estimator.val_metrics parameters = estimator.net.collect_params() - self.batches_per_epoch = self.batch_id - if self.ntasgd == False and self.avg_trigger == 0: - if self.t > self.n and estimator.val_metrics > min(self.valid_losses[-self.n:]): + if self.avg_trigger == 0: + if self.t > self.n and val_metrics[0].get()[1] > min(self.valid_losses[-self.n:]): if estimator.avg_param is None: estimator.avg_param = {k.split(estimator.net._prefix)[1]: v.data(estimator.context[0]).copy() for k, v in parameters.items()} @@ -78,10 +80,10 @@ def epoch_end(self, estimator, *args, **kwargs): for key, val in parameters.items(): estimator.avg_param[key.split(estimator.net._prefix)[1]] \ = val.data(estimator.context[0]).copy() - self.avg_trigger = (self.epoch_id + 1) * (self.batches_per_epoch // estimator.bptt) + self.avg_trigger = (self.epoch_id + 1) * (self.data_length // estimator.bptt) print('Switching to NTASGD and avg_trigger is : %d' % self.avg_trigger) self.ntasgd = True - self.valid_losses.append(estimator.val_metrics) + self.valid_losses.append(val_metrics[0].get()[1]) self.t += 1 self.batch_id = 0 self.epoch_id += 1 @@ -104,9 +106,14 @@ def batch_end(self, estimator, *args, **kwargs): estimator.trainer.set_learning_rate(self.lr_batch_start) def epoch_end(self, estimator, *args, **kwargs): - if estimator.val_metrics < self.best_val: + if not isinstance(estimator.val_metrics, list): + val_metrics = [estimator.val_metrics] + else: + val_metrics = estimator.val_metrics + + if val_metrics[0].get()[1] < self.best_val: self.update_lr_epoch = 0 - self.best_val = estimator.val_metrics + self.best_val = val_metrics[0].get()[1] else: self.update_lr_epoch += 1 if self.update_lr_epoch % self.lr_update_interval == 0 and self.update_lr_epoch != 0: From 8540f4bfe29ad8f694c135132f0b190675840a71 Mon Sep 17 00:00:00 2001 From: Zhuanghua Liu Date: Wed, 18 Dec 2019 10:32:42 +0000 Subject: [PATCH 05/32] remove unnecessary argument from the language model estimator --- src/gluonnlp/estimator/language_model_estimator.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/gluonnlp/estimator/language_model_estimator.py b/src/gluonnlp/estimator/language_model_estimator.py index 4cae6b9166..ac763a3723 100644 --- a/src/gluonnlp/estimator/language_model_estimator.py +++ b/src/gluonnlp/estimator/language_model_estimator.py @@ -41,8 +41,7 @@ def __init__(self, net, loss, train_metrics=None, evaluation_loss=None, eval_net=None, batch_processor=LanguageModelBatchProcessor(), - bptt=70, - ntasgd=True): + bptt=70): super().__init__(net=net, loss=loss, train_metrics=train_metrics, val_metrics=val_metrics, @@ -56,7 +55,6 @@ def __init__(self, net, loss, train_metrics=None, self.eval_hiddens = None self.avg_param = None self.bptt = bptt - self.ntasgd = ntasgd def detach(self, hidden): if isinstance(hidden, (tuple, list)): From d030199f6f5581eb02ffa74d7ab508e78ea24daf Mon Sep 17 00:00:00 2001 From: Zhuanghua Liu Date: Thu, 19 Dec 2019 06:35:14 +0000 Subject: [PATCH 06/32] Add checkpoint handler for word 
language model --- .../word_language_model_estimator.py | 4 ++- .../estimator/language_model_estimator.py | 1 + .../estimator/language_model_event_handler.py | 31 ++++++++++++++++--- 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/scripts/estimator/word_language_model_estimator.py b/scripts/estimator/word_language_model_estimator.py index 9ced6697fb..6e6b5a8bb3 100644 --- a/scripts/estimator/word_language_model_estimator.py +++ b/scripts/estimator/word_language_model_estimator.py @@ -28,6 +28,7 @@ from gluonnlp.estimator import LanguageModelEstimator from gluonnlp.estimator import HiddenStateHandler, AvgParamHandler from gluonnlp.estimator import LearningRateHandler, RNNGradientUpdateHandler +from gluonnlp.estimator import WordLanguageModelCheckpointHandler from gluonnlp.estimator import LanguageModelBatchProcessor from gluonnlp.estimator import MetricResetHandler from mxnet.gluon.data.sampler import BatchSampler @@ -229,7 +230,8 @@ def __len__(self): LearningRateHandler(lr_update_interval=args.lr_update_interval, lr_update_factor=args.lr_update_factor), RNNGradientUpdateHandler(clip=args.clip), LoggingHandler(log_interval=args.log_interval, metrics=est.train_metrics + est.val_metrics), - MetricResetHandler(metrics=est.train_metrics, log_interval=args.log_interval)] + MetricResetHandler(metrics=est.train_metrics, log_interval=args.log_interval), + WordLanguageModelCheckpointHandler(args.save)] est.fit(train_data=train_data_loader, val_data=val_data_loader, epochs=args.epochs, event_handlers=event_handlers, diff --git a/src/gluonnlp/estimator/language_model_estimator.py b/src/gluonnlp/estimator/language_model_estimator.py index ac763a3723..6155e0e7c5 100644 --- a/src/gluonnlp/estimator/language_model_estimator.py +++ b/src/gluonnlp/estimator/language_model_estimator.py @@ -55,6 +55,7 @@ def __init__(self, net, loss, train_metrics=None, self.eval_hiddens = None self.avg_param = None self.bptt = bptt + self.ntasgd = False def detach(self, hidden): if isinstance(hidden, (tuple, list)): diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index 1401136749..46a6f259e1 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -30,7 +30,8 @@ from mxnet.gluon.utils import clip_global_norm __all__ = ['HiddenStateHandler', 'AvgParamHandler', 'LearningRateHandler', - 'RNNGradientUpdateHandler', 'MetricResetHandler'] + 'RNNGradientUpdateHandler', 'MetricResetHandler', + 'WordLanguageModelCheckpointHandler'] class HiddenStateHandler(EpochBegin): def __init__(self): @@ -42,7 +43,6 @@ def epoch_begin(self, estimator, *args, **kwargs): class AvgParamHandler(BatchEnd, EpochEnd): def __init__(self, data_length): - self.ntasgd = False self.epoch_id = 0 self.batch_id = 0 self.avg_trigger = 0 @@ -53,7 +53,7 @@ def __init__(self, data_length): def batch_end(self, estimator, *args, **kwargs): parameters = estimator.net.collect_params() - if self.ntasgd: + if estimator.ntasgd: if estimator.avg_param is None: estimator.avg_param = {k.split(estimator.net._prefix)[1]: v.data(estimator.context[0]).copy() for k, v in parameters.items()} @@ -82,7 +82,7 @@ def epoch_end(self, estimator, *args, **kwargs): = val.data(estimator.context[0]).copy() self.avg_trigger = (self.epoch_id + 1) * (self.data_length // estimator.bptt) print('Switching to NTASGD and avg_trigger is : %d' % self.avg_trigger) - self.ntasgd = True + estimator.ntasgd = True 
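+                # once NTASGD is triggered, batch_end keeps updating the running
+                # average and the checkpoint handler saves the averaged parameters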
self.valid_losses.append(val_metrics[0].get()[1]) self.t += 1 self.batch_id = 0 @@ -153,3 +153,26 @@ def batch_begin(self, estimator, *args, **kwargs): for metric in self.metrics: metric.reset_local() self.batch_id += 1 + +class WordLanguageModelCheckpointHandler(EpochEnd): + def __init__(self, save): + self.save = save + self.best_val = float('Inf') + + def epoch_end(self, estimator, *args, **kwargs): + if not isinstance(estimator.val_metrics, list): + val_metrics = [estimator.val_metrics] + else: + val_metrics = estimator.val_metrics + + if estimator.ntasgd: + mx.nd.save('{}.val.params'.format(self.save), estimator.avg_param) + else: + estimator.net.save_parameters('{}.val.params'.format(self.save)) + + if val_metrics[0].get()[1] < self.best_val: + self.best_val = val_metrics[0].get()[1] + if estimator.ntasgd: + mx.nd.save(self.save, estimator.avg_param) + else: + estimator.net.save_parameters(self.save) From 9aa824d808e466b120cefe0732c4f0c9088ef2d1 Mon Sep 17 00:00:00 2001 From: Zhuanghua Liu Date: Mon, 23 Dec 2019 04:41:57 +0000 Subject: [PATCH 07/32] Add large language model estimator --- .../large_word_language_model_estimator.py | 185 ++++++++++++++++++ scripts/estimator/sampler.py | 109 +++++++++++ .../word_language_model_estimator.py | 13 ++ .../language_model_batch_processor.py | 54 ++++- .../estimator/language_model_event_handler.py | 18 ++ .../estimator/parallel_language_model.py | 41 ++++ 6 files changed, 416 insertions(+), 4 deletions(-) create mode 100644 scripts/estimator/large_word_language_model_estimator.py create mode 100644 scripts/estimator/sampler.py create mode 100644 src/gluonnlp/estimator/parallel_language_model.py diff --git a/scripts/estimator/large_word_language_model_estimator.py b/scripts/estimator/large_word_language_model_estimator.py new file mode 100644 index 0000000000..c7ee01ff44 --- /dev/null +++ b/scripts/estimator/large_word_language_model_estimator.py @@ -0,0 +1,185 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import time +import math +import os +import sys +import argparse +import numpy as np +import mxnet as mx +from mxnet import gluon, autograd +import gluonnlp as nlp +from gluonnlp.utils import Parallel, Parallelizable +from sampler import LogUniformSampler + +curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +sys.path.append(os.path.join(curr_path, '..', '..')) + +nlp.utils.check_version('0.7.0') + +############################################################################### +# Arg parser +############################################################################### +parser = argparse.ArgumentParser(description= + 'Gluon-NLP Big LSTM 2048-512 Language Model on GBW') +parser.add_argument('--save', type=str, default='model.params', + help='path to save the final model.') +parser.add_argument('--emsize', type=int, default=512, + help='size of word embeddings') +parser.add_argument('--nhid', type=int, default=2048, + help='number of hidden units per layer') +parser.add_argument('--nproj', type=int, default=512, + help='number of projection units per layer. Could be different from embsize') +parser.add_argument('--nlayers', type=int, default=1, + help='number of layers') +parser.add_argument('--from-epoch', type=int, default=None, + help='start training or testing from the provided epoch') +parser.add_argument('--epochs', type=int, default=50, + help='number of epoch for training') +parser.add_argument('--batch-size', type=int, default=128, + help='batch size per gpu') +parser.add_argument('--dropout', type=float, default=0.1, + help='dropout applied to layers (0 = no dropout)') +parser.add_argument('--eps', type=float, default=1, + help='initial history accumulation for adagrad') +parser.add_argument('--bptt', type=int, default=20, + help='sequence length') +parser.add_argument('--k', type=int, default=8192, + help='number of noise samples for estimation') +parser.add_argument('--gpus', type=str, + help='list of gpus to run, e.g. 0 or 0,2,5. 
empty means using cpu.') +parser.add_argument('--log-interval', type=int, default=1000, + help='report interval') +parser.add_argument('--seed', type=int, default=0, + help='random seed') +parser.add_argument('--lr', type=float, default=0.2, + help='initial learning rate') +parser.add_argument('--clip', type=float, default=1.0, + help='gradient clipping by global norm.') +parser.add_argument('--test-mode', action='store_true', + help='Whether to run through the script with few examples') +parser.add_argument('--eval-only', action='store_true', + help='Whether to only run evaluation for the trained model') +args = parser.parse_args() + +segments = ['train', 'test'] +max_nbatch_eval = None + +if args.test_mode: + args.emsize = 200 + args.log_interval = 1 + args.nhid = 200 + args.nlayers = 1 + args.epochs = 20 + max_nbatch_eval = 3 + segments = ['test', 'test'] + +print(args) +mx.random.seed(args.seed) +np.random.seed(args.seed) + +context = [mx.cpu()] if args.gpus is None or args.gpus == '' else \ + [mx.gpu(int(x)) for x in args.gpus.split(',')] + +os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round' +os.environ['MXNET_CPU_PARALLEL_RAND_COPY'] = str(len(context)) +os.environ['MXNET_CPU_WORKER_NTHREADS'] = str(len(context)) + +############################################################################### +# Data stream +############################################################################### +train_data_stream, test_data_stream = \ + [nlp.data.GBWStream(segment=segment, skip_empty=True, bos=None, eos='') + for segment in segments] +vocab = train_data_stream.vocab +ntokens = len(vocab) + +# Sampler for generating negative classes during training with importance sampling +sampler = LogUniformSampler(ntokens, args.k) + +# Given a list of (array, context) pairs, load array[i] on context[i] +def _load(xs): + ret = [] + for x, ctx in zip(xs, context): + if isinstance(x, tuple): + ret.append([y.as_in_context(ctx) for y in x]) + else: + ret.append(x.as_in_context(ctx)) + return ret + +# Transformation for a data batch for training. +# First, load the data, target and mask to target contexts. +# Second, the LSTM-2048-512 model performs importance sampling for decoding +# during training, we need to sample negative candidate classes by invoking the +# log uniform sampler. 
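+# The transform below returns per-device lists (data, target, mask, sample),
+# where `sample` holds the log uniform sampler outputs for each device.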
+def _split_and_sample(x, y): + m = x != vocab[vocab.padding_token] # mask padding + num_ctx = len(context) + if num_ctx > 1: + xs = gluon.utils.split_data(x, num_ctx, batch_axis=1, even_split=True) + ys = gluon.utils.split_data(y, num_ctx, batch_axis=1, even_split=True) + ms = gluon.utils.split_data(m, num_ctx, batch_axis=1, even_split=True) + else: + xs, ys, ms = [x], [y], [m] + xs = _load(xs) + ys = _load(ys) + ms = _load(ms) + ss = [sampler(y) for y in ys] + ss = _load(ss) + return xs, ys, ms, ss + +train_batch_size = args.batch_size * len(context) +train_batchify = nlp.data.batchify.StreamBPTTBatchify(vocab, args.bptt, train_batch_size) +train_data = train_batchify(train_data_stream) +train_data = train_data.transform(_split_and_sample) + +test_batch_size = args.batch_size +test_batchify = nlp.data.batchify.StreamBPTTBatchify(vocab, args.bptt, test_batch_size) +test_data = test_batchify(test_data_stream) +test_data = nlp.data.PrefetchingStream(test_data) + +############################################################################### +# Build the model +############################################################################### + +eval_model = nlp.model.language_model.BigRNN(ntokens, args.emsize, args.nhid, + args.nlayers, args.nproj, + embed_dropout=args.dropout, + encode_dropout=args.dropout) +model = nlp.model.language_model.train.BigRNN(ntokens, args.emsize, args.nhid, + args.nlayers, args.nproj, args.k, + embed_dropout=args.dropout, + encode_dropout=args.dropout) +loss = gluon.loss.SoftmaxCrossEntropyLoss() +model.initialize(mx.init.Xavier(factor_type='out'), ctx=context) +trainer_params = {'learning_rate': args.lr, 'wd': 0, 'eps': args.eps} +trainer = gluon.Trainer(model.collect_params(), 'adagrad', trainer_params) +if args.from_epoch: + from_epoch = args.from_epoch + checkpoint_name = '%s.%s'%(args.save, format(from_epoch - 1, '02d')) + model.load_parameters(checkpoint_name) + trainer.load_states('%s.state'%args.save) + print('Loaded parameters from checkpoint %s'%(checkpoint_name)) + + +for i, batch in enumerate(train_data): + tmp = type(batch) + +model.hybridize(static_alloc=True, static_shape=True) +parallel_model = ParallelBigRNN(model, loss) +parallel = Parallel(len(context), parallel_model) diff --git a/scripts/estimator/sampler.py b/scripts/estimator/sampler.py new file mode 100644 index 0000000000..f841fba160 --- /dev/null +++ b/scripts/estimator/sampler.py @@ -0,0 +1,109 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Log Uniform Candidate Sampler""" + +import math +import numpy as np +from mxnet import ndarray, gluon + + +class LogUniformSampler(gluon.block.Block): + """Draw random samples from an approximately log-uniform or Zipfian distribution. 
+ + This operation randomly samples *num_sampled* candidates the range of integers [0, range_max). + The elements of sampled_candidates are drawn without replacement from the base distribution. + + The base distribution for this operator is an approximately log-uniform or Zipfian distribution: + + P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1) + + This sampler is useful when the true classes approximately follow such a distribution. + + For example, if the classes represent words in a lexicon sorted in decreasing order of + frequency. If your classes are not ordered by decreasing frequency, do not use this op. + + Additionally, it also returns the number of times each of the + true classes and the sampled classes is expected to occur. + + As the candidates are drawn without replacement, the expected count for the sampled candidates + and true classes are approximated. If the candidates are drawn with `num_tries` draws, we assume + (falsely) that the number of tries to get a batch of batch_size distinct values is always + `num_tries`, and the probability that the value is in a batch is 1 - (1-p)**num_tries. + + Parameters + ---------- + num_sampled: int + The number of classes to randomly sample. + range_max: int + The number of possible classes. + dtype: str or np.dtype + The dtype for outputs + """ + def __init__(self, range_max, num_sampled, dtype=None, **kwargs): + super(LogUniformSampler, self).__init__(**kwargs) + self._num_sampled = num_sampled + self._log_range = math.log(range_max + 1) + self._dtype = np.float32 if dtype is None else dtype + self._range_max = range_max + + def _prob_helper(self, num_tries, prob): + return (num_tries.astype('float64') * (-prob).log1p()).expm1() * -1 + + def forward(self, true_classes): # pylint: disable=arguments-differ + """Draw samples from log uniform distribution and returns sampled candidates, + expected count for true classes and sampled classes. + + Parameters + ---------- + true_classes: NDArray + The true classes. + + Returns + ------- + samples: NDArray + The sampled candidate classes. + expected_count_sample: NDArray + The expected count for sampled candidates. + expected_count_true: NDArray + The expected count for true classes in the same shape as `true_classes`. 
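+
+        Examples
+        --------
+        A rough usage sketch; `ntokens` and `true_classes` are illustrative and
+        come from the calling script::
+
+            sampler = LogUniformSampler(range_max=ntokens, num_sampled=8192)
+            samples, exp_count_sampled, exp_count_true = sampler(true_classes)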
+ """ + num_sampled = self._num_sampled + ctx = true_classes.context + num_tries = 0 + log_range = math.log(self._range_max + 1) + + # sample candidates + f = ndarray._internal._sample_unique_zipfian + sampled_classes, num_tries = f(self._range_max, shape=(1, num_sampled)) + sampled_classes = sampled_classes.reshape((-1,)) + sampled_classes = sampled_classes.as_in_context(ctx) + num_tries = num_tries.as_in_context(ctx) + + # expected count for true classes + true_cls = true_classes.as_in_context(ctx).astype('float64') + prob_true = ((true_cls + 2.0) / (true_cls + 1.0)).log() / log_range + count_true = self._prob_helper(num_tries, prob_true) + # expected count for sampled classes + sampled_classes = ndarray.array(sampled_classes, ctx=ctx, dtype='int64') + sampled_cls_fp64 = sampled_classes.astype('float64') + prob_sampled = ((sampled_cls_fp64 + 2.0) / (sampled_cls_fp64 + 1.0)).log() / log_range + count_sampled = self._prob_helper(num_tries, prob_sampled) + # convert to dtype + sampled_classes = sampled_classes.astype(self._dtype, copy=False) + count_true = count_true.astype(self._dtype, copy=False) + count_sampled = count_sampled.astype(self._dtype, copy=False) + return sampled_classes, count_sampled, count_true diff --git a/scripts/estimator/word_language_model_estimator.py b/scripts/estimator/word_language_model_estimator.py index 6e6b5a8bb3..aacd787cde 100644 --- a/scripts/estimator/word_language_model_estimator.py +++ b/scripts/estimator/word_language_model_estimator.py @@ -192,6 +192,19 @@ def __len__(self): print(model) + +def check_initialized(net): + params = net.collect_params() + for param in params: + try: + params[param].list_ctx() + except RuntimeError: + return False + return True + +print(check_initialized(model)) +print(check_initialized(model_eval)) + if args.optimizer == 'sgd': trainer_params = {'learning_rate': args.lr, 'momentum': 0, diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index 4655dd620e..411d93ad2c 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -35,16 +35,16 @@ def fit_batch(self, estimator, train_batch, batch_axis=0): batch_size = train_batch.shape[batch_axis] data = split_and_load(data, estimator.context, batch_axis=batch_axis, even_split=True) target = split_and_load(target, estimator.context, batch_axis=batch_axis, even_split=True) - - Ls = [] - outputs = [] - data_size = 0 if estimator.hiddens is None: estimator.hiddens = [estimator.net.begin_state(batch_size // len(estimator.context), func=mx.nd.zeros, ctx=ctx) for ctx in estimator.context] else: estimator.hiddens = estimator.detach(estimator.hiddens) + + Ls = [] + outputs = [] + data_size = 0 with mx.autograd.record(): for i, (X, y, h) in enumerate(zip(data, target, estimator.hiddens)): output, h, encoder_hs, dropped_encoder_hs = estimator.net(X, h) @@ -84,3 +84,49 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): outputs.append(output) return data, target, outputs, Ls + +class ParallelLanguageModelBatchProcessor(BatchProcessor): + def __init__(self): + pass + + def fit_batch(self, estimator, train_batch, batch_axis=0): + data, target, mask, sample = train_batch + batch_size = data.shape(batch_axis) + if estimator.hiddens is None: + estimator.hiddens = [estimator.net.begin_state(batch_size, + func=mx.nd.zeros, + ctx=ctx) for ctx in estimator.context] + else: + estimator.hiddens = estimator.detach(estimator.hiddens) + Ls = 
[] + for _, batch in enumerate(zip(data, target, mask, sample, hiddens)): + paralllel.put(batch) + + for _ in range(len(data)): + hidden, ls = parallel.get() + index = estimator.context.index(hidden[0].context) + estimator.hiddens[index] = hidden + Ls.append(ls) + + #Ls = [l / estimator.bptt for l in Ls] + return data, target, hiddens, Ls + + def evaluate_batch(self, estimator, val_batch, batch_axis=0): + data, target = val_batch + ctx = estimator.context[0] + data = data.as_in_context(ctx) + target = target.as_in_context(ctx) + if estimator.eval_hiddens is None: + estimator.eval_hiddens = estimator.eval_net.begin_state(batch_size=batch_size, + func=mx.nd.zeros, + ctx=ctx) + else: + estimator.eval_hiddens = estimator.detach(estimator.eval_hiddens) + + mask = data != vocab[vocab.padding_token] + output, estimator.eval_hiddens = estimator.eval_net(data, estimator.eval_hiddens) + output = output.reshape((-3, -1)) + L = estimator.evaluation_loss(output, target.reshape(-1, ) * mask.reshape(-1)) + L = L * mask + + return data, target, output, L diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index 46a6f259e1..378e152a14 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -137,6 +137,24 @@ def batch_end(self, estimator, *args, **kwargs): estimator.trainer.step(1) +class LargeRNNGradientUpdateHandler(GradientUpdateHandler): + def __init__(self, batch_size, clip=None, **kwargs): + super().__init__(**kwargs) + self.batch_size = batch_size + self.clip = clip + + def batch_end(self, estimator, *args, **kwargs): + encoder_params = estimator.net.encoder.collect_params().values() + embedding_params = list(estimator.net.embedding.collect_params().values()) + + for ctx in estimator.context: + x = embedding_params[0].grad(ctx) + x[:] *= self.batch_size # can I get the batch size dynamically? + encoder_grad = [p.grad(ctx) for p in encoder_params] + gluon.utils.clip_global_norm(encoder_grad, self.clip) + + estimator.trainer.step(len(estimator.context)) + class MetricResetHandler(BatchBegin, MetricHandler): def __init__(self, metrics, log_interval=1): super().__init__(metrics=metrics) diff --git a/src/gluonnlp/estimator/parallel_language_model.py b/src/gluonnlp/estimator/parallel_language_model.py new file mode 100644 index 0000000000..8697d90360 --- /dev/null +++ b/src/gluonnlp/estimator/parallel_language_model.py @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# coding: utf-8 +# pylint: disable=wildcard-import, unused-variable +""" Gluon Parallel Languange Model """ + +from gluonnlp.utils import Parallel, Parallelizable + +__all__ = ['ParallelBigRNN'] + +class ParallelBigRNN(Parallelizable): + def __init__(self, rnn, loss_fn): + self._model = rnn + self._loss = loss_fn + + def forward_backward(self, x): + X, y, m, s, h = x + with autograd.record(): + output, hidden, new_target = self._model(X, y, h, s) + output = output.reshape((-3, -1)) + new_target = new_target.reshape((-1,)) + ls = self._loss(output, new_target) * m.reshape((-1,)) + ls = ls / args.batch_size + ls.backward() + return hidden, ls + From 06295ef0dfab06684a5d697a2fdc422496aee2ff Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 16 Jan 2020 08:02:59 +0000 Subject: [PATCH 08/32] fix name errors --- .../word_language_model_estimator.py | 4 +-- .../language_model_batch_processor.py | 26 +++++++++---------- .../estimator/language_model_estimator.py | 10 +++---- .../estimator/language_model_event_handler.py | 2 +- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/scripts/estimator/word_language_model_estimator.py b/scripts/estimator/word_language_model_estimator.py index aacd787cde..1138a43e8d 100644 --- a/scripts/estimator/word_language_model_estimator.py +++ b/scripts/estimator/word_language_model_estimator.py @@ -236,8 +236,8 @@ def check_initialized(net): train_metrics=train_metric, val_metrics=val_metric, trainer=trainer, context=context, - evaluation_loss=loss, - eval_net=model_eval, + val_loss=loss, + val_net=model_eval, batch_processor=batch_processor) event_handlers = [HiddenStateHandler(), AvgParamHandler(data_length=len(train_data)), LearningRateHandler(lr_update_interval=args.lr_update_interval, lr_update_factor=args.lr_update_factor), diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index 411d93ad2c..b6a72777ca 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -69,17 +69,17 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): Ls = [] outputs = [] - if estimator.eval_hiddens is None: - estimator.eval_hiddens = \ - [estimator.eval_net.begin_state(batch_size // + if estimator.val_hiddens is None: + estimator.val_hiddens = \ + [estimator.val_net.begin_state(batch_size // len(estimator.context), func=mx.nd.zeros, ctx=ctx) for ctx \ in estimator.context] else: - estimator.eval_hiddens = estimator.detach(estimator.eval_hiddens) - for i, (X, y, h) in enumerate(zip(data, target, estimator.eval_hiddens)): - output, h = estimator.eval_net(X, h) - L = estimator.evaluation_loss(output.reshape(-3, -1), y.reshape(-1,)) - estimator.eval_hiddens[i] = h + estimator.val_hiddens = estimator.detach(estimator.val_hiddens) + for i, (X, y, h) in enumerate(zip(data, target, estimator.val_hiddens)): + output, h = estimator.val_net(X, h) + L = estimator.val_loss(output.reshape(-3, -1), y.reshape(-1,)) + estimator.val_hiddens[i] = h Ls.append(L) outputs.append(output) @@ -116,17 +116,17 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): ctx = estimator.context[0] data = data.as_in_context(ctx) target = target.as_in_context(ctx) - if estimator.eval_hiddens is None: - estimator.eval_hiddens = estimator.eval_net.begin_state(batch_size=batch_size, + if estimator.val_hiddens is None: + estimator.val_hiddens = estimator.val_net.begin_state(batch_size=batch_size, func=mx.nd.zeros, ctx=ctx) else: - 
estimator.eval_hiddens = estimator.detach(estimator.eval_hiddens) + estimator.val_hiddens = estimator.detach(estimator.val_hiddens) mask = data != vocab[vocab.padding_token] - output, estimator.eval_hiddens = estimator.eval_net(data, estimator.eval_hiddens) + output, estimator.val_hiddens = estimator.val_net(data, estimator.val_hiddens) output = output.reshape((-3, -1)) - L = estimator.evaluation_loss(output, target.reshape(-1, ) * mask.reshape(-1)) + L = estimator.val_loss(output, target.reshape(-1, ) * mask.reshape(-1)) L = L * mask return data, target, output, L diff --git a/src/gluonnlp/estimator/language_model_estimator.py b/src/gluonnlp/estimator/language_model_estimator.py index 6155e0e7c5..808eabb27d 100644 --- a/src/gluonnlp/estimator/language_model_estimator.py +++ b/src/gluonnlp/estimator/language_model_estimator.py @@ -38,8 +38,8 @@ def __init__(self, net, loss, train_metrics=None, initializer=None, trainer=None, context=None, - evaluation_loss=None, - eval_net=None, + val_loss=None, + val_net=None, batch_processor=LanguageModelBatchProcessor(), bptt=70): super().__init__(net=net, loss=loss, @@ -48,11 +48,11 @@ def __init__(self, net, loss, train_metrics=None, initializer=initializer, trainer=trainer, context=context, - evaluation_loss=evaluation_loss, - eval_net=eval_net, + val_loss=val_loss, + val_net=val_net, batch_processor=batch_processor) self.hiddens = None - self.eval_hiddens = None + self.val_hiddens = None self.avg_param = None self.bptt = bptt self.ntasgd = False diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index 378e152a14..b735fc601e 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -39,7 +39,7 @@ def __init__(self): def epoch_begin(self, estimator, *args, **kwargs): estimator.hiddens = None - estimator.eval_hiddens = None + estimator.val_hiddens = None class AvgParamHandler(BatchEnd, EpochEnd): def __init__(self, data_length): From 17ef38cc1b0b94439e96333a9466979e96565efa Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 17 Jan 2020 10:05:44 +0000 Subject: [PATCH 09/32] add word language model evaluation code --- scripts/estimator/word_language_model_estimator.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/estimator/word_language_model_estimator.py b/scripts/estimator/word_language_model_estimator.py index 1138a43e8d..8dec869d12 100644 --- a/scripts/estimator/word_language_model_estimator.py +++ b/scripts/estimator/word_language_model_estimator.py @@ -224,10 +224,14 @@ def check_initialized(net): sampler = BatchVariableLenTextSampler(bptt=70, length=len(train_data)) val_sampler = BatchVariableLenTextSampler(bptt=70, length=len(val_data), use_variable_length=False) +test_sampler = BatchVariableLenTextSampler(bptt=70, length=len(test_data), + use_variable_length=False) train_data_loader = mx.gluon.data.DataLoader(train_data, batch_sampler=sampler) val_data_loader = mx.gluon.data.DataLoader(val_data, batch_sampler=val_sampler) +test_data_loader = mx.gluon.data.DataLoader(test_data, + batch_sampler=test_sampler) train_metric = mx.metric.Loss(train_loss) val_metric = mx.metric.Loss(loss) @@ -249,3 +253,6 @@ def check_initialized(net): epochs=args.epochs, event_handlers=event_handlers, batch_axis=1) + +est.evaluate(val_data=val_data_loader, event_handlers=[HiddenStateHandler()], batch_axis=1) +est.evaluate(val_data=test_data_loader, event_handlers=[HiddenStateHandler()], batch_axis=1) From 
87651c5d5f5544bfa1016a26a3ae03449af220cc Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 17 Jan 2020 10:35:00 +0000 Subject: [PATCH 10/32] update parallel language model --- .../language_model_batch_processor.py | 20 +++++++++++++------ .../estimator/language_model_event_handler.py | 5 +++-- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index b6a72777ca..8cab67b478 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -22,8 +22,10 @@ import mxnet as mx from mxnet.gluon.contrib.estimator import BatchProcessor from mxnet.gluon.utils import split_and_load +from ..utils import Parallel +from .parallel_language_model import ParallelBigRNN -__all__ = ['LanguageModelBatchProcessor'] +__all__ = ['LanguageModelBatchProcessor', 'ParallelLanguageModelBatchProcessor'] class LanguageModelBatchProcessor(BatchProcessor): def __init__(self): @@ -86,10 +88,16 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): return data, target, outputs, Ls class ParallelLanguageModelBatchProcessor(BatchProcessor): - def __init__(self): - pass + def __init__(self, loss): + self.loss = loss + + def _get_parallel_model(self): + if self.parallel_model is None: + self.parallel_model = ParallelBigRNN(estimator.net, self.loss) + self.parallel_model = Parallel(len(estimator.context), self.parallel_model) def fit_batch(self, estimator, train_batch, batch_axis=0): + self._get_parallel_model() data, target, mask, sample = train_batch batch_size = data.shape(batch_axis) if estimator.hiddens is None: @@ -100,16 +108,16 @@ def fit_batch(self, estimator, train_batch, batch_axis=0): estimator.hiddens = estimator.detach(estimator.hiddens) Ls = [] for _, batch in enumerate(zip(data, target, mask, sample, hiddens)): - paralllel.put(batch) + self.parallel_model.put(batch) for _ in range(len(data)): - hidden, ls = parallel.get() + hidden, ls = self.parallel_model.get() index = estimator.context.index(hidden[0].context) estimator.hiddens[index] = hidden Ls.append(ls) #Ls = [l / estimator.bptt for l in Ls] - return data, target, hiddens, Ls + return data, target, None, Ls def evaluate_batch(self, estimator, val_batch, batch_axis=0): data, target = val_batch diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index b735fc601e..609372735f 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -31,7 +31,8 @@ __all__ = ['HiddenStateHandler', 'AvgParamHandler', 'LearningRateHandler', 'RNNGradientUpdateHandler', 'MetricResetHandler', - 'WordLanguageModelCheckpointHandler'] + 'WordLanguageModelCheckpointHandler', + 'LargeRNNGradientUpdateHandler'] class HiddenStateHandler(EpochBegin): def __init__(self): @@ -149,7 +150,7 @@ def batch_end(self, estimator, *args, **kwargs): for ctx in estimator.context: x = embedding_params[0].grad(ctx) - x[:] *= self.batch_size # can I get the batch size dynamically? 
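+            # the loss in ParallelBigRNN is scaled by 1/batch_size, so the embedding
+            # gradient is rescaled here before the encoder gradients are clipped below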
+ x[:] *= self.batch_size encoder_grad = [p.grad(ctx) for p in encoder_params] gluon.utils.clip_global_norm(encoder_grad, self.clip) From e56572321d9ca75bf1c944695fbd1043d2cb209d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 17 Jan 2020 11:06:44 +0000 Subject: [PATCH 11/32] update large language model estimator --- .../large_word_language_model_estimator.py | 40 ++++++++++++++++--- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/scripts/estimator/large_word_language_model_estimator.py b/scripts/estimator/large_word_language_model_estimator.py index c7ee01ff44..7ea7bcbb96 100644 --- a/scripts/estimator/large_word_language_model_estimator.py +++ b/scripts/estimator/large_word_language_model_estimator.py @@ -26,6 +26,10 @@ import gluonnlp as nlp from gluonnlp.utils import Parallel, Parallelizable from sampler import LogUniformSampler +from gluonnlp.estimator import ParallelLanguageModelBatchProcessor +from gluonnlp.estimator import HiddenStateHandler, MetricResetHandler +from gluonnlp.estimator import LargeRNNGradientUpdateHandler +from gluonnlp.estimator import WordLanguageModelCheckpointHandler curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.append(os.path.join(curr_path, '..', '..')) @@ -157,14 +161,16 @@ def _split_and_sample(x, y): # Build the model ############################################################################### -eval_model = nlp.model.language_model.BigRNN(ntokens, args.emsize, args.nhid, - args.nlayers, args.nproj, - embed_dropout=args.dropout, - encode_dropout=args.dropout) model = nlp.model.language_model.train.BigRNN(ntokens, args.emsize, args.nhid, args.nlayers, args.nproj, args.k, embed_dropout=args.dropout, encode_dropout=args.dropout) +eval_model = nlp.model.language_model.BigRNN(ntokens, args.emsize, args.nhid, + args.nlayers, args.nproj, + embed_dropout=args.dropout, + encode_dropout=args.dropout, + params=model.collect_params()) + loss = gluon.loss.SoftmaxCrossEntropyLoss() model.initialize(mx.init.Xavier(factor_type='out'), ctx=context) trainer_params = {'learning_rate': args.lr, 'wd': 0, 'eps': args.eps} @@ -181,5 +187,27 @@ def _split_and_sample(x, y): tmp = type(batch) model.hybridize(static_alloc=True, static_shape=True) -parallel_model = ParallelBigRNN(model, loss) -parallel = Parallel(len(context), parallel_model) + +train_metric = mx.metric.Loss(loss) +val_metric = mx.metric.Loss(loss) +batch_processor = ParallelLanguageModelBatchProcessor(loss) +lm_estimator = LanguageModelEstimator(net=model, loss=loss, + train_metrics=train_metric, + val_metrics=val_metric, + trainer=trainer, + context=context, + val_loss=loss, + val_net=eval_model, + batch_processor=batch_processor) + +hidden_state_handler = HiddenStateHandler() +gradient_handler = LargeRNNGradientUpdateHandler(batch_size=args.batch_size, clip=args.clip) +metric_handler = MetricResetHandler(metrics=est.train_metrics, + log_interval=args.log_interval) +checkpoint_handler = WrodLanguageModelCheckpointHandler(args.save) + +event_handlers = [hidden_state_handler, gradient_handler, + metric_handler, checkpoint_handler] + +lm_estimator.fit(train_data=train_data, epochs=args.epochs, + event_handlers=event_handlers, batch_axis=0) From cfc2f6ddde095f2fabe1c513fa2a9e87251820e6 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 17 Jan 2020 11:27:18 +0000 Subject: [PATCH 12/32] fix typos --- scripts/estimator/large_word_language_model_estimator.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git 
a/scripts/estimator/large_word_language_model_estimator.py b/scripts/estimator/large_word_language_model_estimator.py index 7ea7bcbb96..a5d6468c23 100644 --- a/scripts/estimator/large_word_language_model_estimator.py +++ b/scripts/estimator/large_word_language_model_estimator.py @@ -30,6 +30,7 @@ from gluonnlp.estimator import HiddenStateHandler, MetricResetHandler from gluonnlp.estimator import LargeRNNGradientUpdateHandler from gluonnlp.estimator import WordLanguageModelCheckpointHandler +from gluonnlp.estimator import LanguageModelEstimator curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.append(os.path.join(curr_path, '..', '..')) @@ -183,9 +184,6 @@ def _split_and_sample(x, y): print('Loaded parameters from checkpoint %s'%(checkpoint_name)) -for i, batch in enumerate(train_data): - tmp = type(batch) - model.hybridize(static_alloc=True, static_shape=True) train_metric = mx.metric.Loss(loss) @@ -202,9 +200,9 @@ def _split_and_sample(x, y): hidden_state_handler = HiddenStateHandler() gradient_handler = LargeRNNGradientUpdateHandler(batch_size=args.batch_size, clip=args.clip) -metric_handler = MetricResetHandler(metrics=est.train_metrics, +metric_handler = MetricResetHandler(metrics=lm_estimator.train_metrics, log_interval=args.log_interval) -checkpoint_handler = WrodLanguageModelCheckpointHandler(args.save) +checkpoint_handler = WordLanguageModelCheckpointHandler(args.save) event_handlers = [hidden_state_handler, gradient_handler, metric_handler, checkpoint_handler] From 3bf7679f2376621290d5eb12e2501a6827f5e660 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 19 Jan 2020 09:41:53 +0000 Subject: [PATCH 13/32] fix large language model estimator bugs --- .../large_word_language_model_estimator.py | 30 ++++++-- .../language_model_batch_processor.py | 28 ++++--- .../estimator/language_model_event_handler.py | 27 +++++-- .../estimator/length_normalized_loss.py | 77 +++++++++++++++++++ .../estimator/parallel_language_model.py | 6 +- 5 files changed, 141 insertions(+), 27 deletions(-) create mode 100644 src/gluonnlp/estimator/length_normalized_loss.py diff --git a/scripts/estimator/large_word_language_model_estimator.py b/scripts/estimator/large_word_language_model_estimator.py index a5d6468c23..49ce2fb458 100644 --- a/scripts/estimator/large_word_language_model_estimator.py +++ b/scripts/estimator/large_word_language_model_estimator.py @@ -23,6 +23,7 @@ import numpy as np import mxnet as mx from mxnet import gluon, autograd +from mxnet.gluon.contrib.estimator import CheckpointHandler import gluonnlp as nlp from gluonnlp.utils import Parallel, Parallelizable from sampler import LogUniformSampler @@ -31,6 +32,7 @@ from gluonnlp.estimator import LargeRNNGradientUpdateHandler from gluonnlp.estimator import WordLanguageModelCheckpointHandler from gluonnlp.estimator import LanguageModelEstimator +from gluonnlp.estimator.length_normalized_loss import LengthNormalizedLoss curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.append(os.path.join(curr_path, '..', '..')) @@ -169,8 +171,7 @@ def _split_and_sample(x, y): eval_model = nlp.model.language_model.BigRNN(ntokens, args.emsize, args.nhid, args.nlayers, args.nproj, embed_dropout=args.dropout, - encode_dropout=args.dropout, - params=model.collect_params()) + encode_dropout=args.dropout) loss = gluon.loss.SoftmaxCrossEntropyLoss() model.initialize(mx.init.Xavier(factor_type='out'), ctx=context) @@ -187,8 +188,11 @@ def _split_and_sample(x, y): model.hybridize(static_alloc=True, 
static_shape=True) train_metric = mx.metric.Loss(loss) -val_metric = mx.metric.Loss(loss) -batch_processor = ParallelLanguageModelBatchProcessor(loss) +val_metric = LengthNormalizedLoss(loss) +batch_processor = ParallelLanguageModelBatchProcessor(loss=loss, + vocab=vocab, + batch_size=args.batch_size, + val_batch_size=args.batch_size) lm_estimator = LanguageModelEstimator(net=model, loss=loss, train_metrics=train_metric, val_metrics=val_metric, @@ -196,16 +200,26 @@ def _split_and_sample(x, y): context=context, val_loss=loss, val_net=eval_model, - batch_processor=batch_processor) + batch_processor=batch_processor, + bptt=args.bptt) hidden_state_handler = HiddenStateHandler() gradient_handler = LargeRNNGradientUpdateHandler(batch_size=args.batch_size, clip=args.clip) metric_handler = MetricResetHandler(metrics=lm_estimator.train_metrics, log_interval=args.log_interval) -checkpoint_handler = WordLanguageModelCheckpointHandler(args.save) +checkpoint_handler = CheckpointHandler(model_dir=args.save, model_prefix='largeRNN') event_handlers = [hidden_state_handler, gradient_handler, metric_handler, checkpoint_handler] -lm_estimator.fit(train_data=train_data, epochs=args.epochs, - event_handlers=event_handlers, batch_axis=0) +lm_estimator.fit(train_data=train_data, + #epochs=args.epochs, + event_handlers=event_handlers, + batches=5, + batch_axis=0) + +val_metric_handler = MetricResetHandler(metrics=lm_estimator.val_metrics) +lm_estimator.val_net.initialize(mx.init.Xavier(), ctx=context[0]) +lm_estimator.val_net.hybridize(static_alloc=True, static_shape=True) +lm_estimator.val_net.load_parameters(args.save + '/largeRNN-epoch0batch5.params') +lm_estimator.evaluate(val_data=test_data, event_handlers=[val_metric_handler]) diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index 8cab67b478..d1be813fad 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -88,26 +88,29 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): return data, target, outputs, Ls class ParallelLanguageModelBatchProcessor(BatchProcessor): - def __init__(self, loss): + def __init__(self, loss, vocab, batch_size, val_batch_size): self.loss = loss + self.parallel_model = None + self.batch_size = batch_size + self.val_batch_size = val_batch_size + self.vocab = vocab - def _get_parallel_model(self): + def _get_parallel_model(self, estimator): if self.parallel_model is None: - self.parallel_model = ParallelBigRNN(estimator.net, self.loss) + self.parallel_model = ParallelBigRNN(estimator.net, self.loss, self.batch_size) self.parallel_model = Parallel(len(estimator.context), self.parallel_model) def fit_batch(self, estimator, train_batch, batch_axis=0): - self._get_parallel_model() + self._get_parallel_model(estimator) data, target, mask, sample = train_batch - batch_size = data.shape(batch_axis) if estimator.hiddens is None: - estimator.hiddens = [estimator.net.begin_state(batch_size, + estimator.hiddens = [estimator.net.begin_state(batch_size=self.batch_size, func=mx.nd.zeros, ctx=ctx) for ctx in estimator.context] else: estimator.hiddens = estimator.detach(estimator.hiddens) Ls = [] - for _, batch in enumerate(zip(data, target, mask, sample, hiddens)): + for _, batch in enumerate(zip(data, target, mask, sample, estimator.hiddens)): self.parallel_model.put(batch) for _ in range(len(data)): @@ -116,7 +119,8 @@ def fit_batch(self, estimator, train_batch, batch_axis=0): 
estimator.hiddens[index] = hidden Ls.append(ls) - #Ls = [l / estimator.bptt for l in Ls] + Ls = [l / (estimator.bptt * len(estimator.context)) for l in Ls] + Ls = [mx.nd.sum(l) for l in Ls] return data, target, None, Ls def evaluate_batch(self, estimator, val_batch, batch_axis=0): @@ -125,16 +129,16 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): data = data.as_in_context(ctx) target = target.as_in_context(ctx) if estimator.val_hiddens is None: - estimator.val_hiddens = estimator.val_net.begin_state(batch_size=batch_size, + estimator.val_hiddens = estimator.val_net.begin_state(batch_size=self.val_batch_size, func=mx.nd.zeros, ctx=ctx) else: estimator.val_hiddens = estimator.detach(estimator.val_hiddens) - mask = data != vocab[vocab.padding_token] + mask = data != self.vocab[self.vocab.padding_token] + mask = mask.reshape(-1) output, estimator.val_hiddens = estimator.val_net(data, estimator.val_hiddens) output = output.reshape((-3, -1)) L = estimator.val_loss(output, target.reshape(-1, ) * mask.reshape(-1)) - L = L * mask - return data, target, output, L + return data, [target, mask], output, L diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index 609372735f..b9ceca599e 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -28,6 +28,8 @@ from mxnet.gluon.contrib.estimator import GradientUpdateHandler from mxnet.gluon.contrib.estimator import MetricHandler from mxnet.gluon.utils import clip_global_norm +from mxnet.metric import Loss as MetricLoss +from .length_normalized_loss import LengthNormalizedLoss __all__ = ['HiddenStateHandler', 'AvgParamHandler', 'LearningRateHandler', 'RNNGradientUpdateHandler', 'MetricResetHandler', @@ -152,12 +154,12 @@ def batch_end(self, estimator, *args, **kwargs): x = embedding_params[0].grad(ctx) x[:] *= self.batch_size encoder_grad = [p.grad(ctx) for p in encoder_params] - gluon.utils.clip_global_norm(encoder_grad, self.clip) + clip_global_norm(encoder_grad, self.clip) estimator.trainer.step(len(estimator.context)) class MetricResetHandler(BatchBegin, MetricHandler): - def __init__(self, metrics, log_interval=1): + def __init__(self, metrics, log_interval=None): super().__init__(metrics=metrics) self.batch_id = 0 self.log_interval = log_interval @@ -168,11 +170,24 @@ def epoch_begin(self, estimator, *args, **kwargs): metric.reset() def batch_begin(self, estimator, *args, **kwargs): - if self.batch_id % self.log_interval == 1: - for metric in self.metrics: - metric.reset_local() + if self.log_interval is not None: + if self.batch_id % self.log_interval == 0: + for metric in self.metrics: + metric.reset_local() self.batch_id += 1 + def batch_end(self, estimator, *args, **kwargs): + pred = kwargs['pred'] + label = kwargs['label'] + loss = kwargs['loss'] + for metric in self.metrics: + if isinstance(metric, MetricLoss): + metric.update(0, loss) + elif isinstance(metric, LengthNormalizedLoss): + metric.update(label, loss) + else: + metric.update(label, pred) + class WordLanguageModelCheckpointHandler(EpochEnd): def __init__(self, save): self.save = save @@ -195,3 +210,5 @@ def epoch_end(self, estimator, *args, **kwargs): mx.nd.save(self.save, estimator.avg_param) else: estimator.net.save_parameters(self.save) + + diff --git a/src/gluonnlp/estimator/length_normalized_loss.py b/src/gluonnlp/estimator/length_normalized_loss.py new file mode 100644 index 0000000000..e4558c6fb1 --- /dev/null +++ 
b/src/gluonnlp/estimator/length_normalized_loss.py @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" Length Normalized Loss """ + +from mxnet import ndarray +from mxnet.metric import EvalMetric + +__all__ = ['LengthNormalizedLoss'] + +class LengthNormalizedLoss(EvalMetric): + """Compute length normalized loss metrics + + Parameters + ---------- + axis : int, default=1 + The axis that represents classes + name : str + Name of this metric instance for display. + output_names : list of str, or None + Name of predictions that should be used when updating with update_dict. + By default include all predictions. + label_names : list of str, or None + Name of labels that should be used when updating with update_dict. + By default include all labels. + """ + def __init__(self, axis=0, name='length-normalized-loss', + output_names=None, label_names=None): + super(LengthNormalizedLoss, self).__init__( + name, axis=axis, + output_names=output_names, label_names=label_names, + has_global_stats=True) + + # Parameter labels should be a list in the form of [target_sequence, + # target_seqauence_valid_length] + def update(self, labels, preds): + if not isinstance(labels, list) or len(labels) != 2: + raise ValueError('labels must be a list. 
Its first element should be' + ' target sequence and the second element should be' + 'the valid length of sequence.') + + _, seq_valid_length = labels + + if not isinstance(seq_valid_length, list): + seq_valid_length = [seq_valid_length] + + if not isinstance(preds, list): + preds = [preds] + + for length in seq_valid_length: + if isinstance(length, ndarray.ndarray.NDArray): + total_length = ndarray.sum(length).asscalar() + else: + total_length = length + self.num_inst += total_length + self.global_num_inst += total_length + + for pred in preds: + if isinstance(pred, ndarray.ndarray.NDArray): + loss = ndarray.sum(pred).asscalar() + else: + loss = pred + self.sum_metric += loss + self.global_sum_metric += loss diff --git a/src/gluonnlp/estimator/parallel_language_model.py b/src/gluonnlp/estimator/parallel_language_model.py index 8697d90360..a4e43b29a8 100644 --- a/src/gluonnlp/estimator/parallel_language_model.py +++ b/src/gluonnlp/estimator/parallel_language_model.py @@ -20,13 +20,15 @@ """ Gluon Parallel Languange Model """ from gluonnlp.utils import Parallel, Parallelizable +from mxnet import autograd __all__ = ['ParallelBigRNN'] class ParallelBigRNN(Parallelizable): - def __init__(self, rnn, loss_fn): + def __init__(self, rnn, loss_fn, batch_size): self._model = rnn self._loss = loss_fn + self._batch_size = batch_size def forward_backward(self, x): X, y, m, s, h = x @@ -35,7 +37,7 @@ def forward_backward(self, x): output = output.reshape((-3, -1)) new_target = new_target.reshape((-1,)) ls = self._loss(output, new_target) * m.reshape((-1,)) - ls = ls / args.batch_size + ls = ls / self._batch_size ls.backward() return hidden, ls From 50b3a95af2f51c5752b7cd5112e03b053520a829 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 22 Jan 2020 06:33:33 +0000 Subject: [PATCH 14/32] some bug fixes on language model estimator --- scripts/estimator/large_word_language_model_estimator.py | 4 +++- scripts/estimator/word_language_model_estimator.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/estimator/large_word_language_model_estimator.py b/scripts/estimator/large_word_language_model_estimator.py index 49ce2fb458..0d9d9d0cca 100644 --- a/scripts/estimator/large_word_language_model_estimator.py +++ b/scripts/estimator/large_word_language_model_estimator.py @@ -208,9 +208,11 @@ def _split_and_sample(x, y): metric_handler = MetricResetHandler(metrics=lm_estimator.train_metrics, log_interval=args.log_interval) checkpoint_handler = CheckpointHandler(model_dir=args.save, model_prefix='largeRNN') +logging_handler = LoggingHandler(log_interval=args.log_interval, + metrics=lm_estimator.train_metrics) event_handlers = [hidden_state_handler, gradient_handler, - metric_handler, checkpoint_handler] + metric_handler, checkpoint_handler, logging_handler] lm_estimator.fit(train_data=train_data, #epochs=args.epochs, diff --git a/scripts/estimator/word_language_model_estimator.py b/scripts/estimator/word_language_model_estimator.py index 8dec869d12..f14ea63030 100644 --- a/scripts/estimator/word_language_model_estimator.py +++ b/scripts/estimator/word_language_model_estimator.py @@ -254,5 +254,6 @@ def check_initialized(net): event_handlers=event_handlers, batch_axis=1) +est.net.load_parameters(args.save) est.evaluate(val_data=val_data_loader, event_handlers=[HiddenStateHandler()], batch_axis=1) est.evaluate(val_data=test_data_loader, event_handlers=[HiddenStateHandler()], batch_axis=1) From 275098f5bbb4088345bed278e79180762a070f4d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 12 Feb 2020 
08:36:17 +0000 Subject: [PATCH 15/32] update large language model estimator --- .../large_word_language_model_estimator.py | 227 --------------- .../word_language_model_estimator.py | 259 ------------------ .../language_model_batch_processor.py | 2 +- .../estimator/language_model_event_handler.py | 27 +- 4 files changed, 26 insertions(+), 489 deletions(-) delete mode 100644 scripts/estimator/large_word_language_model_estimator.py delete mode 100644 scripts/estimator/word_language_model_estimator.py diff --git a/scripts/estimator/large_word_language_model_estimator.py b/scripts/estimator/large_word_language_model_estimator.py deleted file mode 100644 index 0d9d9d0cca..0000000000 --- a/scripts/estimator/large_word_language_model_estimator.py +++ /dev/null @@ -1,227 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import time -import math -import os -import sys -import argparse -import numpy as np -import mxnet as mx -from mxnet import gluon, autograd -from mxnet.gluon.contrib.estimator import CheckpointHandler -import gluonnlp as nlp -from gluonnlp.utils import Parallel, Parallelizable -from sampler import LogUniformSampler -from gluonnlp.estimator import ParallelLanguageModelBatchProcessor -from gluonnlp.estimator import HiddenStateHandler, MetricResetHandler -from gluonnlp.estimator import LargeRNNGradientUpdateHandler -from gluonnlp.estimator import WordLanguageModelCheckpointHandler -from gluonnlp.estimator import LanguageModelEstimator -from gluonnlp.estimator.length_normalized_loss import LengthNormalizedLoss - -curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) -sys.path.append(os.path.join(curr_path, '..', '..')) - -nlp.utils.check_version('0.7.0') - -############################################################################### -# Arg parser -############################################################################### -parser = argparse.ArgumentParser(description= - 'Gluon-NLP Big LSTM 2048-512 Language Model on GBW') -parser.add_argument('--save', type=str, default='model.params', - help='path to save the final model.') -parser.add_argument('--emsize', type=int, default=512, - help='size of word embeddings') -parser.add_argument('--nhid', type=int, default=2048, - help='number of hidden units per layer') -parser.add_argument('--nproj', type=int, default=512, - help='number of projection units per layer. 
Could be different from embsize') -parser.add_argument('--nlayers', type=int, default=1, - help='number of layers') -parser.add_argument('--from-epoch', type=int, default=None, - help='start training or testing from the provided epoch') -parser.add_argument('--epochs', type=int, default=50, - help='number of epoch for training') -parser.add_argument('--batch-size', type=int, default=128, - help='batch size per gpu') -parser.add_argument('--dropout', type=float, default=0.1, - help='dropout applied to layers (0 = no dropout)') -parser.add_argument('--eps', type=float, default=1, - help='initial history accumulation for adagrad') -parser.add_argument('--bptt', type=int, default=20, - help='sequence length') -parser.add_argument('--k', type=int, default=8192, - help='number of noise samples for estimation') -parser.add_argument('--gpus', type=str, - help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu.') -parser.add_argument('--log-interval', type=int, default=1000, - help='report interval') -parser.add_argument('--seed', type=int, default=0, - help='random seed') -parser.add_argument('--lr', type=float, default=0.2, - help='initial learning rate') -parser.add_argument('--clip', type=float, default=1.0, - help='gradient clipping by global norm.') -parser.add_argument('--test-mode', action='store_true', - help='Whether to run through the script with few examples') -parser.add_argument('--eval-only', action='store_true', - help='Whether to only run evaluation for the trained model') -args = parser.parse_args() - -segments = ['train', 'test'] -max_nbatch_eval = None - -if args.test_mode: - args.emsize = 200 - args.log_interval = 1 - args.nhid = 200 - args.nlayers = 1 - args.epochs = 20 - max_nbatch_eval = 3 - segments = ['test', 'test'] - -print(args) -mx.random.seed(args.seed) -np.random.seed(args.seed) - -context = [mx.cpu()] if args.gpus is None or args.gpus == '' else \ - [mx.gpu(int(x)) for x in args.gpus.split(',')] - -os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round' -os.environ['MXNET_CPU_PARALLEL_RAND_COPY'] = str(len(context)) -os.environ['MXNET_CPU_WORKER_NTHREADS'] = str(len(context)) - -############################################################################### -# Data stream -############################################################################### -train_data_stream, test_data_stream = \ - [nlp.data.GBWStream(segment=segment, skip_empty=True, bos=None, eos='') - for segment in segments] -vocab = train_data_stream.vocab -ntokens = len(vocab) - -# Sampler for generating negative classes during training with importance sampling -sampler = LogUniformSampler(ntokens, args.k) - -# Given a list of (array, context) pairs, load array[i] on context[i] -def _load(xs): - ret = [] - for x, ctx in zip(xs, context): - if isinstance(x, tuple): - ret.append([y.as_in_context(ctx) for y in x]) - else: - ret.append(x.as_in_context(ctx)) - return ret - -# Transformation for a data batch for training. -# First, load the data, target and mask to target contexts. -# Second, the LSTM-2048-512 model performs importance sampling for decoding -# during training, we need to sample negative candidate classes by invoking the -# log uniform sampler. 
-def _split_and_sample(x, y): - m = x != vocab[vocab.padding_token] # mask padding - num_ctx = len(context) - if num_ctx > 1: - xs = gluon.utils.split_data(x, num_ctx, batch_axis=1, even_split=True) - ys = gluon.utils.split_data(y, num_ctx, batch_axis=1, even_split=True) - ms = gluon.utils.split_data(m, num_ctx, batch_axis=1, even_split=True) - else: - xs, ys, ms = [x], [y], [m] - xs = _load(xs) - ys = _load(ys) - ms = _load(ms) - ss = [sampler(y) for y in ys] - ss = _load(ss) - return xs, ys, ms, ss - -train_batch_size = args.batch_size * len(context) -train_batchify = nlp.data.batchify.StreamBPTTBatchify(vocab, args.bptt, train_batch_size) -train_data = train_batchify(train_data_stream) -train_data = train_data.transform(_split_and_sample) - -test_batch_size = args.batch_size -test_batchify = nlp.data.batchify.StreamBPTTBatchify(vocab, args.bptt, test_batch_size) -test_data = test_batchify(test_data_stream) -test_data = nlp.data.PrefetchingStream(test_data) - -############################################################################### -# Build the model -############################################################################### - -model = nlp.model.language_model.train.BigRNN(ntokens, args.emsize, args.nhid, - args.nlayers, args.nproj, args.k, - embed_dropout=args.dropout, - encode_dropout=args.dropout) -eval_model = nlp.model.language_model.BigRNN(ntokens, args.emsize, args.nhid, - args.nlayers, args.nproj, - embed_dropout=args.dropout, - encode_dropout=args.dropout) - -loss = gluon.loss.SoftmaxCrossEntropyLoss() -model.initialize(mx.init.Xavier(factor_type='out'), ctx=context) -trainer_params = {'learning_rate': args.lr, 'wd': 0, 'eps': args.eps} -trainer = gluon.Trainer(model.collect_params(), 'adagrad', trainer_params) -if args.from_epoch: - from_epoch = args.from_epoch - checkpoint_name = '%s.%s'%(args.save, format(from_epoch - 1, '02d')) - model.load_parameters(checkpoint_name) - trainer.load_states('%s.state'%args.save) - print('Loaded parameters from checkpoint %s'%(checkpoint_name)) - - -model.hybridize(static_alloc=True, static_shape=True) - -train_metric = mx.metric.Loss(loss) -val_metric = LengthNormalizedLoss(loss) -batch_processor = ParallelLanguageModelBatchProcessor(loss=loss, - vocab=vocab, - batch_size=args.batch_size, - val_batch_size=args.batch_size) -lm_estimator = LanguageModelEstimator(net=model, loss=loss, - train_metrics=train_metric, - val_metrics=val_metric, - trainer=trainer, - context=context, - val_loss=loss, - val_net=eval_model, - batch_processor=batch_processor, - bptt=args.bptt) - -hidden_state_handler = HiddenStateHandler() -gradient_handler = LargeRNNGradientUpdateHandler(batch_size=args.batch_size, clip=args.clip) -metric_handler = MetricResetHandler(metrics=lm_estimator.train_metrics, - log_interval=args.log_interval) -checkpoint_handler = CheckpointHandler(model_dir=args.save, model_prefix='largeRNN') -logging_handler = LoggingHandler(log_interval=args.log_interval, - metrics=lm_estimator.train_metrics) - -event_handlers = [hidden_state_handler, gradient_handler, - metric_handler, checkpoint_handler, logging_handler] - -lm_estimator.fit(train_data=train_data, - #epochs=args.epochs, - event_handlers=event_handlers, - batches=5, - batch_axis=0) - -val_metric_handler = MetricResetHandler(metrics=lm_estimator.val_metrics) -lm_estimator.val_net.initialize(mx.init.Xavier(), ctx=context[0]) -lm_estimator.val_net.hybridize(static_alloc=True, static_shape=True) -lm_estimator.val_net.load_parameters(args.save + '/largeRNN-epoch0batch5.params') 
-lm_estimator.evaluate(val_data=test_data, event_handlers=[val_metric_handler]) diff --git a/scripts/estimator/word_language_model_estimator.py b/scripts/estimator/word_language_model_estimator.py deleted file mode 100644 index f14ea63030..0000000000 --- a/scripts/estimator/word_language_model_estimator.py +++ /dev/null @@ -1,259 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import argparse -import time -import math -import os -import sys -import mxnet as mx -from mxnet import gluon, autograd -import gluonnlp as nlp -from mxnet.gluon.contrib.estimator import LoggingHandler -from gluonnlp.estimator import JointActivationRegularizationLoss -from gluonnlp.estimator import LanguageModelEstimator -from gluonnlp.estimator import HiddenStateHandler, AvgParamHandler -from gluonnlp.estimator import LearningRateHandler, RNNGradientUpdateHandler -from gluonnlp.estimator import WordLanguageModelCheckpointHandler -from gluonnlp.estimator import LanguageModelBatchProcessor -from gluonnlp.estimator import MetricResetHandler -from mxnet.gluon.data.sampler import BatchSampler - -class BatchVariableLenTextSampler(BatchSampler): - def __init__(self, bptt, length, use_variable_length=True): - self.bptt = bptt - self.length = length - self.index = 0 - self.use_variable_length = use_variable_length - - def __iter__(self): - self.index = 0 - while self.index < self.length - 2: - if self.use_variable_length: - bptt = self.bptt if mx.nd.random.uniform().asscalar() < .95 else self.bptt / 2 - seq_len = max(5, int(mx.nd.random.normal(bptt, 5).asscalar())) - else: - seq_len = self.bptt - seq_len = min(seq_len, self.length - self.index - 1) - # batch_size = seq_len + 1 - batch = [] - for i in range(self.index, self.index + seq_len + 1): - batch.append(i) - self.index += seq_len - yield batch - - def __len__(self): - # you may never get real size of the data sampler beforehand. 
May need some - # postprocessing after fetching the data batch - return int(self.length / 5) + 1 - -curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) -sys.path.append(os.path.join(curr_path, '..', '..')) - -nlp.utils.check_version('0.7.0') - -parser = argparse.ArgumentParser(description= - 'MXNet Autograd RNN/LSTM Language Model on Wikitext-2.') -parser.add_argument('--model', type=str, default='lstm', - help='type of recurrent net (rnn_tanh, rnn_relu, lstm, gru)') -parser.add_argument('--emsize', type=int, default=400, - help='size of word embeddings') -parser.add_argument('--nhid', type=int, default=1150, - help='number of hidden units per layer') -parser.add_argument('--nlayers', type=int, default=3, - help='number of layers') -parser.add_argument('--lr', type=float, default=30, - help='initial learning rate') -parser.add_argument('--clip', type=float, default=0.25, - help='gradient clipping') -parser.add_argument('--epochs', type=int, default=750, - help='upper epoch limit') -parser.add_argument('--batch_size', type=int, default=80, metavar='N', - help='batch size') -parser.add_argument('--bptt', type=int, default=70, - help='sequence length') -parser.add_argument('--dropout', type=float, default=0.4, - help='dropout applied to layers (0 = no dropout)') -parser.add_argument('--dropout_h', type=float, default=0.2, - help='dropout applied to hidden layer (0 = no dropout)') -parser.add_argument('--dropout_i', type=float, default=0.65, - help='dropout applied to input layer (0 = no dropout)') -parser.add_argument('--dropout_e', type=float, default=0.1, - help='dropout applied to embedding layer (0 = no dropout)') -parser.add_argument('--weight_dropout', type=float, default=0.5, - help='weight dropout applied to h2h weight matrix (0 = no weight dropout)') -parser.add_argument('--tied', action='store_true', - help='tie the word embedding and softmax weights') -parser.add_argument('--log-interval', type=int, default=200, metavar='N', - help='report interval') -parser.add_argument('--save', type=str, default='model.params', - help='path to save the final model') -parser.add_argument('--eval_only', action='store_true', - help='Whether to only evaluate the trained model') -parser.add_argument('--gpu', type=str, help='single gpu id') -parser.add_argument('--optimizer', type=str, default='sgd', - help='optimizer to use (sgd, adam)') -parser.add_argument('--wd', type=float, default=1.2e-6, - help='weight decay applied to all weights') -parser.add_argument('--alpha', type=float, default=2, - help='alpha L2 regularization on RNN activation ' - '(alpha = 0 means no regularization)') -parser.add_argument('--beta', type=float, default=1, - help='beta slowness regularization applied on RNN activation ' - '(beta = 0 means no regularization)') -parser.add_argument('--ntasgd', action='store_true', - help='Whether to apply ntasgd') -parser.add_argument('--test_mode', action='store_true', - help='Whether to run through the script with few examples') -parser.add_argument('--lr_update_interval', type=int, default=30, - help='lr udpate interval') -parser.add_argument('--lr_update_factor', type=float, default=0.1, - help='lr udpate factor') -args = parser.parse_args() - -############################################################################### -# Load data -############################################################################### - -context = [mx.cpu()] if not args.gpu else [mx.gpu(int(args.gpu))] - -assert args.batch_size % len(context) == 0, \ - 'Total batch size must be 
multiple of the number of devices' - -assert args.weight_dropout > 0 or (args.weight_dropout == 0 and args.alpha == 0), \ - 'The alpha L2 regularization cannot be used with standard RNN, please set alpha to 0' - -train_dataset, val_dataset, test_dataset = \ - [nlp.data.WikiText2(segment=segment, - skip_empty=False, bos=None, eos='') - for segment in ['train', 'val', 'test']] - -vocab = nlp.Vocab(counter=nlp.data.Counter(train_dataset), padding_token=None, bos_token=None) -train_batchify = nlp.data.batchify.CorpusBatchify(vocab, args.batch_size) -train_data = train_batchify(train_dataset) -val_batch_size = 10 -val_batchify = nlp.data.batchify.CorpusBatchify(vocab, val_batch_size) -val_data = val_batchify(val_dataset) -test_batch_size = 1 -test_batchify = nlp.data.batchify.CorpusBatchify(vocab, test_batch_size) -test_data = test_batchify(test_dataset) - -if args.test_mode: - args.emsize = 200 - args.nhid = 200 - args.nlayers = 1 - args.epochs = 3 - train_data = train_data[0:100] - val_data = val_data[0:100] - test_data = test_data[0:100] - -print(args) - -############################################################################### -# Build the model -############################################################################### - -ntokens = len(vocab) - -if args.weight_dropout > 0: - print('Use AWDRNN') - model = nlp.model.train.AWDRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers, - args.tied, args.dropout, args.weight_dropout, - args.dropout_h, args.dropout_i, args.dropout_e) - model.initialize(mx.init.Xavier(), ctx=context) - model_eval = nlp.model.AWDRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers, - args.tied, args.dropout, args.weight_dropout, - args.dropout_h, args.dropout_i, args.dropout_e, - params=model.collect_params()) -else: - model = nlp.model.train.StandardRNN(args.model, len(vocab), args.emsize, - args.nhid, args.nlayers, args.dropout, args.tied) - model.initialize(mx.init.Xavier(), ctx=context) - model_eval = nlp.model.StandardRNN(args.model, len(vocab), args.emsize, - args.nhid, args.nlayers, args.dropout, args.tied, - params=model.collect_params()) - - -model.hybridize(static_alloc=True) - -print(model) - - -def check_initialized(net): - params = net.collect_params() - for param in params: - try: - params[param].list_ctx() - except RuntimeError: - return False - return True - -print(check_initialized(model)) -print(check_initialized(model_eval)) - -if args.optimizer == 'sgd': - trainer_params = {'learning_rate': args.lr, - 'momentum': 0, - 'wd': args.wd} -elif args.optimizer == 'adam': - trainer_params = {'learning_rate': args.lr, - 'wd': args.wd, - 'beta1': 0, - 'beta2': 0.999, - 'epsilon': 1e-9} - -trainer = gluon.Trainer(model.collect_params(), args.optimizer, trainer_params, - update_on_kvstore=False) - -loss = gluon.loss.SoftmaxCrossEntropyLoss() -train_loss = JointActivationRegularizationLoss(loss, args.alpha, args.beta) - -sampler = BatchVariableLenTextSampler(bptt=70, length=len(train_data)) -val_sampler = BatchVariableLenTextSampler(bptt=70, length=len(val_data), use_variable_length=False) -test_sampler = BatchVariableLenTextSampler(bptt=70, length=len(test_data), - use_variable_length=False) -train_data_loader = mx.gluon.data.DataLoader(train_data, - batch_sampler=sampler) -val_data_loader = mx.gluon.data.DataLoader(val_data, - batch_sampler=val_sampler) -test_data_loader = mx.gluon.data.DataLoader(test_data, - batch_sampler=test_sampler) - -train_metric = mx.metric.Loss(train_loss) -val_metric = mx.metric.Loss(loss) 
-batch_processor = LanguageModelBatchProcessor() -est = LanguageModelEstimator(net=model, loss=train_loss, - train_metrics=train_metric, - val_metrics=val_metric, - trainer=trainer, context=context, - val_loss=loss, - val_net=model_eval, - batch_processor=batch_processor) -event_handlers = [HiddenStateHandler(), AvgParamHandler(data_length=len(train_data)), - LearningRateHandler(lr_update_interval=args.lr_update_interval, lr_update_factor=args.lr_update_factor), - RNNGradientUpdateHandler(clip=args.clip), - LoggingHandler(log_interval=args.log_interval, metrics=est.train_metrics + est.val_metrics), - MetricResetHandler(metrics=est.train_metrics, log_interval=args.log_interval), - WordLanguageModelCheckpointHandler(args.save)] -est.fit(train_data=train_data_loader, val_data=val_data_loader, - epochs=args.epochs, - event_handlers=event_handlers, - batch_axis=1) - -est.net.load_parameters(args.save) -est.evaluate(val_data=val_data_loader, event_handlers=[HiddenStateHandler()], batch_axis=1) -est.evaluate(val_data=test_data_loader, event_handlers=[HiddenStateHandler()], batch_axis=1) diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index d1be813fad..c1b790a127 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -119,7 +119,7 @@ def fit_batch(self, estimator, train_batch, batch_axis=0): estimator.hiddens[index] = hidden Ls.append(ls) - Ls = [l / (estimator.bptt * len(estimator.context)) for l in Ls] + Ls = [l / estimator.bptt for l in Ls] Ls = [mx.nd.sum(l) for l in Ls] return data, target, None, Ls diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index b9ceca599e..d7dda14080 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -21,11 +21,12 @@ import copy import warnings +import time import mxnet as mx from mxnet.gluon.contrib.estimator import TrainBegin, TrainEnd, EpochBegin from mxnet.gluon.contrib.estimator import EpochEnd, BatchBegin, BatchEnd -from mxnet.gluon.contrib.estimator import GradientUpdateHandler +from mxnet.gluon.contrib.estimator import GradientUpdateHandler, LoggingHandler from mxnet.gluon.contrib.estimator import MetricHandler from mxnet.gluon.utils import clip_global_norm from mxnet.metric import Loss as MetricLoss @@ -33,7 +34,7 @@ __all__ = ['HiddenStateHandler', 'AvgParamHandler', 'LearningRateHandler', 'RNNGradientUpdateHandler', 'MetricResetHandler', - 'WordLanguageModelCheckpointHandler', + 'WordLanguageModelCheckpointHandler', 'ParallelLoggingHandler', 'LargeRNNGradientUpdateHandler'] class HiddenStateHandler(EpochBegin): @@ -212,3 +213,25 @@ def epoch_end(self, estimator, *args, **kwargs): estimator.net.save_parameters(self.save) +class ParallelLoggingHandler(LoggingHandler): + def __init__(self, *args, **kwargs): + super(ParallelLoggingHandler, self).__init__(*args, **kwargs) + + def batch_end(self, estimator, *args, **kwargs): + if isinstance(self.log_interval, int): + batch_time = time.time() - self.batch_start + msg = '[Epoch %d][Batch %d]' % (self.current_epoch, self.batch_index) + cur_batches = kwargs['batch'][0] + for batch in cur_batches: + self.processed_samples += batch.shape[0] + msg += '[Samples %s]' % (self.processed_samples) + self.log_interval_time += batch_time + if self.batch_index % self.log_interval == self.log_interval - 1: + msg += 
'time/interval %.3fs ' % self.log_interval_time + self.log_interval_time = 0 + for metric in self.metrics: + name, val = metric.get() + msg += '%s: %.4f, ' % (name, val) + estimator.logger.info(msg.rstrip(', ')) + self.batch_index += 1 + From 8780711673e527304f5804d5cfa0a645bd926c64 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 12 Feb 2020 16:17:32 +0000 Subject: [PATCH 16/32] add script files --- .../large_word_language_model_estimator.py | 235 ++++++++++++++++ .../word_language_model_estimator.py | 259 ++++++++++++++++++ 2 files changed, 494 insertions(+) create mode 100644 scripts/language_model/large_word_language_model_estimator.py create mode 100644 scripts/language_model/word_language_model_estimator.py diff --git a/scripts/language_model/large_word_language_model_estimator.py b/scripts/language_model/large_word_language_model_estimator.py new file mode 100644 index 0000000000..d80da1b4d8 --- /dev/null +++ b/scripts/language_model/large_word_language_model_estimator.py @@ -0,0 +1,235 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import time +import math +import os +import sys +import argparse +import numpy as np +import mxnet as mx +from mxnet import gluon, autograd +from mxnet.gluon.contrib.estimator import CheckpointHandler, LoggingHandler +import gluonnlp as nlp +from gluonnlp.utils import Parallel, Parallelizable +from sampler import LogUniformSampler +from gluonnlp.estimator import ParallelLanguageModelBatchProcessor +from gluonnlp.estimator import HiddenStateHandler, MetricResetHandler +from gluonnlp.estimator import LargeRNNGradientUpdateHandler +from gluonnlp.estimator import WordLanguageModelCheckpointHandler +from gluonnlp.estimator import LanguageModelEstimator +from gluonnlp.estimator import ParallelLoggingHandler +from gluonnlp.estimator.length_normalized_loss import LengthNormalizedLoss + +curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +sys.path.append(os.path.join(curr_path, '..', '..')) + +nlp.utils.check_version('0.7.0') + +############################################################################### +# Arg parser +############################################################################### +parser = argparse.ArgumentParser(description= + 'Gluon-NLP Big LSTM 2048-512 Language Model on GBW') +parser.add_argument('--save', type=str, default='model.params', + help='path to save the final model.') +parser.add_argument('--emsize', type=int, default=512, + help='size of word embeddings') +parser.add_argument('--nhid', type=int, default=2048, + help='number of hidden units per layer') +parser.add_argument('--nproj', type=int, default=512, + help='number of projection units per layer. 
Could be different from embsize') +parser.add_argument('--nlayers', type=int, default=1, + help='number of layers') +parser.add_argument('--from-epoch', type=int, default=None, + help='start training or testing from the provided epoch') +parser.add_argument('--epochs', type=int, default=50, + help='number of epoch for training') +parser.add_argument('--batch-size', type=int, default=128, + help='batch size per gpu') +parser.add_argument('--dropout', type=float, default=0.1, + help='dropout applied to layers (0 = no dropout)') +parser.add_argument('--eps', type=float, default=1, + help='initial history accumulation for adagrad') +parser.add_argument('--bptt', type=int, default=20, + help='sequence length') +parser.add_argument('--k', type=int, default=8192, + help='number of noise samples for estimation') +parser.add_argument('--gpus', type=str, + help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu.') +parser.add_argument('--log-interval', type=int, default=1000, + help='report interval') +parser.add_argument('--seed', type=int, default=0, + help='random seed') +parser.add_argument('--lr', type=float, default=0.2, + help='initial learning rate') +parser.add_argument('--clip', type=float, default=1.0, + help='gradient clipping by global norm.') +parser.add_argument('--test-mode', action='store_true', + help='Whether to run through the script with few examples') +parser.add_argument('--eval-only', action='store_true', + help='Whether to only run evaluation for the trained model') +args = parser.parse_args() + +segments = ['train', 'test'] +max_nbatch_eval = None + +if args.test_mode: + args.emsize = 200 + args.log_interval = 1 + args.nhid = 200 + args.nlayers = 1 + args.epochs = 20 + max_nbatch_eval = 3 + segments = ['test', 'test'] + +print(args) +mx.random.seed(args.seed) +np.random.seed(args.seed) + +context = [mx.cpu()] if args.gpus is None or args.gpus == '' else \ + [mx.gpu(int(x)) for x in args.gpus.split(',')] + +os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round' +os.environ['MXNET_CPU_PARALLEL_RAND_COPY'] = str(len(context)) +os.environ['MXNET_CPU_WORKER_NTHREADS'] = str(len(context)) + +############################################################################### +# Data stream +############################################################################### +train_data_stream, test_data_stream = \ + [nlp.data.GBWStream(segment=segment, skip_empty=True, bos=None, eos='') + for segment in segments] +vocab = train_data_stream.vocab +ntokens = len(vocab) + +# Sampler for generating negative classes during training with importance sampling +sampler = LogUniformSampler(ntokens, args.k) + +# Given a list of (array, context) pairs, load array[i] on context[i] +def _load(xs): + ret = [] + for x, ctx in zip(xs, context): + if isinstance(x, tuple): + ret.append([y.as_in_context(ctx) for y in x]) + else: + ret.append(x.as_in_context(ctx)) + return ret + +# Transformation for a data batch for training. +# First, load the data, target and mask to target contexts. +# Second, the LSTM-2048-512 model performs importance sampling for decoding +# during training, we need to sample negative candidate classes by invoking the +# log uniform sampler. 
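+# Sketch of the per-device output of this transform (shapes assume the
+# StreamBPTTBatchify settings above and may differ if those change):
+#   xs[i], ys[i], ms[i] : (bptt, batch_size) data, target and padding mask,
+#                         loaded on context[i]
+#   ss[i]               : (sampled_classes, expected_count_sampled,
+#                          expected_count_true) from LogUniformSampler(ntokens, args.k)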
+def _split_and_sample(x, y): + m = x != vocab[vocab.padding_token] # mask padding + num_ctx = len(context) + if num_ctx > 1: + xs = gluon.utils.split_data(x, num_ctx, batch_axis=1, even_split=True) + ys = gluon.utils.split_data(y, num_ctx, batch_axis=1, even_split=True) + ms = gluon.utils.split_data(m, num_ctx, batch_axis=1, even_split=True) + else: + xs, ys, ms = [x], [y], [m] + xs = _load(xs) + ys = _load(ys) + ms = _load(ms) + ss = [sampler(y) for y in ys] + ss = _load(ss) + return xs, ys, ms, ss + +train_batch_size = args.batch_size * len(context) +train_batchify = nlp.data.batchify.StreamBPTTBatchify(vocab, args.bptt, train_batch_size) +train_data = train_batchify(train_data_stream) +train_data = train_data.transform(_split_and_sample) + +test_batch_size = args.batch_size +test_batchify = nlp.data.batchify.StreamBPTTBatchify(vocab, args.bptt, test_batch_size) +test_data = test_batchify(test_data_stream) +test_data = nlp.data.PrefetchingStream(test_data) + +############################################################################### +# Build the model +############################################################################### + +model = nlp.model.language_model.train.BigRNN(ntokens, args.emsize, args.nhid, + args.nlayers, args.nproj, args.k, + embed_dropout=args.dropout, + encode_dropout=args.dropout) +eval_model = nlp.model.language_model.BigRNN(ntokens, args.emsize, args.nhid, + args.nlayers, args.nproj, + embed_dropout=args.dropout, + encode_dropout=args.dropout) + +loss = gluon.loss.SoftmaxCrossEntropyLoss() +model.initialize(mx.init.Xavier(factor_type='out'), ctx=context) +trainer_params = {'learning_rate': args.lr, 'wd': 0, 'eps': args.eps} +trainer = gluon.Trainer(model.collect_params(), 'adagrad', trainer_params) +if args.from_epoch: + from_epoch = args.from_epoch + checkpoint_name = '%s.%s'%(args.save, format(from_epoch - 1, '02d')) + model.load_parameters(checkpoint_name) + trainer.load_states('%s.state'%args.save) + print('Loaded parameters from checkpoint %s'%(checkpoint_name)) + + +model.hybridize(static_alloc=True, static_shape=True) + +train_metric = mx.metric.Loss(loss) +val_metric = LengthNormalizedLoss(loss) +batch_processor = ParallelLanguageModelBatchProcessor(loss=loss, + vocab=vocab, + batch_size=args.batch_size, + val_batch_size=args.batch_size) +lm_estimator = LanguageModelEstimator(net=model, loss=loss, + train_metrics=train_metric, + val_metrics=val_metric, + trainer=trainer, + context=context, + val_loss=loss, + val_net=eval_model, + batch_processor=batch_processor, + bptt=args.bptt) + +hidden_state_handler = HiddenStateHandler() +gradient_handler = LargeRNNGradientUpdateHandler(batch_size=args.batch_size, clip=args.clip) +metric_handler = MetricResetHandler(metrics=lm_estimator.train_metrics, + log_interval=args.log_interval) +checkpoint_handler = CheckpointHandler(model_dir=args.save, model_prefix='largeRNN') +logging_handler = ParallelLoggingHandler(log_interval=args.log_interval, + metrics=lm_estimator.train_metrics) +val_logging_handler = LoggingHandler(log_interval=args.log_interval, + metrics=lm_estimator.val_metrics) + +event_handlers = [hidden_state_handler, gradient_handler, + metric_handler, checkpoint_handler, logging_handler] + +if not args.eval_only: + lm_estimator.fit(train_data=train_data, + epochs=args.epochs, + event_handlers=event_handlers, + #batches=5, + batch_axis=0) + +val_metric_handler = MetricResetHandler(metrics=lm_estimator.val_metrics) +lm_estimator.val_net.initialize(mx.init.Xavier(), ctx=context[0]) 
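+# Evaluation below starts from freshly initialized parameters and then loads,
+# per epoch, the checkpoints written by CheckpointHandler during training;
+# val_net does not share parameters with the training net directly, only
+# through those checkpoint files.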
+lm_estimator.val_net.hybridize(static_alloc=True, static_shape=True) + +for epoch_id in range(args.epochs): + total_batch = 78028 + checkpoint_path = args.save + '/largeRNN-epoch%dbatch%d.params' % (epoch_id, total_batch) + lm_estimator.val_net.load_parameters(checkpoint_path) + lm_estimator.evaluate(val_data=test_data, event_handlers=[val_metric_handler, val_logging_handler]) diff --git a/scripts/language_model/word_language_model_estimator.py b/scripts/language_model/word_language_model_estimator.py new file mode 100644 index 0000000000..f14ea63030 --- /dev/null +++ b/scripts/language_model/word_language_model_estimator.py @@ -0,0 +1,259 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import argparse +import time +import math +import os +import sys +import mxnet as mx +from mxnet import gluon, autograd +import gluonnlp as nlp +from mxnet.gluon.contrib.estimator import LoggingHandler +from gluonnlp.estimator import JointActivationRegularizationLoss +from gluonnlp.estimator import LanguageModelEstimator +from gluonnlp.estimator import HiddenStateHandler, AvgParamHandler +from gluonnlp.estimator import LearningRateHandler, RNNGradientUpdateHandler +from gluonnlp.estimator import WordLanguageModelCheckpointHandler +from gluonnlp.estimator import LanguageModelBatchProcessor +from gluonnlp.estimator import MetricResetHandler +from mxnet.gluon.data.sampler import BatchSampler + +class BatchVariableLenTextSampler(BatchSampler): + def __init__(self, bptt, length, use_variable_length=True): + self.bptt = bptt + self.length = length + self.index = 0 + self.use_variable_length = use_variable_length + + def __iter__(self): + self.index = 0 + while self.index < self.length - 2: + if self.use_variable_length: + bptt = self.bptt if mx.nd.random.uniform().asscalar() < .95 else self.bptt / 2 + seq_len = max(5, int(mx.nd.random.normal(bptt, 5).asscalar())) + else: + seq_len = self.bptt + seq_len = min(seq_len, self.length - self.index - 1) + # batch_size = seq_len + 1 + batch = [] + for i in range(self.index, self.index + seq_len + 1): + batch.append(i) + self.index += seq_len + yield batch + + def __len__(self): + # you may never get real size of the data sampler beforehand. 
May need some + # postprocessing after fetching the data batch + return int(self.length / 5) + 1 + +curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +sys.path.append(os.path.join(curr_path, '..', '..')) + +nlp.utils.check_version('0.7.0') + +parser = argparse.ArgumentParser(description= + 'MXNet Autograd RNN/LSTM Language Model on Wikitext-2.') +parser.add_argument('--model', type=str, default='lstm', + help='type of recurrent net (rnn_tanh, rnn_relu, lstm, gru)') +parser.add_argument('--emsize', type=int, default=400, + help='size of word embeddings') +parser.add_argument('--nhid', type=int, default=1150, + help='number of hidden units per layer') +parser.add_argument('--nlayers', type=int, default=3, + help='number of layers') +parser.add_argument('--lr', type=float, default=30, + help='initial learning rate') +parser.add_argument('--clip', type=float, default=0.25, + help='gradient clipping') +parser.add_argument('--epochs', type=int, default=750, + help='upper epoch limit') +parser.add_argument('--batch_size', type=int, default=80, metavar='N', + help='batch size') +parser.add_argument('--bptt', type=int, default=70, + help='sequence length') +parser.add_argument('--dropout', type=float, default=0.4, + help='dropout applied to layers (0 = no dropout)') +parser.add_argument('--dropout_h', type=float, default=0.2, + help='dropout applied to hidden layer (0 = no dropout)') +parser.add_argument('--dropout_i', type=float, default=0.65, + help='dropout applied to input layer (0 = no dropout)') +parser.add_argument('--dropout_e', type=float, default=0.1, + help='dropout applied to embedding layer (0 = no dropout)') +parser.add_argument('--weight_dropout', type=float, default=0.5, + help='weight dropout applied to h2h weight matrix (0 = no weight dropout)') +parser.add_argument('--tied', action='store_true', + help='tie the word embedding and softmax weights') +parser.add_argument('--log-interval', type=int, default=200, metavar='N', + help='report interval') +parser.add_argument('--save', type=str, default='model.params', + help='path to save the final model') +parser.add_argument('--eval_only', action='store_true', + help='Whether to only evaluate the trained model') +parser.add_argument('--gpu', type=str, help='single gpu id') +parser.add_argument('--optimizer', type=str, default='sgd', + help='optimizer to use (sgd, adam)') +parser.add_argument('--wd', type=float, default=1.2e-6, + help='weight decay applied to all weights') +parser.add_argument('--alpha', type=float, default=2, + help='alpha L2 regularization on RNN activation ' + '(alpha = 0 means no regularization)') +parser.add_argument('--beta', type=float, default=1, + help='beta slowness regularization applied on RNN activation ' + '(beta = 0 means no regularization)') +parser.add_argument('--ntasgd', action='store_true', + help='Whether to apply ntasgd') +parser.add_argument('--test_mode', action='store_true', + help='Whether to run through the script with few examples') +parser.add_argument('--lr_update_interval', type=int, default=30, + help='lr udpate interval') +parser.add_argument('--lr_update_factor', type=float, default=0.1, + help='lr udpate factor') +args = parser.parse_args() + +############################################################################### +# Load data +############################################################################### + +context = [mx.cpu()] if not args.gpu else [mx.gpu(int(args.gpu))] + +assert args.batch_size % len(context) == 0, \ + 'Total batch size must be 
multiple of the number of devices' + +assert args.weight_dropout > 0 or (args.weight_dropout == 0 and args.alpha == 0), \ + 'The alpha L2 regularization cannot be used with standard RNN, please set alpha to 0' + +train_dataset, val_dataset, test_dataset = \ + [nlp.data.WikiText2(segment=segment, + skip_empty=False, bos=None, eos='') + for segment in ['train', 'val', 'test']] + +vocab = nlp.Vocab(counter=nlp.data.Counter(train_dataset), padding_token=None, bos_token=None) +train_batchify = nlp.data.batchify.CorpusBatchify(vocab, args.batch_size) +train_data = train_batchify(train_dataset) +val_batch_size = 10 +val_batchify = nlp.data.batchify.CorpusBatchify(vocab, val_batch_size) +val_data = val_batchify(val_dataset) +test_batch_size = 1 +test_batchify = nlp.data.batchify.CorpusBatchify(vocab, test_batch_size) +test_data = test_batchify(test_dataset) + +if args.test_mode: + args.emsize = 200 + args.nhid = 200 + args.nlayers = 1 + args.epochs = 3 + train_data = train_data[0:100] + val_data = val_data[0:100] + test_data = test_data[0:100] + +print(args) + +############################################################################### +# Build the model +############################################################################### + +ntokens = len(vocab) + +if args.weight_dropout > 0: + print('Use AWDRNN') + model = nlp.model.train.AWDRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers, + args.tied, args.dropout, args.weight_dropout, + args.dropout_h, args.dropout_i, args.dropout_e) + model.initialize(mx.init.Xavier(), ctx=context) + model_eval = nlp.model.AWDRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers, + args.tied, args.dropout, args.weight_dropout, + args.dropout_h, args.dropout_i, args.dropout_e, + params=model.collect_params()) +else: + model = nlp.model.train.StandardRNN(args.model, len(vocab), args.emsize, + args.nhid, args.nlayers, args.dropout, args.tied) + model.initialize(mx.init.Xavier(), ctx=context) + model_eval = nlp.model.StandardRNN(args.model, len(vocab), args.emsize, + args.nhid, args.nlayers, args.dropout, args.tied, + params=model.collect_params()) + + +model.hybridize(static_alloc=True) + +print(model) + + +def check_initialized(net): + params = net.collect_params() + for param in params: + try: + params[param].list_ctx() + except RuntimeError: + return False + return True + +print(check_initialized(model)) +print(check_initialized(model_eval)) + +if args.optimizer == 'sgd': + trainer_params = {'learning_rate': args.lr, + 'momentum': 0, + 'wd': args.wd} +elif args.optimizer == 'adam': + trainer_params = {'learning_rate': args.lr, + 'wd': args.wd, + 'beta1': 0, + 'beta2': 0.999, + 'epsilon': 1e-9} + +trainer = gluon.Trainer(model.collect_params(), args.optimizer, trainer_params, + update_on_kvstore=False) + +loss = gluon.loss.SoftmaxCrossEntropyLoss() +train_loss = JointActivationRegularizationLoss(loss, args.alpha, args.beta) + +sampler = BatchVariableLenTextSampler(bptt=70, length=len(train_data)) +val_sampler = BatchVariableLenTextSampler(bptt=70, length=len(val_data), use_variable_length=False) +test_sampler = BatchVariableLenTextSampler(bptt=70, length=len(test_data), + use_variable_length=False) +train_data_loader = mx.gluon.data.DataLoader(train_data, + batch_sampler=sampler) +val_data_loader = mx.gluon.data.DataLoader(val_data, + batch_sampler=val_sampler) +test_data_loader = mx.gluon.data.DataLoader(test_data, + batch_sampler=test_sampler) + +train_metric = mx.metric.Loss(train_loss) +val_metric = mx.metric.Loss(loss) 
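+# Note on layout: CorpusBatchify arranges the corpus as (time, batch_size) and
+# BatchVariableLenTextSampler indexes along the time dimension, so each batch
+# from the DataLoader is time-major with the batch dimension on axis 1; this is
+# why fit()/evaluate() below are called with batch_axis=1.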
+batch_processor = LanguageModelBatchProcessor() +est = LanguageModelEstimator(net=model, loss=train_loss, + train_metrics=train_metric, + val_metrics=val_metric, + trainer=trainer, context=context, + val_loss=loss, + val_net=model_eval, + batch_processor=batch_processor) +event_handlers = [HiddenStateHandler(), AvgParamHandler(data_length=len(train_data)), + LearningRateHandler(lr_update_interval=args.lr_update_interval, lr_update_factor=args.lr_update_factor), + RNNGradientUpdateHandler(clip=args.clip), + LoggingHandler(log_interval=args.log_interval, metrics=est.train_metrics + est.val_metrics), + MetricResetHandler(metrics=est.train_metrics, log_interval=args.log_interval), + WordLanguageModelCheckpointHandler(args.save)] +est.fit(train_data=train_data_loader, val_data=val_data_loader, + epochs=args.epochs, + event_handlers=event_handlers, + batch_axis=1) + +est.net.load_parameters(args.save) +est.evaluate(val_data=val_data_loader, event_handlers=[HiddenStateHandler()], batch_axis=1) +est.evaluate(val_data=test_data_loader, event_handlers=[HiddenStateHandler()], batch_axis=1) From 757354c2fb6bb122e88d481431f59a1132402540 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 12 Feb 2020 16:20:18 +0000 Subject: [PATCH 17/32] remove files --- scripts/estimator/sampler.py | 109 ----------------------------------- 1 file changed, 109 deletions(-) delete mode 100644 scripts/estimator/sampler.py diff --git a/scripts/estimator/sampler.py b/scripts/estimator/sampler.py deleted file mode 100644 index f841fba160..0000000000 --- a/scripts/estimator/sampler.py +++ /dev/null @@ -1,109 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Log Uniform Candidate Sampler""" - -import math -import numpy as np -from mxnet import ndarray, gluon - - -class LogUniformSampler(gluon.block.Block): - """Draw random samples from an approximately log-uniform or Zipfian distribution. - - This operation randomly samples *num_sampled* candidates the range of integers [0, range_max). - The elements of sampled_candidates are drawn without replacement from the base distribution. - - The base distribution for this operator is an approximately log-uniform or Zipfian distribution: - - P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1) - - This sampler is useful when the true classes approximately follow such a distribution. - - For example, if the classes represent words in a lexicon sorted in decreasing order of - frequency. If your classes are not ordered by decreasing frequency, do not use this op. - - Additionally, it also returns the number of times each of the - true classes and the sampled classes is expected to occur. - - As the candidates are drawn without replacement, the expected count for the sampled candidates - and true classes are approximated. 
If the candidates are drawn with `num_tries` draws, we assume - (falsely) that the number of tries to get a batch of batch_size distinct values is always - `num_tries`, and the probability that the value is in a batch is 1 - (1-p)**num_tries. - - Parameters - ---------- - num_sampled: int - The number of classes to randomly sample. - range_max: int - The number of possible classes. - dtype: str or np.dtype - The dtype for outputs - """ - def __init__(self, range_max, num_sampled, dtype=None, **kwargs): - super(LogUniformSampler, self).__init__(**kwargs) - self._num_sampled = num_sampled - self._log_range = math.log(range_max + 1) - self._dtype = np.float32 if dtype is None else dtype - self._range_max = range_max - - def _prob_helper(self, num_tries, prob): - return (num_tries.astype('float64') * (-prob).log1p()).expm1() * -1 - - def forward(self, true_classes): # pylint: disable=arguments-differ - """Draw samples from log uniform distribution and returns sampled candidates, - expected count for true classes and sampled classes. - - Parameters - ---------- - true_classes: NDArray - The true classes. - - Returns - ------- - samples: NDArray - The sampled candidate classes. - expected_count_sample: NDArray - The expected count for sampled candidates. - expected_count_true: NDArray - The expected count for true classes in the same shape as `true_classes`. - """ - num_sampled = self._num_sampled - ctx = true_classes.context - num_tries = 0 - log_range = math.log(self._range_max + 1) - - # sample candidates - f = ndarray._internal._sample_unique_zipfian - sampled_classes, num_tries = f(self._range_max, shape=(1, num_sampled)) - sampled_classes = sampled_classes.reshape((-1,)) - sampled_classes = sampled_classes.as_in_context(ctx) - num_tries = num_tries.as_in_context(ctx) - - # expected count for true classes - true_cls = true_classes.as_in_context(ctx).astype('float64') - prob_true = ((true_cls + 2.0) / (true_cls + 1.0)).log() / log_range - count_true = self._prob_helper(num_tries, prob_true) - # expected count for sampled classes - sampled_classes = ndarray.array(sampled_classes, ctx=ctx, dtype='int64') - sampled_cls_fp64 = sampled_classes.astype('float64') - prob_sampled = ((sampled_cls_fp64 + 2.0) / (sampled_cls_fp64 + 1.0)).log() / log_range - count_sampled = self._prob_helper(num_tries, prob_sampled) - # convert to dtype - sampled_classes = sampled_classes.astype(self._dtype, copy=False) - count_true = count_true.astype(self._dtype, copy=False) - count_sampled = count_sampled.astype(self._dtype, copy=False) - return sampled_classes, count_sampled, count_true From 3f0862750107e9258635fdd4791a02b8aa84ec0c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 13 Feb 2020 03:23:51 +0000 Subject: [PATCH 18/32] modify loading the checkpoint --- .../large_word_language_model_estimator.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/language_model/large_word_language_model_estimator.py b/scripts/language_model/large_word_language_model_estimator.py index d80da1b4d8..ce6d87e3c2 100644 --- a/scripts/language_model/large_word_language_model_estimator.py +++ b/scripts/language_model/large_word_language_model_estimator.py @@ -20,6 +20,8 @@ import os import sys import argparse +import re + import numpy as np import mxnet as mx from mxnet import gluon, autograd @@ -229,7 +231,9 @@ def _split_and_sample(x, y): lm_estimator.val_net.hybridize(static_alloc=True, static_shape=True) for epoch_id in range(args.epochs): - total_batch = 78028 - checkpoint_path = args.save + 
'/largeRNN-epoch%dbatch%d.params' % (epoch_id, total_batch) - lm_estimator.val_net.load_parameters(checkpoint_path) - lm_estimator.evaluate(val_data=test_data, event_handlers=[val_metric_handler, val_logging_handler]) + for filename in os.listdir(args.save): + file_pattern = 'largeRNN-epoch%dbatch\d+.params' % (epoch_id) + if re.match(file_pattern + '',filename): + checkpoint_path = args.save + '/' + filename + lm_estimator.val_net.load_parameters(checkpoint_path) + lm_estimator.evaluate(val_data=test_data, event_handlers=[val_metric_handler, val_logging_handler]) From 48dc1e43513b731b39ef1716762e8280940807f9 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 13 Feb 2020 10:03:58 +0000 Subject: [PATCH 19/32] Add todo lists for event handlers --- .../estimator/language_model_event_handler.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index d7dda14080..1f4c32eece 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -44,7 +44,11 @@ def __init__(self): def epoch_begin(self, estimator, *args, **kwargs): estimator.hiddens = None estimator.val_hiddens = None - + +"""TODO: Implement a general average parameter handler or rename it with + NTASGD average parameter handler + +""" class AvgParamHandler(BatchEnd, EpochEnd): def __init__(self, data_length): self.epoch_id = 0 @@ -92,6 +96,9 @@ def epoch_end(self, estimator, *args, **kwargs): self.batch_id = 0 self.epoch_id += 1 +"""TODO: Can we replace learning rate handler with learning rate scheduler + Problem: Learning rate scheduler cannot take feedback from each iteration +""" class LearningRateHandler(BatchBegin, BatchEnd, EpochEnd): def __init__(self, lr_update_interval=30, lr_update_factor=0.1): self.lr_batch_start = 0 @@ -159,6 +166,10 @@ def batch_end(self, estimator, *args, **kwargs): estimator.trainer.step(len(estimator.context)) +"""This event handler reset local metrics for each few iterations + + TODO: shall we move the lengthnormalizedloss part out to be an independent handler +""" class MetricResetHandler(BatchBegin, MetricHandler): def __init__(self, metrics, log_interval=None): super().__init__(metrics=metrics) From 7ac114a45da9108f06a4709e39bc617146597527 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 13 Feb 2020 10:12:10 +0000 Subject: [PATCH 20/32] update index.rst --- scripts/language_model/index.rst | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/scripts/language_model/index.rst b/scripts/language_model/index.rst index 9a69f347e0..b82c8b4fc5 100644 --- a/scripts/language_model/index.rst +++ b/scripts/language_model/index.rst @@ -47,35 +47,35 @@ The dataset used for training the models is wikitext-2. For all the above model settings, we set Tied = True and NTASGD = True . -[1] awd_lstm_lm_1150_wikitext-2 (Val PPL 68.71 Test PPL 65.62 ) +[1] awd_lstm_lm_1150_wikitext-2 (Val PPL 68.52 Test PPL 65.68 ) .. code-block:: console - $ python word_language_model.py --gpu 0 --tied --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save awd_lstm_lm_1150_wikitext-2 + $ python word_language_model_estimator.py --gpu 0 --tied --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save awd_lstm_lm_1150_wikitext-2 -[2] awd_lstm_lm_600_wikitext-2 (Val PPL 84.89 Test PPL 80.67) +[2] awd_lstm_lm_600_wikitext-2 (Val PPL 83.92 Test PPL 80.09) .. 
code-block:: console - $ python word_language_model.py --gpu 0 --emsize 200 --nhid 600 --epochs 750 --dropout 0.2 --dropout_h 0.1 --dropout_i 0.3 --dropout_e 0.05 --weight_drop 0.2 --tied --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save awd_lstm_lm_600_wikitext-2 + $ python word_language_model_estimator.py --gpu 0 --emsize 200 --nhid 600 --epochs 750 --dropout 0.2 --dropout_h 0.1 --dropout_i 0.3 --dropout_e 0.05 --weight_drop 0.2 --tied --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save awd_lstm_lm_600_wikitext-2 -[3] standard_lstm_lm_1500_wikitext-2 (Val PPL 86.51 Test PPL 82.29) +[3] standard_lstm_lm_1500_wikitext-2 (Val PPL 85.23 Test PPL 81.44) .. code-block:: console - $ python word_language_model.py --gpu 0 --emsize 1500 --nhid 1500 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.65 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_1500_wikitext-2 + $ python word_language_model_estimator.py --gpu 0 --emsize 1500 --nhid 1500 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.65 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_1500_wikitext-2 -[4] standard_lstm_lm_650_wikitext-2 (Val PPL 90.96 Test PPL 86.91) +[4] standard_lstm_lm_650_wikitext-2 (Val PPL 94.51 Test PPL 90.28) .. code-block:: console - $ python word_language_model.py --gpu 0 --emsize 650 --nhid 650 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.5 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_650_wikitext-2 + $ python word_language_model_estimator.py --gpu 0 --emsize 650 --nhid 650 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.5 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_650_wikitext-2 -[5] standard_lstm_lm_200_wikitext-2 (Val PPL 107.59 Test PPL 101.64) +[5] standard_lstm_lm_200_wikitext-2 (Val PPL 107.44 Test PPL 101.19) .. code-block:: console - $ python word_language_model.py --gpu 0 --emsize 200 --nhid 200 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.2 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_200_wikitext-2 + $ python word_language_model_estimator.py --gpu 0 --emsize 200 --nhid 200 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.2 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_200_wikitext-2 Cache Language Model ~~~~~~~~~~~~~~~~~~~~~ @@ -181,8 +181,8 @@ The dataset used for training the models is Google's 1 billion words dataset. .. 
code-block:: console - $ python large_word_language_model.py --gpus 0,1,2,3 --clip=10 - $ python large_word_language_model.py --gpus 4 --eval-only --batch-size=1 + $ python large_word_language_model_estimator.py --gpus 0,1,2,3 --clip=10 + $ python large_word_language_model_estimator.py --gpus 4 --eval-only --batch-size=1 XLNet: Generalized Autoregressive Pretraining for Language Understanding From 091645246039939fff8f5a0ab02dcbcf26be6caa Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 13 Feb 2020 10:28:26 +0000 Subject: [PATCH 21/32] remove temp files --- .../large_word_language_model_estimator.py | 2 +- .../language_model_batch_processor.py | 2 +- .../estimator/language_model_event_handler.py | 2 +- .../estimator/length_normalized_loss.py | 77 ------------------- .../estimator/parallel_language_model.py | 43 ----------- 5 files changed, 3 insertions(+), 123 deletions(-) delete mode 100644 src/gluonnlp/estimator/length_normalized_loss.py delete mode 100644 src/gluonnlp/estimator/parallel_language_model.py diff --git a/scripts/language_model/large_word_language_model_estimator.py b/scripts/language_model/large_word_language_model_estimator.py index ce6d87e3c2..1ebbe95232 100644 --- a/scripts/language_model/large_word_language_model_estimator.py +++ b/scripts/language_model/large_word_language_model_estimator.py @@ -35,7 +35,7 @@ from gluonnlp.estimator import WordLanguageModelCheckpointHandler from gluonnlp.estimator import LanguageModelEstimator from gluonnlp.estimator import ParallelLoggingHandler -from gluonnlp.estimator.length_normalized_loss import LengthNormalizedLoss +from gluonnlp.metric.length_normalized_loss import LengthNormalizedLoss curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.append(os.path.join(curr_path, '..', '..')) diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index c1b790a127..9582d747c6 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -23,7 +23,7 @@ from mxnet.gluon.contrib.estimator import BatchProcessor from mxnet.gluon.utils import split_and_load from ..utils import Parallel -from .parallel_language_model import ParallelBigRNN +from ..model.train.language_model import ParallelBigRNN __all__ = ['LanguageModelBatchProcessor', 'ParallelLanguageModelBatchProcessor'] diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index 1f4c32eece..8287754704 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -30,7 +30,7 @@ from mxnet.gluon.contrib.estimator import MetricHandler from mxnet.gluon.utils import clip_global_norm from mxnet.metric import Loss as MetricLoss -from .length_normalized_loss import LengthNormalizedLoss +from ..metric.length_normalized_loss import LengthNormalizedLoss __all__ = ['HiddenStateHandler', 'AvgParamHandler', 'LearningRateHandler', 'RNNGradientUpdateHandler', 'MetricResetHandler', diff --git a/src/gluonnlp/estimator/length_normalized_loss.py b/src/gluonnlp/estimator/length_normalized_loss.py deleted file mode 100644 index e4558c6fb1..0000000000 --- a/src/gluonnlp/estimator/length_normalized_loss.py +++ /dev/null @@ -1,77 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" Length Normalized Loss """ - -from mxnet import ndarray -from mxnet.metric import EvalMetric - -__all__ = ['LengthNormalizedLoss'] - -class LengthNormalizedLoss(EvalMetric): - """Compute length normalized loss metrics - - Parameters - ---------- - axis : int, default=1 - The axis that represents classes - name : str - Name of this metric instance for display. - output_names : list of str, or None - Name of predictions that should be used when updating with update_dict. - By default include all predictions. - label_names : list of str, or None - Name of labels that should be used when updating with update_dict. - By default include all labels. - """ - def __init__(self, axis=0, name='length-normalized-loss', - output_names=None, label_names=None): - super(LengthNormalizedLoss, self).__init__( - name, axis=axis, - output_names=output_names, label_names=label_names, - has_global_stats=True) - - # Parameter labels should be a list in the form of [target_sequence, - # target_seqauence_valid_length] - def update(self, labels, preds): - if not isinstance(labels, list) or len(labels) != 2: - raise ValueError('labels must be a list. Its first element should be' - ' target sequence and the second element should be' - 'the valid length of sequence.') - - _, seq_valid_length = labels - - if not isinstance(seq_valid_length, list): - seq_valid_length = [seq_valid_length] - - if not isinstance(preds, list): - preds = [preds] - - for length in seq_valid_length: - if isinstance(length, ndarray.ndarray.NDArray): - total_length = ndarray.sum(length).asscalar() - else: - total_length = length - self.num_inst += total_length - self.global_num_inst += total_length - - for pred in preds: - if isinstance(pred, ndarray.ndarray.NDArray): - loss = ndarray.sum(pred).asscalar() - else: - loss = pred - self.sum_metric += loss - self.global_sum_metric += loss diff --git a/src/gluonnlp/estimator/parallel_language_model.py b/src/gluonnlp/estimator/parallel_language_model.py deleted file mode 100644 index a4e43b29a8..0000000000 --- a/src/gluonnlp/estimator/parallel_language_model.py +++ /dev/null @@ -1,43 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -# coding: utf-8 -# pylint: disable=wildcard-import, unused-variable -""" Gluon Parallel Languange Model """ - -from gluonnlp.utils import Parallel, Parallelizable -from mxnet import autograd - -__all__ = ['ParallelBigRNN'] - -class ParallelBigRNN(Parallelizable): - def __init__(self, rnn, loss_fn, batch_size): - self._model = rnn - self._loss = loss_fn - self._batch_size = batch_size - - def forward_backward(self, x): - X, y, m, s, h = x - with autograd.record(): - output, hidden, new_target = self._model(X, y, h, s) - output = output.reshape((-3, -1)) - new_target = new_target.reshape((-1,)) - ls = self._loss(output, new_target) * m.reshape((-1,)) - ls = ls / self._batch_size - ls.backward() - return hidden, ls - From 13891e7e41f4dbf3fe4fef7ad72a3d07d846728e Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 13 Feb 2020 12:04:17 +0000 Subject: [PATCH 22/32] relocate joint loss file --- scripts/language_model/word_language_model_estimator.py | 2 +- src/gluonnlp/estimator/__init__.py | 3 +-- src/gluonnlp/{estimator/loss.py => loss/joint_loss.py} | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) rename src/gluonnlp/{estimator/loss.py => loss/joint_loss.py} (97%) diff --git a/scripts/language_model/word_language_model_estimator.py b/scripts/language_model/word_language_model_estimator.py index f14ea63030..9af8d66661 100644 --- a/scripts/language_model/word_language_model_estimator.py +++ b/scripts/language_model/word_language_model_estimator.py @@ -24,7 +24,7 @@ from mxnet import gluon, autograd import gluonnlp as nlp from mxnet.gluon.contrib.estimator import LoggingHandler -from gluonnlp.estimator import JointActivationRegularizationLoss +from gluonnlp.loss.joint_loss import JointActivationRegularizationLoss from gluonnlp.estimator import LanguageModelEstimator from gluonnlp.estimator import HiddenStateHandler, AvgParamHandler from gluonnlp.estimator import LearningRateHandler, RNNGradientUpdateHandler diff --git a/src/gluonnlp/estimator/__init__.py b/src/gluonnlp/estimator/__init__.py index 69172adde6..8af7856d1e 100644 --- a/src/gluonnlp/estimator/__init__.py +++ b/src/gluonnlp/estimator/__init__.py @@ -22,7 +22,6 @@ from .language_model_estimator import * from .language_model_event_handler import * from .language_model_batch_processor import * -from .loss import * __all__ = (language_model_estimator.__all__ + language_model_event_handler.__all__ + - language_model_batch_processor.__all__ + loss.__all__) + language_model_batch_processor.__all__) diff --git a/src/gluonnlp/estimator/loss.py b/src/gluonnlp/loss/joint_loss.py similarity index 97% rename from src/gluonnlp/estimator/loss.py rename to src/gluonnlp/loss/joint_loss.py index 98febf217e..307dea1cd0 100644 --- a/src/gluonnlp/estimator/loss.py +++ b/src/gluonnlp/loss/joint_loss.py @@ -17,7 +17,7 @@ from mxnet import gluon -from ..loss import ActivationRegularizationLoss, TemporalActivationRegularizationLoss +from . 
import ActivationRegularizationLoss, TemporalActivationRegularizationLoss __all__ = ['JointActivationRegularizationLoss'] From 7eddd5289f3b5b415f1ff700b80d59b6394f8950 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 13 Feb 2020 14:15:33 +0000 Subject: [PATCH 23/32] remove temporary fix --- src/gluonnlp/estimator/language_model_batch_processor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index 9582d747c6..80a0ba73fc 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -62,7 +62,6 @@ def fit_batch(self, estimator, train_batch, batch_axis=0): return data, target, outputs, Ls def evaluate_batch(self, estimator, val_batch, batch_axis=0): - batch_axis = 1 #temporary work around, removed after estimator is fixed data = val_batch[:-1] target = val_batch[1:] batch_size = val_batch.shape[batch_axis] From d5d8148e0f09c7a068bf0917224547b1c027bd74 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 14 Feb 2020 13:45:05 +0000 Subject: [PATCH 24/32] fix pylint errors and add docstrings --- .../language_model_batch_processor.py | 38 ++++-- .../estimator/language_model_estimator.py | 46 +++++-- .../estimator/language_model_event_handler.py | 114 ++++++++++++++---- src/gluonnlp/loss/joint_loss.py | 2 +- 4 files changed, 158 insertions(+), 42 deletions(-) diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index 80a0ba73fc..6b241cdf97 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -28,8 +28,12 @@ __all__ = ['LanguageModelBatchProcessor', 'ParallelLanguageModelBatchProcessor'] class LanguageModelBatchProcessor(BatchProcessor): + '''Word language model batch processor + + Batch training and validation for word language model + ''' def __init__(self): - pass + super(LanguageModelBatchProcessor, self).__init__() def fit_batch(self, estimator, train_batch, batch_axis=0): data = train_batch[:-1] @@ -43,12 +47,13 @@ def fit_batch(self, estimator, train_batch, batch_axis=0): ctx=ctx) for ctx in estimator.context] else: estimator.hiddens = estimator.detach(estimator.hiddens) - + Ls = [] outputs = [] data_size = 0 with mx.autograd.record(): for i, (X, y, h) in enumerate(zip(data, target, estimator.hiddens)): + data_size = X.size output, h, encoder_hs, dropped_encoder_hs = estimator.net(X, h) l = estimator.loss(output, y, encoder_hs, dropped_encoder_hs) Ls.append(l / (len(estimator.context) * X.size)) @@ -58,7 +63,7 @@ def fit_batch(self, estimator, train_batch, batch_axis=0): for L in Ls: L.backward() - Ls = [l * (len(estimator.context) * X.size) for l in Ls] + Ls = [l * (len(estimator.context) * data_size) for l in Ls] return data, target, outputs, Ls def evaluate_batch(self, estimator, val_batch, batch_axis=0): @@ -73,7 +78,8 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): if estimator.val_hiddens is None: estimator.val_hiddens = \ [estimator.val_net.begin_state(batch_size // - len(estimator.context), func=mx.nd.zeros, ctx=ctx) for ctx \ + len(estimator.context), + func=mx.nd.zeros, ctx=ctx) for ctx in estimator.context] else: estimator.val_hiddens = estimator.detach(estimator.val_hiddens) @@ -87,7 +93,25 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): return data, target, outputs, Ls class 
ParallelLanguageModelBatchProcessor(BatchProcessor): + '''Parallel large RNN batch processor + + Batch training and validation for parallel large RNN model + + Parameters + ---------- + loss : mxnet.gluon.loss.Loss + Training loss function for parallel large rnn model + vocab : gluonnlp.vocab + Vocab of training and validation dataset + batch_size : int + Training batch size. It is used to construct the initial hidden states of + model + val_batch_size : int + Validation batch size. It is used to construct the initial hidden states + of validation model. + ''' def __init__(self, loss, vocab, batch_size, val_batch_size): + super(ParallelLanguageModelBatchProcessor, self).__init__() self.loss = loss self.parallel_model = None self.batch_size = batch_size @@ -128,9 +152,9 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): data = data.as_in_context(ctx) target = target.as_in_context(ctx) if estimator.val_hiddens is None: - estimator.val_hiddens = estimator.val_net.begin_state(batch_size=self.val_batch_size, - func=mx.nd.zeros, - ctx=ctx) + estimator.val_hiddens = + estimator.val_net.begin_state(batch_size=self.val_batch_size, + func=mx.nd.zeros, ctx=ctx) else: estimator.val_hiddens = estimator.detach(estimator.val_hiddens) diff --git a/src/gluonnlp/estimator/language_model_estimator.py b/src/gluonnlp/estimator/language_model_estimator.py index 808eabb27d..34a664c5d1 100644 --- a/src/gluonnlp/estimator/language_model_estimator.py +++ b/src/gluonnlp/estimator/language_model_estimator.py @@ -19,22 +19,46 @@ # pylint: disable=wildcard-import, unused-variable """ Gluon Languange Model Estimator """ -import copy -import warnings - -import numpy as np -import mxnet as mx from mxnet.gluon.contrib.estimator import Estimator -from mxnet.gluon.utils import split_and_load -from mxnet.gluon.utils import clip_global_norm -from mxnet.metric import Loss as metric_loss from .language_model_batch_processor import LanguageModelBatchProcessor __all__ = ['LanguageModelEstimator'] class LanguageModelEstimator(Estimator): + '''Language Model Estimator + + Estimator class to facilitate the language model training and validation process + + Parameters + ---------- + net : gluon.Block + The model used for training. + loss : gluon.loss.Loss + Loss (objective) function to calculate during training. + train_metrics : EvalMetric or list of EvalMetric + Training metrics for evaluating models on training dataset. + val_metrics : EvalMetric or list of EvalMetric + Validation metrics for evaluating models on validation dataset. + initializer : Initializer + Initializer to initialize the network. + trainer : Trainer + Trainer to apply optimizer on network parameters. + context : Context or list of Context + Device(s) to run the training on. + val_net : gluon.Block + The model used for validation. The validation model does not necessarily belong to + the same model class as the training model. + val_loss : gluon.loss.loss + Loss (objective) function to calculate during validation. If set val_loss + None, it will use the same loss function as self.loss + batch_processor: BatchProcessor + BatchProcessor provides customized fit_batch() and evaluate_batch() methods + bptt : int + bptt value for the language model training. 
It decides how many time steps + to backpropate + ''' def __init__(self, net, loss, train_metrics=None, - val_metrics = None, + val_metrics=None, initializer=None, trainer=None, context=None, @@ -56,12 +80,10 @@ def __init__(self, net, loss, train_metrics=None, self.avg_param = None self.bptt = bptt self.ntasgd = False - + def detach(self, hidden): if isinstance(hidden, (tuple, list)): hidden = [self.detach(h) for h in hidden] else: hidden = hidden.detach() return hidden - - diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index 8287754704..cb08adc6d1 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -19,13 +19,11 @@ # pylint: disable=wildcard-import, unused-variable """ Gluon Language Model Event Handler """ -import copy -import warnings import time import mxnet as mx -from mxnet.gluon.contrib.estimator import TrainBegin, TrainEnd, EpochBegin -from mxnet.gluon.contrib.estimator import EpochEnd, BatchBegin, BatchEnd +from mxnet.gluon.contrib.estimator import EpochBegin, EpochEnd +from mxnet.gluon.contrib.estimator import BatchBegin, BatchEnd from mxnet.gluon.contrib.estimator import GradientUpdateHandler, LoggingHandler from mxnet.gluon.contrib.estimator import MetricHandler from mxnet.gluon.utils import clip_global_norm @@ -38,6 +36,10 @@ 'LargeRNNGradientUpdateHandler'] class HiddenStateHandler(EpochBegin): + '''Hidden state reset event handler + + Reset hidden states for language model at each epoch + ''' def __init__(self): pass @@ -45,11 +47,17 @@ def epoch_begin(self, estimator, *args, **kwargs): estimator.hiddens = None estimator.val_hiddens = None -"""TODO: Implement a general average parameter handler or rename it with - NTASGD average parameter handler - -""" class AvgParamHandler(BatchEnd, EpochEnd): + '''NTASGD average parameter event handler + + Average model parameters used in word language model estimator + + Parameters + ---------- + data_length: int + Length of training data, i.e., len(train_data). It is used to normalize the weight + average coefficient. + ''' def __init__(self, data_length): self.epoch_id = 0 self.batch_id = 0 @@ -63,8 +71,10 @@ def batch_end(self, estimator, *args, **kwargs): parameters = estimator.net.collect_params() if estimator.ntasgd: if estimator.avg_param is None: - estimator.avg_param = {k.split(estimator.net._prefix)[1]: v.data(estimator.context[0]).copy() - for k, v in parameters.items()} + estimator.avg_param = + {k.split(estimator.net._prefix)[1]: + v.data(estimator.context[0]).copy() + for k, v in parameters.items()} else: gamma = 1. 
/ max(1, self.epoch_id * (self.data_length // estimator.bptt) + self.batch_id - self.avg_trigger + 2) @@ -82,8 +92,11 @@ def epoch_end(self, estimator, *args, **kwargs): if self.avg_trigger == 0: if self.t > self.n and val_metrics[0].get()[1] > min(self.valid_losses[-self.n:]): if estimator.avg_param is None: - estimator.avg_param = {k.split(estimator.net._prefix)[1]: v.data(estimator.context[0]).copy() - for k, v in parameters.items()} + estimator.avg_param = + {k.split(estimator.net._prefix)[1]: + v.data(estimator.context[0]).copy() + for k, v in + parameters.items()} else: for key, val in parameters.items(): estimator.avg_param[key.split(estimator.net._prefix)[1]] \ @@ -96,10 +109,21 @@ def epoch_end(self, estimator, *args, **kwargs): self.batch_id = 0 self.epoch_id += 1 -"""TODO: Can we replace learning rate handler with learning rate scheduler - Problem: Learning rate scheduler cannot take feedback from each iteration -""" class LearningRateHandler(BatchBegin, BatchEnd, EpochEnd): + '''NTASGD learning rate event handler + + Dynamically adjust the learning rate during word language model training + TODO: Investigate whether the learing rate event handler can be replaced with + learning rate scheduler + + Parameters + ---------- + lr_update_interval : int + Epoch interval of updating the learning rate during training the word + language model + lr_update_factor : float + learning rate decay factor used when updating the learning rate + ''' def __init__(self, lr_update_interval=30, lr_update_factor=0.1): self.lr_batch_start = 0 self.best_val = float('Inf') @@ -133,6 +157,15 @@ def epoch_end(self, estimator, *args, **kwargs): self.update_lr_epoch = 0 class RNNGradientUpdateHandler(GradientUpdateHandler): + '''NTASGD gradient clipping update event handler + + clipping gradient during word language model training + Parameters + ---------- + clip : clip + Gradient clipping threshold. Gradient norm exceeds this value should be scaled + down within the valid range. + ''' def __init__(self, clip=None, **kwargs): super().__init__(**kwargs) self.clip = clip @@ -143,12 +176,24 @@ def batch_end(self, estimator, *args, **kwargs): parameters = estimator.net.collect_params() grads = [p.grad(ctx) for p in parameters.values() for ctx in estimator.context] if self.clip is not None: - # use multi context clipping later clip_global_norm(grads, self.clip) estimator.trainer.step(1) class LargeRNNGradientUpdateHandler(GradientUpdateHandler): + '''Parallel Large RNN gradient clipping update event handler + + Rescale gradients of embedding parameters and clipping gradients of encoder parameters + during training parallel large RNN + + Parameters + ---------- + batch_size : int + batch size per gpu used during training parallel large RNN + clip : float + gradient clipping threshold. Gradients of encoder parameters exceed this value + should be scaled down within the valid range. 
+ ''' def __init__(self, batch_size, clip=None, **kwargs): super().__init__(**kwargs) self.batch_size = batch_size @@ -163,14 +208,25 @@ def batch_end(self, estimator, *args, **kwargs): x[:] *= self.batch_size encoder_grad = [p.grad(ctx) for p in encoder_params] clip_global_norm(encoder_grad, self.clip) - - estimator.trainer.step(len(estimator.context)) -"""This event handler reset local metrics for each few iterations + estimator.trainer.step(len(estimator.context)) - TODO: shall we move the lengthnormalizedloss part out to be an independent handler -""" class MetricResetHandler(BatchBegin, MetricHandler): + '''Event handler for reseting local metrics + + Reset local metrics for each few iterations and add support of LengthNormalizedMetrics + to compute both local and global metrics. + TODO: Move this event handler to be reusable by other estimators, e.g., + MachineTranslationEstimator + + Parameters + ---------- + Metrics : mxnet.metric + Metrics to be reset during training + log_interval : int or None + If log_interval is of int type, it represents the interval of reseting local + metrics. Otherwise, metrics do not need to be reset. + ''' def __init__(self, metrics, log_interval=None): super().__init__(metrics=metrics) self.batch_id = 0 @@ -201,6 +257,15 @@ def batch_end(self, estimator, *args, **kwargs): metric.update(label, pred) class WordLanguageModelCheckpointHandler(EpochEnd): + '''Checkpoint Event handler of word language model + + Save the model checkpoint of word language model + + Parameters + ---------- + save : string + The model checkpoint save path prefix + ''' def __init__(self, save): self.save = save self.best_val = float('Inf') @@ -225,6 +290,11 @@ def epoch_end(self, estimator, *args, **kwargs): class ParallelLoggingHandler(LoggingHandler): + '''Logging handler of Parallel language model training + + Generating logging information of parallel large RNN training. This event handler + is designed specifically to handle the batches taken from multiple gpus. + ''' def __init__(self, *args, **kwargs): super(ParallelLoggingHandler, self).__init__(*args, **kwargs) @@ -245,4 +315,4 @@ def batch_end(self, estimator, *args, **kwargs): msg += '%s: %.4f, ' % (name, val) estimator.logger.info(msg.rstrip(', ')) self.batch_index += 1 - + diff --git a/src/gluonnlp/loss/joint_loss.py b/src/gluonnlp/loss/joint_loss.py index 307dea1cd0..c62010cbf2 100644 --- a/src/gluonnlp/loss/joint_loss.py +++ b/src/gluonnlp/loss/joint_loss.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. - +""" Joint activation regularization loss """ from mxnet import gluon from . 
import ActivationRegularizationLoss, TemporalActivationRegularizationLoss From 3d72a32a4eb3204455ee4e41f9cb347adf57f2a3 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 14 Feb 2020 13:50:05 +0000 Subject: [PATCH 25/32] fix errors due to the pylint fix --- src/gluonnlp/estimator/language_model_batch_processor.py | 2 +- src/gluonnlp/estimator/language_model_event_handler.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index 6b241cdf97..c359a046d6 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -152,7 +152,7 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): data = data.as_in_context(ctx) target = target.as_in_context(ctx) if estimator.val_hiddens is None: - estimator.val_hiddens = + estimator.val_hiddens = \ estimator.val_net.begin_state(batch_size=self.val_batch_size, func=mx.nd.zeros, ctx=ctx) else: diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index cb08adc6d1..04f1c3bb9a 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -71,7 +71,7 @@ def batch_end(self, estimator, *args, **kwargs): parameters = estimator.net.collect_params() if estimator.ntasgd: if estimator.avg_param is None: - estimator.avg_param = + estimator.avg_param = \ {k.split(estimator.net._prefix)[1]: v.data(estimator.context[0]).copy() for k, v in parameters.items()} @@ -92,7 +92,7 @@ def epoch_end(self, estimator, *args, **kwargs): if self.avg_trigger == 0: if self.t > self.n and val_metrics[0].get()[1] > min(self.valid_losses[-self.n:]): if estimator.avg_param is None: - estimator.avg_param = + estimator.avg_param = \ {k.split(estimator.net._prefix)[1]: v.data(estimator.context[0]).copy() for k, v in From ca9c9a053350e662d2248c7fce6f77c918f44887 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 14 Feb 2020 15:34:56 +0000 Subject: [PATCH 26/32] fix docstring pylint errors --- .../language_model_batch_processor.py | 10 ++--- .../estimator/language_model_estimator.py | 4 +- .../estimator/language_model_event_handler.py | 39 +++++++++---------- 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index c359a046d6..74b26cde23 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -28,10 +28,10 @@ __all__ = ['LanguageModelBatchProcessor', 'ParallelLanguageModelBatchProcessor'] class LanguageModelBatchProcessor(BatchProcessor): - '''Word language model batch processor + """Word language model batch processor Batch training and validation for word language model - ''' + """ def __init__(self): super(LanguageModelBatchProcessor, self).__init__() @@ -93,7 +93,7 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): return data, target, outputs, Ls class ParallelLanguageModelBatchProcessor(BatchProcessor): - '''Parallel large RNN batch processor + """Parallel large RNN batch processor Batch training and validation for parallel large RNN model @@ -107,9 +107,9 @@ class ParallelLanguageModelBatchProcessor(BatchProcessor): Training batch size. 
It is used to construct the initial hidden states of model val_batch_size : int - Validation batch size. It is used to construct the initial hidden states + Validation batch size. It is used to construct the initial hidden states of validation model. - ''' + """ def __init__(self, loss, vocab, batch_size, val_batch_size): super(ParallelLanguageModelBatchProcessor, self).__init__() self.loss = loss diff --git a/src/gluonnlp/estimator/language_model_estimator.py b/src/gluonnlp/estimator/language_model_estimator.py index 34a664c5d1..4eb120ea28 100644 --- a/src/gluonnlp/estimator/language_model_estimator.py +++ b/src/gluonnlp/estimator/language_model_estimator.py @@ -25,7 +25,7 @@ __all__ = ['LanguageModelEstimator'] class LanguageModelEstimator(Estimator): - '''Language Model Estimator + """Language Model Estimator Estimator class to facilitate the language model training and validation process @@ -56,7 +56,7 @@ class LanguageModelEstimator(Estimator): bptt : int bptt value for the language model training. It decides how many time steps to backpropate - ''' + """ def __init__(self, net, loss, train_metrics=None, val_metrics=None, initializer=None, diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index 04f1c3bb9a..77ad23d1b1 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -36,10 +36,10 @@ 'LargeRNNGradientUpdateHandler'] class HiddenStateHandler(EpochBegin): - '''Hidden state reset event handler + """Hidden state reset event handler Reset hidden states for language model at each epoch - ''' + """ def __init__(self): pass @@ -48,7 +48,7 @@ def epoch_begin(self, estimator, *args, **kwargs): estimator.val_hiddens = None class AvgParamHandler(BatchEnd, EpochEnd): - '''NTASGD average parameter event handler + """NTASGD average parameter event handler Average model parameters used in word language model estimator @@ -57,7 +57,7 @@ class AvgParamHandler(BatchEnd, EpochEnd): data_length: int Length of training data, i.e., len(train_data). It is used to normalize the weight average coefficient. - ''' + """ def __init__(self, data_length): self.epoch_id = 0 self.batch_id = 0 @@ -110,7 +110,7 @@ def epoch_end(self, estimator, *args, **kwargs): self.epoch_id += 1 class LearningRateHandler(BatchBegin, BatchEnd, EpochEnd): - '''NTASGD learning rate event handler + """NTASGD learning rate event handler Dynamically adjust the learning rate during word language model training TODO: Investigate whether the learing rate event handler can be replaced with @@ -123,7 +123,7 @@ class LearningRateHandler(BatchBegin, BatchEnd, EpochEnd): language model lr_update_factor : float learning rate decay factor used when updating the learning rate - ''' + """ def __init__(self, lr_update_interval=30, lr_update_factor=0.1): self.lr_batch_start = 0 self.best_val = float('Inf') @@ -157,15 +157,15 @@ def epoch_end(self, estimator, *args, **kwargs): self.update_lr_epoch = 0 class RNNGradientUpdateHandler(GradientUpdateHandler): - '''NTASGD gradient clipping update event handler + """NTASGD gradient clipping update event handler clipping gradient during word language model training Parameters ---------- clip : clip Gradient clipping threshold. Gradient norm exceeds this value should be scaled - down within the valid range. - ''' + down within the valid range. 
+ """ def __init__(self, clip=None, **kwargs): super().__init__(**kwargs) self.clip = clip @@ -181,7 +181,7 @@ def batch_end(self, estimator, *args, **kwargs): estimator.trainer.step(1) class LargeRNNGradientUpdateHandler(GradientUpdateHandler): - '''Parallel Large RNN gradient clipping update event handler + """Parallel Large RNN gradient clipping update event handler Rescale gradients of embedding parameters and clipping gradients of encoder parameters during training parallel large RNN @@ -193,7 +193,7 @@ class LargeRNNGradientUpdateHandler(GradientUpdateHandler): clip : float gradient clipping threshold. Gradients of encoder parameters exceed this value should be scaled down within the valid range. - ''' + """ def __init__(self, batch_size, clip=None, **kwargs): super().__init__(**kwargs) self.batch_size = batch_size @@ -212,7 +212,7 @@ def batch_end(self, estimator, *args, **kwargs): estimator.trainer.step(len(estimator.context)) class MetricResetHandler(BatchBegin, MetricHandler): - '''Event handler for reseting local metrics + """Event handler for reseting local metrics Reset local metrics for each few iterations and add support of LengthNormalizedMetrics to compute both local and global metrics. @@ -225,8 +225,8 @@ class MetricResetHandler(BatchBegin, MetricHandler): Metrics to be reset during training log_interval : int or None If log_interval is of int type, it represents the interval of reseting local - metrics. Otherwise, metrics do not need to be reset. - ''' + metrics. Otherwise, metrics do not need to be reset. + """ def __init__(self, metrics, log_interval=None): super().__init__(metrics=metrics) self.batch_id = 0 @@ -257,15 +257,15 @@ def batch_end(self, estimator, *args, **kwargs): metric.update(label, pred) class WordLanguageModelCheckpointHandler(EpochEnd): - '''Checkpoint Event handler of word language model + """Checkpoint Event handler of word language model Save the model checkpoint of word language model - + Parameters ---------- save : string The model checkpoint save path prefix - ''' + """ def __init__(self, save): self.save = save self.best_val = float('Inf') @@ -290,11 +290,11 @@ def epoch_end(self, estimator, *args, **kwargs): class ParallelLoggingHandler(LoggingHandler): - '''Logging handler of Parallel language model training + """Logging handler of Parallel language model training Generating logging information of parallel large RNN training. This event handler is designed specifically to handle the batches taken from multiple gpus. - ''' + """ def __init__(self, *args, **kwargs): super(ParallelLoggingHandler, self).__init__(*args, **kwargs) @@ -315,4 +315,3 @@ def batch_end(self, estimator, *args, **kwargs): msg += '%s: %.4f, ' % (name, val) estimator.logger.info(msg.rstrip(', ')) self.batch_index += 1 - From 7735fa68d2941944daa797041e9464cecb4038fc Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 14 Feb 2020 17:00:49 +0000 Subject: [PATCH 27/32] fix script pylint errors --- .../large_word_language_model_estimator.py | 17 +++---- .../word_language_model_estimator.py | 51 +++++++++++-------- 2 files changed, 37 insertions(+), 31 deletions(-) diff --git a/scripts/language_model/large_word_language_model_estimator.py b/scripts/language_model/large_word_language_model_estimator.py index 1ebbe95232..070184f9b8 100644 --- a/scripts/language_model/large_word_language_model_estimator.py +++ b/scripts/language_model/large_word_language_model_estimator.py @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. 
-import time -import math +""" large word language model train script """ + import os import sys import argparse @@ -24,18 +24,16 @@ import numpy as np import mxnet as mx -from mxnet import gluon, autograd +from mxnet import gluon from mxnet.gluon.contrib.estimator import CheckpointHandler, LoggingHandler import gluonnlp as nlp -from gluonnlp.utils import Parallel, Parallelizable -from sampler import LogUniformSampler from gluonnlp.estimator import ParallelLanguageModelBatchProcessor from gluonnlp.estimator import HiddenStateHandler, MetricResetHandler from gluonnlp.estimator import LargeRNNGradientUpdateHandler -from gluonnlp.estimator import WordLanguageModelCheckpointHandler from gluonnlp.estimator import LanguageModelEstimator from gluonnlp.estimator import ParallelLoggingHandler from gluonnlp.metric.length_normalized_loss import LengthNormalizedLoss +from sampler import LogUniformSampler curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.append(os.path.join(curr_path, '..', '..')) @@ -232,8 +230,9 @@ def _split_and_sample(x, y): for epoch_id in range(args.epochs): for filename in os.listdir(args.save): - file_pattern = 'largeRNN-epoch%dbatch\d+.params' % (epoch_id) - if re.match(file_pattern + '',filename): + file_pattern = r'largeRNN-epoch%dbatch\d+.params' % (epoch_id) + if re.match(file_pattern + '', filename): checkpoint_path = args.save + '/' + filename lm_estimator.val_net.load_parameters(checkpoint_path) - lm_estimator.evaluate(val_data=test_data, event_handlers=[val_metric_handler, val_logging_handler]) + lm_estimator.evaluate(val_data=test_data, + event_handlers=[val_metric_handler, val_logging_handler]) diff --git a/scripts/language_model/word_language_model_estimator.py b/scripts/language_model/word_language_model_estimator.py index 9af8d66661..49b3291c6d 100644 --- a/scripts/language_model/word_language_model_estimator.py +++ b/scripts/language_model/word_language_model_estimator.py @@ -15,15 +15,17 @@ # specific language governing permissions and limitations # under the License. 
+""" word language model training script """ + import argparse -import time -import math import os import sys + import mxnet as mx -from mxnet import gluon, autograd -import gluonnlp as nlp +from mxnet import gluon from mxnet.gluon.contrib.estimator import LoggingHandler +from mxnet.gluon.data.sampler import BatchSampler +import gluonnlp as nlp from gluonnlp.loss.joint_loss import JointActivationRegularizationLoss from gluonnlp.estimator import LanguageModelEstimator from gluonnlp.estimator import HiddenStateHandler, AvgParamHandler @@ -31,10 +33,24 @@ from gluonnlp.estimator import WordLanguageModelCheckpointHandler from gluonnlp.estimator import LanguageModelBatchProcessor from gluonnlp.estimator import MetricResetHandler -from mxnet.gluon.data.sampler import BatchSampler + class BatchVariableLenTextSampler(BatchSampler): + """Sample text of variable length + + Generate batch of text of variable length from the training dataset + + Parameters + ---------- + bptt : int + bptt variable + length : int + base sequence length for sampling + use_variable_length : bool + generate sequence of variable length or not + """ def __init__(self, bptt, length, use_variable_length=True): + super(BatchVariableLenTextSampler, self).__init__() self.bptt = bptt self.length = length self.index = 0 @@ -192,19 +208,6 @@ def __len__(self): print(model) - -def check_initialized(net): - params = net.collect_params() - for param in params: - try: - params[param].list_ctx() - except RuntimeError: - return False - return True - -print(check_initialized(model)) -print(check_initialized(model_eval)) - if args.optimizer == 'sgd': trainer_params = {'learning_rate': args.lr, 'momentum': 0, @@ -243,11 +246,15 @@ def check_initialized(net): val_loss=loss, val_net=model_eval, batch_processor=batch_processor) -event_handlers = [HiddenStateHandler(), AvgParamHandler(data_length=len(train_data)), - LearningRateHandler(lr_update_interval=args.lr_update_interval, lr_update_factor=args.lr_update_factor), +event_handlers = [HiddenStateHandler(), + AvgParamHandler(data_length=len(train_data)), + LearningRateHandler(lr_update_interval=args.lr_update_interval, + lr_update_factor=args.lr_update_factor), RNNGradientUpdateHandler(clip=args.clip), - LoggingHandler(log_interval=args.log_interval, metrics=est.train_metrics + est.val_metrics), - MetricResetHandler(metrics=est.train_metrics, log_interval=args.log_interval), + LoggingHandler(log_interval=args.log_interval, + metrics=est.train_metrics + est.val_metrics), + MetricResetHandler(metrics=est.train_metrics, + log_interval=args.log_interval), WordLanguageModelCheckpointHandler(args.save)] est.fit(train_data=train_data_loader, val_data=val_data_loader, epochs=args.epochs, From e7f80cb348fad336801d4b8d40480f85ce2b4889 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 14 Feb 2020 17:13:23 +0000 Subject: [PATCH 28/32] fix pylint errrors --- src/gluonnlp/estimator/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/gluonnlp/estimator/__init__.py b/src/gluonnlp/estimator/__init__.py index 8af7856d1e..e8de9fa8e5 100644 --- a/src/gluonnlp/estimator/__init__.py +++ b/src/gluonnlp/estimator/__init__.py @@ -19,6 +19,9 @@ # pylint: disable=wildcard-import, unused-variable """ Gluon NLP Estimator Module """ +from . import language_model_estimator, language_model_event_handler +from . 
import language_model_batch_processor + from .language_model_estimator import * from .language_model_event_handler import * from .language_model_batch_processor import * From a0bc6160bdd8e6a6cd7e50fd71dc9ece1198df27 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 17 Feb 2020 03:49:15 +0000 Subject: [PATCH 29/32] remove hyperparameters from the table --- scripts/language_model/index.rst | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/scripts/language_model/index.rst b/scripts/language_model/index.rst index b82c8b4fc5..c30cd8fea4 100644 --- a/scripts/language_model/index.rst +++ b/scripts/language_model/index.rst @@ -18,24 +18,6 @@ The dataset used for training the models is wikitext-2. +---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ | Model | awd_lstm_lm_1150_wikitext-2 | awd_lstm_lm_600_wikitext-2 | standard_lstm_lm_1500_wikitext-2 | standard_lstm_lm_650_wikitext-2 | standard_lstm_lm_200_wikitext-2 | +===============+============================================================================================================================+===========================================================================================================================+=================================================================================================================================+================================================================================================================================+================================================================================================================================+ -| Mode | LSTM | LSTM | LSTM | LSTM | LSTM | -+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ -| Num_layers | 3 | 3 | 2 | 2 | 2 | 
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ -| Embed size | 400 | 200 | 1500 | 650 | 200 | -+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ -| Hidden size | 1150 | 600 | 1500 | 650 | 200 | -+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ -| Dropout | 0.4 | 0.2 | 0.65 | 0.5 | 0.2 | -+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ -| Dropout_h | 0.2 | 0.1 | 0 | 0 | 0 | -+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ -| Dropout_i | 0.65 | 0.3 | 0 | 0 | 0 | 
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ -| Dropout_e | 0.1 | 0.05 | 0 | 0 | 0 | -+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ -| Weight_drop | 0.5 | 0.2 | 0 | 0 | 0 | -+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ | Val PPL | 68.71 | 84.89 | 86.51 | 90.96 | 107.59 | +---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ | Test PPL | 65.62 | 80.67 | 82.29 | 86.91 | 101.64 | From 934cba6c082fdb663a3ea92a839717e371fd5298 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 17 Feb 2020 05:29:03 +0000 Subject: [PATCH 30/32] update language model commands --- scripts/language_model/index.rst | 95 ++------------------------------ 1 file changed, 5 insertions(+), 90 deletions(-) diff --git a/scripts/language_model/index.rst b/scripts/language_model/index.rst index c30cd8fea4..afdb3a6bff 100644 --- a/scripts/language_model/index.rst +++ b/scripts/language_model/index.rst @@ -22,43 +22,13 @@ The dataset used for training the models is wikitext-2. 
+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ | Test PPL | 65.62 | 80.67 | 82.29 | 86.91 | 101.64 | +---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ -| Command | [1] | [2] | [3] | [4] | [5] | +| Command | `command `__ | `command `__ | `command `__ | `command `__ | `command `__ | +---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ | Training logs | `log `__ | `log `__ | `log `__ | `log `__ | `log `__ | +---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ For all the above model settings, we set Tied = True and NTASGD = True . -[1] awd_lstm_lm_1150_wikitext-2 (Val PPL 68.52 Test PPL 65.68 ) - -.. code-block:: console - - $ python word_language_model_estimator.py --gpu 0 --tied --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save awd_lstm_lm_1150_wikitext-2 - -[2] awd_lstm_lm_600_wikitext-2 (Val PPL 83.92 Test PPL 80.09) - -.. 
code-block:: console - - $ python word_language_model_estimator.py --gpu 0 --emsize 200 --nhid 600 --epochs 750 --dropout 0.2 --dropout_h 0.1 --dropout_i 0.3 --dropout_e 0.05 --weight_drop 0.2 --tied --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save awd_lstm_lm_600_wikitext-2 - -[3] standard_lstm_lm_1500_wikitext-2 (Val PPL 85.23 Test PPL 81.44) - -.. code-block:: console - - $ python word_language_model_estimator.py --gpu 0 --emsize 1500 --nhid 1500 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.65 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_1500_wikitext-2 - -[4] standard_lstm_lm_650_wikitext-2 (Val PPL 94.51 Test PPL 90.28) - -.. code-block:: console - - $ python word_language_model_estimator.py --gpu 0 --emsize 650 --nhid 650 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.5 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_650_wikitext-2 - -[5] standard_lstm_lm_200_wikitext-2 (Val PPL 107.44 Test PPL 101.19) - -.. code-block:: console - - $ python word_language_model_estimator.py --gpu 0 --emsize 200 --nhid 200 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.2 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_200_wikitext-2 - Cache Language Model ~~~~~~~~~~~~~~~~~~~~~ @@ -79,43 +49,13 @@ The dataset used for training the models is wikitext-2. +---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | Test PPL | 51.46 | 62.19 | 62.79 | 65.85 | 73.74 | +---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ -| Command | [1] | [2] | [3] | [4] | [5] | +| Command | `command `__ | `command `__ | `command `__ | `command `__ | `command `__ | 
+---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | Training logs | `log `__ | `log `__ | `log `__ | `log `__ | `log `__ | +---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ For all the above model settings, we set lambdas = 0.1279, theta = 0.662, window = 2000 and bptt= 2000 . -[1] cache_awd_lstm_lm_1150_wikitext-2 (Val PPL 53.41 Test PPL 51.46) - -.. code-block:: console - - $ python cache_language_model.py --gpus 0 --model_name awd_lstm_lm_1150 - -[2] cache_awd_lstm_lm_600_wikitext-2 (Val PPL 64.51 Test PPL 62.19) - -.. code-block:: console - - $ python cache_language_model.py --gpus 0 --model_name awd_lstm_lm_600 - -[3] cache_standard_lstm_lm_1500_wikitext-2 (Val PPL 65.54 Test PPL 62.79) - -.. code-block:: console - - $ python cache_language_model.py --gpus 0 --model_name standard_lstm_lm_1500 - -[4] cache_standard_lstm_lm_650_wikitext-2 (Val PPL 68.47 Test PPL 65.85) - -.. code-block:: console - - $ python cache_language_model.py --gpus 0 --model_name standard_lstm_lm_650 - -[5] cache_standard_lstm_lm_200_wikitext-2 (Val PPL 77.51 Test PPL 73.74) - -.. code-block:: console - - $ python cache_language_model.py --gpus 0 --model_name standard_lstm_lm_200 - Large Scale Word Language Model ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -130,42 +70,17 @@ The dataset used for training the models is Google's 1 billion words dataset. 
+-----------------+------------------------------------------------------------------------------------------------------------------------------+ | Model | LSTM-2048-512 | +=================+==============================================================================================================================+ -| Mode | LSTMP | -+-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Num layers | 1 | -+-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Embed size | 512 | -+-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Hidden size | 2048 | -+-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Projection size | 512 | -+-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Dropout | 0.1 | -+-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Learning rate | 0.2 | -+-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Num samples | 8192 | -+-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Batch size | 128 | -+-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Gradient clip | 10.0 | -+-----------------+------------------------------------------------------------------------------------------------------------------------------+ | Test perplexity | 43.62 | +-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Num epochs | 50 | +| Command | `log `__ | ++-----------------+------------------------------------------------------------------------------------------------------------------------------+ +| Command | `log `__ | +-----------------+------------------------------------------------------------------------------------------------------------------------------+ | Training logs | `log `__ | +-----------------+------------------------------------------------------------------------------------------------------------------------------+ | Evaluation logs | `log `__ | +-----------------+------------------------------------------------------------------------------------------------------------------------------+ -[1] LSTM-2048-512 (Test PPL 43.62) - -.. 
code-block:: console - - $ python large_word_language_model_estimator.py --gpus 0,1,2,3 --clip=10 - $ python large_word_language_model_estimator.py --gpus 4 --eval-only --batch-size=1 - XLNet: Generalized Autoregressive Pretraining for Language Understanding ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 450dee0be402d8347e80f3b85fe69b2443dbcebb Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 17 Feb 2020 05:32:14 +0000 Subject: [PATCH 31/32] minor modification --- scripts/language_model/index.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/language_model/index.rst b/scripts/language_model/index.rst index afdb3a6bff..9ba0569ad6 100644 --- a/scripts/language_model/index.rst +++ b/scripts/language_model/index.rst @@ -72,9 +72,9 @@ The dataset used for training the models is Google's 1 billion words dataset. +=================+==============================================================================================================================+ | Test perplexity | 43.62 | +-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Command | `log `__ | +| Command | `command `__ | +-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Command | `log `__ | +| Command | `command `__ | +-----------------+------------------------------------------------------------------------------------------------------------------------------+ | Training logs | `log `__ | +-----------------+------------------------------------------------------------------------------------------------------------------------------+ From 159553f09e49890a1ca46c9e4ad48d0407913ca5 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 18 Feb 2020 06:45:43 +0000 Subject: [PATCH 32/32] update bigrnn final result --- scripts/language_model/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/language_model/index.rst b/scripts/language_model/index.rst index 9ba0569ad6..318050c8a3 100644 --- a/scripts/language_model/index.rst +++ b/scripts/language_model/index.rst @@ -70,7 +70,7 @@ The dataset used for training the models is Google's 1 billion words dataset. +-----------------+------------------------------------------------------------------------------------------------------------------------------+ | Model | LSTM-2048-512 | +=================+==============================================================================================================================+ -| Test perplexity | 43.62 | +| Test perplexity | 43.80 | +-----------------+------------------------------------------------------------------------------------------------------------------------------+ | Command | `command `__ | +-----------------+------------------------------------------------------------------------------------------------------------------------------+
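
For reference, the sketch below shows how the estimator components introduced in this series compose; it is condensed from the patched ``scripts/language_model/word_language_model_estimator.py``. Model construction, the joint activation regularization loss, and the wikitext-2 data pipeline are elided: ``build_model()`` and ``make_loaders()`` are placeholder helpers for this sketch rather than APIs added by the patch, and the hyperparameter values shown are the script defaults.

.. code-block:: python

    import mxnet as mx
    from mxnet import gluon
    from mxnet.gluon.contrib.estimator import LoggingHandler
    from gluonnlp.estimator import (
        LanguageModelEstimator, LanguageModelBatchProcessor,
        HiddenStateHandler, AvgParamHandler, LearningRateHandler,
        RNNGradientUpdateHandler, MetricResetHandler,
        WordLanguageModelCheckpointHandler)

    context = [mx.gpu(0)]

    # Placeholders: an AWD-LSTM train/eval network pair, the joint activation
    # regularization loss, and the wikitext-2 loaders built as in the script.
    model, model_eval, loss = build_model(context)
    train_data, train_loader, val_loader = make_loaders()

    # Optimizer settings follow the script defaults (sgd, lr=30, wd=1.2e-6).
    trainer = gluon.Trainer(model.collect_params(), 'sgd',
                            {'learning_rate': 30, 'momentum': 0, 'wd': 1.2e-6})

    # Argument names follow the patched estimator; unspecified arguments keep
    # their defaults.
    est = LanguageModelEstimator(net=model, loss=loss,
                                 trainer=trainer, context=context,
                                 val_loss=loss, val_net=model_eval,
                                 batch_processor=LanguageModelBatchProcessor())

    event_handlers = [
        HiddenStateHandler(),                          # reset hidden states at each epoch begin
        AvgParamHandler(data_length=len(train_data)),  # NT-ASGD parameter averaging
        LearningRateHandler(lr_update_interval=30, lr_update_factor=0.1),
        RNNGradientUpdateHandler(clip=0.25),           # clip gradients before trainer.step
        LoggingHandler(log_interval=200,
                       metrics=est.train_metrics + est.val_metrics),
        MetricResetHandler(metrics=est.train_metrics, log_interval=200),
        WordLanguageModelCheckpointHandler('model.params'),
    ]

    est.fit(train_data=train_loader, val_data=val_loader,
            epochs=750, event_handlers=event_handlers)

Each training concern (hidden-state reset, NT-ASGD parameter averaging, learning-rate decay, gradient clipping, metric reset, checkpointing) lives in its own event handler, so the large parallel RNN script reuses the same fit loop by swapping in ``ParallelLanguageModelBatchProcessor``, ``LargeRNNGradientUpdateHandler``, and ``ParallelLoggingHandler``.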