From 89deadc342fe51be0e452c182bc9d96b6f008f16 Mon Sep 17 00:00:00 2001 From: Zhuanghua Liu Date: Fri, 13 Dec 2019 10:08:37 +0000 Subject: [PATCH 01/32] add language model estimator --- .../word_language_model_estimator.py | 225 ++++++++++++++++++ src/gluonnlp/estimator/__init__.py | 28 +++ .../language_model_batch_processor.py | 81 +++++++ .../estimator/language_model_estimator.py | 69 ++++++ .../estimator/language_model_event_handler.py | 127 ++++++++++ src/gluonnlp/estimator/loss.py | 80 +++++++ 6 files changed, 610 insertions(+) create mode 100644 scripts/estimator/word_language_model_estimator.py create mode 100644 src/gluonnlp/estimator/__init__.py create mode 100644 src/gluonnlp/estimator/language_model_batch_processor.py create mode 100644 src/gluonnlp/estimator/language_model_estimator.py create mode 100644 src/gluonnlp/estimator/language_model_event_handler.py create mode 100644 src/gluonnlp/estimator/loss.py diff --git a/scripts/estimator/word_language_model_estimator.py b/scripts/estimator/word_language_model_estimator.py new file mode 100644 index 0000000000..92fbba8b42 --- /dev/null +++ b/scripts/estimator/word_language_model_estimator.py @@ -0,0 +1,225 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import argparse +import time +import math +import os +import sys +import mxnet as mx +from mxnet import gluon, autograd +import gluonnlp as nlp +from mxnet.gluon.contrib.estimator import LoggingHandler +from gluonnlp.estimator import JointActivationRegularizationLoss +from gluonnlp.estimator import LanguageModelEstimator +from gluonnlp.estimator import HiddenStateHandler, AvgParamHandler +from gluonnlp.estimator import LearningRateHandler, RNNGradientUpdateHandler +from gluonnlp.estimator import LanguageModelBatchProcessor +from mxnet.gluon.data.sampler import BatchSampler + +class BatchVariableLenTextSampler(BatchSampler): + def __init__(self, bptt, length): + self.bptt = bptt + self.length = length + self.index = 0 + + def __iter__(self): + while self.index < self.length - 2: + bptt = self.bptt if mx.nd.random.uniform().asscalar() < .95 else self.bptt / 2 + seq_len = max(5, int(mx.nd.random.normal(bptt, 5).asscalar())) + seq_len = min(seq_len, self.length - self.index - 1) + # batch_size = seq_len + 1 + batch = [] + for i in range(self.index, self.index + seq_len + 1): + batch.append(i) + self.index += seq_len + yield batch + + def __len__(self): + # you may never get real size of the data sampler beforehand. 
May need some + # postprocessing after fetching the data batch + return self.length / 5 + 1 + +curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +sys.path.append(os.path.join(curr_path, '..', '..')) + +nlp.utils.check_version('0.7.0') + +parser = argparse.ArgumentParser(description= + 'MXNet Autograd RNN/LSTM Language Model on Wikitext-2.') +parser.add_argument('--model', type=str, default='lstm', + help='type of recurrent net (rnn_tanh, rnn_relu, lstm, gru)') +parser.add_argument('--emsize', type=int, default=400, + help='size of word embeddings') +parser.add_argument('--nhid', type=int, default=1150, + help='number of hidden units per layer') +parser.add_argument('--nlayers', type=int, default=3, + help='number of layers') +parser.add_argument('--lr', type=float, default=30, + help='initial learning rate') +parser.add_argument('--clip', type=float, default=0.25, + help='gradient clipping') +parser.add_argument('--epochs', type=int, default=750, + help='upper epoch limit') +parser.add_argument('--batch_size', type=int, default=80, metavar='N', + help='batch size') +parser.add_argument('--bptt', type=int, default=70, + help='sequence length') +parser.add_argument('--dropout', type=float, default=0.4, + help='dropout applied to layers (0 = no dropout)') +parser.add_argument('--dropout_h', type=float, default=0.2, + help='dropout applied to hidden layer (0 = no dropout)') +parser.add_argument('--dropout_i', type=float, default=0.65, + help='dropout applied to input layer (0 = no dropout)') +parser.add_argument('--dropout_e', type=float, default=0.1, + help='dropout applied to embedding layer (0 = no dropout)') +parser.add_argument('--weight_dropout', type=float, default=0.5, + help='weight dropout applied to h2h weight matrix (0 = no weight dropout)') +parser.add_argument('--tied', action='store_true', + help='tie the word embedding and softmax weights') +parser.add_argument('--log-interval', type=int, default=200, metavar='N', + help='report interval') +parser.add_argument('--save', type=str, default='model.params', + help='path to save the final model') +parser.add_argument('--eval_only', action='store_true', + help='Whether to only evaluate the trained model') +parser.add_argument('--gpu', type=str, help='single gpu id') +parser.add_argument('--optimizer', type=str, default='sgd', + help='optimizer to use (sgd, adam)') +parser.add_argument('--wd', type=float, default=1.2e-6, + help='weight decay applied to all weights') +parser.add_argument('--alpha', type=float, default=2, + help='alpha L2 regularization on RNN activation ' + '(alpha = 0 means no regularization)') +parser.add_argument('--beta', type=float, default=1, + help='beta slowness regularization applied on RNN activation ' + '(beta = 0 means no regularization)') +parser.add_argument('--ntasgd', action='store_true', + help='Whether to apply ntasgd') +parser.add_argument('--test_mode', action='store_true', + help='Whether to run through the script with few examples') +parser.add_argument('--lr_update_interval', type=int, default=30, + help='lr udpate interval') +parser.add_argument('--lr_update_factor', type=float, default=0.1, + help='lr udpate factor') +args = parser.parse_args() + +############################################################################### +# Load data +############################################################################### + +context = [mx.cpu()] if not args.gpu else [mx.gpu(int(args.gpu))] + +assert args.batch_size % len(context) == 0, \ + 'Total batch size must be multiple 
of the number of devices' + +assert args.weight_dropout > 0 or (args.weight_dropout == 0 and args.alpha == 0), \ + 'The alpha L2 regularization cannot be used with standard RNN, please set alpha to 0' + +train_dataset, val_dataset, test_dataset = \ + [nlp.data.WikiText2(segment=segment, + skip_empty=False, bos=None, eos='') + for segment in ['train', 'val', 'test']] + +vocab = nlp.Vocab(counter=nlp.data.Counter(train_dataset), padding_token=None, bos_token=None) +train_batchify = nlp.data.batchify.CorpusBatchify(vocab, args.batch_size) +train_data = train_batchify(train_dataset) +val_batch_size = 10 +val_batchify = nlp.data.batchify.CorpusBatchify(vocab, val_batch_size) +val_data = val_batchify(val_dataset) +test_batch_size = 1 +test_batchify = nlp.data.batchify.CorpusBatchify(vocab, test_batch_size) +test_data = test_batchify(test_dataset) + +if args.test_mode: + args.emsize = 200 + args.nhid = 200 + args.nlayers = 1 + args.epochs = 3 + train_data = train_data[0:100] + val_data = val_data[0:100] + test_data = test_data[0:100] + +print(args) + +############################################################################### +# Build the model +############################################################################### + +ntokens = len(vocab) + +if args.weight_dropout > 0: + print('Use AWDRNN') + model = nlp.model.train.AWDRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers, + args.tied, args.dropout, args.weight_dropout, + args.dropout_h, args.dropout_i, args.dropout_e) + model_eval = nlp.model.AWDRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers, + args.tied, args.dropout, args.weight_dropout, + args.dropout_h, args.dropout_i, args.dropout_e, + params=model.collect_params()) +else: + model = nlp.model.train.StandardRNN(args.model, len(vocab), args.emsize, + args.nhid, args.nlayers, args.dropout, args.tied) + model_eval = nlp.model.StandardRNN(args.model, len(vocab), args.emsize, + args.nhid, args.nlayers, args.dropout, args.tied, + params=model.collect_params()) + +model.initialize(mx.init.Xavier(), ctx=context) + +model.hybridize(static_alloc=True) + +print(model) + + +if args.optimizer == 'sgd': + trainer_params = {'learning_rate': args.lr, + 'momentum': 0, + 'wd': args.wd} +elif args.optimizer == 'adam': + trainer_params = {'learning_rate': args.lr, + 'wd': args.wd, + 'beta1': 0, + 'beta2': 0.999, + 'epsilon': 1e-9} + +trainer = gluon.Trainer(model.collect_params(), args.optimizer, trainer_params, + update_on_kvstore=False) + +loss = gluon.loss.SoftmaxCrossEntropyLoss() +train_loss = JointActivationRegularizationLoss(loss, args.alpha, args.beta) + +sampler = BatchVariableLenTextSampler(bptt=70, length=len(train_data)) +train_data_loader = mx.gluon.data.DataLoader(train_data, + batch_sampler=sampler) + + +train_metric = mx.metric.Loss(train_loss) +val_metric = mx.metric.Loss(loss) +batch_processor = LanguageModelBatchProcessor() +est = LanguageModelEstimator(net=model, loss=train_loss, + train_metrics=train_metric, + val_metrics=val_metric, + trainer=trainer, context=context, + evaluation_loss=loss, + eval_net=model_eval, + batch_processor=batch_processor) +event_handlers = [HiddenStateHandler(), AvgParamHandler(), + LearningRateHandler(lr_update_interval=args.lr_update_interval, lr_update_factor=args.lr_update_factor), + RNNGradientUpdateHandler(clip=args.clip), + LoggingHandler(log_interval=20, metrics=est.train_metrics + est.val_metrics)] +est.fit(train_data=train_data_loader, epochs=args.epochs, event_handlers=event_handlers, + batch_axis=1) diff --git 
a/src/gluonnlp/estimator/__init__.py b/src/gluonnlp/estimator/__init__.py new file mode 100644 index 0000000000..69172adde6 --- /dev/null +++ b/src/gluonnlp/estimator/__init__.py @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable=wildcard-import, unused-variable + +""" Gluon NLP Estimator Module """ +from .language_model_estimator import * +from .language_model_event_handler import * +from .language_model_batch_processor import * +from .loss import * + +__all__ = (language_model_estimator.__all__ + language_model_event_handler.__all__ + + language_model_batch_processor.__all__ + loss.__all__) diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py new file mode 100644 index 0000000000..d62a6c38db --- /dev/null +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -0,0 +1,81 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# coding: utf-8 +# pylint: disable=wildcard-import, unused-variable +""" Gluon Languange Model Estimator """ + +import mxnet as mx +from mxnet.gluon.contrib.estimator import BatchProcessor +from mxnet.gluon.utils import split_and_load + +__all__ = ['LanguageModelBatchProcessor'] + +class LanguageModelBatchProcessor(BatchProcessor): + def __init__(self): + pass + + def fit_batch(self, estimator, train_batch, batch_axis=0): + data = train_batch[:-1] + target = train_batch[1:] + batch_size = train_batch.shape[batch_axis] + data = split_and_load(data, estimator.context, batch_axis=batch_axis, even_split=True) + target = split_and_load(target, estimator.context, batch_axis=batch_axis, even_split=True) + + Ls = [] + outputs = [] + data_size = 0 + if estimator.hiddens is None: + estimator.hiddens = [estimator.net.begin_state(batch_size // len(estimator.context), + func=mx.nd.zeros, + ctx=ctx) for ctx in estimator.context] + else: + estimator.hiddens = estimator.detach(estimator.hiddens) + with mx.autograd.record(): + for i, (X, y, h) in enumerate(zip(data, target, estimator.hiddens)): + output, h, encoder_hs, dropped_encoder_hs = estimator.net(X, h) + l = estimator.loss(output, y, encoder_hs, dropped_encoder_hs) + Ls.append(l / (len(estimator.context) * X.size)) + estimator.hiddens[i] = h + outputs.append(output) + + for L in Ls: + L.backward() + + return data, target, outputs, Ls + + def evaluate_batch(self, estimator, val_batch, batch_axis=0): + batch_size = val_batch.shape[batch_axis] + val_batch = [split_and_load(x, ctx_list=estimator.context, batch_axis=batch_axis) for x in val_batch] + data, target = val_batch + Ls = [] + outputs = [] + if estimator.eval_hiddens is None: + estimator.eval_hiddens = \ + [estimator.eval_net.begin_state(batch_size // + len(estimator.context), func=mx.nd.zeros, ctx=ctx) for ctx \ + in estimator.context] + else: + estimator.eval_hiddens = estimator.detach(estimator.eval_hiddens) + for i, (X, y, h) in enumerate(zip(data, target, estimator.eval_hiddens)): + output, h = estimator.eval_net(X, h) + L = estimator.evaluation_loss(output.reshape(-3, -1), y.reshape(-1,)) + self.eval_hiddens[i] = h + Ls.append(L) + outputs.append(output) + + return data, target, outputs, Ls diff --git a/src/gluonnlp/estimator/language_model_estimator.py b/src/gluonnlp/estimator/language_model_estimator.py new file mode 100644 index 0000000000..39b4970f20 --- /dev/null +++ b/src/gluonnlp/estimator/language_model_estimator.py @@ -0,0 +1,69 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# coding: utf-8 +# pylint: disable=wildcard-import, unused-variable +""" Gluon Languange Model Estimator """ + +import copy +import warnings + +import numpy as np +import mxnet as mx +from mxnet.gluon.contrib.estimator import Estimator +from mxnet.gluon.utils import split_and_load +from mxnet.gluon.utils import clip_global_norm +from mxnet.metric import Loss as metric_loss +from .language_model_batch_processor import LanguageModelBatchProcessor + +__all__ = ['LanguageModelEstimator'] + +class LanguageModelEstimator(Estimator): + def __init__(self, net, loss, train_metrics=None, + val_metrics = None, + initializer=None, + trainer=None, + context=None, + evaluation_loss=None, + eval_net=None, + batch_processor=LanguageModelBatchProcessor(), + bptt=70): + super().__init__(net=net, loss=loss, + train_metrics=train_metrics, + val_metrics=val_metrics, + initializer=initializer, + trainer=trainer, + context=context, + evaluation_loss=evaluation_loss, + eval_net=eval_net, + batch_processor=batch_processor) + self.hiddens = None + self.eval_hiddens = None + self.avg_param = None + self.bptt = bptt + + self.total_L = 0 + self.ntotal = 0 + + def detach(self, hidden): + if isinstance(hidden, (tuple, list)): + hidden = [self.detach(h) for h in hidden] + else: + hidden = hidden.detach() + return hidden + + diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py new file mode 100644 index 0000000000..ed958c5592 --- /dev/null +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -0,0 +1,127 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# coding: utf-8 +# pylint: disable=wildcard-import, unused-variable +""" Gluon Language Model Event Handler """ + +import copy +import warnings + +import mxnet as mx +from mxnet.gluon.contrib.estimator import TrainBegin, TrainEnd, EpochBegin +from mxnet.gluon.contrib.estimator import EpochEnd, BatchBegin, BatchEnd +from mxnet.gluon.contrib.estimator import GradientUpdateHandler +from mxnet.gluon.utils import clip_global_norm + +__all__ = ['HiddenStateHandler', 'AvgParamHandler', 'LearningRateHandler', + 'RNNGradientUpdateHandler'] + +class HiddenStateHandler(EpochBegin): + def __init__(self): + pass + + def epoch_begin(self, estimator, *args, **kwargs): + estimator.hiddens = None + estimator.eval_hiddens = None + +class AvgParamHandler(BatchEnd, EpochEnd): + def __init__(self): + self.ntasgd = False + self.epoch_id = 0 + self.batch_id = 0 + self.avg_trigger = 0 + # self.ntasgd is always False during the first epoch + self.batches_per_epoch = 0 + self.t = 0 + self.n = 5 + self.valid_losses = [] + + def batch_end(self, estimator, *args, **kwargs): + parameters = estimator.net.collect_params() + if self.ntasgd: + if estimator.avg_param is None: + estimator.avg_param = {k.split(estimator.net._prefix)[1]: v.data(estimator.context[0]).copy() + for k, v in parameters.items()} + else: + gamma = 1. / max(1, self.epoch_id * (self.batches_per_epoch // estimator.bptt) + + self.batch_index - avg_trigger + 2) + for key, val in estimator.avg_param.items(): + val[:] += gamma * (parameters['{}{}'.format(estimator.net.__prefix, key)] + .data(estimator.context[0]) - val) + self.batch_id += 1 + + def epoch_end(self, estimator, *args, **kwargs): + parameters = estimator.net.collect_params() + self.batches_per_epoch = self.batch_id + if self.ntasgd == False and self.avg_trigger == 0: + if self.t > self.n and estimator.val_metrics > min(self.valid_losses[-self.n:]): + if estimator.avg_param is None: + estimator.avg_param = {k.split(estimator.net._prefix)[1]: v.data(estimator.context[0]).copy() + for k, v in parameters.items()} + else: + for key, val in parameters.items(): + estimator.avg_param[key.split(estimator.net._prefix)[1]] \ + = val.data(estimator.context[0]).copy() + self.avg_trigger = (self.epoch_id + 1) * (self.batches_per_epoch // estimator.bptt) + print('Switching to NTASGD and avg_trigger is : %d' % self.avg_trigger) + self.ntasgd = True + self.valid_losses.append(estimator.val_metrics) + self.t += 1 + self.batch_id = 0 + self.epoch_id += 1 + +class LearningRateHandler(BatchBegin, BatchEnd, EpochEnd): + def __init__(self, lr_update_interval=30, lr_update_factor=0.1): + self.lr_batch_start = 0 + self.best_val = float('Inf') + self.update_lr_epoch = 0 + self.lr_update_interval = lr_update_interval + self.lr_update_factor = lr_update_factor + + def batch_begin(self, estimator, *args, **kwargs): + batch = kwargs['batch'] + self.lr_batch_start = estimator.trainer.learning_rate + seq_len = batch.shape[0] - 1 + estimator.trainer.set_learning_rate(self.lr_batch_start * seq_len / estimator.bptt) + + def batch_end(self, estimator, *args, **kwargs): + estimator.trainer.set_learning_rate(self.lr_batch_start) + + def epoch_end(self, estimator, *args, **kwargs): + if estimator.val_metrics < self.best_val: + self.update_lr_epoch = 0 + self.best_val = estimator.val_metrics + else: + self.update_lr_epoch += 1 + if self.update_lr_epoch % self.lr_update_interval == 0 and self.update_lr_epoch != 0: + lr_scale = estimator.trainer.learning_rate * self.lr_update_factor + estimator.trainer.set_learning_rate(lr_scale) + 
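+            # reset the counter so the learning rate is only decayed again after
+            # another lr_update_interval epochs without validation improvement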
self.update_lr_epoch = 0 + +class RNNGradientUpdateHandler(GradientUpdateHandler): + def __init__(self, clip=None, **kwargs): + super().__init__(**kwargs) + self.clip = clip + + def batch_end(self, estimator, *args, **kwargs): + parameters = estimator.net.collect_params() + grads = [p.grad(ctx) for p in parameters.values() for ctx in estimator.context] + if self.clip is not None: + clip_global_norm(grads, self.clip) + + estimator.trainer.step(1) diff --git a/src/gluonnlp/estimator/loss.py b/src/gluonnlp/estimator/loss.py new file mode 100644 index 0000000000..98febf217e --- /dev/null +++ b/src/gluonnlp/estimator/loss.py @@ -0,0 +1,80 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +from mxnet import gluon +from ..loss import ActivationRegularizationLoss, TemporalActivationRegularizationLoss + +__all__ = ['JointActivationRegularizationLoss'] + +class JointActivationRegularizationLoss(gluon.loss.Loss): + r"""Computes Joint Regularization Loss with standard loss. + + The activation regularization refer to + gluonnlp.loss.ActivationRegularizationLoss. + + The temporal activation regularization refer to + gluonnlp.loss.TemporalActivationRegularizationLoss. + + Parameters + ---------- + loss : gluon.loss.Loss + The standard loss + alpha: float + The activation regularization parameter in gluonnlp.loss.ActivationRegularizationLoss + beta: float + The temporal activation regularization parameter in + gluonnlp.loss.TemporalActivationRegularizationLoss + + Inputs: + - **out**: NDArray + output tensor with shape `(sequence_length, batch_size, input_size)` + when `layout` is "TNC". + - **target**: NDArray + target tensor with shape `(sequence_length, batch_size, input_size)` + when `layout` is "TNC". + - **states**: the stack outputs from RNN, + which consists of output from each time step (TNC). + - **dropped_states**: the stack outputs from RNN with dropout, + which consists of output from each time step (TNC). + + Outputs: + - **loss**: loss tensor with shape (batch_size,). Dimensions other than + batch_axis are averaged out. 
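+
+    Examples
+    --------
+    A minimal usage sketch. The variable names are illustrative; `encoder_hs` and
+    `dropped_encoder_hs` are the per-step encoder outputs returned by an
+    AWD-LSTM style training network, as in the accompanying batch processor::
+
+        ce_loss = gluon.loss.SoftmaxCrossEntropyLoss()
+        joint_loss = JointActivationRegularizationLoss(ce_loss, alpha=2, beta=1)
+        out, hidden, encoder_hs, dropped_encoder_hs = net(data, hidden)
+        L = joint_loss(out, target, encoder_hs, dropped_encoder_hs)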
+ """ + + def __init__(self, l, alpha, beta, weight=None, batch_axis=None, **kwargs): + super(JointActivationRegularizationLoss, self).__init__(weight, batch_axis, **kwargs) + self._loss = l + self._alpha, self._beta = alpha, beta + if alpha: + self._ar_loss = ActivationRegularizationLoss(alpha) + if beta: + self._tar_loss = TemporalActivationRegularizationLoss(beta) + + def __repr__(self): + s = 'JointActivationTemporalActivationRegularizationLoss' + return s + + def hybrid_forward(self, F, out, target, states, dropped_states): # pylint: disable=arguments-differ + # pylint: disable=unused-argument + l = self._loss(out.reshape(-3, -1), target.reshape(-1,)) + if self._alpha: + l = l + self._ar_loss(*dropped_states) + if self._beta: + l = l + self._tar_loss(*states) + return l From 90c5144e99d21961e3a1bd389f400c3ec183e7ef Mon Sep 17 00:00:00 2001 From: Zhuanghua Liu Date: Fri, 13 Dec 2019 10:09:26 +0000 Subject: [PATCH 02/32] modify init file --- src/gluonnlp/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gluonnlp/__init__.py b/src/gluonnlp/__init__.py index 7a588e8233..f9772b95fc 100644 --- a/src/gluonnlp/__init__.py +++ b/src/gluonnlp/__init__.py @@ -30,6 +30,7 @@ from . import vocab from . import optimizer from . import initializer +from . import estimator from .vocab import Vocab __version__ = '0.10.0.dev' @@ -43,7 +44,8 @@ 'initializer', 'optimizer', 'utils', - 'metric'] + 'metric', + 'estimator'] warnings.filterwarnings(module='gluonnlp', action='default', category=DeprecationWarning) utils.version.check_version('1.6.0', warning_only=True, library=mxnet) From f7c730fa1653459645cf5fa70bd690d7cfe1bce5 Mon Sep 17 00:00:00 2001 From: Zhuanghua Liu Date: Tue, 17 Dec 2019 10:49:57 +0000 Subject: [PATCH 03/32] update language model estimator metrics computation --- .../word_language_model_estimator.py | 4 +++- .../language_model_batch_processor.py | 1 + .../estimator/language_model_event_handler.py | 23 ++++++++++++++++++- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/scripts/estimator/word_language_model_estimator.py b/scripts/estimator/word_language_model_estimator.py index 92fbba8b42..f075cadfa1 100644 --- a/scripts/estimator/word_language_model_estimator.py +++ b/scripts/estimator/word_language_model_estimator.py @@ -29,6 +29,7 @@ from gluonnlp.estimator import HiddenStateHandler, AvgParamHandler from gluonnlp.estimator import LearningRateHandler, RNNGradientUpdateHandler from gluonnlp.estimator import LanguageModelBatchProcessor +from gluonnlp.estimator import MetricResetHandler from mxnet.gluon.data.sampler import BatchSampler class BatchVariableLenTextSampler(BatchSampler): @@ -220,6 +221,7 @@ def __len__(self): event_handlers = [HiddenStateHandler(), AvgParamHandler(), LearningRateHandler(lr_update_interval=args.lr_update_interval, lr_update_factor=args.lr_update_factor), RNNGradientUpdateHandler(clip=args.clip), - LoggingHandler(log_interval=20, metrics=est.train_metrics + est.val_metrics)] + LoggingHandler(log_interval=args.log_interval, metrics=est.train_metrics + est.val_metrics), + MetricResetHandler(metrics=est.train_metrics, log_interval=args.log_interval)] est.fit(train_data=train_data_loader, epochs=args.epochs, event_handlers=event_handlers, batch_axis=1) diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index d62a6c38db..bbf1772cee 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ 
b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -56,6 +56,7 @@ def fit_batch(self, estimator, train_batch, batch_axis=0): for L in Ls: L.backward() + Ls = [l * (len(estimator.context) * X.size) for l in Ls] return data, target, outputs, Ls def evaluate_batch(self, estimator, val_batch, batch_axis=0): diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index ed958c5592..d0140506e6 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -26,10 +26,11 @@ from mxnet.gluon.contrib.estimator import TrainBegin, TrainEnd, EpochBegin from mxnet.gluon.contrib.estimator import EpochEnd, BatchBegin, BatchEnd from mxnet.gluon.contrib.estimator import GradientUpdateHandler +from mxnet.gluon.contrib.estimator import MetricHandler from mxnet.gluon.utils import clip_global_norm __all__ = ['HiddenStateHandler', 'AvgParamHandler', 'LearningRateHandler', - 'RNNGradientUpdateHandler'] + 'RNNGradientUpdateHandler', 'MetricResetHandler'] class HiddenStateHandler(EpochBegin): def __init__(self): @@ -119,9 +120,29 @@ def __init__(self, clip=None, **kwargs): self.clip = clip def batch_end(self, estimator, *args, **kwargs): + loss = kwargs['loss'] + loss_size = sum([l.size for l in loss]) parameters = estimator.net.collect_params() grads = [p.grad(ctx) for p in parameters.values() for ctx in estimator.context] if self.clip is not None: + # use multi context clipping later clip_global_norm(grads, self.clip) estimator.trainer.step(1) + +class MetricResetHandler(BatchBegin, MetricHandler): + def __init__(self, metrics, log_interval=1): + super().__init__(metrics=metrics) + self.batch_id = 0 + self.log_interval = log_interval + + def epoch_begin(self, estimator, *args, **kwargs): + self.batch_id = 0 + for metric in self.metrics: + metric.reset() + + def batch_begin(self, estimator, *args, **kwargs): + if self.batch_id % self.log_interval == 1: + for metric in self.metrics: + metric.reset_local() + self.batch_id += 1 From c90509a7a80b88d1fb40b71163a806a0ac14044e Mon Sep 17 00:00:00 2001 From: Zhuanghua Liu Date: Wed, 18 Dec 2019 10:29:21 +0000 Subject: [PATCH 04/32] fix and update language model estimator --- .../word_language_model_estimator.py | 27 ++++++++++----- .../language_model_batch_processor.py | 10 ++++-- .../estimator/language_model_estimator.py | 7 ++-- .../estimator/language_model_event_handler.py | 33 +++++++++++-------- 4 files changed, 48 insertions(+), 29 deletions(-) diff --git a/scripts/estimator/word_language_model_estimator.py b/scripts/estimator/word_language_model_estimator.py index f075cadfa1..9ced6697fb 100644 --- a/scripts/estimator/word_language_model_estimator.py +++ b/scripts/estimator/word_language_model_estimator.py @@ -33,15 +33,20 @@ from mxnet.gluon.data.sampler import BatchSampler class BatchVariableLenTextSampler(BatchSampler): - def __init__(self, bptt, length): + def __init__(self, bptt, length, use_variable_length=True): self.bptt = bptt self.length = length self.index = 0 + self.use_variable_length = use_variable_length def __iter__(self): + self.index = 0 while self.index < self.length - 2: - bptt = self.bptt if mx.nd.random.uniform().asscalar() < .95 else self.bptt / 2 - seq_len = max(5, int(mx.nd.random.normal(bptt, 5).asscalar())) + if self.use_variable_length: + bptt = self.bptt if mx.nd.random.uniform().asscalar() < .95 else self.bptt / 2 + seq_len = max(5, int(mx.nd.random.normal(bptt, 5).asscalar())) + else: + 
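+                # fall back to the fixed bptt length; the validation sampler below is
+                # constructed with use_variable_length=False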
seq_len = self.bptt seq_len = min(seq_len, self.length - self.index - 1) # batch_size = seq_len + 1 batch = [] @@ -53,7 +58,7 @@ def __iter__(self): def __len__(self): # you may never get real size of the data sampler beforehand. May need some # postprocessing after fetching the data batch - return self.length / 5 + 1 + return int(self.length / 5) + 1 curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.append(os.path.join(curr_path, '..', '..')) @@ -168,6 +173,7 @@ def __len__(self): model = nlp.model.train.AWDRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers, args.tied, args.dropout, args.weight_dropout, args.dropout_h, args.dropout_i, args.dropout_e) + model.initialize(mx.init.Xavier(), ctx=context) model_eval = nlp.model.AWDRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers, args.tied, args.dropout, args.weight_dropout, args.dropout_h, args.dropout_i, args.dropout_e, @@ -175,17 +181,16 @@ def __len__(self): else: model = nlp.model.train.StandardRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers, args.dropout, args.tied) + model.initialize(mx.init.Xavier(), ctx=context) model_eval = nlp.model.StandardRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers, args.dropout, args.tied, params=model.collect_params()) -model.initialize(mx.init.Xavier(), ctx=context) model.hybridize(static_alloc=True) print(model) - if args.optimizer == 'sgd': trainer_params = {'learning_rate': args.lr, 'momentum': 0, @@ -204,9 +209,11 @@ def __len__(self): train_loss = JointActivationRegularizationLoss(loss, args.alpha, args.beta) sampler = BatchVariableLenTextSampler(bptt=70, length=len(train_data)) +val_sampler = BatchVariableLenTextSampler(bptt=70, length=len(val_data), use_variable_length=False) train_data_loader = mx.gluon.data.DataLoader(train_data, batch_sampler=sampler) - +val_data_loader = mx.gluon.data.DataLoader(val_data, + batch_sampler=val_sampler) train_metric = mx.metric.Loss(train_loss) val_metric = mx.metric.Loss(loss) @@ -218,10 +225,12 @@ def __len__(self): evaluation_loss=loss, eval_net=model_eval, batch_processor=batch_processor) -event_handlers = [HiddenStateHandler(), AvgParamHandler(), +event_handlers = [HiddenStateHandler(), AvgParamHandler(data_length=len(train_data)), LearningRateHandler(lr_update_interval=args.lr_update_interval, lr_update_factor=args.lr_update_factor), RNNGradientUpdateHandler(clip=args.clip), LoggingHandler(log_interval=args.log_interval, metrics=est.train_metrics + est.val_metrics), MetricResetHandler(metrics=est.train_metrics, log_interval=args.log_interval)] -est.fit(train_data=train_data_loader, epochs=args.epochs, event_handlers=event_handlers, +est.fit(train_data=train_data_loader, val_data=val_data_loader, + epochs=args.epochs, + event_handlers=event_handlers, batch_axis=1) diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index bbf1772cee..4655dd620e 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -60,9 +60,13 @@ def fit_batch(self, estimator, train_batch, batch_axis=0): return data, target, outputs, Ls def evaluate_batch(self, estimator, val_batch, batch_axis=0): + batch_axis = 1 #temporary work around, removed after estimator is fixed + data = val_batch[:-1] + target = val_batch[1:] batch_size = val_batch.shape[batch_axis] - val_batch = [split_and_load(x, ctx_list=estimator.context, batch_axis=batch_axis) for 
x in val_batch] - data, target = val_batch + data = split_and_load(data, estimator.context, batch_axis=batch_axis, even_split=True) + target = split_and_load(target, estimator.context, batch_axis=batch_axis, even_split=True) + Ls = [] outputs = [] if estimator.eval_hiddens is None: @@ -75,7 +79,7 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): for i, (X, y, h) in enumerate(zip(data, target, estimator.eval_hiddens)): output, h = estimator.eval_net(X, h) L = estimator.evaluation_loss(output.reshape(-3, -1), y.reshape(-1,)) - self.eval_hiddens[i] = h + estimator.eval_hiddens[i] = h Ls.append(L) outputs.append(output) diff --git a/src/gluonnlp/estimator/language_model_estimator.py b/src/gluonnlp/estimator/language_model_estimator.py index 39b4970f20..4cae6b9166 100644 --- a/src/gluonnlp/estimator/language_model_estimator.py +++ b/src/gluonnlp/estimator/language_model_estimator.py @@ -41,7 +41,8 @@ def __init__(self, net, loss, train_metrics=None, evaluation_loss=None, eval_net=None, batch_processor=LanguageModelBatchProcessor(), - bptt=70): + bptt=70, + ntasgd=True): super().__init__(net=net, loss=loss, train_metrics=train_metrics, val_metrics=val_metrics, @@ -55,9 +56,7 @@ def __init__(self, net, loss, train_metrics=None, self.eval_hiddens = None self.avg_param = None self.bptt = bptt - - self.total_L = 0 - self.ntotal = 0 + self.ntasgd = ntasgd def detach(self, hidden): if isinstance(hidden, (tuple, list)): diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index d0140506e6..1401136749 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -41,16 +41,15 @@ def epoch_begin(self, estimator, *args, **kwargs): estimator.eval_hiddens = None class AvgParamHandler(BatchEnd, EpochEnd): - def __init__(self): + def __init__(self, data_length): self.ntasgd = False self.epoch_id = 0 self.batch_id = 0 self.avg_trigger = 0 - # self.ntasgd is always False during the first epoch - self.batches_per_epoch = 0 self.t = 0 self.n = 5 self.valid_losses = [] + self.data_length = data_length def batch_end(self, estimator, *args, **kwargs): parameters = estimator.net.collect_params() @@ -59,18 +58,21 @@ def batch_end(self, estimator, *args, **kwargs): estimator.avg_param = {k.split(estimator.net._prefix)[1]: v.data(estimator.context[0]).copy() for k, v in parameters.items()} else: - gamma = 1. / max(1, self.epoch_id * (self.batches_per_epoch // estimator.bptt) + - self.batch_index - avg_trigger + 2) + gamma = 1. 
/ max(1, self.epoch_id * (self.data_length // estimator.bptt) + + self.batch_id - self.avg_trigger + 2) for key, val in estimator.avg_param.items(): - val[:] += gamma * (parameters['{}{}'.format(estimator.net.__prefix, key)] + val[:] += gamma * (parameters['{}{}'.format(estimator.net._prefix, key)] .data(estimator.context[0]) - val) self.batch_id += 1 def epoch_end(self, estimator, *args, **kwargs): + if not isinstance(estimator.val_metrics, list): + val_metrics = [estimator.val_metrics] + else: + val_metrics = estimator.val_metrics parameters = estimator.net.collect_params() - self.batches_per_epoch = self.batch_id - if self.ntasgd == False and self.avg_trigger == 0: - if self.t > self.n and estimator.val_metrics > min(self.valid_losses[-self.n:]): + if self.avg_trigger == 0: + if self.t > self.n and val_metrics[0].get()[1] > min(self.valid_losses[-self.n:]): if estimator.avg_param is None: estimator.avg_param = {k.split(estimator.net._prefix)[1]: v.data(estimator.context[0]).copy() for k, v in parameters.items()} @@ -78,10 +80,10 @@ def epoch_end(self, estimator, *args, **kwargs): for key, val in parameters.items(): estimator.avg_param[key.split(estimator.net._prefix)[1]] \ = val.data(estimator.context[0]).copy() - self.avg_trigger = (self.epoch_id + 1) * (self.batches_per_epoch // estimator.bptt) + self.avg_trigger = (self.epoch_id + 1) * (self.data_length // estimator.bptt) print('Switching to NTASGD and avg_trigger is : %d' % self.avg_trigger) self.ntasgd = True - self.valid_losses.append(estimator.val_metrics) + self.valid_losses.append(val_metrics[0].get()[1]) self.t += 1 self.batch_id = 0 self.epoch_id += 1 @@ -104,9 +106,14 @@ def batch_end(self, estimator, *args, **kwargs): estimator.trainer.set_learning_rate(self.lr_batch_start) def epoch_end(self, estimator, *args, **kwargs): - if estimator.val_metrics < self.best_val: + if not isinstance(estimator.val_metrics, list): + val_metrics = [estimator.val_metrics] + else: + val_metrics = estimator.val_metrics + + if val_metrics[0].get()[1] < self.best_val: self.update_lr_epoch = 0 - self.best_val = estimator.val_metrics + self.best_val = val_metrics[0].get()[1] else: self.update_lr_epoch += 1 if self.update_lr_epoch % self.lr_update_interval == 0 and self.update_lr_epoch != 0: From 8540f4bfe29ad8f694c135132f0b190675840a71 Mon Sep 17 00:00:00 2001 From: Zhuanghua Liu Date: Wed, 18 Dec 2019 10:32:42 +0000 Subject: [PATCH 05/32] remove unnecessary argument from the language model estimator --- src/gluonnlp/estimator/language_model_estimator.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/gluonnlp/estimator/language_model_estimator.py b/src/gluonnlp/estimator/language_model_estimator.py index 4cae6b9166..ac763a3723 100644 --- a/src/gluonnlp/estimator/language_model_estimator.py +++ b/src/gluonnlp/estimator/language_model_estimator.py @@ -41,8 +41,7 @@ def __init__(self, net, loss, train_metrics=None, evaluation_loss=None, eval_net=None, batch_processor=LanguageModelBatchProcessor(), - bptt=70, - ntasgd=True): + bptt=70): super().__init__(net=net, loss=loss, train_metrics=train_metrics, val_metrics=val_metrics, @@ -56,7 +55,6 @@ def __init__(self, net, loss, train_metrics=None, self.eval_hiddens = None self.avg_param = None self.bptt = bptt - self.ntasgd = ntasgd def detach(self, hidden): if isinstance(hidden, (tuple, list)): From d030199f6f5581eb02ffa74d7ab508e78ea24daf Mon Sep 17 00:00:00 2001 From: Zhuanghua Liu Date: Thu, 19 Dec 2019 06:35:14 +0000 Subject: [PATCH 06/32] Add checkpoint handler for word 
language model --- .../word_language_model_estimator.py | 4 ++- .../estimator/language_model_estimator.py | 1 + .../estimator/language_model_event_handler.py | 31 ++++++++++++++++--- 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/scripts/estimator/word_language_model_estimator.py b/scripts/estimator/word_language_model_estimator.py index 9ced6697fb..6e6b5a8bb3 100644 --- a/scripts/estimator/word_language_model_estimator.py +++ b/scripts/estimator/word_language_model_estimator.py @@ -28,6 +28,7 @@ from gluonnlp.estimator import LanguageModelEstimator from gluonnlp.estimator import HiddenStateHandler, AvgParamHandler from gluonnlp.estimator import LearningRateHandler, RNNGradientUpdateHandler +from gluonnlp.estimator import WordLanguageModelCheckpointHandler from gluonnlp.estimator import LanguageModelBatchProcessor from gluonnlp.estimator import MetricResetHandler from mxnet.gluon.data.sampler import BatchSampler @@ -229,7 +230,8 @@ def __len__(self): LearningRateHandler(lr_update_interval=args.lr_update_interval, lr_update_factor=args.lr_update_factor), RNNGradientUpdateHandler(clip=args.clip), LoggingHandler(log_interval=args.log_interval, metrics=est.train_metrics + est.val_metrics), - MetricResetHandler(metrics=est.train_metrics, log_interval=args.log_interval)] + MetricResetHandler(metrics=est.train_metrics, log_interval=args.log_interval), + WordLanguageModelCheckpointHandler(args.save)] est.fit(train_data=train_data_loader, val_data=val_data_loader, epochs=args.epochs, event_handlers=event_handlers, diff --git a/src/gluonnlp/estimator/language_model_estimator.py b/src/gluonnlp/estimator/language_model_estimator.py index ac763a3723..6155e0e7c5 100644 --- a/src/gluonnlp/estimator/language_model_estimator.py +++ b/src/gluonnlp/estimator/language_model_estimator.py @@ -55,6 +55,7 @@ def __init__(self, net, loss, train_metrics=None, self.eval_hiddens = None self.avg_param = None self.bptt = bptt + self.ntasgd = False def detach(self, hidden): if isinstance(hidden, (tuple, list)): diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index 1401136749..46a6f259e1 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -30,7 +30,8 @@ from mxnet.gluon.utils import clip_global_norm __all__ = ['HiddenStateHandler', 'AvgParamHandler', 'LearningRateHandler', - 'RNNGradientUpdateHandler', 'MetricResetHandler'] + 'RNNGradientUpdateHandler', 'MetricResetHandler', + 'WordLanguageModelCheckpointHandler'] class HiddenStateHandler(EpochBegin): def __init__(self): @@ -42,7 +43,6 @@ def epoch_begin(self, estimator, *args, **kwargs): class AvgParamHandler(BatchEnd, EpochEnd): def __init__(self, data_length): - self.ntasgd = False self.epoch_id = 0 self.batch_id = 0 self.avg_trigger = 0 @@ -53,7 +53,7 @@ def __init__(self, data_length): def batch_end(self, estimator, *args, **kwargs): parameters = estimator.net.collect_params() - if self.ntasgd: + if estimator.ntasgd: if estimator.avg_param is None: estimator.avg_param = {k.split(estimator.net._prefix)[1]: v.data(estimator.context[0]).copy() for k, v in parameters.items()} @@ -82,7 +82,7 @@ def epoch_end(self, estimator, *args, **kwargs): = val.data(estimator.context[0]).copy() self.avg_trigger = (self.epoch_id + 1) * (self.data_length // estimator.bptt) print('Switching to NTASGD and avg_trigger is : %d' % self.avg_trigger) - self.ntasgd = True + estimator.ntasgd = True 
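+                # once NTASGD is triggered, batch_end keeps updating the running
+                # average and the checkpoint handler saves the averaged parameters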
self.valid_losses.append(val_metrics[0].get()[1]) self.t += 1 self.batch_id = 0 @@ -153,3 +153,26 @@ def batch_begin(self, estimator, *args, **kwargs): for metric in self.metrics: metric.reset_local() self.batch_id += 1 + +class WordLanguageModelCheckpointHandler(EpochEnd): + def __init__(self, save): + self.save = save + self.best_val = float('Inf') + + def epoch_end(self, estimator, *args, **kwargs): + if not isinstance(estimator.val_metrics, list): + val_metrics = [estimator.val_metrics] + else: + val_metrics = estimator.val_metrics + + if estimator.ntasgd: + mx.nd.save('{}.val.params'.format(self.save), estimator.avg_param) + else: + estimator.net.save_parameters('{}.val.params'.format(self.save)) + + if val_metrics[0].get()[1] < self.best_val: + self.best_val = val_metrics[0].get()[1] + if estimator.ntasgd: + mx.nd.save(self.save, estimator.avg_param) + else: + estimator.net.save_parameters(self.save) From 9aa824d808e466b120cefe0732c4f0c9088ef2d1 Mon Sep 17 00:00:00 2001 From: Zhuanghua Liu Date: Mon, 23 Dec 2019 04:41:57 +0000 Subject: [PATCH 07/32] Add large language model estimator --- .../large_word_language_model_estimator.py | 185 ++++++++++++++++++ scripts/estimator/sampler.py | 109 +++++++++++ .../word_language_model_estimator.py | 13 ++ .../language_model_batch_processor.py | 54 ++++- .../estimator/language_model_event_handler.py | 18 ++ .../estimator/parallel_language_model.py | 41 ++++ 6 files changed, 416 insertions(+), 4 deletions(-) create mode 100644 scripts/estimator/large_word_language_model_estimator.py create mode 100644 scripts/estimator/sampler.py create mode 100644 src/gluonnlp/estimator/parallel_language_model.py diff --git a/scripts/estimator/large_word_language_model_estimator.py b/scripts/estimator/large_word_language_model_estimator.py new file mode 100644 index 0000000000..c7ee01ff44 --- /dev/null +++ b/scripts/estimator/large_word_language_model_estimator.py @@ -0,0 +1,185 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import time +import math +import os +import sys +import argparse +import numpy as np +import mxnet as mx +from mxnet import gluon, autograd +import gluonnlp as nlp +from gluonnlp.utils import Parallel, Parallelizable +from sampler import LogUniformSampler + +curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +sys.path.append(os.path.join(curr_path, '..', '..')) + +nlp.utils.check_version('0.7.0') + +############################################################################### +# Arg parser +############################################################################### +parser = argparse.ArgumentParser(description= + 'Gluon-NLP Big LSTM 2048-512 Language Model on GBW') +parser.add_argument('--save', type=str, default='model.params', + help='path to save the final model.') +parser.add_argument('--emsize', type=int, default=512, + help='size of word embeddings') +parser.add_argument('--nhid', type=int, default=2048, + help='number of hidden units per layer') +parser.add_argument('--nproj', type=int, default=512, + help='number of projection units per layer. Could be different from embsize') +parser.add_argument('--nlayers', type=int, default=1, + help='number of layers') +parser.add_argument('--from-epoch', type=int, default=None, + help='start training or testing from the provided epoch') +parser.add_argument('--epochs', type=int, default=50, + help='number of epoch for training') +parser.add_argument('--batch-size', type=int, default=128, + help='batch size per gpu') +parser.add_argument('--dropout', type=float, default=0.1, + help='dropout applied to layers (0 = no dropout)') +parser.add_argument('--eps', type=float, default=1, + help='initial history accumulation for adagrad') +parser.add_argument('--bptt', type=int, default=20, + help='sequence length') +parser.add_argument('--k', type=int, default=8192, + help='number of noise samples for estimation') +parser.add_argument('--gpus', type=str, + help='list of gpus to run, e.g. 0 or 0,2,5. 
empty means using cpu.') +parser.add_argument('--log-interval', type=int, default=1000, + help='report interval') +parser.add_argument('--seed', type=int, default=0, + help='random seed') +parser.add_argument('--lr', type=float, default=0.2, + help='initial learning rate') +parser.add_argument('--clip', type=float, default=1.0, + help='gradient clipping by global norm.') +parser.add_argument('--test-mode', action='store_true', + help='Whether to run through the script with few examples') +parser.add_argument('--eval-only', action='store_true', + help='Whether to only run evaluation for the trained model') +args = parser.parse_args() + +segments = ['train', 'test'] +max_nbatch_eval = None + +if args.test_mode: + args.emsize = 200 + args.log_interval = 1 + args.nhid = 200 + args.nlayers = 1 + args.epochs = 20 + max_nbatch_eval = 3 + segments = ['test', 'test'] + +print(args) +mx.random.seed(args.seed) +np.random.seed(args.seed) + +context = [mx.cpu()] if args.gpus is None or args.gpus == '' else \ + [mx.gpu(int(x)) for x in args.gpus.split(',')] + +os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round' +os.environ['MXNET_CPU_PARALLEL_RAND_COPY'] = str(len(context)) +os.environ['MXNET_CPU_WORKER_NTHREADS'] = str(len(context)) + +############################################################################### +# Data stream +############################################################################### +train_data_stream, test_data_stream = \ + [nlp.data.GBWStream(segment=segment, skip_empty=True, bos=None, eos='') + for segment in segments] +vocab = train_data_stream.vocab +ntokens = len(vocab) + +# Sampler for generating negative classes during training with importance sampling +sampler = LogUniformSampler(ntokens, args.k) + +# Given a list of (array, context) pairs, load array[i] on context[i] +def _load(xs): + ret = [] + for x, ctx in zip(xs, context): + if isinstance(x, tuple): + ret.append([y.as_in_context(ctx) for y in x]) + else: + ret.append(x.as_in_context(ctx)) + return ret + +# Transformation for a data batch for training. +# First, load the data, target and mask to target contexts. +# Second, the LSTM-2048-512 model performs importance sampling for decoding +# during training, we need to sample negative candidate classes by invoking the +# log uniform sampler. 
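+# The transform below returns per-device lists (data, target, mask, sample),
+# where `sample` holds the log uniform sampler outputs for each device.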
+def _split_and_sample(x, y): + m = x != vocab[vocab.padding_token] # mask padding + num_ctx = len(context) + if num_ctx > 1: + xs = gluon.utils.split_data(x, num_ctx, batch_axis=1, even_split=True) + ys = gluon.utils.split_data(y, num_ctx, batch_axis=1, even_split=True) + ms = gluon.utils.split_data(m, num_ctx, batch_axis=1, even_split=True) + else: + xs, ys, ms = [x], [y], [m] + xs = _load(xs) + ys = _load(ys) + ms = _load(ms) + ss = [sampler(y) for y in ys] + ss = _load(ss) + return xs, ys, ms, ss + +train_batch_size = args.batch_size * len(context) +train_batchify = nlp.data.batchify.StreamBPTTBatchify(vocab, args.bptt, train_batch_size) +train_data = train_batchify(train_data_stream) +train_data = train_data.transform(_split_and_sample) + +test_batch_size = args.batch_size +test_batchify = nlp.data.batchify.StreamBPTTBatchify(vocab, args.bptt, test_batch_size) +test_data = test_batchify(test_data_stream) +test_data = nlp.data.PrefetchingStream(test_data) + +############################################################################### +# Build the model +############################################################################### + +eval_model = nlp.model.language_model.BigRNN(ntokens, args.emsize, args.nhid, + args.nlayers, args.nproj, + embed_dropout=args.dropout, + encode_dropout=args.dropout) +model = nlp.model.language_model.train.BigRNN(ntokens, args.emsize, args.nhid, + args.nlayers, args.nproj, args.k, + embed_dropout=args.dropout, + encode_dropout=args.dropout) +loss = gluon.loss.SoftmaxCrossEntropyLoss() +model.initialize(mx.init.Xavier(factor_type='out'), ctx=context) +trainer_params = {'learning_rate': args.lr, 'wd': 0, 'eps': args.eps} +trainer = gluon.Trainer(model.collect_params(), 'adagrad', trainer_params) +if args.from_epoch: + from_epoch = args.from_epoch + checkpoint_name = '%s.%s'%(args.save, format(from_epoch - 1, '02d')) + model.load_parameters(checkpoint_name) + trainer.load_states('%s.state'%args.save) + print('Loaded parameters from checkpoint %s'%(checkpoint_name)) + + +for i, batch in enumerate(train_data): + tmp = type(batch) + +model.hybridize(static_alloc=True, static_shape=True) +parallel_model = ParallelBigRNN(model, loss) +parallel = Parallel(len(context), parallel_model) diff --git a/scripts/estimator/sampler.py b/scripts/estimator/sampler.py new file mode 100644 index 0000000000..f841fba160 --- /dev/null +++ b/scripts/estimator/sampler.py @@ -0,0 +1,109 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Log Uniform Candidate Sampler""" + +import math +import numpy as np +from mxnet import ndarray, gluon + + +class LogUniformSampler(gluon.block.Block): + """Draw random samples from an approximately log-uniform or Zipfian distribution. 
+ + This operation randomly samples *num_sampled* candidates the range of integers [0, range_max). + The elements of sampled_candidates are drawn without replacement from the base distribution. + + The base distribution for this operator is an approximately log-uniform or Zipfian distribution: + + P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1) + + This sampler is useful when the true classes approximately follow such a distribution. + + For example, if the classes represent words in a lexicon sorted in decreasing order of + frequency. If your classes are not ordered by decreasing frequency, do not use this op. + + Additionally, it also returns the number of times each of the + true classes and the sampled classes is expected to occur. + + As the candidates are drawn without replacement, the expected count for the sampled candidates + and true classes are approximated. If the candidates are drawn with `num_tries` draws, we assume + (falsely) that the number of tries to get a batch of batch_size distinct values is always + `num_tries`, and the probability that the value is in a batch is 1 - (1-p)**num_tries. + + Parameters + ---------- + num_sampled: int + The number of classes to randomly sample. + range_max: int + The number of possible classes. + dtype: str or np.dtype + The dtype for outputs + """ + def __init__(self, range_max, num_sampled, dtype=None, **kwargs): + super(LogUniformSampler, self).__init__(**kwargs) + self._num_sampled = num_sampled + self._log_range = math.log(range_max + 1) + self._dtype = np.float32 if dtype is None else dtype + self._range_max = range_max + + def _prob_helper(self, num_tries, prob): + return (num_tries.astype('float64') * (-prob).log1p()).expm1() * -1 + + def forward(self, true_classes): # pylint: disable=arguments-differ + """Draw samples from log uniform distribution and returns sampled candidates, + expected count for true classes and sampled classes. + + Parameters + ---------- + true_classes: NDArray + The true classes. + + Returns + ------- + samples: NDArray + The sampled candidate classes. + expected_count_sample: NDArray + The expected count for sampled candidates. + expected_count_true: NDArray + The expected count for true classes in the same shape as `true_classes`. 
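+
+        Examples
+        --------
+        A rough usage sketch; `ntokens` and `true_classes` are illustrative and
+        come from the calling script::
+
+            sampler = LogUniformSampler(range_max=ntokens, num_sampled=8192)
+            samples, exp_count_sampled, exp_count_true = sampler(true_classes)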
+ """ + num_sampled = self._num_sampled + ctx = true_classes.context + num_tries = 0 + log_range = math.log(self._range_max + 1) + + # sample candidates + f = ndarray._internal._sample_unique_zipfian + sampled_classes, num_tries = f(self._range_max, shape=(1, num_sampled)) + sampled_classes = sampled_classes.reshape((-1,)) + sampled_classes = sampled_classes.as_in_context(ctx) + num_tries = num_tries.as_in_context(ctx) + + # expected count for true classes + true_cls = true_classes.as_in_context(ctx).astype('float64') + prob_true = ((true_cls + 2.0) / (true_cls + 1.0)).log() / log_range + count_true = self._prob_helper(num_tries, prob_true) + # expected count for sampled classes + sampled_classes = ndarray.array(sampled_classes, ctx=ctx, dtype='int64') + sampled_cls_fp64 = sampled_classes.astype('float64') + prob_sampled = ((sampled_cls_fp64 + 2.0) / (sampled_cls_fp64 + 1.0)).log() / log_range + count_sampled = self._prob_helper(num_tries, prob_sampled) + # convert to dtype + sampled_classes = sampled_classes.astype(self._dtype, copy=False) + count_true = count_true.astype(self._dtype, copy=False) + count_sampled = count_sampled.astype(self._dtype, copy=False) + return sampled_classes, count_sampled, count_true diff --git a/scripts/estimator/word_language_model_estimator.py b/scripts/estimator/word_language_model_estimator.py index 6e6b5a8bb3..aacd787cde 100644 --- a/scripts/estimator/word_language_model_estimator.py +++ b/scripts/estimator/word_language_model_estimator.py @@ -192,6 +192,19 @@ def __len__(self): print(model) + +def check_initialized(net): + params = net.collect_params() + for param in params: + try: + params[param].list_ctx() + except RuntimeError: + return False + return True + +print(check_initialized(model)) +print(check_initialized(model_eval)) + if args.optimizer == 'sgd': trainer_params = {'learning_rate': args.lr, 'momentum': 0, diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index 4655dd620e..411d93ad2c 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -35,16 +35,16 @@ def fit_batch(self, estimator, train_batch, batch_axis=0): batch_size = train_batch.shape[batch_axis] data = split_and_load(data, estimator.context, batch_axis=batch_axis, even_split=True) target = split_and_load(target, estimator.context, batch_axis=batch_axis, even_split=True) - - Ls = [] - outputs = [] - data_size = 0 if estimator.hiddens is None: estimator.hiddens = [estimator.net.begin_state(batch_size // len(estimator.context), func=mx.nd.zeros, ctx=ctx) for ctx in estimator.context] else: estimator.hiddens = estimator.detach(estimator.hiddens) + + Ls = [] + outputs = [] + data_size = 0 with mx.autograd.record(): for i, (X, y, h) in enumerate(zip(data, target, estimator.hiddens)): output, h, encoder_hs, dropped_encoder_hs = estimator.net(X, h) @@ -84,3 +84,49 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): outputs.append(output) return data, target, outputs, Ls + +class ParallelLanguageModelBatchProcessor(BatchProcessor): + def __init__(self): + pass + + def fit_batch(self, estimator, train_batch, batch_axis=0): + data, target, mask, sample = train_batch + batch_size = data.shape(batch_axis) + if estimator.hiddens is None: + estimator.hiddens = [estimator.net.begin_state(batch_size, + func=mx.nd.zeros, + ctx=ctx) for ctx in estimator.context] + else: + estimator.hiddens = estimator.detach(estimator.hiddens) + Ls = 
[] + for _, batch in enumerate(zip(data, target, mask, sample, hiddens)): + paralllel.put(batch) + + for _ in range(len(data)): + hidden, ls = parallel.get() + index = estimator.context.index(hidden[0].context) + estimator.hiddens[index] = hidden + Ls.append(ls) + + #Ls = [l / estimator.bptt for l in Ls] + return data, target, hiddens, Ls + + def evaluate_batch(self, estimator, val_batch, batch_axis=0): + data, target = val_batch + ctx = estimator.context[0] + data = data.as_in_context(ctx) + target = target.as_in_context(ctx) + if estimator.eval_hiddens is None: + estimator.eval_hiddens = estimator.eval_net.begin_state(batch_size=batch_size, + func=mx.nd.zeros, + ctx=ctx) + else: + estimator.eval_hiddens = estimator.detach(estimator.eval_hiddens) + + mask = data != vocab[vocab.padding_token] + output, estimator.eval_hiddens = estimator.eval_net(data, estimator.eval_hiddens) + output = output.reshape((-3, -1)) + L = estimator.evaluation_loss(output, target.reshape(-1, ) * mask.reshape(-1)) + L = L * mask + + return data, target, output, L diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index 46a6f259e1..378e152a14 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -137,6 +137,24 @@ def batch_end(self, estimator, *args, **kwargs): estimator.trainer.step(1) +class LargeRNNGradientUpdateHandler(GradientUpdateHandler): + def __init__(self, batch_size, clip=None, **kwargs): + super().__init__(**kwargs) + self.batch_size = batch_size + self.clip = clip + + def batch_end(self, estimator, *args, **kwargs): + encoder_params = estimator.net.encoder.collect_params().values() + embedding_params = list(estimator.net.embedding.collect_params().values()) + + for ctx in estimator.context: + x = embedding_params[0].grad(ctx) + x[:] *= self.batch_size # can I get the batch size dynamically? + encoder_grad = [p.grad(ctx) for p in encoder_params] + gluon.utils.clip_global_norm(encoder_grad, self.clip) + + estimator.trainer.step(len(estimator.context)) + class MetricResetHandler(BatchBegin, MetricHandler): def __init__(self, metrics, log_interval=1): super().__init__(metrics=metrics) diff --git a/src/gluonnlp/estimator/parallel_language_model.py b/src/gluonnlp/estimator/parallel_language_model.py new file mode 100644 index 0000000000..8697d90360 --- /dev/null +++ b/src/gluonnlp/estimator/parallel_language_model.py @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# coding: utf-8 +# pylint: disable=wildcard-import, unused-variable +""" Gluon Parallel Languange Model """ + +from gluonnlp.utils import Parallel, Parallelizable + +__all__ = ['ParallelBigRNN'] + +class ParallelBigRNN(Parallelizable): + def __init__(self, rnn, loss_fn): + self._model = rnn + self._loss = loss_fn + + def forward_backward(self, x): + X, y, m, s, h = x + with autograd.record(): + output, hidden, new_target = self._model(X, y, h, s) + output = output.reshape((-3, -1)) + new_target = new_target.reshape((-1,)) + ls = self._loss(output, new_target) * m.reshape((-1,)) + ls = ls / args.batch_size + ls.backward() + return hidden, ls + From 06295ef0dfab06684a5d697a2fdc422496aee2ff Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 16 Jan 2020 08:02:59 +0000 Subject: [PATCH 08/32] fix name errors --- .../word_language_model_estimator.py | 4 +-- .../language_model_batch_processor.py | 26 +++++++++---------- .../estimator/language_model_estimator.py | 10 +++---- .../estimator/language_model_event_handler.py | 2 +- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/scripts/estimator/word_language_model_estimator.py b/scripts/estimator/word_language_model_estimator.py index aacd787cde..1138a43e8d 100644 --- a/scripts/estimator/word_language_model_estimator.py +++ b/scripts/estimator/word_language_model_estimator.py @@ -236,8 +236,8 @@ def check_initialized(net): train_metrics=train_metric, val_metrics=val_metric, trainer=trainer, context=context, - evaluation_loss=loss, - eval_net=model_eval, + val_loss=loss, + val_net=model_eval, batch_processor=batch_processor) event_handlers = [HiddenStateHandler(), AvgParamHandler(data_length=len(train_data)), LearningRateHandler(lr_update_interval=args.lr_update_interval, lr_update_factor=args.lr_update_factor), diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index 411d93ad2c..b6a72777ca 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -69,17 +69,17 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): Ls = [] outputs = [] - if estimator.eval_hiddens is None: - estimator.eval_hiddens = \ - [estimator.eval_net.begin_state(batch_size // + if estimator.val_hiddens is None: + estimator.val_hiddens = \ + [estimator.val_net.begin_state(batch_size // len(estimator.context), func=mx.nd.zeros, ctx=ctx) for ctx \ in estimator.context] else: - estimator.eval_hiddens = estimator.detach(estimator.eval_hiddens) - for i, (X, y, h) in enumerate(zip(data, target, estimator.eval_hiddens)): - output, h = estimator.eval_net(X, h) - L = estimator.evaluation_loss(output.reshape(-3, -1), y.reshape(-1,)) - estimator.eval_hiddens[i] = h + estimator.val_hiddens = estimator.detach(estimator.val_hiddens) + for i, (X, y, h) in enumerate(zip(data, target, estimator.val_hiddens)): + output, h = estimator.val_net(X, h) + L = estimator.val_loss(output.reshape(-3, -1), y.reshape(-1,)) + estimator.val_hiddens[i] = h Ls.append(L) outputs.append(output) @@ -116,17 +116,17 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): ctx = estimator.context[0] data = data.as_in_context(ctx) target = target.as_in_context(ctx) - if estimator.eval_hiddens is None: - estimator.eval_hiddens = estimator.eval_net.begin_state(batch_size=batch_size, + if estimator.val_hiddens is None: + estimator.val_hiddens = estimator.val_net.begin_state(batch_size=batch_size, func=mx.nd.zeros, ctx=ctx) else: - 
estimator.eval_hiddens = estimator.detach(estimator.eval_hiddens) + estimator.val_hiddens = estimator.detach(estimator.val_hiddens) mask = data != vocab[vocab.padding_token] - output, estimator.eval_hiddens = estimator.eval_net(data, estimator.eval_hiddens) + output, estimator.val_hiddens = estimator.val_net(data, estimator.val_hiddens) output = output.reshape((-3, -1)) - L = estimator.evaluation_loss(output, target.reshape(-1, ) * mask.reshape(-1)) + L = estimator.val_loss(output, target.reshape(-1, ) * mask.reshape(-1)) L = L * mask return data, target, output, L diff --git a/src/gluonnlp/estimator/language_model_estimator.py b/src/gluonnlp/estimator/language_model_estimator.py index 6155e0e7c5..808eabb27d 100644 --- a/src/gluonnlp/estimator/language_model_estimator.py +++ b/src/gluonnlp/estimator/language_model_estimator.py @@ -38,8 +38,8 @@ def __init__(self, net, loss, train_metrics=None, initializer=None, trainer=None, context=None, - evaluation_loss=None, - eval_net=None, + val_loss=None, + val_net=None, batch_processor=LanguageModelBatchProcessor(), bptt=70): super().__init__(net=net, loss=loss, @@ -48,11 +48,11 @@ def __init__(self, net, loss, train_metrics=None, initializer=initializer, trainer=trainer, context=context, - evaluation_loss=evaluation_loss, - eval_net=eval_net, + val_loss=val_loss, + val_net=val_net, batch_processor=batch_processor) self.hiddens = None - self.eval_hiddens = None + self.val_hiddens = None self.avg_param = None self.bptt = bptt self.ntasgd = False diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index 378e152a14..b735fc601e 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -39,7 +39,7 @@ def __init__(self): def epoch_begin(self, estimator, *args, **kwargs): estimator.hiddens = None - estimator.eval_hiddens = None + estimator.val_hiddens = None class AvgParamHandler(BatchEnd, EpochEnd): def __init__(self, data_length): From 17ef38cc1b0b94439e96333a9466979e96565efa Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 17 Jan 2020 10:05:44 +0000 Subject: [PATCH 09/32] add word language model evaluation code --- scripts/estimator/word_language_model_estimator.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/estimator/word_language_model_estimator.py b/scripts/estimator/word_language_model_estimator.py index 1138a43e8d..8dec869d12 100644 --- a/scripts/estimator/word_language_model_estimator.py +++ b/scripts/estimator/word_language_model_estimator.py @@ -224,10 +224,14 @@ def check_initialized(net): sampler = BatchVariableLenTextSampler(bptt=70, length=len(train_data)) val_sampler = BatchVariableLenTextSampler(bptt=70, length=len(val_data), use_variable_length=False) +test_sampler = BatchVariableLenTextSampler(bptt=70, length=len(test_data), + use_variable_length=False) train_data_loader = mx.gluon.data.DataLoader(train_data, batch_sampler=sampler) val_data_loader = mx.gluon.data.DataLoader(val_data, batch_sampler=val_sampler) +test_data_loader = mx.gluon.data.DataLoader(test_data, + batch_sampler=test_sampler) train_metric = mx.metric.Loss(train_loss) val_metric = mx.metric.Loss(loss) @@ -249,3 +253,6 @@ def check_initialized(net): epochs=args.epochs, event_handlers=event_handlers, batch_axis=1) + +est.evaluate(val_data=val_data_loader, event_handlers=[HiddenStateHandler()], batch_axis=1) +est.evaluate(val_data=test_data_loader, event_handlers=[HiddenStateHandler()], batch_axis=1) From 
87651c5d5f5544bfa1016a26a3ae03449af220cc Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 17 Jan 2020 10:35:00 +0000 Subject: [PATCH 10/32] update parallel language model --- .../language_model_batch_processor.py | 20 +++++++++++++------ .../estimator/language_model_event_handler.py | 5 +++-- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index b6a72777ca..8cab67b478 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -22,8 +22,10 @@ import mxnet as mx from mxnet.gluon.contrib.estimator import BatchProcessor from mxnet.gluon.utils import split_and_load +from ..utils import Parallel +from .parallel_language_model import ParallelBigRNN -__all__ = ['LanguageModelBatchProcessor'] +__all__ = ['LanguageModelBatchProcessor', 'ParallelLanguageModelBatchProcessor'] class LanguageModelBatchProcessor(BatchProcessor): def __init__(self): @@ -86,10 +88,16 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): return data, target, outputs, Ls class ParallelLanguageModelBatchProcessor(BatchProcessor): - def __init__(self): - pass + def __init__(self, loss): + self.loss = loss + + def _get_parallel_model(self): + if self.parallel_model is None: + self.parallel_model = ParallelBigRNN(estimator.net, self.loss) + self.parallel_model = Parallel(len(estimator.context), self.parallel_model) def fit_batch(self, estimator, train_batch, batch_axis=0): + self._get_parallel_model() data, target, mask, sample = train_batch batch_size = data.shape(batch_axis) if estimator.hiddens is None: @@ -100,16 +108,16 @@ def fit_batch(self, estimator, train_batch, batch_axis=0): estimator.hiddens = estimator.detach(estimator.hiddens) Ls = [] for _, batch in enumerate(zip(data, target, mask, sample, hiddens)): - paralllel.put(batch) + self.parallel_model.put(batch) for _ in range(len(data)): - hidden, ls = parallel.get() + hidden, ls = self.parallel_model.get() index = estimator.context.index(hidden[0].context) estimator.hiddens[index] = hidden Ls.append(ls) #Ls = [l / estimator.bptt for l in Ls] - return data, target, hiddens, Ls + return data, target, None, Ls def evaluate_batch(self, estimator, val_batch, batch_axis=0): data, target = val_batch diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index b735fc601e..609372735f 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -31,7 +31,8 @@ __all__ = ['HiddenStateHandler', 'AvgParamHandler', 'LearningRateHandler', 'RNNGradientUpdateHandler', 'MetricResetHandler', - 'WordLanguageModelCheckpointHandler'] + 'WordLanguageModelCheckpointHandler', + 'LargeRNNGradientUpdateHandler'] class HiddenStateHandler(EpochBegin): def __init__(self): @@ -149,7 +150,7 @@ def batch_end(self, estimator, *args, **kwargs): for ctx in estimator.context: x = embedding_params[0].grad(ctx) - x[:] *= self.batch_size # can I get the batch size dynamically? 
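+            # the loss in ParallelBigRNN is scaled by 1/batch_size, so the embedding
+            # gradient is rescaled here before the encoder gradients are clipped below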
+ x[:] *= self.batch_size encoder_grad = [p.grad(ctx) for p in encoder_params] gluon.utils.clip_global_norm(encoder_grad, self.clip) From e56572321d9ca75bf1c944695fbd1043d2cb209d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 17 Jan 2020 11:06:44 +0000 Subject: [PATCH 11/32] update large language model estimator --- .../large_word_language_model_estimator.py | 40 ++++++++++++++++--- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/scripts/estimator/large_word_language_model_estimator.py b/scripts/estimator/large_word_language_model_estimator.py index c7ee01ff44..7ea7bcbb96 100644 --- a/scripts/estimator/large_word_language_model_estimator.py +++ b/scripts/estimator/large_word_language_model_estimator.py @@ -26,6 +26,10 @@ import gluonnlp as nlp from gluonnlp.utils import Parallel, Parallelizable from sampler import LogUniformSampler +from gluonnlp.estimator import ParallelLanguageModelBatchProcessor +from gluonnlp.estimator import HiddenStateHandler, MetricResetHandler +from gluonnlp.estimator import LargeRNNGradientUpdateHandler +from gluonnlp.estimator import WordLanguageModelCheckpointHandler curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.append(os.path.join(curr_path, '..', '..')) @@ -157,14 +161,16 @@ def _split_and_sample(x, y): # Build the model ############################################################################### -eval_model = nlp.model.language_model.BigRNN(ntokens, args.emsize, args.nhid, - args.nlayers, args.nproj, - embed_dropout=args.dropout, - encode_dropout=args.dropout) model = nlp.model.language_model.train.BigRNN(ntokens, args.emsize, args.nhid, args.nlayers, args.nproj, args.k, embed_dropout=args.dropout, encode_dropout=args.dropout) +eval_model = nlp.model.language_model.BigRNN(ntokens, args.emsize, args.nhid, + args.nlayers, args.nproj, + embed_dropout=args.dropout, + encode_dropout=args.dropout, + params=model.collect_params()) + loss = gluon.loss.SoftmaxCrossEntropyLoss() model.initialize(mx.init.Xavier(factor_type='out'), ctx=context) trainer_params = {'learning_rate': args.lr, 'wd': 0, 'eps': args.eps} @@ -181,5 +187,27 @@ def _split_and_sample(x, y): tmp = type(batch) model.hybridize(static_alloc=True, static_shape=True) -parallel_model = ParallelBigRNN(model, loss) -parallel = Parallel(len(context), parallel_model) + +train_metric = mx.metric.Loss(loss) +val_metric = mx.metric.Loss(loss) +batch_processor = ParallelLanguageModelBatchProcessor(loss) +lm_estimator = LanguageModelEstimator(net=model, loss=loss, + train_metrics=train_metric, + val_metrics=val_metric, + trainer=trainer, + context=context, + val_loss=loss, + val_net=eval_model, + batch_processor=batch_processor) + +hidden_state_handler = HiddenStateHandler() +gradient_handler = LargeRNNGradientUpdateHandler(batch_size=args.batch_size, clip=args.clip) +metric_handler = MetricResetHandler(metrics=est.train_metrics, + log_interval=args.log_interval) +checkpoint_handler = WrodLanguageModelCheckpointHandler(args.save) + +event_handlers = [hidden_state_handler, gradient_handler, + metric_handler, checkpoint_handler] + +lm_estimator.fit(train_data=train_data, epochs=args.epochs, + event_handlers=event_handlers, batch_axis=0) From cfc2f6ddde095f2fabe1c513fa2a9e87251820e6 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 17 Jan 2020 11:27:18 +0000 Subject: [PATCH 12/32] fix typos --- scripts/estimator/large_word_language_model_estimator.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git 
a/scripts/estimator/large_word_language_model_estimator.py b/scripts/estimator/large_word_language_model_estimator.py index 7ea7bcbb96..a5d6468c23 100644 --- a/scripts/estimator/large_word_language_model_estimator.py +++ b/scripts/estimator/large_word_language_model_estimator.py @@ -30,6 +30,7 @@ from gluonnlp.estimator import HiddenStateHandler, MetricResetHandler from gluonnlp.estimator import LargeRNNGradientUpdateHandler from gluonnlp.estimator import WordLanguageModelCheckpointHandler +from gluonnlp.estimator import LanguageModelEstimator curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.append(os.path.join(curr_path, '..', '..')) @@ -183,9 +184,6 @@ def _split_and_sample(x, y): print('Loaded parameters from checkpoint %s'%(checkpoint_name)) -for i, batch in enumerate(train_data): - tmp = type(batch) - model.hybridize(static_alloc=True, static_shape=True) train_metric = mx.metric.Loss(loss) @@ -202,9 +200,9 @@ def _split_and_sample(x, y): hidden_state_handler = HiddenStateHandler() gradient_handler = LargeRNNGradientUpdateHandler(batch_size=args.batch_size, clip=args.clip) -metric_handler = MetricResetHandler(metrics=est.train_metrics, +metric_handler = MetricResetHandler(metrics=lm_estimator.train_metrics, log_interval=args.log_interval) -checkpoint_handler = WrodLanguageModelCheckpointHandler(args.save) +checkpoint_handler = WordLanguageModelCheckpointHandler(args.save) event_handlers = [hidden_state_handler, gradient_handler, metric_handler, checkpoint_handler] From 3bf7679f2376621290d5eb12e2501a6827f5e660 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 19 Jan 2020 09:41:53 +0000 Subject: [PATCH 13/32] fix large language model estimator bugs --- .../large_word_language_model_estimator.py | 30 ++++++-- .../language_model_batch_processor.py | 28 ++++--- .../estimator/language_model_event_handler.py | 27 +++++-- .../estimator/length_normalized_loss.py | 77 +++++++++++++++++++ .../estimator/parallel_language_model.py | 6 +- 5 files changed, 141 insertions(+), 27 deletions(-) create mode 100644 src/gluonnlp/estimator/length_normalized_loss.py diff --git a/scripts/estimator/large_word_language_model_estimator.py b/scripts/estimator/large_word_language_model_estimator.py index a5d6468c23..49ce2fb458 100644 --- a/scripts/estimator/large_word_language_model_estimator.py +++ b/scripts/estimator/large_word_language_model_estimator.py @@ -23,6 +23,7 @@ import numpy as np import mxnet as mx from mxnet import gluon, autograd +from mxnet.gluon.contrib.estimator import CheckpointHandler import gluonnlp as nlp from gluonnlp.utils import Parallel, Parallelizable from sampler import LogUniformSampler @@ -31,6 +32,7 @@ from gluonnlp.estimator import LargeRNNGradientUpdateHandler from gluonnlp.estimator import WordLanguageModelCheckpointHandler from gluonnlp.estimator import LanguageModelEstimator +from gluonnlp.estimator.length_normalized_loss import LengthNormalizedLoss curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.append(os.path.join(curr_path, '..', '..')) @@ -169,8 +171,7 @@ def _split_and_sample(x, y): eval_model = nlp.model.language_model.BigRNN(ntokens, args.emsize, args.nhid, args.nlayers, args.nproj, embed_dropout=args.dropout, - encode_dropout=args.dropout, - params=model.collect_params()) + encode_dropout=args.dropout) loss = gluon.loss.SoftmaxCrossEntropyLoss() model.initialize(mx.init.Xavier(factor_type='out'), ctx=context) @@ -187,8 +188,11 @@ def _split_and_sample(x, y): model.hybridize(static_alloc=True, 
static_shape=True) train_metric = mx.metric.Loss(loss) -val_metric = mx.metric.Loss(loss) -batch_processor = ParallelLanguageModelBatchProcessor(loss) +val_metric = LengthNormalizedLoss(loss) +batch_processor = ParallelLanguageModelBatchProcessor(loss=loss, + vocab=vocab, + batch_size=args.batch_size, + val_batch_size=args.batch_size) lm_estimator = LanguageModelEstimator(net=model, loss=loss, train_metrics=train_metric, val_metrics=val_metric, @@ -196,16 +200,26 @@ def _split_and_sample(x, y): context=context, val_loss=loss, val_net=eval_model, - batch_processor=batch_processor) + batch_processor=batch_processor, + bptt=args.bptt) hidden_state_handler = HiddenStateHandler() gradient_handler = LargeRNNGradientUpdateHandler(batch_size=args.batch_size, clip=args.clip) metric_handler = MetricResetHandler(metrics=lm_estimator.train_metrics, log_interval=args.log_interval) -checkpoint_handler = WordLanguageModelCheckpointHandler(args.save) +checkpoint_handler = CheckpointHandler(model_dir=args.save, model_prefix='largeRNN') event_handlers = [hidden_state_handler, gradient_handler, metric_handler, checkpoint_handler] -lm_estimator.fit(train_data=train_data, epochs=args.epochs, - event_handlers=event_handlers, batch_axis=0) +lm_estimator.fit(train_data=train_data, + #epochs=args.epochs, + event_handlers=event_handlers, + batches=5, + batch_axis=0) + +val_metric_handler = MetricResetHandler(metrics=lm_estimator.val_metrics) +lm_estimator.val_net.initialize(mx.init.Xavier(), ctx=context[0]) +lm_estimator.val_net.hybridize(static_alloc=True, static_shape=True) +lm_estimator.val_net.load_parameters(args.save + '/largeRNN-epoch0batch5.params') +lm_estimator.evaluate(val_data=test_data, event_handlers=[val_metric_handler]) diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index 8cab67b478..d1be813fad 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -88,26 +88,29 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): return data, target, outputs, Ls class ParallelLanguageModelBatchProcessor(BatchProcessor): - def __init__(self, loss): + def __init__(self, loss, vocab, batch_size, val_batch_size): self.loss = loss + self.parallel_model = None + self.batch_size = batch_size + self.val_batch_size = val_batch_size + self.vocab = vocab - def _get_parallel_model(self): + def _get_parallel_model(self, estimator): if self.parallel_model is None: - self.parallel_model = ParallelBigRNN(estimator.net, self.loss) + self.parallel_model = ParallelBigRNN(estimator.net, self.loss, self.batch_size) self.parallel_model = Parallel(len(estimator.context), self.parallel_model) def fit_batch(self, estimator, train_batch, batch_axis=0): - self._get_parallel_model() + self._get_parallel_model(estimator) data, target, mask, sample = train_batch - batch_size = data.shape(batch_axis) if estimator.hiddens is None: - estimator.hiddens = [estimator.net.begin_state(batch_size, + estimator.hiddens = [estimator.net.begin_state(batch_size=self.batch_size, func=mx.nd.zeros, ctx=ctx) for ctx in estimator.context] else: estimator.hiddens = estimator.detach(estimator.hiddens) Ls = [] - for _, batch in enumerate(zip(data, target, mask, sample, hiddens)): + for _, batch in enumerate(zip(data, target, mask, sample, estimator.hiddens)): self.parallel_model.put(batch) for _ in range(len(data)): @@ -116,7 +119,8 @@ def fit_batch(self, estimator, train_batch, batch_axis=0): 
estimator.hiddens[index] = hidden Ls.append(ls) - #Ls = [l / estimator.bptt for l in Ls] + Ls = [l / (estimator.bptt * len(estimator.context)) for l in Ls] + Ls = [mx.nd.sum(l) for l in Ls] return data, target, None, Ls def evaluate_batch(self, estimator, val_batch, batch_axis=0): @@ -125,16 +129,16 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): data = data.as_in_context(ctx) target = target.as_in_context(ctx) if estimator.val_hiddens is None: - estimator.val_hiddens = estimator.val_net.begin_state(batch_size=batch_size, + estimator.val_hiddens = estimator.val_net.begin_state(batch_size=self.val_batch_size, func=mx.nd.zeros, ctx=ctx) else: estimator.val_hiddens = estimator.detach(estimator.val_hiddens) - mask = data != vocab[vocab.padding_token] + mask = data != self.vocab[self.vocab.padding_token] + mask = mask.reshape(-1) output, estimator.val_hiddens = estimator.val_net(data, estimator.val_hiddens) output = output.reshape((-3, -1)) L = estimator.val_loss(output, target.reshape(-1, ) * mask.reshape(-1)) - L = L * mask - return data, target, output, L + return data, [target, mask], output, L diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index 609372735f..b9ceca599e 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -28,6 +28,8 @@ from mxnet.gluon.contrib.estimator import GradientUpdateHandler from mxnet.gluon.contrib.estimator import MetricHandler from mxnet.gluon.utils import clip_global_norm +from mxnet.metric import Loss as MetricLoss +from .length_normalized_loss import LengthNormalizedLoss __all__ = ['HiddenStateHandler', 'AvgParamHandler', 'LearningRateHandler', 'RNNGradientUpdateHandler', 'MetricResetHandler', @@ -152,12 +154,12 @@ def batch_end(self, estimator, *args, **kwargs): x = embedding_params[0].grad(ctx) x[:] *= self.batch_size encoder_grad = [p.grad(ctx) for p in encoder_params] - gluon.utils.clip_global_norm(encoder_grad, self.clip) + clip_global_norm(encoder_grad, self.clip) estimator.trainer.step(len(estimator.context)) class MetricResetHandler(BatchBegin, MetricHandler): - def __init__(self, metrics, log_interval=1): + def __init__(self, metrics, log_interval=None): super().__init__(metrics=metrics) self.batch_id = 0 self.log_interval = log_interval @@ -168,11 +170,24 @@ def epoch_begin(self, estimator, *args, **kwargs): metric.reset() def batch_begin(self, estimator, *args, **kwargs): - if self.batch_id % self.log_interval == 1: - for metric in self.metrics: - metric.reset_local() + if self.log_interval is not None: + if self.batch_id % self.log_interval == 0: + for metric in self.metrics: + metric.reset_local() self.batch_id += 1 + def batch_end(self, estimator, *args, **kwargs): + pred = kwargs['pred'] + label = kwargs['label'] + loss = kwargs['loss'] + for metric in self.metrics: + if isinstance(metric, MetricLoss): + metric.update(0, loss) + elif isinstance(metric, LengthNormalizedLoss): + metric.update(label, loss) + else: + metric.update(label, pred) + class WordLanguageModelCheckpointHandler(EpochEnd): def __init__(self, save): self.save = save @@ -195,3 +210,5 @@ def epoch_end(self, estimator, *args, **kwargs): mx.nd.save(self.save, estimator.avg_param) else: estimator.net.save_parameters(self.save) + + diff --git a/src/gluonnlp/estimator/length_normalized_loss.py b/src/gluonnlp/estimator/length_normalized_loss.py new file mode 100644 index 0000000000..e4558c6fb1 --- /dev/null +++ 
b/src/gluonnlp/estimator/length_normalized_loss.py @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" Length Normalized Loss """ + +from mxnet import ndarray +from mxnet.metric import EvalMetric + +__all__ = ['LengthNormalizedLoss'] + +class LengthNormalizedLoss(EvalMetric): + """Compute length normalized loss metrics + + Parameters + ---------- + axis : int, default=1 + The axis that represents classes + name : str + Name of this metric instance for display. + output_names : list of str, or None + Name of predictions that should be used when updating with update_dict. + By default include all predictions. + label_names : list of str, or None + Name of labels that should be used when updating with update_dict. + By default include all labels. + """ + def __init__(self, axis=0, name='length-normalized-loss', + output_names=None, label_names=None): + super(LengthNormalizedLoss, self).__init__( + name, axis=axis, + output_names=output_names, label_names=label_names, + has_global_stats=True) + + # Parameter labels should be a list in the form of [target_sequence, + # target_seqauence_valid_length] + def update(self, labels, preds): + if not isinstance(labels, list) or len(labels) != 2: + raise ValueError('labels must be a list. 
Its first element should be' + ' target sequence and the second element should be' + 'the valid length of sequence.') + + _, seq_valid_length = labels + + if not isinstance(seq_valid_length, list): + seq_valid_length = [seq_valid_length] + + if not isinstance(preds, list): + preds = [preds] + + for length in seq_valid_length: + if isinstance(length, ndarray.ndarray.NDArray): + total_length = ndarray.sum(length).asscalar() + else: + total_length = length + self.num_inst += total_length + self.global_num_inst += total_length + + for pred in preds: + if isinstance(pred, ndarray.ndarray.NDArray): + loss = ndarray.sum(pred).asscalar() + else: + loss = pred + self.sum_metric += loss + self.global_sum_metric += loss diff --git a/src/gluonnlp/estimator/parallel_language_model.py b/src/gluonnlp/estimator/parallel_language_model.py index 8697d90360..a4e43b29a8 100644 --- a/src/gluonnlp/estimator/parallel_language_model.py +++ b/src/gluonnlp/estimator/parallel_language_model.py @@ -20,13 +20,15 @@ """ Gluon Parallel Languange Model """ from gluonnlp.utils import Parallel, Parallelizable +from mxnet import autograd __all__ = ['ParallelBigRNN'] class ParallelBigRNN(Parallelizable): - def __init__(self, rnn, loss_fn): + def __init__(self, rnn, loss_fn, batch_size): self._model = rnn self._loss = loss_fn + self._batch_size = batch_size def forward_backward(self, x): X, y, m, s, h = x @@ -35,7 +37,7 @@ def forward_backward(self, x): output = output.reshape((-3, -1)) new_target = new_target.reshape((-1,)) ls = self._loss(output, new_target) * m.reshape((-1,)) - ls = ls / args.batch_size + ls = ls / self._batch_size ls.backward() return hidden, ls From 50b3a95af2f51c5752b7cd5112e03b053520a829 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 22 Jan 2020 06:33:33 +0000 Subject: [PATCH 14/32] some bug fixes on language model estimator --- scripts/estimator/large_word_language_model_estimator.py | 4 +++- scripts/estimator/word_language_model_estimator.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/estimator/large_word_language_model_estimator.py b/scripts/estimator/large_word_language_model_estimator.py index 49ce2fb458..0d9d9d0cca 100644 --- a/scripts/estimator/large_word_language_model_estimator.py +++ b/scripts/estimator/large_word_language_model_estimator.py @@ -208,9 +208,11 @@ def _split_and_sample(x, y): metric_handler = MetricResetHandler(metrics=lm_estimator.train_metrics, log_interval=args.log_interval) checkpoint_handler = CheckpointHandler(model_dir=args.save, model_prefix='largeRNN') +logging_handler = LoggingHandler(log_interval=args.log_interval, + metrics=lm_estimator.train_metrics) event_handlers = [hidden_state_handler, gradient_handler, - metric_handler, checkpoint_handler] + metric_handler, checkpoint_handler, logging_handler] lm_estimator.fit(train_data=train_data, #epochs=args.epochs, diff --git a/scripts/estimator/word_language_model_estimator.py b/scripts/estimator/word_language_model_estimator.py index 8dec869d12..f14ea63030 100644 --- a/scripts/estimator/word_language_model_estimator.py +++ b/scripts/estimator/word_language_model_estimator.py @@ -254,5 +254,6 @@ def check_initialized(net): event_handlers=event_handlers, batch_axis=1) +est.net.load_parameters(args.save) est.evaluate(val_data=val_data_loader, event_handlers=[HiddenStateHandler()], batch_axis=1) est.evaluate(val_data=test_data_loader, event_handlers=[HiddenStateHandler()], batch_axis=1) From 275098f5bbb4088345bed278e79180762a070f4d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 12 Feb 2020 
08:36:17 +0000 Subject: [PATCH 15/32] update large language model estimator --- .../large_word_language_model_estimator.py | 227 --------------- .../word_language_model_estimator.py | 259 ------------------ .../language_model_batch_processor.py | 2 +- .../estimator/language_model_event_handler.py | 27 +- 4 files changed, 26 insertions(+), 489 deletions(-) delete mode 100644 scripts/estimator/large_word_language_model_estimator.py delete mode 100644 scripts/estimator/word_language_model_estimator.py diff --git a/scripts/estimator/large_word_language_model_estimator.py b/scripts/estimator/large_word_language_model_estimator.py deleted file mode 100644 index 0d9d9d0cca..0000000000 --- a/scripts/estimator/large_word_language_model_estimator.py +++ /dev/null @@ -1,227 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import time -import math -import os -import sys -import argparse -import numpy as np -import mxnet as mx -from mxnet import gluon, autograd -from mxnet.gluon.contrib.estimator import CheckpointHandler -import gluonnlp as nlp -from gluonnlp.utils import Parallel, Parallelizable -from sampler import LogUniformSampler -from gluonnlp.estimator import ParallelLanguageModelBatchProcessor -from gluonnlp.estimator import HiddenStateHandler, MetricResetHandler -from gluonnlp.estimator import LargeRNNGradientUpdateHandler -from gluonnlp.estimator import WordLanguageModelCheckpointHandler -from gluonnlp.estimator import LanguageModelEstimator -from gluonnlp.estimator.length_normalized_loss import LengthNormalizedLoss - -curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) -sys.path.append(os.path.join(curr_path, '..', '..')) - -nlp.utils.check_version('0.7.0') - -############################################################################### -# Arg parser -############################################################################### -parser = argparse.ArgumentParser(description= - 'Gluon-NLP Big LSTM 2048-512 Language Model on GBW') -parser.add_argument('--save', type=str, default='model.params', - help='path to save the final model.') -parser.add_argument('--emsize', type=int, default=512, - help='size of word embeddings') -parser.add_argument('--nhid', type=int, default=2048, - help='number of hidden units per layer') -parser.add_argument('--nproj', type=int, default=512, - help='number of projection units per layer. 
Could be different from embsize') -parser.add_argument('--nlayers', type=int, default=1, - help='number of layers') -parser.add_argument('--from-epoch', type=int, default=None, - help='start training or testing from the provided epoch') -parser.add_argument('--epochs', type=int, default=50, - help='number of epoch for training') -parser.add_argument('--batch-size', type=int, default=128, - help='batch size per gpu') -parser.add_argument('--dropout', type=float, default=0.1, - help='dropout applied to layers (0 = no dropout)') -parser.add_argument('--eps', type=float, default=1, - help='initial history accumulation for adagrad') -parser.add_argument('--bptt', type=int, default=20, - help='sequence length') -parser.add_argument('--k', type=int, default=8192, - help='number of noise samples for estimation') -parser.add_argument('--gpus', type=str, - help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu.') -parser.add_argument('--log-interval', type=int, default=1000, - help='report interval') -parser.add_argument('--seed', type=int, default=0, - help='random seed') -parser.add_argument('--lr', type=float, default=0.2, - help='initial learning rate') -parser.add_argument('--clip', type=float, default=1.0, - help='gradient clipping by global norm.') -parser.add_argument('--test-mode', action='store_true', - help='Whether to run through the script with few examples') -parser.add_argument('--eval-only', action='store_true', - help='Whether to only run evaluation for the trained model') -args = parser.parse_args() - -segments = ['train', 'test'] -max_nbatch_eval = None - -if args.test_mode: - args.emsize = 200 - args.log_interval = 1 - args.nhid = 200 - args.nlayers = 1 - args.epochs = 20 - max_nbatch_eval = 3 - segments = ['test', 'test'] - -print(args) -mx.random.seed(args.seed) -np.random.seed(args.seed) - -context = [mx.cpu()] if args.gpus is None or args.gpus == '' else \ - [mx.gpu(int(x)) for x in args.gpus.split(',')] - -os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round' -os.environ['MXNET_CPU_PARALLEL_RAND_COPY'] = str(len(context)) -os.environ['MXNET_CPU_WORKER_NTHREADS'] = str(len(context)) - -############################################################################### -# Data stream -############################################################################### -train_data_stream, test_data_stream = \ - [nlp.data.GBWStream(segment=segment, skip_empty=True, bos=None, eos='') - for segment in segments] -vocab = train_data_stream.vocab -ntokens = len(vocab) - -# Sampler for generating negative classes during training with importance sampling -sampler = LogUniformSampler(ntokens, args.k) - -# Given a list of (array, context) pairs, load array[i] on context[i] -def _load(xs): - ret = [] - for x, ctx in zip(xs, context): - if isinstance(x, tuple): - ret.append([y.as_in_context(ctx) for y in x]) - else: - ret.append(x.as_in_context(ctx)) - return ret - -# Transformation for a data batch for training. -# First, load the data, target and mask to target contexts. -# Second, the LSTM-2048-512 model performs importance sampling for decoding -# during training, we need to sample negative candidate classes by invoking the -# log uniform sampler. 
-def _split_and_sample(x, y): - m = x != vocab[vocab.padding_token] # mask padding - num_ctx = len(context) - if num_ctx > 1: - xs = gluon.utils.split_data(x, num_ctx, batch_axis=1, even_split=True) - ys = gluon.utils.split_data(y, num_ctx, batch_axis=1, even_split=True) - ms = gluon.utils.split_data(m, num_ctx, batch_axis=1, even_split=True) - else: - xs, ys, ms = [x], [y], [m] - xs = _load(xs) - ys = _load(ys) - ms = _load(ms) - ss = [sampler(y) for y in ys] - ss = _load(ss) - return xs, ys, ms, ss - -train_batch_size = args.batch_size * len(context) -train_batchify = nlp.data.batchify.StreamBPTTBatchify(vocab, args.bptt, train_batch_size) -train_data = train_batchify(train_data_stream) -train_data = train_data.transform(_split_and_sample) - -test_batch_size = args.batch_size -test_batchify = nlp.data.batchify.StreamBPTTBatchify(vocab, args.bptt, test_batch_size) -test_data = test_batchify(test_data_stream) -test_data = nlp.data.PrefetchingStream(test_data) - -############################################################################### -# Build the model -############################################################################### - -model = nlp.model.language_model.train.BigRNN(ntokens, args.emsize, args.nhid, - args.nlayers, args.nproj, args.k, - embed_dropout=args.dropout, - encode_dropout=args.dropout) -eval_model = nlp.model.language_model.BigRNN(ntokens, args.emsize, args.nhid, - args.nlayers, args.nproj, - embed_dropout=args.dropout, - encode_dropout=args.dropout) - -loss = gluon.loss.SoftmaxCrossEntropyLoss() -model.initialize(mx.init.Xavier(factor_type='out'), ctx=context) -trainer_params = {'learning_rate': args.lr, 'wd': 0, 'eps': args.eps} -trainer = gluon.Trainer(model.collect_params(), 'adagrad', trainer_params) -if args.from_epoch: - from_epoch = args.from_epoch - checkpoint_name = '%s.%s'%(args.save, format(from_epoch - 1, '02d')) - model.load_parameters(checkpoint_name) - trainer.load_states('%s.state'%args.save) - print('Loaded parameters from checkpoint %s'%(checkpoint_name)) - - -model.hybridize(static_alloc=True, static_shape=True) - -train_metric = mx.metric.Loss(loss) -val_metric = LengthNormalizedLoss(loss) -batch_processor = ParallelLanguageModelBatchProcessor(loss=loss, - vocab=vocab, - batch_size=args.batch_size, - val_batch_size=args.batch_size) -lm_estimator = LanguageModelEstimator(net=model, loss=loss, - train_metrics=train_metric, - val_metrics=val_metric, - trainer=trainer, - context=context, - val_loss=loss, - val_net=eval_model, - batch_processor=batch_processor, - bptt=args.bptt) - -hidden_state_handler = HiddenStateHandler() -gradient_handler = LargeRNNGradientUpdateHandler(batch_size=args.batch_size, clip=args.clip) -metric_handler = MetricResetHandler(metrics=lm_estimator.train_metrics, - log_interval=args.log_interval) -checkpoint_handler = CheckpointHandler(model_dir=args.save, model_prefix='largeRNN') -logging_handler = LoggingHandler(log_interval=args.log_interval, - metrics=lm_estimator.train_metrics) - -event_handlers = [hidden_state_handler, gradient_handler, - metric_handler, checkpoint_handler, logging_handler] - -lm_estimator.fit(train_data=train_data, - #epochs=args.epochs, - event_handlers=event_handlers, - batches=5, - batch_axis=0) - -val_metric_handler = MetricResetHandler(metrics=lm_estimator.val_metrics) -lm_estimator.val_net.initialize(mx.init.Xavier(), ctx=context[0]) -lm_estimator.val_net.hybridize(static_alloc=True, static_shape=True) -lm_estimator.val_net.load_parameters(args.save + '/largeRNN-epoch0batch5.params') 
-lm_estimator.evaluate(val_data=test_data, event_handlers=[val_metric_handler]) diff --git a/scripts/estimator/word_language_model_estimator.py b/scripts/estimator/word_language_model_estimator.py deleted file mode 100644 index f14ea63030..0000000000 --- a/scripts/estimator/word_language_model_estimator.py +++ /dev/null @@ -1,259 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import argparse -import time -import math -import os -import sys -import mxnet as mx -from mxnet import gluon, autograd -import gluonnlp as nlp -from mxnet.gluon.contrib.estimator import LoggingHandler -from gluonnlp.estimator import JointActivationRegularizationLoss -from gluonnlp.estimator import LanguageModelEstimator -from gluonnlp.estimator import HiddenStateHandler, AvgParamHandler -from gluonnlp.estimator import LearningRateHandler, RNNGradientUpdateHandler -from gluonnlp.estimator import WordLanguageModelCheckpointHandler -from gluonnlp.estimator import LanguageModelBatchProcessor -from gluonnlp.estimator import MetricResetHandler -from mxnet.gluon.data.sampler import BatchSampler - -class BatchVariableLenTextSampler(BatchSampler): - def __init__(self, bptt, length, use_variable_length=True): - self.bptt = bptt - self.length = length - self.index = 0 - self.use_variable_length = use_variable_length - - def __iter__(self): - self.index = 0 - while self.index < self.length - 2: - if self.use_variable_length: - bptt = self.bptt if mx.nd.random.uniform().asscalar() < .95 else self.bptt / 2 - seq_len = max(5, int(mx.nd.random.normal(bptt, 5).asscalar())) - else: - seq_len = self.bptt - seq_len = min(seq_len, self.length - self.index - 1) - # batch_size = seq_len + 1 - batch = [] - for i in range(self.index, self.index + seq_len + 1): - batch.append(i) - self.index += seq_len - yield batch - - def __len__(self): - # you may never get real size of the data sampler beforehand. 
May need some - # postprocessing after fetching the data batch - return int(self.length / 5) + 1 - -curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) -sys.path.append(os.path.join(curr_path, '..', '..')) - -nlp.utils.check_version('0.7.0') - -parser = argparse.ArgumentParser(description= - 'MXNet Autograd RNN/LSTM Language Model on Wikitext-2.') -parser.add_argument('--model', type=str, default='lstm', - help='type of recurrent net (rnn_tanh, rnn_relu, lstm, gru)') -parser.add_argument('--emsize', type=int, default=400, - help='size of word embeddings') -parser.add_argument('--nhid', type=int, default=1150, - help='number of hidden units per layer') -parser.add_argument('--nlayers', type=int, default=3, - help='number of layers') -parser.add_argument('--lr', type=float, default=30, - help='initial learning rate') -parser.add_argument('--clip', type=float, default=0.25, - help='gradient clipping') -parser.add_argument('--epochs', type=int, default=750, - help='upper epoch limit') -parser.add_argument('--batch_size', type=int, default=80, metavar='N', - help='batch size') -parser.add_argument('--bptt', type=int, default=70, - help='sequence length') -parser.add_argument('--dropout', type=float, default=0.4, - help='dropout applied to layers (0 = no dropout)') -parser.add_argument('--dropout_h', type=float, default=0.2, - help='dropout applied to hidden layer (0 = no dropout)') -parser.add_argument('--dropout_i', type=float, default=0.65, - help='dropout applied to input layer (0 = no dropout)') -parser.add_argument('--dropout_e', type=float, default=0.1, - help='dropout applied to embedding layer (0 = no dropout)') -parser.add_argument('--weight_dropout', type=float, default=0.5, - help='weight dropout applied to h2h weight matrix (0 = no weight dropout)') -parser.add_argument('--tied', action='store_true', - help='tie the word embedding and softmax weights') -parser.add_argument('--log-interval', type=int, default=200, metavar='N', - help='report interval') -parser.add_argument('--save', type=str, default='model.params', - help='path to save the final model') -parser.add_argument('--eval_only', action='store_true', - help='Whether to only evaluate the trained model') -parser.add_argument('--gpu', type=str, help='single gpu id') -parser.add_argument('--optimizer', type=str, default='sgd', - help='optimizer to use (sgd, adam)') -parser.add_argument('--wd', type=float, default=1.2e-6, - help='weight decay applied to all weights') -parser.add_argument('--alpha', type=float, default=2, - help='alpha L2 regularization on RNN activation ' - '(alpha = 0 means no regularization)') -parser.add_argument('--beta', type=float, default=1, - help='beta slowness regularization applied on RNN activation ' - '(beta = 0 means no regularization)') -parser.add_argument('--ntasgd', action='store_true', - help='Whether to apply ntasgd') -parser.add_argument('--test_mode', action='store_true', - help='Whether to run through the script with few examples') -parser.add_argument('--lr_update_interval', type=int, default=30, - help='lr udpate interval') -parser.add_argument('--lr_update_factor', type=float, default=0.1, - help='lr udpate factor') -args = parser.parse_args() - -############################################################################### -# Load data -############################################################################### - -context = [mx.cpu()] if not args.gpu else [mx.gpu(int(args.gpu))] - -assert args.batch_size % len(context) == 0, \ - 'Total batch size must be 
multiple of the number of devices' - -assert args.weight_dropout > 0 or (args.weight_dropout == 0 and args.alpha == 0), \ - 'The alpha L2 regularization cannot be used with standard RNN, please set alpha to 0' - -train_dataset, val_dataset, test_dataset = \ - [nlp.data.WikiText2(segment=segment, - skip_empty=False, bos=None, eos='') - for segment in ['train', 'val', 'test']] - -vocab = nlp.Vocab(counter=nlp.data.Counter(train_dataset), padding_token=None, bos_token=None) -train_batchify = nlp.data.batchify.CorpusBatchify(vocab, args.batch_size) -train_data = train_batchify(train_dataset) -val_batch_size = 10 -val_batchify = nlp.data.batchify.CorpusBatchify(vocab, val_batch_size) -val_data = val_batchify(val_dataset) -test_batch_size = 1 -test_batchify = nlp.data.batchify.CorpusBatchify(vocab, test_batch_size) -test_data = test_batchify(test_dataset) - -if args.test_mode: - args.emsize = 200 - args.nhid = 200 - args.nlayers = 1 - args.epochs = 3 - train_data = train_data[0:100] - val_data = val_data[0:100] - test_data = test_data[0:100] - -print(args) - -############################################################################### -# Build the model -############################################################################### - -ntokens = len(vocab) - -if args.weight_dropout > 0: - print('Use AWDRNN') - model = nlp.model.train.AWDRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers, - args.tied, args.dropout, args.weight_dropout, - args.dropout_h, args.dropout_i, args.dropout_e) - model.initialize(mx.init.Xavier(), ctx=context) - model_eval = nlp.model.AWDRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers, - args.tied, args.dropout, args.weight_dropout, - args.dropout_h, args.dropout_i, args.dropout_e, - params=model.collect_params()) -else: - model = nlp.model.train.StandardRNN(args.model, len(vocab), args.emsize, - args.nhid, args.nlayers, args.dropout, args.tied) - model.initialize(mx.init.Xavier(), ctx=context) - model_eval = nlp.model.StandardRNN(args.model, len(vocab), args.emsize, - args.nhid, args.nlayers, args.dropout, args.tied, - params=model.collect_params()) - - -model.hybridize(static_alloc=True) - -print(model) - - -def check_initialized(net): - params = net.collect_params() - for param in params: - try: - params[param].list_ctx() - except RuntimeError: - return False - return True - -print(check_initialized(model)) -print(check_initialized(model_eval)) - -if args.optimizer == 'sgd': - trainer_params = {'learning_rate': args.lr, - 'momentum': 0, - 'wd': args.wd} -elif args.optimizer == 'adam': - trainer_params = {'learning_rate': args.lr, - 'wd': args.wd, - 'beta1': 0, - 'beta2': 0.999, - 'epsilon': 1e-9} - -trainer = gluon.Trainer(model.collect_params(), args.optimizer, trainer_params, - update_on_kvstore=False) - -loss = gluon.loss.SoftmaxCrossEntropyLoss() -train_loss = JointActivationRegularizationLoss(loss, args.alpha, args.beta) - -sampler = BatchVariableLenTextSampler(bptt=70, length=len(train_data)) -val_sampler = BatchVariableLenTextSampler(bptt=70, length=len(val_data), use_variable_length=False) -test_sampler = BatchVariableLenTextSampler(bptt=70, length=len(test_data), - use_variable_length=False) -train_data_loader = mx.gluon.data.DataLoader(train_data, - batch_sampler=sampler) -val_data_loader = mx.gluon.data.DataLoader(val_data, - batch_sampler=val_sampler) -test_data_loader = mx.gluon.data.DataLoader(test_data, - batch_sampler=test_sampler) - -train_metric = mx.metric.Loss(train_loss) -val_metric = mx.metric.Loss(loss) 
-batch_processor = LanguageModelBatchProcessor() -est = LanguageModelEstimator(net=model, loss=train_loss, - train_metrics=train_metric, - val_metrics=val_metric, - trainer=trainer, context=context, - val_loss=loss, - val_net=model_eval, - batch_processor=batch_processor) -event_handlers = [HiddenStateHandler(), AvgParamHandler(data_length=len(train_data)), - LearningRateHandler(lr_update_interval=args.lr_update_interval, lr_update_factor=args.lr_update_factor), - RNNGradientUpdateHandler(clip=args.clip), - LoggingHandler(log_interval=args.log_interval, metrics=est.train_metrics + est.val_metrics), - MetricResetHandler(metrics=est.train_metrics, log_interval=args.log_interval), - WordLanguageModelCheckpointHandler(args.save)] -est.fit(train_data=train_data_loader, val_data=val_data_loader, - epochs=args.epochs, - event_handlers=event_handlers, - batch_axis=1) - -est.net.load_parameters(args.save) -est.evaluate(val_data=val_data_loader, event_handlers=[HiddenStateHandler()], batch_axis=1) -est.evaluate(val_data=test_data_loader, event_handlers=[HiddenStateHandler()], batch_axis=1) diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index d1be813fad..c1b790a127 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -119,7 +119,7 @@ def fit_batch(self, estimator, train_batch, batch_axis=0): estimator.hiddens[index] = hidden Ls.append(ls) - Ls = [l / (estimator.bptt * len(estimator.context)) for l in Ls] + Ls = [l / estimator.bptt for l in Ls] Ls = [mx.nd.sum(l) for l in Ls] return data, target, None, Ls diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index b9ceca599e..d7dda14080 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -21,11 +21,12 @@ import copy import warnings +import time import mxnet as mx from mxnet.gluon.contrib.estimator import TrainBegin, TrainEnd, EpochBegin from mxnet.gluon.contrib.estimator import EpochEnd, BatchBegin, BatchEnd -from mxnet.gluon.contrib.estimator import GradientUpdateHandler +from mxnet.gluon.contrib.estimator import GradientUpdateHandler, LoggingHandler from mxnet.gluon.contrib.estimator import MetricHandler from mxnet.gluon.utils import clip_global_norm from mxnet.metric import Loss as MetricLoss @@ -33,7 +34,7 @@ __all__ = ['HiddenStateHandler', 'AvgParamHandler', 'LearningRateHandler', 'RNNGradientUpdateHandler', 'MetricResetHandler', - 'WordLanguageModelCheckpointHandler', + 'WordLanguageModelCheckpointHandler', 'ParallelLoggingHandler', 'LargeRNNGradientUpdateHandler'] class HiddenStateHandler(EpochBegin): @@ -212,3 +213,25 @@ def epoch_end(self, estimator, *args, **kwargs): estimator.net.save_parameters(self.save) +class ParallelLoggingHandler(LoggingHandler): + def __init__(self, *args, **kwargs): + super(ParallelLoggingHandler, self).__init__(*args, **kwargs) + + def batch_end(self, estimator, *args, **kwargs): + if isinstance(self.log_interval, int): + batch_time = time.time() - self.batch_start + msg = '[Epoch %d][Batch %d]' % (self.current_epoch, self.batch_index) + cur_batches = kwargs['batch'][0] + for batch in cur_batches: + self.processed_samples += batch.shape[0] + msg += '[Samples %s]' % (self.processed_samples) + self.log_interval_time += batch_time + if self.batch_index % self.log_interval == self.log_interval - 1: + msg += 
'time/interval %.3fs ' % self.log_interval_time + self.log_interval_time = 0 + for metric in self.metrics: + name, val = metric.get() + msg += '%s: %.4f, ' % (name, val) + estimator.logger.info(msg.rstrip(', ')) + self.batch_index += 1 + From 8780711673e527304f5804d5cfa0a645bd926c64 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 12 Feb 2020 16:17:32 +0000 Subject: [PATCH 16/32] add script files --- .../large_word_language_model_estimator.py | 235 ++++++++++++++++ .../word_language_model_estimator.py | 259 ++++++++++++++++++ 2 files changed, 494 insertions(+) create mode 100644 scripts/language_model/large_word_language_model_estimator.py create mode 100644 scripts/language_model/word_language_model_estimator.py diff --git a/scripts/language_model/large_word_language_model_estimator.py b/scripts/language_model/large_word_language_model_estimator.py new file mode 100644 index 0000000000..d80da1b4d8 --- /dev/null +++ b/scripts/language_model/large_word_language_model_estimator.py @@ -0,0 +1,235 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import time +import math +import os +import sys +import argparse +import numpy as np +import mxnet as mx +from mxnet import gluon, autograd +from mxnet.gluon.contrib.estimator import CheckpointHandler, LoggingHandler +import gluonnlp as nlp +from gluonnlp.utils import Parallel, Parallelizable +from sampler import LogUniformSampler +from gluonnlp.estimator import ParallelLanguageModelBatchProcessor +from gluonnlp.estimator import HiddenStateHandler, MetricResetHandler +from gluonnlp.estimator import LargeRNNGradientUpdateHandler +from gluonnlp.estimator import WordLanguageModelCheckpointHandler +from gluonnlp.estimator import LanguageModelEstimator +from gluonnlp.estimator import ParallelLoggingHandler +from gluonnlp.estimator.length_normalized_loss import LengthNormalizedLoss + +curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +sys.path.append(os.path.join(curr_path, '..', '..')) + +nlp.utils.check_version('0.7.0') + +############################################################################### +# Arg parser +############################################################################### +parser = argparse.ArgumentParser(description= + 'Gluon-NLP Big LSTM 2048-512 Language Model on GBW') +parser.add_argument('--save', type=str, default='model.params', + help='path to save the final model.') +parser.add_argument('--emsize', type=int, default=512, + help='size of word embeddings') +parser.add_argument('--nhid', type=int, default=2048, + help='number of hidden units per layer') +parser.add_argument('--nproj', type=int, default=512, + help='number of projection units per layer. 
Could be different from embsize') +parser.add_argument('--nlayers', type=int, default=1, + help='number of layers') +parser.add_argument('--from-epoch', type=int, default=None, + help='start training or testing from the provided epoch') +parser.add_argument('--epochs', type=int, default=50, + help='number of epoch for training') +parser.add_argument('--batch-size', type=int, default=128, + help='batch size per gpu') +parser.add_argument('--dropout', type=float, default=0.1, + help='dropout applied to layers (0 = no dropout)') +parser.add_argument('--eps', type=float, default=1, + help='initial history accumulation for adagrad') +parser.add_argument('--bptt', type=int, default=20, + help='sequence length') +parser.add_argument('--k', type=int, default=8192, + help='number of noise samples for estimation') +parser.add_argument('--gpus', type=str, + help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu.') +parser.add_argument('--log-interval', type=int, default=1000, + help='report interval') +parser.add_argument('--seed', type=int, default=0, + help='random seed') +parser.add_argument('--lr', type=float, default=0.2, + help='initial learning rate') +parser.add_argument('--clip', type=float, default=1.0, + help='gradient clipping by global norm.') +parser.add_argument('--test-mode', action='store_true', + help='Whether to run through the script with few examples') +parser.add_argument('--eval-only', action='store_true', + help='Whether to only run evaluation for the trained model') +args = parser.parse_args() + +segments = ['train', 'test'] +max_nbatch_eval = None + +if args.test_mode: + args.emsize = 200 + args.log_interval = 1 + args.nhid = 200 + args.nlayers = 1 + args.epochs = 20 + max_nbatch_eval = 3 + segments = ['test', 'test'] + +print(args) +mx.random.seed(args.seed) +np.random.seed(args.seed) + +context = [mx.cpu()] if args.gpus is None or args.gpus == '' else \ + [mx.gpu(int(x)) for x in args.gpus.split(',')] + +os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round' +os.environ['MXNET_CPU_PARALLEL_RAND_COPY'] = str(len(context)) +os.environ['MXNET_CPU_WORKER_NTHREADS'] = str(len(context)) + +############################################################################### +# Data stream +############################################################################### +train_data_stream, test_data_stream = \ + [nlp.data.GBWStream(segment=segment, skip_empty=True, bos=None, eos='') + for segment in segments] +vocab = train_data_stream.vocab +ntokens = len(vocab) + +# Sampler for generating negative classes during training with importance sampling +sampler = LogUniformSampler(ntokens, args.k) + +# Given a list of (array, context) pairs, load array[i] on context[i] +def _load(xs): + ret = [] + for x, ctx in zip(xs, context): + if isinstance(x, tuple): + ret.append([y.as_in_context(ctx) for y in x]) + else: + ret.append(x.as_in_context(ctx)) + return ret + +# Transformation for a data batch for training. +# First, load the data, target and mask to target contexts. +# Second, the LSTM-2048-512 model performs importance sampling for decoding +# during training, we need to sample negative candidate classes by invoking the +# log uniform sampler. 
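+# Sketch of the per-device output of this transform (shapes assume the
+# StreamBPTTBatchify settings above and may differ if those change):
+#   xs[i], ys[i], ms[i] : (bptt, batch_size) data, target and padding mask,
+#                         loaded on context[i]
+#   ss[i]               : (sampled_classes, expected_count_sampled,
+#                          expected_count_true) from LogUniformSampler(ntokens, args.k)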
+def _split_and_sample(x, y): + m = x != vocab[vocab.padding_token] # mask padding + num_ctx = len(context) + if num_ctx > 1: + xs = gluon.utils.split_data(x, num_ctx, batch_axis=1, even_split=True) + ys = gluon.utils.split_data(y, num_ctx, batch_axis=1, even_split=True) + ms = gluon.utils.split_data(m, num_ctx, batch_axis=1, even_split=True) + else: + xs, ys, ms = [x], [y], [m] + xs = _load(xs) + ys = _load(ys) + ms = _load(ms) + ss = [sampler(y) for y in ys] + ss = _load(ss) + return xs, ys, ms, ss + +train_batch_size = args.batch_size * len(context) +train_batchify = nlp.data.batchify.StreamBPTTBatchify(vocab, args.bptt, train_batch_size) +train_data = train_batchify(train_data_stream) +train_data = train_data.transform(_split_and_sample) + +test_batch_size = args.batch_size +test_batchify = nlp.data.batchify.StreamBPTTBatchify(vocab, args.bptt, test_batch_size) +test_data = test_batchify(test_data_stream) +test_data = nlp.data.PrefetchingStream(test_data) + +############################################################################### +# Build the model +############################################################################### + +model = nlp.model.language_model.train.BigRNN(ntokens, args.emsize, args.nhid, + args.nlayers, args.nproj, args.k, + embed_dropout=args.dropout, + encode_dropout=args.dropout) +eval_model = nlp.model.language_model.BigRNN(ntokens, args.emsize, args.nhid, + args.nlayers, args.nproj, + embed_dropout=args.dropout, + encode_dropout=args.dropout) + +loss = gluon.loss.SoftmaxCrossEntropyLoss() +model.initialize(mx.init.Xavier(factor_type='out'), ctx=context) +trainer_params = {'learning_rate': args.lr, 'wd': 0, 'eps': args.eps} +trainer = gluon.Trainer(model.collect_params(), 'adagrad', trainer_params) +if args.from_epoch: + from_epoch = args.from_epoch + checkpoint_name = '%s.%s'%(args.save, format(from_epoch - 1, '02d')) + model.load_parameters(checkpoint_name) + trainer.load_states('%s.state'%args.save) + print('Loaded parameters from checkpoint %s'%(checkpoint_name)) + + +model.hybridize(static_alloc=True, static_shape=True) + +train_metric = mx.metric.Loss(loss) +val_metric = LengthNormalizedLoss(loss) +batch_processor = ParallelLanguageModelBatchProcessor(loss=loss, + vocab=vocab, + batch_size=args.batch_size, + val_batch_size=args.batch_size) +lm_estimator = LanguageModelEstimator(net=model, loss=loss, + train_metrics=train_metric, + val_metrics=val_metric, + trainer=trainer, + context=context, + val_loss=loss, + val_net=eval_model, + batch_processor=batch_processor, + bptt=args.bptt) + +hidden_state_handler = HiddenStateHandler() +gradient_handler = LargeRNNGradientUpdateHandler(batch_size=args.batch_size, clip=args.clip) +metric_handler = MetricResetHandler(metrics=lm_estimator.train_metrics, + log_interval=args.log_interval) +checkpoint_handler = CheckpointHandler(model_dir=args.save, model_prefix='largeRNN') +logging_handler = ParallelLoggingHandler(log_interval=args.log_interval, + metrics=lm_estimator.train_metrics) +val_logging_handler = LoggingHandler(log_interval=args.log_interval, + metrics=lm_estimator.val_metrics) + +event_handlers = [hidden_state_handler, gradient_handler, + metric_handler, checkpoint_handler, logging_handler] + +if not args.eval_only: + lm_estimator.fit(train_data=train_data, + epochs=args.epochs, + event_handlers=event_handlers, + #batches=5, + batch_axis=0) + +val_metric_handler = MetricResetHandler(metrics=lm_estimator.val_metrics) +lm_estimator.val_net.initialize(mx.init.Xavier(), ctx=context[0]) 
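+# Evaluation below starts from freshly initialized parameters and then loads,
+# per epoch, the checkpoints written by CheckpointHandler during training;
+# val_net does not share parameters with the training net directly, only
+# through those checkpoint files.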
+lm_estimator.val_net.hybridize(static_alloc=True, static_shape=True) + +for epoch_id in range(args.epochs): + total_batch = 78028 + checkpoint_path = args.save + '/largeRNN-epoch%dbatch%d.params' % (epoch_id, total_batch) + lm_estimator.val_net.load_parameters(checkpoint_path) + lm_estimator.evaluate(val_data=test_data, event_handlers=[val_metric_handler, val_logging_handler]) diff --git a/scripts/language_model/word_language_model_estimator.py b/scripts/language_model/word_language_model_estimator.py new file mode 100644 index 0000000000..f14ea63030 --- /dev/null +++ b/scripts/language_model/word_language_model_estimator.py @@ -0,0 +1,259 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import argparse +import time +import math +import os +import sys +import mxnet as mx +from mxnet import gluon, autograd +import gluonnlp as nlp +from mxnet.gluon.contrib.estimator import LoggingHandler +from gluonnlp.estimator import JointActivationRegularizationLoss +from gluonnlp.estimator import LanguageModelEstimator +from gluonnlp.estimator import HiddenStateHandler, AvgParamHandler +from gluonnlp.estimator import LearningRateHandler, RNNGradientUpdateHandler +from gluonnlp.estimator import WordLanguageModelCheckpointHandler +from gluonnlp.estimator import LanguageModelBatchProcessor +from gluonnlp.estimator import MetricResetHandler +from mxnet.gluon.data.sampler import BatchSampler + +class BatchVariableLenTextSampler(BatchSampler): + def __init__(self, bptt, length, use_variable_length=True): + self.bptt = bptt + self.length = length + self.index = 0 + self.use_variable_length = use_variable_length + + def __iter__(self): + self.index = 0 + while self.index < self.length - 2: + if self.use_variable_length: + bptt = self.bptt if mx.nd.random.uniform().asscalar() < .95 else self.bptt / 2 + seq_len = max(5, int(mx.nd.random.normal(bptt, 5).asscalar())) + else: + seq_len = self.bptt + seq_len = min(seq_len, self.length - self.index - 1) + # batch_size = seq_len + 1 + batch = [] + for i in range(self.index, self.index + seq_len + 1): + batch.append(i) + self.index += seq_len + yield batch + + def __len__(self): + # you may never get real size of the data sampler beforehand. 
May need some + # postprocessing after fetching the data batch + return int(self.length / 5) + 1 + +curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +sys.path.append(os.path.join(curr_path, '..', '..')) + +nlp.utils.check_version('0.7.0') + +parser = argparse.ArgumentParser(description= + 'MXNet Autograd RNN/LSTM Language Model on Wikitext-2.') +parser.add_argument('--model', type=str, default='lstm', + help='type of recurrent net (rnn_tanh, rnn_relu, lstm, gru)') +parser.add_argument('--emsize', type=int, default=400, + help='size of word embeddings') +parser.add_argument('--nhid', type=int, default=1150, + help='number of hidden units per layer') +parser.add_argument('--nlayers', type=int, default=3, + help='number of layers') +parser.add_argument('--lr', type=float, default=30, + help='initial learning rate') +parser.add_argument('--clip', type=float, default=0.25, + help='gradient clipping') +parser.add_argument('--epochs', type=int, default=750, + help='upper epoch limit') +parser.add_argument('--batch_size', type=int, default=80, metavar='N', + help='batch size') +parser.add_argument('--bptt', type=int, default=70, + help='sequence length') +parser.add_argument('--dropout', type=float, default=0.4, + help='dropout applied to layers (0 = no dropout)') +parser.add_argument('--dropout_h', type=float, default=0.2, + help='dropout applied to hidden layer (0 = no dropout)') +parser.add_argument('--dropout_i', type=float, default=0.65, + help='dropout applied to input layer (0 = no dropout)') +parser.add_argument('--dropout_e', type=float, default=0.1, + help='dropout applied to embedding layer (0 = no dropout)') +parser.add_argument('--weight_dropout', type=float, default=0.5, + help='weight dropout applied to h2h weight matrix (0 = no weight dropout)') +parser.add_argument('--tied', action='store_true', + help='tie the word embedding and softmax weights') +parser.add_argument('--log-interval', type=int, default=200, metavar='N', + help='report interval') +parser.add_argument('--save', type=str, default='model.params', + help='path to save the final model') +parser.add_argument('--eval_only', action='store_true', + help='Whether to only evaluate the trained model') +parser.add_argument('--gpu', type=str, help='single gpu id') +parser.add_argument('--optimizer', type=str, default='sgd', + help='optimizer to use (sgd, adam)') +parser.add_argument('--wd', type=float, default=1.2e-6, + help='weight decay applied to all weights') +parser.add_argument('--alpha', type=float, default=2, + help='alpha L2 regularization on RNN activation ' + '(alpha = 0 means no regularization)') +parser.add_argument('--beta', type=float, default=1, + help='beta slowness regularization applied on RNN activation ' + '(beta = 0 means no regularization)') +parser.add_argument('--ntasgd', action='store_true', + help='Whether to apply ntasgd') +parser.add_argument('--test_mode', action='store_true', + help='Whether to run through the script with few examples') +parser.add_argument('--lr_update_interval', type=int, default=30, + help='lr udpate interval') +parser.add_argument('--lr_update_factor', type=float, default=0.1, + help='lr udpate factor') +args = parser.parse_args() + +############################################################################### +# Load data +############################################################################### + +context = [mx.cpu()] if not args.gpu else [mx.gpu(int(args.gpu))] + +assert args.batch_size % len(context) == 0, \ + 'Total batch size must be 
multiple of the number of devices' + +assert args.weight_dropout > 0 or (args.weight_dropout == 0 and args.alpha == 0), \ + 'The alpha L2 regularization cannot be used with standard RNN, please set alpha to 0' + +train_dataset, val_dataset, test_dataset = \ + [nlp.data.WikiText2(segment=segment, + skip_empty=False, bos=None, eos='') + for segment in ['train', 'val', 'test']] + +vocab = nlp.Vocab(counter=nlp.data.Counter(train_dataset), padding_token=None, bos_token=None) +train_batchify = nlp.data.batchify.CorpusBatchify(vocab, args.batch_size) +train_data = train_batchify(train_dataset) +val_batch_size = 10 +val_batchify = nlp.data.batchify.CorpusBatchify(vocab, val_batch_size) +val_data = val_batchify(val_dataset) +test_batch_size = 1 +test_batchify = nlp.data.batchify.CorpusBatchify(vocab, test_batch_size) +test_data = test_batchify(test_dataset) + +if args.test_mode: + args.emsize = 200 + args.nhid = 200 + args.nlayers = 1 + args.epochs = 3 + train_data = train_data[0:100] + val_data = val_data[0:100] + test_data = test_data[0:100] + +print(args) + +############################################################################### +# Build the model +############################################################################### + +ntokens = len(vocab) + +if args.weight_dropout > 0: + print('Use AWDRNN') + model = nlp.model.train.AWDRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers, + args.tied, args.dropout, args.weight_dropout, + args.dropout_h, args.dropout_i, args.dropout_e) + model.initialize(mx.init.Xavier(), ctx=context) + model_eval = nlp.model.AWDRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers, + args.tied, args.dropout, args.weight_dropout, + args.dropout_h, args.dropout_i, args.dropout_e, + params=model.collect_params()) +else: + model = nlp.model.train.StandardRNN(args.model, len(vocab), args.emsize, + args.nhid, args.nlayers, args.dropout, args.tied) + model.initialize(mx.init.Xavier(), ctx=context) + model_eval = nlp.model.StandardRNN(args.model, len(vocab), args.emsize, + args.nhid, args.nlayers, args.dropout, args.tied, + params=model.collect_params()) + + +model.hybridize(static_alloc=True) + +print(model) + + +def check_initialized(net): + params = net.collect_params() + for param in params: + try: + params[param].list_ctx() + except RuntimeError: + return False + return True + +print(check_initialized(model)) +print(check_initialized(model_eval)) + +if args.optimizer == 'sgd': + trainer_params = {'learning_rate': args.lr, + 'momentum': 0, + 'wd': args.wd} +elif args.optimizer == 'adam': + trainer_params = {'learning_rate': args.lr, + 'wd': args.wd, + 'beta1': 0, + 'beta2': 0.999, + 'epsilon': 1e-9} + +trainer = gluon.Trainer(model.collect_params(), args.optimizer, trainer_params, + update_on_kvstore=False) + +loss = gluon.loss.SoftmaxCrossEntropyLoss() +train_loss = JointActivationRegularizationLoss(loss, args.alpha, args.beta) + +sampler = BatchVariableLenTextSampler(bptt=70, length=len(train_data)) +val_sampler = BatchVariableLenTextSampler(bptt=70, length=len(val_data), use_variable_length=False) +test_sampler = BatchVariableLenTextSampler(bptt=70, length=len(test_data), + use_variable_length=False) +train_data_loader = mx.gluon.data.DataLoader(train_data, + batch_sampler=sampler) +val_data_loader = mx.gluon.data.DataLoader(val_data, + batch_sampler=val_sampler) +test_data_loader = mx.gluon.data.DataLoader(test_data, + batch_sampler=test_sampler) + +train_metric = mx.metric.Loss(train_loss) +val_metric = mx.metric.Loss(loss) 
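+# Note on layout: CorpusBatchify arranges the corpus as (time, batch_size) and
+# BatchVariableLenTextSampler indexes along the time dimension, so each batch
+# from the DataLoader is time-major with the batch dimension on axis 1; this is
+# why fit()/evaluate() below are called with batch_axis=1.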
+batch_processor = LanguageModelBatchProcessor() +est = LanguageModelEstimator(net=model, loss=train_loss, + train_metrics=train_metric, + val_metrics=val_metric, + trainer=trainer, context=context, + val_loss=loss, + val_net=model_eval, + batch_processor=batch_processor) +event_handlers = [HiddenStateHandler(), AvgParamHandler(data_length=len(train_data)), + LearningRateHandler(lr_update_interval=args.lr_update_interval, lr_update_factor=args.lr_update_factor), + RNNGradientUpdateHandler(clip=args.clip), + LoggingHandler(log_interval=args.log_interval, metrics=est.train_metrics + est.val_metrics), + MetricResetHandler(metrics=est.train_metrics, log_interval=args.log_interval), + WordLanguageModelCheckpointHandler(args.save)] +est.fit(train_data=train_data_loader, val_data=val_data_loader, + epochs=args.epochs, + event_handlers=event_handlers, + batch_axis=1) + +est.net.load_parameters(args.save) +est.evaluate(val_data=val_data_loader, event_handlers=[HiddenStateHandler()], batch_axis=1) +est.evaluate(val_data=test_data_loader, event_handlers=[HiddenStateHandler()], batch_axis=1) From 757354c2fb6bb122e88d481431f59a1132402540 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 12 Feb 2020 16:20:18 +0000 Subject: [PATCH 17/32] remove files --- scripts/estimator/sampler.py | 109 ----------------------------------- 1 file changed, 109 deletions(-) delete mode 100644 scripts/estimator/sampler.py diff --git a/scripts/estimator/sampler.py b/scripts/estimator/sampler.py deleted file mode 100644 index f841fba160..0000000000 --- a/scripts/estimator/sampler.py +++ /dev/null @@ -1,109 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Log Uniform Candidate Sampler""" - -import math -import numpy as np -from mxnet import ndarray, gluon - - -class LogUniformSampler(gluon.block.Block): - """Draw random samples from an approximately log-uniform or Zipfian distribution. - - This operation randomly samples *num_sampled* candidates the range of integers [0, range_max). - The elements of sampled_candidates are drawn without replacement from the base distribution. - - The base distribution for this operator is an approximately log-uniform or Zipfian distribution: - - P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1) - - This sampler is useful when the true classes approximately follow such a distribution. - - For example, if the classes represent words in a lexicon sorted in decreasing order of - frequency. If your classes are not ordered by decreasing frequency, do not use this op. - - Additionally, it also returns the number of times each of the - true classes and the sampled classes is expected to occur. - - As the candidates are drawn without replacement, the expected count for the sampled candidates - and true classes are approximated. 
If the candidates are drawn with `num_tries` draws, we assume - (falsely) that the number of tries to get a batch of batch_size distinct values is always - `num_tries`, and the probability that the value is in a batch is 1 - (1-p)**num_tries. - - Parameters - ---------- - num_sampled: int - The number of classes to randomly sample. - range_max: int - The number of possible classes. - dtype: str or np.dtype - The dtype for outputs - """ - def __init__(self, range_max, num_sampled, dtype=None, **kwargs): - super(LogUniformSampler, self).__init__(**kwargs) - self._num_sampled = num_sampled - self._log_range = math.log(range_max + 1) - self._dtype = np.float32 if dtype is None else dtype - self._range_max = range_max - - def _prob_helper(self, num_tries, prob): - return (num_tries.astype('float64') * (-prob).log1p()).expm1() * -1 - - def forward(self, true_classes): # pylint: disable=arguments-differ - """Draw samples from log uniform distribution and returns sampled candidates, - expected count for true classes and sampled classes. - - Parameters - ---------- - true_classes: NDArray - The true classes. - - Returns - ------- - samples: NDArray - The sampled candidate classes. - expected_count_sample: NDArray - The expected count for sampled candidates. - expected_count_true: NDArray - The expected count for true classes in the same shape as `true_classes`. - """ - num_sampled = self._num_sampled - ctx = true_classes.context - num_tries = 0 - log_range = math.log(self._range_max + 1) - - # sample candidates - f = ndarray._internal._sample_unique_zipfian - sampled_classes, num_tries = f(self._range_max, shape=(1, num_sampled)) - sampled_classes = sampled_classes.reshape((-1,)) - sampled_classes = sampled_classes.as_in_context(ctx) - num_tries = num_tries.as_in_context(ctx) - - # expected count for true classes - true_cls = true_classes.as_in_context(ctx).astype('float64') - prob_true = ((true_cls + 2.0) / (true_cls + 1.0)).log() / log_range - count_true = self._prob_helper(num_tries, prob_true) - # expected count for sampled classes - sampled_classes = ndarray.array(sampled_classes, ctx=ctx, dtype='int64') - sampled_cls_fp64 = sampled_classes.astype('float64') - prob_sampled = ((sampled_cls_fp64 + 2.0) / (sampled_cls_fp64 + 1.0)).log() / log_range - count_sampled = self._prob_helper(num_tries, prob_sampled) - # convert to dtype - sampled_classes = sampled_classes.astype(self._dtype, copy=False) - count_true = count_true.astype(self._dtype, copy=False) - count_sampled = count_sampled.astype(self._dtype, copy=False) - return sampled_classes, count_sampled, count_true From 3f0862750107e9258635fdd4791a02b8aa84ec0c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 13 Feb 2020 03:23:51 +0000 Subject: [PATCH 18/32] modify loading the checkpoint --- .../large_word_language_model_estimator.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/language_model/large_word_language_model_estimator.py b/scripts/language_model/large_word_language_model_estimator.py index d80da1b4d8..ce6d87e3c2 100644 --- a/scripts/language_model/large_word_language_model_estimator.py +++ b/scripts/language_model/large_word_language_model_estimator.py @@ -20,6 +20,8 @@ import os import sys import argparse +import re + import numpy as np import mxnet as mx from mxnet import gluon, autograd @@ -229,7 +231,9 @@ def _split_and_sample(x, y): lm_estimator.val_net.hybridize(static_alloc=True, static_shape=True) for epoch_id in range(args.epochs): - total_batch = 78028 - checkpoint_path = args.save + 
'/largeRNN-epoch%dbatch%d.params' % (epoch_id, total_batch) - lm_estimator.val_net.load_parameters(checkpoint_path) - lm_estimator.evaluate(val_data=test_data, event_handlers=[val_metric_handler, val_logging_handler]) + for filename in os.listdir(args.save): + file_pattern = 'largeRNN-epoch%dbatch\d+.params' % (epoch_id) + if re.match(file_pattern + '',filename): + checkpoint_path = args.save + '/' + filename + lm_estimator.val_net.load_parameters(checkpoint_path) + lm_estimator.evaluate(val_data=test_data, event_handlers=[val_metric_handler, val_logging_handler]) From 48dc1e43513b731b39ef1716762e8280940807f9 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 13 Feb 2020 10:03:58 +0000 Subject: [PATCH 19/32] Add todo lists for event handlers --- .../estimator/language_model_event_handler.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index d7dda14080..1f4c32eece 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -44,7 +44,11 @@ def __init__(self): def epoch_begin(self, estimator, *args, **kwargs): estimator.hiddens = None estimator.val_hiddens = None - + +"""TODO: Implement a general average parameter handler or rename it with + NTASGD average parameter handler + +""" class AvgParamHandler(BatchEnd, EpochEnd): def __init__(self, data_length): self.epoch_id = 0 @@ -92,6 +96,9 @@ def epoch_end(self, estimator, *args, **kwargs): self.batch_id = 0 self.epoch_id += 1 +"""TODO: Can we replace learning rate handler with learning rate scheduler + Problem: Learning rate scheduler cannot take feedback from each iteration +""" class LearningRateHandler(BatchBegin, BatchEnd, EpochEnd): def __init__(self, lr_update_interval=30, lr_update_factor=0.1): self.lr_batch_start = 0 @@ -159,6 +166,10 @@ def batch_end(self, estimator, *args, **kwargs): estimator.trainer.step(len(estimator.context)) +"""This event handler reset local metrics for each few iterations + + TODO: shall we move the lengthnormalizedloss part out to be an independent handler +""" class MetricResetHandler(BatchBegin, MetricHandler): def __init__(self, metrics, log_interval=None): super().__init__(metrics=metrics) From 7ac114a45da9108f06a4709e39bc617146597527 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 13 Feb 2020 10:12:10 +0000 Subject: [PATCH 20/32] update index.rst --- scripts/language_model/index.rst | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/scripts/language_model/index.rst b/scripts/language_model/index.rst index 9a69f347e0..b82c8b4fc5 100644 --- a/scripts/language_model/index.rst +++ b/scripts/language_model/index.rst @@ -47,35 +47,35 @@ The dataset used for training the models is wikitext-2. For all the above model settings, we set Tied = True and NTASGD = True . -[1] awd_lstm_lm_1150_wikitext-2 (Val PPL 68.71 Test PPL 65.62 ) +[1] awd_lstm_lm_1150_wikitext-2 (Val PPL 68.52 Test PPL 65.68 ) .. code-block:: console - $ python word_language_model.py --gpu 0 --tied --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save awd_lstm_lm_1150_wikitext-2 + $ python word_language_model_estimator.py --gpu 0 --tied --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save awd_lstm_lm_1150_wikitext-2 -[2] awd_lstm_lm_600_wikitext-2 (Val PPL 84.89 Test PPL 80.67) +[2] awd_lstm_lm_600_wikitext-2 (Val PPL 83.92 Test PPL 80.09) .. 
code-block:: console - $ python word_language_model.py --gpu 0 --emsize 200 --nhid 600 --epochs 750 --dropout 0.2 --dropout_h 0.1 --dropout_i 0.3 --dropout_e 0.05 --weight_drop 0.2 --tied --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save awd_lstm_lm_600_wikitext-2 + $ python word_language_model_estimator.py --gpu 0 --emsize 200 --nhid 600 --epochs 750 --dropout 0.2 --dropout_h 0.1 --dropout_i 0.3 --dropout_e 0.05 --weight_drop 0.2 --tied --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save awd_lstm_lm_600_wikitext-2 -[3] standard_lstm_lm_1500_wikitext-2 (Val PPL 86.51 Test PPL 82.29) +[3] standard_lstm_lm_1500_wikitext-2 (Val PPL 85.23 Test PPL 81.44) .. code-block:: console - $ python word_language_model.py --gpu 0 --emsize 1500 --nhid 1500 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.65 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_1500_wikitext-2 + $ python word_language_model_estimator.py --gpu 0 --emsize 1500 --nhid 1500 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.65 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_1500_wikitext-2 -[4] standard_lstm_lm_650_wikitext-2 (Val PPL 90.96 Test PPL 86.91) +[4] standard_lstm_lm_650_wikitext-2 (Val PPL 94.51 Test PPL 90.28) .. code-block:: console - $ python word_language_model.py --gpu 0 --emsize 650 --nhid 650 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.5 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_650_wikitext-2 + $ python word_language_model_estimator.py --gpu 0 --emsize 650 --nhid 650 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.5 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_650_wikitext-2 -[5] standard_lstm_lm_200_wikitext-2 (Val PPL 107.59 Test PPL 101.64) +[5] standard_lstm_lm_200_wikitext-2 (Val PPL 107.44 Test PPL 101.19) .. code-block:: console - $ python word_language_model.py --gpu 0 --emsize 200 --nhid 200 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.2 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_200_wikitext-2 + $ python word_language_model_estimator.py --gpu 0 --emsize 200 --nhid 200 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.2 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_200_wikitext-2 Cache Language Model ~~~~~~~~~~~~~~~~~~~~~ @@ -181,8 +181,8 @@ The dataset used for training the models is Google's 1 billion words dataset. .. 
code-block:: console - $ python large_word_language_model.py --gpus 0,1,2,3 --clip=10 - $ python large_word_language_model.py --gpus 4 --eval-only --batch-size=1 + $ python large_word_language_model_estimator.py --gpus 0,1,2,3 --clip=10 + $ python large_word_language_model_estimator.py --gpus 4 --eval-only --batch-size=1 XLNet: Generalized Autoregressive Pretraining for Language Understanding From 091645246039939fff8f5a0ab02dcbcf26be6caa Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 13 Feb 2020 10:28:26 +0000 Subject: [PATCH 21/32] remove temp files --- .../large_word_language_model_estimator.py | 2 +- .../language_model_batch_processor.py | 2 +- .../estimator/language_model_event_handler.py | 2 +- .../estimator/length_normalized_loss.py | 77 ------------------- .../estimator/parallel_language_model.py | 43 ----------- 5 files changed, 3 insertions(+), 123 deletions(-) delete mode 100644 src/gluonnlp/estimator/length_normalized_loss.py delete mode 100644 src/gluonnlp/estimator/parallel_language_model.py diff --git a/scripts/language_model/large_word_language_model_estimator.py b/scripts/language_model/large_word_language_model_estimator.py index ce6d87e3c2..1ebbe95232 100644 --- a/scripts/language_model/large_word_language_model_estimator.py +++ b/scripts/language_model/large_word_language_model_estimator.py @@ -35,7 +35,7 @@ from gluonnlp.estimator import WordLanguageModelCheckpointHandler from gluonnlp.estimator import LanguageModelEstimator from gluonnlp.estimator import ParallelLoggingHandler -from gluonnlp.estimator.length_normalized_loss import LengthNormalizedLoss +from gluonnlp.metric.length_normalized_loss import LengthNormalizedLoss curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.append(os.path.join(curr_path, '..', '..')) diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index c1b790a127..9582d747c6 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -23,7 +23,7 @@ from mxnet.gluon.contrib.estimator import BatchProcessor from mxnet.gluon.utils import split_and_load from ..utils import Parallel -from .parallel_language_model import ParallelBigRNN +from ..model.train.language_model import ParallelBigRNN __all__ = ['LanguageModelBatchProcessor', 'ParallelLanguageModelBatchProcessor'] diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index 1f4c32eece..8287754704 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -30,7 +30,7 @@ from mxnet.gluon.contrib.estimator import MetricHandler from mxnet.gluon.utils import clip_global_norm from mxnet.metric import Loss as MetricLoss -from .length_normalized_loss import LengthNormalizedLoss +from ..metric.length_normalized_loss import LengthNormalizedLoss __all__ = ['HiddenStateHandler', 'AvgParamHandler', 'LearningRateHandler', 'RNNGradientUpdateHandler', 'MetricResetHandler', diff --git a/src/gluonnlp/estimator/length_normalized_loss.py b/src/gluonnlp/estimator/length_normalized_loss.py deleted file mode 100644 index e4558c6fb1..0000000000 --- a/src/gluonnlp/estimator/length_normalized_loss.py +++ /dev/null @@ -1,77 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" Length Normalized Loss """ - -from mxnet import ndarray -from mxnet.metric import EvalMetric - -__all__ = ['LengthNormalizedLoss'] - -class LengthNormalizedLoss(EvalMetric): - """Compute length normalized loss metrics - - Parameters - ---------- - axis : int, default=1 - The axis that represents classes - name : str - Name of this metric instance for display. - output_names : list of str, or None - Name of predictions that should be used when updating with update_dict. - By default include all predictions. - label_names : list of str, or None - Name of labels that should be used when updating with update_dict. - By default include all labels. - """ - def __init__(self, axis=0, name='length-normalized-loss', - output_names=None, label_names=None): - super(LengthNormalizedLoss, self).__init__( - name, axis=axis, - output_names=output_names, label_names=label_names, - has_global_stats=True) - - # Parameter labels should be a list in the form of [target_sequence, - # target_seqauence_valid_length] - def update(self, labels, preds): - if not isinstance(labels, list) or len(labels) != 2: - raise ValueError('labels must be a list. Its first element should be' - ' target sequence and the second element should be' - 'the valid length of sequence.') - - _, seq_valid_length = labels - - if not isinstance(seq_valid_length, list): - seq_valid_length = [seq_valid_length] - - if not isinstance(preds, list): - preds = [preds] - - for length in seq_valid_length: - if isinstance(length, ndarray.ndarray.NDArray): - total_length = ndarray.sum(length).asscalar() - else: - total_length = length - self.num_inst += total_length - self.global_num_inst += total_length - - for pred in preds: - if isinstance(pred, ndarray.ndarray.NDArray): - loss = ndarray.sum(pred).asscalar() - else: - loss = pred - self.sum_metric += loss - self.global_sum_metric += loss diff --git a/src/gluonnlp/estimator/parallel_language_model.py b/src/gluonnlp/estimator/parallel_language_model.py deleted file mode 100644 index a4e43b29a8..0000000000 --- a/src/gluonnlp/estimator/parallel_language_model.py +++ /dev/null @@ -1,43 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -# coding: utf-8 -# pylint: disable=wildcard-import, unused-variable -""" Gluon Parallel Languange Model """ - -from gluonnlp.utils import Parallel, Parallelizable -from mxnet import autograd - -__all__ = ['ParallelBigRNN'] - -class ParallelBigRNN(Parallelizable): - def __init__(self, rnn, loss_fn, batch_size): - self._model = rnn - self._loss = loss_fn - self._batch_size = batch_size - - def forward_backward(self, x): - X, y, m, s, h = x - with autograd.record(): - output, hidden, new_target = self._model(X, y, h, s) - output = output.reshape((-3, -1)) - new_target = new_target.reshape((-1,)) - ls = self._loss(output, new_target) * m.reshape((-1,)) - ls = ls / self._batch_size - ls.backward() - return hidden, ls - From 13891e7e41f4dbf3fe4fef7ad72a3d07d846728e Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 13 Feb 2020 12:04:17 +0000 Subject: [PATCH 22/32] relocate joint loss file --- scripts/language_model/word_language_model_estimator.py | 2 +- src/gluonnlp/estimator/__init__.py | 3 +-- src/gluonnlp/{estimator/loss.py => loss/joint_loss.py} | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) rename src/gluonnlp/{estimator/loss.py => loss/joint_loss.py} (97%) diff --git a/scripts/language_model/word_language_model_estimator.py b/scripts/language_model/word_language_model_estimator.py index f14ea63030..9af8d66661 100644 --- a/scripts/language_model/word_language_model_estimator.py +++ b/scripts/language_model/word_language_model_estimator.py @@ -24,7 +24,7 @@ from mxnet import gluon, autograd import gluonnlp as nlp from mxnet.gluon.contrib.estimator import LoggingHandler -from gluonnlp.estimator import JointActivationRegularizationLoss +from gluonnlp.loss.joint_loss import JointActivationRegularizationLoss from gluonnlp.estimator import LanguageModelEstimator from gluonnlp.estimator import HiddenStateHandler, AvgParamHandler from gluonnlp.estimator import LearningRateHandler, RNNGradientUpdateHandler diff --git a/src/gluonnlp/estimator/__init__.py b/src/gluonnlp/estimator/__init__.py index 69172adde6..8af7856d1e 100644 --- a/src/gluonnlp/estimator/__init__.py +++ b/src/gluonnlp/estimator/__init__.py @@ -22,7 +22,6 @@ from .language_model_estimator import * from .language_model_event_handler import * from .language_model_batch_processor import * -from .loss import * __all__ = (language_model_estimator.__all__ + language_model_event_handler.__all__ + - language_model_batch_processor.__all__ + loss.__all__) + language_model_batch_processor.__all__) diff --git a/src/gluonnlp/estimator/loss.py b/src/gluonnlp/loss/joint_loss.py similarity index 97% rename from src/gluonnlp/estimator/loss.py rename to src/gluonnlp/loss/joint_loss.py index 98febf217e..307dea1cd0 100644 --- a/src/gluonnlp/estimator/loss.py +++ b/src/gluonnlp/loss/joint_loss.py @@ -17,7 +17,7 @@ from mxnet import gluon -from ..loss import ActivationRegularizationLoss, TemporalActivationRegularizationLoss +from . 
import ActivationRegularizationLoss, TemporalActivationRegularizationLoss __all__ = ['JointActivationRegularizationLoss'] From 7eddd5289f3b5b415f1ff700b80d59b6394f8950 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 13 Feb 2020 14:15:33 +0000 Subject: [PATCH 23/32] remove temporary fix --- src/gluonnlp/estimator/language_model_batch_processor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index 9582d747c6..80a0ba73fc 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -62,7 +62,6 @@ def fit_batch(self, estimator, train_batch, batch_axis=0): return data, target, outputs, Ls def evaluate_batch(self, estimator, val_batch, batch_axis=0): - batch_axis = 1 #temporary work around, removed after estimator is fixed data = val_batch[:-1] target = val_batch[1:] batch_size = val_batch.shape[batch_axis] From d5d8148e0f09c7a068bf0917224547b1c027bd74 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 14 Feb 2020 13:45:05 +0000 Subject: [PATCH 24/32] fix pylint errors and add docstrings --- .../language_model_batch_processor.py | 38 ++++-- .../estimator/language_model_estimator.py | 46 +++++-- .../estimator/language_model_event_handler.py | 114 ++++++++++++++---- src/gluonnlp/loss/joint_loss.py | 2 +- 4 files changed, 158 insertions(+), 42 deletions(-) diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index 80a0ba73fc..6b241cdf97 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -28,8 +28,12 @@ __all__ = ['LanguageModelBatchProcessor', 'ParallelLanguageModelBatchProcessor'] class LanguageModelBatchProcessor(BatchProcessor): + '''Word language model batch processor + + Batch training and validation for word language model + ''' def __init__(self): - pass + super(LanguageModelBatchProcessor, self).__init__() def fit_batch(self, estimator, train_batch, batch_axis=0): data = train_batch[:-1] @@ -43,12 +47,13 @@ def fit_batch(self, estimator, train_batch, batch_axis=0): ctx=ctx) for ctx in estimator.context] else: estimator.hiddens = estimator.detach(estimator.hiddens) - + Ls = [] outputs = [] data_size = 0 with mx.autograd.record(): for i, (X, y, h) in enumerate(zip(data, target, estimator.hiddens)): + data_size = X.size output, h, encoder_hs, dropped_encoder_hs = estimator.net(X, h) l = estimator.loss(output, y, encoder_hs, dropped_encoder_hs) Ls.append(l / (len(estimator.context) * X.size)) @@ -58,7 +63,7 @@ def fit_batch(self, estimator, train_batch, batch_axis=0): for L in Ls: L.backward() - Ls = [l * (len(estimator.context) * X.size) for l in Ls] + Ls = [l * (len(estimator.context) * data_size) for l in Ls] return data, target, outputs, Ls def evaluate_batch(self, estimator, val_batch, batch_axis=0): @@ -73,7 +78,8 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): if estimator.val_hiddens is None: estimator.val_hiddens = \ [estimator.val_net.begin_state(batch_size // - len(estimator.context), func=mx.nd.zeros, ctx=ctx) for ctx \ + len(estimator.context), + func=mx.nd.zeros, ctx=ctx) for ctx in estimator.context] else: estimator.val_hiddens = estimator.detach(estimator.val_hiddens) @@ -87,7 +93,25 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): return data, target, outputs, Ls class 
ParallelLanguageModelBatchProcessor(BatchProcessor): + '''Parallel large RNN batch processor + + Batch training and validation for parallel large RNN model + + Parameters + ---------- + loss : mxnet.gluon.loss.Loss + Training loss function for parallel large rnn model + vocab : gluonnlp.vocab + Vocab of training and validation dataset + batch_size : int + Training batch size. It is used to construct the initial hidden states of + model + val_batch_size : int + Validation batch size. It is used to construct the initial hidden states + of validation model. + ''' def __init__(self, loss, vocab, batch_size, val_batch_size): + super(ParallelLanguageModelBatchProcessor, self).__init__() self.loss = loss self.parallel_model = None self.batch_size = batch_size @@ -128,9 +152,9 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): data = data.as_in_context(ctx) target = target.as_in_context(ctx) if estimator.val_hiddens is None: - estimator.val_hiddens = estimator.val_net.begin_state(batch_size=self.val_batch_size, - func=mx.nd.zeros, - ctx=ctx) + estimator.val_hiddens = + estimator.val_net.begin_state(batch_size=self.val_batch_size, + func=mx.nd.zeros, ctx=ctx) else: estimator.val_hiddens = estimator.detach(estimator.val_hiddens) diff --git a/src/gluonnlp/estimator/language_model_estimator.py b/src/gluonnlp/estimator/language_model_estimator.py index 808eabb27d..34a664c5d1 100644 --- a/src/gluonnlp/estimator/language_model_estimator.py +++ b/src/gluonnlp/estimator/language_model_estimator.py @@ -19,22 +19,46 @@ # pylint: disable=wildcard-import, unused-variable """ Gluon Languange Model Estimator """ -import copy -import warnings - -import numpy as np -import mxnet as mx from mxnet.gluon.contrib.estimator import Estimator -from mxnet.gluon.utils import split_and_load -from mxnet.gluon.utils import clip_global_norm -from mxnet.metric import Loss as metric_loss from .language_model_batch_processor import LanguageModelBatchProcessor __all__ = ['LanguageModelEstimator'] class LanguageModelEstimator(Estimator): + '''Language Model Estimator + + Estimator class to facilitate the language model training and validation process + + Parameters + ---------- + net : gluon.Block + The model used for training. + loss : gluon.loss.Loss + Loss (objective) function to calculate during training. + train_metrics : EvalMetric or list of EvalMetric + Training metrics for evaluating models on training dataset. + val_metrics : EvalMetric or list of EvalMetric + Validation metrics for evaluating models on validation dataset. + initializer : Initializer + Initializer to initialize the network. + trainer : Trainer + Trainer to apply optimizer on network parameters. + context : Context or list of Context + Device(s) to run the training on. + val_net : gluon.Block + The model used for validation. The validation model does not necessarily belong to + the same model class as the training model. + val_loss : gluon.loss.loss + Loss (objective) function to calculate during validation. If set val_loss + None, it will use the same loss function as self.loss + batch_processor: BatchProcessor + BatchProcessor provides customized fit_batch() and evaluate_batch() methods + bptt : int + bptt value for the language model training. 
It decides how many time steps + to backpropate + ''' def __init__(self, net, loss, train_metrics=None, - val_metrics = None, + val_metrics=None, initializer=None, trainer=None, context=None, @@ -56,12 +80,10 @@ def __init__(self, net, loss, train_metrics=None, self.avg_param = None self.bptt = bptt self.ntasgd = False - + def detach(self, hidden): if isinstance(hidden, (tuple, list)): hidden = [self.detach(h) for h in hidden] else: hidden = hidden.detach() return hidden - - diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index 8287754704..cb08adc6d1 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -19,13 +19,11 @@ # pylint: disable=wildcard-import, unused-variable """ Gluon Language Model Event Handler """ -import copy -import warnings import time import mxnet as mx -from mxnet.gluon.contrib.estimator import TrainBegin, TrainEnd, EpochBegin -from mxnet.gluon.contrib.estimator import EpochEnd, BatchBegin, BatchEnd +from mxnet.gluon.contrib.estimator import EpochBegin, EpochEnd +from mxnet.gluon.contrib.estimator import BatchBegin, BatchEnd from mxnet.gluon.contrib.estimator import GradientUpdateHandler, LoggingHandler from mxnet.gluon.contrib.estimator import MetricHandler from mxnet.gluon.utils import clip_global_norm @@ -38,6 +36,10 @@ 'LargeRNNGradientUpdateHandler'] class HiddenStateHandler(EpochBegin): + '''Hidden state reset event handler + + Reset hidden states for language model at each epoch + ''' def __init__(self): pass @@ -45,11 +47,17 @@ def epoch_begin(self, estimator, *args, **kwargs): estimator.hiddens = None estimator.val_hiddens = None -"""TODO: Implement a general average parameter handler or rename it with - NTASGD average parameter handler - -""" class AvgParamHandler(BatchEnd, EpochEnd): + '''NTASGD average parameter event handler + + Average model parameters used in word language model estimator + + Parameters + ---------- + data_length: int + Length of training data, i.e., len(train_data). It is used to normalize the weight + average coefficient. + ''' def __init__(self, data_length): self.epoch_id = 0 self.batch_id = 0 @@ -63,8 +71,10 @@ def batch_end(self, estimator, *args, **kwargs): parameters = estimator.net.collect_params() if estimator.ntasgd: if estimator.avg_param is None: - estimator.avg_param = {k.split(estimator.net._prefix)[1]: v.data(estimator.context[0]).copy() - for k, v in parameters.items()} + estimator.avg_param = + {k.split(estimator.net._prefix)[1]: + v.data(estimator.context[0]).copy() + for k, v in parameters.items()} else: gamma = 1. 
/ max(1, self.epoch_id * (self.data_length // estimator.bptt) + self.batch_id - self.avg_trigger + 2) @@ -82,8 +92,11 @@ def epoch_end(self, estimator, *args, **kwargs): if self.avg_trigger == 0: if self.t > self.n and val_metrics[0].get()[1] > min(self.valid_losses[-self.n:]): if estimator.avg_param is None: - estimator.avg_param = {k.split(estimator.net._prefix)[1]: v.data(estimator.context[0]).copy() - for k, v in parameters.items()} + estimator.avg_param = + {k.split(estimator.net._prefix)[1]: + v.data(estimator.context[0]).copy() + for k, v in + parameters.items()} else: for key, val in parameters.items(): estimator.avg_param[key.split(estimator.net._prefix)[1]] \ @@ -96,10 +109,21 @@ def epoch_end(self, estimator, *args, **kwargs): self.batch_id = 0 self.epoch_id += 1 -"""TODO: Can we replace learning rate handler with learning rate scheduler - Problem: Learning rate scheduler cannot take feedback from each iteration -""" class LearningRateHandler(BatchBegin, BatchEnd, EpochEnd): + '''NTASGD learning rate event handler + + Dynamically adjust the learning rate during word language model training + TODO: Investigate whether the learing rate event handler can be replaced with + learning rate scheduler + + Parameters + ---------- + lr_update_interval : int + Epoch interval of updating the learning rate during training the word + language model + lr_update_factor : float + learning rate decay factor used when updating the learning rate + ''' def __init__(self, lr_update_interval=30, lr_update_factor=0.1): self.lr_batch_start = 0 self.best_val = float('Inf') @@ -133,6 +157,15 @@ def epoch_end(self, estimator, *args, **kwargs): self.update_lr_epoch = 0 class RNNGradientUpdateHandler(GradientUpdateHandler): + '''NTASGD gradient clipping update event handler + + clipping gradient during word language model training + Parameters + ---------- + clip : clip + Gradient clipping threshold. Gradient norm exceeds this value should be scaled + down within the valid range. + ''' def __init__(self, clip=None, **kwargs): super().__init__(**kwargs) self.clip = clip @@ -143,12 +176,24 @@ def batch_end(self, estimator, *args, **kwargs): parameters = estimator.net.collect_params() grads = [p.grad(ctx) for p in parameters.values() for ctx in estimator.context] if self.clip is not None: - # use multi context clipping later clip_global_norm(grads, self.clip) estimator.trainer.step(1) class LargeRNNGradientUpdateHandler(GradientUpdateHandler): + '''Parallel Large RNN gradient clipping update event handler + + Rescale gradients of embedding parameters and clipping gradients of encoder parameters + during training parallel large RNN + + Parameters + ---------- + batch_size : int + batch size per gpu used during training parallel large RNN + clip : float + gradient clipping threshold. Gradients of encoder parameters exceed this value + should be scaled down within the valid range. 
+ ''' def __init__(self, batch_size, clip=None, **kwargs): super().__init__(**kwargs) self.batch_size = batch_size @@ -163,14 +208,25 @@ def batch_end(self, estimator, *args, **kwargs): x[:] *= self.batch_size encoder_grad = [p.grad(ctx) for p in encoder_params] clip_global_norm(encoder_grad, self.clip) - - estimator.trainer.step(len(estimator.context)) -"""This event handler reset local metrics for each few iterations + estimator.trainer.step(len(estimator.context)) - TODO: shall we move the lengthnormalizedloss part out to be an independent handler -""" class MetricResetHandler(BatchBegin, MetricHandler): + '''Event handler for reseting local metrics + + Reset local metrics for each few iterations and add support of LengthNormalizedMetrics + to compute both local and global metrics. + TODO: Move this event handler to be reusable by other estimators, e.g., + MachineTranslationEstimator + + Parameters + ---------- + Metrics : mxnet.metric + Metrics to be reset during training + log_interval : int or None + If log_interval is of int type, it represents the interval of reseting local + metrics. Otherwise, metrics do not need to be reset. + ''' def __init__(self, metrics, log_interval=None): super().__init__(metrics=metrics) self.batch_id = 0 @@ -201,6 +257,15 @@ def batch_end(self, estimator, *args, **kwargs): metric.update(label, pred) class WordLanguageModelCheckpointHandler(EpochEnd): + '''Checkpoint Event handler of word language model + + Save the model checkpoint of word language model + + Parameters + ---------- + save : string + The model checkpoint save path prefix + ''' def __init__(self, save): self.save = save self.best_val = float('Inf') @@ -225,6 +290,11 @@ def epoch_end(self, estimator, *args, **kwargs): class ParallelLoggingHandler(LoggingHandler): + '''Logging handler of Parallel language model training + + Generating logging information of parallel large RNN training. This event handler + is designed specifically to handle the batches taken from multiple gpus. + ''' def __init__(self, *args, **kwargs): super(ParallelLoggingHandler, self).__init__(*args, **kwargs) @@ -245,4 +315,4 @@ def batch_end(self, estimator, *args, **kwargs): msg += '%s: %.4f, ' % (name, val) estimator.logger.info(msg.rstrip(', ')) self.batch_index += 1 - + diff --git a/src/gluonnlp/loss/joint_loss.py b/src/gluonnlp/loss/joint_loss.py index 307dea1cd0..c62010cbf2 100644 --- a/src/gluonnlp/loss/joint_loss.py +++ b/src/gluonnlp/loss/joint_loss.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. - +""" Joint activation regularization loss """ from mxnet import gluon from . 
import ActivationRegularizationLoss, TemporalActivationRegularizationLoss From 3d72a32a4eb3204455ee4e41f9cb347adf57f2a3 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 14 Feb 2020 13:50:05 +0000 Subject: [PATCH 25/32] fix errors due to the pylint fix --- src/gluonnlp/estimator/language_model_batch_processor.py | 2 +- src/gluonnlp/estimator/language_model_event_handler.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index 6b241cdf97..c359a046d6 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -152,7 +152,7 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): data = data.as_in_context(ctx) target = target.as_in_context(ctx) if estimator.val_hiddens is None: - estimator.val_hiddens = + estimator.val_hiddens = \ estimator.val_net.begin_state(batch_size=self.val_batch_size, func=mx.nd.zeros, ctx=ctx) else: diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index cb08adc6d1..04f1c3bb9a 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -71,7 +71,7 @@ def batch_end(self, estimator, *args, **kwargs): parameters = estimator.net.collect_params() if estimator.ntasgd: if estimator.avg_param is None: - estimator.avg_param = + estimator.avg_param = \ {k.split(estimator.net._prefix)[1]: v.data(estimator.context[0]).copy() for k, v in parameters.items()} @@ -92,7 +92,7 @@ def epoch_end(self, estimator, *args, **kwargs): if self.avg_trigger == 0: if self.t > self.n and val_metrics[0].get()[1] > min(self.valid_losses[-self.n:]): if estimator.avg_param is None: - estimator.avg_param = + estimator.avg_param = \ {k.split(estimator.net._prefix)[1]: v.data(estimator.context[0]).copy() for k, v in From ca9c9a053350e662d2248c7fce6f77c918f44887 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 14 Feb 2020 15:34:56 +0000 Subject: [PATCH 26/32] fix docstring pylint errors --- .../language_model_batch_processor.py | 10 ++--- .../estimator/language_model_estimator.py | 4 +- .../estimator/language_model_event_handler.py | 39 +++++++++---------- 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/src/gluonnlp/estimator/language_model_batch_processor.py b/src/gluonnlp/estimator/language_model_batch_processor.py index c359a046d6..74b26cde23 100644 --- a/src/gluonnlp/estimator/language_model_batch_processor.py +++ b/src/gluonnlp/estimator/language_model_batch_processor.py @@ -28,10 +28,10 @@ __all__ = ['LanguageModelBatchProcessor', 'ParallelLanguageModelBatchProcessor'] class LanguageModelBatchProcessor(BatchProcessor): - '''Word language model batch processor + """Word language model batch processor Batch training and validation for word language model - ''' + """ def __init__(self): super(LanguageModelBatchProcessor, self).__init__() @@ -93,7 +93,7 @@ def evaluate_batch(self, estimator, val_batch, batch_axis=0): return data, target, outputs, Ls class ParallelLanguageModelBatchProcessor(BatchProcessor): - '''Parallel large RNN batch processor + """Parallel large RNN batch processor Batch training and validation for parallel large RNN model @@ -107,9 +107,9 @@ class ParallelLanguageModelBatchProcessor(BatchProcessor): Training batch size. 
It is used to construct the initial hidden states of model val_batch_size : int - Validation batch size. It is used to construct the initial hidden states + Validation batch size. It is used to construct the initial hidden states of validation model. - ''' + """ def __init__(self, loss, vocab, batch_size, val_batch_size): super(ParallelLanguageModelBatchProcessor, self).__init__() self.loss = loss diff --git a/src/gluonnlp/estimator/language_model_estimator.py b/src/gluonnlp/estimator/language_model_estimator.py index 34a664c5d1..4eb120ea28 100644 --- a/src/gluonnlp/estimator/language_model_estimator.py +++ b/src/gluonnlp/estimator/language_model_estimator.py @@ -25,7 +25,7 @@ __all__ = ['LanguageModelEstimator'] class LanguageModelEstimator(Estimator): - '''Language Model Estimator + """Language Model Estimator Estimator class to facilitate the language model training and validation process @@ -56,7 +56,7 @@ class LanguageModelEstimator(Estimator): bptt : int bptt value for the language model training. It decides how many time steps to backpropate - ''' + """ def __init__(self, net, loss, train_metrics=None, val_metrics=None, initializer=None, diff --git a/src/gluonnlp/estimator/language_model_event_handler.py b/src/gluonnlp/estimator/language_model_event_handler.py index 04f1c3bb9a..77ad23d1b1 100644 --- a/src/gluonnlp/estimator/language_model_event_handler.py +++ b/src/gluonnlp/estimator/language_model_event_handler.py @@ -36,10 +36,10 @@ 'LargeRNNGradientUpdateHandler'] class HiddenStateHandler(EpochBegin): - '''Hidden state reset event handler + """Hidden state reset event handler Reset hidden states for language model at each epoch - ''' + """ def __init__(self): pass @@ -48,7 +48,7 @@ def epoch_begin(self, estimator, *args, **kwargs): estimator.val_hiddens = None class AvgParamHandler(BatchEnd, EpochEnd): - '''NTASGD average parameter event handler + """NTASGD average parameter event handler Average model parameters used in word language model estimator @@ -57,7 +57,7 @@ class AvgParamHandler(BatchEnd, EpochEnd): data_length: int Length of training data, i.e., len(train_data). It is used to normalize the weight average coefficient. - ''' + """ def __init__(self, data_length): self.epoch_id = 0 self.batch_id = 0 @@ -110,7 +110,7 @@ def epoch_end(self, estimator, *args, **kwargs): self.epoch_id += 1 class LearningRateHandler(BatchBegin, BatchEnd, EpochEnd): - '''NTASGD learning rate event handler + """NTASGD learning rate event handler Dynamically adjust the learning rate during word language model training TODO: Investigate whether the learing rate event handler can be replaced with @@ -123,7 +123,7 @@ class LearningRateHandler(BatchBegin, BatchEnd, EpochEnd): language model lr_update_factor : float learning rate decay factor used when updating the learning rate - ''' + """ def __init__(self, lr_update_interval=30, lr_update_factor=0.1): self.lr_batch_start = 0 self.best_val = float('Inf') @@ -157,15 +157,15 @@ def epoch_end(self, estimator, *args, **kwargs): self.update_lr_epoch = 0 class RNNGradientUpdateHandler(GradientUpdateHandler): - '''NTASGD gradient clipping update event handler + """NTASGD gradient clipping update event handler clipping gradient during word language model training Parameters ---------- clip : clip Gradient clipping threshold. Gradient norm exceeds this value should be scaled - down within the valid range. - ''' + down within the valid range. 
+ """ def __init__(self, clip=None, **kwargs): super().__init__(**kwargs) self.clip = clip @@ -181,7 +181,7 @@ def batch_end(self, estimator, *args, **kwargs): estimator.trainer.step(1) class LargeRNNGradientUpdateHandler(GradientUpdateHandler): - '''Parallel Large RNN gradient clipping update event handler + """Parallel Large RNN gradient clipping update event handler Rescale gradients of embedding parameters and clipping gradients of encoder parameters during training parallel large RNN @@ -193,7 +193,7 @@ class LargeRNNGradientUpdateHandler(GradientUpdateHandler): clip : float gradient clipping threshold. Gradients of encoder parameters exceed this value should be scaled down within the valid range. - ''' + """ def __init__(self, batch_size, clip=None, **kwargs): super().__init__(**kwargs) self.batch_size = batch_size @@ -212,7 +212,7 @@ def batch_end(self, estimator, *args, **kwargs): estimator.trainer.step(len(estimator.context)) class MetricResetHandler(BatchBegin, MetricHandler): - '''Event handler for reseting local metrics + """Event handler for reseting local metrics Reset local metrics for each few iterations and add support of LengthNormalizedMetrics to compute both local and global metrics. @@ -225,8 +225,8 @@ class MetricResetHandler(BatchBegin, MetricHandler): Metrics to be reset during training log_interval : int or None If log_interval is of int type, it represents the interval of reseting local - metrics. Otherwise, metrics do not need to be reset. - ''' + metrics. Otherwise, metrics do not need to be reset. + """ def __init__(self, metrics, log_interval=None): super().__init__(metrics=metrics) self.batch_id = 0 @@ -257,15 +257,15 @@ def batch_end(self, estimator, *args, **kwargs): metric.update(label, pred) class WordLanguageModelCheckpointHandler(EpochEnd): - '''Checkpoint Event handler of word language model + """Checkpoint Event handler of word language model Save the model checkpoint of word language model - + Parameters ---------- save : string The model checkpoint save path prefix - ''' + """ def __init__(self, save): self.save = save self.best_val = float('Inf') @@ -290,11 +290,11 @@ def epoch_end(self, estimator, *args, **kwargs): class ParallelLoggingHandler(LoggingHandler): - '''Logging handler of Parallel language model training + """Logging handler of Parallel language model training Generating logging information of parallel large RNN training. This event handler is designed specifically to handle the batches taken from multiple gpus. - ''' + """ def __init__(self, *args, **kwargs): super(ParallelLoggingHandler, self).__init__(*args, **kwargs) @@ -315,4 +315,3 @@ def batch_end(self, estimator, *args, **kwargs): msg += '%s: %.4f, ' % (name, val) estimator.logger.info(msg.rstrip(', ')) self.batch_index += 1 - From 7735fa68d2941944daa797041e9464cecb4038fc Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 14 Feb 2020 17:00:49 +0000 Subject: [PATCH 27/32] fix script pylint errors --- .../large_word_language_model_estimator.py | 17 +++---- .../word_language_model_estimator.py | 51 +++++++++++-------- 2 files changed, 37 insertions(+), 31 deletions(-) diff --git a/scripts/language_model/large_word_language_model_estimator.py b/scripts/language_model/large_word_language_model_estimator.py index 1ebbe95232..070184f9b8 100644 --- a/scripts/language_model/large_word_language_model_estimator.py +++ b/scripts/language_model/large_word_language_model_estimator.py @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. 
-import time -import math +""" large word language model train script """ + import os import sys import argparse @@ -24,18 +24,16 @@ import numpy as np import mxnet as mx -from mxnet import gluon, autograd +from mxnet import gluon from mxnet.gluon.contrib.estimator import CheckpointHandler, LoggingHandler import gluonnlp as nlp -from gluonnlp.utils import Parallel, Parallelizable -from sampler import LogUniformSampler from gluonnlp.estimator import ParallelLanguageModelBatchProcessor from gluonnlp.estimator import HiddenStateHandler, MetricResetHandler from gluonnlp.estimator import LargeRNNGradientUpdateHandler -from gluonnlp.estimator import WordLanguageModelCheckpointHandler from gluonnlp.estimator import LanguageModelEstimator from gluonnlp.estimator import ParallelLoggingHandler from gluonnlp.metric.length_normalized_loss import LengthNormalizedLoss +from sampler import LogUniformSampler curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.append(os.path.join(curr_path, '..', '..')) @@ -232,8 +230,9 @@ def _split_and_sample(x, y): for epoch_id in range(args.epochs): for filename in os.listdir(args.save): - file_pattern = 'largeRNN-epoch%dbatch\d+.params' % (epoch_id) - if re.match(file_pattern + '',filename): + file_pattern = r'largeRNN-epoch%dbatch\d+.params' % (epoch_id) + if re.match(file_pattern + '', filename): checkpoint_path = args.save + '/' + filename lm_estimator.val_net.load_parameters(checkpoint_path) - lm_estimator.evaluate(val_data=test_data, event_handlers=[val_metric_handler, val_logging_handler]) + lm_estimator.evaluate(val_data=test_data, + event_handlers=[val_metric_handler, val_logging_handler]) diff --git a/scripts/language_model/word_language_model_estimator.py b/scripts/language_model/word_language_model_estimator.py index 9af8d66661..49b3291c6d 100644 --- a/scripts/language_model/word_language_model_estimator.py +++ b/scripts/language_model/word_language_model_estimator.py @@ -15,15 +15,17 @@ # specific language governing permissions and limitations # under the License. 
+""" word language model training script """ + import argparse -import time -import math import os import sys + import mxnet as mx -from mxnet import gluon, autograd -import gluonnlp as nlp +from mxnet import gluon from mxnet.gluon.contrib.estimator import LoggingHandler +from mxnet.gluon.data.sampler import BatchSampler +import gluonnlp as nlp from gluonnlp.loss.joint_loss import JointActivationRegularizationLoss from gluonnlp.estimator import LanguageModelEstimator from gluonnlp.estimator import HiddenStateHandler, AvgParamHandler @@ -31,10 +33,24 @@ from gluonnlp.estimator import WordLanguageModelCheckpointHandler from gluonnlp.estimator import LanguageModelBatchProcessor from gluonnlp.estimator import MetricResetHandler -from mxnet.gluon.data.sampler import BatchSampler + class BatchVariableLenTextSampler(BatchSampler): + """Sample text of variable length + + Generate batch of text of variable length from the training dataset + + Parameters + ---------- + bptt : int + bptt variable + length : int + base sequence length for sampling + use_variable_length : bool + generate sequence of variable length or not + """ def __init__(self, bptt, length, use_variable_length=True): + super(BatchVariableLenTextSampler, self).__init__() self.bptt = bptt self.length = length self.index = 0 @@ -192,19 +208,6 @@ def __len__(self): print(model) - -def check_initialized(net): - params = net.collect_params() - for param in params: - try: - params[param].list_ctx() - except RuntimeError: - return False - return True - -print(check_initialized(model)) -print(check_initialized(model_eval)) - if args.optimizer == 'sgd': trainer_params = {'learning_rate': args.lr, 'momentum': 0, @@ -243,11 +246,15 @@ def check_initialized(net): val_loss=loss, val_net=model_eval, batch_processor=batch_processor) -event_handlers = [HiddenStateHandler(), AvgParamHandler(data_length=len(train_data)), - LearningRateHandler(lr_update_interval=args.lr_update_interval, lr_update_factor=args.lr_update_factor), +event_handlers = [HiddenStateHandler(), + AvgParamHandler(data_length=len(train_data)), + LearningRateHandler(lr_update_interval=args.lr_update_interval, + lr_update_factor=args.lr_update_factor), RNNGradientUpdateHandler(clip=args.clip), - LoggingHandler(log_interval=args.log_interval, metrics=est.train_metrics + est.val_metrics), - MetricResetHandler(metrics=est.train_metrics, log_interval=args.log_interval), + LoggingHandler(log_interval=args.log_interval, + metrics=est.train_metrics + est.val_metrics), + MetricResetHandler(metrics=est.train_metrics, + log_interval=args.log_interval), WordLanguageModelCheckpointHandler(args.save)] est.fit(train_data=train_data_loader, val_data=val_data_loader, epochs=args.epochs, From e7f80cb348fad336801d4b8d40480f85ce2b4889 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 14 Feb 2020 17:13:23 +0000 Subject: [PATCH 28/32] fix pylint errrors --- src/gluonnlp/estimator/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/gluonnlp/estimator/__init__.py b/src/gluonnlp/estimator/__init__.py index 8af7856d1e..e8de9fa8e5 100644 --- a/src/gluonnlp/estimator/__init__.py +++ b/src/gluonnlp/estimator/__init__.py @@ -19,6 +19,9 @@ # pylint: disable=wildcard-import, unused-variable """ Gluon NLP Estimator Module """ +from . import language_model_estimator, language_model_event_handler +from . 
import language_model_batch_processor + from .language_model_estimator import * from .language_model_event_handler import * from .language_model_batch_processor import * From a0bc6160bdd8e6a6cd7e50fd71dc9ece1198df27 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 17 Feb 2020 03:49:15 +0000 Subject: [PATCH 29/32] remove hyperparameters from the table --- scripts/language_model/index.rst | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/scripts/language_model/index.rst b/scripts/language_model/index.rst index b82c8b4fc5..c30cd8fea4 100644 --- a/scripts/language_model/index.rst +++ b/scripts/language_model/index.rst @@ -18,24 +18,6 @@ The dataset used for training the models is wikitext-2. +---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ | Model | awd_lstm_lm_1150_wikitext-2 | awd_lstm_lm_600_wikitext-2 | standard_lstm_lm_1500_wikitext-2 | standard_lstm_lm_650_wikitext-2 | standard_lstm_lm_200_wikitext-2 | +===============+============================================================================================================================+===========================================================================================================================+=================================================================================================================================+================================================================================================================================+================================================================================================================================+ -| Mode | LSTM | LSTM | LSTM | LSTM | LSTM | -+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ -| Num_layers | 3 | 3 | 2 | 2 | 2 | 
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ -| Embed size | 400 | 200 | 1500 | 650 | 200 | -+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ -| Hidden size | 1150 | 600 | 1500 | 650 | 200 | -+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ -| Dropout | 0.4 | 0.2 | 0.65 | 0.5 | 0.2 | -+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ -| Dropout_h | 0.2 | 0.1 | 0 | 0 | 0 | -+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ -| Dropout_i | 0.65 | 0.3 | 0 | 0 | 0 | 
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ -| Dropout_e | 0.1 | 0.05 | 0 | 0 | 0 | -+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ -| Weight_drop | 0.5 | 0.2 | 0 | 0 | 0 | -+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ | Val PPL | 68.71 | 84.89 | 86.51 | 90.96 | 107.59 | +---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ | Test PPL | 65.62 | 80.67 | 82.29 | 86.91 | 101.64 | From 934cba6c082fdb663a3ea92a839717e371fd5298 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 17 Feb 2020 05:29:03 +0000 Subject: [PATCH 30/32] update language model commands --- scripts/language_model/index.rst | 95 ++------------------------------ 1 file changed, 5 insertions(+), 90 deletions(-) diff --git a/scripts/language_model/index.rst b/scripts/language_model/index.rst index c30cd8fea4..afdb3a6bff 100644 --- a/scripts/language_model/index.rst +++ b/scripts/language_model/index.rst @@ -22,43 +22,13 @@ The dataset used for training the models is wikitext-2. 
+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ | Test PPL | 65.62 | 80.67 | 82.29 | 86.91 | 101.64 | +---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ -| Command | [1] | [2] | [3] | [4] | [5] | +| Command | `command `__ | `command `__ | `command `__ | `command `__ | `command `__ | +---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ | Training logs | `log `__ | `log `__ | `log `__ | `log `__ | `log `__ | +---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+ For all the above model settings, we set Tied = True and NTASGD = True . -[1] awd_lstm_lm_1150_wikitext-2 (Val PPL 68.52 Test PPL 65.68 ) - -.. code-block:: console - - $ python word_language_model_estimator.py --gpu 0 --tied --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save awd_lstm_lm_1150_wikitext-2 - -[2] awd_lstm_lm_600_wikitext-2 (Val PPL 83.92 Test PPL 80.09) - -.. 
code-block:: console - - $ python word_language_model_estimator.py --gpu 0 --emsize 200 --nhid 600 --epochs 750 --dropout 0.2 --dropout_h 0.1 --dropout_i 0.3 --dropout_e 0.05 --weight_drop 0.2 --tied --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save awd_lstm_lm_600_wikitext-2 - -[3] standard_lstm_lm_1500_wikitext-2 (Val PPL 85.23 Test PPL 81.44) - -.. code-block:: console - - $ python word_language_model_estimator.py --gpu 0 --emsize 1500 --nhid 1500 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.65 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_1500_wikitext-2 - -[4] standard_lstm_lm_650_wikitext-2 (Val PPL 94.51 Test PPL 90.28) - -.. code-block:: console - - $ python word_language_model_estimator.py --gpu 0 --emsize 650 --nhid 650 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.5 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_650_wikitext-2 - -[5] standard_lstm_lm_200_wikitext-2 (Val PPL 107.44 Test PPL 101.19) - -.. code-block:: console - - $ python word_language_model_estimator.py --gpu 0 --emsize 200 --nhid 200 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.2 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_200_wikitext-2 - Cache Language Model ~~~~~~~~~~~~~~~~~~~~~ @@ -79,43 +49,13 @@ The dataset used for training the models is wikitext-2. +---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | Test PPL | 51.46 | 62.19 | 62.79 | 65.85 | 73.74 | +---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ -| Command | [1] | [2] | [3] | [4] | [5] | +| Command | `command `__ | `command `__ | `command `__ | `command `__ | `command `__ | 
+---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | Training logs | `log `__ | `log `__ | `log `__ | `log `__ | `log `__ | +---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ For all the above model settings, we set lambdas = 0.1279, theta = 0.662, window = 2000 and bptt= 2000 . -[1] cache_awd_lstm_lm_1150_wikitext-2 (Val PPL 53.41 Test PPL 51.46) - -.. code-block:: console - - $ python cache_language_model.py --gpus 0 --model_name awd_lstm_lm_1150 - -[2] cache_awd_lstm_lm_600_wikitext-2 (Val PPL 64.51 Test PPL 62.19) - -.. code-block:: console - - $ python cache_language_model.py --gpus 0 --model_name awd_lstm_lm_600 - -[3] cache_standard_lstm_lm_1500_wikitext-2 (Val PPL 65.54 Test PPL 62.79) - -.. code-block:: console - - $ python cache_language_model.py --gpus 0 --model_name standard_lstm_lm_1500 - -[4] cache_standard_lstm_lm_650_wikitext-2 (Val PPL 68.47 Test PPL 65.85) - -.. code-block:: console - - $ python cache_language_model.py --gpus 0 --model_name standard_lstm_lm_650 - -[5] cache_standard_lstm_lm_200_wikitext-2 (Val PPL 77.51 Test PPL 73.74) - -.. code-block:: console - - $ python cache_language_model.py --gpus 0 --model_name standard_lstm_lm_200 - Large Scale Word Language Model ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -130,42 +70,17 @@ The dataset used for training the models is Google's 1 billion words dataset. 
+-----------------+------------------------------------------------------------------------------------------------------------------------------+ | Model | LSTM-2048-512 | +=================+==============================================================================================================================+ -| Mode | LSTMP | -+-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Num layers | 1 | -+-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Embed size | 512 | -+-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Hidden size | 2048 | -+-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Projection size | 512 | -+-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Dropout | 0.1 | -+-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Learning rate | 0.2 | -+-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Num samples | 8192 | -+-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Batch size | 128 | -+-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Gradient clip | 10.0 | -+-----------------+------------------------------------------------------------------------------------------------------------------------------+ | Test perplexity | 43.62 | +-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Num epochs | 50 | +| Command | `log `__ | ++-----------------+------------------------------------------------------------------------------------------------------------------------------+ +| Command | `log `__ | +-----------------+------------------------------------------------------------------------------------------------------------------------------+ | Training logs | `log `__ | +-----------------+------------------------------------------------------------------------------------------------------------------------------+ | Evaluation logs | `log `__ | +-----------------+------------------------------------------------------------------------------------------------------------------------------+ -[1] LSTM-2048-512 (Test PPL 43.62) - -.. 
code-block:: console - - $ python large_word_language_model_estimator.py --gpus 0,1,2,3 --clip=10 - $ python large_word_language_model_estimator.py --gpus 4 --eval-only --batch-size=1 - XLNet: Generalized Autoregressive Pretraining for Language Understanding ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 450dee0be402d8347e80f3b85fe69b2443dbcebb Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 17 Feb 2020 05:32:14 +0000 Subject: [PATCH 31/32] minor modification --- scripts/language_model/index.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/language_model/index.rst b/scripts/language_model/index.rst index afdb3a6bff..9ba0569ad6 100644 --- a/scripts/language_model/index.rst +++ b/scripts/language_model/index.rst @@ -72,9 +72,9 @@ The dataset used for training the models is Google's 1 billion words dataset. +=================+==============================================================================================================================+ | Test perplexity | 43.62 | +-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Command | `log `__ | +| Command | `command `__ | +-----------------+------------------------------------------------------------------------------------------------------------------------------+ -| Command | `log `__ | +| Command | `command `__ | +-----------------+------------------------------------------------------------------------------------------------------------------------------+ | Training logs | `log `__ | +-----------------+------------------------------------------------------------------------------------------------------------------------------+ From 159553f09e49890a1ca46c9e4ad48d0407913ca5 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 18 Feb 2020 06:45:43 +0000 Subject: [PATCH 32/32] update bigrnn final result --- scripts/language_model/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/language_model/index.rst b/scripts/language_model/index.rst index 9ba0569ad6..318050c8a3 100644 --- a/scripts/language_model/index.rst +++ b/scripts/language_model/index.rst @@ -70,7 +70,7 @@ The dataset used for training the models is Google's 1 billion words dataset. +-----------------+------------------------------------------------------------------------------------------------------------------------------+ | Model | LSTM-2048-512 | +=================+==============================================================================================================================+ -| Test perplexity | 43.62 | +| Test perplexity | 43.80 | +-----------------+------------------------------------------------------------------------------------------------------------------------------+ | Command | `command `__ | +-----------------+------------------------------------------------------------------------------------------------------------------------------+
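
For reference, the sketch below shows how the estimator components introduced in this series compose; it is condensed from the patched ``scripts/language_model/word_language_model_estimator.py``. Model construction, the joint activation regularization loss, and the wikitext-2 data pipeline are elided: ``build_model()`` and ``make_loaders()`` are placeholder helpers for this sketch rather than APIs added by the patch, and the hyperparameter values shown are the script defaults.

.. code-block:: python

    import mxnet as mx
    from mxnet import gluon
    from mxnet.gluon.contrib.estimator import LoggingHandler
    from gluonnlp.estimator import (
        LanguageModelEstimator, LanguageModelBatchProcessor,
        HiddenStateHandler, AvgParamHandler, LearningRateHandler,
        RNNGradientUpdateHandler, MetricResetHandler,
        WordLanguageModelCheckpointHandler)

    context = [mx.gpu(0)]

    # Placeholders: an AWD-LSTM train/eval network pair, the joint activation
    # regularization loss, and the wikitext-2 loaders built as in the script.
    model, model_eval, loss = build_model(context)
    train_data, train_loader, val_loader = make_loaders()

    # Optimizer settings follow the script defaults (sgd, lr=30, wd=1.2e-6).
    trainer = gluon.Trainer(model.collect_params(), 'sgd',
                            {'learning_rate': 30, 'momentum': 0, 'wd': 1.2e-6})

    # Argument names follow the patched estimator; unspecified arguments keep
    # their defaults.
    est = LanguageModelEstimator(net=model, loss=loss,
                                 trainer=trainer, context=context,
                                 val_loss=loss, val_net=model_eval,
                                 batch_processor=LanguageModelBatchProcessor())

    event_handlers = [
        HiddenStateHandler(),                          # reset hidden states at each epoch begin
        AvgParamHandler(data_length=len(train_data)),  # NT-ASGD parameter averaging
        LearningRateHandler(lr_update_interval=30, lr_update_factor=0.1),
        RNNGradientUpdateHandler(clip=0.25),           # clip gradients before trainer.step
        LoggingHandler(log_interval=200,
                       metrics=est.train_metrics + est.val_metrics),
        MetricResetHandler(metrics=est.train_metrics, log_interval=200),
        WordLanguageModelCheckpointHandler('model.params'),
    ]

    est.fit(train_data=train_loader, val_data=val_loader,
            epochs=750, event_handlers=event_handlers)

Each training concern (hidden-state reset, NT-ASGD parameter averaging, learning-rate decay, gradient clipping, metric reset, checkpointing) lives in its own event handler, so the large parallel RNN script reuses the same fit loop by swapping in ``ParallelLanguageModelBatchProcessor``, ``LargeRNNGradientUpdateHandler``, and ``ParallelLoggingHandler``.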