diff --git a/README.md b/README.md
index 23cc8e9..79302b8 100644
--- a/README.md
+++ b/README.md
@@ -49,8 +49,8 @@ The `train/train.py` accepts the following arguments.
 -batch BATCH_SIZE, batch size, default = 2
 
 # Parameters for model
--encoder ENCODER_STYLE, type of encoder NN (LIN, BiLSTM, RNN, BiLSTMMax, HierarchicalRNN,
-                        HierarchicalBiLSTM, HierarchicalLIN)
+-encoder ENCODER_STYLE, type of encoder NN (LIN, BiLSTM, RNN, BiLSTMMaxPool, HierarchicalRNN,
+                        HierarchicalBiLSTMMaxPool, HierarchicalLIN)
 -decoder DECODER_STYLE, type of decoder NN (RNN, HierarchcialRNN)
 
 -copy, if apply pointer-generator network(True, False), default = False
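The renamed -encoder choices map one-to-one onto encoder classes in train/model.py. As a rough guide, the sketch below collects that mapping into a single hypothetical build_encoder() helper, which does not exist in the repository; the real code in train/train.py further down in this patch uses explicit if/elif branches, and embedding_size, emb, and layer_depth stand in for values that train() already computes.

    # Illustrative sketch only: build_encoder() is not part of the codebase.
    from model import (EncoderLIN, EncoderRNN, EncoderBiLSTM, EncoderBiLSTMMaxPool,
                       HierarchicalLIN, HierarchicalRNN,
                       HierarchicalBiLSTM, HierarchicalBiLSTMMaxPool)

    def build_encoder(encoder_style, embedding_size, emb, layer_depth):
        """Return the encoder instance matching an -encoder style string."""
        if encoder_style == 'LIN':
            return EncoderLIN(embedding_size, emb)
        if encoder_style == 'RNN':
            return EncoderRNN(embedding_size, emb)
        if encoder_style == 'BiLSTM':
            return EncoderBiLSTM(embedding_size, emb, n_layers=layer_depth)
        if encoder_style == 'BiLSTMMaxPool':
            return EncoderBiLSTMMaxPool(embedding_size, emb, n_layers=layer_depth)
        if encoder_style == 'HierarchicalLIN':
            return HierarchicalLIN(hidden_size=embedding_size, local_embed=emb)
        # The remaining hierarchical encoders share the same constructor arguments.
        hierarchical = {'HierarchicalRNN': HierarchicalRNN,
                        'HierarchicalBiLSTM': HierarchicalBiLSTM,
                        'HierarchicalBiLSTMMaxPool': HierarchicalBiLSTMMaxPool}
        return hierarchical[encoder_style](hidden_size=embedding_size,
                                           local_embed=emb, n_layers=layer_depth)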
diff --git a/train/model.py b/train/model.py
index 22e3ead..300c84d 100644
--- a/train/model.py
+++ b/train/model.py
@@ -60,7 +60,8 @@ def forward(self, rt, re, rm):
         emb_rm = self.embedding3(rm)
 
         emb_all = torch.cat([emb_rt, emb_re, emb_rm], dim=len(rt.size()))
-        output = self.linear(emb_all)
+        output = F.relu(self.linear(emb_all))
+        # output = self.linear(emb_all)
         return output
 
     def init_weights(self):
@@ -84,19 +85,22 @@ def __init__(self, hidden_size, local_embed):
 
 
 class EncoderLIN(nn.Module):
     """This is the linear encoder for the box score.
 
-    From the origin paper, they use a linear encoder instead of standard
+    From the original paper, they use a linear encoder instead of standard
     sequential RNN style encoder. The encoder will mean pool over the entities
     and then linearly transform the concatenation of these pooled entity
     representations to initialize the decoder.
     """
-    def __init__(self, hidden_size, embedding_layer, level='local'):
+
+    def __init__(self, hidden_size, embedding_layer, level='plain'):
+        """."""
         super(EncoderLIN, self).__init__()
+        self.name = 'LIN'
         self.level = level
         self.hidden_size = hidden_size
-        if self.level == 'local':
+        if self.level == 'plain' or self.level == 'local':
             self.embedding = embedding_layer
-        self.avgpool = nn.AvgPool1d(3, stride=2, padding=1)
+        self.avgpool = nn.AvgPool1d(32)
@@ -104,42 +108,22 @@ def forward(self, inputs, hidden):
         """Dims."""
         # embedded (n_batch, seq_len, emb_dim)
         # global inp: MAX_BLOCK, batch_length, input_length
         # hiddens (max_length, batch, hidden size)
-        # inp: (batch, seq_len, hidden)
-
-        if self.level == 'local':
-            batch_size = inputs['rt'].size()[0]
-            seq_len = inputs['rt'].size()[1]
-            inp = self.embedding(inputs['rt'], inputs['re'], inputs['rm'])
-            hiddens = Variable(torch.zeros(seq_len, batch_size, self.hidden_size),
-                               requires_grad=False)
-            hiddens = hiddens.cuda() if use_cuda else hiddens
-            output = hidden
-
-            for ei in range(seq_len):
-                if ei > 0 and ei % 32 == 0:
-                    output = self.initHidden(batch_size)
-
-                output = torch.cat((inp[:, ei, :], output), dim=1)
-                output = self.avgpool(output.unsqueeze(1))
-                output = output.squeeze(1)
-                hiddens[ei, :, :] = output
+        if self.level == 'global':
+            # AvgPool for each row as R, AvgPool over all R as the hidden.
+            inp = inputs['local_hidden_states'].permute(2, 1, 0)
+            # inp: (seq_len, batch, dimension)
+            seq_len = inp.size(0)
+            outputs = F.avg_pool1d(inp, 32)
+            # 32 is the size of each block
+            hidden = F.avg_pool1d(outputs, int(seq_len / 32)).permute(2, 1, 0)
+            outputs = outputs.permute(2, 1, 0)
         else:
-            batch_size = inputs['local_hidden_states'].size()[1]
-            seq_len = inputs['local_hidden_states'].size()[0]
-            inp = inputs['local_hidden_states'].permute(1, 0, 2)
-            hiddens = Variable(torch.zeros(seq_len, batch_size, self.hidden_size),
-                               requires_grad=False)
-            hiddens = hiddens.cuda() if use_cuda else hiddens
-            output = hidden
-
-            for ei in range(seq_len):
-                output = torch.cat((inp[:, ei, :], output), dim=1)
-                output = self.avgpool(output.unsqueeze(1))
-                output = output.squeeze(1)
-                hiddens[ei, :, :] = output
-
-        return hiddens, output.unsqueeze(0)
+            # Local and Plain.
+            inp = self.embedding(inputs['rt'], inputs['re'], inputs['rm'])
+            outputs = inp.permute(1, 0, 2)
+            hidden = outputs[-1, :, :]
+        return outputs, hidden
 
     def initHidden(self, batch_size):
         result = Variable(torch.zeros(batch_size, self.hidden_size),
                           requires_grad=False)
@@ -160,41 +144,42 @@ def __init__(self, hidden_size, local_embed, n_layers=LAYER_DEPTH):
 
 class EncoderRNN(nn.Module):
     """Vanilla encoder using pure gru."""
-    def __init__(self, hidden_size, embedding_layer, n_layers=LAYER_DEPTH, level='local'):
+    def __init__(self, hidden_size, embedding_layer, n_layers=LAYER_DEPTH, level='plain'):
         super(EncoderRNN, self).__init__()
+        self.name = 'RNN'
         self.level = level
         self.n_layers = n_layers
         self.hidden_size = hidden_size
-        if self.level == 'local':
+        if self.level == 'local' or self.level == 'plain':
             self.embedding = embedding_layer
         self.gru = nn.GRU(hidden_size, hidden_size, num_layers=self.n_layers)
 
     def forward(self, inputs, hidden):
-        # embedded is of size (n_batch, seq_len, emb_dim)
+        # emb (n_batch, seq_len, emb_dim)
+        # inp (seq_len, batch, emb_dim)
         # gru needs (seq_len, n_batch, emb_dim)
-        if self.level == 'local':
-            # local encoder: input is (rt, re, rm)
+        if self.level == 'global':
+            outputs, hidden = self.gru(inputs['local_hidden_states'], hidden)
+        else:
             embedded = self.embedding(inputs['rt'], inputs['re'], inputs['rm'])
             inp = embedded.permute(1, 0, 2)
-            # inp (seq_len, batch, emb_dim)
-            seq_len = embedded.size(1)
-            batch_size = embedded.size(0)
-            embed_dim = embedded.size(2)
-            outputs = Variable(torch.zeros(seq_len, batch_size, embed_dim))
-            outputs = outputs.cuda() if use_cuda else outputs
-
-            for ei in range(seq_len):
-                if ei > 0 and ei % 32 == 0:
-                    hidden = self.initHidden(batch_size)
-                seq_i = inp[ei, :, :].unsqueeze(0)
-                # inputs of size: (1, batch, emb_dim)
-                output, hidden = self.gru(seq_i, hidden)
-                # output of size: (1, batch, emb_dim)
-                outputs[ei, :, :] = output[0, :, :]
-        else:
-            outputs, hidden = self.gru(inputs['local_hidden_states'], hidden)
-            # output (seq_len, batch, hidden_size * num_directions)
-            # 1. hidden is the at t = seq_len
+            if self.level == 'plain':
+                outputs, hidden = self.gru(inp, hidden)
+            else:
+                # Local.
+                seq_len, batch_size, embed_dim = inp.size()
+                outputs = Variable(torch.zeros(seq_len, batch_size, embed_dim))
+                outputs = outputs.cuda() if use_cuda else outputs
+                for ei in range(seq_len):
+                    if ei > 0 and ei % 32 == 0:
+                        hidden = self.initHidden(batch_size)
+                    seq_i = inp[ei, :, :].unsqueeze(0)
+                    # seq_i of size: (1, batch, emb_dim)
+                    output, hidden = self.gru(seq_i, hidden)
+                    # output of size: (1, batch, emb_dim)
+                    outputs[ei, :, :] = output[0, :, :]
+        # outputs (seq_len, batch, hidden_size * num_directions)
+        # hidden is the hidden state at t = seq_len
         return outputs, hidden
 
     def initHidden(self, batch_size):
@@ -206,14 +191,25 @@ def initHidden(self, batch_size):
         return result
 
 
+class HierarchicalBiLSTM(nn.Module):
+    """"""
+    def __init__(self, hidden_size, local_embed, n_layers=LAYER_DEPTH):
+        super(HierarchicalBiLSTM, self).__init__()
+        self.LocalEncoder = EncoderBiLSTM(hidden_size, local_embed,
+                                          n_layers=n_layers, level='local')
+        self.GlobalEncoder = EncoderBiLSTM(hidden_size, None,
+                                           n_layers=n_layers, level='global')
+
+
 class EncoderBiLSTM(nn.Module):
     """Vanilla encoder using pure LSTM."""
-    def __init__(self, hidden_size, embedding_layer, n_layers=LAYER_DEPTH, level='local'):
+    def __init__(self, hidden_size, embedding_layer, n_layers=LAYER_DEPTH, level='plain'):
         super(EncoderBiLSTM, self).__init__()
+        self.name = 'BiLSTM'
         self.level = level
         self.n_layers = n_layers
         self.hidden_size = hidden_size
-        if self.level == 'local':
+        if self.level == 'plain' or self.level == 'local':
             self.embedding = embedding_layer
         self.bilstm = nn.LSTM(hidden_size, hidden_size // 2, num_layers=n_layers,
                               bidirectional=True)
@@ -221,44 +217,47 @@ def forward(self, inputs, hidden):
         # embedded is of size (n_batch, seq_len, emb_dim)
         # lstm needs: (seq_len, batch, input_size)
         # lstm output: (seq_len, batch, hidden_size * num_directions)
-        if self.level == 'local':
+        if self.level == 'global':
+            inp = inputs['local_hidden_states']
+            outputs, (hn, cn) = self.bilstm(inp, hidden)
+            # hn: (num_layers * num_directions, batch, hidden_size):
+            return outputs, hn.view(self.n_layers, -1, self.hidden_size)
+        else:
             embedded = self.embedding(inputs['rt'], inputs['re'], inputs['rm'])
-            # local encoder: input is (rt, re, rm)
             inp = embedded.permute(1, 0, 2)
-            # inp (seq_len, batch, emb_dim)
-            seq_len = embedded.size(1)
-            batch_size = embedded.size(0)
-            embed_dim = embedded.size(2)
-            outputs = Variable(torch.zeros(seq_len, batch_size, embed_dim))
-            outputs = outputs.cuda() if use_cuda else outputs
-
-            for ei in range(seq_len):
-                if ei > 0 and ei % 32 == 0:
-                    hidden = self.initHidden(batch_size)
-                seq_i = inp[ei, :, :].unsqueeze(0)
-                # inputs of size: (1, batch, emb_dim)
-                output, hidden = self.bilstm(seq_i, hidden)
-                # output of size: (1, batch, emb_dim)
-                outputs[ei, :, :] = output[0, :, :]
-
-        else:
-            inp = inputs['local_hidden_states']
-            outputs, hidden = self.bilstm(inp, hidden)
-        return outputs, hidden
+            if self.level == 'plain':
+                outputs, (hn, cn) = self.bilstm(inp, hidden)
+            else:
+                # Local.
+                seq_len, batch_size, embed_dim = inp.size()
+                outputs = Variable(torch.zeros(seq_len, batch_size, embed_dim))
+                outputs = outputs.cuda() if use_cuda else outputs
+                for ei in range(seq_len):
+                    if ei > 0 and ei % 32 == 0:
+                        # Local needs to reinit by block.
+                        hidden = self.initHidden(batch_size)
+                    seq_i = inp[ei, :, :].unsqueeze(0)
+                    # inputs of size: (1, batch, emb_dim)
+                    output, (hn, cn) = self.bilstm(seq_i, hidden)
+                    outputs[ei, :, :] = output[0, :, :]
+                    # output of size: (1, batch, emb_dim)
+            return outputs, hn.view(self.n_layers, -1, self.hidden_size)
 
     def initHidden(self, batch_size):
-        forward = Variable(torch.zeros(2 * self.n_layers, batch_size, self.hidden_size // 2), requires_grad=False)
-        backward = Variable(torch.zeros(2 * self.n_layers, batch_size, self.hidden_size // 2), requires_grad=False)
+        forward = Variable(torch.zeros(2 * self.n_layers, batch_size,
+                                       self.hidden_size // 2), requires_grad=False)
+        backward = Variable(torch.zeros(2 * self.n_layers, batch_size,
+                                        self.hidden_size // 2), requires_grad=False)
         if use_cuda:
             return (forward.cuda(), backward.cuda())
         else:
             return (forward, backward)
 
 
-class HierarchicalBiLSTM(nn.Module):
+class HierarchicalBiLSTMMaxPool(nn.Module):
     """"""
     def __init__(self, hidden_size, local_embed, n_layers=LAYER_DEPTH):
-        super(HierarchicalBiLSTM, self).__init__()
+        super(HierarchicalBiLSTMMaxPool, self).__init__()
         self.LocalEncoder = EncoderBiLSTMMaxPool(hidden_size, local_embed,
                                                  n_layers=n_layers, level='local')
         self.GlobalEncoder = EncoderBiLSTMMaxPool(hidden_size, None,
@@ -267,50 +266,52 @@ def __init__(self, hidden_size, local_embed, n_layers=LAYER_DEPTH):
 
 class EncoderBiLSTMMaxPool(nn.Module):
     """Vanilla encoder using pure LSTM."""
-    def __init__(self, hidden_size, embedding_layer, n_layers=LAYER_DEPTH, level='local'):
+    def __init__(self, hidden_size, embedding_layer, n_layers=LAYER_DEPTH, level='plain'):
         super(EncoderBiLSTMMaxPool, self).__init__()
+        self.name = 'BiLSTMMaxPool'
         self.level = level
         self.n_layers = n_layers
         self.hidden_size = hidden_size
-        if self.level == 'local':
+        if self.level == 'plain' or self.level == 'local':
             self.embedding = embedding_layer
         self.bilstm = nn.LSTM(hidden_size, hidden_size // 2, num_layers=n_layers,
                               bidirectional=True)
 
     def forward(self, inputs, hidden):
         # embedded is of size (n_batch, seq_len, emb_dim)
         # lstm needs: (seq_len, batch, input_size)
-        if self.level == 'local':
+        if self.level == 'global':
+            inp = inputs['local_hidden_states']
+            bilstm_outs, hidden = self.bilstm(inp, hidden)
+        else:
+            # Local or Plain.
             embedded = self.embedding(inputs['rt'], inputs['re'], inputs['rm'])
             inp = embedded.permute(1, 0, 2)
-            seq_len = embedded.size(1)
-            batch_size = embedded.size(0)
-            embed_dim = embedded.size(2)
-            bilstm_outs = Variable(torch.zeros(seq_len, batch_size, embed_dim))
-            bilstm_outs = bilstm_outs.cuda() if use_cuda else bilstm_outs
-
-            for ei in range(seq_len):
-                if ei > 0 and ei % 32 == 0:
-                    hidden = self.initHidden(batch_size)
-
-                inputs = inp[ei, :, :].unsqueeze(0)
-                # inputs of size: (1, batch, emb_dim)
-                outputs, hidden = self.bilstm(inputs, hidden)
-                # output of size: (1, batch, emb_dim)
-                bilstm_outs[ei, :, :] = outputs[0, :, :]
-
-        else:
-            inp = inputs['local_hidden_states']
-            bilstm_outs, nh = self.bilstm(inp, hidden)
-
-        # bilstm_outs: (seq_len, batch, hidden_size * num_directions )
+            if self.level == 'plain':
+                bilstm_outs, hidden = self.bilstm(inp, hidden)
+            else:
+                # Local.
+                seq_len, batch_size, embed_dim = inp.size()
+                bilstm_outs = Variable(torch.zeros(seq_len, batch_size, embed_dim))
+                bilstm_outs = bilstm_outs.cuda() if use_cuda else bilstm_outs
+                for ei in range(seq_len):
+                    if ei > 0 and ei % 32 == 0:
+                        hidden = self.initHidden(batch_size)
+                    inputs = inp[ei, :, :].unsqueeze(0)
+                    # inputs of size: (1, batch, emb_dim)
+                    outputs, hidden = self.bilstm(inputs, hidden)
+                    # output of size: (1, batch, emb_dim)
+                    bilstm_outs[ei, :, :] = outputs[0, :, :]
+        # bilstm_outs: (seq_len, batch, hidden_size * num_directions)
         output = bilstm_outs.permute(1, 2, 0)
         # bilstm_outs: (batch, hidden_size * num_directions, seq_len)
         output = F.max_pool1d(output, output.size(2)).squeeze(2)
-        return bilstm_outs, output
+        return bilstm_outs, output.unsqueeze(0)
 
     def initHidden(self, batch_size):
-        forward = Variable(torch.zeros(2 * self.n_layers, batch_size, self.hidden_size // 2), requires_grad=False)
-        backward = Variable(torch.zeros(2 * self.n_layers, batch_size, self.hidden_size // 2), requires_grad=False)
+        forward = Variable(torch.zeros(2 * self.n_layers, batch_size,
+                                       self.hidden_size // 2), requires_grad=False)
+        backward = Variable(torch.zeros(2 * self.n_layers, batch_size,
+                                        self.hidden_size // 2), requires_grad=False)
         if use_cuda:
             return (forward.cuda(), backward.cuda())
         else:
@@ -422,7 +423,6 @@ def __init__(self, hidden_size, n_layers=LAYER_DEPTH, dropout_p=0.1):
         self.gru = nn.GRU(hidden_size * 2, hidden_size, n_layers, dropout=dropout_p)
 
     def forward(self, input, hidden, encoder_outputs):
-
         attn_weights = self.attn(hidden[-1, :, :], encoder_outputs)
         context = torch.bmm(attn_weights, encoder_outputs)
         output = torch.cat((input, context.squeeze(1)), dim=1)
@@ -557,7 +557,7 @@ def forward(self, hidden, encoder_outputs):
     def score(self, hidden, encoder_outputs):
         # print('size of hidden: {}'.format(hidden.size()))
         # print('size of encoder_hidden: {}'.format(encoder_output.size()))
-        energy = self.attn(encoder_outputs)
+        energy = encoder_outputs
         # batch-wise calculate dot-product
         hidden = hidden.unsqueeze(2)
         # (batch, seq, 1, d)
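At the global level, EncoderLIN above handles the 32-record block structure purely by pooling: one avg_pool1d with kernel 32 collapses each block of local hidden states into a single vector, and a second avg_pool1d over those block vectors yields the final hidden state. A standalone shape check (the sizes are made up, and the Variable wrapper used in model.py is dropped):

    import torch
    import torch.nn.functional as F

    seq_len, batch, hidden = 96, 2, 600                  # 96 records = 3 blocks of 32
    local_hidden_states = torch.zeros(seq_len, batch, hidden)

    inp = local_hidden_states.permute(2, 1, 0)           # (hidden, batch, seq_len)
    outputs = F.avg_pool1d(inp, 32)                      # (hidden, batch, 3): one vector per block
    final_hidden = F.avg_pool1d(outputs, int(seq_len / 32)).permute(2, 1, 0)
    outputs = outputs.permute(2, 1, 0)

    print(outputs.shape)                                 # torch.Size([3, 2, 600])
    print(final_hidden.shape)                            # torch.Size([1, 2, 600])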
diff --git a/train/train.py b/train/train.py
index 6e73ff0..54eb2dc 100644
--- a/train/train.py
+++ b/train/train.py
@@ -10,8 +10,9 @@ from preprocessing import data_iter
 from dataprepare import loaddata, data2index
 from model import docEmbedding, Seq2Seq
-from model import EncoderLIN, EncoderBiLSTM, EncoderBiLSTMMaxPool
-from model import HierarchicalRNN, HierarchicalBiLSTM, HierarchicalLIN
+from model import EncoderLIN, EncoderBiLSTM, EncoderBiLSTMMaxPool, EncoderRNN
+from model import HierarchicalLIN, HierarchicalRNN
+from model import HierarchicalBiLSTM, HierarchicalBiLSTMMaxPool
 from model import AttnDecoderRNN, HierarchicalDecoder
 from util import gettime, load_model, show_triplets
 from util import PriorityQueue
 
@@ -73,12 +74,14 @@ def find_max_block_numbers(batch_length, langs, rm):
 
 
 def initGlobalEncoderInput(MAX_BLOCK, batch_length, input_length, embedding_size,
-                           local_outputs, BLOCK_JUMPS=32):
+                           local_outputs, BLOCK_JUMPS=32, name=None):
     """
     Args:
         local_outputs: (batch, seq_len, embed_size)
     """
     # print("Max block = ", MAX_BLOCK)
     # print("input length = ", input_length)
+    if name == 'LIN':
+        return local_outputs
     global_input = Variable(torch.zeros(MAX_BLOCK, batch_length, embedding_size))
     global_input = global_input.cuda() if use_cuda else global_input
@@ -107,7 +110,6 @@ def Hierarchical_seq_train(rt, re, rm, summary, encoder, decoder,
     batch_length = rt.size()[0]
     input_length = rt.size()[1]
     target_length = summary.size()[1]
-
     # MAX_BLOCK is the number of global hidden states
     # block_lens is the start position of each block
     MAX_BLOCK, blocks_lens = find_max_block_numbers(batch_length, langs, rm)
@@ -119,23 +121,25 @@ def Hierarchical_seq_train(rt, re, rm, summary, encoder, decoder,
 
     loss = 0
 
-    # Encoding
+    # Encoding: Encoder output all has (seq_len, batch, hid_size)
     init_local_hidden = LocalEncoder.initHidden(batch_length)
     init_global_hidden = GlobalEncoder.initHidden(batch_length)
     local_encoder_outputs, local_hidden = LocalEncoder(inputs, init_local_hidden)
     global_input = initGlobalEncoderInput(MAX_BLOCK, batch_length, input_length,
-                                          embedding_size, local_encoder_outputs)
+                                          embedding_size, local_encoder_outputs,
+                                          name=GlobalEncoder.name)
     global_encoder_outputs, global_hidden = GlobalEncoder({"local_hidden_states":
-                                                          global_input}, init_global_hidden)
+                                                           global_input}, init_global_hidden)
+
     """
-    Encoder Result Dimension: (batch, sequence length, hidden size)
+    Encoder Final Dimension: (batch, sequence length, hidden size)
     """
     local_encoder_outputs = local_encoder_outputs.permute(1, 0, 2)
     global_encoder_outputs = global_encoder_outputs.permute(1, 0, 2)
 
     # Debugging: Test encoder outputs
-    # print(local_encoder_outputs)
-    # print(global_encoder_outputs)
+    # print("Local Encoder shape: ", local_encoder_outputs.shape)
+    # print("Global Encoder shape: ", global_encoder_outputs.shape)
 
     # The decoder init for developing
     global_decoder = decoder.global_decoder
@@ -144,7 +148,6 @@ def Hierarchical_seq_train(rt, re, rm, summary, encoder, decoder,
     # Currently, we pad all box-scores to be the same length and blocks
     blocks_len = blocks_lens[0]
 
-    # Initialize the inputs for global decoder and local decoder
     """
     g_input_{0} should be 0 vector with dim (batch, hidden)
     gnh should be the last hidden state of global encoder
@@ -362,10 +365,13 @@ def train(train_set, langs, embedding_size=EMBEDDING_SIZE, learning_rate=LR,
 
     if encoder_style == 'LIN':
         encoder = EncoderLIN(embedding_size, emb)
+
+    elif encoder_style == 'RNN':
+        encoder = EncoderRNN(embedding_size, emb)
+
     elif encoder_style == 'BiLSTM':
         encoder = EncoderBiLSTM(embedding_size, emb, n_layers=layer_depth)
 
-    elif encoder_style == 'BiLSTMMax':
+    elif encoder_style == 'BiLSTMMaxPool':
         encoder = EncoderBiLSTMMaxPool(embedding_size, emb, n_layers=layer_depth)
 
     elif encoder_style == 'HierarchicalBiLSTM':
@@ -373,6 +379,11 @@ def train(train_set, langs, embedding_size=EMBEDDING_SIZE, learning_rate=LR,
                         "n_layers": layer_depth}
         encoder = HierarchicalBiLSTM(**encoder_args)
 
+    elif encoder_style == 'HierarchicalBiLSTMMaxPool':
+        encoder_args = {"hidden_size": embedding_size, "local_embed": emb,
+                        "n_layers": layer_depth}
+        encoder = HierarchicalBiLSTMMaxPool(**encoder_args)
+
     elif encoder_style == 'HierarchicalLIN':
         encoder_args = {"hidden_size": embedding_size, "local_embed": emb}
         encoder = HierarchicalLIN(**encoder_args)
@@ -788,9 +799,7 @@ def evaluate(encoder, decoder, valid_set, lang,
 
 def setupconfig(args):
     """Set up and display the configuration."""
-    # print("Command Line Options:")
-    # # Read in command line parameters.
-
+    # TODO: restricted lin for layer = 1
     parameters = {}
     for arg in vars(args):
         parameters[arg] = getattr(args, arg)
@@ -809,6 +818,10 @@ def setupconfig(args):
         print("You must give me two plain NNs!!!!!!!!!")
         quit()
 
+    if (parameters['encoder_style'] == 'LIN' or parameters['encoder_style'] == 'HierarchicalLIN') and parameters['layer_depth'] != 1:
+        print("Linear encoder only has depth = 1, adjust layer to 1.")
+        parameters['layer_depth'] = 1
+
     copy_player = COPY_PLAYER
     for arg in parameters:
         if arg == 'copy_player':
@@ -848,8 +861,8 @@ def main(args):
 
 def parse_argument():
     """Hyperparmeter tuning."""
-    encoder_choices = ['LIN', 'BiLSTM', 'RNN',
-                       'BiLSTMMax', 'HierarchicalRNN',
+    encoder_choices = ['LIN', 'BiLSTM', 'RNN', 'BiLSTMMaxPool',
+                       'HierarchicalRNN', 'HierarchicalBiLSTMMaxPool',
                        'HierarchicalBiLSTM', 'HierarchicalLIN']
 
     decoder_choices = ['RNN', 'HierarchicalRNN']
@@ -900,7 +913,6 @@ def parse_argument():
 
     return ap.parse_args()
 
-
 if __name__ == '__main__':
     args = parse_argument()
     main(args)
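Both BiLSTM encoders in train/model.py reduce a (seq_len, batch, hidden_size) output sequence to something the decoder can take as an initial state: EncoderBiLSTMMaxPool max-pools over the time dimension, while EncoderBiLSTM reshapes the bidirectional final state hn. A standalone check of the shapes only (sizes are made up and the Variable wrapper is dropped):

    import torch
    import torch.nn.functional as F

    seq_len, batch, hidden, n_layers = 96, 2, 600, 2

    # EncoderBiLSTMMaxPool: element-wise max over time, then unsqueeze(0).
    bilstm_outs = torch.randn(seq_len, batch, hidden)
    pooled = F.max_pool1d(bilstm_outs.permute(1, 2, 0), seq_len).squeeze(2)
    print(pooled.unsqueeze(0).shape)            # torch.Size([1, 2, 600])

    # EncoderBiLSTM: (num_layers * num_directions, batch, hidden // 2)
    #                -> (n_layers, batch, hidden), as returned to the caller.
    hn = torch.randn(n_layers * 2, batch, hidden // 2)
    print(hn.view(n_layers, -1, hidden).shape)  # torch.Size([2, 2, 600])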
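Putting the Hierarchical_seq_train changes together, the encoding path keeps a fixed shape contract: both encoders emit (seq_len, batch, hidden_size), and both outputs are flipped to (batch, seq_len, hidden_size) before attention and decoding. A condensed sketch with stand-in tensors in place of the real modules (sizes are made up; the non-LIN branch of initGlobalEncoderInput, which builds the (MAX_BLOCK, batch, hidden) tensor from the local outputs, lies outside the hunks shown above and is not reproduced here):

    import torch

    batch, seq_len, hidden = 2, 96, 600
    max_block = seq_len // 32                             # one global step per 32-record block

    # LocalEncoder(inputs, init_local_hidden) -> (seq_len, batch, hidden)
    local_encoder_outputs = torch.zeros(seq_len, batch, hidden)

    # initGlobalEncoderInput(...): passed through unchanged when the global
    # encoder is 'LIN', otherwise shaped (MAX_BLOCK, batch, hidden).
    global_input = torch.zeros(max_block, batch, hidden)

    # GlobalEncoder({"local_hidden_states": global_input}, init_global_hidden)
    global_encoder_outputs = torch.zeros(max_block, batch, hidden)

    # Flipped to (batch, seq_len, hidden) before decoding, per the
    # "Encoder Final Dimension" comment in the hunk above.
    print(local_encoder_outputs.permute(1, 0, 2).shape)   # torch.Size([2, 96, 600])
    print(global_encoder_outputs.permute(1, 0, 2).shape)  # torch.Size([2, 3, 600])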