diff --git a/.gitignore b/.gitignore
index 1bfae15..3d229f9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -107,3 +107,6 @@ ENV/
 
 # Vagrant
 .vagrant/
+
+# PNGs
+*.png
diff --git a/train/dataprepare.py b/train/dataprepare.py
index e092771..97b8a9c 100644
--- a/train/dataprepare.py
+++ b/train/dataprepare.py
@@ -20,13 +20,14 @@ class Lang:
     """
 
-    def __init__(self, name):
+    def __init__(self, name, threshold=1):
        """Init Lang with a name."""
        # Ken added on 04/04/2018
        self.name = name
        self.word2index = {"<SOS>": 0, "<EOS>": 1, "<PAD>": 2, "<UNK>": 3, "<EOB>": 4, "<BLK>": 5}
        self.word2count = {"<SOS>": 0, "<EOS>": 0, "<EOB>": 0, "<BLK>": 0}
        self.index2word = {0: "<SOS>", 1: "<EOS>", 2: "<PAD>", 3: "<UNK>", 4: "<EOB>", 5: "<BLK>"}
+       self.threshold = threshold
        self.n_words = 5  # Count SOS and EOS
 
     def addword(self, word):
@@ -45,7 +46,7 @@ def addword(self, word):
             self.n_words += 1
         else:
             self.word2count[word] += 1
-
+
 def readLang(data_set):
     """The function to wrap up a data_set.
@@ -157,7 +158,7 @@ def findword2index(lang, word):
         # data_set[i].append([idx_triplets] + [idx_summary])
         data_set[i].idx_data = [idx_triplets] + [idx_summary]
         data_set[i].sent_leng = sentence_cnt
-
+
     return data_set
diff --git a/train/settings.py b/train/settings.py
index f8f1418..fc67498 100644
--- a/train/settings.py
+++ b/train/settings.py
@@ -4,13 +4,13 @@
 use_cuda = torch.cuda.is_available()
 
 MAX_LENGTH = 800
-LAYER_DEPTH = 2
-MAX_SENTENCES = 5
-MAX_TRAIN_NUM = 200
+LAYER_DEPTH = 4
+MAX_SENTENCES = None
+MAX_TRAIN_NUM = None
 
 Model_name = None
-#Model_name = 'pretrain_ms8'
-iterNum = 500
+Model_name = 'pl'
+iterNum = 8495
 USE_MODEL = None
 if Model_name is not None:
     USE_MODEL = ['./models/'+Model_name + '_' + s + '_' + str(iterNum) for s in ['encoder', 'decoder', 'optim']]
@@ -25,23 +25,23 @@
 # LR = 0.003  # Adam
 ITER_TIME = 220
 BATCH_SIZE = 2
-GRAD_CLIP = 5
+GRAD_CLIP = 2
 
 # Parameter for display
-GET_LOSS = 1
-SAVE_MODEL = 5
+GET_LOSS = 10
+SAVE_MODEL = 1
 
 # Choose models
 # ENCODER_STYLE = 'LIN'
-ENCODER_STYLE = 'BiLSTM'
+ENCODER_STYLE = 'BiLSTMMax'
 #ENCODER_STYLE = 'RNN'
 DECODER_STYLE = 'RNN'
 
 # ENCODER_STYLE = 'HierarchicalBiLSTM'
 #ENCODER_STYLE = 'HierarchicalRNN'
 #DECODER_STYLE = 'HierarchicalRNN'
 
-OUTPUT_FILE = 'pretrain_copy_ms5'
+OUTPUT_FILE = 'copy'
 COPY_PLAYER = True
 TOCOPY = True
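
Note: the COPY_PLAYER/TOCOPY flags above switch on the pointer-generator-style copy mechanism that the evaluation and training diffs below rely on. As a reference for the scatter_add pattern used there, here is a minimal, self-contained sketch; the function name and shapes are illustrative, not part of the repo, and it assumes (as the diff does) that the decoder's log-softmax output already carries its pgen factor:

    import torch

    def mix_copy_distribution(log_probs, attn, rm, pgen):
        """Blend generation and copy scores, pointer-generator style.

        log_probs: (batch, vocab)   decoder log-softmax output
        attn:      (batch, src_len) attention over input positions
        rm:        (batch, src_len) vocabulary ids of the input records
        pgen:      (batch, 1)       generation (vs. copy) probability
        """
        prob_copy = torch.zeros_like(log_probs)
        # Route each attention weight to the vocabulary id it points at.
        prob_copy = prob_copy.scatter_add(1, rm, attn)
        # Same mixture as the diff: generation term + (1 - pgen) * copy term.
        return (log_probs.exp() + (1 - pgen) * prob_copy).log()

    # Example: vocabulary of 10, four input records.
    lp = torch.log_softmax(torch.randn(1, 10), dim=1)
    at = torch.softmax(torch.randn(1, 4), dim=1)
    rm = torch.tensor([[7, 2, 7, 5]])
    out = mix_copy_distribution(lp, at, rm, pgen=torch.tensor([[0.6]]))
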
diff --git a/train/small_evaluate.py b/train/small_evaluate.py
index e03b86b..6410156 100644
--- a/train/small_evaluate.py
+++ b/train/small_evaluate.py
@@ -1,22 +1,27 @@
 """Evaluate the model."""
+import time
+
 import torch
 from torch.autograd import Variable
 
+import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
+
 from preprocessing import data_iter
 from dataprepare import loaddata, data2index
-from train import get_batch, model_initialization, addpaddings
+from train import get_batch, model_initialization, addpaddings, find_max_block_numbers, initGlobalEncoderInput
 from model import docEmbedding, Seq2Seq
 from model import EncoderLIN, EncoderBiLSTM, EncoderBiLSTMMaxPool
 from model import HierarchicalEncoderRNN, HierarchicalBiLSTM, HierarchicalLIN
 from model import AttnDecoderRNN, HierarchicalDecoder
-from util import PriorityQueue
+from util import PriorityQueue, gettime
 
 from settings import file_loc, use_cuda
 from settings import EMBEDDING_SIZE, ENCODER_STYLE, DECODER_STYLE
-from settings import USE_MODEL
+from settings import USE_MODEL, Model_name
 from util import load_model
 
 
@@ -27,8 +32,7 @@
 EOB_TOKEN = 4
 BLK_TOKEN = 5
 
-def hierarchical_predictwords(rt, re, rm, encoder, decoder, embedding_size,
-                              encoder_style, beam_size):
+def hierarchical_predictwords(rt, re, rm, encoder, decoder, embedding_size, langs, beam_size):
     """The function will predict the sentences given the boxscore.
 
     Encode the given box score, decode it to sentences, and then
@@ -41,51 +45,32 @@ def hierarchical_predictwords(rt, re, rm, encoder, decoder, embedding_size,
     input_length = rt.size()[1]
     target_length = 1000
 
+    # MAX_BLOCK is the number of global hidden states
+    # blocks_len holds the start position of each block
     MAX_BLOCK, blocks_len = find_max_block_numbers(batch_length, langs, rm)
-    BLOCK_JUMPS = 32
+
+    inputs = {"rt": rt, "re": re, "rm": rm}
 
     LocalEncoder = encoder.LocalEncoder
     GlobalEncoder = encoder.GlobalEncoder
 
-    # For now, these are redundant
-    local_encoder_outputs = Variable(torch.zeros(batch_length, input_length, embedding_size))
-    local_encoder_outputs = local_encoder_outputs.cuda() if use_cuda else local_encoder_outputs
-    global_encoder_outputs = Variable(torch.zeros(batch_length, MAX_BLOCK, embedding_size))
-    global_encoder_outputs = global_encoder_outputs.cuda() if use_cuda else global_encoder_outputs
-
-    # Encoding
-    if encoder_style == 'BiLSTM':
-        init_hidden = encoder.initHidden(batch_length)
-        encoder_hidden, encoder_hiddens = encoder(rt, re, rm, init_hidden)
-
-        # Store memory information
-        for ei in range(input_length):
-            encoder_outputs[:, ei] = encoder_hiddens[:, ei]
+    init_local_hidden = LocalEncoder.initHidden(batch_length)
+    init_global_hidden = GlobalEncoder.initHidden(batch_length)
+    local_encoder_outputs, local_hidden = LocalEncoder(inputs, init_local_hidden)
+    global_input = initGlobalEncoderInput(MAX_BLOCK, batch_length, input_length,
+                                          embedding_size, local_encoder_outputs)
+    global_encoder_outputs, global_hidden = GlobalEncoder({"local_hidden_states":
+                                                           global_input}, init_global_hidden)
+    """
+    Encoder Result Dimension: (batch, sequence length, hidden size)
+    """
+    local_encoder_outputs = local_encoder_outputs.permute(1, 0, 2)
+    global_encoder_outputs = global_encoder_outputs.permute(1, 0, 2)
 
-    else:
-        # Local Encoder set up
-        init_local_hidden = LocalEncoder.initHidden(batch_length)
-        local_out, local_hidden = LocalEncoder({"rt": rt, "re": re, "rm": rm},
-                                               init_local_hidden)
-        # Global Encoder setup
-        global_input = Variable(torch.zeros(MAX_BLOCK, batch_length,
-                                            embedding_size))
-        global_input = global_input.cuda() if use_cuda else global_input
-        for ei in range(input_length):
-            if ei % BLOCK_JUMPS == 0:
-                # map ei to block number
-                global_input[int(ei / (BLOCK_JUMPS + 1)), :, :] = local_out[ei, :, :]
-
-        init_global_hidden = GlobalEncoder.initHidden(batch_length)
-        global_out, global_hidden = GlobalEncoder({"local_hidden_states":
-                                                   global_input}, init_global_hidden)
-        """
-        Store memory information
-        Unify dimension: (batch, sequence length, hidden size)
-        """
-        local_encoder_outputs = local_out.permute(1, 0, 2)
-        global_encoder_outputs = global_out.permute(1, 0, 2)
+    # Debugging: Test encoder outputs
+    # print(local_encoder_outputs)
+    # print(global_encoder_outputs)
 
     # The decoder init for developing
     global_decoder = decoder.global_decoder
@@ -102,11 +87,16 @@ def hierarchical_predictwords(rt, re, rm, encoder, decoder, embedding_size,
     l_input = Variable(torch.LongTensor(batch_length).zero_(), requires_grad=False)
     l_input = l_input.cuda() if use_cuda else l_input
 
+    # Reshape the local encoder outputs to (batch * blocks, blk_size, hidden)
+    local_encoder_outputs = local_encoder_outputs.contiguous().view(batch_length * len(blocks_len),
+                                                                    input_length // len(blocks_len),
+                                                                    embedding_size)
+
     decoder_attentions = torch.zeros(target_length, input_length)
 
     # Initialize the Beam
-    # Each Beam cell contains [prob, route, decoder_hidden, atten]
-    beams = [[0, [SOS_TOKEN], encoder_hidden, decoder_attentions]]
+    # Each Beam cell contains [prob, route, gnh, lnh, g_input, g_attn_weight, atten]
+    beams = [[0, [SOS_TOKEN], gnh, lnh, g_input, None, decoder_attentions]]
 
     # For each step
     for di in range(target_length):
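
The beam list initialized above is expanded step by step in the hunks that follow: every live hypothesis is extended, candidates are pushed into a priority queue keyed by cumulative log-probability, and only the best K survive. A generic sketch of that loop body, using heapq in place of the repo's PriorityQueue (whose interface is assumed):

    import heapq

    def beam_step(beams, expand_fn, beam_size, eos_id):
        """One expansion step of beam search.

        beams:     list of (score, tokens) hypotheses
        expand_fn: tokens -> list of (token_id, log_prob) continuations
        """
        candidates = []
        for score, tokens in beams:
            if tokens[-1] == eos_id:            # finished beams pass through
                candidates.append((score, tokens))
                continue
            for tok, logp in expand_fn(tokens)[:beam_size]:
                candidates.append((score + logp, tokens + [tok]))
        # Keep only the K highest-scoring candidates.
        return heapq.nlargest(beam_size, candidates, key=lambda c: c[0])
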
@@ -115,7 +105,7 @@ def hierarchical_predictwords(rt, re, rm, encoder, decoder, embedding_size,
         q = PriorityQueue()
 
         for beam in beams:
-            prob, route, decoder_hidden, atten = beam
+            prob, route, gnh, lnh, g_input, g_attn_weights, atten = beam
             destination = len(route) - 1
 
             # Get the latest prediction
@@ -125,23 +115,48 @@ def hierarchical_predictwords(rt, re, rm, encoder, decoder, embedding_size,
             if decoder_input == EOS_TOKEN:
                 q.push(beam, prob)
                 continue
-
-            decoder_input = Variable(torch.LongTensor([decoder_input]))
-            decoder_input = decoder_input.cuda() if use_cuda else decoder_input
-
-            decoder_output, decoder_hidden, decoder_attention = decoder(
-                decoder_input, decoder_hidden, encoder_outputs)
+
+            if di == 0 or decoder_input == BLK_TOKEN:
+                g_output, gnh, g_context, g_attn_weights = global_decoder(
+                    g_input, gnh, global_encoder_outputs)
+
+            l_input = Variable(torch.LongTensor([decoder_input]), requires_grad=False)
+            l_input = l_input.cuda() if use_cuda else l_input
+
+            l_output, lnh, l_context, l_attn_weights, pgen = local_decoder(
+                l_input, lnh, g_attn_weights, local_encoder_outputs, blocks_len)
+
+            l_attn_weights = l_attn_weights.squeeze(1)
+            bg_attn_weights = g_attn_weights.view(batch_length * len(blocks_len), -1)
+            # print(l_attn_weights)
+            # print(g_attn_weights)
+            # print(bg_attn_weights)
+            combine_attn_weights = l_attn_weights * bg_attn_weights
+            combine_attn_weights = combine_attn_weights.view(batch_length, -1)
+            # print(torch.sum(combine_attn_weights))
+
+            if local_decoder.copy:
+                prob_copy = Variable(torch.zeros(l_output.shape), requires_grad=False)
+                prob_copy = prob_copy.cuda() if use_cuda else prob_copy
+
+                # Now we have rm as (batch, input) and combine_attn_weights as (batch, input)
+                # Add the attention mass up into the copy probability matrix
+                prob_copy = prob_copy.scatter_add(1, rm, combine_attn_weights)
+
+                l_output_new = (l_output.exp() + (1 - pgen) * prob_copy).log()
+            else:
+                l_output_new = l_output
 
             # Get the attention vector at each prediction
-            atten[destination, :decoder_attention.shape[2]] = decoder_attention.data[0, 0, :]
+            atten[destination, :combine_attn_weights.shape[1]] = combine_attn_weights.data[0, :]
 
             # decode the word
-            topv, topi = decoder_output.data.topk(beam_size)
+            topv, topi = l_output_new.data.topk(beam_size)
 
             for i in range(beam_size):
                 p = topv[0][i]
                 idp = topi[0][i]
-                new_beam = [prob + p, route + [idp], decoder_hidden, atten]
+                new_beam = [prob + p, route + [idp], gnh, lnh, lnh[-1, :, :], g_attn_weights, atten]
                 q.push(new_beam, new_beam[0])
 
             # Keep the highest K probability
@@ -152,13 +167,12 @@ def hierarchical_predictwords(rt, re, rm, encoder, decoder, embedding_size,
             break
 
     # Get decoded_words and decoder_attentions
-    decoded_words = [lang.index2word[w] for w in beams[0][1][1:]]
-    decoder_attentions = beams[0][3]
+    decoded_words = [langs['summary'].index2word[w.item()] for w in beams[0][1][1:]]
+    decoder_attentions = beams[0][6]
 
     return decoded_words, decoder_attentions[:len(decoded_words)]
 
 
-def predictwords(rt, re, rm, summary, encoder, decoder, lang, embedding_size,
-                 encoder_style, beam_size):
+def predictwords(rt, re, rm, encoder, decoder, embedding_size, langs, beam_size):
     """The function will predict the sentences given the boxscore.
 
     Encode the given box score, decode it to sentences, and then
     return the prediction and attention matrix.
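
In the hierarchical decoding hunks above, each block's local attention is multiplied by that block's global attention weight (combine_attn_weights), so the flattened product is again a single distribution over all input positions. A small sketch of that reshaping, assuming equal-sized blocks (the real code derives boundaries from blocks_len):

    import torch

    def combine_attention(l_attn, g_attn, batch, n_blocks, blk_size):
        """l_attn: (batch * n_blocks, blk_size); g_attn: (batch, n_blocks)."""
        assert l_attn.shape == (batch * n_blocks, blk_size)
        bg = g_attn.view(batch * n_blocks, 1)    # block weight, broadcastable
        combined = l_attn * bg                   # scale local by global
        return combined.view(batch, -1)          # (batch, n_blocks * blk_size)

    # Sanity check: the product still sums to one per example.
    l = torch.softmax(torch.randn(2 * 3, 5), dim=1)
    g = torch.softmax(torch.randn(2, 3), dim=1)
    print(combine_attention(l, g, 2, 3, 5).sum(1))  # ~ tensor([1., 1.])
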
@@ -214,13 +228,13 @@ def predictwords(rt, re, rm, summary, encoder, decoder, lang, embedding_size,
                 decoder_input, decoder_hidden, encoder_outputs)
 
             if decoder.copy:
-                prob = Variable(torch.zeros(decoder_output.shape), requires_grad=False)
-                prob = prob.cuda() if use_cuda else prob
+                prob_copy = Variable(torch.zeros(decoder_output.shape), requires_grad=False)
+                prob_copy = prob_copy.cuda() if use_cuda else prob_copy
 
                 decoder_attention = decoder_attention.squeeze(1)
-                prob = prob.scatter_add(1, rm, decoder_attention)
+                prob_copy = prob_copy.scatter_add(1, rm, decoder_attention)
 
-                decoder_output_new = (decoder_output.exp() + (1-pgen)*prob).log()
+                decoder_output_new = (decoder_output.exp() + (1-pgen)*prob_copy).log()
                 decoder_attention = decoder_attention.unsqueeze(1)
             else:
                 decoder_output_new = decoder_output
@@ -230,13 +244,13 @@ def predictwords(rt, re, rm, summary, encoder, decoder, lang, embedding_size,
 
             # decode the word
             # print(decoder_output)
-            topv, topi = decoder_output.data.topk(beam_size)
-
+            topv, topi = decoder_output_new.data.topk(beam_size)
+
             for i in range(beam_size):
-                p = topv[0][i]
-                idp = topi[0][i]
+                p = topv[0, i]
+                idp = topi[0, i]
                 new_beam = [prob + p, route + [idp], decoder_hidden, atten]
-                # print(new_beam[0])
                 q.push(new_beam, new_beam[0])
 
             # Keep the highest K probability
@@ -247,7 +261,7 @@ def predictwords(rt, re, rm, summary, encoder, decoder, lang, embedding_size,
             break
 
     # Get decoded_words and decoder_attentions
-    decoded_words = [lang.index2word[w.item()] for w in beams[0][1][1:]]
+    decoded_words = [langs['summary'].index2word[w.item()] for w in beams[0][1][1:]]
     decoder_attentions = beams[0][3]
 
     return decoded_words, decoder_attentions[:len(decoded_words)]
@@ -270,7 +284,7 @@ def evaluate(valid_set, langs, embedding_size,
     # Build the model
     model = Seq2Seq(encoder, decoder, None, decode_func, criterion,
                     embedding_size, langs)
-
+    model.eval()
 
     # Get evaluate data
     valid_iter = data_iter(valid_set, batch_size=1, shuffle=False)
@@ -303,23 +317,45 @@ def evaluate(valid_set, langs, embedding_size,
         rt, re, rm, summary = rt.cuda(), re.cuda(), rm.cuda(), summary.cuda()
 
         # Get decoding words and attention matrix
-        decoded_words, decoder_attentions = predictwords(rt, re, rm, summary,
-                                                         encoder, decoder, langs['summary'],
-                                                         embedding_size, encoder_style,
-                                                         beam_size)
+        decoded_words, decoder_attentions = model.seq_decode(rt, re, rm, beam_size)
 
-        res = ' '.join(decoded_words[:-1])
+        res = ' '.join([w for w in decoded_words[:-1] if w != '<PAD>'])
+        # res = ' '.join(decoded_words[:-1])
         if verbose:
+            print("Generate Summary {}:".format(iteration))
             print(res)
-        yield res
 
         # # FOR WRITING REPORTS ONLY
         # # Compare to the origin data
+        print("Reference Summary:")
+        triplets, gold_summary = data[0]
+
+        for word in gold_summary:
+            print(word, end=' ')
+        print(' ')
+
+        print(torch.sum(rt == EOB_TOKEN))
+        block_num = torch.sum(rt == EOB_TOKEN).item()
+        ctr = 0
+        fig = plt.figure(figsize=(40, 60))
+        for i in range(block_num):
+            ctr_end = ctr
+            while rt[0, ctr_end] != EOB_TOKEN and ctr_end + 1 < rt.shape[1]:
+                ctr_end += 1
+            ax = fig.add_subplot(block_num, 1, i + 1)
+            mat = ax.matshow(decoder_attentions.t()[ctr:ctr_end + 1, :], interpolation='nearest')
+            ctr = ctr_end + 1
+        fig.subplots_adjust(right=0.8)
+        cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
+        fig.colorbar(mat, cax=cbar_ax)
+
+        # ax.set_xticklabels(decoded_words)
+        # ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
+        # ax.set_yticklabels([''] + alpha)
+
+        plt.savefig(Model_name + '_' + str(iteration) + '.png')
+        yield res
 
-        # for word in gold_summary:
-        #     print(word, end=' ')
-        #     print(' ')
+        # showAttention(triplets, decoded_words, decoder_attentions)
 
     return
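
The new plotting code in evaluate() above cuts the attention matrix at <EOB> positions and stacks one heatmap per block, sharing a single colorbar. A condensed sketch of the same layout with synthetic data (block boundaries passed in directly instead of scanned from rt; names are illustrative):

    import matplotlib
    matplotlib.use('Agg')            # headless backend, save to file only
    import matplotlib.pyplot as plt
    import torch

    def plot_block_attention(attn, block_bounds, path):
        """attn: (input_len, output_len); block_bounds: [(start, end), ...]."""
        fig = plt.figure(figsize=(10, 3 * len(block_bounds)))
        for i, (s, e) in enumerate(block_bounds):
            ax = fig.add_subplot(len(block_bounds), 1, i + 1)
            mat = ax.matshow(attn[s:e + 1, :], interpolation='nearest')
        fig.subplots_adjust(right=0.8)
        cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
        fig.colorbar(mat, cax=cbar_ax)   # one colorbar for all panels
        plt.savefig(path)
        plt.close(fig)

    plot_block_attention(torch.rand(12, 7), [(0, 3), (4, 7), (8, 11)], 'attn.png')
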
@@ -333,19 +369,23 @@ def main():
     decoder_style = DECODER_STYLE
     use_model = USE_MODEL
 
+    print(use_model)
+
     # Prepare data for loading the model
-    train_data, train_lang = loaddata(file_loc, 'train')
+    _, train_lang = loaddata(file_loc, 'train')
 
     # Load data for evaluation
     valid_data, _ = loaddata(file_loc, 'valid')
     valid_data = data2index(valid_data, train_lang)
 
     text_generator = evaluate(valid_data, train_lang, embedding_size,
                               encoder_style, decoder_style,
-                              use_model, beam_size=1, verbose=False)
+                              use_model, beam_size=15, verbose=True)
 
     # Generate Text
+    start = time.time()
     for idx, text in enumerate(text_generator):
-        print('Generate Summary {}:\n{}'.format(idx + 1, text))
+        print('Time: {}:\n'.format(gettime(start)))
 
 
 if __name__ == '__main__':
-    main()
+    with torch.no_grad():
+        main()
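
Wrapping main() in torch.no_grad(), together with the model.eval() call added earlier, is the standard inference setup: eval() puts dropout and similar layers into test mode, while no_grad() keeps autograd from recording the decode, which substantially reduces memory. The pattern in isolation:

    import torch
    import torch.nn as nn

    net = nn.LSTM(input_size=8, hidden_size=16)  # stand-in for the Seq2Seq model
    net.eval()                     # test-time behavior (dropout off, etc.)

    with torch.no_grad():          # nothing below is tracked by autograd
        x = torch.randn(5, 1, 8)   # (seq_len, batch, features)
        out, _ = net(x)
        assert not out.requires_grad
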
diff --git a/train/train.py b/train/train.py
index ebd8c82..a0f16df 100644
--- a/train/train.py
+++ b/train/train.py
@@ -63,7 +63,7 @@ def find_max_block_numbers(batch_length, langs, rm):
     BLOCK_NUMBERS = np.ones(batch_length)
     for bi in range(batch_length):
         for ei in range(len(rm[bi, :])):
-            if langs['rm'].index2word[int(rm[bi, ei].data[0])] == '<EOB>':
+            if langs['rm'].index2word[int(rm[bi, ei].item())] == '<EOB>':
                 blocks_lens[bi].append(ei)
                 BLOCK_NUMBERS[bi] += 1
     return int(np.max(BLOCK_NUMBERS)), blocks_lens
@@ -226,6 +226,20 @@ def Plain_seq_train(rt, re, rm, summary, encoder, decoder,
     decoder_hidden[0, :, :] = context_vec  # might be zero
     decoder_input = Variable(torch.LongTensor(batch_length).zero_(), requires_grad=False)
     decoder_input = decoder_input.cuda() if use_cuda else decoder_input
+
+    # Calculate OOVs
+    # 0 is <PAD>
+    # oov2index = {'<PAD>': 0}
+    # oov2_ctr = 1
+    # print(rm[rm == 3])
+    # for b in range(batch_length):
+    #     print(data[b][1])
+    #     for i in range(len(data[b][1])):
+    #         if rm.data[b, i] == 3 and data[b][1][i] not in oov2index:
+    #             oov2index[data[b][1][i]] = oov2_ctr
+    #             oov2_ctr += 1
+
+    # print(oov2index)
 
     # Feed the target as the next input
     for di in range(target_length):
@@ -240,8 +254,27 @@ def Plain_seq_train(rt, re, rm, summary, encoder, decoder,
             decoder_attention = decoder_attention.squeeze(1)
             prob = prob.scatter_add(1, rm, decoder_attention)
 
+            # reset prob
+            # prob[:, 3] = 0
+
+            # # calculate oov prob.
+            # oovrm = [[(oov2index[w] if w in oov2index else 0) for w in data[i][1]] for i in range(batch_length)]
+            # oovrm = addpaddings(oovrm, toZero=True)
+            # oovrm = Variable(torch.LongTensor(oovrm), requires_grad=False)
+            # oovrm = oovrm.cuda() if use_cuda else oovrm
+            # # print(rm != 3)
+            # # print(oovrm == 0)
+            # # print(torch.sum((rm != 3) - (oovrm == 0), 1))
+            # prob_oov = Variable(torch.zeros([batch_length, oov2_ctr]), requires_grad=False)
+            # prob_oov = prob_oov.cuda() if use_cuda else prob_oov
+
+            # prob_oov = prob_oov.scatter_add(1, oovrm, decoder_attention)
+            # prob_oov[:, 0] = 0
+
+            # print(torch.sum(prob, 1))
+            # print(torch.sum(prob_oov, 1))
             decoder_output_new = (decoder_output.exp() + (1-pgen)*prob).log()
+            # print(torch.sum(decoder_output_new.exp(), 1) + torch.sum((1-pgen)*prob_oov, 1))
         else:
             decoder_output_new = decoder_output
         loss += criterion(decoder_output_new, summary[:, di])
@@ -300,7 +333,7 @@ def to_list(matrix):
     return summarizes
 
 
-def addpaddings(tokens):
+def addpaddings(tokens, toZero=False):
     """A helper function to add paddings to tokens.
 
     Args:
@@ -311,7 +344,10 @@
     """
     max_length = len(max(tokens, key=len))
     for i in range(len(tokens)):
-        tokens[i] += [PAD_TOKEN for i in range(max_length - len(tokens[i]))]
+        if toZero:
+            tokens[i] += [0 for i in range(max_length - len(tokens[i]))]
+        else:
+            tokens[i] += [PAD_TOKEN for i in range(max_length - len(tokens[i]))]
     return tokens
 
 
 def model_initialization(encoder_style, decoder_style, langs, embedding_size, learning_rate, use_model):
@@ -402,6 +438,7 @@
     for dt in train_iter:
         iteration += 1
         data, idx_data = get_batch(dt)
+        print(idx_data)
         rt, re, rm, summary = idx_data
 
         # Debugging: check the input triplets
@@ -412,6 +449,7 @@
         re = addpaddings(re)
         rm = addpaddings(rm)
 
+        # For summary paddings: if the model is hierarchical, pad between sentences
         if decoder_style == 'HierarchicalRNN':
             summary = add_sentence_paddings(summary)
 
@@ -421,6 +459,11 @@
         rt = Variable(torch.LongTensor(rt), requires_grad=False)
         re = Variable(torch.LongTensor(re), requires_grad=False)
         rm = Variable(torch.LongTensor(rm), requires_grad=False)
+
+        # DEBUG: skip batches whose records contain no <UNK> (index 3)
+        if torch.sum(rm == 3).item() == 0:
+            print('skip')
+            continue
 
         # For Decoding
         summary = Variable(torch.LongTensor(summary), requires_grad=False)
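
For reference, the effect of the addpaddings change above: toZero=True pads with index 0 rather than PAD_TOKEN, which the (commented-out) OOV bookkeeping needs because slot 0 of its OOV table is the "not an OOV" bucket. A standalone rendering, assuming PAD_TOKEN = 2 as in dataprepare.py:

    PAD_TOKEN = 2  # index of <PAD> in dataprepare.Lang

    def addpaddings(tokens, toZero=False):
        """Pad every sequence to the length of the longest one."""
        max_length = len(max(tokens, key=len))
        fill = 0 if toZero else PAD_TOKEN
        for i in range(len(tokens)):
            tokens[i] += [fill] * (max_length - len(tokens[i]))
        return tokens

    print(addpaddings([[7, 8], [9]]))               # [[7, 8], [9, 2]]
    print(addpaddings([[7, 8], [9]], toZero=True))  # [[7, 8], [9, 0]]
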