microsoft
diff --git a/‎LearningMachine.py‎
Lines changed: 74 additions & 31 deletions b/‎LearningMachine.py‎
Lines changed: 74 additions & 31 deletions
diff --git a/‎Model.py‎
Lines changed: 19 additions & 7 deletions b/‎Model.py‎
Lines changed: 19 additions & 7 deletions
diff --git a/‎ModelConf.py‎
Lines changed: 3 additions & 0 deletions b/‎ModelConf.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎block_zoo/BaseLayer.py‎
Lines changed: 5 additions & 0 deletions b/‎block_zoo/BaseLayer.py‎
Lines changed: 5 additions & 0 deletions
@@ -13,7 +13,7 @@
 import pickle as pkl
 
 from utils.common_utils import dump_to_pkl, load_from_pkl, get_param_num, get_trainable_param_num, \
-    transfer_to_gpu, transform_params2tensors
+    transfer_to_gpu, transform_params2tensors, get_layer_class
 from utils.philly_utils import HDFSDirectTransferer, open_and_move, convert_to_tmppath, \
     convert_to_hdfspath, move_from_local_to_hdfs
 from Model import Model
@@ -24,6 +24,8 @@
 from core.LRScheduler import LRScheduler
 from settings import ProblemTypes
 from block_zoo import Linear
+from block_zoo import CRF
+from losses.CRFLoss import CRFLoss
 
 
 class LearningMachine(object):
@@ -169,6 +171,8 @@ def train(self, optimizer, loss_fn):
                                 (not self.model.module.layers[tmp_output_layer_id].layer_conf.last_hidden_softmax):
                             logits_softmax[tmp_output_layer_id] = nn.functional.softmax(
                                 logits[tmp_output_layer_id], dim=-1)
+                        elif isinstance(get_layer_class(self.model, tmp_output_layer_id), CRF):
+                            pass
                         else:
                             logits_softmax[tmp_output_layer_id] = logits[tmp_output_layer_id]
                 else:
@@ -177,6 +181,8 @@ def train(self, optimizer, loss_fn):
                                 (not self.model.layers[tmp_output_layer_id].layer_conf.last_hidden_softmax):
                             logits_softmax[tmp_output_layer_id] = nn.functional.softmax(
                                 logits[tmp_output_layer_id], dim=-1)
+                        elif isinstance(get_layer_class(self.model, tmp_output_layer_id), CRF):
+                            pass
                         else:
                             logits_softmax[tmp_output_layer_id] = logits[tmp_output_layer_id]
 
@@ -194,8 +200,9 @@ def train(self, optimizer, loss_fn):
                         prediction_scores_all = None
                 elif ProblemTypes[self.problem.problem_type] == ProblemTypes.sequence_tagging:
                     logits = list(logits.values())[0]
-                    logits_softmax = list(logits_softmax.values())[0]
-                    assert len(logits_softmax.shape) == 3, 'The dimension of your output is %s, but we need [batch_size*GPUs, sequence length, representation dim]' % (str(list(logits_softmax.shape)), )
+                    if not isinstance(get_layer_class(self.model, tmp_output_layer_id), CRF):
+                        logits_softmax = list(logits_softmax.values())[0]
+                        assert len(logits_softmax.shape) == 3, 'The dimension of your output is %s, but we need [batch_size*GPUs, sequence length, representation dim]' % (str(list(logits_softmax.shape)), )
                     prediction_scores = None
                     prediction_scores_all = None
                 elif ProblemTypes[self.problem.problem_type] == ProblemTypes.regression:
@@ -214,16 +221,25 @@ def train(self, optimizer, loss_fn):
                 if ProblemTypes[self.problem.problem_type] == ProblemTypes.sequence_tagging:
                     # Transform output shapes for metric evaluation
                     # for seq_tag_f1 metric
-                    prediction_indices = logits_softmax.data.max(2)[1].cpu().numpy()    # [batch_size, seq_len]
-                    streaming_recoder.record_one_row([self.problem.decode(prediction_indices, length_batches[i]['target'][self.conf.answer_column_name[0]].numpy()),
-                                                      prediction_scores, self.problem.decode(target_batches[i][self.conf.answer_column_name[0]],
-                                                                                             length_batches[i]['target'][self.conf.answer_column_name[0]].numpy())], keep_dim=False)
+                    if isinstance(get_layer_class(self.model, tmp_output_layer_id), CRF):
+                        forward_score, scores, masks, tag_seq, transitions, layer_conf = logits
+                        prediction_indices = tag_seq.cpu().numpy()
+                        streaming_recoder.record_one_row([self.problem.decode(prediction_indices, length_batches[i]['target'][self.conf.answer_column_name[0]].numpy()),
+                                                          prediction_scores, self.problem.decode(
+                                target_batches[i][self.conf.answer_column_name[0]],
+                                length_batches[i]['target'][self.conf.answer_column_name[0]].numpy())], keep_dim=False)
 
-                    # pytorch's CrossEntropyLoss only support this
-                    logits_flat[self.conf.output_layer_id[0]] = logits.view(-1, logits.size(2))    # [batch_size * seq_len, # of tags]
-                    #target_batches[i] = target_batches[i].view(-1)                      # [batch_size * seq_len]
-                    # [batch_size * seq_len]
-                    target_batches[i][self.conf.answer_column_name[0]] = target_batches[i][self.conf.answer_column_name[0]].reshape(-1)
+                    else:
+                        prediction_indices = logits_softmax.data.max(2)[1].cpu().numpy()    # [batch_size, seq_len]
+                        # pytorch's CrossEntropyLoss only support this
+                        logits_flat[self.conf.output_layer_id[0]] = logits.view(-1, logits.size(2))  # [batch_size * seq_len, # of tags]
+                        streaming_recoder.record_one_row([self.problem.decode(prediction_indices, length_batches[i]['target'][self.conf.answer_column_name[0]].numpy()),
+                                                          prediction_scores, self.problem.decode(
+                                target_batches[i][self.conf.answer_column_name[0]],
+                                length_batches[i]['target'][self.conf.answer_column_name[0]].numpy())], keep_dim=False)
+
+                        target_batches[i][self.conf.answer_column_name[0]] = target_batches[i][
+                            self.conf.answer_column_name[0]].reshape(-1)
 
                 elif ProblemTypes[self.problem.problem_type] == ProblemTypes.classification:
                     prediction_indices = logits_softmax.detach().max(1)[1].cpu().numpy()
@@ -260,7 +276,10 @@ def train(self, optimizer, loss_fn):
                     for single_target in self.conf.answer_column_name:
                         if isinstance(target_batches[i][single_target], torch.Tensor):
                             target_batches[i][single_target] = transfer_to_gpu(target_batches[i][single_target])
-                loss = loss_fn(logits_flat, target_batches[i])
+                if isinstance(loss_fn.loss_fn[0], CRFLoss):
+                    loss = loss_fn.loss_fn[0](forward_score, scores, masks, list(target_batches[i].values())[0], transitions, layer_conf)
+                else:
+                    loss = loss_fn(logits_flat, target_batches[i])
 
                 all_costs.append(loss.item())
                 optimizer.zero_grad()
@@ -297,7 +316,7 @@ def train(self, optimizer, loss_fn):
 
                     if torch.cuda.device_count() > 1:
                         logging.info("Epoch %d batch idx: %d; lr: %f; since last log, loss=%f; %s" % \
-                            (epoch, i * torch.cuda.device_count(), lr_scheduler.get_lr(), np.mean(all_costs), result))
+                            (epoch, i * torch.cuda.device_count(), lr_scheduler.get_lr(), np.sum(all_costs), result))
                     else:
                         logging.info("Epoch %d batch idx: %d; lr: %f; since last log, loss=%f; %s" % \
                             (epoch, i, lr_scheduler.get_lr(), np.mean(all_costs), result))
@@ -473,18 +492,29 @@ def evaluate(self, data, length, target, input_types, evaluator,
                 logits_flat = {}
                 if ProblemTypes[self.problem.problem_type] == ProblemTypes.sequence_tagging:
                     logits = list(logits.values())[0]
-                    logits_softmax = list(logits_softmax.values())[0]
-                    # Transform output shapes for metric evaluation
-                    # for seq_tag_f1 metric
-                    prediction_indices = logits_softmax.data.max(2)[1].cpu().numpy()  # [batch_size, seq_len]
-                    streaming_recoder.record_one_row(
-                        [self.problem.decode(prediction_indices, length_batches[i]['target'][self.conf.answer_column_name[0]].numpy()), prediction_pos_scores,
-                         self.problem.decode(target_batches[i], length_batches[i]['target'][self.conf.answer_column_name[0]].numpy())], keep_dim=False)
+                    if isinstance(get_layer_class(self.model, tmp_output_layer_id), CRF):
+                        forward_score, scores, masks, tag_seq, transitions, layer_conf = logits
+                        prediction_indices = tag_seq.cpu().numpy()
+                        streaming_recoder.record_one_row(
+                            [self.problem.decode(prediction_indices, length_batches[i]['target'][self.conf.answer_column_name[0]].numpy()),
+                             prediction_pos_scores,
+                             self.problem.decode(target_batches[i], length_batches[i]['target'][self.conf.answer_column_name[0]].numpy())],
+                            keep_dim=False)
+                    else:
+                        logits_softmax = list(logits_softmax.values())[0]
+                        # Transform output shapes for metric evaluation
+                        # for seq_tag_f1 metric
+                        prediction_indices = logits_softmax.data.max(2)[1].cpu().numpy()  # [batch_size, seq_len]
+                        # pytorch's CrossEntropyLoss only support this
+                        logits_flat[self.conf.output_layer_id[0]] = logits.view(-1, logits.size(2))  # [batch_size * seq_len, # of tags]
+                        streaming_recoder.record_one_row(
+                            [self.problem.decode(prediction_indices, length_batches[i]['target'][self.conf.answer_column_name[0]].numpy()),
+                             prediction_pos_scores,
+                             self.problem.decode(target_batches[i], length_batches[i]['target'][self.conf.answer_column_name[0]].numpy())],
+                            keep_dim=False)
 
-                    # pytorch's CrossEntropyLoss only support this
-                    logits_flat[self.conf.output_layer_id[0]] = logits.view(-1, logits.size(2))  # [batch_size * seq_len, # of tags]
-                    #target_batches[i] = target_batches[i].view(-1)  # [batch_size * seq_len]
-                    target_batches[i][self.conf.answer_column_name[0]] = target_batches[i][self.conf.answer_column_name[0]].reshape(-1)  # [batch_size * seq_len]
+                        target_batches[i][self.conf.answer_column_name[0]] = target_batches[i][
+                            self.conf.answer_column_name[0]].reshape(-1)  # [batch_size * seq_len]
 
                     if to_predict:
                         prediction_batch = self.problem.decode(prediction_indices, length_batches[i][key_random].numpy())
@@ -547,8 +577,13 @@ def evaluate(self, data, length, target, input_types, evaluator,
                         predict_stream_recoder.record_one_row([prediction])
 
                 if to_predict:
-                    logits_len = len(list(logits.values())[0]) \
-                        if ProblemTypes[self.problem.problem_type] == ProblemTypes.mrc else len(logits)
+                    if ProblemTypes[self.problem.problem_type] == ProblemTypes.mrc:
+                        logits_len = len(list(logits.values())[0])
+                    elif ProblemTypes[self.problem.problem_type] == ProblemTypes.sequence_tagging and isinstance(get_layer_class(self.model, tmp_output_layer_id), CRF):
+                        # for a sequence_tagging task with a CRF output layer, logits is a tuple whose index 3 is tag_seq [batch_size*seq_len]; its size(0) is the number of samples
+                        logits_len = logits[3].size(0)
+                    else:
+                        logits_len = len(logits)
                     for sample_idx in range(logits_len):
                         while True:
                             sample = fin.readline().rstrip()
@@ -564,7 +599,10 @@ def evaluate(self, data, length, target, input_types, evaluator,
                     for single_target in self.conf.answer_column_name:
                         if isinstance(target_batches[i][single_target], torch.Tensor):
                             target_batches[i][single_target] = transfer_to_gpu(target_batches[i][single_target])
-                loss = loss_fn(logits_flat, target_batches[i])
+                if isinstance(loss_fn.loss_fn[0], CRFLoss):
+                    loss = loss_fn.loss_fn[0](forward_score, scores, masks, list(target_batches[i].values())[0], transitions, layer_conf)
+                else:
+                    loss = loss_fn(logits_flat, target_batches[i])
                 loss_recoder.record('loss', loss.item())
 
                 del loss, logits, logits_softmax, logits_flat
@@ -686,9 +724,14 @@ def predict(self, predict_data_path, output_path, file_columns, predict_fields=[
 
                     if ProblemTypes[self.problem.problem_type] == ProblemTypes.sequence_tagging:
                         logits = list(logits.values())[0]
-                        logits_softmax = list(logits_softmax.values())[0]
-                        # Transform output shapes for metric evaluation
-                        prediction_indices = logits_softmax.data.max(2)[1].cpu().numpy()  # [batch_size, seq_len]
+                        if isinstance(get_layer_class(self.model, tmp_output_layer_id), CRF):
+                            forward_score, scores, masks, tag_seq, transitions, layer_conf = logits
+                            prediction_indices = tag_seq.cpu().numpy()
+                        else:
+                            logits_softmax = list(logits_softmax.values())[0]
+                            # Transform output shapes for metric evaluation
+                            # for seq_tag_f1 metric
+                            prediction_indices = logits_softmax.data.max(2)[1].cpu().numpy()  # [batch_size, seq_len]
                         prediction_batch = self.problem.decode(prediction_indices, length_batches[i][key_random].numpy())
                         for prediction_sample in prediction_batch:
                             streaming_recoder.record('prediction', " ".join(prediction_sample))
 
@@ -18,7 +18,7 @@
 
 def get_conf(layer_id, layer_name, input_layer_ids, all_layer_configs, model_input_ids, use_gpu,
         conf_dict=None, shared_conf=None, succeed_embedding_flag=False, output_layer_flag=False,
-        target_num=None, fixed_lengths=None):
+        target_num=None, fixed_lengths=None, target_dict=None):
     """ get layer configuration
 
     Args
@@ -51,14 +51,24 @@ def get_conf(layer_id, layer_name, input_layer_ids, all_layer_configs, model_inp
 
             # for classification tasks, we usually add a Linear layer to project the output to dimension of number of classes. If we don't know the #classes, we can use '-1' instead and we would calculate the number of classes from the corpus.
             if layer_name == 'Linear':
-                if isinstance(conf_dict['hidden_dim'], list) and conf_dict['hidden_dim'][-1] == -1:
-                    assert output_layer_flag is True, "Only in the last layer, hidden_dim == -1 is allowed!"
-                    assert target_num is not None, "Number of targets should be given!"
-                    conf_dict['hidden_dim'][-1] = target_num
+                if isinstance(conf_dict['hidden_dim'], list):
+                    if conf_dict['hidden_dim'][-1] == -1:
+                        assert output_layer_flag is True, "Only in the last layer, hidden_dim == -1 is allowed!"
+                        assert target_num is not None, "Number of targets should be given!"
+                        conf_dict['hidden_dim'][-1] = target_num
+                    elif conf_dict['hidden_dim'][-1] == '#target#':
+                        logging.info('#target# position will be replace by target num: %d' % target_num)
+                        conf_dict['hidden_dim'][-1] = target_num
                 elif isinstance(conf_dict['hidden_dim'], int) and conf_dict['hidden_dim'] == -1:
                     assert output_layer_flag is True, "Only in the last layer, hidden_dim == -1 is allowed!"
                     assert target_num is not None, "Number of targets should be given!"
                     conf_dict['hidden_dim'] = target_num
+                elif isinstance(conf_dict['hidden_dim'], str) and conf_dict['hidden_dim'] == '#target#':
+                    logging.info('#target# position will be replace by target num: %d' % target_num)
+                    conf_dict['hidden_dim'] = target_num
+            # the CRF layer additionally needs the target dictionary in its configuration
+            if layer_name == 'CRF':
+                conf_dict['target_dict'] = target_dict
 
             conf = eval(layer_name + "Conf")(**conf_dict)
         except NameError as e:
@@ -104,6 +114,8 @@ def get_conf(layer_id, layer_name, input_layer_ids, all_layer_configs, model_inp
     # inference and varification inside the layer
     conf.inference()        # update some attributes which relies on input dimension or something else
     conf.verify()           # verify if the configuration is legal
+    former_conf = None if len(all_layer_configs) == 0 else list(all_layer_configs.values())[-1]
+    conf.verify_former_block(former_conf)  # check if has special attribute rely on former layer
 
     logging.debug('Layer id: %s; name: %s; input_dims: %s; input_ranks: %s; output_dim: %s; output_rank: %s' % (layer_id, layer_name, conf.input_dims if layer_id != 'embedding' else 'None', conf.input_ranks, conf.output_dim, conf.output_rank))
 
@@ -211,7 +223,7 @@ def __init__(self, conf, problem, vocab_info, use_gpu):
                 all_layer_configs[EMBED_LAYER_ID] = get_conf(EMBED_LAYER_ID, layer_arch['layer'],
                     None, all_layer_configs, inputs, self.use_gpu, conf_dict={'conf': emb_conf},
                     shared_conf=None, succeed_embedding_flag=False, output_layer_flag=output_layer_flag,
-                    target_num=target_num, fixed_lengths=fixed_lengths_corrected)
+                    target_num=target_num, fixed_lengths=fixed_lengths_corrected, target_dict=problem.output_dict)
                 self.add_layer(EMBED_LAYER_ID, get_layer(layer_arch['layer'], all_layer_configs[EMBED_LAYER_ID]))
             else:
                 if layer_arch['layer'] in self.layers and not 'conf' in layer_arch:
@@ -230,7 +242,7 @@ def __init__(self, conf, problem, vocab_info, use_gpu):
                     layer_arch['inputs'], all_layer_configs, inputs, self.use_gpu, conf_dict=conf_dict,
                     shared_conf=shared_conf, succeed_embedding_flag=succeed_embedding_flag,
                     output_layer_flag=output_layer_flag, target_num=target_num,
-                    fixed_lengths=fixed_lengths_corrected)
+                    fixed_lengths=fixed_lengths_corrected, target_dict=problem.output_dict)
 
                 if layer_arch['layer'] in self.layers and not 'conf' in layer_arch:
                     self.add_layer(layer_arch['layer_id'], self.layers[layer_arch['layer']])
 
@@ -309,6 +309,9 @@ def load_from_file(self, conf_path):
         self.fixed_lengths = self.get_item(['training_params', 'fixed_lengths'], default=None, use_default=True)
         if self.fixed_lengths:
             self.max_lengths = None
+        if ProblemTypes[self.problem_type] == ProblemTypes.sequence_tagging:
+            self.fixed_lengths = None
+            self.max_lengths = None
 
         if torch.cuda.device_count() > 1:
             self.batch_size_total = torch.cuda.device_count() * self.training_params['batch_size']
 
@@ -154,6 +154,11 @@ def verify(self):
         # To check if deepcopy is applied
         assert id(self.output_dim) != id(self.input_dims[0]), 'Please use copy.deepcopy to copy the input_dim to output_dim'
 
+    def verify_former_block(self, former_conf):
+        """Check attributes that rely on the former layer's conf; this base version always returns True.
+
+        """
+        return True
 
     def add_attr_type_assertion(self, attr, specified_type):
         """ check if the types of attributes are legal