This repository was archived by the owner on Nov 8, 2022. It is now read-only.

Commit 060f0cd

shira-g authored and Peter Izsak committed
Pseudo Labeling Distillation for token classification (#111)
* add pseudo-labeling procedure
1 parent 4df6568 commit 060f0cd


22 files changed: 454 additions (+) and 87 deletions (−)

docs-source/source/transformers_distillation.rst

Lines changed: 15 additions & 1 deletion

@@ -39,7 +39,7 @@ One approach is similar to the method in Hinton 2015 [#]_. The loss function is
 modified to include a measure of distributions divergence, which can be measured
 using KL divergence or MSE between the logits of the student and the teacher network.
 
-:math:`loss = w_s \cdot loss_{student} + w_d \cdot KL(logits_{student} / T || logits_{teacher} / T)`
+:math:`loss = w_s \cdot loss_{student} + w_d \cdot KL(logits_{student} / T || logits_{teacher} / T)`
 
 where *T* is a value representing temperature for softening the logits prior to
 applying softmax. `loss_{student}` is the original loss of the student network
@@ -73,5 +73,19 @@ Usage:
 .. note::
     More models supporting distillation will be added in next releases
 
+Pseudo Labeling
+================
+
+This method can be used to produce pseudo-labels when training the student on unlabeled examples.
+The pseudo-label is produced by applying arg max to the logits of the teacher model, resulting in the following loss:
+
+.. math::
+
+    loss = \begin{cases} CE(\hat{y}, y) & \text{labeled example} \\ CE(\hat{y}, \hat{y}_t) & \text{unlabeled example} \end{cases}
+
+where CE is the Cross Entropy loss, :math:`\hat{y}` is the entity label class predicted by the student model and
+:math:`\hat{y}_t` is the label predicted by the teacher model.
+
 
 .. [#] Distilling the Knowledge in a Neural Network: Geoffrey Hinton, Oriol Vinyals, Jeff Dean, https://arxiv.org/abs/1503.02531
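To make the piecewise loss in the new docs section concrete, here is a minimal PyTorch sketch of a pseudo-label loss for token classification. The function name pseudo_label_loss, the tensor shapes, and the ignore_index=0 convention are illustrative assumptions; the library's actual implementation is the train_pseudo method added to nlp_architect/models/tagging.py further down in this commit.

# Minimal sketch (not the library API): CE against gold labels when they exist,
# otherwise CE against the teacher's per-token argmax pseudo-labels.
import torch
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss

def pseudo_label_loss(student_logits, teacher_logits, gold_labels=None, ignore_index=0):
    """CE(y_hat, y) for a labeled batch, CE(y_hat, y_hat_t) for an unlabeled one."""
    loss_fn = CrossEntropyLoss(ignore_index=ignore_index)
    num_labels = student_logits.size(-1)
    if gold_labels is None:
        # Unlabeled example: the pseudo-label is the teacher's argmax per token.
        gold_labels = torch.argmax(F.log_softmax(teacher_logits, dim=2), dim=2)
    return loss_fn(student_logits.view(-1, num_labels), gold_labels.view(-1))

# Example with random tensors: (batch, seq_len, num_labels) logits, (batch, seq_len) labels.
student_logits = torch.randn(2, 5, 9)
teacher_logits = torch.randn(2, 5, 9)
loss_unlabeled = pseudo_label_loss(student_logits, teacher_logits)            # teacher argmax
loss_labeled = pseudo_label_loss(student_logits, teacher_logits,
                                 gold_labels=torch.randint(1, 9, (2, 5)))     # gold labels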

nlp_architect/common/config.py

Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@
 # limitations under the License.
 # ******************************************************************************
 """
-Generic config object:
+Generic config object:
 load config from json file
 load config from ordinary python dict
 export config as dictionaty or json string

nlp_architect/data/sequential_tagging.py

Lines changed: 6 additions & 6 deletions

@@ -326,14 +326,14 @@ def _read_examples(self, data_dir, file_name, set_name):
         return self._create_examples(read_column_tagged_file(os.path.join(data_dir, file_name),
                                                               tag_col=self.tag_col), set_name)
 
-    def get_train_examples(self):
-        return self._read_examples(self.data_dir, "train.txt", "train")
+    def get_train_examples(self, filename="train.txt"):
+        return self._read_examples(self.data_dir, filename, "train")
 
-    def get_dev_examples(self):
-        return self._read_examples(self.data_dir, "dev.txt", "dev")
+    def get_dev_examples(self, filename="dev.txt"):
+        return self._read_examples(self.data_dir, filename, "dev")
 
-    def get_test_examples(self):
-        return self._read_examples(self.data_dir, "test.txt", "test")
+    def get_test_examples(self, filename="test.txt"):
+        return self._read_examples(self.data_dir, filename, "test")
 
     # pylint: disable=arguments-differ
     def get_labels(self):
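The new filename argument lets the processor read an arbitrary split file, for example the labeled/unlabeled splits used for pseudo-labeling, instead of the hard-coded train.txt. A hedged usage sketch follows; the TokenClsProcessor class name, its constructor arguments, and the file names are assumptions for illustration, only the get_*_examples(filename=...) signatures come from the diff above.

# Hedged usage sketch -- class name and constructor are assumptions.
from nlp_architect.data.sequential_tagging import TokenClsProcessor

processor = TokenClsProcessor("/path/to/tagged_data")                     # hypothetical data dir
labeled = processor.get_train_examples(filename="train_labeled.txt")      # custom labeled split
unlabeled = processor.get_train_examples(filename="train_unlabeled.txt")  # custom unlabeled split
dev = processor.get_dev_examples()                                        # still defaults to dev.txt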

nlp_architect/data/utils.py

Lines changed: 26 additions & 9 deletions

@@ -35,15 +35,15 @@ def __init__(self, guid: str, text, label=None):
 class DataProcessor(object):
     """Base class for data converters for sequence/token classification data sets."""
 
-    def get_train_examples(self, data_dir):
+    def get_train_examples(self):
         """Gets a collection of `InputExample`s for the train set."""
         raise NotImplementedError()
 
-    def get_dev_examples(self, data_dir):
+    def get_dev_examples(self):
         """Gets a collection of `InputExample`s for the dev set."""
         raise NotImplementedError()
 
-    def get_test_examples(self, data_dir):
+    def get_test_examples(self):
         """Gets a collection of `InputExample`s for the test set."""
         raise NotImplementedError()
 
@@ -66,12 +66,6 @@ def __init__(self, name: str, processor: DataProcessor, data_dir: str, task_type
         self.data_dir = data_dir
         self.task_type = task_type
 
-    def get_split_train_examples(self, labeled: int, unlabeled: int):
-        """split the train set into 2 sub sets (given by input size) to be
-        used as labelled and unlabeled sets for semi-supervision tasks
-        """
-        return self.processor.get_split_train_examples(self.data_dir, labeled, unlabeled)
-
     def get_train_examples(self):
         return self.processor.get_train_examples(self.data_dir)
 
@@ -154,3 +148,26 @@ def sample_label_unlabeled(samples: List[InputExample], no_labeled: int, no_unla
     label_samples = [samples[i] for i in labeled_indices]
     unlabel_samples = [samples[i] for i in unlabeled_indices]
     return label_samples, unlabel_samples
+
+
+def split_column_dataset(
+        first_count: int, second_count: int, out_folder, dataset, first_filename, second_filename, tag_col=-1):
+    """
+    Splits a single column tagged dataset into two files according to the amount of examples
+    requested to be included in each file.
+    first_count (int) : the number of examples to include in the first split file
+    second_count (int) : the number of examples to include in the second split file
+    out_folder (str) : the folder in which the result files will be stored
+    dataset (str) : the path to the original data file
+    first_filename (str) : the name of the first split file
+    second_filename (str) : the name of the second split file
+    tag_col (int) : the index of the tag column
+    """
+    lines = read_column_tagged_file(dataset, tag_col=tag_col)
+    num_of_examples = len(lines)
+    assert first_count + second_count <= num_of_examples and first_count > 0 and second_count > 0
+    selected_lines = random.sample(lines, first_count + second_count)
+    first_data = selected_lines[:first_count]
+    second_data = selected_lines[first_count:]
+    write_column_tagged_file(out_folder + os.sep + first_filename, first_data)
+    write_column_tagged_file(out_folder + os.sep + second_filename, second_data)
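A hedged usage sketch of the split_column_dataset helper added above, which carves a column-tagged training file into a small labeled split and a larger split to be treated as unlabeled. The paths, counts, and file names below are illustrative only; the keyword names come from the signature in the diff.

from nlp_architect.data.utils import split_column_dataset

split_column_dataset(
    first_count=500,                   # examples written to the first (labeled) file
    second_count=5000,                 # examples written to the second (unlabeled) file
    out_folder="/path/to/out",         # hypothetical output directory
    dataset="/path/to/train.txt",      # original column-tagged file
    first_filename="train_labeled.txt",
    second_filename="train_unlabeled.txt",
    tag_col=-1)                        # tag is in the last column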

nlp_architect/models/absa/train/acquire_terms.py

Lines changed: 1 addition & 1 deletion

@@ -21,7 +21,7 @@
 from tqdm import tqdm
 
 from nlp_architect.models.absa import TRAIN_LEXICONS, LEXICONS_OUT
-from nlp_architect.models.absa import TRAIN_OUT, GENERIC_OP_LEX
+from nlp_architect.models.absa import GENERIC_OP_LEX
 from nlp_architect.models.absa.inference.data_types import Polarity
 from nlp_architect.models.absa.train.data_types import AspectTerm, \
     DepRelation, DepRelationTerm, LoadOpinionStopLists, LoadAspectStopLists, OpinionTerm, \

nlp_architect/models/matchlstm_ansptr.py

Lines changed: 4 additions & 2 deletions

@@ -526,12 +526,14 @@ def inference_mode(self, session, valid, vocab_tuple, num_examples, dropout=1.0,
             # Print Paragraph
             print("\n")
             print("Paragraph Number AA:", idx)
-            test_paragraph = [vocab_forward[ele].replace(" ", "") for ele in valid[idx][0] if ele != 0]
+            test_paragraph = [vocab_forward[ele].replace(
+                " ", "") for ele in valid[idx][0] if ele != 0]
             para_string = " ".join(map(str, test_paragraph))
             print(para_string)
 
             # Print corresponding Question
-            test_question = [vocab_forward[ele].replace(" ", "") for ele in valid[idx][1] if ele != 0]
+            test_question = [vocab_forward[ele].replace(
+                " ", "") for ele in valid[idx][1] if ele != 0]
             ques_string = " ".join(map(str, test_question))
             print("Question:", ques_string)
             question_ids = valid[idx][1]

nlp_architect/models/tagging.py

Lines changed: 143 additions & 3 deletions

@@ -221,11 +221,12 @@ def train(self, train_data_set: DataLoader,
         for _ in epoch_it:
             step_it = tqdm(train_data_set, desc="Train iteration")
             avg_loss = 0
-            for s_idx, batch in enumerate(step_it):
+            for step, batch in enumerate(step_it):
                 self.model.train()
                 if distiller:
                     batch, t_batch = batch[:2]
                     t_batch = tuple(t.to(self.device) for t in t_batch)
+                    t_logits = distiller.get_teacher_logits(t_batch)
                 batch = tuple(t.to(self.device) for t in batch)
                 inputs = self.batch_mapper(batch)
                 logits = self.model(**inputs)
@@ -239,7 +240,6 @@ def train(self, train_data_set: DataLoader,
 
                 # add distillation loss if activated
                 if distiller:
-                    t_logits = distiller.get_teacher_logits(t_batch)
                     loss = distiller.distill_loss(loss, logits, t_logits)
 
                 loss.backward()
@@ -251,17 +251,157 @@ def train(self, train_data_set: DataLoader,
                 global_step += 1
                 avg_loss += loss.item()
                 if global_step % logging_steps == 0:
-                    logger.info(" global_step = %s, average loss = %s", global_step, avg_loss / s_idx)
+                    if step != 0:
+                        logger.info(
+                            " global_step = %s, average loss = %s", global_step, avg_loss / step)
                     self._get_eval(dev_data_set, "dev")
                     self._get_eval(test_data_set, "test")
                     if save_path is not None and global_step % save_steps == 0:
                         self.save_model(save_path)
 
+    def train_pseudo(
+            self, labeled_data_set: DataLoader,
+            unlabeled_data_set: DataLoader,
+            distiller: TeacherStudentDistill,
+            dev_data_set: DataLoader = None,
+            test_data_set: DataLoader = None,
+            batch_size_l: int = 8,
+            batch_size_ul: int = 8,
+            epochs: int = 100,
+            optimizer=None,
+            max_grad_norm: float = 5.0,
+            logging_steps: int = 50,
+            save_steps: int = 100,
+            save_path: str = None,
+            save_best: bool = False):
+        """
+        Train a tagging model with pseudo-labeling distillation
+
+        Args:
+            labeled_data_set (DataLoader): labeled train examples dataloader. Examples should
+                contain a tuple of student/teacher data examples.
+            unlabeled_data_set (DataLoader): unlabeled train examples dataloader. Examples should
+                contain a tuple of student/teacher data examples.
+            distiller (TeacherStudentDistill): KD model for training the model using
+                a teacher model.
+            dev_data_set (DataLoader, optional): dev examples dataloader. Defaults to None.
+            test_data_set (DataLoader, optional): test examples dataloader. Defaults to None.
+            batch_size_l (int, optional): batch size for the labeled dataset. Defaults to 8.
+            batch_size_ul (int, optional): batch size for the unlabeled dataset. Defaults to 8.
+            epochs (int, optional): num of epochs to train. Defaults to 100.
+            optimizer (fn, optional): optimizer function. Defaults to default model optimizer.
+            max_grad_norm (float, optional): max gradient norm. Defaults to 5.0.
+            logging_steps (int, optional): number of steps between logging. Defaults to 50.
+            save_steps (int, optional): number of steps between model saves. Defaults to 100.
+            save_path (str, optional): model output path. Defaults to None.
+            save_best (bool, optional): whether to save the model when the dev-set result improves.
+        """
+        if optimizer is None:
+            optimizer = self.get_optimizer()
+        train_batch_size_l = batch_size_l * max(1, self.n_gpus)
+        train_batch_size_ul = batch_size_ul * max(1, self.n_gpus)
+        logger.info("***** Running training *****")
+        logger.info("  Num labeled examples = %d", len(labeled_data_set.dataset))
+        logger.info("  Num unlabeled examples = %d", len(unlabeled_data_set.dataset))
+        logger.info("  Instantaneous labeled batch size per GPU/CPU = %d",
+                    batch_size_l)
+        logger.info("  Instantaneous unlabeled batch size per GPU/CPU = %d",
+                    batch_size_ul)
+        logger.info("  Total batch size labeled= %d", train_batch_size_l)
+        logger.info("  Total batch size unlabeled= %d", train_batch_size_ul)
+        global_step = 0
+        self.model.zero_grad()
+        avg_loss = 0
+        iter_l = iter(labeled_data_set)
+        iter_ul = iter(unlabeled_data_set)
+        epoch_l = 0
+        epoch_ul = 0
+        s_idx = -1
+        best_dev = 0
+        best_test = 0
+        while(True):
+            logger.info("labeled epoch=%d, unlabeled epoch=%d", epoch_l, epoch_ul)
+            loss_labeled = 0
+            loss_unlabeled = 0
+            try:
+                batch_l = next(iter_l)
+                s_idx += 1
+            except StopIteration:
+                iter_l = iter(labeled_data_set)
+                epoch_l += 1
+                batch_l = next(iter_l)
+                s_idx = 0
+                avg_loss = 0
+            try:
+                batch_ul = next(iter_ul)
+            except StopIteration:
+                iter_ul = iter(unlabeled_data_set)
+                epoch_ul += 1
+                batch_ul = next(iter_ul)
+            if epoch_ul > epochs:
+                logger.info("Done")
+                return
+            self.model.train()
+            batch_l, t_batch_l = batch_l[:2]
+            batch_ul, t_batch_ul = batch_ul[:2]
+            t_batch_l = tuple(t.to(self.device) for t in t_batch_l)
+            t_batch_ul = tuple(t.to(self.device) for t in t_batch_ul)
+            t_logits = distiller.get_teacher_logits(t_batch_l)
+            t_logits_ul = distiller.get_teacher_logits(t_batch_ul)
+            batch_l = tuple(t.to(self.device) for t in batch_l)
+            batch_ul = tuple(t.to(self.device) for t in batch_ul)
+            inputs = self.batch_mapper(batch_l)
+            inputs_ul = self.batch_mapper(batch_ul)
+            logits = self.model(**inputs)
+            logits_ul = self.model(**inputs_ul)
+            t_labels = torch.argmax(F.log_softmax(t_logits_ul, dim=2), dim=2)
+            if self.use_crf:
+                loss_labeled = -1.0 * self.crf(
+                    logits, inputs['labels'], mask=inputs['mask'] != 0.0)
+                loss_unlabeled = -1.0 * self.crf(
+                    logits_ul, t_labels, mask=inputs_ul['mask'] != 0.0)
+            else:
+                loss_fn = CrossEntropyLoss(ignore_index=0)
+                loss_labeled = loss_fn(logits.view(-1, self.num_labels), inputs['labels'].view(-1))
+                loss_unlabeled = loss_fn(logits_ul.view(-1, self.num_labels), t_labels.view(-1))
+
+            if self.n_gpus > 1:
+                loss_labeled = loss_labeled.mean()
+                loss_unlabeled = loss_unlabeled.mean()
+
+            # add distillation loss
+            loss_labeled = distiller.distill_loss(loss_labeled, logits, t_logits)
+            loss_unlabeled = distiller.distill_loss(loss_unlabeled, logits_ul, t_logits_ul)
+
+            # sum labeled and unlabeled losses
+            loss = loss_labeled + loss_unlabeled
+            loss.backward()
+            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm)
+            optimizer.step()
+            # self.model.zero_grad()
+            optimizer.zero_grad()
+            global_step += 1
+            avg_loss += loss.item()
+            if global_step % logging_steps == 0:
+                if s_idx != 0:
+                    logger.info(
+                        " global_step = %s, average loss = %s", global_step, avg_loss / s_idx)
+                dev = self._get_eval(dev_data_set, "dev")
+                test = self._get_eval(test_data_set, "test")
+                if dev > best_dev:
+                    best_dev = dev
+                    best_test = test
+                    if save_path is not None and save_best:
+                        self.save_model(save_path)
+                logger.info("Best result: dev= %s, test= %s", str(best_dev), str(best_test))
+                if save_path is not None and global_step % save_steps == 0:
+                    self.save_model(save_path)
+
     def _get_eval(self, ds, set_name):
         if ds is not None:
             logits, out_label_ids = self.evaluate(ds)
             res = self.evaluate_predictions(logits, out_label_ids)
             logger.info(" {} set F1 = {}".format(set_name, res['f1']))
+            return res['f1']
+        return None
 
     def to(self, device='cpu', n_gpus=0):
         """

nlp_architect/models/transformers/base_model.py

Lines changed: 2 additions & 1 deletion

@@ -199,7 +199,8 @@ def load_model(cls, model_path: str, model_type: str, *args, **kwargs):
             raise FileNotFoundError
         with io.open(model_path + os.sep + 'labels.txt') as fp:
             labels = [l.strip() for l in fp.readlines()]
-        return cls(model_type=model_type, model_name_or_path=model_path, labels=labels, *args, **kwargs)
+        return cls(
+            model_type=model_type, model_name_or_path=model_path, labels=labels, *args, **kwargs)
 
     @staticmethod
     def get_train_steps_epochs(max_steps: int,

nlp_architect/models/transformers/quantized_bert.py

Lines changed: 8 additions & 4 deletions

@@ -213,7 +213,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, from_8bit=False,
 
         # Instantiate model.
         model = cls(config)
-        # Set model to initialize variables to be loaded from quantized checkpoint which are None by Default
+        # Set model to initialize variables to be loaded from quantized
+        # checkpoint which are None by Default
         model.eval()
         # Get state dict of model
         state_dict = torch.load(model_file, map_location='cpu')
@@ -232,17 +233,20 @@
         def load(module, prefix=''):
             local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
             module._load_from_state_dict(
-                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
+                state_dict, prefix, local_metadata, True, missing_keys,
+                unexpected_keys, error_msgs)
             for name, child in module._modules.items():
                 if child is not None:
                     load(child, prefix + name + '.')
 
         # Make sure we are able to load base models as well as derived models (with heads)
         start_prefix = ''
         model_to_load = model
-        if not hasattr(model, cls.base_model_prefix) and any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
+        if not hasattr(model, cls.base_model_prefix) and any(
+                s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
             start_prefix = cls.base_model_prefix + '.'
-        if hasattr(model, cls.base_model_prefix) and not any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
+        if hasattr(model, cls.base_model_prefix) and not any(
+                s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
             model_to_load = getattr(model, cls.base_model_prefix)
 
         load(model_to_load, prefix=start_prefix)
