
Commit 340dcd7

Added VAT loss.
1 parent 2e8eff8 commit 340dcd7

File tree

4 files changed: +190 -3 lines changed


neural_structured_learning/research/gam/experiments/run_train_gam_graph.py

Lines changed: 8 additions & 0 deletions

@@ -220,6 +220,12 @@
     'num_pairs_reg', 128,
     'Number of pairs of nodes to use in the agreement loss term of the '
     'classification model.')
+flags.DEFINE_float(
+    'reg_weight_vat', 0.0,
+    'Regularization weight for the virtual adversarial training (VAT) loss.')
+flags.DEFINE_bool(
+    'use_ent_min', False,
+    'A boolean specifying whether to add entropy minimization to VAT.')
 flags.DEFINE_string(
     'aggregation_agr_inputs', 'dist',
     'Operation to apply on the pair of nodes in the agreement model. '
@@ -407,6 +413,8 @@ def main(argv):
       reg_weight_lu=FLAGS.reg_weight_lu,
       reg_weight_uu=FLAGS.reg_weight_uu,
       num_pairs_reg=FLAGS.num_pairs_reg,
+      reg_weight_vat=FLAGS.reg_weight_vat,
+      use_ent_min=FLAGS.use_ent_min,
       penalize_neg_agr=FLAGS.penalize_neg_agr,
       use_l2_cls=FLAGS.use_l2_cls,
       first_iter_original=FLAGS.first_iter_original,
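
For orientation, a minimal standalone sketch (not part of the commit) of how the two new absl flags are parsed and read; the argv list below is illustrative only.

# Illustrative only: mirrors the flag definitions added above so the intended
# command-line usage (e.g. --reg_weight_vat=0.5 --use_ent_min=true) is clear.
from absl import flags

flags.DEFINE_float(
    'reg_weight_vat', 0.0,
    'Regularization weight for the virtual adversarial training (VAT) loss.')
flags.DEFINE_bool(
    'use_ent_min', False,
    'A boolean specifying whether to add entropy minimization to VAT.')

FLAGS = flags.FLAGS
FLAGS(['run_train_gam_graph', '--reg_weight_vat=0.5', '--use_ent_min=true'])
print(FLAGS.reg_weight_vat, FLAGS.use_ent_min)  # -> 0.5 True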

neural_structured_learning/research/gam/trainer/adversarial.py

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
+import tensorflow as tf
+
+epsilon = 5
+num_power_iterations = 1
+xi = 1e-6
+scale_r = False
+
+
+def kl_divergence_with_logit(q_logit, p_logit):
+  q = tf.nn.softmax(q_logit)
+  qlogq = -tf.nn.softmax_cross_entropy_with_logits_v2(
+      labels=q, logits=q_logit)
+  qlogp = -tf.nn.softmax_cross_entropy_with_logits_v2(
+      labels=q, logits=p_logit)
+  return qlogq - qlogp
+
+
+def get_normalized_vector(d):
+  d /= (1e-12 + tf.reduce_max(tf.abs(d), keep_dims=True))
+  d /= tf.sqrt(1e-6 + tf.reduce_sum(tf.pow(d, 2.0), keep_dims=True))
+  return d
+
+
+def get_normalizing_constant(d):
+  c = 1e-12 + tf.reduce_max(tf.abs(d), keep_dims=True)
+  c *= tf.sqrt(1e-6 + tf.reduce_sum(tf.pow(d, 2.0), keep_dims=True))
+  return c
+
+
+def get_loss_vat(inputs, predictions, is_train, model, predictions_var_scope):
+  r_vadv = generate_virtual_adversarial_perturbation(
+      inputs, predictions, model, predictions_var_scope, is_train=is_train)
+  predictions = tf.stop_gradient(predictions)
+  logit_p = predictions
+  new_inputs = tf.add(inputs, r_vadv)
+  with tf.variable_scope(
+      predictions_var_scope, auxiliary_name_scope=False, reuse=True):
+    encoding_m, _, _ = model.get_encoding_and_params(
+        inputs=new_inputs,
+        is_train=is_train,
+        update_batch_stats=False)
+    logit_m, _, _ = model.get_predictions_and_params(
+        encoding=encoding_m,
+        is_train=is_train)
+  loss = kl_divergence_with_logit(logit_p, logit_m)
+  return tf.reduce_mean(loss)
+
+
+def generate_virtual_adversarial_perturbation(
+    inputs, logits, model, predictions_var_scope, is_train=True):
+  """Generates an adversarial perturbation for virtual adversarial training.
+
+  Args:
+    inputs: A batch of input features, where the batch is the first
+      dimension.
+    logits: The logits predicted by a model on the provided inputs.
+    model: The model that generated the logits.
+    predictions_var_scope: Variable scope for obtaining the predictions.
+    is_train: A boolean placeholder specifying if this is a training or
+      testing setting.
+
+  Returns:
+    A Tensor of the same shape as the inputs containing the adversarial
+    perturbation for these inputs.
+  """
+  d = tf.random_normal(shape=tf.shape(inputs))
+
+  for _ in range(num_power_iterations):
+    d = xi * get_normalized_vector(d)
+    logit_p = logits
+    with tf.variable_scope(
+        predictions_var_scope, auxiliary_name_scope=False, reuse=True):
+      encoding_m, _, _ = model.get_encoding_and_params(
+          inputs=d + inputs,
+          is_train=is_train,
+          update_batch_stats=False)
+      logit_m, _, _ = model.get_predictions_and_params(
+          encoding=encoding_m,
+          is_train=is_train)
+    dist = kl_divergence_with_logit(logit_p, logit_m)
+    grad = tf.gradients(dist, [d], aggregation_method=2)[0]
+    d = tf.stop_gradient(grad)
+
+  r_vadv = get_normalized_vector(d)
+  if scale_r:
+    r_vadv *= get_normalizing_constant(inputs)
+  r_vadv *= epsilon
+  return r_vadv
+
+
+def logsoftmax(x):
+  """Computes the log-softmax of the input logits."""
+  xdev = x - tf.reduce_max(x, 1, keep_dims=True)
+  lsm = xdev - tf.log(tf.reduce_sum(tf.exp(xdev), 1, keep_dims=True))
+  return lsm
+
+
+def entropy_y_x(logit):
+  """Entropy minimization term added to the VAT loss (VATENT)."""
+  p = tf.nn.softmax(logit)
+  return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
+      labels=p, logits=logit))
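
For reference, the new module implements the virtual adversarial training (VAT) objective of Miyato et al. (2018), with an optional entropy-minimization term (VATENT). A sketch of the loss in LaTeX; the notation is added here and is not part of the commit.

% Adversarial direction, approximated by num_power_iterations power-iteration
% steps starting from random noise d, then scaled to radius epsilon:
r_{\mathrm{vadv}} = \epsilon \cdot \arg\max_{\lVert r \rVert_2 \le 1}
    \mathrm{KL}\!\left(p(y \mid x; \hat{\theta}) \,\|\, p(y \mid x + r; \hat{\theta})\right)

% VAT loss over a batch U of (typically unlabeled) inputs; \hat{\theta} denotes
% a stop-gradient copy of the current parameters:
\mathcal{L}_{\mathrm{vat}} = \frac{1}{|U|} \sum_{x \in U}
    \mathrm{KL}\!\left(p(y \mid x; \hat{\theta}) \,\|\, p(y \mid x + r_{\mathrm{vadv}}; \theta)\right)

% Optional entropy-minimization term (entropy_y_x above):
\mathcal{L}_{\mathrm{ent}} = -\frac{1}{|U|} \sum_{x \in U} \sum_{y}
    p(y \mid x; \theta) \log p(y \mid x; \theta)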

neural_structured_learning/research/gam/trainer/trainer_classification.py

Lines changed: 69 additions & 3 deletions
@@ -21,6 +23,8 @@
 import logging
 import os
 
+from gam.trainer.adversarial import entropy_y_x
+from gam.trainer.adversarial import get_loss_vat
 from gam.trainer.trainer_base import batch_iterator
 from gam.trainer.trainer_base import Trainer
 
@@ -65,6 +67,11 @@ class TrainerClassification(Trainer):
       model loss.
     iter_cotrain: A Tensorflow variable containing the current cotrain
       iteration.
+    reg_weight_vat: A float representing the weight of the virtual adversarial
+      training (VAT) regularization loss in the classification model loss
+      function.
+    use_ent_min: A boolean specifying whether to use entropy regularization with
+      VAT.
     enable_summaries: Boolean specifying whether to enable variable summaries.
     summary_step: Integer representing the summary step size.
     summary_dir: String representing the path to a directory where to save the
@@ -122,6 +129,8 @@ def __init__(self,
                reg_weight_uu,
                num_pairs_reg,
                iter_cotrain,
+               reg_weight_vat=0.0,
+               use_ent_min=False,
                enable_summaries=False,
                summary_step=1,
                summary_dir=None,
@@ -170,6 +179,8 @@ def __init__(self,
     self.reg_weight_ll = reg_weight_ll
     self.reg_weight_lu = reg_weight_lu
     self.reg_weight_uu = reg_weight_uu
+    self.reg_weight_vat = reg_weight_vat
+    self.use_ent_min = use_ent_min
     self.penalize_neg_agr = penalize_neg_agr
     self.use_l2_classif = use_l2_cls
     self.first_iter_original = first_iter_original
@@ -188,6 +199,8 @@ def __init__(self,
     features_shape = [None] + list(data.features_shape)
     input_features = tf.placeholder(
         tf.float32, shape=features_shape, name='input_features')
+    input_features_unlabeled = tf.placeholder(
+        tf.float32, shape=features_shape, name='input_features_unlabeled')
     input_labels = tf.placeholder(tf.int64, shape=(None,), name='input_labels')
     one_hot_labels = tf.one_hot(
         input_labels, data.num_classes, name='input_labels_one_hot')
@@ -206,6 +219,18 @@ def __init__(self,
     self.variables.update(variables)
     self.reg_params.update(reg_params)
     normalized_predictions = self.model.normalize_predictions(predictions)
+    predictions_var_scope = tf.get_variable_scope()
+
+    # Create predictions on unlabeled data, which is only used for VAT loss.
+    with tf.variable_scope("predictions", reuse=True):
+      encoding_unlabeled, _, _ = self.model.get_encoding_and_params(
+          inputs=input_features_unlabeled,
+          is_train=is_train,
+          update_batch_stats=False)
+      predictions_unlabeled, _, _ = (
+          self.model.get_predictions_and_params(
+              encoding=encoding_unlabeled,
+              is_train=is_train))
 
     # Create a variable for weight decay that may be updated.
     weight_decay_var, weight_decay_update = self._create_weight_decay_var(
@@ -240,8 +265,31 @@ def __init__(self,
     for var in reg_params.values():
       loss_reg += weight_decay_var * tf.nn.l2_loss(var)
 
+    # Adversarial loss, in case we want to add VAT on top of GAM.
+    loss_vat = get_loss_vat(
+        input_features_unlabeled, predictions_unlabeled, is_train, model,
+        predictions_var_scope)
+    num_unlabeled = tf.shape(input_features_unlabeled)[0]
+    loss_vat = tf.cond(tf.greater(num_unlabeled, 0),
+                       lambda: loss_vat,
+                       lambda: 0.0)
+    if self.use_ent_min:
+      # Use entropy minimization with VAT (i.e. VATENT).
+      loss_ent = entropy_y_x(predictions_unlabeled)
+      loss_vat = loss_vat + tf.cond(tf.greater(num_unlabeled, 0),
+                                    lambda: loss_ent,
+                                    lambda: 0.0)
+    loss_vat = loss_vat * self.reg_weight_vat
+    if self.first_iter_original:
+      # Do not add the adversarial loss in the first iteration if
+      # the first iteration trains the plain baseline model.
+      weight_loss_vat = tf.cond(tf.greater(iter_cotrain, 0),
+                                lambda: 1.0,
+                                lambda: 0.0)
+      loss_vat = loss_vat * weight_loss_vat
+
     # Total loss.
-    loss_op = loss_supervised + loss_agr + loss_reg
+    loss_op = loss_supervised + loss_agr + loss_reg + loss_vat
 
     # Create accuracy.
     accuracy = tf.equal(tf.argmax(normalized_predictions, 1), input_labels)
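
Putting this hunk together: the VAT term enters the total classification objective with weight reg_weight_vat, is zeroed when the unlabeled batch is empty, and, when first_iter_original is set, is also zeroed during the first co-training iteration. As a summary in the notation introduced earlier (not source text):

\mathcal{L} = \mathcal{L}_{\mathrm{sup}} + \mathcal{L}_{\mathrm{agr}} + \mathcal{L}_{\mathrm{reg}}
    + \lambda_{\mathrm{vat}} \, g \left(\mathcal{L}_{\mathrm{vat}}
      + \mathbb{1}[\mathrm{use\_ent\_min}] \, \mathcal{L}_{\mathrm{ent}}\right),
\qquad
g = \mathbb{1}[\,|U| > 0\,] \cdot
    \begin{cases}
      \mathbb{1}[\mathrm{iter\_cotrain} > 0] & \text{if first\_iter\_original} \\
      1 & \text{otherwise}
    \end{cases}

where \lambda_{\mathrm{vat}} corresponds to reg_weight_vat and U is the unlabeled batch.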
@@ -310,6 +358,7 @@ def __init__(self,
 
     self.rng = np.random.RandomState(seed)
     self.input_features = input_features
+    self.input_features_unlabeled = input_features_unlabeled
     self.input_labels = input_labels
     self.predictions = predictions
     self.normalized_predictions = normalized_predictions
@@ -507,7 +556,8 @@ def _construct_feed_dict(self,
                            split,
                            pair_ll_iterator=None,
                            pair_lu_iterator=None,
-                           pair_uu_iterator=None):
+                           pair_uu_iterator=None,
+                           data_iterator_unlabeled=None):
     """Construct feed dictionary."""
     try:
       input_indices = next(data_iterator)
@@ -521,6 +571,14 @@ def _construct_feed_dict(self,
         self.input_labels: labels,
         self.is_train: split == 'train'
     }
+    if data_iterator_unlabeled is not None:
+      # This is not None only when using VAT regularization.
+      try:
+        input_indices = next(data_iterator_unlabeled)
+        input_features = self.data.get_features(input_indices)
+      except StopIteration:
+        input_features = np.zeros([0] + list(self.data.features_shape))
+      feed_dict.update({self.input_features_unlabeled: input_features})
     if pair_ll_iterator is not None:
       _, _, _, features_tgt, labels_src, labels_tgt = next(pair_ll_iterator)
       feed_dict.update({
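
As a standalone sketch of the feeding pattern this hunk adds (the helper name below is hypothetical, not from the repository): draw the next unlabeled batch, and fall back to an empty batch so the graph-side tf.cond contributes zero VAT loss instead of raising.

import numpy as np

def next_unlabeled_features(iterator, get_features, features_shape):
  """Hypothetical helper mirroring the feed-dict logic above."""
  try:
    indices = next(iterator)
    return get_features(indices)
  except StopIteration:
    # Zero-sized batch: tf.shape(input_features_unlabeled)[0] == 0, so the
    # tf.cond in the graph selects the 0.0 branch for the VAT loss.
    return np.zeros([0] + list(features_shape))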
@@ -720,6 +778,13 @@ def train(self, data, session=None, **kwargs):
         shuffle=True,
         allow_smaller_batch=False,
         repeat=True)
+    # Create an iterator for unlabeled samples for the VAT loss term.
+    data_iterator_unlabeled = batch_iterator(
+        unlabeled_indices,
+        batch_size=self.batch_size,
+        shuffle=True,
+        allow_smaller_batch=False,
+        repeat=True)
     # Create iterators for ll, lu, uu pairs of samples for the agreement term.
     if self.use_graph:
       pair_ll_iterator = self.edge_iterator(
@@ -750,7 +815,8 @@
           split='train',
           pair_ll_iterator=pair_ll_iterator,
           pair_lu_iterator=pair_lu_iterator,
-          pair_uu_iterator=pair_uu_iterator)
+          pair_uu_iterator=pair_uu_iterator,
+          data_iterator_unlabeled=data_iterator_unlabeled)
       if self.enable_summaries and step % self.summary_step == 0:
         loss_val, summary, iter_cls_total, _ = session.run(
             [self.loss_op, self.summary_op, self.iter_cls_total, self.train_op],

neural_structured_learning/research/gam/trainer/trainer_cotrain.py

Lines changed: 11 additions & 0 deletions
@@ -159,6 +159,11 @@ class TrainerCotraining(Trainer):
     num_pairs_reg: An integer representing the number of sample pairs of each
       type (LL, LU, UU) to include in each computation of the classification
       model loss.
+    reg_weight_vat: A float representing the weight of the virtual adversarial
+      training (VAT) regularization loss in the classification model loss
+      function.
+    use_ent_min: A boolean specifying whether to use entropy regularization with
+      VAT.
     penalize_neg_agr: Whether to not only encourage agreement between samples
       that the agreement model believes should have the same label, but also
       penalize agreement when two samples agree when the agreement model
@@ -245,6 +250,8 @@ def __init__(self,
                reg_weight_lu=0,
                reg_weight_uu=0,
                num_pairs_reg=100,
+               reg_weight_vat=0,
+               use_ent_min=False,
                penalize_neg_agr=False,
                use_l2_cls=True,
                first_iter_original=True,
@@ -314,6 +321,8 @@ def __init__(self,
     self.reg_weight_lu = reg_weight_lu
     self.reg_weight_uu = reg_weight_uu
     self.num_pairs_reg = num_pairs_reg
+    self.reg_weight_vat = reg_weight_vat
+    self.use_ent_min = use_ent_min
     self.penalize_neg_agr = penalize_neg_agr
     self.use_l2_classif = use_l2_cls
     self.first_iter_original = first_iter_original
@@ -506,6 +515,8 @@ def train(self, data, **kwargs):
         reg_weight_lu=self.reg_weight_lu,
         reg_weight_uu=self.reg_weight_uu,
         num_pairs_reg=self.num_pairs_reg,
+        reg_weight_vat=self.reg_weight_vat,
+        use_ent_min=self.use_ent_min,
         enable_summaries=self.enable_summaries_per_model,
         summary_step=self.summary_step_cls,
         summary_dir=self.summary_dir,

0 commit comments