Merge pull request #12 from antoinedemathelin/master

antoinedemathelin · web-flow · commit 10e1e0e796e4 · 2021-10-18T18:15:09.000+02:00
feat: Add WDGRL method
diff --git a/adapt/feature_based/__init__.py b/adapt/feature_based/__init__.py
@@ -11,5 +11,6 @@
 from ._deepcoral import DeepCORAL
 from ._mcd import MCD
 from ._mdd import MDD
+from ._wdgrl import WDGRL
 
-__all__ = ["FE", "CORAL", "DeepCORAL", "ADDA", "DANN", "mSDA", "MCD", "MDD", "BaseDeepFeature"]
+__all__ = ["FE", "CORAL", "DeepCORAL", "ADDA", "DANN", "mSDA", "MCD", "MDD", "WDGRL", "BaseDeepFeature"]
diff --git a/adapt/feature_based/_wdgrl.py b/adapt/feature_based/_wdgrl.py
@@ -0,0 +1,264 @@
+"""
+WDGRL
+"""
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras import Model, Sequential
+from tensorflow.keras.layers import Layer, subtract
+from tensorflow.keras.optimizers import Adam
+import tensorflow.keras.backend as K
+
+from adapt.utils import (GradientHandler,
+                         check_arrays)
+from adapt.feature_based import BaseDeepFeature
+
+EPS = K.epsilon()
+
+
+class _Interpolation(Layer):
+    """
+    Layer that produces interpolated points between
+    two entries, together with the distance from each
+    interpolated point to the first entry.
+    """
+    
+    def call(self, inputs):
+        Xs = inputs[0]
+        Xt = inputs[1]
+        batch_size = tf.shape(Xs)[0]
+        dim = tf.shape(Xs)[1:]
+        alphas = tf.random.uniform([batch_size]+[1]*len(dim))
+        tiled_shape = tf.concat(([1], dim), 0)
+        tiled_alphas = tf.tile(alphas, tiled_shape)
+        differences = Xt - Xs
+        interpolates = Xs + tiled_alphas * differences
+        distances = K.sqrt(K.mean(K.square(tiled_alphas * differences),
+                          axis=[i for i in range(1, len(dim))]) + EPS)
+        return interpolates, distances
+
+
+class WDGRL(BaseDeepFeature):
+    """
+    WDGRL (Wasserstein Distance Guided Representation Learning) is an
+    unsupervised domain adaptation method modeled on
+    :ref:`DANN <adapt.feature_based.DANN>`. In WDGRL the discriminator
+    is used to approximate the Wasserstein distance between the
+    source and target encoded distributions in the spirit of WGAN.
+    
+    The optimization formulation is the following:
+    
+    .. math::
+    
+        \\min_{\\phi, F} & \\; \\mathcal{L}_{task}(F(\\phi(X_S)), y_S) +
+        \\lambda \\left(D(\\phi(X_S)) - D(\\phi(X_T)) \\right) \\\\
+        \\max_{D} & \\; \\left(D(\\phi(X_S)) - D(\\phi(X_T)) \\right) -
+        \\gamma (||\\nabla D(\\alpha \\phi(X_S) + (1- \\alpha) \\phi(X_T))||_2 - 1)^2
+        
+    Where:
+    
+    - :math:`(X_S, y_S), (X_T)` are respectively the labeled source data
+      and the unlabeled target data.
+    - :math:`\\phi, F, D` are respectively the **encoder**, the **task**
+      and the **discriminator** networks.
+    - :math:`\\lambda` is the trade-off parameter.
+    - :math:`\\gamma` is the gradient penalty parameter.
+    
+    .. figure:: ../_static/images/wdgrl.png
+        :align: center
+        
+        WDGRL architecture (source: [1])
+    
+    Parameters
+    ----------
+    encoder : tensorflow Model (default=None)
+        Encoder network. If ``None``, a shallow network with 10
+        neurons and ReLU activation is used as encoder network.
+        
+    task : tensorflow Model (default=None)
+        Task network. If ``None``, a two-layer network with 10
+        neurons per layer and ReLU activation is used as task network.
+        
+    discriminator : tensorflow Model (default=None)
+        Discriminator network. If ``None``, a two-layer network with 10
+        neurons per layer and ReLU activation is used as discriminator
+        network. Note that the output shape of the discriminator should
+        be ``(None, 1)``.
+        
+    lambda_ : float or None (default=1)
+        Trade-off parameter. This parameter gives the trade-off
+        for the encoder between learning the task and matching
+        the source and target distributions. If `lambda_` is small
+        the encoder will focus on the task. If `lambda_=0`, WDGRL
+        is equivalent to a "source only" method.
+        
+    gamma : float (default=1.)
+        Gradient penalization parameter. To approximate the
+        Wasserstein distance well, the `discriminator` should be
+        1-Lipschitz. This constraint is imposed by the gradient penalty
+        term of the optimization. A good value of `gamma` is
+        not easy to find. One can check through the metrics that
+        the gradient penalty term is of the same order of magnitude as
+        the "disc loss". If `gamma=0`, no penalty is given on the
+        discriminator gradient.
+        
+    loss : string or tensorflow loss (default="mse")
+        Loss function used for the task.
+        
+    metrics : dict or list of string or tensorflow metrics (default=None)
+        Metrics given to the model. If a list is provided,
+        metrics are used on both ``task`` and ``discriminator``
+        outputs. To give separate metrics, please provide a
+        dict of metric lists with ``"task"`` and ``"disc"`` as keys.
+        
+    optimizer : string or tensorflow optimizer (default=None)
+        Optimizer of the model. If ``None``, the
+        optimizer is set to tf.keras.optimizers.Adam(0.001)
+        
+    copy : boolean (default=True)
+        Whether to make a copy of ``encoder``, ``task`` and
+        ``discriminator`` or not.
+        
+    random_state : int (default=None)
+        Seed of random generator.
+    
+    Attributes
+    ----------
+    encoder_ : tensorflow Model
+        encoder network.
+        
+    task_ : tensorflow Model
+        task network.
+        
+    discriminator_ : tensorflow Model
+        discriminator network.
+    
+    model_ : tensorflow Model
+        Fitted model: the union of ``encoder_``,
+        ``task_`` and ``discriminator_`` networks.
+        
+    history_ : dict
+        history of the losses and metrics across the epochs.
+        If ``yt`` is given in ``fit`` method, target metrics
+        and losses are recorded too.
+        
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from adapt.feature_based import WDGRL
+    >>> np.random.seed(0)
+    >>> Xs = np.concatenate((np.random.random((100, 1)),
+    ...                      np.zeros((100, 1))), 1)
+    >>> Xt = np.concatenate((np.random.random((100, 1)),
+    ...                      np.ones((100, 1))), 1)
+    >>> ys = 0.2 * Xs[:, 0]
+    >>> yt = 0.2 * Xt[:, 0]
+    >>> model = WDGRL(lambda_=0., random_state=0)
+    >>> model.fit(Xs, ys, Xt, yt, epochs=100, verbose=0)
+    >>> model.history_["task_t"][-1]
+    0.0223...
+    >>> model = WDGRL(lambda_=1, random_state=0)
+    >>> model.fit(Xs, ys, Xt, yt, epochs=100, verbose=0)
+    >>> model.history_["task_t"][-1]
+    0.0044...
+        
+    See also
+    --------
+    DANN
+    ADDA
+    DeepCORAL
+    
+    References
+    ----------
+    .. [1] `[1] <https://arxiv.org/pdf/1707.01217.pdf>`_ Shen, J., Qu, Y., Zhang, W., \
+and Yu, Y. Wasserstein distance guided representation learning for domain adaptation. \
+In AAAI, 2018.
+    """
+    def __init__(self, 
+                 encoder=None,
+                 task=None,
+                 discriminator=None,
+                 lambda_=1.,
+                 gamma=1.,
+                 loss="mse",
+                 metrics=None,
+                 optimizer=None,
+                 copy=True,
+                 random_state=None):
+        
+        self.lambda_ = lambda_
+        self.gamma = gamma        
+        super().__init__(encoder, task, discriminator,
+                         loss, metrics, optimizer, copy,
+                         random_state)
+
+    
+    def create_model(self, inputs_Xs, inputs_Xt):
+
+        encoded_src = self.encoder_(inputs_Xs)
+        encoded_tgt = self.encoder_(inputs_Xt)
+        task_src = self.task_(encoded_src)
+        task_tgt = self.task_(encoded_tgt)
+        
+        flip = GradientHandler(-self.lambda_, name="flip")
+        no_grad = GradientHandler(0, name="no_grad")
+        
+        disc_src = flip(encoded_src)
+        disc_src = self.discriminator_(disc_src)
+        disc_tgt = flip(encoded_tgt)
+        disc_tgt = self.discriminator_(disc_tgt)
+        
+        encoded_src_no_grad = no_grad(encoded_src)
+        encoded_tgt_no_grad = no_grad(encoded_tgt)
+        
+        interpolates, distances = _Interpolation()([encoded_src_no_grad, encoded_tgt_no_grad])
+        disc_grad = K.abs(
+            subtract([self.discriminator_(interpolates), self.discriminator_(encoded_src_no_grad)])
+        )
+        disc_grad /= distances
+        
+        outputs = dict(task_src=task_src,
+                       task_tgt=task_tgt,
+                       disc_src=disc_src,
+                       disc_tgt=disc_tgt,
+                       disc_grad=disc_grad)
+        return outputs
+
+    
+    def get_loss(self, inputs_ys,
+                  task_src, task_tgt,
+                  disc_src, disc_tgt,
+                  disc_grad):
+        
+        loss_task = self.loss_(inputs_ys, task_src)
+        loss_disc = K.mean(disc_src) - K.mean(disc_tgt)
+        gradient_penalty = K.mean(K.square(disc_grad-1.))
+                            
+        loss = K.mean(loss_task) - K.mean(loss_disc) + self.gamma * K.mean(gradient_penalty)
+        return loss
+    
+    
+    def get_metrics(self, inputs_ys, inputs_yt,
+                     task_src, task_tgt,
+                     disc_src, disc_tgt, disc_grad):
+        metrics = {}
+        
+        task_s = self.loss_(inputs_ys, task_src)
+        disc = K.mean(disc_src) - K.mean(disc_tgt)
+        grad_pen = K.square(disc_grad-1.)
+        
+        metrics["task_s"] = K.mean(task_s)
+        metrics["disc"] = K.mean(disc)
+        metrics["grad_pen"] = self.gamma * K.mean(grad_pen)
+       
+        if inputs_yt is not None:
+            task_t = self.loss_(inputs_yt, task_tgt)
+            metrics["task_t"] = K.mean(task_t)
+        
+        names_task, names_disc = self._get_metric_names()
+        
+        for metric, name in zip(self.metrics_task_, names_task):
+            metrics[name + "_s"] = metric(inputs_ys, task_src)
+            if inputs_yt is not None:
+                metrics[name + "_t"] = metric(inputs_yt, task_tgt)
+        return metrics
diff --git a/src_docs/_static/images/wdgrl.png b/src_docs/_static/images/wdgrl.png
diff --git a/src_docs/_templates/layout.html b/src_docs/_templates/layout.html
@@ -27,6 +27,7 @@
 <li class="toctree-l2"><a class="reference internal" href="{{ pathto("generated/adapt.feature_based.mSDA") }}">mSDA</a></li>
 <li class="toctree-l2"><a class="reference internal" href="{{ pathto("generated/adapt.feature_based.MCD") }}">MCD</a></li>
 <li class="toctree-l2"><a class="reference internal" href="{{ pathto("generated/adapt.feature_based.MDD") }}">MDD</a></li>
+<li class="toctree-l2"><a class="reference internal" href="{{ pathto("generated/adapt.feature_based.WDGRL") }}">WDGRL</a></li>
 </ul>
 </li>
 <li class="toctree-l1"><a class="reference internal" href="{{ pathto("contents") }}{{ contents }}{{ "adapt-instance-based" }}">Instance-based</a><ul>
diff --git a/src_docs/contents.rst b/src_docs/contents.rst
@@ -64,6 +64,7 @@ and **target** distributions. The **task** is then learned in this **encoded fea
    feature_based.mSDA
    feature_based.MCD
    feature_based.MDD
+   feature_based.WDGRL
 
    
 .. _adapt.instance_based:
diff --git a/src_docs/gallery/WDGRL.rst b/src_docs/gallery/WDGRL.rst
diff --git a/tests/test_wdgrl.py b/tests/test_wdgrl.py
@@ -0,0 +1,94 @@
+"""
+Test functions for wdgrl module.
+"""
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras import Sequential, Model
+from tensorflow.keras.layers import Dense
+from tensorflow.keras.optimizers import Adam
+
+from adapt.feature_based import WDGRL
+from adapt.feature_based._wdgrl import _Interpolation
+
+Xs = np.concatenate((
+    np.linspace(0, 1, 100).reshape(-1, 1),
+    np.zeros((100, 1))
+    ), axis=1)
+Xt = np.concatenate((
+    np.linspace(0, 1, 100).reshape(-1, 1),
+    np.ones((100, 1))
+    ), axis=1)
+ys = 0.2 * Xs[:, 0].ravel()
+yt = 0.2 * Xt[:, 0].ravel()
+
+
+def _get_encoder(input_shape=Xs.shape[1:]):
+    model = Sequential()
+    model.add(Dense(1, input_shape=input_shape,
+                    kernel_initializer="ones",
+                    use_bias=False))
+    model.compile(loss="mse", optimizer="adam")
+    return model
+
+
+def _get_discriminator(input_shape=(1,)):
+    model = Sequential()
+    model.add(Dense(10,
+                    input_shape=input_shape,
+                    activation="relu"))
+    model.add(Dense(1,
+                    activation=None))
+    model.compile(loss="mse", optimizer="adam")
+    return model
+
+
+def _get_task(input_shape=(1,), output_shape=(1,)):
+    model = Sequential()
+    model.add(Dense(np.prod(output_shape),
+                    use_bias=False,
+                    input_shape=input_shape))
+    model.compile(loss="mse", optimizer=Adam(0.1))
+    return model
+
+
+def test_interpolation():
+    np.random.seed(0)
+    tf.random.set_seed(0)
+
+    zeros = tf.identity(np.zeros((3, 1), dtype=np.float32))
+    ones= tf.identity(np.ones((3, 1), dtype=np.float32))
+
+    inter, dist = _Interpolation().call([zeros, ones])
+    assert np.all(np.round(dist, 3) == np.round(inter, 3))
+    assert np.all(inter >= zeros)
+    assert np.all(inter <= ones)
+
+
+def test_fit_lambda_zero():
+    tf.random.set_seed(1)
+    np.random.seed(1)
+    model = WDGRL(_get_encoder(), _get_task(), _get_discriminator(),
+                 lambda_=0, loss="mse", optimizer=Adam(0.01), metrics=["mse"],
+                 random_state=0)
+    model.fit(Xs, ys, Xt, yt,
+              epochs=300, verbose=0)
+    assert isinstance(model.model_, Model)
+    assert model.encoder_.get_weights()[0][1][0] == 1.0
+    assert np.sum(np.abs(model.predict(Xs).ravel() - ys)) < 0.01
+    assert np.sum(np.abs(model.predict(Xt).ravel() - yt)) > 10
+
+
+def test_fit_lambda_one():
+    tf.random.set_seed(1)
+    np.random.seed(1)
+    model = WDGRL(_get_encoder(), _get_task(), _get_discriminator(),
+                 lambda_=1, gamma=0, loss="mse", optimizer=Adam(0.01),
+                  metrics=["mse"], random_state=0)
+    model.fit(Xs, ys, Xt, yt,
+              epochs=300, verbose=0)
+    assert isinstance(model.model_, Model)
+    assert np.abs(model.encoder_.get_weights()[0][1][0] / 
+            model.encoder_.get_weights()[0][0][0]) < 0.05
+    assert np.sum(np.abs(model.predict(Xs).ravel() - ys)) < 2
+    assert np.sum(np.abs(model.predict(Xt).ravel() - yt)) < 2