
Commit 9bcb2d2

Add python wrapper for matmul_op and dot_product_attention
1 parent 234013a commit 9bcb2d2

File tree

5 files changed: +121 additions, -67 deletions


doc/api/v2/fluid/layers.rst

Lines changed: 6 additions & 0 deletions
@@ -364,6 +364,12 @@ split
 .. autofunction:: paddle.v2.fluid.layers.split
     :noindex:
 
+
+matmul
+------
+.. autofunction:: paddle.v2.fluid.layers.matmul
+    :noindex:
+
 logsigmoid
 ----------
 .. autofunction:: paddle.v2.fluid.layers.logsigmoid

doc/api/v2/fluid/nets.rst

Lines changed: 6 additions & 0 deletions
@@ -25,3 +25,9 @@ glu
 .. autofunction:: paddle.v2.fluid.nets.glu
     :noindex:
 
+
+dot_product_attention
+---------------------
+.. autofunction:: paddle.v2.fluid.nets.dot_product_attention
+    :noindex:
+

python/paddle/v2/fluid/layers/nn.py

Lines changed: 52 additions & 63 deletions
@@ -37,6 +37,7 @@
     'sequence_last_step',
     'dropout',
     'split',
+    'matmul',
 ]
 
 
@@ -1586,83 +1587,71 @@ def split(input, num_or_sections, dim=-1):
     return outs
 
 
-def matmul(x, y):
+def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
     """
-    Applies matrix multipication to two tensors.
+    Applies matrix multiplication to two tensors. Currently only rank 1 to rank
+    3 input tensors are supported.
 
-    This operator is used to perform (batched) matrix multiplication
-    over the last two dimensions of the input tensors `X` and `Y`.
+    The actual behavior depends on the shapes of :math:`x`, :math:`y` and the
+    flag values of :attr:`transpose_x`, :attr:`transpose_y`. Specifically:
 
-    If a transpose flag is specified, the last two dimensions of the
-    tensor are transposed. If the tensor is rank-1 of shape [D], then
-    for `X` it is treated as [1, D] in nontransposed form and as [D, 1]
-    in transposed form, whereas for `Y` it is the opposite: It is treated
-    as [D, 1] in nontransposed form and as [1, D] in transposed form.
+    - If a transpose flag is specified, the last two dimensions of the tensor
+      are transposed. If the tensor is rank-1 of shape :math:`[D]`, then for
+      :math:`x` it is treated as :math:`[1, D]` in nontransposed form and as
+      :math:`[D, 1]` in transposed form, whereas for :math:`y` it is the
+      opposite: it is treated as :math:`[D, 1]` in nontransposed form and as
+      :math:`[1, D]` in transposed form.
 
-    Examples without transpose:
-    - X: [K], Y: [K] => Out: [1]
-    - X: [K], Y: [K, N] => Out: [N]
-    - X: [B, M, K], Y: [K] => Out: [B, M]
-    - X: [M, K], Y: [B, K, N] => Out: [B, M, N]
-    - X: [B, M, K], Y: [B, K, N] => Out: [B, M, N]
+    - After the transposes, the two tensors are 2-D or 3-D, and the matrix
+      multiplication is performed in the following way.
 
-    The behavior is designed to be similar to the `numpy.matmul` function.
-    The differences are:
-    - Currently only rank 1 to rank 3 input tensors are supported.
-    - We add `transpose_X` and `transpose_Y` flags.
+      - If both are 2-D, they are multiplied like conventional matrices.
+      - If either is 3-D, it is treated as a stack of matrices residing in the
+        last two dimensions, and a batched matrix multiply supporting broadcast
+        applies to the two tensors.
 
-    Both the input `X` and `Y` can carry the LoD (Level of Details) information,
-    or not. But the output only shares the LoD information with input `X`.
+    Also note that if the raw tensor :math:`x` or :math:`y` is rank-1 and
+    nontransposed, the prepended or appended dimension :math:`1` will be
+    removed after the matrix multiplication.
 
     Args:
         x (Variable): The input variable which is a Tensor or LoDTensor.
-        y (Variable): If :attr:`num_or_sections` is an integer,
-            then the integer indicates the number of equal sized sub-tensors
-            that the tensor will be divided into. If :attr:`num_or_sections`
-            is a list of integers, the length of list indicates the number of
-            sub-tensors and the integers indicate the sizes of sub-tensors'
-            :attr:`dim` dimension orderly.
-        dim (int): The dimension along which to split. If :math:`dim < 0`, the
-            dimension to split along is :math:`rank(input) + dim`.
+        y (Variable): The input variable which is a Tensor or LoDTensor.
+        transpose_x (bool): Whether to transpose :math:`x` before multiplication.
+        transpose_y (bool): Whether to transpose :math:`y` before multiplication.
+        name (str|None): A name for this layer (optional). If set to None, the
+            layer will be named automatically.
 
     Returns:
-        List: The list of segmented tensor variables.
+        Variable: The product Tensor variable.
 
     Examples:
         .. code-block:: python
 
-            # x is a Tensor variable with shape [3, 9, 5]:
-            x0, x1, x2 = fluid.layers.split(x, num_or_sections=3, dim=1)
-            x0.shape  # [3, 3, 5]
-            x1.shape  # [3, 3, 5]
-            x2.shape  # [3, 3, 5]
-            x0, x1, x2 = fluid.layers.split(x, num_or_sections=[2, 3, 4], dim=1)
-            x0.shape  # [3, 2, 5]
-            x1.shape  # [3, 3, 5]
-            x2.shape  # [3, 4, 5]
+            # Examples to clarify shapes of the inputs and output
+            # x: [B, M, K], y: [B, K, N]
+            fluid.layers.matmul(x, y)  # out: [B, M, N]
+            # x: [B, M, K], y: [K, N]
+            fluid.layers.matmul(x, y)  # out: [B, M, N]
+            # x: [B, M, K], y: [K]
+            fluid.layers.matmul(x, y)  # out: [B, M]
+            # x: [M, K], y: [K, N]
+            fluid.layers.matmul(x, y)  # out: [M, N]
+            # x: [K], y: [K]
+            fluid.layers.matmul(x, y)  # out: [1]
+            # x: [M], y: [N]
+            fluid.layers.matmul(x, y, True, True)  # out: [M, N]
     """
-    helper = LayerHelper('split', **locals())
-    input_shape = input.shape
-    dim = (len(input_shape) + dim) if dim < 0 else dim
-    if isinstance(num_or_sections, int):
-        assert num_or_sections > 1, 'num_or_sections must be more than 1.'
-        num = num_or_sections
-    else:
-        assert len(num_or_sections) < input_shape[
-            dim], 'len(num_or_sections) must not be more than input.shape[dim].'
-        num = len(num_or_sections)
-    outs = [
-        helper.create_tmp_variable(dtype=helper.input_dtype())
-        for i in range(num)
-    ]
+    helper = LayerHelper('matmul', **locals())
+    assert max(
+        len(x.shape), len(y.shape)
+    ) <= 3, 'Currently only rank 1 to rank 3 input tensors are supported.'
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
     helper.append_op(
-        type='split',
-        inputs={'X': input},
-        outputs={'Out': outs},
-        attrs={
-            'num': num_or_sections if isinstance(num_or_sections, int) else 0,
-            'sections': num_or_sections
-            if isinstance(num_or_sections, list) else [],
-            'axis': dim
-        })
-    return outs
+        type='matmul',
+        inputs={'X': x,
+                'Y': y},
+        outputs={'Out': out},
+        attrs={'transpose_X': transpose_x,
+               'transpose_Y': transpose_y})
+    return out
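
The shape rules in the new docstring mirror numpy.matmul for the 2-D and broadcasted batch cases; below is a minimal NumPy sketch (not part of the commit, with made-up dimension sizes) that sanity-checks those shapes.

import numpy as np

B, M, K, N = 2, 3, 4, 5

# x: [B, M, K], y: [B, K, N]  ->  out: [B, M, N]
assert np.matmul(np.ones((B, M, K)), np.ones((B, K, N))).shape == (B, M, N)
# x: [B, M, K], y: [K, N]  ->  out: [B, M, N]  (y is broadcast over the batch)
assert np.matmul(np.ones((B, M, K)), np.ones((K, N))).shape == (B, M, N)
# x: [B, M, K], y: [K]  ->  out: [B, M]  (the appended dimension of y is removed)
assert np.matmul(np.ones((B, M, K)), np.ones((K,))).shape == (B, M)
# x: [M, K], y: [K, N]  ->  out: [M, N]
assert np.matmul(np.ones((M, K)), np.ones((K, N))).shape == (M, N)
# x: [K], y: [K] is documented as out: [1]; plain numpy.matmul returns a 0-d
# scalar here, which is one place the op's convention differs from NumPy.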

python/paddle/v2/fluid/nets.py

Lines changed: 53 additions & 0 deletions
@@ -4,6 +4,7 @@
     "simple_img_conv_pool",
     "sequence_conv_pool",
     "glu",
+    "dot_product_attention",
 ]
 
 
@@ -135,3 +136,55 @@ def glu(input, dim=-1):
     a, b = layers.split(input, num_or_sections=2, dim=dim)
     out = layers.elementwise_mul(x=a, y=b)
     return out
+
+
+def dot_product_attention(querys, keys, values):
+    """
+    The dot-product attention.
+
+    The attention mechanism can be seen as mapping a query and a set of key-value
+    pairs to an output. The output is computed as a weighted sum of the values,
+    where the weight assigned to each value is computed by a compatibility
+    function (dot-product here) of the query with the corresponding key.
+
+    The dot-product attention can be implemented through (batch) matrix
+    multiplication as follows:
+
+    .. math::
+
+        Attention(Q, K, V) = softmax(QK^\mathrm{T})V
+
+    Refer to `Attention Is All You Need
+    <https://arxiv.org/pdf/1706.03762.pdf>`_.
+
+    Note that batch data containing sequences of different lengths is not
+    supported by this layer because of the (batch) matrix multiplication.
+
+    Args:
+        querys (Variable): The input variable which is a Tensor or LoDTensor.
+        keys (Variable): The input variable which is a Tensor or LoDTensor.
+        values (Variable): The input variable which is a Tensor or LoDTensor.
+
+    Returns:
+        tuple: The Tensor variables representing the output and attention scores.
+
+    Examples:
+        .. code-block:: python
+
+            # Suppose q, k, v are tensor variables with the following shape:
+            # q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10]
+            out, attn_scores = fluid.nets.dot_product_attention(q, k, v)
+            out.shape  # [3, 5, 10]
+            attn_scores.shape  # [3, 5, 6]
+    """
+    assert keys.shape[-2] == values.shape[
+        -2], 'The shapes of keys and values mismatch.'
+    assert querys.shape[-1] == keys.shape[
+        -1], 'The shapes of querys and keys mismatch.'
+    product = layers.matmul(x=querys, y=keys, transpose_y=True)
+    attn_scores = layers.reshape(
+        x=layers.reshape(
+            x=product, shape=[-1, product.shape[-1]], act='softmax'),
+        shape=product.shape)
+    out = layers.matmul(attn_scores, values)
+    return out, attn_scores
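
To make the shape flow above concrete, here is a hedged NumPy reference of the same unscaled attention, Attention(Q, K, V) = softmax(QK^T)V, using the docstring's example shapes; the helper name np_dot_product_attention is illustrative and not part of the commit.

import numpy as np

def np_dot_product_attention(q, k, v):
    # Batched Q K^T over the last two dimensions: [batch, q_len, k_len]
    scores = np.matmul(q, np.transpose(k, (0, 2, 1)))
    # Numerically stable row-wise softmax over the key dimension.
    scores = np.exp(scores - scores.max(axis=-1, keepdims=True))
    scores /= scores.sum(axis=-1, keepdims=True)
    # Weighted sum of the values: [batch, q_len, v_dim]
    return np.matmul(scores, v), scores

q = np.random.rand(3, 5, 9)
k = np.random.rand(3, 6, 9)
v = np.random.rand(3, 6, 10)
out, attn_scores = np_dot_product_attention(q, k, v)
assert out.shape == (3, 5, 10) and attn_scores.shape == (3, 5, 6)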

python/paddle/v2/fluid/tests/test_matmul_op.py

Lines changed: 4 additions & 4 deletions
@@ -83,18 +83,18 @@ def setUp(self):
         self.outputs = {'Out': Out}
 
     def test_check_output(self):
-        self.check_output(atol=1e-2)
+        self.check_output(atol=1e-3)
 
     def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-3)
 
     def test_check_grad_ignore_x(self):
         self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+            ['Y'], 'Out', max_relative_error=1e-3, no_grad_set=set("X"))
 
     def test_check_grad_ignore_y(self):
         self.check_grad(
-            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+            ['X'], 'Out', max_relative_error=1e-3, no_grad_set=set('Y'))
 
 
 # Generate test cases for all possibilities
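
For context on what these tightened checks compare against, here is a hedged NumPy sketch of a transpose-aware reference matmul. It is an assumption about the shape handling, not the repository's actual test helper; the rank-1 promotion follows the matmul docstring, and squeezing of the prepended/appended 1 in the output is omitted.

import numpy as np

def reference_matmul_sketch(x, y, transpose_x=False, transpose_y=False):
    # Rank-1 x is treated as [1, D] (or [D, 1] when transposed).
    if x.ndim == 1:
        x = x.reshape((x.size, 1)) if transpose_x else x.reshape((1, x.size))
    elif transpose_x:
        x = np.swapaxes(x, -1, -2)
    # Rank-1 y is treated as [D, 1] (or [1, D] when transposed).
    if y.ndim == 1:
        y = y.reshape((1, y.size)) if transpose_y else y.reshape((y.size, 1))
    elif transpose_y:
        y = np.swapaxes(y, -1, -2)
    return np.matmul(x, y)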

0 commit comments
