
Commit a948eea

Author: xutianbing (committed)
clean unused code.
1 parent 58827e3

13 files changed (+64, −557 lines)


paddle/cuda/include/hl_matrix.h

Lines changed: 0 additions & 42 deletions
@@ -188,48 +188,6 @@ extern void hl_param_relu_backward_diff(real* grad_o,
                                         int width,
                                         int height,
                                         int partial_sum);
-/**
- * @brief cos sim forward
- *
- * @param[out]  output         output data
- * @param[in]   input1         input1 data(matrix)
- * @param[in]   input2         input2 data(matrix or vector)
- * @param[in]   width          matrix width
- * @param[in]   input1_height  input1_height
- * @param[in]   input2_height  input2_height
- * @param[in]   scale          scale factor
- */
-extern void hl_cossim(real* output,
-                      real* input1,
-                      real* input2,
-                      int width,
-                      int input1_height,
-                      int input2_height,
-                      real scale);
-/**
- * @brief cos sim derivate
- *
- * @param[in]   grad           output grad
- * @param[in]   output         output data
- * @param[in]   prevOutX       input1 data
- * @param[in]   prevOutY       input2 data
- * @param[out]  prevGradX      input1 grad
- * @param[out]  prevGradY      input2 grad
- * @param[in]   width          matrix width
- * @param[in]   input1_height  input1 height
- * @param[in]   input2_height  input2 height
- * @param[in]   scale          scale factor
- */
-extern void hl_cossim_derivative(real* grad,
-                                 real* output,
-                                 real* prevOutX,
-                                 real* prevOutY,
-                                 real* prevGradX,
-                                 real* prevGradY,
-                                 int width,
-                                 int input1_height,
-                                 int input2_height,
-                                 real scale);
 
 /**
  * @brief Matrix addition: A_d[i][j] += scale * B_d[j/channel].

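For reference, the removed hl_cossim entry point computed a per-row scaled cosine similarity, broadcasting input2 across rows when it holds a single vector. A minimal CPU sketch of that contract (cossim_reference is an illustrative name and real is fixed to float here; neither is part of the codebase):

#include <cmath>

// Hypothetical CPU reference for the removed hl_cossim:
// output[i] = scale * <x_i, y_i> / (|x_i| * |y_i|) for each row i.
void cossim_reference(float* output, const float* input1, const float* input2,
                      int width, int input1_height, int input2_height,
                      float scale) {
  for (int i = 0; i < input1_height; ++i) {
    const float* x = input1 + i * width;
    // input2 is broadcast when it holds a single row (input2_height == 1).
    const float* y = input2 + (input2_height > 1 ? i * width : 0);
    float xx = 0, yy = 0, xy = 0;
    for (int j = 0; j < width; ++j) {
      xx += x[j] * x[j];
      yy += y[j] * y[j];
      xy += x[j] * y[j];
    }
    output[i] = scale * xy / (std::sqrt(xx) * std::sqrt(yy));
  }
}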
paddle/cuda/include/stub/hl_matrix_stub.h

Lines changed: 0 additions & 19 deletions
@@ -74,25 +74,6 @@ inline void hl_param_relu_backward_diff(real* grad_o,
                                         int height,
                                         int partial_sum) {}
 
-inline void hl_cossim(real* output,
-                      real* input1,
-                      real* input2,
-                      int width,
-                      int input1_height,
-                      int input2_height,
-                      real scale) {}
-
-inline void hl_cossim_derivative(real* grad,
-                                 real* output,
-                                 real* prevOutX,
-                                 real* prevOutY,
-                                 real* prevGradX,
-                                 real* prevGradY,
-                                 int width,
-                                 int input1_height,
-                                 int input2_height,
-                                 real scale) {}
-
 inline void hl_matrix_add_shared_bias(real* A_d,
                                       real* B_d,
                                       const int channel,

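These stubs exist so that builds without CUDA still compile and link code that calls the GPU entry points; removing the functions therefore means removing their no-op twins too. A minimal sketch of the pairing (HL_CPU_ONLY_BUILD is a hypothetical guard for illustration; the real project selects between hl_matrix.h and the stub header through its build configuration):

typedef float real;  // Paddle's `real` is float or double depending on build

#ifndef HL_CPU_ONLY_BUILD  // hypothetical guard, illustration only
extern void hl_example_op(real* data, int size);    // backed by a CUDA kernel
#else
inline void hl_example_op(real* data, int size) {}  // no-op stub, still links
#endif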
paddle/cuda/src/hl_cuda_matrix.cu

Lines changed: 0 additions & 171 deletions
@@ -584,177 +584,6 @@ void hl_param_relu_backward_diff(real* grad_o,
   CHECK_SYNC("hl_param_relu_backward_diff failed");
 }
 
-template<int blockSize>
-__global__ void KeCosSim(real* output,
-                         real* input1,
-                         real* input2,
-                         int width,
-                         int input1_height,
-                         int input2_height,
-                         real scale) {
-  const int ty = blockIdx.y;
-  int tid = threadIdx.x;
-
-  __shared__ real xx[blockSize];
-  __shared__ real yy[blockSize];
-  __shared__ real xy[blockSize];
-
-  xx[tid] = 0.0;
-  yy[tid] = 0.0;
-  xy[tid] = 0.0;
-  __syncthreads();
-
-  input1 += ty * width;
-  if (input2_height > 1) {
-    input2 += ty * width;
-  }
-  for (int index = tid; index < width; index += blockSize) {
-    real x = input1[index];
-    real y = input2[index];
-    xx[tid] += x * x;
-    yy[tid] += y * y;
-    xy[tid] += x * y;
-  }
-  __syncthreads();
-
-  for (int s = blockSize / 2; s > 0; s >>= 1) {
-    if (tid < s) {
-      xx[tid] += xx[tid + s];
-      yy[tid] += yy[tid + s];
-      xy[tid] += xy[tid + s];
-    }
-    __syncthreads();
-  }
-  if (tid == 0) {
-    output[ty] = scale * xy[0] / (sqrt(xx[0]) * sqrt(yy[0]));
-  }
-}
-
-void hl_cossim(real* output,
-               real* input1,
-               real* input2,
-               int width,
-               int input1_height,
-               int input2_height,
-               real scale) {
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(input1);
-  CHECK_NOTNULL(input2);
-  const int blockSize = 256;
-  dim3 threads(blockSize, 1);
-  dim3 grid(1, input1_height);
-
-  KeCosSim<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>
-    (output, input1, input2, width, input1_height, input2_height, scale);
-  CHECK_SYNC("hl_cossim failed");
-}
-
-template<int blockSize>
-__global__ void KeCosSimDerivative(real* grad,
-                                   real* output,
-                                   real* prevOutX,
-                                   real* prevOutY,
-                                   real* prevGradX,
-                                   real* prevGradY,
-                                   int width,
-                                   int input1_height,
-                                   int input2_height,
-                                   real scale) {
-  const int ty = blockIdx.y;
-  int tid = threadIdx.x;
-
-  __shared__ real xx[blockSize];
-  __shared__ real yy[blockSize];
-  __shared__ real xy[blockSize];
-
-  xx[tid] = 0.0;
-  yy[tid] = 0.0;
-  xy[tid] = 0.0;
-  __syncthreads();
-
-  prevOutX += ty * width;
-  prevGradX += ty * width;
-  if (input2_height > 1) {
-    prevOutY += ty * width;
-    prevGradY += ty * width;
-  }
-  for (int index = tid; index < width; index += blockSize) {
-    real x = prevOutX[index];
-    real y = prevOutY[index];
-    xx[tid] += x * x;
-    yy[tid] += y * y;
-    xy[tid] += x * y;
-  }
-  __syncthreads();
-
-  for (int s = blockSize / 2; s > 0; s >>= 1) {
-    if (tid < s) {
-      xx[tid] += xx[tid + s];
-      yy[tid] += yy[tid + s];
-      xy[tid] += xy[tid + s];
-    }
-    __syncthreads();
-  }
-  if (xy[0] == 0) {
-    real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
-    for (int index = tid; index < width; index += blockSize) {
-      prevGradX[index] +=
-        scale * grad[ty] * prevOutY[index] * reciprocal;
-      if (input2_height > 1) {
-        prevGradY[index] +=
-          scale * grad[ty] * prevOutX[index] * reciprocal;
-      } else {
-        paddle::paddleAtomicAdd(prevGradY + index,
-          scale * grad[ty] * prevOutX[index] * reciprocal);
-      }
-    }
-  } else {
-    real reciprocalXY = 1.0 / xy[0];
-    real reciprocalSquareSumX = 1.0 / xx[0];
-    real reciprocalSquareSumY = 1.0 / yy[0];
-    for (int index = tid; index < width; index += blockSize) {
-      prevGradX[index] += output[ty] * grad[ty] *
-        (prevOutY[index] * reciprocalXY -
-         prevOutX[index] * reciprocalSquareSumX);
-      if (input2_height > 1) {
-        prevGradY[index] += output[ty] * grad[ty] *
-          (prevOutX[index] * reciprocalXY -
-           prevOutY[index] * reciprocalSquareSumY);
-      } else {
-        paddle::paddleAtomicAdd(prevGradY + index, output[ty] * grad[ty] *
-          (prevOutX[index] * reciprocalXY -
-           prevOutY[index] * reciprocalSquareSumY));
-      }
-    }
-  }
-}
-
-
-void hl_cossim_derivative(real* grad,
-                          real* output,
-                          real* prevOutX,
-                          real* prevOutY,
-                          real* prevGradX,
-                          real* prevGradY,
-                          int width,
-                          int input1_height,
-                          int input2_height,
-                          real scale) {
-  CHECK_NOTNULL(grad);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(prevOutX);
-  CHECK_NOTNULL(prevOutY);
-  CHECK_NOTNULL(prevGradX);
-  CHECK_NOTNULL(prevGradY);
-  const int blockSize = 256;
-  dim3 threads(blockSize, 1);
-  dim3 grid(1, input1_height);
-  KeCosSimDerivative<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>
-    (grad, output, prevOutX, prevOutY, prevGradX, prevGradY, width,
-     input1_height, input2_height, scale);
-  CHECK_SYNC("hl_cossim_derivate failed");
-}
-
 
 __global__ void KeMatrixAddSharedBias(real* A,
                                       real* B,
                                       const int channel,

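The removed KeCosSim and KeCosSimDerivative kernels both rely on the same shared-memory tree reduction: each thread accumulates a strided partial sum into its slot, then the buffer is folded in halves until slot 0 holds the block total. A standalone CUDA sketch of just that pattern (BlockSum is illustrative, not from the commit):

// Each block reduces in[0..n) to a single value in *out.
template <int blockSize>
__global__ void BlockSum(const float* in, float* out, int n) {
  __shared__ float buf[blockSize];
  int tid = threadIdx.x;
  float sum = 0.0f;
  for (int i = tid; i < n; i += blockSize) sum += in[i];  // strided partial sum
  buf[tid] = sum;
  __syncthreads();
  for (int s = blockSize / 2; s > 0; s >>= 1) {  // fold halves: tree reduction
    if (tid < s) buf[tid] += buf[tid + s];
    __syncthreads();
  }
  if (tid == 0) *out = buf[0];
}

// Example launch, mirroring the removed kernels' configuration:
//   BlockSum<256><<<1, 256>>>(d_in, d_out, n);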
paddle/function/CosSimOp.cpp

Lines changed: 27 additions & 25 deletions
@@ -34,7 +34,6 @@ void CosSimForward<DEVICE_TYPE_CPU>(CpuMatrix* out_mat,
   CHECK(in2_mat->getHeight() == 1LU || in2_mat->getHeight() == num_samples);
   size_t inc = (in2_mat->getHeight() == 1LU) ? 0 : dim;
   for (size_t i = 0; i < num_samples; ++i, x += dim, y += inc) {
-    /// for each row, todo(tianbing), use TensorExpression square2 ?
     real square_sum_x = 0;
     real square_sum_y = 0;
     real xy = 0;
@@ -147,12 +146,15 @@ void CosSimBackward<DEVICE_TYPE_CPU>(const CpuMatrix* out_grad,
 }
 
 /**
- * \param inputs[0] output value 1, size: nSamples * 1.
- * \param inputs[1] input value 1, size: nSamples * dim.
- * \param inputs[2] input value 2, size: n2 * dim (n2 == 1 or n2 == nSamples).
- * \param inputs[3] input grad 1, size: nSamples * dim.
- * \param inputs[4] input grad 2, size: n2 * dim (n2 == 1 or n2 == nSamples).
- * \param outputs[0] output grad, size : nSamples * 1.
+ * \param inouts[0] forward input grad 1, size: nSamples * dim.
+ * \param inouts[1] forward input grad 2,
+ *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
+ *
+ * \param inputs[0] backward loss output grad, size : nSamples * 1.
+ * \param inputs[1] forward output value, size: nSamples * 1.
+ * \param inputs[2] forward input value 1, size: nSamples * dim.
+ * \param inputs[3] forward input value 2,
+ *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
  */
 template <DeviceType Device>
 class CosSimBackwardFunc : public FunctionBase {
@@ -163,35 +165,35 @@ class CosSimBackwardFunc : public FunctionBase {
   void calc(const Arguments& inputs,
             const Arguments& outputs,
             const Arguments& inouts) override {
-    CHECK_EQ(inputs.size(), 5);
-    CHECK_EQ(outputs.size(), 1);
-    CHECK_EQ(inouts.size(), 0);
+    CHECK_EQ(inputs.size(), 4);
+    CHECK_EQ(outputs.size(), 0);
+    CHECK_EQ(inouts.size(), 2);
     /// dim of out_grad and out_val == 1, column vector
-    CHECK_EQ(outputs[0].dims_[1], 1UL);
     CHECK_EQ(inputs[0].dims_[1], 1UL);
+    CHECK_EQ(inputs[1].dims_[1], 1UL);
     /// nSamples of out_grad == out_val == in_val1 == in_grad1
-    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
-    CHECK_EQ(inputs[1].dims_[0], outputs[0].dims_[0]);
-    CHECK_EQ(inputs[3].dims_[0], outputs[0].dims_[0]);
+    CHECK_EQ(inputs[1].dims_[0], inputs[0].dims_[0]);
+    CHECK_EQ(inputs[0].dims_[0], inputs[0].dims_[0]);
+    CHECK_EQ(inouts[0].dims_[0], inputs[0].dims_[0]);
     /// dim of in1_val1 == in_val2 == in_grad1 == in_grad2
-    CHECK_EQ(inputs[2].dims_[1], inputs[1].dims_[1]);
-    CHECK_EQ(inputs[3].dims_[1], inputs[1].dims_[1]);
-    CHECK_EQ(inputs[4].dims_[1], inputs[1].dims_[1]);
+    CHECK_EQ(inputs[3].dims_[1], inputs[2].dims_[1]);
+    CHECK_EQ(inouts[0].dims_[1], inputs[2].dims_[1]);
+    CHECK_EQ(inouts[1].dims_[1], inputs[2].dims_[1]);
 
-    CHECK(outputs[0].getData() && inputs[0].getData() && inputs[1].getData() &&
-          inputs[2].getData() && inputs[3].getData() && inputs[4].getData());
+    CHECK(inputs[0].getData() && inputs[1].getData() && inputs[2].getData() &&
+          inputs[3].getData() && inouts[0].getData() && inouts[1].getData());
     const auto out_grad = std::make_shared<typename MatrixT<Device>::type>(
-        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
-    const auto out_val = std::make_shared<typename MatrixT<Device>::type>(
         inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
-    const auto in1_val = std::make_shared<typename MatrixT<Device>::type>(
+    const auto out_val = std::make_shared<typename MatrixT<Device>::type>(
         inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
-    const auto in2_val = std::make_shared<typename MatrixT<Device>::type>(
+    const auto in1_val = std::make_shared<typename MatrixT<Device>::type>(
         inputs[2].getData(), inputs[2].dims_[0], inputs[2].dims_[1]);
-    auto in1_grad = std::make_shared<typename MatrixT<Device>::type>(
+    const auto in2_val = std::make_shared<typename MatrixT<Device>::type>(
         inputs[3].getData(), inputs[3].dims_[0], inputs[3].dims_[1]);
+    auto in1_grad = std::make_shared<typename MatrixT<Device>::type>(
+        inouts[0].getData(), inouts[0].dims_[0], inouts[0].dims_[1]);
     auto in2_grad = std::make_shared<typename MatrixT<Device>::type>(
-        inputs[4].getData(), inputs[4].dims_[0], inputs[4].dims_[1]);
+        inouts[1].getData(), inouts[1].dims_[0], inouts[1].dims_[1]);
 
     CosSimBackward<Device>(out_grad.get(),
                            out_val.get(),

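The refactor moves the two writable gradient buffers out of inputs and into inouts, so calc now takes four read-only inputs and two in/out gradients, with the CHECKs and matrix wrappers renumbered to match. For the update those buffers receive: with c = scale * <x, y> / (|x| |y|), the row gradients are dL/dx_j += dL/dc * c * (y_j / <x, y> - x_j / <x, x>), and symmetrically for y. A hypothetical single-row CPU sketch of that update, mirroring the new in/out layout (names illustrative; no broadcast or <x, y> == 0 handling):

// x_grad / y_grad play the role of inouts[0] / inouts[1] for one sample;
// out_val is the forward cosine value c (scale already folded in).
void cossim_backward_reference(float out_grad, float out_val,
                               const float* x, const float* y,
                               float* x_grad, float* y_grad, int dim) {
  float xx = 0, yy = 0, xy = 0;
  for (int j = 0; j < dim; ++j) {  // recompute the three inner products
    xx += x[j] * x[j];
    yy += y[j] * y[j];
    xy += x[j] * y[j];
  }
  for (int j = 0; j < dim; ++j) {  // accumulate into the in/out gradients
    x_grad[j] += out_val * out_grad * (y[j] / xy - x[j] / xx);
    y_grad[j] += out_val * out_grad * (x[j] / xy - y[j] / yy);
  }
}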
paddle/function/CosSimOp.h

Lines changed: 10 additions & 10 deletions
@@ -25,9 +25,9 @@ namespace paddle {
  * = scale * \sum_j (in1[i][j] * in2[i][j]) /
  *          sqrt(sum_j (in1[i][j]^2) * sum_j (in2[i][j])^2)
  *
- * \param[out]  output  output data.
- * \param[in]   intput1 input data.
- * \param[in]   intput2 input data.
+ * \param[out]  output  output value.
+ * \param[in]   intput1 input value.
+ * \param[in]   intput2 input value.
  * \param[in]   scale   default 1.0.
  *
  */
@@ -40,13 +40,13 @@ void CosSimForward(typename MatrixT<Device>::type* output,
 /**
  * \brief Cosine Similarity BackWard for Derivative.
  *
- * \param[out]  output1 backward loss output grad.
- * \param[in]   input1  forward-output value.
- * \param[in]   input2  forward input value 1.
- * \param[in]   input3  forward input value 2.
- * \param[in]   input4  forward input grad 1.
- * \param[in]   input5  forward input grad 2.
- * \param[in]   scale   default 1.0.
+ * \param[in]     output grad  backward loss output grad.
+ * \param[in]     output val   forward-output value.
+ * \param[in]     input val1   forward input value 1.
+ * \param[in]     input val2   forward input value 2.
+ * \param[in/out] input grad   forward input grad 1.
+ * \param[in/out] input grad   forward input grad 2.
+ * \param[in]     scale        default 1.0.
  *
  */
 template <DeviceType Device>

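For readability, the forward formula from the CosSimForward doc comment above, typeset in LaTeX (same quantities, nothing new):

\mathrm{out}[i] \;=\; \mathrm{scale} \cdot
  \frac{\sum_j \mathrm{in1}[i][j]\,\mathrm{in2}[i][j]}
       {\sqrt{\sum_j \mathrm{in1}[i][j]^2}\,\sqrt{\sum_j \mathrm{in2}[i][j]^2}}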