
Commit ffc96cc

1 parent 0c7e543 commit ffc96cc

File tree

include/caffe/layers/mish_layer.hpp
src/caffe/layers/mish_layer.cpp
src/caffe/layers/mish_layer.cu
src/caffe/proto/caffe.proto

4 files changed: 238 additions, 1 deletion
include/caffe/layers/mish_layer.hpp

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
#ifndef CAFFE_MISH_LAYER_HPP_
#define CAFFE_MISH_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

#include "caffe/layers/neuron_layer.hpp"

namespace caffe {

/**
 * @brief Mish non-linearity @f$
 *        y = x \tanh(\ln(1 + \exp(x)))
 *        @f$, a smooth, self-gated activation (used, e.g., in YOLOv4).
 *
 * Unlike the sigmoid, it does not saturate for large positive inputs;
 * unlike ReLU, it is smooth and keeps small negative values.
 */
template <typename Dtype>
class MishLayer : public NeuronLayer<Dtype> {
 public:
  explicit MishLayer(const LayerParameter& param)
      : NeuronLayer<Dtype>(param) {}

  virtual inline const char* type() const { return "Mish"; }

 protected:
  /**
   * @param bottom input Blob vector (length 1)
   *   -# @f$ (N \times C \times H \times W) @f$
   *      the inputs @f$ x @f$
   * @param top output Blob vector (length 1)
   *   -# @f$ (N \times C \times H \times W) @f$
   *      the computed outputs @f$
   *        y = x \tanh(\ln(1 + \exp(x)))
   *      @f$
   */
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  /**
   * @brief Computes the error gradient w.r.t. the Mish inputs.
   *
   * @param top output Blob vector (length 1), providing the error gradient with
   *      respect to the outputs
   *   -# @f$ (N \times C \times H \times W) @f$
   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
   *      with respect to computed outputs @f$ y @f$
   * @param propagate_down see Layer::Backward.
   * @param bottom input Blob vector (length 1)
   *   -# @f$ (N \times C \times H \times W) @f$
   *      the inputs @f$ x @f$; Backward fills their diff with
   *      gradients @f$
   *        \frac{\partial E}{\partial x} = \frac{\partial E}{\partial y}
   *          \left( \tanh(sp) + x \left(1 - \tanh^2(sp)\right) \sigma(x) \right),
   *        \quad sp = \ln(1 + \exp(x))
   *      @f$ if propagate_down[0]
   */
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
};

}  // namespace caffe

#endif  // CAFFE_MISH_LAYER_HPP_
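For reference (not part of the commit): the formulas the declarations above describe, written out once as a short derivation under the standard Mish definition, so the CPU and GPU implementations below are easier to follow.

\[
y = x \tanh\bigl(\mathrm{softplus}(x)\bigr), \qquad
\mathrm{softplus}(x) = \ln\bigl(1 + e^{x}\bigr)
\]
\[
\frac{\partial y}{\partial x}
  = \tanh(sp) + x \bigl(1 - \tanh^{2}(sp)\bigr)\,\sigma(x), \qquad
sp = \mathrm{softplus}(x), \quad \sigma(x) = \frac{1}{1 + e^{-x}}
\]

The \(\sigma(x)\) factor is the derivative of the softplus; since \(e^{-sp} = 1/(1 + e^{x})\), it can equally be written as \(1 - e^{-sp}\), which is the grad_sp term that appears in the code below.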

src/caffe/layers/mish_layer.cpp

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
#include <cmath>
#include <vector>

#include "caffe/layers/mish_layer.hpp"

namespace caffe {

// Scalar helpers for y = x * tanh(softplus(x)); the threshold keeps the
// softplus numerically stable for large-magnitude inputs.
template <typename Dtype>
inline Dtype tanh_activate(Dtype x) { return (2 / (1 + expf(-2 * x)) - 1); }

template <typename Dtype>
inline Dtype softplus_activate(Dtype x, float threshold) {
  if (x > threshold) return x;              // too large: softplus(x) ~ x
  else if (x < -threshold) return expf(x);  // too small: softplus(x) ~ exp(x)
  return logf(expf(x) + 1);
}

template <typename Dtype>
void MishLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->cpu_data();
  Dtype* top_data = top[0]->mutable_cpu_data();
  const int count = bottom[0]->count();

  const float MISH_THRESHOLD = 20;
  for (int i = 0; i < count; ++i) {
    float x_val = bottom_data[i];
    top_data[i] = x_val * tanh_activate(softplus_activate(x_val, MISH_THRESHOLD));
  }
}
template <typename Dtype>
void MishLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  if (propagate_down[0]) {
    const Dtype* top_diff = top[0]->cpu_diff();
    // The Mish gradient is a function of the layer input x, not of the
    // output, so it is evaluated on the bottom data (as in the referenced
    // implementations below).
    const Dtype* bottom_data = bottom[0]->cpu_data();
    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
    const int count = bottom[0]->count();
    const float MISH_THRESHOLD = 20.0f;
    // implementation from TensorFlow: https://github.com/tensorflow/addons/commit/093cdfa85d334cbe19a37624c33198f3140109ed
    // implementation from Pytorch: https://github.com/thomasbrandon/mish-cuda/blob/master/csrc/mish.h#L26-L31
    for (int i = 0; i < count; ++i) {
      const Dtype inp = bottom_data[i];
      const Dtype sp = softplus_activate(inp, MISH_THRESHOLD);
      const Dtype grad_sp = 1 - exp(-sp);  // == d softplus(x)/dx == sigmoid(x)
      const Dtype tsp = tanh(sp);
      const Dtype grad_tsp = (1 - tsp * tsp) * grad_sp;
      const Dtype grad = inp * grad_tsp + tsp;
      bottom_diff[i] = top_diff[i] * grad;
    }
  }
}

#ifdef CPU_ONLY
STUB_GPU(MishLayer);
#endif

INSTANTIATE_CLASS(MishLayer);
REGISTER_LAYER_CLASS(Mish);

}  // namespace caffe
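As a sanity check (not part of the commit), a minimal standalone C++ sketch of the same scalar math: it mirrors tanh_activate / softplus_activate above in double precision and compares the analytic gradient used in Backward_cpu against a central finite difference. The helper names mish and mish_grad are illustrative, not Caffe APIs.

#include <cmath>
#include <cstdio>

// Forward: y = x * tanh(softplus(x)), with the same thresholding as above.
static double mish(double x, double threshold = 20.0) {
  double sp = (x > threshold) ? x
            : (x < -threshold) ? std::exp(x)
            : std::log(std::exp(x) + 1.0);
  return x * std::tanh(sp);
}

// Analytic gradient, matching the expression in Backward_cpu.
static double mish_grad(double x, double threshold = 20.0) {
  double sp = (x > threshold) ? x
            : (x < -threshold) ? std::exp(x)
            : std::log(std::exp(x) + 1.0);
  double grad_sp = 1.0 - std::exp(-sp);  // ~ sigmoid(x)
  double tsp = std::tanh(sp);
  return x * (1.0 - tsp * tsp) * grad_sp + tsp;
}

int main() {
  const double xs[] = {-6.0, -1.0, -0.1, 0.0, 0.5, 3.0, 25.0};
  const double eps = 1e-5;
  for (double x : xs) {
    double fd = (mish(x + eps) - mish(x - eps)) / (2.0 * eps);
    std::printf("x=%6.2f  y=%9.6f  grad=%9.6f  finite-diff=%9.6f\n",
                x, mish(x), mish_grad(x), fd);
  }
  return 0;
}

The grad and finite-diff columns should agree to several decimal places at every test point.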

src/caffe/layers/mish_layer.cu

Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
#include <cmath>
#include <vector>

#include "caffe/layers/mish_layer.hpp"

namespace caffe {

// Device-side counterparts of the CPU helpers in mish_layer.cpp.
template <typename Dtype>
__device__ Dtype tanh_activate_kernel(Dtype x) { return (2 / (1 + expf(-2 * x)) - 1); }

template <typename Dtype>
__device__ Dtype softplus_kernel(Dtype x, float threshold = 20) {
  if (x > threshold) return x;              // too large
  else if (x < -threshold) return expf(x);  // too small
  return logf(expf(x) + 1);
}

template <typename Dtype>
__global__ void MishForward(const int n, const Dtype* in, Dtype* out) {
  CUDA_KERNEL_LOOP(index, n) {
    out[index] = in[index] * tanh_activate_kernel(softplus_kernel(in[index]));
  }
}

template <typename Dtype>
void MishLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->gpu_data();
  Dtype* top_data = top[0]->mutable_gpu_data();
  const int count = bottom[0]->count();
  // NOLINT_NEXT_LINE(whitespace/operators)
  MishForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
      count, bottom_data, top_data);
  CUDA_POST_KERNEL_CHECK;
}
template <typename Dtype>
__global__ void MishBackward(const int n, const Dtype* in_diff,
    const Dtype* in_data, Dtype* out_diff) {
  CUDA_KERNEL_LOOP(index, n) {
    const float MISH_THRESHOLD = 20.0f;
    // Gradient of y = x * tanh(softplus(x)), evaluated at the layer input x.
    // implementation from TensorFlow: https://github.com/tensorflow/addons/blob/093cdfa85d334cbe19a37624c33198f3140109ed/tensorflow_addons/custom_ops/activations/cc/kernels/mish_op.h#L66-L80
    // implementation from Pytorch: https://github.com/thomasbrandon/mish-cuda/blob/master/csrc/mish.h#L26-L31
    const float inp = in_data[index];
    const float sp = softplus_kernel(inp, MISH_THRESHOLD);
    const float grad_sp = 1 - expf(-sp);
    const float tsp = tanhf(sp);
    const float grad_tsp = (1 - tsp * tsp) * grad_sp;
    const float grad = inp * grad_tsp + tsp;
    out_diff[index] = in_diff[index] * grad;
  }
}

template <typename Dtype>
void MishLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  if (propagate_down[0]) {
    const Dtype* top_diff = top[0]->gpu_diff();
    // As in the CPU path, the gradient is a function of the input x, so the
    // kernel reads the bottom data rather than the layer output.
    const Dtype* bottom_data = bottom[0]->gpu_data();
    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
    const int count = bottom[0]->count();
    // NOLINT_NEXT_LINE(whitespace/operators)
    MishBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
        count, top_diff, bottom_data, bottom_diff);
    CUDA_POST_KERNEL_CHECK;
  }
}

INSTANTIATE_LAYER_GPU_FUNCS(MishLayer);
}  // namespace caffe

src/caffe/proto/caffe.proto

Lines changed: 6 additions & 1 deletion
@@ -460,7 +460,7 @@ message ParamSpec {
 // NOTE
 // Update the next available ID when you add a new LayerParameter field.
 //
-// LayerParameter next available layer-specific ID: 270 (last added: maskrcnn_detection_param)
+// LayerParameter next available layer-specific ID: 271 (last added: mish_param)
 message LayerParameter {
   optional string name = 1; // the layer name
   optional string type = 2; // the layer type
@@ -614,6 +614,7 @@ message LayerParameter {
   optional YoloV2LossParameter yolo_v2_loss_param = 198;
   optional YoloV3LossParameter yolo_v3_loss_param = 199;
   optional UpsampleDarknetParameter upsample_darknet_param = 209; // Darknet layer used in yolov3
+  optional MishParameter mish_param = 270; // yolov4

   //TensorFlow related
   optional DepthToSpaceParameter depth_to_space_param = 208;
@@ -3316,3 +3317,7 @@ message MaskRCNNProposalParameter {
   optional uint32 width = 8[default = 1024];
 }

+message MishParameter {
+  optional float MISH_THRESHOLD = 1;
+}
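For illustration (not part of the commit), a network prototxt would reference the new layer through its registered type name; the layer and blob names here are hypothetical. Note that the Forward/Backward implementations above hard-code the softplus threshold to 20 and never read mish_param, so MishParameter is effectively a reserved placeholder in this commit.

layer {
  name: "conv1_mish"   # hypothetical
  type: "Mish"         # registered by REGISTER_LAYER_CLASS(Mish)
  bottom: "conv1"
  top: "conv1_mish"
  # mish_param { MISH_THRESHOLD: 20 }  # accepted by the parser, but currently unused
}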
