
Commit ffc96cc

1 parent 0c7e543 commit ffc96cc

File tree

include/caffe/layers/mish_layer.hpp
src/caffe/layers/mish_layer.cpp
src/caffe/layers/mish_layer.cu
src/caffe/proto/caffe.proto

4 files changed: 238 additions, 1 deletion
include/caffe/layers/mish_layer.hpp

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
#ifndef CAFFE_MISH_LAYER_HPP_
#define CAFFE_MISH_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

#include "caffe/layers/neuron_layer.hpp"

namespace caffe {

/**
 * @brief Mish non-linearity @f$
 *        y = x \tanh(\ln(1 + \exp(x)))
 *        @f$, a smooth, self-gated activation (used, e.g., in YOLOv4).
 *
 * Unlike the sigmoid, it does not saturate for large positive inputs;
 * unlike ReLU, it is smooth and keeps small negative values.
 */
template <typename Dtype>
class MishLayer : public NeuronLayer<Dtype> {
 public:
  explicit MishLayer(const LayerParameter& param)
      : NeuronLayer<Dtype>(param) {}

  virtual inline const char* type() const { return "Mish"; }

 protected:
  /**
   * @param bottom input Blob vector (length 1)
   *   -# @f$ (N \times C \times H \times W) @f$
   *      the inputs @f$ x @f$
   * @param top output Blob vector (length 1)
   *   -# @f$ (N \times C \times H \times W) @f$
   *      the computed outputs @f$
   *        y = x \tanh(\ln(1 + \exp(x)))
   *      @f$
   */
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  /**
   * @brief Computes the error gradient w.r.t. the Mish inputs.
   *
   * @param top output Blob vector (length 1), providing the error gradient with
   *      respect to the outputs
   *   -# @f$ (N \times C \times H \times W) @f$
   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
   *      with respect to computed outputs @f$ y @f$
   * @param propagate_down see Layer::Backward.
   * @param bottom input Blob vector (length 1)
   *   -# @f$ (N \times C \times H \times W) @f$
   *      the inputs @f$ x @f$; Backward fills their diff with
   *      gradients @f$
   *        \frac{\partial E}{\partial x} = \frac{\partial E}{\partial y}
   *          \left( \tanh(sp) + x \left(1 - \tanh^2(sp)\right) \sigma(x) \right),
   *        \quad sp = \ln(1 + \exp(x))
   *      @f$ if propagate_down[0]
   */
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
};

}  // namespace caffe

#endif  // CAFFE_MISH_LAYER_HPP_
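For reference (not part of the commit): the formulas the declarations above describe, written out once as a short derivation under the standard Mish definition, so the CPU and GPU implementations below are easier to follow.

\[
y = x \tanh\bigl(\mathrm{softplus}(x)\bigr), \qquad
\mathrm{softplus}(x) = \ln\bigl(1 + e^{x}\bigr)
\]
\[
\frac{\partial y}{\partial x}
  = \tanh(sp) + x \bigl(1 - \tanh^{2}(sp)\bigr)\,\sigma(x), \qquad
sp = \mathrm{softplus}(x), \quad \sigma(x) = \frac{1}{1 + e^{-x}}
\]

The \(\sigma(x)\) factor is the derivative of the softplus; since \(e^{-sp} = 1/(1 + e^{x})\), it can equally be written as \(1 - e^{-sp}\), which is the grad_sp term that appears in the code below.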

src/caffe/layers/mish_layer.cpp

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
#include <cmath>
#include <vector>

#include "caffe/layers/mish_layer.hpp"

namespace caffe {

// Scalar helpers for y = x * tanh(softplus(x)); the threshold keeps the
// softplus numerically stable for large-magnitude inputs.
template <typename Dtype>
inline Dtype tanh_activate(Dtype x) { return (2 / (1 + expf(-2 * x)) - 1); }

template <typename Dtype>
inline Dtype softplus_activate(Dtype x, float threshold) {
  if (x > threshold) return x;              // too large: softplus(x) ~ x
  else if (x < -threshold) return expf(x);  // too small: softplus(x) ~ exp(x)
  return logf(expf(x) + 1);
}

template <typename Dtype>
void MishLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->cpu_data();
  Dtype* top_data = top[0]->mutable_cpu_data();
  const int count = bottom[0]->count();

  const float MISH_THRESHOLD = 20;
  for (int i = 0; i < count; ++i) {
    float x_val = bottom_data[i];
    top_data[i] = x_val * tanh_activate(softplus_activate(x_val, MISH_THRESHOLD));
  }
}
template <typename Dtype>
void MishLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  if (propagate_down[0]) {
    const Dtype* top_diff = top[0]->cpu_diff();
    // The Mish gradient is a function of the layer input x, not of the
    // output, so it is evaluated on the bottom data (as in the referenced
    // implementations below).
    const Dtype* bottom_data = bottom[0]->cpu_data();
    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
    const int count = bottom[0]->count();
    const float MISH_THRESHOLD = 20.0f;
    // implementation from TensorFlow: https://github.com/tensorflow/addons/commit/093cdfa85d334cbe19a37624c33198f3140109ed
    // implementation from Pytorch: https://github.com/thomasbrandon/mish-cuda/blob/master/csrc/mish.h#L26-L31
    for (int i = 0; i < count; ++i) {
      const Dtype inp = bottom_data[i];
      const Dtype sp = softplus_activate(inp, MISH_THRESHOLD);
      const Dtype grad_sp = 1 - exp(-sp);  // == d softplus(x)/dx == sigmoid(x)
      const Dtype tsp = tanh(sp);
      const Dtype grad_tsp = (1 - tsp * tsp) * grad_sp;
      const Dtype grad = inp * grad_tsp + tsp;
      bottom_diff[i] = top_diff[i] * grad;
    }
  }
}

#ifdef CPU_ONLY
STUB_GPU(MishLayer);
#endif

INSTANTIATE_CLASS(MishLayer);
REGISTER_LAYER_CLASS(Mish);

}  // namespace caffe
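As a sanity check (not part of the commit), a minimal standalone C++ sketch of the same scalar math: it mirrors tanh_activate / softplus_activate above in double precision and compares the analytic gradient used in Backward_cpu against a central finite difference. The helper names mish and mish_grad are illustrative, not Caffe APIs.

#include <cmath>
#include <cstdio>

// Forward: y = x * tanh(softplus(x)), with the same thresholding as above.
static double mish(double x, double threshold = 20.0) {
  double sp = (x > threshold) ? x
            : (x < -threshold) ? std::exp(x)
            : std::log(std::exp(x) + 1.0);
  return x * std::tanh(sp);
}

// Analytic gradient, matching the expression in Backward_cpu.
static double mish_grad(double x, double threshold = 20.0) {
  double sp = (x > threshold) ? x
            : (x < -threshold) ? std::exp(x)
            : std::log(std::exp(x) + 1.0);
  double grad_sp = 1.0 - std::exp(-sp);  // ~ sigmoid(x)
  double tsp = std::tanh(sp);
  return x * (1.0 - tsp * tsp) * grad_sp + tsp;
}

int main() {
  const double xs[] = {-6.0, -1.0, -0.1, 0.0, 0.5, 3.0, 25.0};
  const double eps = 1e-5;
  for (double x : xs) {
    double fd = (mish(x + eps) - mish(x - eps)) / (2.0 * eps);
    std::printf("x=%6.2f  y=%9.6f  grad=%9.6f  finite-diff=%9.6f\n",
                x, mish(x), mish_grad(x), fd);
  }
  return 0;
}

The grad and finite-diff columns should agree to several decimal places at every test point.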

src/caffe/layers/mish_layer.cu

Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
#include <cmath>
#include <vector>

#include "caffe/layers/mish_layer.hpp"

namespace caffe {

// Device-side counterparts of the CPU helpers in mish_layer.cpp.
template <typename Dtype>
__device__ Dtype tanh_activate_kernel(Dtype x) { return (2 / (1 + expf(-2 * x)) - 1); }

template <typename Dtype>
__device__ Dtype softplus_kernel(Dtype x, float threshold = 20) {
  if (x > threshold) return x;              // too large
  else if (x < -threshold) return expf(x);  // too small
  return logf(expf(x) + 1);
}

template <typename Dtype>
__global__ void MishForward(const int n, const Dtype* in, Dtype* out) {
  CUDA_KERNEL_LOOP(index, n) {
    out[index] = in[index] * tanh_activate_kernel(softplus_kernel(in[index]));
  }
}

template <typename Dtype>
void MishLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->gpu_data();
  Dtype* top_data = top[0]->mutable_gpu_data();
  const int count = bottom[0]->count();
  // NOLINT_NEXT_LINE(whitespace/operators)
  MishForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
      count, bottom_data, top_data);
  CUDA_POST_KERNEL_CHECK;
}
template <typename Dtype>
__global__ void MishBackward(const int n, const Dtype* in_diff,
    const Dtype* in_data, Dtype* out_diff) {
  CUDA_KERNEL_LOOP(index, n) {
    const float MISH_THRESHOLD = 20.0f;
    // Gradient of y = x * tanh(softplus(x)), evaluated at the layer input x.
    // implementation from TensorFlow: https://github.com/tensorflow/addons/blob/093cdfa85d334cbe19a37624c33198f3140109ed/tensorflow_addons/custom_ops/activations/cc/kernels/mish_op.h#L66-L80
    // implementation from Pytorch: https://github.com/thomasbrandon/mish-cuda/blob/master/csrc/mish.h#L26-L31
    const float inp = in_data[index];
    const float sp = softplus_kernel(inp, MISH_THRESHOLD);
    const float grad_sp = 1 - expf(-sp);
    const float tsp = tanhf(sp);
    const float grad_tsp = (1 - tsp * tsp) * grad_sp;
    const float grad = inp * grad_tsp + tsp;
    out_diff[index] = in_diff[index] * grad;
  }
}

template <typename Dtype>
void MishLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  if (propagate_down[0]) {
    const Dtype* top_diff = top[0]->gpu_diff();
    // As in the CPU path, the gradient is a function of the input x, so the
    // kernel reads the bottom data rather than the layer output.
    const Dtype* bottom_data = bottom[0]->gpu_data();
    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
    const int count = bottom[0]->count();
    // NOLINT_NEXT_LINE(whitespace/operators)
    MishBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
        count, top_diff, bottom_data, bottom_diff);
    CUDA_POST_KERNEL_CHECK;
  }
}

INSTANTIATE_LAYER_GPU_FUNCS(MishLayer);
}  // namespace caffe

src/caffe/proto/caffe.proto

Lines changed: 6 additions & 1 deletion
@@ -460,7 +460,7 @@ message ParamSpec {
 // NOTE
 // Update the next available ID when you add a new LayerParameter field.
 //
-// LayerParameter next available layer-specific ID: 270 (last added: maskrcnn_detection_param)
+// LayerParameter next available layer-specific ID: 271 (last added: mish_param)
 message LayerParameter {
   optional string name = 1; // the layer name
   optional string type = 2; // the layer type
@@ -614,6 +614,7 @@ message LayerParameter {
   optional YoloV2LossParameter yolo_v2_loss_param = 198;
   optional YoloV3LossParameter yolo_v3_loss_param = 199;
   optional UpsampleDarknetParameter upsample_darknet_param = 209; // Darknet layer used in yolov3
+  optional MishParameter mish_param = 270; // yolov4

   //TensorFlow related
   optional DepthToSpaceParameter depth_to_space_param = 208;
@@ -3316,3 +3317,7 @@ message MaskRCNNProposalParameter {
   optional uint32 width = 8[default = 1024];
 }

+message MishParameter {
+  optional float MISH_THRESHOLD = 1;
+}
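For illustration (not part of the commit), a network prototxt would reference the new layer through its registered type name; the layer and blob names here are hypothetical. Note that the Forward/Backward implementations above hard-code the softplus threshold to 20 and never read mish_param, so MishParameter is effectively a reserved placeholder in this commit.

layer {
  name: "conv1_mish"   # hypothetical
  type: "Mish"         # registered by REGISTER_LAYER_CLASS(Mish)
  bottom: "conv1"
  top: "conv1_mish"
  # mish_param { MISH_THRESHOLD: 20 }  # accepted by the parser, but currently unused
}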
