diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.cpp
index 157252681c25c7..965a767507179c 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.cpp
@@ -80,6 +80,7 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_
     reg64_t reg_ow_pos = rdx;
     reg64_t aux_reg_output = reg_ow_pos;
     reg64_t reg_dg_iter = reg_output;
+    reg64_t reg_gr_iter = rsp;
     reg64_t aux_reg_input = rax;
     reg64_t aux2_reg_input = reg_kernel;
     reg64_t reg_ic_iter = rbx;
@@ -163,6 +164,10 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_
         for (size_t d = 0; d < vlen / sizeof(int32_t); ++d) {
             dd(1);
         }
+
+        for (size_t d = 0; d < vlen / sizeof(int32_t); ++d) {
+            dd(-1);
+        }
     }

     void apply_filter(int ow_step, int oc_blocks_step, int oc_step, int ic_step) {
@@ -359,16 +364,32 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_
                 cvtsi2ss(xmm_ih_im, reg_tmp_32);
                 addss(xmm_ih_im, xmm_map_h);

-                movss(xmm_tmp, xmm_ih_im);
-                cmpss(xmm_tmp, table_val(0), 1);
-                movq(reg_tmp_64, xmm_tmp);
-                cmp(reg_tmp_32, 0);
-                jne(init_with_zeros, T_NEAR);
+                if (jcp_.with_bi_pad) {
+                    movss(xmm_tmp, xmm_ih_im);
+                    cvtps2dq(xmm_tmp, xmm_tmp);
+                    cmpss(xmm_tmp, table_val(6), 0x0e);
+                    movq(reg_tmp_64, xmm_tmp);
+                    cmp(reg_tmp_32, 0);
+                    jne(init_with_zeros, T_NEAR);
+
+                    movss(xmm_tmp, xmm_ih_im);
+                    cvtps2dq(xmm_tmp, xmm_tmp);
+                    cmpss(xmm_tmp, table_val(1), 1);
+                    movq(reg_tmp_64, xmm_tmp);
+                    cmp(reg_tmp_32, 0);
+                    je(init_with_zeros, T_NEAR);
+                } else {
+                    movss(xmm_tmp, xmm_ih_im);
+                    cmpss(xmm_tmp, table_val(0), 1);
+                    movq(reg_tmp_64, xmm_tmp);
+                    cmp(reg_tmp_32, 0);
+                    jne(init_with_zeros, T_NEAR);

-                cmpss(xmm_ih_im, table_val(1), 1);
-                movq(reg_tmp_64, xmm_ih_im);
-                cmp(reg_tmp_32, 0);
-                je(init_with_zeros, T_NEAR);
+                    cmpss(xmm_ih_im, table_val(1), 1);
+                    movq(reg_tmp_64, xmm_ih_im);
+                    cmp(reg_tmp_32, 0);
+                    je(init_with_zeros, T_NEAR);
+                }

                 size_t def_off_w = ((2 * (kh * jcp_.kw + kw) + 1) * jcp_.oh * jcp_.ow) + ow;
@@ -387,16 +408,33 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_
                 cvtsi2ss(xmm_iw_im, reg_tmp_32);
                 addss(xmm_iw_im, xmm_map_w);

-                movss(xmm_tmp, xmm_iw_im);
-                cmpss(xmm_tmp, table_val(0), 1);
-                movq(reg_tmp_64, xmm_tmp);
-                cmp(reg_tmp_32, 0);
-                jne(init_with_zeros, T_NEAR);
+                if (jcp_.with_bi_pad) {
+                    movss(xmm_tmp, xmm_iw_im);
+                    cvtps2dq(xmm_tmp, xmm_tmp);
+                    cmpss(xmm_tmp, table_val(6), 0x0e);
+                    movq(reg_tmp_64, xmm_tmp);
+                    cmp(reg_tmp_32, 0);
+                    jne(init_with_zeros, T_NEAR);
+
+                    movss(xmm_tmp, xmm_iw_im);
+                    cvtps2dq(xmm_tmp, xmm_tmp);
+                    cmpss(xmm_tmp, table_val(2), 1);
+                    movq(reg_tmp_64, xmm_tmp);
+                    cmp(reg_tmp_32, 0);
+                    je(init_with_zeros, T_NEAR);
+                } else {
+                    movss(xmm_tmp, xmm_iw_im);
+                    cmpss(xmm_tmp, table_val(0), 1);
+                    movq(reg_tmp_64, xmm_tmp);
+                    cmp(reg_tmp_32, 0);
+                    jne(init_with_zeros, T_NEAR);
+
+                    cmpss(xmm_iw_im, table_val(2), 1);
+                    movq(reg_tmp_64, xmm_iw_im);
+                    cmp(reg_tmp_32, 0);
+                    je(init_with_zeros, T_NEAR);
+                }
-                cmpss(xmm_iw_im, table_val(2), 1);
-                movq(reg_tmp_64, xmm_iw_im);
-                cmp(reg_tmp_32, 0);
-                je(init_with_zeros, T_NEAR);

                 // interpolation calculation
@@ -853,7 +891,6 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_
         L(oc_unrolled_loop); {
             cmp(reg_oc_work, jcp_.nb_oc_blocking * jcp_.oc_block);
             jl(oc_main_loop, T_NEAR);
-
             ic_loop(ow_step, jcp_.nb_oc_blocking, jcp_.oc_block);
             store_output(ow_step, jcp_.nb_oc_blocking, jcp_.oc_block);
@@ -869,7 +906,6 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_
         L(oc_main_loop); {
             cmp(reg_oc_work, jcp_.oc_block);
             jl(oc_tail, T_NEAR);
-
             ic_loop(ow_step, 1, jcp_.oc_block);
             store_output(ow_step, 1, jcp_.oc_block);
@@ -967,6 +1003,12 @@ void MKLDNNDeformableConvolutionNode::initSupportedPrimitiveDescriptors() {
     if (!supportedPrimitiveDescriptors.empty())
         return;

+    const int simd_w = mayiuse(cpu::x64::avx512_common) ? 16 : 8;
+    if (group != 1 && (((getParentEdgeAt(0)->getDims()[1] / group) % simd_w != 0)
+            || ((getChildEdgeAt(0)->getDims()[1] / group) % simd_w != 0))) {
+        enforceRef = true;
+    }
+
     size_t inputsNumber = getOriginalInputsNumber();
     InferenceEngine::LayerConfig config;
     config.dynBatchSupport = false;
@@ -987,7 +1029,9 @@ void MKLDNNDeformableConvolutionNode::initSupportedPrimitiveDescriptors() {
     config.outConfs[0].inPlace = -1;

     impl_desc_type impl_type;
-    if (mayiuse(cpu::x64::avx512_common)) {
+    if (enforceRef) {
+        impl_type = impl_desc_type::ref;
+    } else if (mayiuse(cpu::x64::avx512_common)) {
         impl_type = impl_desc_type::jit_avx512;
     } else if (mayiuse(cpu::x64::avx2)) {
         impl_type = impl_desc_type::jit_avx2;
@@ -997,8 +1041,8 @@ void MKLDNNDeformableConvolutionNode::initSupportedPrimitiveDescriptors() {
         impl_type = impl_desc_type::ref;
     }

-    if (mayiuse(cpu::x64::sse41)) {
-        // optimzed implementation
+    if (!enforceRef && mayiuse(cpu::x64::sse41)) {
+        // optimized implementation
         auto dataFormat = memory::format_tag::nhwc;
         auto offFormat = memory::format_tag::nchw;
         auto weiFormat = group > 1 ? mayiuse(avx512_common) ? memory::format_tag::gOIhw16i16o : memory::format_tag::gOIhw8i8o
@@ -1062,9 +1106,9 @@ void MKLDNNDeformableConvolutionNode::createPrimitive() {
     jcp.oh = dstDims[2];
     jcp.ow = dstDims[3];

-    bool with_groups = group > 1;
-    jcp.kh = weiDims[with_groups + 2];
-    jcp.kw = weiDims[with_groups + 3];
+//    bool with_groups = group > 1;
+    jcp.kh = weiDims[2];
+    jcp.kw = weiDims[3];

     jcp.t_pad = paddingL[0];
     jcp.l_pad = paddingL[1];
@@ -1097,7 +1141,9 @@ void MKLDNNDeformableConvolutionNode::createPrimitive() {
     jcp.nthr = dnnl_get_max_threads();

-    if (mayiuse(cpu::x64::avx512_common)) {
+    if (enforceRef) {
+        return;
+    } else if (mayiuse(cpu::x64::avx512_common)) {
         def_conv_kernel.reset(new jit_uni_def_conv_kernel_f32<cpu::x64::avx512_common>(jcp));
     } else if (mayiuse(cpu::x64::avx2)) {
         def_conv_kernel.reset(new jit_uni_def_conv_kernel_f32<cpu::x64::avx2>(jcp));
@@ -1147,7 +1193,7 @@ void MKLDNNDeformableConvolutionNode::executeReference(const float* src, const f
         for (int ic = 0; ic < IC; ic++) {
             const float *data_im_ptr = src + mb * src_strides[0] + (g * IC + ic) * src_strides[1] + h_in * src_strides[2] + w_in * src_strides[3];
-            const int deformable_group_index = ic / channel_per_deformable_group;
+            const int deformable_group_index = (IC * g + ic) / channel_per_deformable_group;
             const float *data_offset_ptr = offsets + mb * off_strides[0] + (deformable_group_index * 2 * KH * KW) * off_strides[1];
             const float *modulation_offset_ptr = nullptr;
             if (modulation != nullptr) {
@@ -1165,22 +1211,38 @@ void MKLDNNDeformableConvolutionNode::executeReference(const float* src, const f
                     const float h_im = h_in + map_h;  // absolute pixel index with offset
                     const float w_im = w_in + map_w;  // absolute pixel index with offset

-                    if (h_im >= 0 && w_im >= 0 && h_im < IH && w_im < IW) {
-                        const int cur_height = IH - h_in;
-                        const int cur_width = IW - w_in;
-                        int h_low = std::max(static_cast<int>(floorf(map_h)), 0);
-                        int w_low = std::max(static_cast<int>(floorf(map_w)), 0);
-                        int h_high = with_bi_pad ? h_low + 1 : std::min(static_cast<int>(ceilf(map_h)), cur_height - 1);
-                        int w_high = with_bi_pad ? w_low + 1 : std::min(static_cast<int>(ceilf(map_w)), cur_width - 1);
+                    bool skip_compute;
+                    if (with_bilinear_pad) {
+                        skip_compute = !(static_cast<int>(w_im) > -1 &&
+                                static_cast<int>(w_im) < IW &&
+                                static_cast<int>(h_im) > -1 &&
+                                static_cast<int>(h_im) < IH);
+                    } else {
+                        skip_compute = !(w_im >= 0 &&
+                                w_im < IW &&
+                                h_im >= 0 &&
+                                h_im < IH);
+                    }
+                    if (!skip_compute) {
+                        const int cur_h_end = IH - h_in;
+                        const int cur_w_end = IW - w_in;
+                        int h_low = with_bi_pad ? static_cast<int>(floorf(map_h)) :
+                                std::max(static_cast<int>(floorf(map_h)), 0);
+                        int w_low = with_bi_pad ? static_cast<int>(floorf(map_w)) :
+                                std::max(static_cast<int>(floorf(map_w)), 0);
+                        const int cur_h_start = h_low + h_in;
+                        const int cur_w_start = w_low + w_in;
+                        int h_high = with_bi_pad ? h_low + 1 : std::min(static_cast<int>(ceilf(map_h)), cur_h_end - 1);
+                        int w_high = with_bi_pad ? w_low + 1 : std::min(static_cast<int>(ceilf(map_w)), cur_w_end - 1);

                         float lh = map_h - h_low;
                         float lw = map_w - w_low;
                         float hh = 1 - lh, hw = 1 - lw;

-                        float v1 = (w_low >= 0 && h_low >= 0) ? data_im_ptr[h_low * src_strides[2] + w_low * src_strides[3]] : 0.0f;
-                        float v2 = (w_high < cur_width && h_low >= 0) ? data_im_ptr[h_low * src_strides[2] + w_high * src_strides[3]] : 0.0f;
-                        float v3 = (w_low >= 0 && h_high < cur_height) ? data_im_ptr[h_high * src_strides[2] + w_low * src_strides[3]] : 0.0f;
-                        float v4 = (w_high < cur_width && h_high < cur_height) ? data_im_ptr[h_high * src_strides[2] + w_high * src_strides[3]] : 0.0f;
+                        float v1 = (cur_w_start >= 0 && cur_h_start >= 0) ? data_im_ptr[h_low * src_strides[2] + w_low * src_strides[3]] : 0.0f;
+                        float v2 = (w_high < cur_w_end && cur_h_start >= 0) ? data_im_ptr[h_low * src_strides[2] + w_high * src_strides[3]] : 0.0f;
+                        float v3 = (cur_w_start >= 0 && h_high < cur_h_end) ? data_im_ptr[h_high * src_strides[2] + w_low * src_strides[3]] : 0.0f;
+                        float v4 = (w_high < cur_w_end && h_high < cur_h_end) ? data_im_ptr[h_high * src_strides[2] + w_high * src_strides[3]] : 0.0f;

                         float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
                         float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
@@ -1192,8 +1254,8 @@ void MKLDNNDeformableConvolutionNode::executeReference(const float* src, const f
                             modulation_scalar = modulation_offset_ptr[modulation_index];
                         }

-                        const float weight = with_groups ? weights[g * wei_strides[0] + oc * wei_strides[1] + ic * wei_strides[2] + kh * wei_strides[3] +
-                                                                   kw * wei_strides[4]]
+                        const float weight = with_groups ? weights[(g + oc / G) * wei_strides[0] + ic * wei_strides[1] + kh * wei_strides[2] +
+                                                                   kw * wei_strides[3]]
                                                          : weights[oc * wei_strides[0] + ic * wei_strides[1] + kh * wei_strides[2] + kw * wei_strides[3]];
                         d += val * weight * modulation_scalar;
                     }
@@ -1205,7 +1267,7 @@ void MKLDNNDeformableConvolutionNode::executeReference(const float* src, const f
     };

     parallel_nd(G, MB, OC, OH, OW,
-        [&](int g, int mb, int oc, int oh, int ow) {
+        [&](int g, int mb, int oc, int oh, int ow) {
             dst[mb * dst_strides[0] + (g * OC + oc) * dst_strides[1] + oh * dst_strides[2] + ow * dst_strides[3]] = ker(g, mb, oc, oh, ow);
     });
 }
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.h
index 5ffaff1fb3a316..d71946f77a3b51 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.h
@@ -79,6 +79,7 @@ class MKLDNNDeformableConvolutionNode : public MKLDNNNode {
     bool canBeInPlace() const override {
         return false;
     }
+    bool enforceRef = false;

     InferenceEngine::Precision getRuntimePrecision() const override;