Commit 2116fc1

tbb conv

1 parent 20c8657

File tree

2 files changed: +304 -2 lines changed

include/layers/ConvLayer.hpp

Lines changed: 283 additions & 0 deletions
@@ -1,4 +1,10 @@
 #pragma once
+#include <tbb/blocked_range.h>
+#include <tbb/blocked_range2d.h>
+#include <tbb/blocked_range3d.h>
+#include <tbb/parallel_for.h>
+#include <tbb/task_group.h>
+
 #include <cmath>
 #include <stdexcept>
 #include <thread>
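
The new headers pull in the oneTBB pieces used below: tbb::parallel_for driven by tbb::blocked_range and tbb::blocked_range2d (blocked_range3d and task_group are also included, though this commit does not use them yet). As a minimal standalone sketch of the pattern, with a hypothetical buffer name that is not part of this diff:

#include <tbb/blocked_range.h>
#include <tbb/parallel_for.h>
#include <cstddef>
#include <vector>

int main() {
  std::vector<float> data(1024, 1.0f);
  // TBB splits [0, data.size()) into sub-ranges and hands each one to a
  // worker thread; sub-ranges are disjoint, so no locking is needed.
  tbb::parallel_for(tbb::blocked_range<std::size_t>(0, data.size()),
                    [&](const tbb::blocked_range<std::size_t>& r) {
                      for (std::size_t i = r.begin(); i != r.end(); ++i) {
                        data[i] *= 2.0f;
                      }
                    });
  return 0;
}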
@@ -459,6 +465,152 @@ void Conv4DSTL(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
   output = make_tensor<ValueType>(one_d_vector, sh);
 }
 
+template <typename ValueType>
+void Conv4D_TBB(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
+                Tensor& output, size_t stride_, size_t pads_, size_t group_,
+                size_t dilations_) {
+  size_t batch_size = input.get_shape()[0];
+  size_t in_channels = input.get_shape()[1];
+  size_t in_height = input.get_shape()[2];
+  size_t in_width = input.get_shape()[3];
+
+  size_t kernel_out_channels = kernel_.get_shape()[0];
+  size_t kernel_in_channels = kernel_.get_shape()[1];
+  size_t kernel_height = kernel_.get_shape()[2];
+  size_t kernel_width = kernel_.get_shape()[3];
+
+  std::vector<std::vector<std::vector<std::vector<ValueType>>>> padded_input(
+      batch_size,
+      std::vector<std::vector<std::vector<ValueType>>>(
+          in_height + 2 * pads_,
+          std::vector<std::vector<ValueType>>(
+              in_width + 2 * pads_, std::vector<ValueType>(in_channels, 0))));
+
+  tbb::parallel_for(tbb::blocked_range<size_t>(0, batch_size),
+                    [&](const tbb::blocked_range<size_t>& range) {
+                      for (size_t b = range.begin(); b != range.end(); ++b) {
+                        for (size_t h = 0; h < in_height; ++h) {
+                          for (size_t w = 0; w < in_width; ++w) {
+                            for (size_t c = 0; c < in_channels; ++c) {
+                              padded_input[b][h + pads_][w + pads_][c] =
+                                  input.get<ValueType>({b, c, h, w});
+                            }
+                          }
+                        }
+                      }
+                    });
+
+  size_t dilated_kernel_height = kernel_height * dilations_ + 1 - dilations_;
+  size_t dilated_kernel_width = kernel_width * dilations_ + 1 - dilations_;
+
+  std::vector<std::vector<std::vector<std::vector<ValueType>>>> dil_kernel(
+      dilated_kernel_height,
+      std::vector<std::vector<std::vector<ValueType>>>(
+          dilated_kernel_width,
+          std::vector<std::vector<ValueType>>(
+              kernel_in_channels,
+              std::vector<ValueType>(kernel_out_channels, 0))));
+
+  tbb::parallel_for(
+      tbb::blocked_range<size_t>(0, kernel_out_channels),
+      [&](const tbb::blocked_range<size_t>& range) {
+        for (size_t oc = range.begin(); oc != range.end(); ++oc) {
+          for (size_t h = 0; h < kernel_height; ++h) {
+            for (size_t w = 0; w < kernel_width; ++w) {
+              for (size_t ic = 0; ic < kernel_in_channels; ++ic) {
+                dil_kernel[h * dilations_][w * dilations_][ic][oc] =
+                    kernel_.get<ValueType>({oc, ic, h, w});
+              }
+            }
+          }
+        }
+      });
+
+  auto ComputeConvOutputDim = [](size_t input_dim, size_t kernel_dim,
+                                 size_t stride, size_t pad, size_t dilation) {
+    size_t effective_kernel = (kernel_dim - 1) * dilation + 1;
+    return (input_dim + 2 * pad - effective_kernel) / stride + 1;
+  };
+
+  size_t out_height = ComputeConvOutputDim(in_height, kernel_height, stride_,
+                                           pads_, dilations_);
+  size_t out_width =
+      ComputeConvOutputDim(in_width, kernel_width, stride_, pads_, dilations_);
+
+  std::vector<std::vector<std::vector<std::vector<ValueType>>>> output_tensor(
+      batch_size, std::vector<std::vector<std::vector<ValueType>>>(
+                      kernel_out_channels,
+                      std::vector<std::vector<ValueType>>(
+                          out_height, std::vector<ValueType>(out_width, 0))));
+
+  tbb::parallel_for(
+      tbb::blocked_range2d<size_t>(0, batch_size, 0, kernel_out_channels),
+      [&](const tbb::blocked_range2d<size_t>& range) {
+        for (size_t b = range.rows().begin(); b != range.rows().end(); ++b) {
+          for (size_t oc = range.cols().begin(); oc != range.cols().end();
+               ++oc) {
+            size_t group =
+                (group_ > 1) ? oc / (kernel_out_channels / group_) : 0;
+            size_t group_start_channel = group * (in_channels / group_);
+            size_t group_end_channel = (group + 1) * (in_channels / group_);
+
+            for (size_t oh = 0; oh < out_height; ++oh) {
+              for (size_t ow = 0; ow < out_width; ++ow) {
+                ValueType value = 0;
+
+                for (size_t ic = group_start_channel; ic < group_end_channel;
+                     ++ic) {
+                  size_t kernel_ic = ic - group_start_channel;
+
+                  for (size_t kh = 0; kh < dilated_kernel_height; ++kh) {
+                    for (size_t kw = 0; kw < dilated_kernel_width; ++kw) {
+                      size_t h_index = oh * stride_ + kh;
+                      size_t w_index = ow * stride_ + kw;
+
+                      if (h_index < padded_input[b].size() &&
+                          w_index < padded_input[b][h_index].size()) {
+                        value += padded_input[b][h_index][w_index][ic] *
+                                 dil_kernel[kh][kw][kernel_ic][oc];
+                      }
+                    }
+                  }
+                }
+                if (!bias_.empty()) {
+                  value += (*bias_.as<ValueType>())[oc];
+                }
+
+                output_tensor[b][oc][oh][ow] = value;
+              }
+            }
+          }
+        }
+      });
+
+  size_t total_elements =
+      batch_size * kernel_out_channels * out_height * out_width;
+  std::vector<ValueType> one_d_vector(total_elements);
+
+  tbb::parallel_for(
+      tbb::blocked_range<size_t>(0, batch_size),
+      [&](const tbb::blocked_range<size_t>& range) {
+        for (size_t b = range.begin(); b != range.end(); ++b) {
+          size_t base_index = b * kernel_out_channels * out_height * out_width;
+          for (size_t oc = 0; oc < kernel_out_channels; ++oc) {
+            for (size_t oh = 0; oh < out_height; ++oh) {
+              for (size_t ow = 0; ow < out_width; ++ow) {
+                size_t idx = base_index + oc * out_height * out_width +
+                             oh * out_width + ow;
+                one_d_vector[idx] = output_tensor[b][oc][oh][ow];
+              }
+            }
+          }
+        }
+      });
+
+  Shape sh({batch_size, kernel_out_channels, out_height, out_width});
+  output = make_tensor<ValueType>(one_d_vector, sh);
+}
+
 template <typename ValueType>
 void DepthwiseConv4D(const Tensor& input, const Tensor& kernel_,
                      const Tensor& bias_, Tensor& output, size_t stride_,
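
The ComputeConvOutputDim lambda above is the standard convolution size formula: with effective (dilated) kernel extent (k - 1) * d + 1, the output extent is floor((in + 2 * pad - eff) / stride) + 1. The dilated buffer sizes use the same quantity written differently: kernel_height * dilations_ + 1 - dilations_ equals (kernel_height - 1) * dilations_ + 1. A worked check of the arithmetic as a standalone sketch (the helper name is local to this example, not part of the diff):

#include <cassert>
#include <cstddef>

int main() {
  auto out_dim = [](std::size_t in, std::size_t k, std::size_t stride,
                    std::size_t pad, std::size_t d) {
    std::size_t eff = (k - 1) * d + 1;  // effective (dilated) kernel extent
    return (in + 2 * pad - eff) / stride + 1;
  };
  // 32x32 input, 3x3 kernel, stride 2, pad 1, dilation 1:
  // eff = 3, (32 + 2 - 3) / 2 + 1 = 15 + 1 = 16.
  assert(out_dim(32, 3, 2, 1, 1) == 16);
  // Same input with dilation 2: eff = 5, (32 + 2 - 5) / 2 + 1 = 15.
  assert(out_dim(32, 3, 2, 1, 2) == 15);
  return 0;
}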
@@ -619,4 +771,135 @@ void Conv4D_Legacy(const Tensor& input, const Tensor& kernel_,
   }
   output = make_tensor<ValueType>(one_d_vector, sh);
 }
+
+template <typename ValueType>
+void Conv4D_Legacy_TBB(const Tensor& input, const Tensor& kernel_,
+                       const Tensor& bias_, Tensor& output, size_t stride_,
+                       size_t pads_, size_t dilations_) {
+  size_t batch_size = input.get_shape()[0];
+  size_t in_height = input.get_shape()[2];
+  size_t in_width = input.get_shape()[3];
+  size_t in_channels = input.get_shape()[1];
+
+  size_t kernel_height = kernel_.get_shape()[0];
+  size_t kernel_width = kernel_.get_shape()[1];
+  size_t kernel_in_channels = kernel_.get_shape()[2];
+  size_t kernel_out_channels = kernel_.get_shape()[3];
+
+  std::vector<std::vector<std::vector<std::vector<ValueType>>>> padded_input(
+      batch_size,
+      std::vector<std::vector<std::vector<ValueType>>>(
+          in_height + 2 * pads_,
+          std::vector<std::vector<ValueType>>(
+              in_width + 2 * pads_, std::vector<ValueType>(in_channels, 0))));
+
+  tbb::parallel_for(tbb::blocked_range<size_t>(0, batch_size),
+                    [&](const tbb::blocked_range<size_t>& range) {
+                      for (size_t b = range.begin(); b != range.end(); ++b) {
+                        for (size_t h = 0; h < in_height; ++h) {
+                          for (size_t w = 0; w < in_width; ++w) {
+                            for (size_t c = 0; c < in_channels; ++c) {
+                              padded_input[b][h + pads_][w + pads_][c] =
+                                  input.get<ValueType>({b, c, h, w});
+                            }
+                          }
+                        }
+                      }
+                    });
+
+  size_t dilated_kernel_height = kernel_height * dilations_ + 1 - dilations_;
+  size_t dilated_kernel_width = kernel_width * dilations_ + 1 - dilations_;
+
+  std::vector<std::vector<std::vector<std::vector<ValueType>>>> dil_kernel(
+      dilated_kernel_height,
+      std::vector<std::vector<std::vector<ValueType>>>(
+          dilated_kernel_width,
+          std::vector<std::vector<ValueType>>(
+              kernel_in_channels,
+              std::vector<ValueType>(kernel_out_channels, 0))));
+
+  tbb::parallel_for(
+      tbb::blocked_range<size_t>(0, kernel_out_channels),
+      [&](const tbb::blocked_range<size_t>& range) {
+        for (size_t oc = range.begin(); oc != range.end(); ++oc) {
+          for (size_t h = 0; h < kernel_height; ++h) {
+            for (size_t w = 0; w < kernel_width; ++w) {
+              for (size_t ic = 0; ic < kernel_in_channels; ++ic) {
+                dil_kernel[h * dilations_][w * dilations_][ic][oc] =
+                    kernel_.get<ValueType>({h, w, ic, oc});
+              }
+            }
+          }
+        }
+      });
+
+  size_t out_height = ComputeConvOutputDim(in_height, kernel_height, stride_,
+                                           pads_, dilations_);
+  size_t out_width =
+      ComputeConvOutputDim(in_width, kernel_width, stride_, pads_, dilations_);
+
+  std::vector<std::vector<std::vector<std::vector<ValueType>>>> output_tensor(
+      batch_size, std::vector<std::vector<std::vector<ValueType>>>(
+                      kernel_out_channels,
+                      std::vector<std::vector<ValueType>>(
+                          out_height, std::vector<ValueType>(out_width, 0))));
+
+  tbb::parallel_for(
+      tbb::blocked_range2d<size_t>(0, batch_size, 0, kernel_out_channels),
+      [&](const tbb::blocked_range2d<size_t>& range) {
+        for (size_t b = range.rows().begin(); b < range.rows().end(); ++b) {
+          for (size_t oc = range.cols().begin(); oc < range.cols().end();
+               ++oc) {
+            for (size_t i = 0; i < out_height; i += stride_) {
+              for (size_t j = 0; j < out_width; j += stride_) {
+                ValueType value = 0;
+
+                for (size_t ic = 0; ic < in_channels; ++ic) {
+                  if (ic < kernel_in_channels) {
+                    for (size_t h = 0; h < dilated_kernel_height; ++h) {
+                      for (size_t w = 0; w < dilated_kernel_width; ++w) {
+                        if (i + h < padded_input[b].size() &&
+                            j + w < padded_input[b][i + h].size()) {
+                          value += padded_input[b][i + h][j + w][ic] *
+                                   dil_kernel[h][w][ic][oc];
+                        }
+                      }
+                    }
+                  }
+                }
+
+                if (!bias_.empty()) {
+                  output_tensor[b][oc][i][j] =
+                      value + (*bias_.as<ValueType>())[oc];
+                } else {
+                  output_tensor[b][oc][i][j] = value;
+                }
+              }
+            }
+          }
+        }
+      });
+
+  Shape sh({batch_size, kernel_out_channels, out_height, out_width});
+  std::vector<ValueType> one_d_vector(batch_size * out_height * out_width *
+                                      kernel_out_channels);
+
+  tbb::parallel_for(tbb::blocked_range<size_t>(0, batch_size),
+                    [&](const tbb::blocked_range<size_t>& range) {
+                      size_t index_1d = range.begin() * kernel_out_channels *
+                                        out_height * out_width;
+                      for (size_t b = range.begin(); b != range.end(); ++b) {
+                        for (size_t oc = 0; oc < kernel_out_channels; ++oc) {
+                          for (size_t h = 0; h < out_height; ++h) {
+                            for (size_t w = 0; w < out_width; ++w) {
+                              one_d_vector[index_1d++] =
+                                  output_tensor[b][oc][h][w];
+                            }
+                          }
+                        }
+                      }
+                    });
+
+  output = make_tensor<ValueType>(one_d_vector, sh);
+}
 }  // namespace it_lab_ai
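
Both new functions parallelize the hot loop over a two-dimensional iteration space of (batch, output channel) pairs via tbb::blocked_range2d, so TBB can tile across channels even when the batch is small. A minimal standalone sketch of that pattern (the extents are made up for illustration):

#include <tbb/blocked_range2d.h>
#include <tbb/parallel_for.h>
#include <cstddef>

int main() {
  const std::size_t batch = 4, channels = 64;
  tbb::parallel_for(
      tbb::blocked_range2d<std::size_t>(0, batch, 0, channels),
      [&](const tbb::blocked_range2d<std::size_t>& r) {
        // rows() spans the batch dimension, cols() the channel dimension;
        // TBB may split either or both into sub-tiles.
        for (std::size_t b = r.rows().begin(); b != r.rows().end(); ++b) {
          for (std::size_t oc = r.cols().begin(); oc != r.cols().end(); ++oc) {
            // per-(b, oc) work is independent, so no synchronization is needed
          }
        }
      });
  return 0;
}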

src/layers/ConvLayer.cpp

Lines changed: 21 additions & 2 deletions
@@ -40,6 +40,9 @@ void ConvolutionalLayer::run(const std::vector<Tensor>& input,
     case ParBackend::kThreads:
       implType_ = kSTL;
       break;
+    case ParBackend::kTbb:
+      implType_ = kTBB;
+      break;
     case ParBackend::kSeq:
     default:
       implType_ = kDefault;
@@ -161,15 +164,31 @@ void ConvolutionalLayer::run(const std::vector<Tensor>& input,
                                      sh);
   } else {
     if (useLegacyImpl_) {
-      Conv4D_Legacy<float>(input[0], kernel_, bias_, output[0], stride_,
-                           pads_, dilations_);
+      switch (implType_) {
+        case kTBB: {
+          Conv4D_Legacy_TBB<float>(input[0], kernel_, bias_, output[0], stride_,
+                                   pads_, dilations_);
+          break;
+        }
+        default: {
+          Conv4D_Legacy<float>(input[0], kernel_, bias_, output[0], stride_,
+                               pads_, dilations_);
+          break;
+        }
+      }
+
     } else {
       switch (implType_) {
         case kSTL: {
           Conv4DSTL<float>(input[0], kernel_, bias_, output[0], stride_,
                            pads_, group_, dilations_);
           break;
         }
+        case kTBB: {
+          Conv4D_TBB<float>(input[0], kernel_, bias_, output[0], stride_,
+                            pads_, group_, dilations_);
+          break;
+        }
         default: {
           Conv4D<float>(input[0], kernel_, bias_, output[0], stride_, pads_,
                         group_, dilations_);
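
Taken together, the dispatch above maps the layer's parallel backend onto the convolution kernels: kThreads selects the std::thread-based Conv4DSTL, the new kTbb selects Conv4D_TBB (or Conv4D_Legacy_TBB when useLegacyImpl_ is set), and anything else falls back to the sequential path. A hedged sketch of just that mapping, with simplified stand-in enums that mirror the names in the diff:

// Standalone illustration of the backend dispatch; the real enums live in
// the layer headers, these are simplified stand-ins with the same names.
enum class ParBackend { kSeq, kThreads, kTbb };
enum ImplType { kDefault, kSTL, kTBB };

ImplType select_impl(ParBackend backend) {
  switch (backend) {
    case ParBackend::kThreads:
      return kSTL;      // std::thread implementation (Conv4DSTL)
    case ParBackend::kTbb:
      return kTBB;      // oneTBB implementation (Conv4D_TBB / Conv4D_Legacy_TBB)
    case ParBackend::kSeq:
    default:
      return kDefault;  // sequential Conv4D / Conv4D_Legacy
  }
}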
