|
1 | 1 | #pragma once |
| 2 | +#include <tbb/blocked_range.h> |
| 3 | +#include <tbb/blocked_range2d.h> |
| 4 | +#include <tbb/blocked_range3d.h> |
| 5 | +#include <tbb/parallel_for.h> |
| 6 | +#include <tbb/task_group.h> |
| 7 | + |
2 | 8 | #include <cmath> |
3 | 9 | #include <stdexcept> |
4 | 10 | #include <thread> |
@@ -459,6 +465,152 @@ void Conv4DSTL(const Tensor& input, const Tensor& kernel_, const Tensor& bias_, |
459 | 465 | output = make_tensor<ValueType>(one_d_vector, sh); |
460 | 466 | } |
461 | 467 |
|
| 468 | +template <typename ValueType> |
| 469 | +void Conv4D_TBB(const Tensor& input, const Tensor& kernel_, const Tensor& bias_, |
| 470 | + Tensor& output, size_t stride_, size_t pads_, size_t group_, |
| 471 | + size_t dilations_) { |
| 472 | + size_t batch_size = input.get_shape()[0]; |
| 473 | + size_t in_channels = input.get_shape()[1]; |
| 474 | + size_t in_height = input.get_shape()[2]; |
| 475 | + size_t in_width = input.get_shape()[3]; |
| 476 | + |
| 477 | + size_t kernel_out_channels = kernel_.get_shape()[0]; |
| 478 | + size_t kernel_in_channels = kernel_.get_shape()[1]; |
| 479 | + size_t kernel_height = kernel_.get_shape()[2]; |
| 480 | + size_t kernel_width = kernel_.get_shape()[3]; |
| 481 | + |
| 482 | + std::vector<std::vector<std::vector<std::vector<ValueType>>>> padded_input( |
| 483 | + batch_size, |
| 484 | + std::vector<std::vector<std::vector<ValueType>>>( |
| 485 | + in_height + 2 * pads_, |
| 486 | + std::vector<std::vector<ValueType>>( |
| 487 | + in_width + 2 * pads_, std::vector<ValueType>(in_channels, 0)))); |
| 488 | + |
| 489 | + tbb::parallel_for(tbb::blocked_range<size_t>(0, batch_size), |
| 490 | + [&](const tbb::blocked_range<size_t>& range) { |
| 491 | + for (size_t b = range.begin(); b != range.end(); ++b) { |
| 492 | + for (size_t h = 0; h < in_height; ++h) { |
| 493 | + for (size_t w = 0; w < in_width; ++w) { |
| 494 | + for (size_t c = 0; c < in_channels; ++c) { |
| 495 | + padded_input[b][h + pads_][w + pads_][c] = |
| 496 | + input.get<ValueType>({b, c, h, w}); |
| 497 | + } |
| 498 | + } |
| 499 | + } |
| 500 | + } |
| 501 | + }); |
| 502 | + |
| 503 | + size_t dilated_kernel_height = kernel_height * dilations_ + 1 - dilations_; |
| 504 | + size_t dilated_kernel_width = kernel_width * dilations_ + 1 - dilations_; |
| 505 | + |
| 506 | + std::vector<std::vector<std::vector<std::vector<ValueType>>>> dil_kernel( |
| 507 | + dilated_kernel_height, |
| 508 | + std::vector<std::vector<std::vector<ValueType>>>( |
| 509 | + dilated_kernel_width, |
| 510 | + std::vector<std::vector<ValueType>>( |
| 511 | + kernel_in_channels, |
| 512 | + std::vector<ValueType>(kernel_out_channels, 0)))); |
| 513 | + |
| 514 | + tbb::parallel_for( |
| 515 | + tbb::blocked_range<size_t>(0, kernel_out_channels), |
| 516 | + [&](const tbb::blocked_range<size_t>& range) { |
| 517 | + for (size_t oc = range.begin(); oc != range.end(); ++oc) { |
| 518 | + for (size_t h = 0; h < kernel_height; ++h) { |
| 519 | + for (size_t w = 0; w < kernel_width; ++w) { |
| 520 | + for (size_t ic = 0; ic < kernel_in_channels; ++ic) { |
| 521 | + dil_kernel[h * dilations_][w * dilations_][ic][oc] = |
| 522 | + kernel_.get<ValueType>({oc, ic, h, w}); |
| 523 | + } |
| 524 | + } |
| 525 | + } |
| 526 | + } |
| 527 | + }); |
| 528 | + |
| 529 | + auto ComputeConvOutputDim = [](size_t input_dim, size_t kernel_dim, |
| 530 | + size_t stride, size_t pad, size_t dilation) { |
| 531 | + size_t effective_kernel = (kernel_dim - 1) * dilation + 1; |
| 532 | + return (input_dim + 2 * pad - effective_kernel) / stride + 1; |
| 533 | + }; |
| 534 | + |
| 535 | + size_t out_height = ComputeConvOutputDim(in_height, kernel_height, stride_, |
| 536 | + pads_, dilations_); |
| 537 | + size_t out_width = |
| 538 | + ComputeConvOutputDim(in_width, kernel_width, stride_, pads_, dilations_); |
| 539 | + |
| 540 | + std::vector<std::vector<std::vector<std::vector<ValueType>>>> output_tensor( |
| 541 | + batch_size, std::vector<std::vector<std::vector<ValueType>>>( |
| 542 | + kernel_out_channels, |
| 543 | + std::vector<std::vector<ValueType>>( |
| 544 | + out_height, std::vector<ValueType>(out_width, 0)))); |
| 545 | + |
| 546 | + tbb::parallel_for( |
| 547 | + tbb::blocked_range2d<size_t>(0, batch_size, 0, kernel_out_channels), |
| 548 | + [&](const tbb::blocked_range2d<size_t>& range) { |
| 549 | + for (size_t b = range.rows().begin(); b != range.rows().end(); ++b) { |
| 550 | + for (size_t oc = range.cols().begin(); oc != range.cols().end(); |
| 551 | + ++oc) { |
| 552 | + size_t group = |
| 553 | + (group_ > 1) ? oc / (kernel_out_channels / group_) : 0; |
| 554 | + size_t group_start_channel = group * (in_channels / group_); |
| 555 | + size_t group_end_channel = (group + 1) * (in_channels / group_); |
| 556 | + |
| 557 | + for (size_t oh = 0; oh < out_height; ++oh) { |
| 558 | + for (size_t ow = 0; ow < out_width; ++ow) { |
| 559 | + ValueType value = 0; |
| 560 | + |
| 561 | + for (size_t ic = group_start_channel; ic < group_end_channel; |
| 562 | + ++ic) { |
| 563 | + size_t kernel_ic = ic - group_start_channel; |
| 564 | + |
| 565 | + for (size_t kh = 0; kh < dilated_kernel_height; ++kh) { |
| 566 | + for (size_t kw = 0; kw < dilated_kernel_width; ++kw) { |
| 567 | + size_t h_index = oh * stride_ + kh; |
| 568 | + size_t w_index = ow * stride_ + kw; |
| 569 | + |
| 570 | + if (h_index < padded_input[b].size() && |
| 571 | + w_index < padded_input[b][h_index].size()) { |
| 572 | + value += padded_input[b][h_index][w_index][ic] * |
| 573 | + dil_kernel[kh][kw][kernel_ic][oc]; |
| 574 | + } |
| 575 | + } |
| 576 | + } |
| 577 | + } |
| 578 | + if (!bias_.empty()) { |
| 579 | + value += (*bias_.as<ValueType>())[oc]; |
| 580 | + } |
| 581 | + |
| 582 | + output_tensor[b][oc][oh][ow] = value; |
| 583 | + } |
| 584 | + } |
| 585 | + } |
| 586 | + } |
| 587 | + }); |
| 588 | + |
| 589 | + size_t total_elements = |
| 590 | + batch_size * kernel_out_channels * out_height * out_width; |
| 591 | + std::vector<ValueType> one_d_vector(total_elements); |
| 592 | + |
| 593 | + tbb::parallel_for( |
| 594 | + tbb::blocked_range<size_t>(0, batch_size), |
| 595 | + [&](const tbb::blocked_range<size_t>& range) { |
| 596 | + for (size_t b = range.begin(); b != range.end(); ++b) { |
| 597 | + size_t base_index = b * kernel_out_channels * out_height * out_width; |
| 598 | + for (size_t oc = 0; oc < kernel_out_channels; ++oc) { |
| 599 | + for (size_t oh = 0; oh < out_height; ++oh) { |
| 600 | + for (size_t ow = 0; ow < out_width; ++ow) { |
| 601 | + size_t idx = base_index + oc * out_height * out_width + |
| 602 | + oh * out_width + ow; |
| 603 | + one_d_vector[idx] = output_tensor[b][oc][oh][ow]; |
| 604 | + } |
| 605 | + } |
| 606 | + } |
| 607 | + } |
| 608 | + }); |
| 609 | + |
| 610 | + Shape sh({batch_size, kernel_out_channels, out_height, out_width}); |
| 611 | + output = make_tensor<ValueType>(one_d_vector, sh); |
| 612 | +} |
| 613 | + |
462 | 614 | template <typename ValueType> |
463 | 615 | void DepthwiseConv4D(const Tensor& input, const Tensor& kernel_, |
464 | 616 | const Tensor& bias_, Tensor& output, size_t stride_, |
@@ -619,4 +771,135 @@ void Conv4D_Legacy(const Tensor& input, const Tensor& kernel_, |
619 | 771 | } |
620 | 772 | output = make_tensor<ValueType>(one_d_vector, sh); |
621 | 773 | } |
| 774 | + |
| 775 | +template <typename ValueType> |
| 776 | +void Conv4D_Legacy_TBB(const Tensor& input, const Tensor& kernel_, |
| 777 | + const Tensor& bias_, Tensor& output, size_t stride_, |
| 778 | + size_t pads_, size_t dilations_) { |
| 779 | + size_t batch_size = input.get_shape()[0]; |
| 780 | + size_t in_height = input.get_shape()[2]; |
| 781 | + size_t in_width = input.get_shape()[3]; |
| 782 | + size_t in_channels = input.get_shape()[1]; |
| 783 | + |
| 784 | + size_t kernel_height = kernel_.get_shape()[0]; |
| 785 | + size_t kernel_width = kernel_.get_shape()[1]; |
| 786 | + size_t kernel_in_channels = kernel_.get_shape()[2]; |
| 787 | + size_t kernel_out_channels = kernel_.get_shape()[3]; |
| 788 | + |
| 789 | + std::vector<std::vector<std::vector<std::vector<ValueType>>>> padded_input( |
| 790 | + batch_size, |
| 791 | + std::vector<std::vector<std::vector<ValueType>>>( |
| 792 | + in_height + 2 * pads_, |
| 793 | + std::vector<std::vector<ValueType>>( |
| 794 | + in_width + 2 * pads_, std::vector<ValueType>(in_channels, 0)))); |
| 795 | + |
| 796 | + tbb::parallel_for(tbb::blocked_range<size_t>(0, batch_size), |
| 797 | + [&](const tbb::blocked_range<size_t>& range) { |
| 798 | + for (size_t b = range.begin(); b != range.end(); ++b) { |
| 799 | + for (size_t h = 0; h < in_height; ++h) { |
| 800 | + for (size_t w = 0; w < in_width; ++w) { |
| 801 | + for (size_t c = 0; c < in_channels; ++c) { |
| 802 | + padded_input[b][h + pads_][w + pads_][c] = |
| 803 | + input.get<ValueType>({b, c, h, w}); |
| 804 | + } |
| 805 | + } |
| 806 | + } |
| 807 | + } |
| 808 | + }); |
| 809 | + |
| 810 | + size_t dilated_kernel_height = kernel_height * dilations_ + 1 - dilations_; |
| 811 | + size_t dilated_kernel_width = kernel_width * dilations_ + 1 - dilations_; |
| 812 | + |
| 813 | + std::vector<std::vector<std::vector<std::vector<ValueType>>>> dil_kernel( |
| 814 | + dilated_kernel_height, |
| 815 | + std::vector<std::vector<std::vector<ValueType>>>( |
| 816 | + dilated_kernel_width, |
| 817 | + std::vector<std::vector<ValueType>>( |
| 818 | + kernel_in_channels, |
| 819 | + std::vector<ValueType>(kernel_out_channels, 0)))); |
| 820 | + |
| 821 | + tbb::parallel_for( |
| 822 | + tbb::blocked_range<size_t>(0, kernel_out_channels), |
| 823 | + [&](const tbb::blocked_range<size_t>& range) { |
| 824 | + for (size_t oc = range.begin(); oc != range.end(); ++oc) { |
| 825 | + for (size_t h = 0; h < kernel_height; ++h) { |
| 826 | + for (size_t w = 0; w < kernel_width; ++w) { |
| 827 | + for (size_t ic = 0; ic < kernel_in_channels; ++ic) { |
| 828 | + dil_kernel[h * dilations_][w * dilations_][ic][oc] = |
| 829 | + kernel_.get<ValueType>({h, w, ic, oc}); |
| 830 | + } |
| 831 | + } |
| 832 | + } |
| 833 | + } |
| 834 | + }); |
| 835 | + |
| 836 | + size_t out_height = ComputeConvOutputDim(in_height, kernel_height, stride_, |
| 837 | + pads_, dilations_); |
| 838 | + size_t out_width = |
| 839 | + ComputeConvOutputDim(in_width, kernel_width, stride_, pads_, dilations_); |
| 840 | + |
| 841 | + std::vector<std::vector<std::vector<std::vector<ValueType>>>> output_tensor( |
| 842 | + batch_size, std::vector<std::vector<std::vector<ValueType>>>( |
| 843 | + kernel_out_channels, |
| 844 | + std::vector<std::vector<ValueType>>( |
| 845 | + out_height, std::vector<ValueType>(out_width, 0)))); |
| 846 | + |
| 847 | + tbb::parallel_for( |
| 848 | + tbb::blocked_range2d<size_t>(0, batch_size, 0, kernel_out_channels), |
| 849 | + [&](const tbb::blocked_range2d<size_t>& range) { |
| 850 | + for (size_t b = range.rows().begin(); b < range.rows().end(); ++b) { |
| 851 | + for (size_t oc = range.cols().begin(); oc < range.cols().end(); |
| 852 | + ++oc) { |
| 853 | + for (size_t i = 0; i < out_height; i += stride_) { |
| 854 | + for (size_t j = 0; j < out_width; j += stride_) { |
| 855 | + ValueType value = 0; |
| 856 | + |
| 857 | + for (size_t ic = 0; ic < in_channels; ++ic) { |
| 858 | + if (ic < kernel_in_channels) { |
| 859 | + for (size_t h = 0; h < dilated_kernel_height; ++h) { |
| 860 | + for (size_t w = 0; w < dilated_kernel_width; ++w) { |
| 861 | + if (i + h < padded_input[b].size() && |
| 862 | + j + w < padded_input[b][i + h].size()) { |
| 863 | + value += padded_input[b][i + h][j + w][ic] * |
| 864 | + dil_kernel[h][w][ic][oc]; |
| 865 | + } |
| 866 | + } |
| 867 | + } |
| 868 | + } |
| 869 | + } |
| 870 | + |
| 871 | + if (!bias_.empty()) { |
| 872 | + output_tensor[b][oc][i][j] = |
| 873 | + value + (*bias_.as<ValueType>())[oc]; |
| 874 | + } else { |
| 875 | + output_tensor[b][oc][i][j] = value; |
| 876 | + } |
| 877 | + } |
| 878 | + } |
| 879 | + } |
| 880 | + } |
| 881 | + }); |
| 882 | + |
| 883 | + Shape sh({batch_size, kernel_out_channels, out_height, out_width}); |
| 884 | + std::vector<ValueType> one_d_vector(batch_size * out_height * out_width * |
| 885 | + kernel_out_channels); |
| 886 | + |
| 887 | + tbb::parallel_for(tbb::blocked_range<size_t>(0, batch_size), |
| 888 | + [&](const tbb::blocked_range<size_t>& range) { |
| 889 | + size_t index_1d = range.begin() * kernel_out_channels * |
| 890 | + out_height * out_width; |
| 891 | + for (size_t b = range.begin(); b != range.end(); ++b) { |
| 892 | + for (size_t oc = 0; oc < kernel_out_channels; ++oc) { |
| 893 | + for (size_t h = 0; h < out_height; ++h) { |
| 894 | + for (size_t w = 0; w < out_width; ++w) { |
| 895 | + one_d_vector[index_1d++] = |
| 896 | + output_tensor[b][oc][h][w]; |
| 897 | + } |
| 898 | + } |
| 899 | + } |
| 900 | + } |
| 901 | + }); |
| 902 | + |
| 903 | + output = make_tensor<ValueType>(one_d_vector, sh); |
| 904 | +} |
622 | 905 | } // namespace it_lab_ai |
0 commit comments