diff --git a/CMakeLists.txt b/CMakeLists.txt index 472a54c..93bd50f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,8 +77,9 @@ if(ENABLE_DPCPP) join_helpers_lib cuckoo_hash_build join - groupby + groupby_global groupby_local + groupby_perfect hash_build_non_bitmask ) if(ENABLE_EXPERIMENTAL) diff --git a/bench.cpp b/bench.cpp index 9fea2e4..8ac95b4 100644 --- a/bench.cpp +++ b/bench.cpp @@ -16,7 +16,8 @@ int main(int argc, char *argv[]) { std::unique_ptr opts = std::make_unique(); size_t groups_count = 1; - size_t executors = 1; + size_t threads_count = 1; + size_t work_group_size = 1; opts->root_path = helpers::get_kernels_root_env(argv[0]); std::cout @@ -43,8 +44,11 @@ int main(int argc, char *argv[]) { desc.add_options()( "groups_count", po::value(&groups_count), "Number of unique keys for dwarfs with keys (groupby, hash build etc.)."); - desc.add_options()("executors", po::value(&executors), - "Number of executors for GroupByLocal."); + desc.add_options()("threads_count", po::value(&threads_count), + "Number of threads for GroupBy dwarfs."); + desc.add_options()("work_group_size", po::value(&work_group_size), + "Work group size for GroupBy dwarfs. threads_count must " + "be divisible by work_group_size."); po::positional_options_description pos_opts; pos_opts.add("dwarf", 1); @@ -84,7 +88,8 @@ int main(int argc, char *argv[]) { if (isGroupBy(dwarf_name)) { std::unique_ptr tmpPtr = - std::make_unique(*opts, groups_count, executors); + std::make_unique(*opts, groups_count, + threads_count, work_group_size); opts.reset(); opts = std::move(tmpPtr); } diff --git a/common/dpcpp/perfect_hashtable.hpp b/common/dpcpp/perfect_hashtable.hpp new file mode 100644 index 0000000..3aceb82 --- /dev/null +++ b/common/dpcpp/perfect_hashtable.hpp @@ -0,0 +1,45 @@ +#pragma once + +#include + +template +class PerfectHashTable { +public: + PerfectHashTable(size_t hash_size, sycl::multi_ptr vals, + Key min_key) + : _vals(vals), _hash_size(hash_size), _hasher(hash_size, min_key) {} + + bool add(Key key, T val) { + sycl::atomic(_vals + _hasher(key)).fetch_add(val); + return true; + } + + bool insert(Key key, T val) { + sycl::atomic(_vals + _hasher(key)).store(val); + return true; + } + + const T at(const Key &key) const { + return sycl::atomic(_vals + _hasher(key)).load(); + } + +private: + class PerfectHashFunction { + public: + PerfectHashFunction(size_t hash_size, Key min_key) + : hash_size(hash_size), min_key(min_key) {} + + size_t operator()(Key key) const { return key - min_key; } + + private: + size_t hash_size; + Key min_key; + }; + + PerfectHashFunction _hasher; + size_t _hash_size; + + sycl::multi_ptr _vals; +}; \ No newline at end of file diff --git a/common/options.hpp b/common/options.hpp index bee868f..7403603 100644 --- a/common/options.hpp +++ b/common/options.hpp @@ -14,10 +14,12 @@ struct RunOptions { struct GroupByRunOptions : public RunOptions { GroupByRunOptions(const RunOptions &opts, size_t groups_count, - size_t executors) - : RunOptions(opts), groups_count(groups_count), executors(executors){}; + size_t threads_count, size_t work_group_size) + : RunOptions(opts), groups_count(groups_count), + threads_count(threads_count), work_group_size(work_group_size){}; size_t groups_count; - size_t executors; + size_t threads_count; + size_t work_group_size; }; std::istream &operator>>(std::istream &in, RunOptions::DeviceType &dt); diff --git a/groupby/CMakeLists.txt b/groupby/CMakeLists.txt index a7ca7e2..7e53261 100644 --- a/groupby/CMakeLists.txt +++ b/groupby/CMakeLists.txt @@ -1,4 +1,23 @@ if(ENABLE_DPCPP) - add_dpcpp_lib(groupby groupby.cpp) - add_dpcpp_lib(groupby_local groupby_local.cpp) + set(groupby_sources + groupby.cpp + ) + set(groupby_global_sources + groupby_global.cpp + ${groupby_sources} + ) + + set(groupby_local_sources + groupby_local.cpp + ${groupby_sources} + ) + + set(groupby_perfect_sources + perfect_groupby.cpp + ${groupby_sources} + ) + + add_dpcpp_lib(groupby_global "${groupby_global_sources}") + add_dpcpp_lib(groupby_local "${groupby_local_sources}") + add_dpcpp_lib(groupby_perfect "${groupby_perfect_sources}") endif() diff --git a/groupby/groupby.cpp b/groupby/groupby.cpp index 55643e0..ea4c28b 100644 --- a/groupby/groupby.cpp +++ b/groupby/groupby.cpp @@ -2,118 +2,44 @@ #include "common/dpcpp/hashtable.hpp" #include -namespace { -using Func = std::function; +GroupBy::GroupBy(const std::string &suffix) : Dwarf("GroupBy" + suffix) {} -std::vector expected_GroupBy(const std::vector &keys, - const std::vector &vals, - size_t groups_count, Func f) { - std::vector result(groups_count); - size_t data_size = keys.size(); - - for (int i = 0; i < data_size; i++) { - result[keys[i]] = f(result[keys[i]], vals[i]); +void GroupBy::run(const RunOptions &opts) { + for (auto size : opts.input_size) { + _run(size, meter()); } - - return result; } -} // namespace - -GroupBy::GroupBy() : Dwarf("GroupBy") {} - -void GroupBy::_run(const size_t buf_size, Meter &meter) { - constexpr uint32_t empty_element = std::numeric_limits::max(); - auto opts = static_cast(meter.opts()); - - const int groups_count = opts.groups_count; - const std::vector host_src_vals = - helpers::make_random(buf_size); - const std::vector host_src_keys = - helpers::make_random(buf_size, 0, groups_count - 1); - - std::vector expected = - expected_GroupBy(host_src_keys, host_src_vals, groups_count, - [](uint32_t x, uint32_t y) { return x + y; }); - - auto sel = get_device_selector(opts); - sycl::queue q{*sel}; - std::cout << "Selected device: " - << q.get_device().get_info() << "\n"; - - PolynomialHasher hasher(buf_size); - - for (auto it = 0; it < opts.iterations; ++it) { - std::vector data(buf_size, 0); - std::vector keys(buf_size, empty_element); - std::vector output(groups_count, 0); - - sycl::buffer data_buf(data); - sycl::buffer keys_buf(keys); - sycl::buffer src_vals(host_src_vals); - sycl::buffer src_keys(host_src_keys); - - auto host_start = std::chrono::steady_clock::now(); - q.submit([&](sycl::handler &h) { - auto sv = src_vals.get_access(h); - auto sk = src_keys.get_access(h); - - auto data_acc = data_buf.get_access(h); - auto keys_acc = keys_buf.get_access(h); - - h.parallel_for(buf_size, [=](auto &idx) { - NonOwningHashTableNonBitmask ht( - buf_size, keys_acc.get_pointer(), data_acc.get_pointer(), hasher, - empty_element); - - ht.add(sk[idx], sv[idx]); - }); - }).wait(); - - sycl::buffer out_buf(output); - - q.submit([&](sycl::handler &h) { - auto sv = src_vals.get_access(h); - auto sk = src_keys.get_access(h); - auto o = out_buf.get_access(h); - auto data_acc = data_buf.get_access(h); - auto keys_acc = keys_buf.get_access(h); - - h.parallel_for(buf_size, [=](auto &idx) { - NonOwningHashTableNonBitmask ht( - buf_size, keys_acc.get_pointer(), data_acc.get_pointer(), hasher, - empty_element); - - std::pair sum_for_group = ht.at(sk[idx]); - sycl::atomic(o.get_pointer() + sk[idx]) - .store(sum_for_group.first); - }); - }).wait(); - auto host_end = std::chrono::steady_clock::now(); - auto host_exe_time = std::chrono::duration_cast( - host_end - host_start) - .count(); - std::unique_ptr result = std::make_unique(); - result->host_time = host_end - host_start; - out_buf.get_access(); +void GroupBy::init(const RunOptions &opts) { + reporting_header_ = "total_time,group_by_time,reduction_time"; + meter().set_opts(opts); + DwarfParams params = {{"device_type", to_string(opts.device_ty)}}; + meter().set_params(params); +} - if (output != expected) { - std::cerr << "Incorrect results" << std::endl; - result->valid = false; - } +void GroupBy::generate_expected(size_t groups_count, AggregationFunc f) { + expected.resize(groups_count); + size_t data_size = src_keys.size(); - DwarfParams params{{"buf_size", std::to_string(buf_size)}}; - meter.add_result(std::move(params), std::move(result)); + for (int i = 0; i < data_size; i++) { + expected[src_keys[i]] = f(expected[src_keys[i]], src_vals[i]); } } -void GroupBy::run(const RunOptions &opts) { - for (auto size : opts.input_size) { - _run(size, meter()); +bool GroupBy::check_correctness(const std::vector &result) { + if (result != expected) { + std::cerr << "Incorrect results" << std::endl; + return false; } + return true; } -void GroupBy::init(const RunOptions &opts) { - meter().set_opts(opts); - DwarfParams params = {{"device_type", to_string(opts.device_ty)}}; - meter().set_params(params); -} \ No newline at end of file + +void GroupBy::generate_keys(size_t buf_size, size_t groups_count) { + src_keys = helpers::make_random(buf_size, 0, groups_count - 1); +} + +void GroupBy::generate_vals(size_t buf_size) { + src_vals = helpers::make_random(buf_size); +} + +size_t GroupBy::get_size(size_t buf_size) { return buf_size * 2; } diff --git a/groupby/groupby.hpp b/groupby/groupby.hpp index 77d1122..a3001d5 100644 --- a/groupby/groupby.hpp +++ b/groupby/groupby.hpp @@ -1,12 +1,31 @@ #pragma once #include "common/common.hpp" +#include class GroupBy : public Dwarf { public: - GroupBy(); + GroupBy(const std::string &suffix); void run(const RunOptions &opts) override; void init(const RunOptions &opts) override; -private: - void _run(const size_t buffer_size, Meter &meter); +protected: + using AggregationFunc = std::function; + AggregationFunc add = [](uint32_t acc, uint32_t x) { return acc + x; }; + AggregationFunc mul = [](uint32_t acc, uint32_t x) { return acc * x; }; + AggregationFunc count = [](uint32_t acc, uint32_t) { return acc + 1; }; + + const uint32_t _empty_element = std::numeric_limits::max(); + + virtual size_t get_size(size_t buf_size); + virtual void _run(const size_t buffer_size, Meter &meter) = 0; + + std::vector src_keys; + std::vector src_vals; + std::vector expected; + + void generate_keys(size_t buf_size, size_t groups_count); + void generate_vals(size_t buf_size); + void generate_expected(size_t groups_count, AggregationFunc f); + + bool check_correctness(const std::vector &result); }; \ No newline at end of file diff --git a/groupby/groupby_global.cpp b/groupby/groupby_global.cpp new file mode 100644 index 0000000..2b806b4 --- /dev/null +++ b/groupby/groupby_global.cpp @@ -0,0 +1,100 @@ +#include "groupby_global.hpp" +#include "common/dpcpp/hashtable.hpp" +#include + +GroupByGlobal::GroupByGlobal() : GroupBy("Global") {} + +void GroupByGlobal::_run(const size_t buf_size, Meter &meter) { + uint32_t empty_element = _empty_element; + auto opts = static_cast(meter.opts()); + + const int groups_count = opts.groups_count; + generate_vals(buf_size); + generate_keys(buf_size, groups_count); + generate_expected(groups_count, add); + + auto sel = get_device_selector(opts); + sycl::queue q{*sel}; + std::cout << "Selected device: " + << q.get_device().get_info() << "\n"; + + size_t hash_size = groups_count * 2; + PolynomialHasher hasher(hash_size); + + std::vector ht_vals(hash_size, 0); + std::vector ht_keys(hash_size, empty_element); + std::vector output(groups_count, 0); + + sycl::buffer ht_vals_buf(ht_vals); + sycl::buffer ht_keys_buf(ht_keys); + sycl::buffer src_vals_buf(src_vals); + sycl::buffer src_keys_buf(src_keys); + sycl::buffer out_buf(output); + + for (auto it = 0; it < opts.iterations; ++it) { + auto host_start = std::chrono::steady_clock::now(); + q.submit([&](sycl::handler &h) { + auto sv = src_vals_buf.get_access(h); + auto sk = src_keys_buf.get_access(h); + + auto ht_v = ht_vals_buf.get_access(h); + auto ht_k = ht_keys_buf.get_access(h); + + h.parallel_for(buf_size, [=](auto &idx) { + NonOwningHashTableNonBitmask ht( + hash_size, ht_k.get_pointer(), ht_v.get_pointer(), hasher, + empty_element); + + ht.add(sk[idx], sv[idx]); + }); + }).wait(); + + auto group_by_end = std::chrono::steady_clock::now(); + + q.submit([&](sycl::handler &h) { + auto sv = src_vals_buf.get_access(h); + auto sk = src_keys_buf.get_access(h); + auto o = out_buf.get_access(h); + + auto ht_v = ht_vals_buf.get_access(h); + auto ht_k = ht_keys_buf.get_access(h); + + h.parallel_for(buf_size, [=](auto &idx) { + NonOwningHashTableNonBitmask ht( + hash_size, ht_k.get_pointer(), ht_v.get_pointer(), hasher, + empty_element); + + std::pair sum_for_group = ht.at(sk[idx]); + sycl::atomic(o.get_pointer() + sk[idx]) + .store(sum_for_group.first); + }); + }).wait(); + auto host_end = std::chrono::steady_clock::now(); + std::unique_ptr result = + std::make_unique(); + result->host_time = host_end - host_start; + result->group_by_time = group_by_end - host_start; + result->reduction_time = host_end - group_by_end; + + out_buf.get_access(); + result->valid = check_correctness(output); + + DwarfParams params{{"buf_size", std::to_string(get_size(buf_size))}}; + meter.add_result(std::move(params), std::move(result)); + + q.submit([&](sycl::handler &h) { + auto o = out_buf.get_access(h); + auto ht_v = ht_vals_buf.get_access(h); + auto ht_k = ht_keys_buf.get_access(h); + + h.single_task([=]() { + for (size_t i = 0; i < hash_size; i++) { + ht_v[i] = 0; + ht_k[i] = empty_element; + if (i < groups_count) + o[i] = 0; + } + }); + }).wait(); + } +} diff --git a/groupby/groupby_global.hpp b/groupby/groupby_global.hpp new file mode 100644 index 0000000..8171a5f --- /dev/null +++ b/groupby/groupby_global.hpp @@ -0,0 +1,11 @@ +#pragma once +#include "common/common.hpp" +#include "groupby.hpp" + +class GroupByGlobal : public GroupBy { +public: + GroupByGlobal(); + +private: + void _run(const size_t buffer_size, Meter &meter) override; +}; diff --git a/groupby/groupby_local.cpp b/groupby/groupby_local.cpp index bbefedf..45472f7 100644 --- a/groupby/groupby_local.cpp +++ b/groupby/groupby_local.cpp @@ -2,82 +2,58 @@ #include "common/dpcpp/hashtable.hpp" #include -namespace { -using Func = std::function; - -std::vector expected_GroupBy(const std::vector &keys, - const std::vector &vals, - size_t groups_count, Func f) { - std::vector result(groups_count); - size_t data_size = keys.size(); - - for (int i = 0; i < data_size; i++) { - result[keys[i]] = f(result[keys[i]], vals[i]); - } - - return result; -} -} // namespace - -GroupByLocal::GroupByLocal() : Dwarf("GroupByLocal") {} +GroupByLocal::GroupByLocal() : GroupBy("Local") {} void GroupByLocal::_run(const size_t buf_size, Meter &meter) { - constexpr uint32_t empty_element = std::numeric_limits::max(); + uint32_t empty_element = _empty_element; auto opts = static_cast(meter.opts()); const int groups_count = opts.groups_count; - const int executors = opts.executors; - const std::vector host_src_vals = - helpers::make_random(buf_size); - const std::vector host_src_keys = - helpers::make_random(buf_size, 0, groups_count - 1); - - std::vector expected = - expected_GroupBy(host_src_keys, host_src_vals, groups_count, - [](uint32_t x, uint32_t y) { return x + y; }); + const int threads_count = opts.threads_count; + generate_vals(buf_size); + generate_keys(buf_size, groups_count); + generate_expected(groups_count, add); auto sel = get_device_selector(opts); sycl::queue q{*sel}; std::cout << "Selected device: " << q.get_device().get_info() << "\n"; - SimpleHasher hasher(groups_count); + size_t hash_size = groups_count * 2; + SimpleHasher hasher(hash_size); - for (auto it = 0; it < opts.iterations; ++it) { - std::vector data(groups_count * executors, 0); - std::vector keys(groups_count * executors, empty_element); - std::vector output(groups_count, 0); + std::vector ht_vals(hash_size * threads_count, 0); + std::vector ht_keys(hash_size * threads_count, empty_element); + std::vector output(groups_count, 0); - sycl::buffer data_buf(data); - sycl::buffer keys_buf(keys); - sycl::buffer src_vals(host_src_vals); - sycl::buffer src_keys(host_src_keys); - sycl::buffer out_buf(output); + sycl::buffer ht_vals_buf(ht_vals); + sycl::buffer ht_keys_buf(ht_keys); + sycl::buffer src_vals_buf(src_vals); + sycl::buffer src_keys_buf(src_keys); + sycl::buffer out_buf(output); + for (auto it = 0; it < opts.iterations; ++it) { auto host_start = std::chrono::steady_clock::now(); q.submit([&](sycl::handler &h) { - auto sv = src_vals.get_access(h); - auto sk = src_keys.get_access(h); - - auto data_acc = data_buf.get_access(h); - auto keys_acc = keys_buf.get_access(h); + auto sv = src_vals_buf.get_access(h); + auto sk = src_keys_buf.get_access(h); - const size_t work_per_executor = std::ceil((float)buf_size / executors); + auto ht_v = ht_vals_buf.get_access(h); + auto ht_k = ht_keys_buf.get_access(h); h.parallel_for( - executors, [=](auto &idx) { - size_t hash_table_ptr_offset = (idx * groups_count); + threads_count, [=](auto &idx) { + size_t hash_table_ptr_offset = (idx * hash_size); auto executor_keys_ptr = - keys_acc.get_pointer() + hash_table_ptr_offset; + ht_k.get_pointer() + hash_table_ptr_offset; auto executor_vals_ptr = - data_acc.get_pointer() + hash_table_ptr_offset; + ht_v.get_pointer() + hash_table_ptr_offset; LinearHashtable> ht( - groups_count, executor_keys_ptr, executor_vals_ptr, hasher, + hash_size, executor_keys_ptr, executor_vals_ptr, hasher, empty_element); - for (size_t i = work_per_executor * idx; - i < work_per_executor * (idx + 1) && i < buf_size; i++) + for (size_t i = idx; i < buf_size; i += threads_count) ht.add(sk[i], sv[i]); }); }).wait(); @@ -85,24 +61,22 @@ void GroupByLocal::_run(const size_t buf_size, Meter &meter) { auto group_by_end = std::chrono::steady_clock::now(); q.submit([&](sycl::handler &h) { - auto sv = src_vals.get_access(h); - auto sk = src_keys.get_access(h); + auto sv = src_vals_buf.get_access(h); + auto sk = src_keys_buf.get_access(h); - auto data_acc = data_buf.get_access(h); - auto keys_acc = keys_buf.get_access(h); + auto ht_v = ht_vals_buf.get_access(h); + auto ht_k = ht_keys_buf.get_access(h); auto o = out_buf.get_access(h); h.single_task([=]() { - for (int idx = 0; idx < executors; idx++) { - size_t hash_table_ptr_offset = (idx * groups_count); - auto executor_keys_ptr = - keys_acc.get_pointer() + hash_table_ptr_offset; - auto executor_vals_ptr = - data_acc.get_pointer() + hash_table_ptr_offset; + for (int idx = 0; idx < threads_count; idx++) { + size_t hash_table_ptr_offset = (idx * hash_size); + auto executor_keys_ptr = ht_k.get_pointer() + hash_table_ptr_offset; + auto executor_vals_ptr = ht_v.get_pointer() + hash_table_ptr_offset; LinearHashtable> ht( - groups_count, executor_keys_ptr, executor_vals_ptr, hasher, + hash_size, executor_keys_ptr, executor_vals_ptr, hasher, empty_element); for (int j = 0; j < groups_count; j++) @@ -117,26 +91,26 @@ void GroupByLocal::_run(const size_t buf_size, Meter &meter) { result->host_time = host_end - host_start; result->group_by_time = group_by_end - host_start; result->reduction_time = host_end - group_by_end; - out_buf.get_access(); - if (output != expected) { - std::cerr << "Incorrect results" << std::endl; - result->valid = false; - } + out_buf.get_access(); + result->valid = check_correctness(output); - DwarfParams params{{"buf_size", std::to_string(buf_size)}}; + DwarfParams params{{"buf_size", std::to_string(get_size(buf_size))}}; meter.add_result(std::move(params), std::move(result)); - } -} -void GroupByLocal::run(const RunOptions &opts) { - for (auto size : opts.input_size) { - _run(size, meter()); + q.submit([&](sycl::handler &h) { + auto o = out_buf.get_access(h); + auto ht_v = ht_vals_buf.get_access(h); + auto ht_k = ht_keys_buf.get_access(h); + + h.single_task([=]() { + for (size_t i = 0; i < hash_size * threads_count; i++) { + ht_v[i] = 0; + ht_k[i] = empty_element; + if (i < groups_count) + o[i] = 0; + } + }); + }).wait(); } } -void GroupByLocal::init(const RunOptions &opts) { - reporting_header_ = "total_time,group_by_time,reduction_time"; - meter().set_opts(opts); - DwarfParams params = {{"device_type", to_string(opts.device_ty)}}; - meter().set_params(params); -} diff --git a/groupby/groupby_local.hpp b/groupby/groupby_local.hpp index 3970301..96ab266 100644 --- a/groupby/groupby_local.hpp +++ b/groupby/groupby_local.hpp @@ -1,12 +1,11 @@ #pragma once #include "common/common.hpp" +#include "groupby.hpp" -class GroupByLocal : public Dwarf { +class GroupByLocal : public GroupBy { public: GroupByLocal(); - void run(const RunOptions &opts) override; - void init(const RunOptions &opts) override; private: - void _run(const size_t buffer_size, Meter &meter); + void _run(const size_t buffer_size, Meter &meter) override; }; diff --git a/groupby/perfect_groupby.cpp b/groupby/perfect_groupby.cpp new file mode 100644 index 0000000..1450f1e --- /dev/null +++ b/groupby/perfect_groupby.cpp @@ -0,0 +1,102 @@ +#include "perfect_groupby.hpp" +#include "common/dpcpp/dpcpp_common.hpp" +#include "common/dpcpp/perfect_hashtable.hpp" +#include + +PerfectGroupBy::PerfectGroupBy() : GroupBy("Perfect") {} + +void PerfectGroupBy::_run(const size_t buf_size, Meter &meter) { + uint32_t empty_element = _empty_element; + auto opts = static_cast(meter.opts()); + + const int groups_count = opts.groups_count; + size_t threads_count = opts.threads_count; + size_t work_group_size = opts.work_group_size; + generate_vals(buf_size); + generate_keys(buf_size, groups_count); + generate_expected(groups_count, add); + + auto sel = get_device_selector(opts); + sycl::queue q{*sel}; + std::cout << "Selected device: " + << q.get_device().get_info() << "\n"; + const size_t min_key = 0; + std::vector ht_vals(groups_count * threads_count, 0); + std::vector output(groups_count, 0); + + sycl::buffer ht_vals_buf(ht_vals); + sycl::buffer src_vals_buf(src_vals); + sycl::buffer src_keys_buf(src_keys); + sycl::buffer out_buf(output); + + for (auto it = 0; it < opts.iterations; ++it) { + auto host_start = std::chrono::steady_clock::now(); + q.submit([&](sycl::handler &h) { + auto sv = src_vals_buf.get_access(h); + auto sk = src_keys_buf.get_access(h); + + auto ht_v = ht_vals_buf.get_access(h); + + h.parallel_for( + sycl::nd_range{{threads_count}, {work_group_size}}, [=](auto &idx) { + PerfectHashTable ht( + groups_count, + ht_v.get_pointer() + idx.get_global_id() * groups_count, + min_key); + + for (size_t i = idx.get_global_id(); i < buf_size; + i += threads_count) + ht.add(sk[i], sv[i]); + }); + }).wait(); + + auto group_by_end = std::chrono::steady_clock::now(); + + q.submit([&](sycl::handler &h) { + auto sv = src_vals_buf.get_access(h); + auto sk = src_keys_buf.get_access(h); + + auto ht_v = ht_vals_buf.get_access(h); + + auto o = out_buf.get_access(h); + + h.single_task([=]() { + for (size_t idx = 0; idx < threads_count; idx++) { + PerfectHashTable ht( + groups_count, ht_v.get_pointer() + idx * groups_count, min_key); + + for (int j = 0; j < groups_count; j++) + o[j] += ht.at(j); + } + }); + }).wait(); + + auto host_end = std::chrono::steady_clock::now(); + std::unique_ptr result = + std::make_unique(); + result->host_time = host_end - host_start; + result->group_by_time = group_by_end - host_start; + result->reduction_time = host_end - group_by_end; + + out_buf.get_access(); + result->valid = check_correctness(output); + + DwarfParams params{{"buf_size", std::to_string(get_size(buf_size))}}; + meter.add_result(std::move(params), std::move(result)); + + q.submit([&](sycl::handler &h) { + auto o = out_buf.get_access(h); + auto ht_v = ht_vals_buf.get_access(h); + + h.single_task([=]() { + for (size_t i = 0; i < groups_count * threads_count; i++) { + ht_v[i] = 0; + if (i < groups_count) + o[i] = 0; + } + }); + }).wait(); + } +} + +size_t PerfectGroupBy::get_size(size_t buf_size) { return buf_size; } \ No newline at end of file diff --git a/groupby/perfect_groupby.hpp b/groupby/perfect_groupby.hpp new file mode 100644 index 0000000..705c340 --- /dev/null +++ b/groupby/perfect_groupby.hpp @@ -0,0 +1,12 @@ +#pragma once +#include "common/common.hpp" +#include "groupby.hpp" + +class PerfectGroupBy : public GroupBy { +public: + PerfectGroupBy(); + +private: + void _run(const size_t buffer_size, Meter &meter) override; + size_t get_size(size_t buf_size) override; +}; diff --git a/register_dwarfs.cpp b/register_dwarfs.cpp index cda2941..677a20a 100644 --- a/register_dwarfs.cpp +++ b/register_dwarfs.cpp @@ -1,8 +1,9 @@ #include "register_dwarfs.hpp" #include "common/registry.hpp" #include "constant/constant.hpp" -#include "groupby/groupby.hpp" +#include "groupby/groupby_global.hpp" #include "groupby/groupby_local.hpp" +#include "groupby/perfect_groupby.hpp" #include "hash/cuckoo_hash_build.hpp" #include "hash/hash_build.hpp" #include "hash/hash_build_non_bitmask.hpp" @@ -32,8 +33,9 @@ void populate_registry() { registry->registerd(new HashBuild()); registry->registerd(new NestedLoopJoin()); registry->registerd(new CuckooHashBuild()); - registry->registerd(new GroupBy()); + registry->registerd(new GroupByGlobal()); registry->registerd(new GroupByLocal()); + registry->registerd(new PerfectGroupBy()); registry->registerd(new Join()); registry->registerd(new HashBuildNonBitmask()); #ifdef EXPERIMENTAL diff --git a/tests/dwarf_tests/dwarf_tests.cpp b/tests/dwarf_tests/dwarf_tests.cpp index e63d615..3b7c10a 100644 --- a/tests/dwarf_tests/dwarf_tests.cpp +++ b/tests/dwarf_tests/dwarf_tests.cpp @@ -70,7 +70,8 @@ GENERATE_TEST_SUITE(ReduceDPCPP); GENERATE_TEST_SUITE(HashBuild); GENERATE_TEST_SUITE(NestedLoopJoin); GENERATE_TEST_SUITE(CuckooHashBuild); -GENERATE_TEST_SUITE_GROUPBY(GroupBy); +GENERATE_TEST_SUITE_GROUPBY(GroupByGlobal); +GENERATE_TEST_SUITE_GROUPBY(PerfectGroupBy); GENERATE_TEST_SUITE_GROUPBY(GroupByLocal); GENERATE_TEST_SUITE(Join); GENERATE_TEST_SUITE(HashBuildNonBitmask); diff --git a/tests/dwarf_tests/dwarfs.hpp b/tests/dwarf_tests/dwarfs.hpp index b9660a4..ecc8f44 100644 --- a/tests/dwarf_tests/dwarfs.hpp +++ b/tests/dwarf_tests/dwarfs.hpp @@ -2,7 +2,9 @@ #include "constant/constant.hpp" #include "groupby/groupby.hpp" +#include "groupby/groupby_global.hpp" #include "groupby/groupby_local.hpp" +#include "groupby/perfect_groupby.hpp" #include "hash/cuckoo_hash_build.hpp" #include "hash/hash_build.hpp" #include "hash/hash_build_non_bitmask.hpp" @@ -15,4 +17,4 @@ #include "reduce/reduce.hpp" #include "scan/scan.hpp" #include "sort/radix.hpp" -#include "sort/tbbsort.hpp" +#include "sort/tbbsort.hpp" \ No newline at end of file diff --git a/tests/dwarf_tests/utils.cpp b/tests/dwarf_tests/utils.cpp index b6f45e0..7b053c7 100644 --- a/tests/dwarf_tests/utils.cpp +++ b/tests/dwarf_tests/utils.cpp @@ -37,13 +37,13 @@ std::unique_ptr get_cpu_test_opts(size_t size) { } std::unique_ptr get_cpu_test_opts_groupby(size_t size) { - return std::make_unique(*get_cpu_test_opts(size), 64, - 1024); + return std::make_unique(*get_cpu_test_opts(size), 64, 1024, + 32); } std::unique_ptr get_gpu_test_opts_groupby(size_t size) { - return std::make_unique(*get_gpu_test_opts(size), 64, - 1024); + return std::make_unique(*get_gpu_test_opts(size), 64, 1024, + 32); } std::string get_kernels_root_tests() {