This repository was archived by the owner on May 9, 2024. It is now read-only.

Commit b4d67d9

akroviakov authored and kurapov-peter committed
Unify kernel creation & dispatch + heterogen flag
1 parent: da5cfe6 · commit: b4d67d9

File tree: 8 files changed (+267 additions, -575 deletions)

omniscidb/QueryEngine/CompilationOptions.cpp

Lines changed: 5 additions & 0 deletions
@@ -71,6 +71,11 @@ std::ostream& operator<<(std::ostream& os, const compiler::CallingConvDesc& desc
   }
 }
 
+std::ostream& operator<<(std::ostream& os, const ExecutorDispatchMode ddm) {
+  constexpr char const* strings[]{"KernelPerFragment", "MultifragmentKernel"};
+  return os << strings[static_cast<int>(ddm)];
+}
+
 std::ostream& operator<<(std::ostream& os,
                          const compiler::CodegenTraitsDescriptor& desc) {
   os << "{local=" << desc.local_addr_space_ << ",global=" << desc.global_addr_space_

omniscidb/QueryEngine/CompilationOptions.h

Lines changed: 1 addition & 0 deletions
@@ -201,6 +201,7 @@ std::ostream& operator<<(std::ostream& os, const compiler::CallingConvDesc& desc
 std::ostream& operator<<(std::ostream& os, const compiler::CodegenTraitsDescriptor& desc);
 std::ostream& operator<<(std::ostream& os, const ExecutorExplainType& eet);
 std::ostream& operator<<(std::ostream& os, const CompilationOptions& co);
+std::ostream& operator<<(std::ostream& os, const ExecutorDispatchMode ddm);
 #endif
 
 #endif  // QUERYENGINE_COMPILATIONOPTIONS_H

omniscidb/QueryEngine/CostModel/Dispatchers/ExecutionPolicy.h

Lines changed: 12 additions & 1 deletion
@@ -37,10 +37,21 @@ class ExecutionPolicy {
   virtual std::string name() const = 0;
 
   virtual ~ExecutionPolicy() = default;
+
+  // Probe/modify modes during kernel building (do not iterate). These are the default
+  // modes.
+  std::unordered_map<ExecutorDeviceType, ExecutorDispatchMode> devices_dispatch_modes{
+      {ExecutorDeviceType::CPU, ExecutorDispatchMode::KernelPerFragment},
+      {ExecutorDeviceType::GPU, ExecutorDispatchMode::KernelPerFragment}};
 };
 
 inline std::ostream& operator<<(std::ostream& os, const ExecutionPolicy& policy) {
-  return os << policy.name();
+  os << policy.name() << "\n";
+  os << "Dispatching modes: \n";
+  for (const auto& device_disp_mode : policy.devices_dispatch_modes) {
+    os << device_disp_mode.first << " - " << device_disp_mode.second << "\n";
+  }
+  return os;
 }
 
 }  // namespace policy
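Usage note: because devices_dispatch_modes is now a public member with per-device defaults, a concrete policy can be made heterogeneous by flipping one device type's mode. A hedged sketch, assuming the headers changed in this commit are included and that name() is the only pure virtual in the lines shown (the ToyPolicy class is hypothetical):

#include <iostream>
#include <string>

// Hypothetical concrete policy; ExecutionPolicy, ExecutorDeviceType and
// ExecutorDispatchMode come from the headers touched by this commit.
class ToyPolicy : public policy::ExecutionPolicy {
 public:
  std::string name() const override { return "ToyPolicy"; }
};

int main() {
  ToyPolicy p;
  // CPU keeps the KernelPerFragment default; GPU switches to one
  // multifragment kernel per device, i.e. a heterogeneous configuration.
  p.devices_dispatch_modes[ExecutorDeviceType::GPU] =
      ExecutorDispatchMode::MultifragmentKernel;
  std::cout << p;  // prints the name, then one "<device> - <mode>" line per map entry
}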

omniscidb/QueryEngine/Descriptors/QueryFragmentDescriptor.cpp

Lines changed: 75 additions & 165 deletions
Large diffs are not rendered by default.

omniscidb/QueryEngine/Descriptors/QueryFragmentDescriptor.h

Lines changed: 30 additions & 94 deletions
@@ -78,51 +78,35 @@ class QueryFragmentDescriptor {
   void buildFragmentKernelMap(const RelAlgExecutionUnit& ra_exe_unit,
                               const std::vector<uint64_t>& frag_offsets,
                               const policy::ExecutionPolicy* policy,
-                              const int device_count,
-                              const bool enable_multifrag_kernels,
                               Executor* executor,
                               compiler::CodegenTraitsDescriptor cgen_traits_desc);
 
   /**
-   * Dispatch multi-fragment kernels. Currently GPU only. Each GPU should have only one
-   * kernel, with multiple fragments in its fragments list.
+   * Dispatch according to policy
    */
   template <typename DISPATCH_FCN>
-  void assignFragsToMultiDispatch(DISPATCH_FCN f) const {
-    for (const auto& device_type_itr : execution_kernels_per_device_) {
-      for (const auto& device_itr : device_type_itr.second) {
-        const auto& execution_kernels = device_itr.second;
-        CHECK_EQ(execution_kernels.size(), size_t(1));
-
-        const auto& fragments_list = execution_kernels.front().fragments;
-        f(device_itr.first, fragments_list, rowid_lookup_key_);
-      }
-    }
-  }
-
-  template <typename DISPATCH_FCN>
-  void assignFragsToMultiHeterogeneousDispatch(
-      DISPATCH_FCN dispatcher_f,
-      const RelAlgExecutionUnit& ra_exe_unit) const {
-    std::unordered_map<int, size_t> cpu_execution_kernel_index;
+  void dispatchKernelsToDevices(DISPATCH_FCN dispatcher_f,
+                                const RelAlgExecutionUnit& ra_exe_unit,
+                                policy::ExecutionPolicy* policy) const {
+    std::unordered_map<ExecutorDeviceType, std::unordered_map<int, size_t>>
+        execution_kernel_index;
     size_t tuple_count = 0;
-
-    if (execution_kernels_per_device_.count(ExecutorDeviceType::CPU)) {
-      cpu_execution_kernel_index.reserve(
-          execution_kernels_per_device_.at(ExecutorDeviceType::CPU).size());
-      for (const auto& device_itr :
-           execution_kernels_per_device_.at(ExecutorDeviceType::CPU)) {
-        CHECK(
-            cpu_execution_kernel_index.insert(std::make_pair(device_itr.first, size_t(0)))
-                .second);
+    for (const auto& device_type_itr : execution_kernels_per_device_) {
+      if (policy->devices_dispatch_modes.at(device_type_itr.first) ==
+          ExecutorDispatchMode::KernelPerFragment) {
+        for (const auto& device_itr : device_type_itr.second) {
+          CHECK(execution_kernel_index[device_type_itr.first]
+                    .insert(std::make_pair(device_itr.first, size_t(0)))
+                    .second);
+        }
       }
     }
 
     for (const auto& device_type_itr : execution_kernels_per_device_) {
-      if (device_type_itr.first == ExecutorDeviceType::GPU) {
+      if (policy->devices_dispatch_modes.at(device_type_itr.first) ==
+          ExecutorDispatchMode::MultifragmentKernel) {
         for (const auto& device_itr : device_type_itr.second) {
           const auto& execution_kernels = device_itr.second;
-          CHECK_EQ(execution_kernels.size(), size_t(1));
           const auto& fragments_list = execution_kernels.front().fragments;
           dispatcher_f(
               device_itr.first, fragments_list, rowid_lookup_key_, device_type_itr.first);
@@ -131,71 +115,27 @@ class QueryFragmentDescriptor {
         bool dispatch_finished = false;
         while (!dispatch_finished) {
           dispatch_finished = true;
-          for (const auto& device_itr : device_type_itr.second) {
-            auto& kernel_idx = cpu_execution_kernel_index[device_itr.first];
-            if (kernel_idx < device_itr.second.size()) {
-              dispatch_finished = false;
-              const auto& execution_kernel = device_itr.second[kernel_idx++];
-              dispatcher_f(device_itr.first,
-                           execution_kernel.fragments,
-                           rowid_lookup_key_,
-                           device_type_itr.first);
-              if (terminateDispatchMaybe(tuple_count, ra_exe_unit, execution_kernel)) {
-                return;
+          for (const auto& device_type_itr : execution_kernels_per_device_)
+            for (const auto& device_itr : device_type_itr.second) {
+              auto& kernel_idx =
+                  execution_kernel_index[device_type_itr.first][device_itr.first];
+              if (kernel_idx < device_itr.second.size()) {
+                dispatch_finished = false;
+                const auto& execution_kernel = device_itr.second[kernel_idx++];
+                dispatcher_f(device_itr.first,
                             execution_kernel.fragments,
                             rowid_lookup_key_,
                             device_type_itr.first);
+                if (terminateDispatchMaybe(tuple_count, ra_exe_unit, execution_kernel)) {
+                  return;
+                }
              }
            }
-          }
        }
      }
    }
  }
 
-  /**
-   * Dispatch one fragment for each device. Iterate the device map and dispatch one kernel
-   * for each device per iteration. This allows balanced dispatch as well as early
-   * termination if the number of rows passing the kernel can be computed at dispatch time
-   * and the scan limit is reached.
-   */
-  template <typename DISPATCH_FCN>
-  void assignFragsToKernelDispatch(DISPATCH_FCN f,
-                                   const RelAlgExecutionUnit& ra_exe_unit) const {
-    if (execution_kernels_per_device_.empty()) {
-      return;
-    }
-
-    size_t tuple_count = 0;
-
-    std::map<ExecutorDeviceType, std::unordered_map<int, size_t>> execution_kernel_index;
-    for (const auto& device_type_itr : execution_kernels_per_device_) {
-      for (const auto& device_itr : device_type_itr.second) {
-        CHECK(execution_kernel_index[device_type_itr.first]
-                  .insert(std::make_pair(device_itr.first, size_t(0)))
-                  .second);
-      }
-    }
-
-    bool dispatch_finished = false;
-    while (!dispatch_finished) {
-      dispatch_finished = true;
-      for (const auto& device_type_itr : execution_kernels_per_device_)
-        for (const auto& device_itr : device_type_itr.second) {
-          auto& kernel_idx =
-              execution_kernel_index[device_type_itr.first][device_itr.first];
-          if (kernel_idx < device_itr.second.size()) {
-            dispatch_finished = false;
-            const auto& execution_kernel = device_itr.second[kernel_idx++];
-            f(device_itr.first,
-              execution_kernel.fragments,
-              rowid_lookup_key_,
-              device_type_itr.first);
-            if (terminateDispatchMaybe(tuple_count, ra_exe_unit, execution_kernel)) {
-              return;
-            }
-          }
-        }
-    }
-  }
-
   bool shouldCheckWorkUnitWatchdog() const {
     return rowid_lookup_key_ < 0 && !execution_kernels_per_device_.empty();
   }
@@ -218,23 +158,20 @@ class QueryFragmentDescriptor {
       const RelAlgExecutionUnit& ra_exe_unit,
       const std::vector<uint64_t>& frag_offsets,
      const policy::ExecutionPolicy* policy,
-      const int device_count,
      const size_t num_bytes_for_row,
      Executor* executor,
      compiler::CodegenTraitsDescriptor cgen_traits_desc);
 
   void buildFragmentPerKernelMap(const RelAlgExecutionUnit& ra_exe_unit,
                                  const std::vector<uint64_t>& frag_offsets,
                                  const policy::ExecutionPolicy* policy,
-                                 const int device_count,
                                  const size_t num_bytes_for_row,
                                  Executor* executor,
                                  compiler::CodegenTraitsDescriptor cgen_traits_desc);
 
   void buildMultifragKernelMap(const RelAlgExecutionUnit& ra_exe_unit,
                                const std::vector<uint64_t>& frag_offsets,
                                const policy::ExecutionPolicy* policy,
-                               const int device_count,
                               const size_t num_bytes_for_row,
                               Executor* executor,
                               compiler::CodegenTraitsDescriptor cgen_traits_desc);
@@ -244,7 +181,6 @@ class QueryFragmentDescriptor {
       const InputDescriptor& table_desc,
       const std::vector<uint64_t>& frag_offsets,
       const policy::ExecutionPolicy* policy,
-      const int device_count,
       const size_t num_bytes_for_row,
       const std::optional<size_t> table_desc_offset,
       Executor* executor,
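Usage note: call sites that previously picked between assignFragsToKernelDispatch, assignFragsToMultiDispatch, and assignFragsToMultiHeterogeneousDispatch now funnel through the single template, with the per-fragment vs. multifragment decision read from the policy. A hedged sketch of a caller (frag_descriptor, ra_exe_unit, exec_policy, and launchKernel are placeholder names; the lambda's parameter list mirrors the four arguments dispatcher_f receives in the diff, and the exact fragment-list and key types are assumptions):

// Hypothetical call site: one lambda serves both dispatch modes, since the
// mode decision moved into the policy's devices_dispatch_modes map.
frag_descriptor.dispatchKernelsToDevices(
    [&](int device_id,
        const FragmentsList& frag_list,
        const int64_t rowid_lookup_key,
        const ExecutorDeviceType device_type) {
      // Build and enqueue one execution kernel per callback invocation.
      launchKernel(device_type, device_id, frag_list, rowid_lookup_key);
    },
    ra_exe_unit,
    exec_policy);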
