From fdfc379454474c3b00e40b7727267f6ec130ffab Mon Sep 17 00:00:00 2001 From: Laurent Montigny Date: Thu, 13 Apr 2023 11:06:21 -0500 Subject: [PATCH 1/9] Add files for GPU SM Intel testing --- omniscidb/Tests/CMakeLists.txt | 4 + omniscidb/Tests/GpuSharedMemoryTestIntel.cpp | 683 +++++++++++++++++++ omniscidb/Tests/GpuSharedMemoryTestIntel.h | 89 +++ 3 files changed, 776 insertions(+) create mode 100644 omniscidb/Tests/GpuSharedMemoryTestIntel.cpp create mode 100644 omniscidb/Tests/GpuSharedMemoryTestIntel.h diff --git a/omniscidb/Tests/CMakeLists.txt b/omniscidb/Tests/CMakeLists.txt index 0de06ebb50..0d2734695f 100644 --- a/omniscidb/Tests/CMakeLists.txt +++ b/omniscidb/Tests/CMakeLists.txt @@ -65,16 +65,19 @@ if(ENABLE_L0) add_executable(SpirvBuildTest SpirvBuildTest.cpp) add_executable(DataMgrWithL0Test DataMgrWithL0Test.cpp) add_executable(IntelGPUEnablingTest IntelGPUEnablingTest.cpp) + add_executable(GpuSharedMemoryTestIntel GpuSharedMemoryTestIntel.cpp) target_link_libraries(L0MgrExecuteTest L0Mgr gtest ${llvm_libs} Logger OSDependent) target_link_libraries(SpirvBuildTest gtest ${llvm_libs}) target_link_libraries(DataMgrWithL0Test DataMgr gtest) target_link_libraries(IntelGPUEnablingTest gtest QueryEngine ArrowQueryRunner) + target_link_libraries(GpuSharedMemoryTestIntel gtest QueryEngine ArrowQueryRunner) add_test(L0MgrExecuteTest L0MgrExecuteTest ${TEST_ARGS}) add_test(SpirvBuildTest SpirvBuildTest ${TEST_ARGS}) add_test(DataMgrWithL0Test DataMgrWithL0Test ${TEST_ARGS}) add_test(IntelGPUEnablingTest IntelGPUEnablingTest ${TEST_ARGS}) + add_test(GpuSharedMemoryTestIntel GpuSharedMemoryTestIntel ${TEST_ARGS}) endif() add_executable(CostModelTest CostModel/CostModelTest.cpp) @@ -286,6 +289,7 @@ if(ENABLE_L0) SpirvBuildTest L0MgrExecuteTest IntelGPUEnablingTest + GpuSharedMemoryTestIntel ) set_tests_properties(${ENABLING_TESTS} PROPERTIES LABELS "enabling") diff --git a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp new file mode 100644 index 0000000000..a4da9c48de --- /dev/null +++ b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp @@ -0,0 +1,683 @@ +/* + * Copyright 2019 OmniSci, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "GpuSharedMemoryTest.h"
+#include "QueryEngine/CompilationOptions.h"
+#include "QueryEngine/LLVMGlobalContext.h"
+#include "QueryEngine/OutputBufferInitialization.h"
+#include "QueryEngine/ResultSetReduction.h"
+#include "QueryEngine/ResultSetReductionJIT.h"
+
+extern bool g_is_test_env;
+
+auto int8_type = hdk::ir::Context::defaultCtx().int8();
+auto int16_type = hdk::ir::Context::defaultCtx().int16();
+auto int32_type = hdk::ir::Context::defaultCtx().int32();
+auto int64_type = hdk::ir::Context::defaultCtx().int64();
+auto float_type = hdk::ir::Context::defaultCtx().fp32();
+auto double_type = hdk::ir::Context::defaultCtx().fp64();
+
+namespace {
+
+void init_storage_buffer(int8_t* buffer,
+                         const std::vector<TargetInfo>& targets,
+                         const QueryMemoryDescriptor& query_mem_desc) {
+  // get the initial values for all the aggregate columns
+  const auto init_agg_vals = init_agg_val_vec(targets, query_mem_desc);
+  CHECK(!query_mem_desc.didOutputColumnar());
+  CHECK(query_mem_desc.getQueryDescriptionType() ==
+        QueryDescriptionType::GroupByPerfectHash);
+
+  const auto row_size = query_mem_desc.getRowSize();
+  CHECK(query_mem_desc.hasKeylessHash());
+  for (size_t entry_idx = 0; entry_idx < query_mem_desc.getEntryCount(); ++entry_idx) {
+    const auto row_ptr = buffer + entry_idx * row_size;
+    size_t init_agg_idx{0};
+    int64_t init_val{0};
+    // initialize each row's aggregate columns:
+    auto col_ptr = row_ptr + query_mem_desc.getColOffInBytes(0);
+    for (size_t slot_idx = 0; slot_idx < query_mem_desc.getSlotCount(); slot_idx++) {
+      if (query_mem_desc.getPaddedSlotWidthBytes(slot_idx) > 0) {
+        init_val = init_agg_vals[init_agg_idx++];
+      }
+      switch (query_mem_desc.getPaddedSlotWidthBytes(slot_idx)) {
+        case 4:
+          *reinterpret_cast<int32_t*>(col_ptr) = static_cast<int32_t>(init_val);
+          break;
+        case 8:
+          *reinterpret_cast<int64_t*>(col_ptr) = init_val;
+          break;
+        case 0:
+          break;
+        default:
+          UNREACHABLE();
+      }
+      col_ptr += query_mem_desc.getNextColOffInBytes(col_ptr, entry_idx, slot_idx);
+    }
+  }
+}
+
+}  // namespace
+
+void GpuReductionTester::codegenWrapperKernel() {
+  const unsigned address_space = 0;
+  auto pi8_type = llvm::Type::getInt8PtrTy(context_, address_space);
+  std::vector<llvm::Type*> input_arguments;
+  input_arguments.push_back(llvm::PointerType::get(pi8_type, address_space));
+  input_arguments.push_back(llvm::Type::getInt64Ty(context_));  // num input buffers
+  input_arguments.push_back(llvm::Type::getInt8PtrTy(context_, address_space));
+
+  llvm::FunctionType* ft =
+      llvm::FunctionType::get(llvm::Type::getVoidTy(context_), input_arguments, false);
+  wrapper_kernel_ = llvm::Function::Create(
+      ft, llvm::Function::ExternalLinkage, "wrapper_kernel", module_);
+
+  auto arg_it = wrapper_kernel_->arg_begin();
+  auto input_ptrs = &*arg_it;
+  input_ptrs->setName("input_pointers");
+  arg_it++;
+  auto num_buffers = &*arg_it;
+  num_buffers->setName("num_buffers");
+  arg_it++;
+  auto output_buffer = &*arg_it;
+  output_buffer->setName("output_buffer");
+
+  llvm::IRBuilder<> ir_builder(context_);
+
+  auto bb_entry = llvm::BasicBlock::Create(context_, ".entry", wrapper_kernel_);
+  auto bb_body = llvm::BasicBlock::Create(context_, ".body", wrapper_kernel_);
+  auto bb_exit = llvm::BasicBlock::Create(context_, ".exit", wrapper_kernel_);
+
+  // return if blockIdx.x >= num_buffers
+  ir_builder.SetInsertPoint(bb_entry);
+  auto get_block_index_func = getFunction("get_block_index");
+  auto block_index = ir_builder.CreateCall(get_block_index_func, {}, "block_index");
+  const auto is_block_inbound =
+      ir_builder.CreateICmpSLT(block_index,
+                               num_buffers, "is_block_inbound");
+  ir_builder.CreateCondBr(is_block_inbound, bb_body, bb_exit);
+
+  // locate the corresponding input buffer:
+  ir_builder.SetInsertPoint(bb_body);
+  auto input_buffer_gep = ir_builder.CreateGEP(
+      input_ptrs->getType()->getScalarType()->getPointerElementType(),
+      input_ptrs,
+      block_index);
+  auto input_buffer = ir_builder.CreateLoad(
+      llvm::Type::getInt8PtrTy(context_, address_space), input_buffer_gep);
+  auto input_buffer_ptr =
+      ir_builder.CreatePointerCast(input_buffer,
+                                   llvm::Type::getInt64PtrTy(context_, address_space),
+                                   "input_buffer_ptr");
+  const auto buffer_size = ll_int(
+      static_cast<int32_t>(query_mem_desc_.getBufferSizeBytes(ExecutorDeviceType::GPU)),
+      context_);
+
+  // initializing shared memory and copy input buffer into shared memory buffer:
+  auto init_smem_func = getFunction("init_shared_mem");
+  auto smem_input_buffer_ptr = ir_builder.CreateCall(init_smem_func,
+                                                     {
+                                                         input_buffer_ptr,
+                                                         buffer_size,
+                                                     },
+                                                     "smem_input_buffer_ptr");
+
+  auto output_buffer_ptr =
+      ir_builder.CreatePointerCast(output_buffer,
+                                   llvm::Type::getInt64PtrTy(context_, address_space),
+                                   "output_buffer_ptr");
+  // call the reduction function
+  CHECK(reduction_func_);
+  std::vector<llvm::Value*> reduction_args{
+      output_buffer_ptr, smem_input_buffer_ptr, buffer_size};
+  ir_builder.CreateCall(reduction_func_, reduction_args);
+  ir_builder.CreateBr(bb_exit);
+
+  ir_builder.SetInsertPoint(bb_exit);
+  ir_builder.CreateRet(nullptr);
+}
+
+namespace {
+void prepare_generated_gpu_kernel(llvm::Module* module,
+                                  llvm::LLVMContext& context,
+                                  llvm::Function* kernel) {
+  // might be extra, remove and clean up
+  module->setDataLayout(
+      "e-p:64:64:64-i1:8:8-i8:8:8-"
+      "i16:16:16-i32:32:32-i64:64:64-"
+      "f32:32:32-f64:64:64-v16:16:16-"
+      "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
+  module->setTargetTriple("nvptx64-nvidia-cuda");
+
+  llvm::NamedMDNode* md = module->getOrInsertNamedMetadata("nvvm.annotations");
+
+  llvm::Metadata* md_vals[] = {llvm::ConstantAsMetadata::get(kernel),
+                               llvm::MDString::get(context, "kernel"),
+                               llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+                                   llvm::Type::getInt32Ty(context), 1))};
+
+  // Append metadata to nvvm.annotations
+  md->addOperand(llvm::MDNode::get(context, md_vals));
+}
+
+std::unique_ptr<GpuDeviceCompilationContext> compile_and_link_gpu_code(
+    const std::string& cuda_llir,
+    llvm::Module* module,
+    // CudaMgr_Namespace::CudaMgr* cuda_mgr,
+    const std::string& kernel_name,
+    const size_t gpu_block_size = 1024,
+    const size_t gpu_device_idx = 0) {
+  CHECK(module);
+  // CHECK(cuda_mgr);
+  // auto& context = module->getContext();
+  // std::unique_ptr nvptx_target_machine =
+  //     compiler::CUDABackend::initializeNVPTXBackend(cuda_mgr->getDeviceArch());
+  // const auto ptx =
+  //     compiler::CUDABackend::generatePTX(cuda_llir, nvptx_target_machine.get(),
+  //     context);
+
+  // auto cubin_result = ptx_to_cubin(ptx, gpu_block_size, cuda_mgr);
+  // auto& option_keys = cubin_result.option_keys;
+  // auto& option_values = cubin_result.option_values;
+  // auto cubin = cubin_result.cubin;
+  // auto link_state = cubin_result.link_state;
+  // const auto num_options = option_keys.size();
+  // auto gpu_context = std::make_unique(cubin,
+  //                                     kernel_name,
+  //                                     gpu_device_idx,
+  //                                     cuda_mgr,
+  //                                     num_options,
+  //                                     &option_keys[0],
+  //                                     &option_values[0]);
+
+  // checkCudaErrors(cuLinkDestroy(link_state)); @LM
+  // return gpu_context;
+  return NULL;
+}
+
+std::vector<std::unique_ptr<ResultSet>> create_and_fill_input_result_sets(
+    const size_t num_input_buffers,
+    std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
+    const QueryMemoryDescriptor& query_mem_desc,
+    const std::vector<TargetInfo>& target_infos,
+    std::vector<StrideNumberGenerator>& generators,
+    const std::vector<size_t>& steps) {
+  std::vector<std::unique_ptr<ResultSet>> result_sets;
+  for (size_t i = 0; i < num_input_buffers; i++) {
+    result_sets.push_back(std::make_unique<ResultSet>(target_infos,
+                                                      ExecutorDeviceType::CPU,
+                                                      query_mem_desc,
+                                                      row_set_mem_owner,
+                                                      nullptr,
+                                                      0,
+                                                      0));
+    const auto storage = result_sets.back()->allocateStorage();
+    // fill_storage_buffer(storage->getUnderlyingBuffer(),
+    //                     target_infos,
+    //                     query_mem_desc,
+    //                     generators[i],
+    //                     steps[i]);
+  }
+  return result_sets;
+}
+
+std::pair<std::unique_ptr<ResultSet>, std::unique_ptr<ResultSet>>
+create_and_init_output_result_sets(std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
+                                   const QueryMemoryDescriptor& query_mem_desc,
+                                   const std::vector<TargetInfo>& target_infos) {
+  // CPU result set, will eventually host CPU reduction results for validation
+  auto cpu_result_set = std::make_unique<ResultSet>(target_infos,
+                                                    ExecutorDeviceType::CPU,
+                                                    query_mem_desc,
+                                                    row_set_mem_owner,
+                                                    nullptr,
+                                                    0,
+                                                    0);
+  auto cpu_storage_result = cpu_result_set->allocateStorage();
+  init_storage_buffer(
+      cpu_storage_result->getUnderlyingBuffer(), target_infos, query_mem_desc);
+
+  // GPU result set, will eventually host GPU reduction results
+  auto gpu_result_set = std::make_unique<ResultSet>(target_infos,
+                                                    ExecutorDeviceType::GPU,
+                                                    query_mem_desc,
+                                                    row_set_mem_owner,
+                                                    nullptr,
+                                                    0,
+                                                    0);
+  auto gpu_storage_result = gpu_result_set->allocateStorage();
+  init_storage_buffer(
+      gpu_storage_result->getUnderlyingBuffer(), target_infos, query_mem_desc);
+  return std::make_pair(std::move(cpu_result_set), std::move(gpu_result_set));
+}
+
+void perform_reduction_on_cpu(std::vector<std::unique_ptr<ResultSet>>& result_sets,
+                              const ResultSetStorage* cpu_result_storage) {
+  CHECK(result_sets.size() > 0);
+  Config config;
+  // for codegen only
+  // auto executor = Executor::getExecutor(nullptr);
+  // ResultSetReductionJIT reduction_jit(result_sets.front()->getQueryMemDesc(),
+  //                                     result_sets.front()->getTargetInfos(),
+  //                                     result_sets.front()->getTargetInitVals(),
+  //                                     config,
+  //                                     executor.get());
+  // const auto reduction_code = reduction_jit.codegen();
+  // for (auto& result_set : result_sets) {
+  //   ResultSetReduction::reduce(*cpu_result_storage,
+  //                              *(result_set->getStorage()),
+  //                              {},
+  //                              reduction_code,
+  //                              config,
+  //                              executor.get());
+  // }
+}
+
+struct TestInputData {
+  size_t device_id;
+  size_t num_input_buffers;
+  std::vector<TargetInfo> target_infos;
+  int8_t suggested_agg_widths;
+  size_t min_entry;
+  size_t max_entry;
+  size_t step_size;
+  bool keyless_hash;
+  int32_t target_index_for_key;
+  TestInputData()
+      : device_id(0)
+      , num_input_buffers(0)
+      , suggested_agg_widths(0)
+      , min_entry(0)
+      , max_entry(0)
+      , step_size(2)
+      , keyless_hash(false)
+      , target_index_for_key(0) {}
+  TestInputData& setDeviceId(const size_t id) {
+    device_id = id;
+    return *this;
+  }
+  TestInputData& setNumInputBuffers(size_t num_buffers) {
+    num_input_buffers = num_buffers;
+    return *this;
+  }
+  TestInputData& setTargetInfos(std::vector<TargetInfo> tis) {
+    target_infos = tis;
+    return *this;
+  }
+  TestInputData& setAggWidth(int8_t agg_width) {
+    suggested_agg_widths = agg_width;
+    return *this;
+  }
+  TestInputData& setMinEntry(size_t min_e) {
+    min_entry = min_e;
+    return *this;
+  }
+  TestInputData& setMaxEntry(size_t max_e) {
+    max_entry = max_e;
+    return *this;
+  }
+  TestInputData& setKeylessHash(bool is_keyless) {
+    keyless_hash = is_keyless;
+    return *this;
+  }
+  TestInputData& setTargetIndexForKey(size_t target_idx) {
+    target_index_for_key = target_idx;
+    return *this;
+  }
+  TestInputData& setStepSize(size_t step) {
+    step_size = step;
+    return *this;
+  }
+};
+
+void perform_test_and_verify_results(TestInputData input) {
+  auto executor = Executor::getExecutor(nullptr, nullptr);
+  auto& context = executor->getContext();
+  auto cgen_state = std::unique_ptr<CgenState>(
+      new CgenState({}, false, false, executor->getExtensionModuleContext(), context));
+  cgen_state->set_module_shallow_copy(
+      executor->getExtensionModuleContext()->getRTModule(/*is_l0=*/false));
+  // auto module = cgen_state->module_;
+  // module->setDataLayout(
+  //     "e-p:64:64:64-i1:8:8-i8:8:8-"
+  //     "i16:16:16-i32:32:32-i64:64:64-"
+  //     "f32:32:32-f64:64:64-v16:16:16-"
+  //     "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
+  // module->setTargetTriple("nvptx64-nvidia-cuda");
+  // auto cuda_mgr = std::make_unique(1);
+  // const auto row_set_mem_owner =
+  //     std::make_shared(nullptr, Executor::getArenaBlockSize());
+  // auto query_mem_desc = perfect_hash_one_col_desc(
+  //     input.target_infos, input.suggested_agg_widths, input.min_entry,
+  //     input.max_entry);
+  // if (input.keyless_hash) {
+  //   query_mem_desc.setHasKeylessHash(true);
+  //   query_mem_desc.setTargetIdxForKey(input.target_index_for_key);
+  // }
+
+  // std::vector generators(
+  //     input.num_input_buffers, StrideNumberGenerator(1, input.step_size));
+  // std::vector steps(input.num_input_buffers, input.step_size);
+  // auto input_result_sets = create_and_fill_input_result_sets(input.num_input_buffers,
+  //                                                            row_set_mem_owner,
+  //                                                            query_mem_desc,
+  //                                                            input.target_infos,
+  //                                                            generators,
+  //                                                            steps);
+
+  // const auto [cpu_result_set, gpu_result_set] = create_and_init_output_result_sets(
+  //     row_set_mem_owner, query_mem_desc, input.target_infos);
+
+  // // performing reduction using the GPU reduction code:
+  // Config config;
+  // GpuReductionTester gpu_smem_tester(config,
+  //                                    module,
+  //                                    context,
+  //                                    query_mem_desc,
+  //                                    input.target_infos,
+  //                                    init_agg_val_vec(input.target_infos,
+  //                                    query_mem_desc), cuda_mgr.get(), executor.get());
+  // gpu_smem_tester.codegen(CompilationOptions::defaults(
+  //     ExecutorDeviceType::GPU,
+  //     false));  // generate code for gpu reduction and initialization
+  // gpu_smem_tester.codegenWrapperKernel();
+  // gpu_smem_tester.performReductionTest(
+  //     input_result_sets, gpu_result_set->getStorage(), input.device_id);
+
+  // // CPU reduction for validation:
+  // perform_reduction_on_cpu(input_result_sets, cpu_result_set->getStorage());
+
+  // const auto cmp_result =
+  //     std::memcmp(cpu_result_set->getStorage()->getUnderlyingBuffer(),
+  //                 gpu_result_set->getStorage()->getUnderlyingBuffer(),
+  //                 query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU));
+  // ASSERT_EQ(cmp_result, 0);
+}
+
+}  // namespace
+
+void GpuReductionTester::performReductionTest(
+    const std::vector<std::unique_ptr<ResultSet>>& result_sets,
+    const ResultSetStorage* gpu_result_storage,
+    const size_t device_id) {
+  prepare_generated_gpu_kernel(module_, context_, getWrapperKernel());
+
+  std::stringstream ss;
+  llvm::raw_os_ostream os(ss);
+  module_->print(os, nullptr);
+  os.flush();
+  std::string module_str(ss.str());
+
+  // std::unique_ptr gpu_context(compile_and_link_gpu_code(
+  //     module_str, module_, cuda_mgr_, getWrapperKernel()->getName().str()));
+
+  // const auto buffer_size = query_mem_desc_.getBufferSizeBytes(ExecutorDeviceType::GPU);
+  // const size_t num_buffers = result_sets.size();
+  // std::vector d_input_buffers;
+  // for (size_t i = 0; i < num_buffers; i++) {
+  //   d_input_buffers.push_back(cuda_mgr_->allocateDeviceMem(buffer_size, device_id));
+  // 
cuda_mgr_->copyHostToDevice(d_input_buffers[i], + // result_sets[i]->getStorage()->getUnderlyingBuffer(), + // buffer_size, + // device_id); + // } + + // constexpr size_t num_kernel_params = 3; + // CHECK_EQ(getWrapperKernel()->arg_size(), num_kernel_params); + + // // parameter 1: an array of device pointers + // std::vector h_input_buffer_dptrs; + // h_input_buffer_dptrs.reserve(num_buffers); + // std::transform(d_input_buffers.begin(), + // d_input_buffers.end(), + // std::back_inserter(h_input_buffer_dptrs), + // [](int8_t* dptr) { return reinterpret_cast(dptr); }); + + // auto d_input_buffer_dptrs = + // cuda_mgr_->allocateDeviceMem(num_buffers * sizeof(CUdeviceptr), device_id); + // cuda_mgr_->copyHostToDevice(d_input_buffer_dptrs, + // reinterpret_cast(h_input_buffer_dptrs.data()), + // num_buffers * sizeof(CUdeviceptr), + // device_id); + + // // parameter 2: number of buffers + // auto d_num_buffers = cuda_mgr_->allocateDeviceMem(sizeof(int64_t), device_id); + // cuda_mgr_->copyHostToDevice(d_num_buffers, + // reinterpret_cast(&num_buffers), + // sizeof(int64_t), + // device_id); + + // // parameter 3: device pointer to the output buffer + // auto d_result_buffer = cuda_mgr_->allocateDeviceMem(buffer_size, device_id); + // cuda_mgr_->copyHostToDevice( + // d_result_buffer, gpu_result_storage->getUnderlyingBuffer(), buffer_size, + // device_id); + + // // collecting all kernel parameters: + // std::vector h_kernel_params{ + // reinterpret_cast(d_input_buffer_dptrs), + // reinterpret_cast(d_num_buffers), + // reinterpret_cast(d_result_buffer)}; + + // // casting each kernel parameter to be a void* device ptr itself: + // std::vector kernel_param_ptrs; + // kernel_param_ptrs.reserve(num_kernel_params); + // std::transform(h_kernel_params.begin(), + // h_kernel_params.end(), + // std::back_inserter(kernel_param_ptrs), + // [](CUdeviceptr& param) { return ¶m; }); + + // // launching a kernel: + // auto cu_func = static_cast(gpu_context->kernel()); + // // we launch as many threadblocks as there are input buffers: + // // in other words, each input buffer is handled by a single threadblock. 
+ + // // checkCudaErrors(cuLaunchKernel(cu_func, + // // num_buffers, + // // 1, + // // 1, + // // 1024, + // // 1, + // // 1, + // // buffer_size, + // // 0, + // // kernel_param_ptrs.data(), + // // nullptr)); + + // // transfer back the results: + // cuda_mgr_->copyDeviceToHost( + // gpu_result_storage->getUnderlyingBuffer(), d_result_buffer, buffer_size, + // device_id); + + // // release the gpu memory used: + // for (auto& d_buffer : d_input_buffers) { + // cuda_mgr_->freeDeviceMem(d_buffer); + // } + // cuda_mgr_->freeDeviceMem(d_input_buffer_dptrs); + // cuda_mgr_->freeDeviceMem(d_num_buffers); + // cuda_mgr_->freeDeviceMem(d_result_buffer); +} + +// TEST(SingleColumn, VariableEntries_CountQuery_4B_Group) { +// for (auto num_entries : {1, 2, 3, 5, 13, 31, 63, 126, 241, 511, 1021}) { +// TestInputData input; +// input.setDeviceId(0) +// .setNumInputBuffers(4) +// .setTargetInfos(generate_custom_agg_target_infos( +// {4}, {hdk::ir::AggType::kCount}, {int32_type}, {int32_type})) +// .setAggWidth(4) +// .setMinEntry(0) +// .setMaxEntry(num_entries) +// .setStepSize(2) +// .setKeylessHash(true) +// .setTargetIndexForKey(0); +// perform_test_and_verify_results(input); +// } +// } + +// TEST(SingleColumn, VariableEntries_CountQuery_8B_Group) { +// for (auto num_entries : {1, 2, 3, 5, 13, 31, 63, 126, 241, 511, 1021}) { +// TestInputData input; +// input.setDeviceId(0) +// .setNumInputBuffers(4) +// .setTargetInfos(generate_custom_agg_target_infos( +// {8}, {hdk::ir::AggType::kCount}, {int64_type}, {int64_type})) +// .setAggWidth(8) +// .setMinEntry(0) +// .setMaxEntry(num_entries) +// .setStepSize(2) +// .setKeylessHash(true) +// .setTargetIndexForKey(0); +// perform_test_and_verify_results(input); +// } +// } + +// TEST(SingleColumn, VariableSteps_FixedEntries_1) { +// TestInputData input; +// input.setDeviceId(0) +// .setNumInputBuffers(4) +// .setAggWidth(8) +// .setMinEntry(0) +// .setMaxEntry(126) +// .setKeylessHash(true) +// .setTargetIndexForKey(0) +// .setTargetInfos(generate_custom_agg_target_infos( +// {8}, +// {hdk::ir::AggType::kCount, +// hdk::ir::AggType::kMax, +// hdk::ir::AggType::kMin, +// hdk::ir::AggType::kSum, +// hdk::ir::AggType::kAvg}, +// {int64_type, int64_type, int64_type, int64_type, double_type}, +// {int32_type, int32_type, int32_type, int32_type, int32_type})); + +// for (auto& step_size : {2, 3, 5, 7, 11, 13}) { +// input.setStepSize(step_size); +// perform_test_and_verify_results(input); +// } +// } + +// TEST(SingleColumn, VariableSteps_FixedEntries_2) { +// TestInputData input; +// input.setDeviceId(0) +// .setNumInputBuffers(4) +// .setAggWidth(8) +// .setMinEntry(0) +// .setMaxEntry(126) +// .setKeylessHash(true) +// .setTargetIndexForKey(0) +// .setTargetInfos(generate_custom_agg_target_infos( +// {8}, +// {hdk::ir::AggType::kCount, +// hdk::ir::AggType::kAvg, +// hdk::ir::AggType::kMax, +// hdk::ir::AggType::kSum, +// hdk::ir::AggType::kMin}, +// {int64_type, double_type, int64_type, int64_type, int64_type}, +// {int32_type, int32_type, int32_type, int32_type, int32_type})); + +// for (auto& step_size : {2, 3, 5, 7, 11, 13}) { +// input.setStepSize(step_size); +// perform_test_and_verify_results(input); +// } +// } + +// TEST(SingleColumn, VariableSteps_FixedEntries_3) { +// TestInputData input; +// input.setDeviceId(0) +// .setNumInputBuffers(4) +// .setAggWidth(8) +// .setMinEntry(0) +// .setMaxEntry(367) +// .setKeylessHash(true) +// .setTargetIndexForKey(0) +// .setTargetInfos(generate_custom_agg_target_infos( +// {8}, +// 
{hdk::ir::AggType::kCount, +// hdk::ir::AggType::kMax, +// hdk::ir::AggType::kAvg, +// hdk::ir::AggType::kSum, +// hdk::ir::AggType::kMin}, +// {int64_type, double_type, double_type, double_type, double_type}, +// {int32_type, double_type, double_type, double_type, double_type})); + +// for (auto& step_size : {2, 3, 5, 7, 11, 13}) { +// input.setStepSize(step_size); +// perform_test_and_verify_results(input); +// } +// } + +// TEST(SingleColumn, VariableSteps_FixedEntries_4) { +// TestInputData input; +// input.setDeviceId(0) +// .setNumInputBuffers(4) +// .setAggWidth(8) +// .setMinEntry(0) +// .setMaxEntry(517) +// .setKeylessHash(true) +// .setTargetIndexForKey(0) +// .setTargetInfos(generate_custom_agg_target_infos( +// {8}, +// {hdk::ir::AggType::kCount, +// hdk::ir::AggType::kSum, +// hdk::ir::AggType::kMax, +// hdk::ir::AggType::kAvg, +// hdk::ir::AggType::kMin}, +// {int64_type, float_type, float_type, float_type, float_type}, +// {int16_type, float_type, float_type, float_type, float_type})); + +// for (auto& step_size : {2, 3, 5, 7, 11, 13}) { +// input.setStepSize(step_size); +// perform_test_and_verify_results(input); +// } +// } + +// TEST(SingleColumn, VariableNumBuffers) { +// TestInputData input; +// input.setDeviceId(0) +// .setAggWidth(8) +// .setMinEntry(0) +// .setMaxEntry(266) +// .setKeylessHash(true) +// .setTargetIndexForKey(0) +// .setTargetInfos(generate_custom_agg_target_infos( +// {8}, +// {hdk::ir::AggType::kCount, +// hdk::ir::AggType::kSum, +// hdk::ir::AggType::kAvg, +// hdk::ir::AggType::kMax, +// hdk::ir::AggType::kMin}, +// {int32_type, int64_type, double_type, float_type, double_type}, +// {int8_type, int8_type, int16_type, float_type, double_type})); + +// for (auto& num_buffers : {2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 128}) { +// input.setNumInputBuffers(num_buffers); +// perform_test_and_verify_results(input); +// } +// } + +int main(int argc, char** argv) { + g_is_test_env = true; + + TestHelpers::init_logger_stderr_only(argc, argv); + testing::InitGoogleTest(&argc, argv); + + int err{0}; + try { + err = RUN_ALL_TESTS(); + } catch (const std::exception& e) { + LOG(ERROR) << e.what(); + } + return err; +} diff --git a/omniscidb/Tests/GpuSharedMemoryTestIntel.h b/omniscidb/Tests/GpuSharedMemoryTestIntel.h new file mode 100644 index 0000000000..18cab64b05 --- /dev/null +++ b/omniscidb/Tests/GpuSharedMemoryTestIntel.h @@ -0,0 +1,89 @@ +/* + * Copyright 2019 OmniSci, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include "CudaMgr/CudaMgr.h"
+#include "Logger/Logger.h"
+#include "QueryEngine/CodeGenerator.h"
+#include "QueryEngine/GpuSharedMemoryUtils.h"
+#include "QueryEngine/LLVMFunctionAttributesUtil.h"
+#include "QueryEngine/NvidiaKernel.h"
+#include "QueryEngine/OutputBufferInitialization.h"
+#include "ResultSetTestUtils.h"
+#include "Shared/TargetInfo.h"
+#include "TestHelpers.h"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+class StrideNumberGenerator : public NumberGenerator {
+ public:
+  StrideNumberGenerator(const int64_t start, const int64_t stride)
+      : crt_(start), stride_(stride), start_(start) {}
+
+  int64_t getNextValue() override {
+    const auto crt = crt_;
+    crt_ += stride_;
+    return crt;
+  }
+
+  void reset() override { crt_ = start_; }
+
+ private:
+  int64_t crt_;
+  int64_t stride_;
+  int64_t start_;
+};
+
+class GpuReductionTester : public GpuSharedMemCodeBuilder {
+ public:
+  GpuReductionTester(const Config& config,
+                     llvm::Module* module,
+                     llvm::LLVMContext& context,
+                     const QueryMemoryDescriptor& qmd,
+                     const std::vector<TargetInfo>& targets,
+                     const std::vector<int64_t>& init_agg_values,
+                     CudaMgr_Namespace::CudaMgr* cuda_mgr,
+                     Executor* executor)
+      : GpuSharedMemCodeBuilder(module,
+                                context,
+                                qmd,
+                                targets,
+                                init_agg_values,
+                                config,
+                                executor)
+      , cuda_mgr_(cuda_mgr) {
+    // CHECK(getReductionFunction());
+  }
+  void codegenWrapperKernel();
+  llvm::Function* getWrapperKernel() const { return wrapper_kernel_; }
+  void performReductionTest(const std::vector<std::unique_ptr<ResultSet>>& result_sets,
+                            const ResultSetStorage* gpu_result_storage,
+                            const size_t device_id);
+
+ private:
+  CudaMgr_Namespace::CudaMgr* cuda_mgr_;
+  llvm::Function* wrapper_kernel_;
+};

From 565a3e0554f11e61e86f65a9ec4760cf6b57d2e0 Mon Sep 17 00:00:00 2001
From: Laurent Montigny
Date: Fri, 14 Apr 2023 09:45:02 -0500
Subject: [PATCH 2/9] Replace cudaMgr with L0Mgr

---
 omniscidb/Tests/GpuSharedMemoryTestIntel.cpp | 38 ++++++++++----------
 omniscidb/Tests/GpuSharedMemoryTestIntel.h   | 10 +++---
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp
index a4da9c48de..dfe76790e6 100644
--- a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp
+++ b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
 */
 
-#include "GpuSharedMemoryTest.h"
+#include "GpuSharedMemoryTestIntel.h"
 #include "QueryEngine/CompilationOptions.h"
 #include "QueryEngine/LLVMGlobalContext.h"
 #include "QueryEngine/OutputBufferInitialization.h"
 #include "QueryEngine/ResultSetReduction.h"
 #include "QueryEngine/ResultSetReductionJIT.h"
@@ -175,13 +175,13 @@
 std::unique_ptr<GpuDeviceCompilationContext> compile_and_link_gpu_code(
     const std::string& cuda_llir,
     llvm::Module* module,
-    // CudaMgr_Namespace::CudaMgr* cuda_mgr,
+    l0::L0Manager* l0_mgr,
     const std::string& kernel_name,
     const size_t gpu_block_size = 1024,
     const size_t gpu_device_idx = 0) {
   CHECK(module);
-  // CHECK(cuda_mgr);
-  // auto& context = module->getContext();
+  CHECK(l0_mgr);
+  auto& context = module->getContext();
   // std::unique_ptr nvptx_target_machine =
   //     compiler::CUDABackend::initializeNVPTXBackend(cuda_mgr->getDeviceArch());
   // const auto ptx =
@@ -267,21 +267,21 @@ void perform_reduction_on_cpu(std::vector<std::unique_ptr<ResultSet>>& result_se
   CHECK(result_sets.size() > 0);
   Config config;
   // for codegen only
-  // auto executor = Executor::getExecutor(nullptr);
-  // ResultSetReductionJIT reduction_jit(result_sets.front()->getQueryMemDesc(),
-  //                                     result_sets.front()->getTargetInfos(),
-  //                                     result_sets.front()->getTargetInitVals(),
-  //                                     config,
-  //                                     executor.get());
-  // const auto reduction_code = reduction_jit.codegen();
-  // for (auto& result_set : result_sets) {
-  //   ResultSetReduction::reduce(*cpu_result_storage,
-  //                              *(result_set->getStorage()),
-  //                              {},
-  //                              reduction_code,
-  //                              config,
-  //                              executor.get());
-  // }
+  auto executor = Executor::getExecutor(nullptr);
+  ResultSetReductionJIT reduction_jit(result_sets.front()->getQueryMemDesc(),
+                                      result_sets.front()->getTargetInfos(),
+                                      result_sets.front()->getTargetInitVals(),
+                                      config,
+                                      executor.get());
+  const auto reduction_code = reduction_jit.codegen();
+  for (auto& result_set : result_sets) {
+    ResultSetReduction::reduce(*cpu_result_storage,
+                               *(result_set->getStorage()),
+                               {},
+                               reduction_code,
+                               config,
+                               executor.get());
+  }
 }
diff --git a/omniscidb/Tests/GpuSharedMemoryTestIntel.h b/omniscidb/Tests/GpuSharedMemoryTestIntel.h
index 18cab64b05..cbdcd71289 100644
--- a/omniscidb/Tests/GpuSharedMemoryTestIntel.h
+++ b/omniscidb/Tests/GpuSharedMemoryTestIntel.h
@@ -16,12 +16,12 @@
 
 #pragma once
 
-#include "CudaMgr/CudaMgr.h"
+#include "L0Mgr/L0Mgr.h"
 #include "Logger/Logger.h"
 #include "QueryEngine/CodeGenerator.h"
 #include "QueryEngine/GpuSharedMemoryUtils.h"
 #include "QueryEngine/LLVMFunctionAttributesUtil.h"
-#include "QueryEngine/NvidiaKernel.h"
+// #include "QueryEngine/NvidiaKernel.h"
 #include "QueryEngine/OutputBufferInitialization.h"
 #include "ResultSetTestUtils.h"
 #include "Shared/TargetInfo.h"
 #include "TestHelpers.h"
@@ -65,7 +65,7 @@ class GpuReductionTester : public GpuSharedMemCodeBuilder {
                      const QueryMemoryDescriptor& qmd,
                      const std::vector<TargetInfo>& targets,
                      const std::vector<int64_t>& init_agg_values,
-                     CudaMgr_Namespace::CudaMgr* cuda_mgr,
+                     l0::L0Manager* l0_mgr,
                      Executor* executor)
       : GpuSharedMemCodeBuilder(module,
                                 context,
@@ -74,7 +74,7 @@ class GpuReductionTester : public GpuSharedMemCodeBuilder {
                                 init_agg_values,
                                 config,
                                 executor)
-      , cuda_mgr_(cuda_mgr) {
+      , l0_mgr_(l0_mgr) {
     // CHECK(getReductionFunction());
   }
   void codegenWrapperKernel();
@@ -84,6 +84,6 @@ class GpuReductionTester : public GpuSharedMemCodeBuilder {
                             const size_t device_id);
 
  private:
-  CudaMgr_Namespace::CudaMgr* cuda_mgr_;
+  l0::L0Manager* l0_mgr_;
   llvm::Function* wrapper_kernel_;
 };

From 7e22fb315cbf90170d76407e7c5f7f61ca5002b5 Mon Sep 17 00:00:00 2001
From: Laurent Montigny
Date: Fri, 14 Apr 2023 10:31:53 -0500
Subject: [PATCH 3/9] Add compile_and_link_gpu_code for L0

---
 omniscidb/Tests/GpuSharedMemoryTestIntel.cpp | 45 +++++++++-----------
 omniscidb/Tests/GpuSharedMemoryTestIntel.h   |  2 +-
 2 files changed, 21 insertions(+), 26 deletions(-)

diff --git a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp
index dfe76790e6..a519c8adba 100644
--- a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp
+++ b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp
@@ -15,6 +15,7 @@
  */
 
 #include "GpuSharedMemoryTestIntel.h"
+#include 
 #include "QueryEngine/CompilationOptions.h"
 #include "QueryEngine/LLVMGlobalContext.h"
 #include "QueryEngine/OutputBufferInitialization.h"
@@ -172,8 +173,7 @@ void prepare_generated_gpu_kernel(llvm::Module* module,
   md->addOperand(llvm::MDNode::get(context, md_vals));
 }
 
-std::unique_ptr<GpuDeviceCompilationContext> compile_and_link_gpu_code(
-    const std::string& cuda_llir,
+std::unique_ptr<L0DeviceCompilationContext> compile_and_link_gpu_code(
     llvm::Module* module,
     l0::L0Manager* l0_mgr,
     const std::string& kernel_name,
@@ -182,29 +182,24 @@ std::unique_ptr<GpuDeviceCompilationContext> compile_and_link_gpu_code(
   CHECK(module);
   CHECK(l0_mgr);
   auto& context = module->getContext();
-  // std::unique_ptr nvptx_target_machine =
-  //     compiler::CUDABackend::initializeNVPTXBackend(cuda_mgr->getDeviceArch());
-  // const auto ptx =
-  //     compiler::CUDABackend::generatePTX(cuda_llir, nvptx_target_machine.get(),
-  //     context);
-
-  // auto cubin_result = ptx_to_cubin(ptx, gpu_block_size, cuda_mgr);
-  // auto& option_keys = cubin_result.option_keys;
-  // auto& option_values = cubin_result.option_values;
-  // auto cubin = cubin_result.cubin;
-  // auto link_state = cubin_result.link_state;
-  // const auto num_options = option_keys.size();
-  // auto gpu_context = std::make_unique(cubin,
-  //                                     kernel_name,
-  //                                     gpu_device_idx,
-  //                                     cuda_mgr,
-  //                                     num_options,
-  //                                     &option_keys[0],
-  //                                     &option_values[0]);
-
-  // checkCudaErrors(cuLinkDestroy(link_state)); @LM
-  // return gpu_context;
-  return NULL;
+
+  SPIRV::TranslatorOpts opts;
+  opts.enableAllExtensions();
+  opts.setDesiredBIsRepresentation(SPIRV::BIsRepresentation::OpenCL12);
+  opts.setDebugInfoEIS(SPIRV::DebugInfoEIS::OpenCL_DebugInfo_100);
+
+  std::ostringstream ss;
+  std::string err;
+  auto success = writeSpirv(module, opts, ss, err);
+  CHECK(success) << "Spirv translation failed with error: " << err << "\n";
+
+  L0BinResult bin_result;
+  bin_result = spv_to_bin(ss.str(), kernel_name, gpu_block_size, l0_mgr);
+
+  auto l0_context = std::make_unique<L0DeviceCompilationContext>(
+      bin_result.device, bin_result.kernel, bin_result.module, l0_mgr, 0, 1);
+
+  return l0_context;
 }
 
 std::vector<std::unique_ptr<ResultSet>> create_and_fill_input_result_sets(
diff --git a/omniscidb/Tests/GpuSharedMemoryTestIntel.h b/omniscidb/Tests/GpuSharedMemoryTestIntel.h
index cbdcd71289..535e0a47e4 100644
--- a/omniscidb/Tests/GpuSharedMemoryTestIntel.h
+++ b/omniscidb/Tests/GpuSharedMemoryTestIntel.h
@@ -20,8 +20,8 @@
 #include "Logger/Logger.h"
 #include "QueryEngine/CodeGenerator.h"
 #include "QueryEngine/GpuSharedMemoryUtils.h"
+#include "QueryEngine/L0Kernel.h"
 #include "QueryEngine/LLVMFunctionAttributesUtil.h"
-// #include "QueryEngine/NvidiaKernel.h"
 #include "QueryEngine/OutputBufferInitialization.h"
 #include "ResultSetTestUtils.h"
 #include "Shared/TargetInfo.h"

From 6a64c86d9dd3d4da4ec1394553bfce87990e1bf2 Mon Sep 17 00:00:00 2001
From: Laurent Montigny
Date: Fri, 14 Apr 2023 10:46:25 -0500
Subject: [PATCH 4/9] Fix fill_storage_buffer method

---
 omniscidb/Tests/CMakeLists.txt               |  2 +-
 omniscidb/Tests/GpuSharedMemoryTestIntel.cpp | 10 +++++-----
 2 files changed, 6
insertions(+), 6 deletions(-) diff --git a/omniscidb/Tests/CMakeLists.txt b/omniscidb/Tests/CMakeLists.txt index 0d2734695f..1f5094cb7f 100644 --- a/omniscidb/Tests/CMakeLists.txt +++ b/omniscidb/Tests/CMakeLists.txt @@ -65,7 +65,7 @@ if(ENABLE_L0) add_executable(SpirvBuildTest SpirvBuildTest.cpp) add_executable(DataMgrWithL0Test DataMgrWithL0Test.cpp) add_executable(IntelGPUEnablingTest IntelGPUEnablingTest.cpp) - add_executable(GpuSharedMemoryTestIntel GpuSharedMemoryTestIntel.cpp) + add_executable(GpuSharedMemoryTestIntel GpuSharedMemoryTestIntel.cpp ResultSetTestUtils.cpp) target_link_libraries(L0MgrExecuteTest L0Mgr gtest ${llvm_libs} Logger OSDependent) target_link_libraries(SpirvBuildTest gtest ${llvm_libs}) diff --git a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp index a519c8adba..221051b70d 100644 --- a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp +++ b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp @@ -219,11 +219,11 @@ std::vector> create_and_fill_input_result_sets( 0, 0)); const auto storage = result_sets.back()->allocateStorage(); - // fill_storage_buffer(storage->getUnderlyingBuffer(), - // target_infos, - // query_mem_desc, - // generators[i], - // steps[i]); + fill_storage_buffer(storage->getUnderlyingBuffer(), + target_infos, + query_mem_desc, + generators[i], + steps[i]); } return result_sets; } From 048233f5dfbe7f703b6897f1738eab019be4f8f5 Mon Sep 17 00:00:00 2001 From: Laurent Montigny Date: Fri, 14 Apr 2023 11:29:00 -0500 Subject: [PATCH 5/9] Enable perform_test_and_verify_results for L0 --- omniscidb/Tests/GpuSharedMemoryTestIntel.cpp | 108 +++++++++---------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp index 221051b70d..2008902eb1 100644 --- a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp +++ b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp @@ -343,61 +343,61 @@ void perform_test_and_verify_results(TestInputData input) { new CgenState({}, false, false, executor->getExtensionModuleContext(), context)); cgen_state->set_module_shallow_copy( executor->getExtensionModuleContext()->getRTModule(/*is_l0=*/false)); - // auto module = cgen_state->module_; - // module->setDataLayout( - // "e-p:64:64:64-i1:8:8-i8:8:8-" - // "i16:16:16-i32:32:32-i64:64:64-" - // "f32:32:32-f64:64:64-v16:16:16-" - // "v32:32:32-v64:64:64-v128:128:128-n16:32:64"); - // module->setTargetTriple("nvptx64-nvidia-cuda"); - // auto cuda_mgr = std::make_unique(1); - // const auto row_set_mem_owner = - // std::make_shared(nullptr, Executor::getArenaBlockSize()); - // auto query_mem_desc = perfect_hash_one_col_desc( - // input.target_infos, input.suggested_agg_widths, input.min_entry, - // input.max_entry); - // if (input.keyless_hash) { - // query_mem_desc.setHasKeylessHash(true); - // query_mem_desc.setTargetIdxForKey(input.target_index_for_key); - // } + auto module = cgen_state->module_; + module->setDataLayout( + "e-p:64:64:64-i1:8:8-i8:8:8-" + "i16:16:16-i32:32:32-i64:64:64-" + "f32:32:32-f64:64:64-v16:16:16-" + "v32:32:32-v64:64:64-v128:128:128-n16:32:64"); + module->setTargetTriple("spir-unknown-unknown"); + auto l0_mgr = std::make_unique(); + const auto row_set_mem_owner = + std::make_shared(nullptr, Executor::getArenaBlockSize()); + auto query_mem_desc = perfect_hash_one_col_desc( + input.target_infos, input.suggested_agg_widths, input.min_entry, input.max_entry); + if (input.keyless_hash) { + query_mem_desc.setHasKeylessHash(true); + 
query_mem_desc.setTargetIdxForKey(input.target_index_for_key);
+  }
+
+  std::vector<StrideNumberGenerator> generators(
+      input.num_input_buffers, StrideNumberGenerator(1, input.step_size));
+  std::vector<size_t> steps(input.num_input_buffers, input.step_size);
+  auto input_result_sets = create_and_fill_input_result_sets(input.num_input_buffers,
+                                                             row_set_mem_owner,
+                                                             query_mem_desc,
+                                                             input.target_infos,
+                                                             generators,
+                                                             steps);
 
-  // std::vector generators(
-  //     input.num_input_buffers, StrideNumberGenerator(1, input.step_size));
-  // std::vector steps(input.num_input_buffers, input.step_size);
-  // auto input_result_sets = create_and_fill_input_result_sets(input.num_input_buffers,
-  //                                                            row_set_mem_owner,
-  //                                                            query_mem_desc,
-  //                                                            input.target_infos,
-  //                                                            generators,
-  //                                                            steps);
-
-  // const auto [cpu_result_set, gpu_result_set] = create_and_init_output_result_sets(
-  //     row_set_mem_owner, query_mem_desc, input.target_infos);
-
-  // // performing reduction using the GPU reduction code:
-  // Config config;
-  // GpuReductionTester gpu_smem_tester(config,
-  //                                    module,
-  //                                    context,
-  //                                    query_mem_desc,
-  //                                    input.target_infos,
-  //                                    init_agg_val_vec(input.target_infos,
-  //                                    query_mem_desc), cuda_mgr.get(), executor.get());
-  // gpu_smem_tester.codegen(CompilationOptions::defaults(
-  //     ExecutorDeviceType::GPU,
-  //     false));  // generate code for gpu reduction and initialization
-  // gpu_smem_tester.codegenWrapperKernel();
-  // gpu_smem_tester.performReductionTest(
-  //     input_result_sets, gpu_result_set->getStorage(), input.device_id);
-
-  // // CPU reduction for validation:
-  // perform_reduction_on_cpu(input_result_sets, cpu_result_set->getStorage());
-
-  // const auto cmp_result =
-  //     std::memcmp(cpu_result_set->getStorage()->getUnderlyingBuffer(),
-  //                 gpu_result_set->getStorage()->getUnderlyingBuffer(),
-  //                 query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU));
-  // ASSERT_EQ(cmp_result, 0);
+  const auto [cpu_result_set, gpu_result_set] = create_and_init_output_result_sets(
+      row_set_mem_owner, query_mem_desc, input.target_infos);
+
+  // performing reduction using the GPU reduction code:
+  Config config;
+  GpuReductionTester gpu_smem_tester(config,
+                                     module,
+                                     context,
+                                     query_mem_desc,
+                                     input.target_infos,
+                                     init_agg_val_vec(input.target_infos, query_mem_desc),
+                                     l0_mgr.get(),
+                                     executor.get());
+  gpu_smem_tester.codegen(CompilationOptions::defaults(
+      ExecutorDeviceType::GPU,
+      false));  // generate code for gpu reduction and initialization
+  gpu_smem_tester.codegenWrapperKernel();
+  gpu_smem_tester.performReductionTest(
+      input_result_sets, gpu_result_set->getStorage(), input.device_id);
+
+  // CPU reduction for validation:
+  perform_reduction_on_cpu(input_result_sets, cpu_result_set->getStorage());
+
+  const auto cmp_result =
+      std::memcmp(cpu_result_set->getStorage()->getUnderlyingBuffer(),
+                  gpu_result_set->getStorage()->getUnderlyingBuffer(),
+                  query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU));
+  ASSERT_EQ(cmp_result, 0);
 }
 
 }  // namespace

From 91df5856a5e110f6ea358204f52c411812784b36 Mon Sep 17 00:00:00 2001
From: Laurent Montigny
Date: Fri, 14 Apr 2023 13:15:43 -0500
Subject: [PATCH 6/9] Enable all the missing L0 code

---
 omniscidb/Tests/GpuSharedMemoryTestIntel.cpp | 217 ++++++++++---------
 1 file changed, 112 insertions(+), 105 deletions(-)

diff --git a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp
index 2008902eb1..1ef81efd57 100644
--- a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp
+++ b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp
@@ -174,6 +174,7 @@ void prepare_generated_gpu_kernel(llvm::Module* module,
 }
 
 std::unique_ptr<L0DeviceCompilationContext> compile_and_link_gpu_code(
+    const std::string& l0_llir,
     llvm::Module* module,
     l0::L0Manager* l0_mgr,
     const std::string& kernel_name,
@@ -181,7 +182,6 @@ std::unique_ptr<L0DeviceCompilationContext> compile_and_link_gpu_code(
     const size_t gpu_device_idx = 0) {
   CHECK(module);
   CHECK(l0_mgr);
-  auto& context = module->getContext();
 
   SPIRV::TranslatorOpts opts;
   opts.enableAllExtensions();
   opts.setDesiredBIsRepresentation(SPIRV::BIsRepresentation::OpenCL12);
@@ -414,112 +414,119 @@ void GpuReductionTester::performReductionTest(
   os.flush();
   std::string module_str(ss.str());
 
-  // std::unique_ptr gpu_context(compile_and_link_gpu_code(
-  //     module_str, module_, cuda_mgr_, getWrapperKernel()->getName().str()));
-
-  // const auto buffer_size = query_mem_desc_.getBufferSizeBytes(ExecutorDeviceType::GPU);
-  // const size_t num_buffers = result_sets.size();
-  // std::vector d_input_buffers;
-  // for (size_t i = 0; i < num_buffers; i++) {
-  //   d_input_buffers.push_back(cuda_mgr_->allocateDeviceMem(buffer_size, device_id));
-  //   cuda_mgr_->copyHostToDevice(d_input_buffers[i],
-  //                               result_sets[i]->getStorage()->getUnderlyingBuffer(),
-  //                               buffer_size,
-  //                               device_id);
-  // }
-
-  // constexpr size_t num_kernel_params = 3;
-  // CHECK_EQ(getWrapperKernel()->arg_size(), num_kernel_params);
-
-  // // parameter 1: an array of device pointers
-  // std::vector h_input_buffer_dptrs;
-  // h_input_buffer_dptrs.reserve(num_buffers);
-  // std::transform(d_input_buffers.begin(),
-  //                d_input_buffers.end(),
-  //                std::back_inserter(h_input_buffer_dptrs),
-  //                [](int8_t* dptr) { return reinterpret_cast(dptr); });
-
-  // auto d_input_buffer_dptrs =
-  //     cuda_mgr_->allocateDeviceMem(num_buffers * sizeof(CUdeviceptr), device_id);
-  // cuda_mgr_->copyHostToDevice(d_input_buffer_dptrs,
-  //                             reinterpret_cast(h_input_buffer_dptrs.data()),
-  //                             num_buffers * sizeof(CUdeviceptr),
-  //                             device_id);
-
-  // // parameter 2: number of buffers
-  // auto d_num_buffers = cuda_mgr_->allocateDeviceMem(sizeof(int64_t), device_id);
-  // cuda_mgr_->copyHostToDevice(d_num_buffers,
-  //                             reinterpret_cast(&num_buffers),
-  //                             sizeof(int64_t),
-  //                             device_id);
-
-  // // parameter 3: device pointer to the output buffer
-  // auto d_result_buffer = cuda_mgr_->allocateDeviceMem(buffer_size, device_id);
-  // cuda_mgr_->copyHostToDevice(
-  //     d_result_buffer, gpu_result_storage->getUnderlyingBuffer(), buffer_size,
-  //     device_id);
-
-  // // collecting all kernel parameters:
-  // std::vector h_kernel_params{
-  //     reinterpret_cast(d_input_buffer_dptrs),
-  //     reinterpret_cast(d_num_buffers),
-  //     reinterpret_cast(d_result_buffer)};
-
-  // // casting each kernel parameter to be a void* device ptr itself:
-  // std::vector kernel_param_ptrs;
-  // kernel_param_ptrs.reserve(num_kernel_params);
-  // std::transform(h_kernel_params.begin(),
-  //                h_kernel_params.end(),
-  //                std::back_inserter(kernel_param_ptrs),
-  //                [](CUdeviceptr& param) { return &param; });
-
-  // // launching a kernel:
-  // auto cu_func = static_cast(gpu_context->kernel());
-  // // we launch as many threadblocks as there are input buffers:
-  // // in other words, each input buffer is handled by a single threadblock.
-
-  // // checkCudaErrors(cuLaunchKernel(cu_func,
-  // //                                num_buffers,
-  // //                                1,
-  // //                                1,
-  // //                                1024,
-  // //                                1,
-  // //                                1,
-  // //                                buffer_size,
-  // //                                0,
-  // //                                kernel_param_ptrs.data(),
-  // //                                nullptr));
-
-  // // transfer back the results:
-  // cuda_mgr_->copyDeviceToHost(
-  //     gpu_result_storage->getUnderlyingBuffer(), d_result_buffer, buffer_size,
-  //     device_id);
-
-  // // release the gpu memory used:
-  // for (auto& d_buffer : d_input_buffers) {
-  //   cuda_mgr_->freeDeviceMem(d_buffer);
-  // }
-  // cuda_mgr_->freeDeviceMem(d_input_buffer_dptrs);
-  // cuda_mgr_->freeDeviceMem(d_num_buffers);
-  // cuda_mgr_->freeDeviceMem(d_result_buffer);
+  std::unique_ptr<L0DeviceCompilationContext> gpu_context(compile_and_link_gpu_code(
+      module_str, module_, l0_mgr_, getWrapperKernel()->getName().str()));
+
+  const auto buffer_size = query_mem_desc_.getBufferSizeBytes(ExecutorDeviceType::GPU);
+  const size_t num_buffers = result_sets.size();
+  std::vector<int8_t*> d_input_buffers;
+  for (size_t i = 0; i < num_buffers; i++) {
+    d_input_buffers.push_back(l0_mgr_->allocateDeviceMem(buffer_size, device_id));
+    l0_mgr_->copyHostToDevice(d_input_buffers[i],
+                              result_sets[i]->getStorage()->getUnderlyingBuffer(),
+                              buffer_size,
+                              device_id);
+  }
+
+  constexpr size_t num_kernel_params = 3;
+  CHECK_EQ(getWrapperKernel()->arg_size(), num_kernel_params);
+
+  // parameter 1: an array of device pointers
+  typedef int8_t* L0deviceptr;
+  std::vector<L0deviceptr> h_input_buffer_dptrs;
+  h_input_buffer_dptrs.reserve(num_buffers);
+  std::transform(d_input_buffers.begin(),
+                 d_input_buffers.end(),
+                 std::back_inserter(h_input_buffer_dptrs),
+                 [](int8_t* dptr) { return reinterpret_cast<L0deviceptr>(dptr); });
+
+  auto d_input_buffer_dptrs =
+      l0_mgr_->allocateDeviceMem(num_buffers * sizeof(L0deviceptr), device_id);
+  l0_mgr_->copyHostToDevice(d_input_buffer_dptrs,
+                            reinterpret_cast<int8_t*>(h_input_buffer_dptrs.data()),
+                            num_buffers * sizeof(L0deviceptr),
+                            device_id);
+
+  // parameter 2: number of buffers
+  auto d_num_buffers = l0_mgr_->allocateDeviceMem(sizeof(int64_t), device_id);
+  l0_mgr_->copyHostToDevice(d_num_buffers,
+                            reinterpret_cast<const int8_t*>(&num_buffers),
+                            sizeof(int64_t),
+                            device_id);
+
+  // parameter 3: device pointer to the output buffer
+  auto d_result_buffer = l0_mgr_->allocateDeviceMem(buffer_size, device_id);
+  l0_mgr_->copyHostToDevice(
+      d_result_buffer, gpu_result_storage->getUnderlyingBuffer(), buffer_size, device_id);
+
+  // collecting all kernel parameters:
+  std::vector<L0deviceptr> h_kernel_params{
+      reinterpret_cast<L0deviceptr>(d_input_buffer_dptrs),
+      reinterpret_cast<L0deviceptr>(d_num_buffers),
+      reinterpret_cast<L0deviceptr>(d_result_buffer)};
+
+  // casting each kernel parameter to be a void* device ptr itself:
+  std::vector<void*> kernel_param_ptrs;
+  kernel_param_ptrs.reserve(num_kernel_params);
+  std::transform(h_kernel_params.begin(),
+                 h_kernel_params.end(),
+                 std::back_inserter(kernel_param_ptrs),
+                 [](L0deviceptr& param) { return &param; });
+
+  // launching a kernel:
+  typedef void* L0function;
+  auto l0_func = static_cast<L0function>(gpu_context->kernel());
+  // we launch as many threadblocks as there are input buffers:
+  // in other words, each input buffer is handled by a single threadblock.
+ + // std::unique_ptr gpu_context + // auto l0_ctx = dynamic_cast(gpu_context); + // l0::L0Kernel* kernel = l0_ctx->getNativeCode(device_id); + // l0::L0Device* device = l0_ctx->getDevice(device_id); + + auto kernel = gpu_context->kernel(); + auto device = gpu_context->device(); + + auto q = device->command_queue(); + auto q_list = device->create_command_list(); + // l0::GroupCount gc = {ko.gridDimX, ko.gridDimY, ko.gridDimZ}; + l0::GroupCount gc = {1, 1, 1024}; + // LOG(INFO) << "Launching L0 kernel with group size: {" << ko.gridDimX << "," + // << ko.gridDimY << "," << ko.gridDimZ << "}\n"; + // q_list->launch(kernel, kernel_param_ptrs.data(), gc); //<< here is the problem + + q_list->launch(*kernel, gc); + q_list->submit(*q.get()); + + // transfer back the results: + l0_mgr_->copyDeviceToHost( + gpu_result_storage->getUnderlyingBuffer(), d_result_buffer, buffer_size, device_id); + + // release the gpu memory used: + for (auto& d_buffer : d_input_buffers) { + l0_mgr_->freeDeviceMem(d_buffer); + } + l0_mgr_->freeDeviceMem(d_input_buffer_dptrs); + l0_mgr_->freeDeviceMem(d_num_buffers); + l0_mgr_->freeDeviceMem(d_result_buffer); } -// TEST(SingleColumn, VariableEntries_CountQuery_4B_Group) { -// for (auto num_entries : {1, 2, 3, 5, 13, 31, 63, 126, 241, 511, 1021}) { -// TestInputData input; -// input.setDeviceId(0) -// .setNumInputBuffers(4) -// .setTargetInfos(generate_custom_agg_target_infos( -// {4}, {hdk::ir::AggType::kCount}, {int32_type}, {int32_type})) -// .setAggWidth(4) -// .setMinEntry(0) -// .setMaxEntry(num_entries) -// .setStepSize(2) -// .setKeylessHash(true) -// .setTargetIndexForKey(0); -// perform_test_and_verify_results(input); -// } -// } +TEST(SingleColumn, VariableEntries_CountQuery_4B_Group) { + for (auto num_entries : {1, 2, 3, 5, 13, 31, 63, 126, 241, 511, 1021}) { + TestInputData input; + input.setDeviceId(0) + .setNumInputBuffers(4) + .setTargetInfos(generate_custom_agg_target_infos( + {4}, {hdk::ir::AggType::kCount}, {int32_type}, {int32_type})) + .setAggWidth(4) + .setMinEntry(0) + .setMaxEntry(num_entries) + .setStepSize(2) + .setKeylessHash(true) + .setTargetIndexForKey(0); + perform_test_and_verify_results(input); + } +} // TEST(SingleColumn, VariableEntries_CountQuery_8B_Group) { // for (auto num_entries : {1, 2, 3, 5, 13, 31, 63, 126, 241, 511, 1021}) { From e03cc6fd9ecaffbb6a0a78d0b905b51afe3f8591 Mon Sep 17 00:00:00 2001 From: Laurent Montigny Date: Mon, 17 Apr 2023 06:41:11 -0500 Subject: [PATCH 7/9] Extend CMakeList for IntelGPU SM test --- omniscidb/Tests/CMakeLists.txt | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/omniscidb/Tests/CMakeLists.txt b/omniscidb/Tests/CMakeLists.txt index 1f5094cb7f..40e6e21aa1 100644 --- a/omniscidb/Tests/CMakeLists.txt +++ b/omniscidb/Tests/CMakeLists.txt @@ -143,6 +143,10 @@ if(ENABLE_CUDA) target_link_libraries(GpuSharedMemoryTest gtest Logger QueryEngine) endif() +if(ENABLE_L0) + target_link_libraries(GpuSharedMemoryTestIntel gtest Logger QueryEngine) +endif() + set(TEST_ARGS "--gtest_output=xml:../") add_test(UtilTest UtilTest ${TEST_ARGS}) add_test(ArrowBasedExecuteTest ArrowBasedExecuteTest ${TEST_ARGS}) @@ -181,6 +185,7 @@ if(ENABLE_CUDA) add_test(GpuSharedMemoryTest GpuSharedMemoryTest ${TEST_ARGS}) endif() if(ENABLE_L0) + add_test(GpuSharedMemoryTestIntel GpuSharedMemoryTestIntel ${TEST_ARGS}) add_test(NAME PuntToCpu COMMAND ArrowBasedExecuteTest "--gtest_filter=Select.Punt*" ${TEST_ARGS}) set_tests_properties(PuntToCpu PROPERTIES LABELS "enabling") endif() @@ -236,6 +241,9 @@ 
if(ENABLE_CUDA) list(APPEND TEST_PROGRAMS GpuSharedMemoryTest) endif() +if(ENABLE_L0) + list(APPEND TEST_PROGRAMS GpuSharedMemoryTestIntel) +endif() #if(NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") # list(APPEND TEST_PROGRAMS UdfTest) #endif() @@ -293,6 +301,19 @@ if(ENABLE_L0) ) set_tests_properties(${ENABLING_TESTS} PROPERTIES LABELS "enabling") + # set(intel_gpu_runtime_function_sources l0_mapd_rt.cpp DateAdd.cpp DateTruncate.cpp) + + # set(intel_gpu_module_name RuntimeFunctionsL0.bc) + # set(intel_gpu_module_internal_suffix "L0_internal.bc") + # function(precompile_intel_gpu_module SOURCE_FILE) + # set(module_compiler_flags -Xclang -ffake-address-space-map -DL0_RUNTIME_ENABLED) + # precompile_llvm_module(${SOURCE_FILE} ${intel_gpu_module_internal_suffix} ${module_compiler_flags}) + # endfunction() + # set(precompile_intel_gpu_module_cmd "precompile_intel_gpu_module") + + # precompile_modules("intel_gpu_precompiled_module_list" ${intel_gpu_module_internal_suffix} ${precompile_intel_gpu_module_cmd} "${intel_gpu_runtime_function_sources}") + # link_runtime_module(${intel_gpu_module_name} "${intel_gpu_precompiled_module_list}") + add_custom_target(enabling_tests COMMAND mkdir -p tmp COMMAND touch tmp/DictPayload From b0e4b3ebe6813138bbc5a74f74410dd602a88a41 Mon Sep 17 00:00:00 2001 From: Laurent Montigny Date: Mon, 17 Apr 2023 08:24:29 -0500 Subject: [PATCH 8/9] Update GpuSMUtils to enable L0 --- omniscidb/QueryEngine/GpuSharedMemoryUtils.cpp | 10 ++++++++++ omniscidb/Tests/CMakeLists.txt | 12 ------------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/omniscidb/QueryEngine/GpuSharedMemoryUtils.cpp b/omniscidb/QueryEngine/GpuSharedMemoryUtils.cpp index a1b023c72a..99a1845ddd 100644 --- a/omniscidb/QueryEngine/GpuSharedMemoryUtils.cpp +++ b/omniscidb/QueryEngine/GpuSharedMemoryUtils.cpp @@ -163,6 +163,16 @@ void GpuSharedMemCodeBuilder::codegenReduction(const CompilationOptions& co) { "f32:32:32-f64:64:64-v16:16:16-" "v32:32:32-v64:64:64-v128:128:128-n16:32:64"); reduction_code.module->setTargetTriple("nvptx64-nvidia-cuda"); + +#ifdef HAVE_L0 + reduction_code.module->setDataLayout( + "e-p:64:64:64-i1:8:8-i8:8:8-" + "i16:16:16-i32:32:32-i64:64:64-" + "f32:32:32-f64:64:64-v16:16:16-" + "v32:32:32-v64:64:64-v128:128:128-n16:32:64"); + reduction_code.module->setTargetTriple("spir64-unknown-unknown"); +#endif + llvm::Linker linker(*module_); std::unique_ptr owner(reduction_code.module); bool link_error = linker.linkInModule(std::move(owner)); diff --git a/omniscidb/Tests/CMakeLists.txt b/omniscidb/Tests/CMakeLists.txt index 40e6e21aa1..5bf96f7677 100644 --- a/omniscidb/Tests/CMakeLists.txt +++ b/omniscidb/Tests/CMakeLists.txt @@ -301,18 +301,6 @@ if(ENABLE_L0) ) set_tests_properties(${ENABLING_TESTS} PROPERTIES LABELS "enabling") - # set(intel_gpu_runtime_function_sources l0_mapd_rt.cpp DateAdd.cpp DateTruncate.cpp) - - # set(intel_gpu_module_name RuntimeFunctionsL0.bc) - # set(intel_gpu_module_internal_suffix "L0_internal.bc") - # function(precompile_intel_gpu_module SOURCE_FILE) - # set(module_compiler_flags -Xclang -ffake-address-space-map -DL0_RUNTIME_ENABLED) - # precompile_llvm_module(${SOURCE_FILE} ${intel_gpu_module_internal_suffix} ${module_compiler_flags}) - # endfunction() - # set(precompile_intel_gpu_module_cmd "precompile_intel_gpu_module") - - # precompile_modules("intel_gpu_precompiled_module_list" ${intel_gpu_module_internal_suffix} ${precompile_intel_gpu_module_cmd} "${intel_gpu_runtime_function_sources}") - # link_runtime_module(${intel_gpu_module_name} 
"${intel_gpu_precompiled_module_list}") add_custom_target(enabling_tests COMMAND mkdir -p tmp From cad4e99d341f66dd0386fb35b763f78d1a5904eb Mon Sep 17 00:00:00 2001 From: Laurent Montigny Date: Mon, 17 Apr 2023 08:25:00 -0500 Subject: [PATCH 9/9] Update triple for spir64 --- omniscidb/Tests/GpuSharedMemoryTestIntel.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp index 1ef81efd57..f726881afd 100644 --- a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp +++ b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp @@ -160,7 +160,7 @@ void prepare_generated_gpu_kernel(llvm::Module* module, "i16:16:16-i32:32:32-i64:64:64-" "f32:32:32-f64:64:64-v16:16:16-" "v32:32:32-v64:64:64-v128:128:128-n16:32:64"); - module->setTargetTriple("nvptx64-nvidia-cuda"); + module->setTargetTriple("spir64-unknown-unknown"); llvm::NamedMDNode* md = module->getOrInsertNamedMetadata("nvvm.annotations"); @@ -349,7 +349,7 @@ void perform_test_and_verify_results(TestInputData input) { "i16:16:16-i32:32:32-i64:64:64-" "f32:32:32-f64:64:64-v16:16:16-" "v32:32:32-v64:64:64-v128:128:128-n16:32:64"); - module->setTargetTriple("spir-unknown-unknown"); + module->setTargetTriple("spir64-unknown-unknown"); auto l0_mgr = std::make_unique(); const auto row_set_mem_owner = std::make_shared(nullptr, Executor::getArenaBlockSize());