From fdfc379454474c3b00e40b7727267f6ec130ffab Mon Sep 17 00:00:00 2001 From: Laurent Montigny Date: Thu, 13 Apr 2023 11:06:21 -0500 Subject: [PATCH 1/9] Add files for GPU SM Intel testing --- omniscidb/Tests/CMakeLists.txt | 4 + omniscidb/Tests/GpuSharedMemoryTestIntel.cpp | 683 +++++++++++++++++++ omniscidb/Tests/GpuSharedMemoryTestIntel.h | 89 +++ 3 files changed, 776 insertions(+) create mode 100644 omniscidb/Tests/GpuSharedMemoryTestIntel.cpp create mode 100644 omniscidb/Tests/GpuSharedMemoryTestIntel.h diff --git a/omniscidb/Tests/CMakeLists.txt b/omniscidb/Tests/CMakeLists.txt index 0de06ebb50..0d2734695f 100644 --- a/omniscidb/Tests/CMakeLists.txt +++ b/omniscidb/Tests/CMakeLists.txt @@ -65,16 +65,19 @@ if(ENABLE_L0) add_executable(SpirvBuildTest SpirvBuildTest.cpp) add_executable(DataMgrWithL0Test DataMgrWithL0Test.cpp) add_executable(IntelGPUEnablingTest IntelGPUEnablingTest.cpp) + add_executable(GpuSharedMemoryTestIntel GpuSharedMemoryTestIntel.cpp) target_link_libraries(L0MgrExecuteTest L0Mgr gtest ${llvm_libs} Logger OSDependent) target_link_libraries(SpirvBuildTest gtest ${llvm_libs}) target_link_libraries(DataMgrWithL0Test DataMgr gtest) target_link_libraries(IntelGPUEnablingTest gtest QueryEngine ArrowQueryRunner) + target_link_libraries(GpuSharedMemoryTestIntel gtest QueryEngine ArrowQueryRunner) add_test(L0MgrExecuteTest L0MgrExecuteTest ${TEST_ARGS}) add_test(SpirvBuildTest SpirvBuildTest ${TEST_ARGS}) add_test(DataMgrWithL0Test DataMgrWithL0Test ${TEST_ARGS}) add_test(IntelGPUEnablingTest IntelGPUEnablingTest ${TEST_ARGS}) + add_test(GpuSharedMemoryTestIntel GpuSharedMemoryTestIntel ${TEST_ARGS}) endif() add_executable(CostModelTest CostModel/CostModelTest.cpp) @@ -286,6 +289,7 @@ if(ENABLE_L0) SpirvBuildTest L0MgrExecuteTest IntelGPUEnablingTest + GpuSharedMemoryTestIntel ) set_tests_properties(${ENABLING_TESTS} PROPERTIES LABELS "enabling") diff --git a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp new file mode 100644 index 0000000000..a4da9c48de --- /dev/null +++ b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp @@ -0,0 +1,683 @@ +/* + * Copyright 2019 OmniSci, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "GpuSharedMemoryTest.h"
+#include "QueryEngine/CompilationOptions.h"
+#include "QueryEngine/LLVMGlobalContext.h"
+#include "QueryEngine/OutputBufferInitialization.h"
+#include "QueryEngine/ResultSetReduction.h"
+#include "QueryEngine/ResultSetReductionJIT.h"
+
+extern bool g_is_test_env;
+
+auto int8_type = hdk::ir::Context::defaultCtx().int8();
+auto int16_type = hdk::ir::Context::defaultCtx().int16();
+auto int32_type = hdk::ir::Context::defaultCtx().int32();
+auto int64_type = hdk::ir::Context::defaultCtx().int64();
+auto float_type = hdk::ir::Context::defaultCtx().fp32();
+auto double_type = hdk::ir::Context::defaultCtx().fp64();
+
+namespace {
+
+void init_storage_buffer(int8_t* buffer,
+                         const std::vector<TargetInfo>& targets,
+                         const QueryMemoryDescriptor& query_mem_desc) {
+  // get the initial values for all the aggregate columns
+  const auto init_agg_vals = init_agg_val_vec(targets, query_mem_desc);
+  CHECK(!query_mem_desc.didOutputColumnar());
+  CHECK(query_mem_desc.getQueryDescriptionType() ==
+        QueryDescriptionType::GroupByPerfectHash);
+
+  const auto row_size = query_mem_desc.getRowSize();
+  CHECK(query_mem_desc.hasKeylessHash());
+  for (size_t entry_idx = 0; entry_idx < query_mem_desc.getEntryCount(); ++entry_idx) {
+    const auto row_ptr = buffer + entry_idx * row_size;
+    size_t init_agg_idx{0};
+    int64_t init_val{0};
+    // initialize each row's aggregate columns:
+    auto col_ptr = row_ptr + query_mem_desc.getColOffInBytes(0);
+    for (size_t slot_idx = 0; slot_idx < query_mem_desc.getSlotCount(); slot_idx++) {
+      if (query_mem_desc.getPaddedSlotWidthBytes(slot_idx) > 0) {
+        init_val = init_agg_vals[init_agg_idx++];
+      }
+      switch (query_mem_desc.getPaddedSlotWidthBytes(slot_idx)) {
+        case 4:
+          *reinterpret_cast<int32_t*>(col_ptr) = static_cast<int32_t>(init_val);
+          break;
+        case 8:
+          *reinterpret_cast<int64_t*>(col_ptr) = init_val;
+          break;
+        case 0:
+          break;
+        default:
+          UNREACHABLE();
+      }
+      col_ptr += query_mem_desc.getNextColOffInBytes(col_ptr, entry_idx, slot_idx);
+    }
+  }
+}
+
+}  // namespace
+
+void GpuReductionTester::codegenWrapperKernel() {
+  const unsigned address_space = 0;
+  auto pi8_type = llvm::Type::getInt8PtrTy(context_, address_space);
+  std::vector<llvm::Type*> input_arguments;
+  input_arguments.push_back(llvm::PointerType::get(pi8_type, address_space));
+  input_arguments.push_back(llvm::Type::getInt64Ty(context_));  // num input buffers
+  input_arguments.push_back(llvm::Type::getInt8PtrTy(context_, address_space));
+
+  llvm::FunctionType* ft =
+      llvm::FunctionType::get(llvm::Type::getVoidTy(context_), input_arguments, false);
+  wrapper_kernel_ = llvm::Function::Create(
+      ft, llvm::Function::ExternalLinkage, "wrapper_kernel", module_);
+
+  auto arg_it = wrapper_kernel_->arg_begin();
+  auto input_ptrs = &*arg_it;
+  input_ptrs->setName("input_pointers");
+  arg_it++;
+  auto num_buffers = &*arg_it;
+  num_buffers->setName("num_buffers");
+  arg_it++;
+  auto output_buffer = &*arg_it;
+  output_buffer->setName("output_buffer");
+
+  llvm::IRBuilder<> ir_builder(context_);
+
+  auto bb_entry = llvm::BasicBlock::Create(context_, ".entry", wrapper_kernel_);
+  auto bb_body = llvm::BasicBlock::Create(context_, ".body", wrapper_kernel_);
+  auto bb_exit = llvm::BasicBlock::Create(context_, ".exit", wrapper_kernel_);
+
+  // return if blockIdx.x >= num_buffers
+  ir_builder.SetInsertPoint(bb_entry);
+  auto get_block_index_func = getFunction("get_block_index");
+  auto block_index = ir_builder.CreateCall(get_block_index_func, {}, "block_index");
+  const auto is_block_inbound =
+      ir_builder.CreateICmpSLT(block_index,
+                               num_buffers, "is_block_inbound");
+  ir_builder.CreateCondBr(is_block_inbound, bb_body, bb_exit);
+
+  // locate the corresponding input buffer:
+  ir_builder.SetInsertPoint(bb_body);
+  auto input_buffer_gep = ir_builder.CreateGEP(
+      input_ptrs->getType()->getScalarType()->getPointerElementType(),
+      input_ptrs,
+      block_index);
+  auto input_buffer = ir_builder.CreateLoad(
+      llvm::Type::getInt8PtrTy(context_, address_space), input_buffer_gep);
+  auto input_buffer_ptr =
+      ir_builder.CreatePointerCast(input_buffer,
+                                   llvm::Type::getInt64PtrTy(context_, address_space),
+                                   "input_buffer_ptr");
+  const auto buffer_size = ll_int(
+      static_cast<int32_t>(query_mem_desc_.getBufferSizeBytes(ExecutorDeviceType::GPU)),
+      context_);
+
+  // initializing shared memory and copy input buffer into shared memory buffer:
+  auto init_smem_func = getFunction("init_shared_mem");
+  auto smem_input_buffer_ptr = ir_builder.CreateCall(init_smem_func,
+                                                     {
+                                                         input_buffer_ptr,
+                                                         buffer_size,
+                                                     },
+                                                     "smem_input_buffer_ptr");
+
+  auto output_buffer_ptr =
+      ir_builder.CreatePointerCast(output_buffer,
+                                   llvm::Type::getInt64PtrTy(context_, address_space),
+                                   "output_buffer_ptr");
+  // call the reduction function
+  CHECK(reduction_func_);
+  std::vector<llvm::Value*> reduction_args{
+      output_buffer_ptr, smem_input_buffer_ptr, buffer_size};
+  ir_builder.CreateCall(reduction_func_, reduction_args);
+  ir_builder.CreateBr(bb_exit);
+
+  ir_builder.SetInsertPoint(bb_exit);
+  ir_builder.CreateRet(nullptr);
+}
+
+namespace {
+void prepare_generated_gpu_kernel(llvm::Module* module,
+                                  llvm::LLVMContext& context,
+                                  llvm::Function* kernel) {
+  // might be extra, remove and clean up
+  module->setDataLayout(
+      "e-p:64:64:64-i1:8:8-i8:8:8-"
+      "i16:16:16-i32:32:32-i64:64:64-"
+      "f32:32:32-f64:64:64-v16:16:16-"
+      "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
+  module->setTargetTriple("nvptx64-nvidia-cuda");
+
+  llvm::NamedMDNode* md = module->getOrInsertNamedMetadata("nvvm.annotations");
+
+  llvm::Metadata* md_vals[] = {llvm::ConstantAsMetadata::get(kernel),
+                               llvm::MDString::get(context, "kernel"),
+                               llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+                                   llvm::Type::getInt32Ty(context), 1))};
+
+  // Append metadata to nvvm.annotations
+  md->addOperand(llvm::MDNode::get(context, md_vals));
+}
+
+std::unique_ptr<GpuDeviceCompilationContext> compile_and_link_gpu_code(
+    const std::string& cuda_llir,
+    llvm::Module* module,
+    // CudaMgr_Namespace::CudaMgr* cuda_mgr,
+    const std::string& kernel_name,
+    const size_t gpu_block_size = 1024,
+    const size_t gpu_device_idx = 0) {
+  CHECK(module);
+  // CHECK(cuda_mgr);
+  // auto& context = module->getContext();
+  // std::unique_ptr nvptx_target_machine =
+  //     compiler::CUDABackend::initializeNVPTXBackend(cuda_mgr->getDeviceArch());
+  // const auto ptx =
+  //     compiler::CUDABackend::generatePTX(cuda_llir, nvptx_target_machine.get(),
+  //     context);
+
+  // auto cubin_result = ptx_to_cubin(ptx, gpu_block_size, cuda_mgr);
+  // auto& option_keys = cubin_result.option_keys;
+  // auto& option_values = cubin_result.option_values;
+  // auto cubin = cubin_result.cubin;
+  // auto link_state = cubin_result.link_state;
+  // const auto num_options = option_keys.size();
+  // auto gpu_context = std::make_unique(cubin,
+  //                                     kernel_name,
+  //                                     gpu_device_idx,
+  //                                     cuda_mgr,
+  //                                     num_options,
+  //                                     &option_keys[0],
+  //                                     &option_values[0]);
+
+  // checkCudaErrors(cuLinkDestroy(link_state)); @LM
+  // return gpu_context;
+  return NULL;
+}
+
+std::vector<std::unique_ptr<ResultSet>> create_and_fill_input_result_sets(
+    const size_t num_input_buffers,
+    std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
+    const QueryMemoryDescriptor& query_mem_desc,
+    const std::vector<TargetInfo>& target_infos,
+    std::vector<StrideNumberGenerator>& generators,
+    const std::vector<size_t>& steps) {
+  std::vector<std::unique_ptr<ResultSet>> result_sets;
+  for (size_t i = 0; i < num_input_buffers; i++) {
+    result_sets.push_back(std::make_unique<ResultSet>(target_infos,
+                                                      ExecutorDeviceType::CPU,
+                                                      query_mem_desc,
+                                                      row_set_mem_owner,
+                                                      nullptr,
+                                                      0,
+                                                      0));
+    const auto storage = result_sets.back()->allocateStorage();
+    // fill_storage_buffer(storage->getUnderlyingBuffer(),
+    //                     target_infos,
+    //                     query_mem_desc,
+    //                     generators[i],
+    //                     steps[i]);
+  }
+  return result_sets;
+}
+
+std::pair<std::unique_ptr<ResultSet>, std::unique_ptr<ResultSet>>
+create_and_init_output_result_sets(std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
+                                   const QueryMemoryDescriptor& query_mem_desc,
+                                   const std::vector<TargetInfo>& target_infos) {
+  // CPU result set, will eventually host CPU reduction results for validation
+  auto cpu_result_set = std::make_unique<ResultSet>(target_infos,
+                                                    ExecutorDeviceType::CPU,
+                                                    query_mem_desc,
+                                                    row_set_mem_owner,
+                                                    nullptr,
+                                                    0,
+                                                    0);
+  auto cpu_storage_result = cpu_result_set->allocateStorage();
+  init_storage_buffer(
+      cpu_storage_result->getUnderlyingBuffer(), target_infos, query_mem_desc);
+
+  // GPU result set, will eventually host GPU reduction results
+  auto gpu_result_set = std::make_unique<ResultSet>(target_infos,
+                                                    ExecutorDeviceType::GPU,
+                                                    query_mem_desc,
+                                                    row_set_mem_owner,
+                                                    nullptr,
+                                                    0,
+                                                    0);
+  auto gpu_storage_result = gpu_result_set->allocateStorage();
+  init_storage_buffer(
+      gpu_storage_result->getUnderlyingBuffer(), target_infos, query_mem_desc);
+  return std::make_pair(std::move(cpu_result_set), std::move(gpu_result_set));
+}
+
+void perform_reduction_on_cpu(std::vector<std::unique_ptr<ResultSet>>& result_sets,
+                              const ResultSetStorage* cpu_result_storage) {
+  CHECK(result_sets.size() > 0);
+  Config config;
+  // for codegen only
+  // auto executor = Executor::getExecutor(nullptr);
+  // ResultSetReductionJIT reduction_jit(result_sets.front()->getQueryMemDesc(),
+  //                                     result_sets.front()->getTargetInfos(),
+  //                                     result_sets.front()->getTargetInitVals(),
+  //                                     config,
+  //                                     executor.get());
+  // const auto reduction_code = reduction_jit.codegen();
+  // for (auto& result_set : result_sets) {
+  //   ResultSetReduction::reduce(*cpu_result_storage,
+  //                              *(result_set->getStorage()),
+  //                              {},
+  //                              reduction_code,
+  //                              config,
+  //                              executor.get());
+  // }
+}
+
+struct TestInputData {
+  size_t device_id;
+  size_t num_input_buffers;
+  std::vector<TargetInfo> target_infos;
+  int8_t suggested_agg_widths;
+  size_t min_entry;
+  size_t max_entry;
+  size_t step_size;
+  bool keyless_hash;
+  int32_t target_index_for_key;
+  TestInputData()
+      : device_id(0)
+      , num_input_buffers(0)
+      , suggested_agg_widths(0)
+      , min_entry(0)
+      , max_entry(0)
+      , step_size(2)
+      , keyless_hash(false)
+      , target_index_for_key(0) {}
+  TestInputData& setDeviceId(const size_t id) {
+    device_id = id;
+    return *this;
+  }
+  TestInputData& setNumInputBuffers(size_t num_buffers) {
+    num_input_buffers = num_buffers;
+    return *this;
+  }
+  TestInputData& setTargetInfos(std::vector<TargetInfo> tis) {
+    target_infos = tis;
+    return *this;
+  }
+  TestInputData& setAggWidth(int8_t agg_width) {
+    suggested_agg_widths = agg_width;
+    return *this;
+  }
+  TestInputData& setMinEntry(size_t min_e) {
+    min_entry = min_e;
+    return *this;
+  }
+  TestInputData& setMaxEntry(size_t max_e) {
+    max_entry = max_e;
+    return *this;
+  }
+  TestInputData& setKeylessHash(bool is_keyless) {
+    keyless_hash = is_keyless;
+    return *this;
+  }
+  TestInputData& setTargetIndexForKey(size_t target_idx) {
+    target_index_for_key = target_idx;
+    return *this;
+  }
+  TestInputData& setStepSize(size_t step) {
+    step_size = step;
+    return *this;
+  }
+};
+
+void perform_test_and_verify_results(TestInputData input) {
+  auto executor = Executor::getExecutor(nullptr, nullptr);
+  auto& context = executor->getContext();
+  auto cgen_state = std::unique_ptr<CgenState>(
+      new CgenState({}, false, false, executor->getExtensionModuleContext(), context));
+  cgen_state->set_module_shallow_copy(
+      executor->getExtensionModuleContext()->getRTModule(/*is_l0=*/false));
+  // auto module = cgen_state->module_;
+  // module->setDataLayout(
+  //     "e-p:64:64:64-i1:8:8-i8:8:8-"
+  //     "i16:16:16-i32:32:32-i64:64:64-"
+  //     "f32:32:32-f64:64:64-v16:16:16-"
+  //     "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
+  // module->setTargetTriple("nvptx64-nvidia-cuda");
+  // auto cuda_mgr = std::make_unique(1);
+  // const auto row_set_mem_owner =
+  //     std::make_shared(nullptr, Executor::getArenaBlockSize());
+  // auto query_mem_desc = perfect_hash_one_col_desc(
+  //     input.target_infos, input.suggested_agg_widths, input.min_entry,
+  //     input.max_entry);
+  // if (input.keyless_hash) {
+  //   query_mem_desc.setHasKeylessHash(true);
+  //   query_mem_desc.setTargetIdxForKey(input.target_index_for_key);
+  // }
+
+  // std::vector generators(
+  //     input.num_input_buffers, StrideNumberGenerator(1, input.step_size));
+  // std::vector steps(input.num_input_buffers, input.step_size);
+  // auto input_result_sets = create_and_fill_input_result_sets(input.num_input_buffers,
+  //                                                            row_set_mem_owner,
+  //                                                            query_mem_desc,
+  //                                                            input.target_infos,
+  //                                                            generators,
+  //                                                            steps);
+
+  // const auto [cpu_result_set, gpu_result_set] = create_and_init_output_result_sets(
+  //     row_set_mem_owner, query_mem_desc, input.target_infos);
+
+  // // performing reduction using the GPU reduction code:
+  // Config config;
+  // GpuReductionTester gpu_smem_tester(config,
+  //                                    module,
+  //                                    context,
+  //                                    query_mem_desc,
+  //                                    input.target_infos,
+  //                                    init_agg_val_vec(input.target_infos,
+  //                                    query_mem_desc), cuda_mgr.get(), executor.get());
+  // gpu_smem_tester.codegen(CompilationOptions::defaults(
+  //     ExecutorDeviceType::GPU,
+  //     false));  // generate code for gpu reduction and initialization
+  // gpu_smem_tester.codegenWrapperKernel();
+  // gpu_smem_tester.performReductionTest(
+  //     input_result_sets, gpu_result_set->getStorage(), input.device_id);
+
+  // // CPU reduction for validation:
+  // perform_reduction_on_cpu(input_result_sets, cpu_result_set->getStorage());
+
+  // const auto cmp_result =
+  //     std::memcmp(cpu_result_set->getStorage()->getUnderlyingBuffer(),
+  //                 gpu_result_set->getStorage()->getUnderlyingBuffer(),
+  //                 query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU));
+  // ASSERT_EQ(cmp_result, 0);
+}
+
+}  // namespace
+
+void GpuReductionTester::performReductionTest(
+    const std::vector<std::unique_ptr<ResultSet>>& result_sets,
+    const ResultSetStorage* gpu_result_storage,
+    const size_t device_id) {
+  prepare_generated_gpu_kernel(module_, context_, getWrapperKernel());
+
+  std::stringstream ss;
+  llvm::raw_os_ostream os(ss);
+  module_->print(os, nullptr);
+  os.flush();
+  std::string module_str(ss.str());
+
+  // std::unique_ptr gpu_context(compile_and_link_gpu_code(
+  //     module_str, module_, cuda_mgr_, getWrapperKernel()->getName().str()));
+
+  // const auto buffer_size = query_mem_desc_.getBufferSizeBytes(ExecutorDeviceType::GPU);
+  // const size_t num_buffers = result_sets.size();
+  // std::vector d_input_buffers;
+  // for (size_t i = 0; i < num_buffers; i++) {
+  //   d_input_buffers.push_back(cuda_mgr_->allocateDeviceMem(buffer_size, device_id));
+  // 
cuda_mgr_->copyHostToDevice(d_input_buffers[i], + // result_sets[i]->getStorage()->getUnderlyingBuffer(), + // buffer_size, + // device_id); + // } + + // constexpr size_t num_kernel_params = 3; + // CHECK_EQ(getWrapperKernel()->arg_size(), num_kernel_params); + + // // parameter 1: an array of device pointers + // std::vector h_input_buffer_dptrs; + // h_input_buffer_dptrs.reserve(num_buffers); + // std::transform(d_input_buffers.begin(), + // d_input_buffers.end(), + // std::back_inserter(h_input_buffer_dptrs), + // [](int8_t* dptr) { return reinterpret_cast(dptr); }); + + // auto d_input_buffer_dptrs = + // cuda_mgr_->allocateDeviceMem(num_buffers * sizeof(CUdeviceptr), device_id); + // cuda_mgr_->copyHostToDevice(d_input_buffer_dptrs, + // reinterpret_cast(h_input_buffer_dptrs.data()), + // num_buffers * sizeof(CUdeviceptr), + // device_id); + + // // parameter 2: number of buffers + // auto d_num_buffers = cuda_mgr_->allocateDeviceMem(sizeof(int64_t), device_id); + // cuda_mgr_->copyHostToDevice(d_num_buffers, + // reinterpret_cast(&num_buffers), + // sizeof(int64_t), + // device_id); + + // // parameter 3: device pointer to the output buffer + // auto d_result_buffer = cuda_mgr_->allocateDeviceMem(buffer_size, device_id); + // cuda_mgr_->copyHostToDevice( + // d_result_buffer, gpu_result_storage->getUnderlyingBuffer(), buffer_size, + // device_id); + + // // collecting all kernel parameters: + // std::vector h_kernel_params{ + // reinterpret_cast(d_input_buffer_dptrs), + // reinterpret_cast(d_num_buffers), + // reinterpret_cast(d_result_buffer)}; + + // // casting each kernel parameter to be a void* device ptr itself: + // std::vector kernel_param_ptrs; + // kernel_param_ptrs.reserve(num_kernel_params); + // std::transform(h_kernel_params.begin(), + // h_kernel_params.end(), + // std::back_inserter(kernel_param_ptrs), + // [](CUdeviceptr& param) { return ¶m; }); + + // // launching a kernel: + // auto cu_func = static_cast(gpu_context->kernel()); + // // we launch as many threadblocks as there are input buffers: + // // in other words, each input buffer is handled by a single threadblock. 
+ + // // checkCudaErrors(cuLaunchKernel(cu_func, + // // num_buffers, + // // 1, + // // 1, + // // 1024, + // // 1, + // // 1, + // // buffer_size, + // // 0, + // // kernel_param_ptrs.data(), + // // nullptr)); + + // // transfer back the results: + // cuda_mgr_->copyDeviceToHost( + // gpu_result_storage->getUnderlyingBuffer(), d_result_buffer, buffer_size, + // device_id); + + // // release the gpu memory used: + // for (auto& d_buffer : d_input_buffers) { + // cuda_mgr_->freeDeviceMem(d_buffer); + // } + // cuda_mgr_->freeDeviceMem(d_input_buffer_dptrs); + // cuda_mgr_->freeDeviceMem(d_num_buffers); + // cuda_mgr_->freeDeviceMem(d_result_buffer); +} + +// TEST(SingleColumn, VariableEntries_CountQuery_4B_Group) { +// for (auto num_entries : {1, 2, 3, 5, 13, 31, 63, 126, 241, 511, 1021}) { +// TestInputData input; +// input.setDeviceId(0) +// .setNumInputBuffers(4) +// .setTargetInfos(generate_custom_agg_target_infos( +// {4}, {hdk::ir::AggType::kCount}, {int32_type}, {int32_type})) +// .setAggWidth(4) +// .setMinEntry(0) +// .setMaxEntry(num_entries) +// .setStepSize(2) +// .setKeylessHash(true) +// .setTargetIndexForKey(0); +// perform_test_and_verify_results(input); +// } +// } + +// TEST(SingleColumn, VariableEntries_CountQuery_8B_Group) { +// for (auto num_entries : {1, 2, 3, 5, 13, 31, 63, 126, 241, 511, 1021}) { +// TestInputData input; +// input.setDeviceId(0) +// .setNumInputBuffers(4) +// .setTargetInfos(generate_custom_agg_target_infos( +// {8}, {hdk::ir::AggType::kCount}, {int64_type}, {int64_type})) +// .setAggWidth(8) +// .setMinEntry(0) +// .setMaxEntry(num_entries) +// .setStepSize(2) +// .setKeylessHash(true) +// .setTargetIndexForKey(0); +// perform_test_and_verify_results(input); +// } +// } + +// TEST(SingleColumn, VariableSteps_FixedEntries_1) { +// TestInputData input; +// input.setDeviceId(0) +// .setNumInputBuffers(4) +// .setAggWidth(8) +// .setMinEntry(0) +// .setMaxEntry(126) +// .setKeylessHash(true) +// .setTargetIndexForKey(0) +// .setTargetInfos(generate_custom_agg_target_infos( +// {8}, +// {hdk::ir::AggType::kCount, +// hdk::ir::AggType::kMax, +// hdk::ir::AggType::kMin, +// hdk::ir::AggType::kSum, +// hdk::ir::AggType::kAvg}, +// {int64_type, int64_type, int64_type, int64_type, double_type}, +// {int32_type, int32_type, int32_type, int32_type, int32_type})); + +// for (auto& step_size : {2, 3, 5, 7, 11, 13}) { +// input.setStepSize(step_size); +// perform_test_and_verify_results(input); +// } +// } + +// TEST(SingleColumn, VariableSteps_FixedEntries_2) { +// TestInputData input; +// input.setDeviceId(0) +// .setNumInputBuffers(4) +// .setAggWidth(8) +// .setMinEntry(0) +// .setMaxEntry(126) +// .setKeylessHash(true) +// .setTargetIndexForKey(0) +// .setTargetInfos(generate_custom_agg_target_infos( +// {8}, +// {hdk::ir::AggType::kCount, +// hdk::ir::AggType::kAvg, +// hdk::ir::AggType::kMax, +// hdk::ir::AggType::kSum, +// hdk::ir::AggType::kMin}, +// {int64_type, double_type, int64_type, int64_type, int64_type}, +// {int32_type, int32_type, int32_type, int32_type, int32_type})); + +// for (auto& step_size : {2, 3, 5, 7, 11, 13}) { +// input.setStepSize(step_size); +// perform_test_and_verify_results(input); +// } +// } + +// TEST(SingleColumn, VariableSteps_FixedEntries_3) { +// TestInputData input; +// input.setDeviceId(0) +// .setNumInputBuffers(4) +// .setAggWidth(8) +// .setMinEntry(0) +// .setMaxEntry(367) +// .setKeylessHash(true) +// .setTargetIndexForKey(0) +// .setTargetInfos(generate_custom_agg_target_infos( +// {8}, +// 
{hdk::ir::AggType::kCount, +// hdk::ir::AggType::kMax, +// hdk::ir::AggType::kAvg, +// hdk::ir::AggType::kSum, +// hdk::ir::AggType::kMin}, +// {int64_type, double_type, double_type, double_type, double_type}, +// {int32_type, double_type, double_type, double_type, double_type})); + +// for (auto& step_size : {2, 3, 5, 7, 11, 13}) { +// input.setStepSize(step_size); +// perform_test_and_verify_results(input); +// } +// } + +// TEST(SingleColumn, VariableSteps_FixedEntries_4) { +// TestInputData input; +// input.setDeviceId(0) +// .setNumInputBuffers(4) +// .setAggWidth(8) +// .setMinEntry(0) +// .setMaxEntry(517) +// .setKeylessHash(true) +// .setTargetIndexForKey(0) +// .setTargetInfos(generate_custom_agg_target_infos( +// {8}, +// {hdk::ir::AggType::kCount, +// hdk::ir::AggType::kSum, +// hdk::ir::AggType::kMax, +// hdk::ir::AggType::kAvg, +// hdk::ir::AggType::kMin}, +// {int64_type, float_type, float_type, float_type, float_type}, +// {int16_type, float_type, float_type, float_type, float_type})); + +// for (auto& step_size : {2, 3, 5, 7, 11, 13}) { +// input.setStepSize(step_size); +// perform_test_and_verify_results(input); +// } +// } + +// TEST(SingleColumn, VariableNumBuffers) { +// TestInputData input; +// input.setDeviceId(0) +// .setAggWidth(8) +// .setMinEntry(0) +// .setMaxEntry(266) +// .setKeylessHash(true) +// .setTargetIndexForKey(0) +// .setTargetInfos(generate_custom_agg_target_infos( +// {8}, +// {hdk::ir::AggType::kCount, +// hdk::ir::AggType::kSum, +// hdk::ir::AggType::kAvg, +// hdk::ir::AggType::kMax, +// hdk::ir::AggType::kMin}, +// {int32_type, int64_type, double_type, float_type, double_type}, +// {int8_type, int8_type, int16_type, float_type, double_type})); + +// for (auto& num_buffers : {2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 128}) { +// input.setNumInputBuffers(num_buffers); +// perform_test_and_verify_results(input); +// } +// } + +int main(int argc, char** argv) { + g_is_test_env = true; + + TestHelpers::init_logger_stderr_only(argc, argv); + testing::InitGoogleTest(&argc, argv); + + int err{0}; + try { + err = RUN_ALL_TESTS(); + } catch (const std::exception& e) { + LOG(ERROR) << e.what(); + } + return err; +} diff --git a/omniscidb/Tests/GpuSharedMemoryTestIntel.h b/omniscidb/Tests/GpuSharedMemoryTestIntel.h new file mode 100644 index 0000000000..18cab64b05 --- /dev/null +++ b/omniscidb/Tests/GpuSharedMemoryTestIntel.h @@ -0,0 +1,89 @@ +/* + * Copyright 2019 OmniSci, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include "CudaMgr/CudaMgr.h"
+#include "Logger/Logger.h"
+#include "QueryEngine/CodeGenerator.h"
+#include "QueryEngine/GpuSharedMemoryUtils.h"
+#include "QueryEngine/LLVMFunctionAttributesUtil.h"
+#include "QueryEngine/NvidiaKernel.h"
+#include "QueryEngine/OutputBufferInitialization.h"
+#include "ResultSetTestUtils.h"
+#include "Shared/TargetInfo.h"
+#include "TestHelpers.h"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+class StrideNumberGenerator : public NumberGenerator {
+ public:
+  StrideNumberGenerator(const int64_t start, const int64_t stride)
+      : crt_(start), stride_(stride), start_(start) {}
+
+  int64_t getNextValue() override {
+    const auto crt = crt_;
+    crt_ += stride_;
+    return crt;
+  }
+
+  void reset() override { crt_ = start_; }
+
+ private:
+  int64_t crt_;
+  int64_t stride_;
+  int64_t start_;
+};
+
+class GpuReductionTester : public GpuSharedMemCodeBuilder {
+ public:
+  GpuReductionTester(const Config& config,
+                     llvm::Module* module,
+                     llvm::LLVMContext& context,
+                     const QueryMemoryDescriptor& qmd,
+                     const std::vector<TargetInfo>& targets,
+                     const std::vector<int64_t>& init_agg_values,
+                     CudaMgr_Namespace::CudaMgr* cuda_mgr,
+                     Executor* executor)
+      : GpuSharedMemCodeBuilder(module,
+                                context,
+                                qmd,
+                                targets,
+                                init_agg_values,
+                                config,
+                                executor)
+      , cuda_mgr_(cuda_mgr) {
+    // CHECK(getReductionFunction());
+  }
+  void codegenWrapperKernel();
+  llvm::Function* getWrapperKernel() const { return wrapper_kernel_; }
+  void performReductionTest(const std::vector<std::unique_ptr<ResultSet>>& result_sets,
+                            const ResultSetStorage* gpu_result_storage,
+                            const size_t device_id);
+
+ private:
+  CudaMgr_Namespace::CudaMgr* cuda_mgr_;
+  llvm::Function* wrapper_kernel_;
+};

From 565a3e0554f11e61e86f65a9ec4760cf6b57d2e0 Mon Sep 17 00:00:00 2001
From: Laurent Montigny
Date: Fri, 14 Apr 2023 09:45:02 -0500
Subject: [PATCH 2/9] Replace cudaMgr with L0Mgr

---
 omniscidb/Tests/GpuSharedMemoryTestIntel.cpp | 38 ++++++++++----------
 omniscidb/Tests/GpuSharedMemoryTestIntel.h   | 10 +++---
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp
index a4da9c48de..dfe76790e6 100644
--- a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp
+++ b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
 */
 
-#include "GpuSharedMemoryTest.h"
+#include "GpuSharedMemoryTestIntel.h"
 #include "QueryEngine/CompilationOptions.h"
 #include "QueryEngine/LLVMGlobalContext.h"
 #include "QueryEngine/OutputBufferInitialization.h"
 #include "QueryEngine/ResultSetReduction.h"
 #include "QueryEngine/ResultSetReductionJIT.h"
@@ -175,13 +175,13 @@
 std::unique_ptr<GpuDeviceCompilationContext> compile_and_link_gpu_code(
     const std::string& cuda_llir,
     llvm::Module* module,
-    // CudaMgr_Namespace::CudaMgr* cuda_mgr,
+    l0::L0Manager* l0_mgr,
     const std::string& kernel_name,
     const size_t gpu_block_size = 1024,
     const size_t gpu_device_idx = 0) {
   CHECK(module);
-  // CHECK(cuda_mgr);
-  // auto& context = module->getContext();
+  CHECK(l0_mgr);
+  auto& context = module->getContext();
   // std::unique_ptr nvptx_target_machine =
   //     compiler::CUDABackend::initializeNVPTXBackend(cuda_mgr->getDeviceArch());
   // const auto ptx =
@@ -267,21 +267,21 @@ void perform_reduction_on_cpu(std::vector<std::unique_ptr<ResultSet>>& result_se
   CHECK(result_sets.size() > 0);
   Config config;
   // for codegen only
-  // auto executor = Executor::getExecutor(nullptr);
-  // ResultSetReductionJIT reduction_jit(result_sets.front()->getQueryMemDesc(),
-  //                                     result_sets.front()->getTargetInfos(),
-  //                                     result_sets.front()->getTargetInitVals(),
-  //                                     config,
-  //                                     executor.get());
-  // const auto reduction_code = reduction_jit.codegen();
-  // for (auto& result_set : result_sets) {
-  //   ResultSetReduction::reduce(*cpu_result_storage,
-  //                              *(result_set->getStorage()),
-  //                              {},
-  //                              reduction_code,
-  //                              config,
-  //                              executor.get());
-  // }
+  auto executor = Executor::getExecutor(nullptr);
+  ResultSetReductionJIT reduction_jit(result_sets.front()->getQueryMemDesc(),
+                                      result_sets.front()->getTargetInfos(),
+                                      result_sets.front()->getTargetInitVals(),
+                                      config,
+                                      executor.get());
+  const auto reduction_code = reduction_jit.codegen();
+  for (auto& result_set : result_sets) {
+    ResultSetReduction::reduce(*cpu_result_storage,
+                               *(result_set->getStorage()),
+                               {},
+                               reduction_code,
+                               config,
+                               executor.get());
+  }
 }
diff --git a/omniscidb/Tests/GpuSharedMemoryTestIntel.h b/omniscidb/Tests/GpuSharedMemoryTestIntel.h
index 18cab64b05..cbdcd71289 100644
--- a/omniscidb/Tests/GpuSharedMemoryTestIntel.h
+++ b/omniscidb/Tests/GpuSharedMemoryTestIntel.h
@@ -16,12 +16,12 @@
 
 #pragma once
 
-#include "CudaMgr/CudaMgr.h"
+#include "L0Mgr/L0Mgr.h"
 #include "Logger/Logger.h"
 #include "QueryEngine/CodeGenerator.h"
 #include "QueryEngine/GpuSharedMemoryUtils.h"
 #include "QueryEngine/LLVMFunctionAttributesUtil.h"
-#include "QueryEngine/NvidiaKernel.h"
+// #include "QueryEngine/NvidiaKernel.h"
 #include "QueryEngine/OutputBufferInitialization.h"
 #include "ResultSetTestUtils.h"
 #include "Shared/TargetInfo.h"
 #include "TestHelpers.h"
@@ -65,7 +65,7 @@ class GpuReductionTester : public GpuSharedMemCodeBuilder {
                      const QueryMemoryDescriptor& qmd,
                      const std::vector<TargetInfo>& targets,
                      const std::vector<int64_t>& init_agg_values,
-                     CudaMgr_Namespace::CudaMgr* cuda_mgr,
+                     l0::L0Manager* l0_mgr,
                      Executor* executor)
       : GpuSharedMemCodeBuilder(module,
                                 context,
@@ -74,7 +74,7 @@ class GpuReductionTester : public GpuSharedMemCodeBuilder {
                                 init_agg_values,
                                 config,
                                 executor)
-      , cuda_mgr_(cuda_mgr) {
+      , l0_mgr_(l0_mgr) {
     // CHECK(getReductionFunction());
   }
   void codegenWrapperKernel();
@@ -84,6 +84,6 @@ class GpuReductionTester : public GpuSharedMemCodeBuilder {
                             const size_t device_id);
 
  private:
-  CudaMgr_Namespace::CudaMgr* cuda_mgr_;
+  l0::L0Manager* l0_mgr_;
   llvm::Function* wrapper_kernel_;
 };

From 7e22fb315cbf90170d76407e7c5f7f61ca5002b5 Mon Sep 17 00:00:00 2001
From: Laurent Montigny
Date: Fri, 14 Apr 2023 10:31:53 -0500
Subject: [PATCH 3/9] Add compile_and_link_gpu_code for L0

---
 omniscidb/Tests/GpuSharedMemoryTestIntel.cpp | 45 +++++++++-----------
 omniscidb/Tests/GpuSharedMemoryTestIntel.h   |  2 +-
 2 files changed, 21 insertions(+), 26 deletions(-)

diff --git a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp
index dfe76790e6..a519c8adba 100644
--- a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp
+++ b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp
@@ -15,6 +15,7 @@
  */
 
 #include "GpuSharedMemoryTestIntel.h"
+#include 
 #include "QueryEngine/CompilationOptions.h"
 #include "QueryEngine/LLVMGlobalContext.h"
 #include "QueryEngine/OutputBufferInitialization.h"
@@ -172,8 +173,7 @@ void prepare_generated_gpu_kernel(llvm::Module* module,
   md->addOperand(llvm::MDNode::get(context, md_vals));
 }
 
-std::unique_ptr<GpuDeviceCompilationContext> compile_and_link_gpu_code(
-    const std::string& cuda_llir,
+std::unique_ptr<L0DeviceCompilationContext> compile_and_link_gpu_code(
     llvm::Module* module,
     l0::L0Manager* l0_mgr,
     const std::string& kernel_name,
@@ -182,29 +182,24 @@ std::unique_ptr<GpuDeviceCompilationContext> compile_and_link_gpu_code(
   CHECK(module);
   CHECK(l0_mgr);
   auto& context = module->getContext();
-  // std::unique_ptr nvptx_target_machine =
-  //     compiler::CUDABackend::initializeNVPTXBackend(cuda_mgr->getDeviceArch());
-  // const auto ptx =
-  //     compiler::CUDABackend::generatePTX(cuda_llir, nvptx_target_machine.get(),
-  //     context);
-
-  // auto cubin_result = ptx_to_cubin(ptx, gpu_block_size, cuda_mgr);
-  // auto& option_keys = cubin_result.option_keys;
-  // auto& option_values = cubin_result.option_values;
-  // auto cubin = cubin_result.cubin;
-  // auto link_state = cubin_result.link_state;
-  // const auto num_options = option_keys.size();
-  // auto gpu_context = std::make_unique(cubin,
-  //                                     kernel_name,
-  //                                     gpu_device_idx,
-  //                                     cuda_mgr,
-  //                                     num_options,
-  //                                     &option_keys[0],
-  //                                     &option_values[0]);
-
-  // checkCudaErrors(cuLinkDestroy(link_state)); @LM
-  // return gpu_context;
-  return NULL;
+
+  SPIRV::TranslatorOpts opts;
+  opts.enableAllExtensions();
+  opts.setDesiredBIsRepresentation(SPIRV::BIsRepresentation::OpenCL12);
+  opts.setDebugInfoEIS(SPIRV::DebugInfoEIS::OpenCL_DebugInfo_100);
+
+  std::ostringstream ss;
+  std::string err;
+  auto success = writeSpirv(module, opts, ss, err);
+  CHECK(success) << "Spirv translation failed with error: " << err << "\n";
+
+  L0BinResult bin_result;
+  bin_result = spv_to_bin(ss.str(), kernel_name, gpu_block_size, l0_mgr);
+
+  auto l0_context = std::make_unique<L0DeviceCompilationContext>(
+      bin_result.device, bin_result.kernel, bin_result.module, l0_mgr, 0, 1);
+
+  return l0_context;
 }
 
 std::vector<std::unique_ptr<ResultSet>> create_and_fill_input_result_sets(
diff --git a/omniscidb/Tests/GpuSharedMemoryTestIntel.h b/omniscidb/Tests/GpuSharedMemoryTestIntel.h
index cbdcd71289..535e0a47e4 100644
--- a/omniscidb/Tests/GpuSharedMemoryTestIntel.h
+++ b/omniscidb/Tests/GpuSharedMemoryTestIntel.h
@@ -20,8 +20,8 @@
 #include "Logger/Logger.h"
 #include "QueryEngine/CodeGenerator.h"
 #include "QueryEngine/GpuSharedMemoryUtils.h"
+#include "QueryEngine/L0Kernel.h"
 #include "QueryEngine/LLVMFunctionAttributesUtil.h"
-// #include "QueryEngine/NvidiaKernel.h"
 #include "QueryEngine/OutputBufferInitialization.h"
 #include "ResultSetTestUtils.h"
 #include "Shared/TargetInfo.h"

From 6a64c86d9dd3d4da4ec1394553bfce87990e1bf2 Mon Sep 17 00:00:00 2001
From: Laurent Montigny
Date: Fri, 14 Apr 2023 10:46:25 -0500
Subject: [PATCH 4/9] Fix fill_storage_buffer method

---
 omniscidb/Tests/CMakeLists.txt               |  2 +-
 omniscidb/Tests/GpuSharedMemoryTestIntel.cpp | 10 +++++-----
 2 files changed, 6
insertions(+), 6 deletions(-) diff --git a/omniscidb/Tests/CMakeLists.txt b/omniscidb/Tests/CMakeLists.txt index 0d2734695f..1f5094cb7f 100644 --- a/omniscidb/Tests/CMakeLists.txt +++ b/omniscidb/Tests/CMakeLists.txt @@ -65,7 +65,7 @@ if(ENABLE_L0) add_executable(SpirvBuildTest SpirvBuildTest.cpp) add_executable(DataMgrWithL0Test DataMgrWithL0Test.cpp) add_executable(IntelGPUEnablingTest IntelGPUEnablingTest.cpp) - add_executable(GpuSharedMemoryTestIntel GpuSharedMemoryTestIntel.cpp) + add_executable(GpuSharedMemoryTestIntel GpuSharedMemoryTestIntel.cpp ResultSetTestUtils.cpp) target_link_libraries(L0MgrExecuteTest L0Mgr gtest ${llvm_libs} Logger OSDependent) target_link_libraries(SpirvBuildTest gtest ${llvm_libs}) diff --git a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp index a519c8adba..221051b70d 100644 --- a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp +++ b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp @@ -219,11 +219,11 @@ std::vector> create_and_fill_input_result_sets( 0, 0)); const auto storage = result_sets.back()->allocateStorage(); - // fill_storage_buffer(storage->getUnderlyingBuffer(), - // target_infos, - // query_mem_desc, - // generators[i], - // steps[i]); + fill_storage_buffer(storage->getUnderlyingBuffer(), + target_infos, + query_mem_desc, + generators[i], + steps[i]); } return result_sets; } From 048233f5dfbe7f703b6897f1738eab019be4f8f5 Mon Sep 17 00:00:00 2001 From: Laurent Montigny Date: Fri, 14 Apr 2023 11:29:00 -0500 Subject: [PATCH 5/9] Enable perform_test_and_verify_results for L0 --- omniscidb/Tests/GpuSharedMemoryTestIntel.cpp | 108 +++++++++---------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp index 221051b70d..2008902eb1 100644 --- a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp +++ b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp @@ -343,61 +343,61 @@ void perform_test_and_verify_results(TestInputData input) { new CgenState({}, false, false, executor->getExtensionModuleContext(), context)); cgen_state->set_module_shallow_copy( executor->getExtensionModuleContext()->getRTModule(/*is_l0=*/false)); - // auto module = cgen_state->module_; - // module->setDataLayout( - // "e-p:64:64:64-i1:8:8-i8:8:8-" - // "i16:16:16-i32:32:32-i64:64:64-" - // "f32:32:32-f64:64:64-v16:16:16-" - // "v32:32:32-v64:64:64-v128:128:128-n16:32:64"); - // module->setTargetTriple("nvptx64-nvidia-cuda"); - // auto cuda_mgr = std::make_unique(1); - // const auto row_set_mem_owner = - // std::make_shared(nullptr, Executor::getArenaBlockSize()); - // auto query_mem_desc = perfect_hash_one_col_desc( - // input.target_infos, input.suggested_agg_widths, input.min_entry, - // input.max_entry); - // if (input.keyless_hash) { - // query_mem_desc.setHasKeylessHash(true); - // query_mem_desc.setTargetIdxForKey(input.target_index_for_key); - // } + auto module = cgen_state->module_; + module->setDataLayout( + "e-p:64:64:64-i1:8:8-i8:8:8-" + "i16:16:16-i32:32:32-i64:64:64-" + "f32:32:32-f64:64:64-v16:16:16-" + "v32:32:32-v64:64:64-v128:128:128-n16:32:64"); + module->setTargetTriple("spir-unknown-unknown"); + auto l0_mgr = std::make_unique(); + const auto row_set_mem_owner = + std::make_shared(nullptr, Executor::getArenaBlockSize()); + auto query_mem_desc = perfect_hash_one_col_desc( + input.target_infos, input.suggested_agg_widths, input.min_entry, input.max_entry); + if (input.keyless_hash) { + query_mem_desc.setHasKeylessHash(true); + 
query_mem_desc.setTargetIdxForKey(input.target_index_for_key);
+  }
+
+  std::vector<StrideNumberGenerator> generators(
+      input.num_input_buffers, StrideNumberGenerator(1, input.step_size));
+  std::vector<size_t> steps(input.num_input_buffers, input.step_size);
+  auto input_result_sets = create_and_fill_input_result_sets(input.num_input_buffers,
+                                                             row_set_mem_owner,
+                                                             query_mem_desc,
+                                                             input.target_infos,
+                                                             generators,
+                                                             steps);
 
-  // std::vector generators(
-  //     input.num_input_buffers, StrideNumberGenerator(1, input.step_size));
-  // std::vector steps(input.num_input_buffers, input.step_size);
-  // auto input_result_sets = create_and_fill_input_result_sets(input.num_input_buffers,
-  //                                                            row_set_mem_owner,
-  //                                                            query_mem_desc,
-  //                                                            input.target_infos,
-  //                                                            generators,
-  //                                                            steps);
-
-  // const auto [cpu_result_set, gpu_result_set] = create_and_init_output_result_sets(
-  //     row_set_mem_owner, query_mem_desc, input.target_infos);
-
-  // // performing reduction using the GPU reduction code:
-  // Config config;
-  // GpuReductionTester gpu_smem_tester(config,
-  //                                    module,
-  //                                    context,
-  //                                    query_mem_desc,
-  //                                    input.target_infos,
-  //                                    init_agg_val_vec(input.target_infos,
-  //                                    query_mem_desc), cuda_mgr.get(), executor.get());
-  // gpu_smem_tester.codegen(CompilationOptions::defaults(
-  //     ExecutorDeviceType::GPU,
-  //     false));  // generate code for gpu reduction and initialization
-  // gpu_smem_tester.codegenWrapperKernel();
-  // gpu_smem_tester.performReductionTest(
-  //     input_result_sets, gpu_result_set->getStorage(), input.device_id);
-
-  // // CPU reduction for validation:
-  // perform_reduction_on_cpu(input_result_sets, cpu_result_set->getStorage());
-
-  // const auto cmp_result =
-  //     std::memcmp(cpu_result_set->getStorage()->getUnderlyingBuffer(),
-  //                 gpu_result_set->getStorage()->getUnderlyingBuffer(),
-  //                 query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU));
-  // ASSERT_EQ(cmp_result, 0);
+  const auto [cpu_result_set, gpu_result_set] = create_and_init_output_result_sets(
+      row_set_mem_owner, query_mem_desc, input.target_infos);
+
+  // performing reduction using the GPU reduction code:
+  Config config;
+  GpuReductionTester gpu_smem_tester(config,
+                                     module,
+                                     context,
+                                     query_mem_desc,
+                                     input.target_infos,
+                                     init_agg_val_vec(input.target_infos, query_mem_desc),
+                                     l0_mgr.get(),
+                                     executor.get());
+  gpu_smem_tester.codegen(CompilationOptions::defaults(
+      ExecutorDeviceType::GPU,
+      false));  // generate code for gpu reduction and initialization
+  gpu_smem_tester.codegenWrapperKernel();
+  gpu_smem_tester.performReductionTest(
+      input_result_sets, gpu_result_set->getStorage(), input.device_id);
+
+  // CPU reduction for validation:
+  perform_reduction_on_cpu(input_result_sets, cpu_result_set->getStorage());
+
+  const auto cmp_result =
+      std::memcmp(cpu_result_set->getStorage()->getUnderlyingBuffer(),
+                  gpu_result_set->getStorage()->getUnderlyingBuffer(),
+                  query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU));
+  ASSERT_EQ(cmp_result, 0);
 }
 
 }  // namespace

From 91df5856a5e110f6ea358204f52c411812784b36 Mon Sep 17 00:00:00 2001
From: Laurent Montigny
Date: Fri, 14 Apr 2023 13:15:43 -0500
Subject: [PATCH 6/9] Enable all the missing L0 code

---
 omniscidb/Tests/GpuSharedMemoryTestIntel.cpp | 217 ++++++++++---------
 1 file changed, 112 insertions(+), 105 deletions(-)

diff --git a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp
index 2008902eb1..1ef81efd57 100644
--- a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp
+++ b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp
@@ -174,6 +174,7 @@ void prepare_generated_gpu_kernel(llvm::Module* module,
 }
 
 std::unique_ptr<L0DeviceCompilationContext> compile_and_link_gpu_code(
+    const std::string& l0_llir,
     llvm::Module* module,
     l0::L0Manager* l0_mgr,
     const std::string& kernel_name,
@@ -181,7 +182,6 @@ std::unique_ptr<L0DeviceCompilationContext> compile_and_link_gpu_code(
     const size_t gpu_device_idx = 0) {
   CHECK(module);
   CHECK(l0_mgr);
-  auto& context = module->getContext();
 
   SPIRV::TranslatorOpts opts;
   opts.enableAllExtensions();
   opts.setDesiredBIsRepresentation(SPIRV::BIsRepresentation::OpenCL12);
@@ -414,112 +414,119 @@ void GpuReductionTester::performReductionTest(
   os.flush();
   std::string module_str(ss.str());
 
-  // std::unique_ptr gpu_context(compile_and_link_gpu_code(
-  //     module_str, module_, cuda_mgr_, getWrapperKernel()->getName().str()));
-
-  // const auto buffer_size = query_mem_desc_.getBufferSizeBytes(ExecutorDeviceType::GPU);
-  // const size_t num_buffers = result_sets.size();
-  // std::vector d_input_buffers;
-  // for (size_t i = 0; i < num_buffers; i++) {
-  //   d_input_buffers.push_back(cuda_mgr_->allocateDeviceMem(buffer_size, device_id));
-  //   cuda_mgr_->copyHostToDevice(d_input_buffers[i],
-  //                               result_sets[i]->getStorage()->getUnderlyingBuffer(),
-  //                               buffer_size,
-  //                               device_id);
-  // }
-
-  // constexpr size_t num_kernel_params = 3;
-  // CHECK_EQ(getWrapperKernel()->arg_size(), num_kernel_params);
-
-  // // parameter 1: an array of device pointers
-  // std::vector h_input_buffer_dptrs;
-  // h_input_buffer_dptrs.reserve(num_buffers);
-  // std::transform(d_input_buffers.begin(),
-  //                d_input_buffers.end(),
-  //                std::back_inserter(h_input_buffer_dptrs),
-  //                [](int8_t* dptr) { return reinterpret_cast(dptr); });
-
-  // auto d_input_buffer_dptrs =
-  //     cuda_mgr_->allocateDeviceMem(num_buffers * sizeof(CUdeviceptr), device_id);
-  // cuda_mgr_->copyHostToDevice(d_input_buffer_dptrs,
-  //                             reinterpret_cast(h_input_buffer_dptrs.data()),
-  //                             num_buffers * sizeof(CUdeviceptr),
-  //                             device_id);
-
-  // // parameter 2: number of buffers
-  // auto d_num_buffers = cuda_mgr_->allocateDeviceMem(sizeof(int64_t), device_id);
-  // cuda_mgr_->copyHostToDevice(d_num_buffers,
-  //                             reinterpret_cast(&num_buffers),
-  //                             sizeof(int64_t),
-  //                             device_id);
-
-  // // parameter 3: device pointer to the output buffer
-  // auto d_result_buffer = cuda_mgr_->allocateDeviceMem(buffer_size, device_id);
-  // cuda_mgr_->copyHostToDevice(
-  //     d_result_buffer, gpu_result_storage->getUnderlyingBuffer(), buffer_size,
-  //     device_id);
-
-  // // collecting all kernel parameters:
-  // std::vector h_kernel_params{
-  //     reinterpret_cast(d_input_buffer_dptrs),
-  //     reinterpret_cast(d_num_buffers),
-  //     reinterpret_cast(d_result_buffer)};
-
-  // // casting each kernel parameter to be a void* device ptr itself:
-  // std::vector kernel_param_ptrs;
-  // kernel_param_ptrs.reserve(num_kernel_params);
-  // std::transform(h_kernel_params.begin(),
-  //                h_kernel_params.end(),
-  //                std::back_inserter(kernel_param_ptrs),
-  //                [](CUdeviceptr& param) { return &param; });
-
-  // // launching a kernel:
-  // auto cu_func = static_cast(gpu_context->kernel());
-  // // we launch as many threadblocks as there are input buffers:
-  // // in other words, each input buffer is handled by a single threadblock.
-
-  // // checkCudaErrors(cuLaunchKernel(cu_func,
-  // //                                num_buffers,
-  // //                                1,
-  // //                                1,
-  // //                                1024,
-  // //                                1,
-  // //                                1,
-  // //                                buffer_size,
-  // //                                0,
-  // //                                kernel_param_ptrs.data(),
-  // //                                nullptr));
-
-  // // transfer back the results:
-  // cuda_mgr_->copyDeviceToHost(
-  //     gpu_result_storage->getUnderlyingBuffer(), d_result_buffer, buffer_size,
-  //     device_id);
-
-  // // release the gpu memory used:
-  // for (auto& d_buffer : d_input_buffers) {
-  //   cuda_mgr_->freeDeviceMem(d_buffer);
-  // }
-  // cuda_mgr_->freeDeviceMem(d_input_buffer_dptrs);
-  // cuda_mgr_->freeDeviceMem(d_num_buffers);
-  // cuda_mgr_->freeDeviceMem(d_result_buffer);
+  std::unique_ptr<L0DeviceCompilationContext> gpu_context(compile_and_link_gpu_code(
+      module_str, module_, l0_mgr_, getWrapperKernel()->getName().str()));
+
+  const auto buffer_size = query_mem_desc_.getBufferSizeBytes(ExecutorDeviceType::GPU);
+  const size_t num_buffers = result_sets.size();
+  std::vector<int8_t*> d_input_buffers;
+  for (size_t i = 0; i < num_buffers; i++) {
+    d_input_buffers.push_back(l0_mgr_->allocateDeviceMem(buffer_size, device_id));
+    l0_mgr_->copyHostToDevice(d_input_buffers[i],
+                              result_sets[i]->getStorage()->getUnderlyingBuffer(),
+                              buffer_size,
+                              device_id);
+  }
+
+  constexpr size_t num_kernel_params = 3;
+  CHECK_EQ(getWrapperKernel()->arg_size(), num_kernel_params);
+
+  // parameter 1: an array of device pointers
+  typedef int8_t* L0deviceptr;
+  std::vector<L0deviceptr> h_input_buffer_dptrs;
+  h_input_buffer_dptrs.reserve(num_buffers);
+  std::transform(d_input_buffers.begin(),
+                 d_input_buffers.end(),
+                 std::back_inserter(h_input_buffer_dptrs),
+                 [](int8_t* dptr) { return reinterpret_cast<L0deviceptr>(dptr); });
+
+  auto d_input_buffer_dptrs =
+      l0_mgr_->allocateDeviceMem(num_buffers * sizeof(L0deviceptr), device_id);
+  l0_mgr_->copyHostToDevice(d_input_buffer_dptrs,
+                            reinterpret_cast<int8_t*>(h_input_buffer_dptrs.data()),
+                            num_buffers * sizeof(L0deviceptr),
+                            device_id);
+
+  // parameter 2: number of buffers
+  auto d_num_buffers = l0_mgr_->allocateDeviceMem(sizeof(int64_t), device_id);
+  l0_mgr_->copyHostToDevice(d_num_buffers,
+                            reinterpret_cast<const int8_t*>(&num_buffers),
+                            sizeof(int64_t),
+                            device_id);
+
+  // parameter 3: device pointer to the output buffer
+  auto d_result_buffer = l0_mgr_->allocateDeviceMem(buffer_size, device_id);
+  l0_mgr_->copyHostToDevice(
+      d_result_buffer, gpu_result_storage->getUnderlyingBuffer(), buffer_size, device_id);
+
+  // collecting all kernel parameters:
+  std::vector<L0deviceptr> h_kernel_params{
+      reinterpret_cast<L0deviceptr>(d_input_buffer_dptrs),
+      reinterpret_cast<L0deviceptr>(d_num_buffers),
+      reinterpret_cast<L0deviceptr>(d_result_buffer)};
+
+  // casting each kernel parameter to be a void* device ptr itself:
+  std::vector<void*> kernel_param_ptrs;
+  kernel_param_ptrs.reserve(num_kernel_params);
+  std::transform(h_kernel_params.begin(),
+                 h_kernel_params.end(),
+                 std::back_inserter(kernel_param_ptrs),
+                 [](L0deviceptr& param) { return &param; });
+
+  // launching a kernel:
+  typedef void* L0function;
+  auto l0_func = static_cast<L0function>(gpu_context->kernel());
+  // we launch as many threadblocks as there are input buffers:
+  // in other words, each input buffer is handled by a single threadblock.
+ + // std::unique_ptr gpu_context + // auto l0_ctx = dynamic_cast(gpu_context); + // l0::L0Kernel* kernel = l0_ctx->getNativeCode(device_id); + // l0::L0Device* device = l0_ctx->getDevice(device_id); + + auto kernel = gpu_context->kernel(); + auto device = gpu_context->device(); + + auto q = device->command_queue(); + auto q_list = device->create_command_list(); + // l0::GroupCount gc = {ko.gridDimX, ko.gridDimY, ko.gridDimZ}; + l0::GroupCount gc = {1, 1, 1024}; + // LOG(INFO) << "Launching L0 kernel with group size: {" << ko.gridDimX << "," + // << ko.gridDimY << "," << ko.gridDimZ << "}\n"; + // q_list->launch(kernel, kernel_param_ptrs.data(), gc); //<< here is the problem + + q_list->launch(*kernel, gc); + q_list->submit(*q.get()); + + // transfer back the results: + l0_mgr_->copyDeviceToHost( + gpu_result_storage->getUnderlyingBuffer(), d_result_buffer, buffer_size, device_id); + + // release the gpu memory used: + for (auto& d_buffer : d_input_buffers) { + l0_mgr_->freeDeviceMem(d_buffer); + } + l0_mgr_->freeDeviceMem(d_input_buffer_dptrs); + l0_mgr_->freeDeviceMem(d_num_buffers); + l0_mgr_->freeDeviceMem(d_result_buffer); } -// TEST(SingleColumn, VariableEntries_CountQuery_4B_Group) { -// for (auto num_entries : {1, 2, 3, 5, 13, 31, 63, 126, 241, 511, 1021}) { -// TestInputData input; -// input.setDeviceId(0) -// .setNumInputBuffers(4) -// .setTargetInfos(generate_custom_agg_target_infos( -// {4}, {hdk::ir::AggType::kCount}, {int32_type}, {int32_type})) -// .setAggWidth(4) -// .setMinEntry(0) -// .setMaxEntry(num_entries) -// .setStepSize(2) -// .setKeylessHash(true) -// .setTargetIndexForKey(0); -// perform_test_and_verify_results(input); -// } -// } +TEST(SingleColumn, VariableEntries_CountQuery_4B_Group) { + for (auto num_entries : {1, 2, 3, 5, 13, 31, 63, 126, 241, 511, 1021}) { + TestInputData input; + input.setDeviceId(0) + .setNumInputBuffers(4) + .setTargetInfos(generate_custom_agg_target_infos( + {4}, {hdk::ir::AggType::kCount}, {int32_type}, {int32_type})) + .setAggWidth(4) + .setMinEntry(0) + .setMaxEntry(num_entries) + .setStepSize(2) + .setKeylessHash(true) + .setTargetIndexForKey(0); + perform_test_and_verify_results(input); + } +} // TEST(SingleColumn, VariableEntries_CountQuery_8B_Group) { // for (auto num_entries : {1, 2, 3, 5, 13, 31, 63, 126, 241, 511, 1021}) { From e03cc6fd9ecaffbb6a0a78d0b905b51afe3f8591 Mon Sep 17 00:00:00 2001 From: Laurent Montigny Date: Mon, 17 Apr 2023 06:41:11 -0500 Subject: [PATCH 7/9] Extend CMakeList for IntelGPU SM test --- omniscidb/Tests/CMakeLists.txt | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/omniscidb/Tests/CMakeLists.txt b/omniscidb/Tests/CMakeLists.txt index 1f5094cb7f..40e6e21aa1 100644 --- a/omniscidb/Tests/CMakeLists.txt +++ b/omniscidb/Tests/CMakeLists.txt @@ -143,6 +143,10 @@ if(ENABLE_CUDA) target_link_libraries(GpuSharedMemoryTest gtest Logger QueryEngine) endif() +if(ENABLE_L0) + target_link_libraries(GpuSharedMemoryTestIntel gtest Logger QueryEngine) +endif() + set(TEST_ARGS "--gtest_output=xml:../") add_test(UtilTest UtilTest ${TEST_ARGS}) add_test(ArrowBasedExecuteTest ArrowBasedExecuteTest ${TEST_ARGS}) @@ -181,6 +185,7 @@ if(ENABLE_CUDA) add_test(GpuSharedMemoryTest GpuSharedMemoryTest ${TEST_ARGS}) endif() if(ENABLE_L0) + add_test(GpuSharedMemoryTestIntel GpuSharedMemoryTestIntel ${TEST_ARGS}) add_test(NAME PuntToCpu COMMAND ArrowBasedExecuteTest "--gtest_filter=Select.Punt*" ${TEST_ARGS}) set_tests_properties(PuntToCpu PROPERTIES LABELS "enabling") endif() @@ -236,6 +241,9 @@ 
if(ENABLE_CUDA) list(APPEND TEST_PROGRAMS GpuSharedMemoryTest) endif() +if(ENABLE_L0) + list(APPEND TEST_PROGRAMS GpuSharedMemoryTestIntel) +endif() #if(NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") # list(APPEND TEST_PROGRAMS UdfTest) #endif() @@ -293,6 +301,19 @@ if(ENABLE_L0) ) set_tests_properties(${ENABLING_TESTS} PROPERTIES LABELS "enabling") + # set(intel_gpu_runtime_function_sources l0_mapd_rt.cpp DateAdd.cpp DateTruncate.cpp) + + # set(intel_gpu_module_name RuntimeFunctionsL0.bc) + # set(intel_gpu_module_internal_suffix "L0_internal.bc") + # function(precompile_intel_gpu_module SOURCE_FILE) + # set(module_compiler_flags -Xclang -ffake-address-space-map -DL0_RUNTIME_ENABLED) + # precompile_llvm_module(${SOURCE_FILE} ${intel_gpu_module_internal_suffix} ${module_compiler_flags}) + # endfunction() + # set(precompile_intel_gpu_module_cmd "precompile_intel_gpu_module") + + # precompile_modules("intel_gpu_precompiled_module_list" ${intel_gpu_module_internal_suffix} ${precompile_intel_gpu_module_cmd} "${intel_gpu_runtime_function_sources}") + # link_runtime_module(${intel_gpu_module_name} "${intel_gpu_precompiled_module_list}") + add_custom_target(enabling_tests COMMAND mkdir -p tmp COMMAND touch tmp/DictPayload From b0e4b3ebe6813138bbc5a74f74410dd602a88a41 Mon Sep 17 00:00:00 2001 From: Laurent Montigny Date: Mon, 17 Apr 2023 08:24:29 -0500 Subject: [PATCH 8/9] Update GpuSMUtils to enable L0 --- omniscidb/QueryEngine/GpuSharedMemoryUtils.cpp | 10 ++++++++++ omniscidb/Tests/CMakeLists.txt | 12 ------------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/omniscidb/QueryEngine/GpuSharedMemoryUtils.cpp b/omniscidb/QueryEngine/GpuSharedMemoryUtils.cpp index a1b023c72a..99a1845ddd 100644 --- a/omniscidb/QueryEngine/GpuSharedMemoryUtils.cpp +++ b/omniscidb/QueryEngine/GpuSharedMemoryUtils.cpp @@ -163,6 +163,16 @@ void GpuSharedMemCodeBuilder::codegenReduction(const CompilationOptions& co) { "f32:32:32-f64:64:64-v16:16:16-" "v32:32:32-v64:64:64-v128:128:128-n16:32:64"); reduction_code.module->setTargetTriple("nvptx64-nvidia-cuda"); + +#ifdef HAVE_L0 + reduction_code.module->setDataLayout( + "e-p:64:64:64-i1:8:8-i8:8:8-" + "i16:16:16-i32:32:32-i64:64:64-" + "f32:32:32-f64:64:64-v16:16:16-" + "v32:32:32-v64:64:64-v128:128:128-n16:32:64"); + reduction_code.module->setTargetTriple("spir64-unknown-unknown"); +#endif + llvm::Linker linker(*module_); std::unique_ptr owner(reduction_code.module); bool link_error = linker.linkInModule(std::move(owner)); diff --git a/omniscidb/Tests/CMakeLists.txt b/omniscidb/Tests/CMakeLists.txt index 40e6e21aa1..5bf96f7677 100644 --- a/omniscidb/Tests/CMakeLists.txt +++ b/omniscidb/Tests/CMakeLists.txt @@ -301,18 +301,6 @@ if(ENABLE_L0) ) set_tests_properties(${ENABLING_TESTS} PROPERTIES LABELS "enabling") - # set(intel_gpu_runtime_function_sources l0_mapd_rt.cpp DateAdd.cpp DateTruncate.cpp) - - # set(intel_gpu_module_name RuntimeFunctionsL0.bc) - # set(intel_gpu_module_internal_suffix "L0_internal.bc") - # function(precompile_intel_gpu_module SOURCE_FILE) - # set(module_compiler_flags -Xclang -ffake-address-space-map -DL0_RUNTIME_ENABLED) - # precompile_llvm_module(${SOURCE_FILE} ${intel_gpu_module_internal_suffix} ${module_compiler_flags}) - # endfunction() - # set(precompile_intel_gpu_module_cmd "precompile_intel_gpu_module") - - # precompile_modules("intel_gpu_precompiled_module_list" ${intel_gpu_module_internal_suffix} ${precompile_intel_gpu_module_cmd} "${intel_gpu_runtime_function_sources}") - # link_runtime_module(${intel_gpu_module_name} 
"${intel_gpu_precompiled_module_list}") add_custom_target(enabling_tests COMMAND mkdir -p tmp From cad4e99d341f66dd0386fb35b763f78d1a5904eb Mon Sep 17 00:00:00 2001 From: Laurent Montigny Date: Mon, 17 Apr 2023 08:25:00 -0500 Subject: [PATCH 9/9] Update triple for spir64 --- omniscidb/Tests/GpuSharedMemoryTestIntel.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp index 1ef81efd57..f726881afd 100644 --- a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp +++ b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp @@ -160,7 +160,7 @@ void prepare_generated_gpu_kernel(llvm::Module* module, "i16:16:16-i32:32:32-i64:64:64-" "f32:32:32-f64:64:64-v16:16:16-" "v32:32:32-v64:64:64-v128:128:128-n16:32:64"); - module->setTargetTriple("nvptx64-nvidia-cuda"); + module->setTargetTriple("spir64-unknown-unknown"); llvm::NamedMDNode* md = module->getOrInsertNamedMetadata("nvvm.annotations"); @@ -349,7 +349,7 @@ void perform_test_and_verify_results(TestInputData input) { "i16:16:16-i32:32:32-i64:64:64-" "f32:32:32-f64:64:64-v16:16:16-" "v32:32:32-v64:64:64-v128:128:128-n16:32:64"); - module->setTargetTriple("spir-unknown-unknown"); + module->setTargetTriple("spir64-unknown-unknown"); auto l0_mgr = std::make_unique(); const auto row_set_mem_owner = std::make_shared(nullptr, Executor::getArenaBlockSize());