Keep fragment ids in buffer insertion order in fetchChunks; drop thread_idx from ColumnFetcher

akroviakov · akroviakov · commit c4af116cacf5 · 2023-11-01T08:09:17.000-07:00
diff --git a/omniscidb/QueryEngine/ColumnFetcher.cpp b/omniscidb/QueryEngine/ColumnFetcher.cpp
@@ -59,7 +59,6 @@ std::pair<const int8_t*, size_t> ColumnFetcher::getOneColumnFragment(
     const Data_Namespace::MemoryLevel effective_mem_lvl,
     const int device_id,
     DeviceAllocator* device_allocator,
-    const size_t thread_idx,
     std::vector<std::shared_ptr<Chunk_NS::Chunk>>& chunks_owner,
     DataProvider* data_provider,
     ColumnCacheMap& column_cache) {
@@ -115,7 +114,6 @@ JoinColumn ColumnFetcher::makeJoinColumn(
     const Data_Namespace::MemoryLevel effective_mem_lvl,
     const int device_id,
     DeviceAllocator* device_allocator,
-    const size_t thread_idx,
     std::vector<std::shared_ptr<Chunk_NS::Chunk>>& chunks_owner,
     std::vector<std::shared_ptr<void>>& malloc_owner,
     DataProvider* data_provider,
@@ -142,7 +140,6 @@ JoinColumn ColumnFetcher::makeJoinColumn(
         effective_mem_lvl,
         effective_mem_lvl == Data_Namespace::CPU_LEVEL ? 0 : device_id,
         device_allocator,
-        thread_idx,
         chunks_owner,
         data_provider,
         column_cache);
@@ -364,8 +361,7 @@ const int8_t* ColumnFetcher::linearizeColumnFragments(
     std::list<ChunkIter>& chunk_iter_holder,
     const Data_Namespace::MemoryLevel memory_level,
     const int device_id,
-    DeviceAllocator* device_allocator,
-    const size_t thread_idx) const {
+    DeviceAllocator* device_allocator) const {
   auto timer = DEBUG_TIMER(__func__);
   int db_id = col_info->db_id;
   int table_id = col_info->table_id;
@@ -477,8 +473,7 @@ const int8_t* ColumnFetcher::linearizeColumnFragments(
                                              total_data_buf_size,
                                              total_idx_buf_size,
                                              total_num_tuples,
-                                             device_allocator,
-                                             thread_idx);
+                                             device_allocator);
       } else {
         CHECK(type->isVarLenArray());
         VLOG(2) << "Linearize variable-length multi-frag array column (col_id: " << col_id
@@ -496,8 +491,7 @@ const int8_t* ColumnFetcher::linearizeColumnFragments(
                                            total_data_buf_size,
                                            total_idx_buf_size,
                                            total_num_tuples,
-                                           device_allocator,
-                                           thread_idx);
+                                           device_allocator);
       }
     }
     if (type->isString()) {
@@ -516,8 +510,7 @@ const int8_t* ColumnFetcher::linearizeColumnFragments(
                                          total_data_buf_size,
                                          total_idx_buf_size,
                                          total_num_tuples,
-                                         device_allocator,
-                                         thread_idx);
+                                         device_allocator);
     }
   }
   CHECK(res.first);  // check merged data buffer
@@ -573,8 +566,7 @@ MergedChunk ColumnFetcher::linearizeVarLenArrayColFrags(
     const size_t total_data_buf_size,
     const size_t total_idx_buf_size,
     const size_t total_num_tuples,
-    DeviceAllocator* device_allocator,
-    const size_t thread_idx) const {
+    DeviceAllocator* device_allocator) const {
   // for linearization of varlen col we have to deal with not only data buffer
   // but also its underlying index buffer which is responsible for offset of varlen value
   // basically we maintain per-device linearized (data/index) buffer
@@ -902,8 +894,7 @@ MergedChunk ColumnFetcher::linearizeFixedLenArrayColFrags(
     const size_t total_data_buf_size,
     const size_t total_idx_buf_size,
     const size_t total_num_tuples,
-    DeviceAllocator* device_allocator,
-    const size_t thread_idx) const {
+    DeviceAllocator* device_allocator) const {
   int64_t linearization_time_ms = 0;
   auto clock_begin = timer_start();
   // linearize collected fragments
diff --git a/omniscidb/QueryEngine/ColumnFetcher.h b/omniscidb/QueryEngine/ColumnFetcher.h
@@ -46,7 +46,6 @@ class ColumnFetcher {
       const Data_Namespace::MemoryLevel effective_mem_lvl,
       const int device_id,
       DeviceAllocator* device_allocator,
-      const size_t thread_idx,
       std::vector<std::shared_ptr<Chunk_NS::Chunk>>& chunks_owner,
       DataProvider* data_provider,
       ColumnCacheMap& column_cache);
@@ -59,7 +58,6 @@ class ColumnFetcher {
       const Data_Namespace::MemoryLevel effective_mem_lvl,
       const int device_id,
       DeviceAllocator* device_allocator,
-      const size_t thread_idx,
       std::vector<std::shared_ptr<Chunk_NS::Chunk>>& chunks_owner,
       std::vector<std::shared_ptr<void>>& malloc_owner,
       DataProvider* data_provider,
@@ -90,8 +88,7 @@ class ColumnFetcher {
       std::list<ChunkIter>& chunk_iter_holder,
       const Data_Namespace::MemoryLevel memory_level,
       const int device_id,
-      DeviceAllocator* device_allocator,
-      const size_t thread_idx) const;
+      DeviceAllocator* device_allocator) const;
 
   void freeTemporaryCpuLinearizedIdxBuf();
   void freeLinearizedBuf();
@@ -118,8 +115,7 @@ class ColumnFetcher {
       const size_t total_data_buf_size,
       const size_t total_idx_buf_size,
       const size_t total_num_tuples,
-      DeviceAllocator* device_allocator,
-      const size_t thread_idx) const;
+      DeviceAllocator* device_allocator) const;
 
   MergedChunk linearizeFixedLenArrayColFrags(
       std::list<std::shared_ptr<Chunk_NS::Chunk>>& chunk_holder,
@@ -133,8 +129,7 @@ class ColumnFetcher {
       const size_t total_data_buf_size,
       const size_t total_idx_buf_size,
       const size_t total_num_tuples,
-      DeviceAllocator* device_allocator,
-      const size_t thread_idx) const;
+      DeviceAllocator* device_allocator) const;
 
   void addMergedChunkIter(const int table_id,
                           const int col_id,
diff --git a/omniscidb/QueryEngine/Execute.cpp b/omniscidb/QueryEngine/Execute.cpp
@@ -2871,7 +2871,7 @@ std::map<size_t, std::vector<uint64_t>> get_table_id_to_frag_offsets(
 std::pair<std::vector<std::vector<int64_t>>, std::vector<std::vector<uint64_t>>>
 Executor::getRowCountAndOffsetForAllFrags(
     const RelAlgExecutionUnit& ra_exe_unit,
-    const CartesianProduct<std::vector<std::vector<size_t>>>& frag_ids_crossjoin,
+    const std::vector<std::vector<size_t>>& frag_ids_crossjoin,
     const std::vector<InputDescriptor>& input_descs,
     const std::map<TableRef, const TableFragments*>& all_tables_fragments) {
   std::vector<std::vector<int64_t>> all_num_rows;
@@ -2947,6 +2947,8 @@ bool Executor::needLinearizeAllFragments(
   const auto& fragments = selected_fragments[nest_level].fragment_ids;
   auto need_linearize =
       inner_col_desc.type()->isArray() || inner_col_desc.type()->isString();
+  LOG(INFO) << inner_col_desc.type()->isArray() << " || "
+            << inner_col_desc.type()->isString() << ") && " << fragments.size() << " > 1";
   return need_linearize && fragments.size() > 1;
 }
 
@@ -2984,6 +2986,9 @@ FetchResult Executor::fetchChunks(
   std::vector<std::vector<const int8_t*>> all_frag_col_buffers;
   std::vector<std::vector<int64_t>> all_num_rows;
   std::vector<std::vector<uint64_t>> all_frag_offsets;
+
+  // MT case: record frag ids in the order their buffers enter all_frag_col_buffers
+  std::vector<std::vector<size_t>> selected_frag_ids_vec;
   if(memory_level == Data_Namespace::MemoryLevel::GPU_LEVEL){
     std::mutex all_frag;
     std::atomic<bool> empty_frags{false};
@@ -2993,7 +2998,6 @@ FetchResult Executor::fetchChunks(
           frag_ids_crossjoin.begin(),
           frag_ids_crossjoin.end(),
           [&](const std::vector<size_t>& selected_frag_ids) {
-          // for (const auto& selected_frag_ids : frag_ids_crossjoin) {
             std::vector<const int8_t*> frag_col_buffers(
                 plan_state_->global_to_local_col_ids_.size());
             for (const auto& col_id : col_global_ids) {
@@ -3041,16 +3045,15 @@ FetchResult Executor::fetchChunks(
                       chunk_iterators,
                       for_lazy_fetch ? Data_Namespace::CPU_LEVEL : memory_level,
                       for_lazy_fetch ? 0 : device_id,
-                      device_allocator,
-                      thread_idx);
+                      device_allocator);
                 } else {
                   frag_col_buffers[it->second] =
                       column_fetcher.getAllTableColumnFragments(col_id->getColInfo(),
                                                                 all_tables_fragments,
                                                                 memory_level_for_column,
                                                                 device_id,
                                                                 device_allocator,
-                                                                thread_idx);
+                                                                /*thread_idx=*/0);
                 }
               } else {
                 frag_col_buffers[it->second] =
@@ -3065,10 +3068,11 @@ FetchResult Executor::fetchChunks(
               }
             }
             all_frag.lock();
+            selected_frag_ids_vec.push_back(selected_frag_ids);
             all_frag_col_buffers.push_back(frag_col_buffers);
             all_frag.unlock();
-        });
-      });
+          });
+    });
     if (empty_frags) {
       return {};
     }
@@ -3120,8 +3124,7 @@ FetchResult Executor::fetchChunks(
                       chunk_iterators,
                       for_lazy_fetch ? Data_Namespace::CPU_LEVEL : memory_level,
                       for_lazy_fetch ? 0 : device_id,
-                      device_allocator,
-                      thread_idx);
+                      device_allocator);
                 } else {
                   frag_col_buffers[it->second] =
                       column_fetcher.getAllTableColumnFragments(col_id->getColInfo(),
@@ -3143,11 +3146,12 @@ FetchResult Executor::fetchChunks(
                                                              device_allocator);
               }
             }
+            selected_frag_ids_vec.push_back(selected_frag_ids);
             all_frag_col_buffers.push_back(frag_col_buffers);
           }
   }
   std::tie(all_num_rows, all_frag_offsets) = getRowCountAndOffsetForAllFrags(
-      ra_exe_unit, frag_ids_crossjoin, ra_exe_unit.input_descs, all_tables_fragments);
+      ra_exe_unit, selected_frag_ids_vec, ra_exe_unit.input_descs, all_tables_fragments);
   return {all_frag_col_buffers, all_num_rows, all_frag_offsets};
 }
 
@@ -3171,6 +3175,7 @@ FetchResult Executor::fetchUnionChunks(
   std::vector<std::vector<const int8_t*>> all_frag_col_buffers;
   std::vector<std::vector<int64_t>> all_num_rows;
   std::vector<std::vector<uint64_t>> all_frag_offsets;
+  std::vector<std::vector<size_t>> selected_frag_ids_vec;
 
   CHECK(!selected_fragments.empty());
   CHECK_LE(2u, ra_exe_unit.input_descs.size());
@@ -3269,12 +3274,16 @@ FetchResult Executor::fetchUnionChunks(
                                                        device_allocator);
         }
       }
+      selected_frag_ids_vec.push_back(selected_frag_ids);
       all_frag_col_buffers.push_back(frag_col_buffers);
     }
     std::vector<std::vector<int64_t>> num_rows;
     std::vector<std::vector<uint64_t>> frag_offsets;
-    std::tie(num_rows, frag_offsets) = getRowCountAndOffsetForAllFrags(
-        ra_exe_unit, frag_ids_crossjoin, ra_exe_unit.input_descs, all_tables_fragments);
+    std::tie(num_rows, frag_offsets) =
+        getRowCountAndOffsetForAllFrags(ra_exe_unit,
+                                        selected_frag_ids_vec,
+                                        ra_exe_unit.input_descs,
+                                        all_tables_fragments);
     all_num_rows.insert(all_num_rows.end(), num_rows.begin(), num_rows.end());
     all_frag_offsets.insert(
         all_frag_offsets.end(), frag_offsets.begin(), frag_offsets.end());
diff --git a/omniscidb/QueryEngine/Execute.h b/omniscidb/QueryEngine/Execute.h
@@ -571,7 +571,7 @@ class Executor : public StringDictionaryProvider {
   std::pair<std::vector<std::vector<int64_t>>, std::vector<std::vector<uint64_t>>>
   getRowCountAndOffsetForAllFrags(
       const RelAlgExecutionUnit& ra_exe_unit,
-      const CartesianProduct<std::vector<std::vector<size_t>>>& frag_ids_crossjoin,
+      const std::vector<std::vector<size_t>>& frag_ids_crossjoin,
       const std::vector<InputDescriptor>& input_descs,
       const std::map<TableRef, const TableFragments*>& all_tables_fragments);
 
diff --git a/omniscidb/QueryEngine/JoinHashTable/HashJoin.cpp b/omniscidb/QueryEngine/JoinHashTable/HashJoin.cpp
@@ -52,7 +52,6 @@ JoinColumn HashJoin::fetchJoinColumn(
                                                            effective_memory_level,
                                                            device_id,
                                                            dev_buff_owner,
-                                                           /*thread_idx=*/0,
                                                            chunks_owner,
                                                            malloc_owner,
                                                            data_provider_,
diff --git a/omniscidb/QueryEngine/RelAlgExecutor.cpp b/omniscidb/QueryEngine/RelAlgExecutor.cpp
@@ -1154,7 +1154,6 @@ std::unique_ptr<WindowFunctionContext> RelAlgExecutor::createWindowFunctionConte
                                             memory_level,
                                             0,
                                             nullptr,
-                                            /*thread_idx=*/0,
                                             chunks_owner,
                                             data_provider_,
                                             column_cache_map);