Skip to content

Commit 3db4003

Browse files
Merge branch 'main' into sre_tidy_changes
2 parents 3bbb00d + 5917a0c commit 3db4003

File tree

7 files changed

+352
-109
lines changed

7 files changed

+352
-109
lines changed

cpp/deeplake_pg/duckdb_deeplake_scan.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -846,8 +846,10 @@ class deeplake_scan_function_helper
846846
for (duckdb::idx_t row_in_batch = 0; row_in_batch < batch_size; ++row_in_batch) {
847847
const int64_t row_idx = current_row + row_in_batch;
848848
auto value = td.get_streamers().value<std::string_view>(col_idx, row_idx);
849+
// workaround. value is not always remain valid. Trying to make a copy as soon as possible.
850+
// Most likely due to nd::array temporary object destruction.
851+
std::string str_value(value);
849852
if (is_uuid) {
850-
std::string str_value(value);
851853
// Treat empty string as NULL for UUID columns
852854
if (str_value.empty()) {
853855
duckdb::FlatVector::SetNull(output_vector, row_in_batch, true);
@@ -862,7 +864,7 @@ class deeplake_scan_function_helper
862864
} else {
863865
auto* duckdb_data = duckdb::FlatVector::GetData<duckdb::string_t>(output_vector);
864866
duckdb_data[row_in_batch] =
865-
add_string(output_vector, value.data(), value.size());
867+
add_string(output_vector, str_value.data(), str_value.size());
866868
}
867869
}
868870
}

cpp/deeplake_pg/extension_init.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ bool print_runtime_stats = false;
5151
bool support_json_index = false;
5252
bool is_filter_pushdown_enabled = true;
5353
int32_t max_streamable_column_width = 128;
54-
int32_t max_num_threads_for_global_state = 4;
54+
int32_t max_num_threads_for_global_state = std::thread::hardware_concurrency();
5555
bool treat_numeric_as_double = true; // Treat numeric types as double by default
5656
bool print_progress_during_seq_scan = false;
5757
bool use_shared_mem_for_refresh = false;
@@ -199,7 +199,7 @@ void initialize_guc_parameters()
199199
"Maximum number of threads for global state operations.",
200200
nullptr, // optional long description
201201
&pg::max_num_threads_for_global_state, // linked C variable
202-
6, // default value
202+
base::system_report::cpu_cores(), // default value
203203
1, // min value
204204
base::system_report::cpu_cores(), // max value
205205
PGC_USERSET, // context (USERSET, SUSET, etc.)
@@ -216,10 +216,10 @@ void initialize_guc_parameters()
216216
"for detecting dataset refreshes, which can improve performance but may "
217217
"have implications on concurrency. "
218218
"It make sense to disable this for OLTP workloads.",
219-
&pg::use_shared_mem_for_refresh, // linked C variable
220-
true, // default value
221-
PGC_USERSET, // context (USERSET, SUSET, etc.)
222-
0, // flags
219+
&pg::use_shared_mem_for_refresh, // linked C variable
220+
true, // default value
221+
PGC_USERSET, // context (USERSET, SUSET, etc.)
222+
0, // flags
223223
nullptr,
224224
nullptr,
225225
nullptr // check_hook, assign_hook, show_hook

cpp/deeplake_pg/pg_deeplake.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,7 @@ void load_index_metadata()
302302
const uint32_t oid = is_null ? 0 : DatumGetUInt32(datum_order_type);
303303

304304
if (!pg::pg_index::has_index_info(oid)) {
305-
pg::pg_index::create_index_info(oid);
305+
pg::pg_index::load_index_info(oid);
306306
} else {
307307
pg::index_info::current().reset();
308308
}

cpp/deeplake_pg/pg_deeplake.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -455,6 +455,14 @@ class pg_index
455455
index_info::current().reset();
456456
}
457457

458+
static void load_index_info(Oid oid)
459+
{
460+
// Load index metadata into memory cache without creating DeepLake indexes
461+
// (indexes already exist in the dataset, we're just restoring the in-memory state)
462+
instance().indexes_.emplace(oid, index_info::current());
463+
index_info::current().reset();
464+
}
465+
458466
static bool has_index_info(Oid oid)
459467
{
460468
return instance().indexes_.find(oid) != instance().indexes_.end();

cpp/deeplake_pg/table_data.hpp

Lines changed: 11 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -92,48 +92,29 @@ struct table_data
9292
{
9393
struct batch_data
9494
{
95+
std::mutex mutex_;
96+
async::promise<nd::array> promise_;
9597
nd::array owner_;
9698
const uint8_t* data_ = nullptr;
97-
impl::string_stream_array_holder holder_ = impl::string_stream_array_holder();
98-
std::atomic<bool> initialized_{false};
99+
impl::string_stream_array_holder holder_;
99100

100101
batch_data() = default;
101102
batch_data(const batch_data& other) = delete;
102-
batch_data(batch_data&& other) noexcept
103-
: owner_(std::move(other.owner_))
104-
, data_(other.data_)
105-
, holder_(std::move(other.holder_))
106-
, initialized_(other.initialized_.load(std::memory_order_relaxed))
107-
{
108-
other.data_ = nullptr;
109-
}
110-
103+
batch_data(batch_data&& other) noexcept = delete;
111104
batch_data& operator=(const batch_data& other) = delete;
112-
batch_data& operator=(batch_data&& other) noexcept
113-
{
114-
if (this != &other) {
115-
owner_ = std::move(other.owner_);
116-
data_ = other.data_;
117-
holder_ = std::move(other.holder_);
118-
initialized_.store(other.initialized_.load(std::memory_order_relaxed), std::memory_order_relaxed);
119-
other.data_ = nullptr;
120-
}
121-
return *this;
122-
}
105+
batch_data& operator=(batch_data&& other) noexcept = delete;
123106
};
124107

125-
struct column_data
126-
{
127-
std::vector<batch_data> batches;
128-
base::spin_lock mutex;
129-
};
130-
131-
std::vector<std::unique_ptr<bifrost::column_streamer>> streamers;
108+
using column_data = std::vector<batch_data>;
132109
std::vector<column_data> column_to_batches;
133110

134111
inline void reset() noexcept
135112
{
136-
streamers.clear();
113+
for (auto& batches : column_to_batches) {
114+
for (auto& batch : batches) {
115+
batch.promise_.cancel();
116+
}
117+
}
137118
column_to_batches.clear();
138119
}
139120

cpp/deeplake_pg/table_data_impl.hpp

Lines changed: 52 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -340,7 +340,7 @@ inline table_data::streamer_info& table_data::get_streamers() noexcept
340340

341341
inline bool table_data::column_has_streamer(uint32_t idx) const noexcept
342342
{
343-
return streamers_.streamers.size() > idx && streamers_.streamers[idx] != nullptr;
343+
return streamers_.column_to_batches.size() > idx && !streamers_.column_to_batches[idx].empty();
344344
}
345345

346346
inline void table_data::reset_streamers() noexcept
@@ -529,45 +529,49 @@ inline std::pair<int64_t, int64_t> table_data::get_row_range(int32_t worker_id)
529529

530530
inline void table_data::create_streamer(int32_t idx, int32_t worker_id)
531531
{
532-
if (streamers_.streamers.empty()) {
533-
const auto s = num_columns();
534-
streamers_.streamers.resize(s);
535-
std::vector<streamer_info::column_data> temp_data(s);
536-
streamers_.column_to_batches.swap(temp_data);
537-
}
538-
if (!streamers_.streamers[idx]) {
539-
if (pg::memory_tracker::has_memory_limit()) {
540-
const auto column_size =
541-
pg::utils::get_column_width(get_base_atttypid(idx), get_atttypmod(idx)) * num_rows();
542-
pg::memory_tracker::ensure_memory_available(column_size);
543-
}
544-
if (worker_id != -1) {
545-
auto [start_row, end_row] = get_row_range(worker_id);
546-
auto new_column = heimdall_common::create_filtered_column(
547-
*(get_column_view(idx)), icm::index_mapping_t<int64_t>::slice({start_row, end_row, 1}));
548-
streamers_.streamers[idx] = std::make_unique<bifrost::column_streamer>(new_column, batch_size_);
549-
} else {
550-
streamers_.streamers[idx] = std::make_unique<bifrost::column_streamer>(get_column_view(idx), batch_size_);
551-
}
552-
const int64_t batch_index = (num_rows() - 1) / batch_size_;
553-
streamers_.column_to_batches[idx].batches.resize(batch_index + 1);
532+
const auto col_count = num_columns();
533+
if (streamers_.column_to_batches.empty()) {
534+
streamers_.column_to_batches.resize(col_count);
535+
}
536+
ASSERT(idx >= 0 && idx < col_count);
537+
auto& column_batches = streamers_.column_to_batches[idx];
538+
if (!column_batches.empty()) {
539+
return;
540+
}
541+
if (pg::memory_tracker::has_memory_limit()) {
542+
const auto column_size = pg::utils::get_column_width(get_base_atttypid(idx), get_atttypmod(idx)) * num_rows();
543+
pg::memory_tracker::ensure_memory_available(column_size);
544+
}
545+
heimdall::column_view_ptr cv = get_column_view(idx);
546+
if (worker_id != -1) {
547+
auto [start_row, end_row] = get_row_range(worker_id);
548+
cv = heimdall_common::create_filtered_column(*(cv),
549+
icm::index_mapping_t<int64_t>::slice({start_row, end_row, 1}));
550+
}
551+
const int64_t row_count = num_rows();
552+
const int64_t batch_count = (row_count + batch_size_ - 1) / batch_size_;
553+
column_batches = std::vector<streamer_info::batch_data>(batch_count);
554+
for (int64_t i = 0; i < batch_count; ++i) {
555+
const auto range_start = i * batch_size_;
556+
const auto range_end = std::min<int64_t>(range_start + batch_size_, row_count);
557+
auto p = async::run_on_main([cv, range_start, range_end, row_count]() {
558+
return cv->request_range(
559+
range_start, range_end, storage::fetch_options(static_cast<int>(row_count - range_start)));
560+
});
561+
column_batches[i].promise_ = std::move(p);
554562
}
555563
}
556564

557565
inline nd::array table_data::streamer_info::get_sample(int32_t column_number, int64_t row_number)
558566
{
559567
const int64_t batch_index = row_number >> batch_size_log2_;
560568
const int64_t row_in_batch = row_number & batch_mask_;
561-
562-
auto& batches = column_to_batches[column_number].batches;
563-
auto& batch = batches[batch_index];
564-
if (!batch.initialized_.load(std::memory_order_acquire)) [[unlikely]] {
565-
std::scoped_lock lock(column_to_batches[column_number].mutex);
566-
for (int64_t i = 0; i <= batch_index; ++i) {
567-
if (!batches[static_cast<size_t>(i)].initialized_.load(std::memory_order_relaxed)) {
568-
batches[static_cast<size_t>(i)].owner_ = streamers[static_cast<size_t>(column_number)]->next_batch();
569-
batches[static_cast<size_t>(i)].initialized_.store(true, std::memory_order_release);
570-
}
569+
auto& batch = column_to_batches[column_number][batch_index];
570+
if (static_cast<bool>(batch.promise_)) [[unlikely]] {
571+
std::lock_guard lock(batch.mutex_);
572+
if (static_cast<bool>(batch.promise_)) {
573+
batch.owner_ = batch.promise_.get_future().get();
574+
batch.promise_ = async::promise<nd::array>();
571575
}
572576
}
573577
return batch.owner_[static_cast<size_t>(row_in_batch)];
@@ -576,23 +580,7 @@ inline nd::array table_data::streamer_info::get_sample(int32_t column_number, in
576580
template <typename T>
577581
inline T table_data::streamer_info::value(int32_t column_number, int64_t row_number)
578582
{
579-
const int64_t batch_index = row_number >> batch_size_log2_;
580-
const int64_t row_in_batch = row_number & batch_mask_;
581-
582-
auto& batches = column_to_batches[column_number].batches;
583-
auto& batch = batches[batch_index];
584-
if (!batch.initialized_.load(std::memory_order_acquire)) [[unlikely]] {
585-
std::scoped_lock lock(column_to_batches[column_number].mutex);
586-
for (int64_t i = 0; i <= batch_index; ++i) {
587-
if (!batches[static_cast<size_t>(i)].initialized_.load(std::memory_order_relaxed)) {
588-
batches[static_cast<size_t>(i)].owner_ = utils::eval_with_nones<T>(streamers[static_cast<size_t>(column_number)]->next_batch());
589-
batches[static_cast<size_t>(i)].data_ = batches[static_cast<size_t>(i)].owner_.data().data();
590-
batches[static_cast<size_t>(i)].initialized_.store(true, std::memory_order_release);
591-
}
592-
}
593-
}
594-
595-
return reinterpret_cast<const T*>(batch.data_)[static_cast<size_t>(row_in_batch)];
583+
return *(value_ptr<T>(column_number, row_number));
596584
}
597585

598586
template <typename T>
@@ -601,16 +589,13 @@ inline const T* table_data::streamer_info::value_ptr(int32_t column_number, int6
601589
const int64_t batch_index = row_number >> batch_size_log2_;
602590
const int64_t row_in_batch = row_number & batch_mask_;
603591

604-
auto& batches = column_to_batches[column_number].batches;
605-
auto& batch = batches[batch_index];
606-
if (!batch.initialized_.load(std::memory_order_acquire)) [[unlikely]] {
607-
std::scoped_lock lock(column_to_batches[column_number].mutex);
608-
for (int64_t i = 0; i <= batch_index; ++i) {
609-
if (!batches[static_cast<size_t>(i)].initialized_.load(std::memory_order_relaxed)) {
610-
batches[static_cast<size_t>(i)].owner_ = utils::eval_with_nones<T>(streamers[static_cast<size_t>(column_number)]->next_batch());
611-
batches[static_cast<size_t>(i)].data_ = batches[static_cast<size_t>(i)].owner_.data().data();
612-
batches[static_cast<size_t>(i)].initialized_.store(true, std::memory_order_release);
613-
}
592+
auto& batch = column_to_batches[column_number][batch_index];
593+
if (static_cast<bool>(batch.promise_)) [[unlikely]] {
594+
std::lock_guard lock(batch.mutex_);
595+
if (static_cast<bool>(batch.promise_)) {
596+
batch.owner_ = utils::eval_with_nones<T>(batch.promise_.get_future().get());
597+
batch.data_ = batch.owner_.data().data();
598+
batch.promise_ = async::promise<nd::array>();
614599
}
615600
}
616601

@@ -623,16 +608,13 @@ inline std::string_view table_data::streamer_info::value(int32_t column_number,
623608
const int64_t batch_index = row_number >> batch_size_log2_;
624609
const int64_t row_in_batch = row_number & batch_mask_;
625610

626-
auto& batches = column_to_batches[column_number].batches;
627-
auto& batch = batches[batch_index];
628-
if (!batch.initialized_.load(std::memory_order_acquire)) [[unlikely]] {
629-
std::scoped_lock lock(column_to_batches[column_number].mutex);
630-
for (int64_t i = 0; i <= batch_index; ++i) {
631-
if (!batches[static_cast<size_t>(i)].initialized_.load(std::memory_order_relaxed)) {
632-
batches[static_cast<size_t>(i)].owner_ = streamers[static_cast<size_t>(column_number)]->next_batch();
633-
batches[static_cast<size_t>(i)].holder_ = impl::string_stream_array_holder(batches[static_cast<size_t>(i)].owner_);
634-
batches[static_cast<size_t>(i)].initialized_.store(true, std::memory_order_release);
635-
}
611+
auto& batch = column_to_batches[column_number][batch_index];
612+
if (static_cast<bool>(batch.promise_)) [[unlikely]] {
613+
std::lock_guard lock(batch.mutex_);
614+
if (static_cast<bool>(batch.promise_)) {
615+
batch.owner_ = batch.promise_.get_future().get();
616+
batch.holder_ = impl::string_stream_array_holder(batch.owner_);
617+
batch.promise_ = async::promise<nd::array>();
636618
}
637619
}
638620

0 commit comments

Comments
 (0)