Skip to content
This repository was archived by the owner on May 9, 2024. It is now read-only.

Commit 5ac5871

Browse files
committed
Stop using negative string ids for transient strings.
Signed-off-by: ienkovich <[email protected]>
1 parent 4a48597 commit 5ac5871

10 files changed

+108
-103
lines changed

omniscidb/ResultSet/ResultSet.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,9 @@ std::string ResultSet::getStrScalarVal(const ScalarTargetValue& current_scalar,
236236
} else {
237237
if (col_type->isExtDictionary()) {
238238
const int32_t dict_id = col_type->as<hdk::ir::ExtDictionaryType>()->dictId();
239-
const auto sdp = getStringDictionaryProxy(dict_id);
239+
const auto sdp = data_mgr_ ? row_set_mem_owner_->getOrAddStringDictProxy(dict_id)
240+
: row_set_mem_owner_->getStringDictProxy(
241+
dict_id); // unit tests bypass the DataMgr
240242
const auto string_id = boost::get<int64_t>(current_scalar);
241243
oss << "idx:"
242244
<< ((string_id == inline_int_null_value<int32_t>()) ? "null"

omniscidb/StringDictionary/StringDictionaryProxy.cpp

Lines changed: 12 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ std::string StringDictionaryProxy::getString(int32_t string_id) const {
142142
}
143143

144144
std::string StringDictionaryProxy::getStringUnlocked(const int32_t string_id) const {
145-
if (string_id >= 0 && storageEntryCount() > 0) {
145+
if (string_id < generation_) {
146146
return string_dict_->getString(string_id);
147147
}
148148
unsigned const string_index = transientIdToIndex(string_id);
@@ -157,10 +157,10 @@ std::vector<std::string> StringDictionaryProxy::getStrings(
157157
strings.reserve(string_ids.size());
158158
std::shared_lock<std::shared_mutex> read_lock(rw_mutex_);
159159
for (const auto string_id : string_ids) {
160-
if (string_id >= 0) {
161-
strings.emplace_back(string_dict_->getString(string_id));
162-
} else if (inline_int_null_value<int32_t>() == string_id) {
160+
if (inline_int_null_value<int32_t>() == string_id) {
163161
strings.emplace_back("");
162+
} else if (string_id < generation_) {
163+
strings.emplace_back(string_dict_->getString(string_id));
164164
} else {
165165
unsigned const string_index = transientIdToIndex(string_id);
166166
strings.emplace_back(*transient_string_vec_[string_index]);
@@ -195,7 +195,7 @@ StringDictionaryProxy::buildIntersectionTranslationMapToOtherProxyUnlocked(
195195
std::vector<std::string> transient_lookup_strings(num_transient_entries);
196196
std::transform(transient_string_vec_.cbegin(),
197197
transient_string_vec_.cend(),
198-
transient_lookup_strings.rbegin(),
198+
transient_lookup_strings.begin(),
199199
[](std::string const* ptr) { return *ptr; });
200200

201201
// This lookup may have a different snapshot of
@@ -208,14 +208,14 @@ StringDictionaryProxy::buildIntersectionTranslationMapToOtherProxyUnlocked(
208208
// a vector of pointer-to-strings so we don't have to materialize
209209
// transient_string_vec_ into transient_lookup_strings.
210210

211-
num_transient_strings_not_translated =
212-
dest_proxy->getTransientBulkImpl(transient_lookup_strings, id_map.data(), false);
211+
num_transient_strings_not_translated = dest_proxy->getTransientBulkImpl(
212+
transient_lookup_strings, id_map.transientData(), false);
213213
}
214214

215215
// Now map strings in dictionary
216216
// We place non-transient strings after the transient strings
217217
// if they exist, otherwise at index 0
218-
int32_t* translation_map_stored_entries_ptr = id_map.storageData();
218+
int32_t* translation_map_stored_entries_ptr = id_map.data();
219219

220220
auto dest_transient_lookup_callback = [dest_proxy, translation_map_stored_entries_ptr](
221221
const std::string_view& source_string,
@@ -239,8 +239,7 @@ StringDictionaryProxy::buildIntersectionTranslationMapToOtherProxyUnlocked(
239239
: 0UL;
240240

241241
const size_t num_dest_entries = dest_proxy->entryCountUnlocked();
242-
const size_t num_total_entries =
243-
id_map.getVectorMap().size() - 1UL /* account for skipped entry -1 */;
242+
const size_t num_total_entries = id_map.size();
244243
CHECK_GT(num_total_entries, 0UL);
245244
const size_t num_strings_not_translated =
246245
num_transient_strings_not_translated + num_persisted_strings_not_translated;
@@ -324,26 +323,16 @@ StringDictionaryProxy::IdMap StringDictionaryProxy::buildUnionTranslationMapToOt
324323
}
325324
const int32_t map_domain_start = id_map.domainStart();
326325
const int32_t map_domain_end = id_map.domainEnd();
327-
// First iterate over transient strings and add to dest map
328326
// Todo (todd): Add call to fetch string_views (local) or strings (distributed)
329327
// for all non-translated ids to avoid string-by-string fetch
330-
for (int32_t source_string_id = map_domain_start; source_string_id < -1;
328+
for (int32_t source_string_id = map_domain_start; source_string_id < map_domain_end;
331329
++source_string_id) {
332330
if (id_map[source_string_id] == StringDictionary::INVALID_STR_ID) {
333331
const auto source_string = getStringUnlocked(source_string_id);
334332
const auto dest_string_id = dest_proxy->getOrAddTransientUnlocked(source_string);
335333
id_map[source_string_id] = dest_string_id;
336334
}
337335
}
338-
// Now iterate over stored strings
339-
for (int32_t source_string_id = 0; source_string_id < map_domain_end;
340-
++source_string_id) {
341-
if (id_map[source_string_id] == StringDictionary::INVALID_STR_ID) {
342-
const auto source_string = string_dict_->getString(source_string_id);
343-
const auto dest_string_id = dest_proxy->getOrAddTransientUnlocked(source_string);
344-
id_map[source_string_id] = dest_string_id;
345-
}
346-
}
347336
}
348337
return id_map;
349338
}
@@ -449,7 +438,7 @@ std::vector<int32_t> StringDictionaryProxy::getRegexpLike(const std::string& pat
449438

450439
std::pair<const char*, size_t> StringDictionaryProxy::getStringBytes(
451440
int32_t string_id) const noexcept {
452-
if (string_id >= 0) {
441+
if (string_id < generation_) {
453442
return string_dict_.get()->getStringBytes(string_id);
454443
}
455444
unsigned const string_index = transientIdToIndex(string_id);
@@ -525,7 +514,7 @@ class StringLocalCallback : public StringDictionary::StringCallback {
525514
};
526515

527516
std::ostream& operator<<(std::ostream& os, StringDictionaryProxy::IdMap const& id_map) {
528-
return os << "IdMap(offset_(" << id_map.offset_ << ") vector_map_"
517+
return os << "IdMap(dict_size_(" << id_map.dict_size_ << ") vector_map_"
529518
<< shared::printContainer(id_map.vector_map_) << ')';
530519
}
531520

omniscidb/StringDictionary/StringDictionaryProxy.h

Lines changed: 13 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -80,27 +80,26 @@ class StringDictionaryProxy {
8080
std::pair<const char*, size_t> getStringBytes(int32_t string_id) const noexcept;
8181

8282
class IdMap {
83-
size_t const offset_;
83+
const uint32_t dict_size_;
8484
std::vector<int32_t> vector_map_;
8585
int64_t num_untranslated_strings_{-1};
8686

8787
public:
88-
// +1 is added to skip string_id=-1 reserved for INVALID_STR_ID. id_map[-1]==-1.
8988
IdMap(uint32_t const tran_size, uint32_t const dict_size)
90-
: offset_(tran_size + 1)
91-
, vector_map_(offset_ + dict_size, StringDictionary::INVALID_STR_ID) {}
89+
: dict_size_(dict_size)
90+
, vector_map_(tran_size + dict_size, StringDictionary::INVALID_STR_ID) {}
9291
IdMap(IdMap const&) = delete;
9392
IdMap(IdMap&&) = default;
94-
bool empty() const { return vector_map_.size() == 1; }
95-
inline size_t getIndex(int32_t const id) const { return offset_ + id; }
93+
bool empty() const { return vector_map_.empty(); }
94+
inline size_t getIndex(int32_t const id) const { return id; }
9695
std::vector<int32_t> const& getVectorMap() const { return vector_map_; }
9796
size_t size() const { return vector_map_.size(); }
98-
size_t numTransients() const { return offset_ - 1; }
99-
size_t numNonTransients() const { return vector_map_.size() - offset_; }
97+
size_t numTransients() const { return vector_map_.size() - dict_size_; }
98+
size_t numNonTransients() const { return dict_size_; }
10099
int32_t* data() { return vector_map_.data(); }
101100
int32_t const* data() const { return vector_map_.data(); }
102-
int32_t domainStart() const { return -static_cast<int32_t>(offset_); }
103-
int32_t domainEnd() const { return static_cast<int32_t>(numNonTransients()); }
101+
int32_t domainStart() const { return 0; }
102+
int32_t domainEnd() const { return vector_map_.size(); }
104103
// Next two methods are currently used by buildUnionTranslationMapToOtherProxy to
105104
// short circuit iteration over ids after intersection translation if all
106105
// ids translated. Currently the private num_untranslated_strings_ is initialized
@@ -114,7 +113,7 @@ class StringDictionaryProxy {
114113
void setNumUntranslatedStrings(const size_t num_untranslated_strings) {
115114
num_untranslated_strings_ = static_cast<int64_t>(num_untranslated_strings);
116115
}
117-
int32_t* storageData() { return vector_map_.data() + offset_; }
116+
int32_t* transientData() { return vector_map_.data() + dict_size_; }
118117
int32_t& operator[](int32_t const id) { return vector_map_[getIndex(id)]; }
119118
int32_t operator[](int32_t const id) const { return vector_map_[getIndex(id)]; }
120119
friend std::ostream& operator<<(std::ostream&, IdMap const&);
@@ -187,17 +186,11 @@ class StringDictionaryProxy {
187186
void eachStringSerially(StringDictionary::StringCallback&) const;
188187

189188
private:
190-
// INVALID_STR_ID = -1 is reserved for invalid string_ids.
191-
// Thus the greatest valid transient string_id is -2.
192-
static unsigned transientIdToIndex(int32_t const id) {
193-
constexpr int max_transient_string_id = -2;
194-
return static_cast<unsigned>(max_transient_string_id - id);
189+
unsigned transientIdToIndex(int32_t const id) const {
190+
return static_cast<unsigned>(id - generation_);
195191
}
196192

197-
static int32_t transientIndexToId(unsigned const index) {
198-
constexpr int max_transient_string_id = -2;
199-
return static_cast<int32_t>(max_transient_string_id - index);
200-
}
193+
int32_t transientIndexToId(unsigned const index) const { return generation_ + index; }
201194

202195
/**
203196
* @brief Returns the number of string entries in the underlying string dictionary,

omniscidb/Tests/ColumnarResultsTest.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ void test_columnar_conversion(const std::vector<TargetInfo>& target_infos,
9595
target_infos,
9696
query_mem_desc,
9797
generator,
98+
0,
9899
non_empty_step_size);
99100

100101
// Columnar Conversion:

omniscidb/Tests/GpuSharedMemoryTestHelpers.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,7 @@ std::vector<std::unique_ptr<ResultSet>> create_and_fill_input_result_sets(
163163
target_infos,
164164
query_mem_desc,
165165
generators[i],
166+
0,
166167
steps[i]);
167168
}
168169
return result_sets;

omniscidb/Tests/ResultSetBaselineRadixSortTest.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ void fill_storage_buffer_baseline_sort_int(int8_t* buff,
114114
sizeof(K),
115115
row_size_quad);
116116
CHECK(value_slots);
117-
fill_one_entry_baseline(value_slots, val, target_infos);
117+
fill_one_entry_baseline(value_slots, val, target_infos, 0);
118118
}
119119
}
120120

@@ -160,7 +160,8 @@ void fill_storage_buffer_baseline_sort_fp(int8_t* buff,
160160
sizeof(int64_t),
161161
key_component_count + target_slot_count);
162162
CHECK(value_slots);
163-
fill_one_entry_baseline(value_slots, val, target_infos, false, val == null_pattern);
163+
fill_one_entry_baseline(
164+
value_slots, val, target_infos, 0, false, val == null_pattern);
164165
}
165166
}
166167

omniscidb/Tests/ResultSetTest.cpp

Lines changed: 39 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -420,14 +420,14 @@ void ResultSetEmulator::rse_fill_storage_buffer_perfect_hash_rowwise(
420420
// exersized for null_val test
421421
rs_values[i] = -1;
422422
key_buff = fill_one_entry_no_collisions(
423-
entries_buff, rs_query_mem_desc, v, rs_target_infos, false, true);
423+
entries_buff, rs_query_mem_desc, v, rs_target_infos, 0, false, true);
424424
} else {
425425
key_buff = fill_one_entry_no_collisions(
426-
entries_buff, rs_query_mem_desc, v, rs_target_infos, false, false);
426+
entries_buff, rs_query_mem_desc, v, rs_target_infos, 0, false, false);
427427
}
428428
} else {
429429
key_buff = fill_one_entry_no_collisions(
430-
entries_buff, rs_query_mem_desc, v, rs_target_infos, false);
430+
entries_buff, rs_query_mem_desc, v, rs_target_infos, 0, false);
431431
}
432432
} else {
433433
auto key_buff_i64 = reinterpret_cast<int64_t*>(key_buff);
@@ -441,10 +441,10 @@ void ResultSetEmulator::rse_fill_storage_buffer_perfect_hash_rowwise(
441441
rs_values[i] = -1;
442442
}
443443
key_buff = fill_one_entry_no_collisions(
444-
entries_buff, rs_query_mem_desc, 0xdeadbeef, rs_target_infos, true, true);
444+
entries_buff, rs_query_mem_desc, 0xdeadbeef, rs_target_infos, 0, true, true);
445445
} else {
446446
key_buff = fill_one_entry_no_collisions(
447-
entries_buff, rs_query_mem_desc, 0xdeadbeef, rs_target_infos, true);
447+
entries_buff, rs_query_mem_desc, 0xdeadbeef, rs_target_infos, 0, true);
448448
}
449449
}
450450
}
@@ -562,10 +562,10 @@ void ResultSetEmulator::rse_fill_storage_buffer_baseline_rowwise(
562562
if ((rs_flow == 2) &&
563563
(i >= rs_entry_count - 4)) { // null_val test-cases: last four rows
564564
rs_values[i] = -1;
565-
fill_one_entry_baseline(value_slots, v, rs_target_infos, false, true);
565+
fill_one_entry_baseline(value_slots, v, rs_target_infos, 0, false, true);
566566
} else {
567567
rs_values[i] = v;
568-
fill_one_entry_baseline(value_slots, v, rs_target_infos, false, false);
568+
fill_one_entry_baseline(value_slots, v, rs_target_infos, 0, false, false);
569569
}
570570
}
571571
}
@@ -886,8 +886,12 @@ void test_iterate(const std::vector<TargetInfo>& target_infos,
886886
}
887887
const auto storage = result_set.allocateStorage();
888888
EvenNumberGenerator generator;
889-
fill_storage_buffer(
890-
storage->getUnderlyingBuffer(), target_infos, query_mem_desc, generator, 2);
889+
fill_storage_buffer(storage->getUnderlyingBuffer(),
890+
target_infos,
891+
query_mem_desc,
892+
generator,
893+
sdp->getBaseGeneration(),
894+
2);
891895
int64_t ref_val{0};
892896
while (true) {
893897
const auto row = result_set.getNextRow(true, false);
@@ -1010,7 +1014,7 @@ void run_reduction(const std::vector<TargetInfo>& target_infos,
10101014
auto executor = Executor::getExecutor(getDataMgr());
10111015
const auto row_set_mem_owner = std::make_shared<RowSetMemoryOwner>(
10121016
g_data_provider.get(), Executor::getArenaBlockSize());
1013-
row_set_mem_owner->addStringDict(g_sd, 1, g_sd->storageEntryCount());
1017+
auto sdp = row_set_mem_owner->addStringDict(g_sd, 1, g_sd->storageEntryCount());
10141018
const auto rs1 = std::make_unique<ResultSet>(target_infos,
10151019
ExecutorDeviceType::CPU,
10161020
query_mem_desc,
@@ -1019,8 +1023,12 @@ void run_reduction(const std::vector<TargetInfo>& target_infos,
10191023
0,
10201024
0);
10211025
storage1 = rs1->allocateStorage();
1022-
fill_storage_buffer(
1023-
storage1->getUnderlyingBuffer(), target_infos, query_mem_desc, generator1, step);
1026+
fill_storage_buffer(storage1->getUnderlyingBuffer(),
1027+
target_infos,
1028+
query_mem_desc,
1029+
generator1,
1030+
sdp->getBaseGeneration(),
1031+
step);
10241032
const auto rs2 = std::make_unique<ResultSet>(target_infos,
10251033
ExecutorDeviceType::CPU,
10261034
query_mem_desc,
@@ -1029,8 +1037,12 @@ void run_reduction(const std::vector<TargetInfo>& target_infos,
10291037
0,
10301038
0);
10311039
storage2 = rs2->allocateStorage();
1032-
fill_storage_buffer(
1033-
storage2->getUnderlyingBuffer(), target_infos, query_mem_desc, generator2, step);
1040+
fill_storage_buffer(storage2->getUnderlyingBuffer(),
1041+
target_infos,
1042+
query_mem_desc,
1043+
generator2,
1044+
sdp->getBaseGeneration(),
1045+
step);
10341046
ResultSetManager rs_manager;
10351047
std::vector<ResultSet*> storage_set{rs1.get(), rs2.get()};
10361048
rs_manager.reduce(storage_set, config(), executor.get());
@@ -1048,7 +1060,7 @@ void test_reduce(const std::vector<TargetInfo>& target_infos,
10481060
auto executor = Executor::getExecutor(getDataMgr());
10491061
const auto row_set_mem_owner = std::make_shared<RowSetMemoryOwner>(
10501062
g_data_provider.get(), Executor::getArenaBlockSize());
1051-
row_set_mem_owner->addStringDict(g_sd, 1, g_sd->storageEntryCount());
1063+
auto sdp = row_set_mem_owner->addStringDict(g_sd, 1, g_sd->storageEntryCount());
10521064
const auto rs1 = std::make_unique<ResultSet>(target_infos,
10531065
ExecutorDeviceType::CPU,
10541066
query_mem_desc,
@@ -1057,8 +1069,12 @@ void test_reduce(const std::vector<TargetInfo>& target_infos,
10571069
0,
10581070
0);
10591071
storage1 = rs1->allocateStorage();
1060-
fill_storage_buffer(
1061-
storage1->getUnderlyingBuffer(), target_infos, query_mem_desc, generator1, step);
1072+
fill_storage_buffer(storage1->getUnderlyingBuffer(),
1073+
target_infos,
1074+
query_mem_desc,
1075+
generator1,
1076+
sdp->getBaseGeneration(),
1077+
step);
10621078
const auto rs2 = std::make_unique<ResultSet>(target_infos,
10631079
ExecutorDeviceType::CPU,
10641080
query_mem_desc,
@@ -1067,8 +1083,12 @@ void test_reduce(const std::vector<TargetInfo>& target_infos,
10671083
0,
10681084
0);
10691085
storage2 = rs2->allocateStorage();
1070-
fill_storage_buffer(
1071-
storage2->getUnderlyingBuffer(), target_infos, query_mem_desc, generator2, step);
1086+
fill_storage_buffer(storage2->getUnderlyingBuffer(),
1087+
target_infos,
1088+
query_mem_desc,
1089+
generator2,
1090+
sdp->getBaseGeneration(),
1091+
step);
10721092
ResultSetManager rs_manager;
10731093
std::vector<ResultSet*> storage_set{rs1.get(), rs2.get()};
10741094
auto result_rs = rs_manager.reduce(storage_set, config(), executor.get());

0 commit comments

Comments
 (0)