Skip to content
This repository was archived by the owner on May 9, 2024. It is now read-only.

Commit 87977e8

Browse files
committed
Support nested string dicts for simple get/add methods.
Signed-off-by: ienkovich <[email protected]>
1 parent b82e8e0 commit 87977e8

File tree

5 files changed

+182
-29
lines changed

5 files changed

+182
-29
lines changed

omniscidb/QueryEngine/Execute.cpp

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4193,8 +4193,7 @@ StringDictionaryGenerations Executor::computeStringDictionaryGenerations(
41934193
const int dict_id = col_type->as<hdk::ir::ExtDictionaryType>()->dictId();
41944194
const auto dd = data_mgr_->getDictMetadata(dict_id);
41954195
CHECK(dd && dd->stringDict);
4196-
string_dictionary_generations.setGeneration(dict_id,
4197-
dd->stringDict->storageEntryCount());
4196+
string_dictionary_generations.setGeneration(dict_id, dd->stringDict->entryCount());
41984197
}
41994198
}
42004199
return string_dictionary_generations;

omniscidb/StringDictionary/StringDictionary.cpp

Lines changed: 101 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -89,9 +89,11 @@ StringDictionary::StringDictionary(const DictRef& dict_ref,
8989
const bool materializeHashes,
9090
size_t initial_capacity)
9191
: dict_ref_(dict_ref)
92+
, base_generation_(0)
9293
, str_count_(0)
93-
, string_id_uint32_table_(initial_capacity, INVALID_STR_ID)
94-
, hash_cache_(initial_capacity)
94+
// Search code assumes non-empty table.
95+
, string_id_uint32_table_(std::max(initial_capacity, (size_t)2), INVALID_STR_ID)
96+
, hash_cache_(std::max(initial_capacity, (size_t)2))
9597
, materialize_hashes_(materializeHashes)
9698
, offset_map_(nullptr)
9799
, payload_map_(nullptr)
@@ -103,6 +105,26 @@ StringDictionary::StringDictionary(const DictRef& dict_ref,
103105
CHECK_EQ(size_t(0), (initial_capacity & (initial_capacity - 1)));
104106
}
105107

108+
StringDictionary::StringDictionary(std::shared_ptr<StringDictionary> base_dict,
109+
const int64_t generation,
110+
const bool materializeHashes,
111+
size_t initial_capacity)
112+
: dict_ref_(-1, -1)
113+
, base_dict_(base_dict)
114+
, base_generation_(generation >= 0 ? generation
115+
: static_cast<int64_t>(base_dict->entryCount()))
116+
, str_count_(0)
117+
// Search code assumes non-empty table.
118+
, string_id_uint32_table_(std::max(initial_capacity, (size_t)2), INVALID_STR_ID)
119+
, hash_cache_(std::max(initial_capacity, (size_t)2))
120+
, materialize_hashes_(materializeHashes)
121+
, offset_map_(nullptr)
122+
, payload_map_(nullptr)
123+
, offset_file_size_(0)
124+
, payload_file_size_(0)
125+
, payload_file_off_(0)
126+
, strings_cache_(nullptr) {}
127+
106128
namespace {
107129
class MapMaker : public StringDictionary::StringCallback {
108130
std::unordered_map<std::string, int32_t> map_;
@@ -122,6 +144,7 @@ class MapMaker : public StringDictionary::StringCallback {
122144
// Call serial_callback for each (string/_view, string_id). Must be called serially.
123145
void StringDictionary::eachStringSerially(int64_t const generation,
124146
StringCallback& serial_callback) const {
147+
CHECK(!base_dict_) << "Not implemented";
125148
size_t const n = std::min(static_cast<size_t>(generation), str_count_);
126149
CHECK_LE(n, static_cast<size_t>(std::numeric_limits<int32_t>::max()) + 1);
127150
mapd_shared_lock<mapd_shared_mutex> read_lock(rw_mutex_);
@@ -154,6 +177,12 @@ int32_t StringDictionary::getOrAdd(const std::string_view& str) noexcept {
154177
}
155178
CHECK(str.size() <= MAX_STRLEN);
156179
const uint32_t hash = hash_string(str);
180+
if (base_dict_) {
181+
auto base_res = base_dict_->getIdOfString(str, hash);
182+
if (base_res != INVALID_STR_ID && base_res < base_generation_) {
183+
return base_res;
184+
}
185+
}
157186
{
158187
mapd_shared_lock<mapd_shared_mutex> read_lock(rw_mutex_);
159188
const uint32_t bucket = computeBucket(hash, str, string_id_uint32_table_);
@@ -174,7 +203,7 @@ int32_t StringDictionary::getOrAdd(const std::string_view& str) noexcept {
174203
<< "Maximum number (" << str_count_
175204
<< ") of Dictionary encoded Strings reached for this column";
176205
appendToStorage(str);
177-
string_id_uint32_table_[bucket] = static_cast<int32_t>(str_count_);
206+
string_id_uint32_table_[bucket] = indexToId(str_count_);
178207
if (materialize_hashes_) {
179208
hash_cache_[str_count_] = hash;
180209
}
@@ -269,6 +298,7 @@ template <class T, class String>
269298
size_t StringDictionary::getBulk(const std::vector<String>& string_vec,
270299
T* encoded_vec,
271300
const int64_t generation) const {
301+
CHECK(!base_dict_) << "Not implemented";
272302
constexpr int64_t target_strings_per_thread{1000};
273303
const int64_t num_lookup_strings = string_vec.size();
274304
if (num_lookup_strings == 0) {
@@ -358,6 +388,7 @@ template size_t StringDictionary::getBulk(const std::vector<std::string>& string
358388
template <class T, class String>
359389
void StringDictionary::getOrAddBulk(const std::vector<String>& input_strings,
360390
T* output_string_ids) {
391+
CHECK(!base_dict_) << "Not implemented";
361392
if (g_enable_stringdict_parallel) {
362393
getOrAddBulkParallel(input_strings, output_string_ids);
363394
return;
@@ -414,6 +445,7 @@ void StringDictionary::getOrAddBulk(const std::vector<String>& input_strings,
414445
template <class T, class String>
415446
void StringDictionary::getOrAddBulkParallel(const std::vector<String>& input_strings,
416447
T* output_string_ids) {
448+
CHECK(!base_dict_) << "Not implemented";
417449
// Compute hashes of the input strings up front, and in parallel,
418450
// as the string hashing does not need to be behind the subsequent write_lock
419451
std::vector<uint32_t> input_strings_hashes(input_strings.size());
@@ -504,42 +536,77 @@ template void StringDictionary::getOrAddBulk(
504536

505537
template <class String>
506538
int32_t StringDictionary::getIdOfString(const String& str) const {
507-
mapd_shared_lock<mapd_shared_mutex> read_lock(rw_mutex_);
508-
return getUnlocked(str);
539+
return getIdOfString(str, hash_string(str));
509540
}
510541

511542
template int32_t StringDictionary::getIdOfString(const std::string&) const;
512543
template int32_t StringDictionary::getIdOfString(const std::string_view&) const;
513544

545+
template <class String>
546+
int32_t StringDictionary::getIdOfString(const String& str, const uint32_t hash) const {
547+
mapd_shared_lock<mapd_shared_mutex> read_lock(rw_mutex_);
548+
return getUnlocked(str, hash);
549+
}
550+
551+
template int32_t StringDictionary::getIdOfString(const std::string&,
552+
const uint32_t) const;
553+
template int32_t StringDictionary::getIdOfString(const std::string_view&,
554+
const uint32_t) const;
555+
514556
int32_t StringDictionary::getUnlocked(const std::string_view sv) const noexcept {
515-
const uint32_t hash = hash_string(sv);
557+
return getUnlocked(sv, hash_string(sv));
558+
}
559+
560+
int32_t StringDictionary::getUnlocked(const std::string_view sv,
561+
const uint32_t hash) const noexcept {
562+
if (base_dict_) {
563+
auto base_res = base_dict_->getIdOfString(sv, hash);
564+
if (base_res != INVALID_STR_ID && base_res < base_generation_) {
565+
return base_res;
566+
}
567+
}
516568
auto str_id = string_id_uint32_table_[computeBucket(hash, sv, string_id_uint32_table_)];
517569
return str_id;
518570
}
519571

520572
std::string StringDictionary::getString(int32_t string_id) const {
573+
if (inline_int_null_value<int32_t>() == string_id) {
574+
return "";
575+
}
576+
if (string_id < base_generation_) {
577+
return base_dict_->getString(string_id);
578+
}
521579
mapd_shared_lock<mapd_shared_mutex> read_lock(rw_mutex_);
522-
return getStringUnlocked(string_id);
580+
return getOwnedStringChecked(string_id);
523581
}
524582

525583
std::string StringDictionary::getStringUnlocked(int32_t string_id) const noexcept {
526-
CHECK_LT(string_id, static_cast<int32_t>(str_count_));
527-
return getStringChecked(string_id);
584+
if (string_id < base_generation_) {
585+
return base_dict_->getString(string_id);
586+
}
587+
return getOwnedStringChecked(string_id);
528588
}
529589

530590
std::pair<char*, size_t> StringDictionary::getStringBytes(
531591
int32_t string_id) const noexcept {
592+
if (string_id < base_generation_) {
593+
return base_dict_->getStringBytes(string_id);
594+
}
532595
mapd_shared_lock<mapd_shared_mutex> read_lock(rw_mutex_);
533596
CHECK_LE(0, string_id);
534-
CHECK_LT(string_id, static_cast<int32_t>(str_count_));
535-
return getStringBytesChecked(string_id);
597+
return getOwnedStringBytesChecked(string_id);
536598
}
537599

538600
size_t StringDictionary::storageEntryCount() const {
539601
mapd_shared_lock<mapd_shared_mutex> read_lock(rw_mutex_);
540602
return str_count_;
541603
}
542604

605+
size_t StringDictionary::entryCount() const {
606+
mapd_shared_lock<mapd_shared_mutex> read_lock(rw_mutex_);
607+
return str_count_ + base_generation_;
608+
}
609+
543610
namespace {
544611

545612
bool is_like(const std::string& str,
@@ -571,6 +638,7 @@ std::vector<int32_t> StringDictionary::getLike(const std::string& pattern,
571638
const bool is_simple,
572639
const char escape,
573640
const size_t generation) const {
641+
CHECK(!base_dict_) << "Not implemented";
574642
mapd_lock_guard<mapd_shared_mutex> write_lock(rw_mutex_);
575643
const auto cache_key = std::make_tuple(pattern, icase, is_simple, escape);
576644
const auto it = like_cache_.find(cache_key);
@@ -619,6 +687,7 @@ std::vector<int32_t> StringDictionary::getLike(const std::string& pattern,
619687
std::vector<int32_t> StringDictionary::getEquals(std::string pattern,
620688
std::string comp_operator,
621689
size_t generation) {
690+
CHECK(!base_dict_) << "Not implemented";
622691
std::vector<int32_t> result;
623692
auto eq_id_itr = equal_cache_.find(pattern);
624693
int32_t eq_id = MAX_STRLEN + 1;
@@ -679,6 +748,7 @@ std::vector<int32_t> StringDictionary::getEquals(std::string pattern,
679748
std::vector<int32_t> StringDictionary::getCompare(const std::string& pattern,
680749
const std::string& comp_operator,
681750
const size_t generation) {
751+
CHECK(!base_dict_) << "Not implemented";
682752
mapd_lock_guard<mapd_shared_mutex> write_lock(rw_mutex_);
683753
std::vector<int32_t> ret;
684754
if (str_count_ == 0) {
@@ -837,6 +907,7 @@ bool is_regexp_like(const std::string& str,
837907
std::vector<int32_t> StringDictionary::getRegexpLike(const std::string& pattern,
838908
const char escape,
839909
const size_t generation) const {
910+
CHECK(!base_dict_) << "Not implemented";
840911
mapd_lock_guard<mapd_shared_mutex> write_lock(rw_mutex_);
841912
const auto cache_key = std::make_pair(pattern, escape);
842913
const auto it = regex_cache_.find(cache_key);
@@ -879,6 +950,7 @@ std::vector<int32_t> StringDictionary::getRegexpLike(const std::string& pattern,
879950
}
880951

881952
std::vector<std::string> StringDictionary::copyStrings() const {
953+
CHECK(!base_dict_) << "Not implemented";
882954
mapd_lock_guard<mapd_shared_mutex> write_lock(rw_mutex_);
883955

884956
if (strings_cache_) {
@@ -936,15 +1008,15 @@ void StringDictionary::increaseHashTableCapacity() noexcept {
9361008
for (size_t i = 0; i != str_count_; ++i) {
9371009
const uint32_t hash = hash_cache_[i];
9381010
const uint32_t bucket = computeUniqueBucketWithHash(hash, new_str_ids);
939-
new_str_ids[bucket] = i;
1011+
new_str_ids[bucket] = indexToId(i);
9401012
}
9411013
hash_cache_.resize(hash_cache_.size() * 2);
9421014
} else {
9431015
for (size_t i = 0; i != str_count_; ++i) {
944-
const auto str = getStringChecked(i);
1016+
const auto str = getOwnedStringChecked(indexToId(i));
9451017
const uint32_t hash = hash_string(str);
9461018
const uint32_t bucket = computeUniqueBucketWithHash(hash, new_str_ids);
947-
new_str_ids[bucket] = i;
1019+
new_str_ids[bucket] = indexToId(i);
9481020
}
9491021
}
9501022
string_id_uint32_table_.swap(new_str_ids);
@@ -958,6 +1030,7 @@ void StringDictionary::increaseHashTableCapacityFromStorageAndMemory(
9581030
const std::vector<String>& input_strings,
9591031
const std::vector<size_t>& string_memory_ids,
9601032
const std::vector<uint32_t>& input_strings_hashes) noexcept {
1033+
CHECK(!base_dict_) << "Not implemented";
9611034
std::vector<int32_t> new_str_ids(string_id_uint32_table_.size() * 2, INVALID_STR_ID);
9621035
if (materialize_hashes_) {
9631036
for (size_t i = 0; i != str_count; ++i) {
@@ -968,7 +1041,7 @@ void StringDictionary::increaseHashTableCapacityFromStorageAndMemory(
9681041
hash_cache_.resize(hash_cache_.size() * 2);
9691042
} else {
9701043
for (size_t storage_idx = 0; storage_idx != storage_high_water_mark; ++storage_idx) {
971-
const auto storage_string = getStringChecked(storage_idx);
1044+
const auto storage_string = getOwnedStringChecked(storage_idx);
9721045
const uint32_t hash = hash_string(storage_string);
9731046
const uint32_t bucket = computeUniqueBucketWithHash(hash, new_str_ids);
9741047
new_str_ids[bucket] = storage_idx;
@@ -983,13 +1056,13 @@ void StringDictionary::increaseHashTableCapacityFromStorageAndMemory(
9831056
string_id_uint32_table_.swap(new_str_ids);
9841057
}
9851058

986-
std::string StringDictionary::getStringChecked(const int string_id) const noexcept {
1059+
std::string StringDictionary::getOwnedStringChecked(const int string_id) const noexcept {
9871060
const auto str_canary = getStringFromStorage(string_id);
9881061
CHECK(!str_canary.canary);
9891062
return std::string(str_canary.c_str_ptr, str_canary.size);
9901063
}
9911064

992-
std::pair<char*, size_t> StringDictionary::getStringBytesChecked(
1065+
std::pair<char*, size_t> StringDictionary::getOwnedStringBytesChecked(
9931066
const int string_id) const noexcept {
9941067
const auto str_canary = getStringFromStorage(string_id);
9951068
CHECK(!str_canary.canary);
@@ -1009,7 +1082,7 @@ uint32_t StringDictionary::computeBucket(
10091082
INVALID_STR_ID) { // In this case it means the slot is available for use
10101083
break;
10111084
}
1012-
if ((materialize_hashes_ && hash == hash_cache_[candidate_string_id]) ||
1085+
if ((materialize_hashes_ && hash == hashById(candidate_string_id)) ||
10131086
!materialize_hashes_) {
10141087
const auto candidate_string = getStringFromStorageFast(candidate_string_id);
10151088
if (input_string.size() == candidate_string.size() &&
@@ -1034,6 +1107,7 @@ uint32_t StringDictionary::computeBucketFromStorageAndMemory(
10341107
const size_t storage_high_water_mark,
10351108
const std::vector<String>& input_strings,
10361109
const std::vector<size_t>& string_memory_ids) const noexcept {
1110+
CHECK(!base_dict_) << "Not implemented";
10371111
uint32_t bucket = input_string_hash & (string_id_uint32_table.size() - 1);
10381112
while (true) {
10391113
const int32_t candidate_string_id = string_id_uint32_table[bucket];
@@ -1153,14 +1227,13 @@ void StringDictionary::appendToStorageBulk(
11531227

11541228
std::string_view StringDictionary::getStringFromStorageFast(
11551229
const int string_id) const noexcept {
1156-
const StringIdxEntry* str_meta = offset_map_ + string_id;
1230+
const StringIdxEntry* str_meta = offset_map_ + idToIndex(string_id);
11571231
return {payload_map_ + str_meta->off, str_meta->size};
11581232
}
11591233

11601234
StringDictionary::PayloadString StringDictionary::getStringFromStorage(
11611235
const int string_id) const noexcept {
1162-
CHECK_GE(string_id, 0);
1163-
const StringIdxEntry* str_meta = offset_map_ + string_id;
1236+
const StringIdxEntry* str_meta = offset_map_ + idToIndex(string_id);
11641237
if (str_meta->size == 0xffff) {
11651238
// hit the canary
11661239
return {nullptr, 0, true};
@@ -1213,6 +1286,7 @@ void StringDictionary::invalidateInvertedIndex() noexcept {
12131286
}
12141287

12151288
void StringDictionary::buildSortedCache() {
1289+
CHECK(!base_dict_) << "Not implemented";
12161290
// This method is not thread-safe.
12171291
const auto cur_cache_size = sorted_cache.size();
12181292
std::vector<int32_t> temp_sorted_cache;
@@ -1224,6 +1298,7 @@ void StringDictionary::buildSortedCache() {
12241298
}
12251299

12261300
void StringDictionary::sortCache(std::vector<int32_t>& cache) {
1301+
CHECK(!base_dict_) << "Not implemented";
12271302
// This method is not thread-safe.
12281303

12291304
// this boost sort is creating some problems when we use UTF-8 encoded strings.
@@ -1237,6 +1312,7 @@ void StringDictionary::sortCache(std::vector<int32_t>& cache) {
12371312
}
12381313

12391314
void StringDictionary::mergeSortedCache(std::vector<int32_t>& temp_sorted_cache) {
1315+
CHECK(!base_dict_) << "Not implemented";
12401316
// this method is not thread safe
12411317
std::vector<int32_t> updated_cache(temp_sorted_cache.size() + sorted_cache.size());
12421318
size_t t_idx = 0, s_idx = 0, idx = 0;
@@ -1266,6 +1342,8 @@ std::vector<int32_t> StringDictionaryTranslator::buildDictionaryTranslationMap(
12661342
const std::shared_ptr<StringDictionary> source_dict,
12671343
const std::shared_ptr<StringDictionary> dest_dict,
12681344
StringLookupCallback const& dest_transient_lookup_callback) {
1345+
CHECK(!source_dict->getBaseDictionary());
1346+
CHECK(!dest_dict->getBaseDictionary());
12691347
auto timer = DEBUG_TIMER(__func__);
12701348
const size_t num_source_strings = source_dict->storageEntryCount();
12711349
const size_t num_dest_strings = dest_dict->storageEntryCount();
@@ -1290,6 +1368,8 @@ size_t StringDictionaryTranslator::buildDictionaryTranslationMap(
12901368
const int64_t dest_generation,
12911369
const bool dest_has_transients,
12921370
StringLookupCallback const& dest_transient_lookup_callback) {
1371+
CHECK(!source_dict->getBaseDictionary());
1372+
CHECK(!dest_dict->getBaseDictionary());
12931373
auto timer = DEBUG_TIMER(__func__);
12941374
CHECK_GE(source_generation, 0L);
12951375
CHECK_GE(dest_generation, 0L);

0 commit comments

Comments
 (0)