@@ -89,9 +89,11 @@ StringDictionary::StringDictionary(const DictRef& dict_ref,
89
89
const bool materializeHashes,
90
90
size_t initial_capacity)
91
91
: dict_ref_(dict_ref)
92
+ , base_generation_(0 )
92
93
, str_count_(0 )
93
- , string_id_uint32_table_(initial_capacity, INVALID_STR_ID)
94
- , hash_cache_(initial_capacity)
94
+ // Search code assumes non-empty table.
95
+ , string_id_uint32_table_(std::max(initial_capacity, (size_t )2 ), INVALID_STR_ID)
96
+ , hash_cache_(std::max(initial_capacity, (size_t )2 ))
95
97
, materialize_hashes_(materializeHashes)
96
98
, offset_map_(nullptr )
97
99
, payload_map_(nullptr )
@@ -103,6 +105,26 @@ StringDictionary::StringDictionary(const DictRef& dict_ref,
103
105
CHECK_EQ (size_t (0 ), (initial_capacity & (initial_capacity - 1 )));
104
106
}
105
107
108
+ StringDictionary::StringDictionary (std::shared_ptr<StringDictionary> base_dict,
109
+ const int64_t generation,
110
+ const bool materializeHashes,
111
+ size_t initial_capacity)
112
+ : dict_ref_(-1 , -1 )
113
+ , base_dict_(base_dict)
114
+ , base_generation_(generation >= 0 ? generation
115
+ : static_cast <int64_t >(base_dict->entryCount ()))
116
+ , str_count_(0 )
117
+ // Search code assumes non-empty table.
118
+ , string_id_uint32_table_(std::max(initial_capacity, (size_t )2), INVALID_STR_ID)
119
+ , hash_cache_(std::max(initial_capacity, (size_t )2))
120
+ , materialize_hashes_(materializeHashes)
121
+ , offset_map_(nullptr )
122
+ , payload_map_(nullptr )
123
+ , offset_file_size_(0 )
124
+ , payload_file_size_(0 )
125
+ , payload_file_off_(0 )
126
+ , strings_cache_(nullptr ) {}
127
+
106
128
namespace {
107
129
class MapMaker : public StringDictionary ::StringCallback {
108
130
std::unordered_map<std::string, int32_t > map_;
@@ -122,6 +144,7 @@ class MapMaker : public StringDictionary::StringCallback {
122
144
// Call serial_callback for each (string/_view, string_id). Must be called serially.
123
145
void StringDictionary::eachStringSerially (int64_t const generation,
124
146
StringCallback& serial_callback) const {
147
+ CHECK (!base_dict_) << " Not implemented" ;
125
148
size_t const n = std::min (static_cast <size_t >(generation), str_count_);
126
149
CHECK_LE (n, static_cast <size_t >(std::numeric_limits<int32_t >::max ()) + 1 );
127
150
mapd_shared_lock<mapd_shared_mutex> read_lock (rw_mutex_);
@@ -154,6 +177,12 @@ int32_t StringDictionary::getOrAdd(const std::string_view& str) noexcept {
154
177
}
155
178
CHECK (str.size () <= MAX_STRLEN);
156
179
const uint32_t hash = hash_string (str);
180
+ if (base_dict_) {
181
+ auto base_res = base_dict_->getIdOfString (str, hash);
182
+ if (base_res != INVALID_STR_ID && base_res < base_generation_) {
183
+ return base_res;
184
+ }
185
+ }
157
186
{
158
187
mapd_shared_lock<mapd_shared_mutex> read_lock (rw_mutex_);
159
188
const uint32_t bucket = computeBucket (hash, str, string_id_uint32_table_);
@@ -174,7 +203,7 @@ int32_t StringDictionary::getOrAdd(const std::string_view& str) noexcept {
174
203
<< " Maximum number (" << str_count_
175
204
<< " ) of Dictionary encoded Strings reached for this column" ;
176
205
appendToStorage (str);
177
- string_id_uint32_table_[bucket] = static_cast < int32_t > (str_count_);
206
+ string_id_uint32_table_[bucket] = indexToId (str_count_);
178
207
if (materialize_hashes_) {
179
208
hash_cache_[str_count_] = hash;
180
209
}
@@ -269,6 +298,7 @@ template <class T, class String>
269
298
size_t StringDictionary::getBulk (const std::vector<String>& string_vec,
270
299
T* encoded_vec,
271
300
const int64_t generation) const {
301
+ CHECK (!base_dict_) << " Not implemented" ;
272
302
constexpr int64_t target_strings_per_thread{1000 };
273
303
const int64_t num_lookup_strings = string_vec.size ();
274
304
if (num_lookup_strings == 0 ) {
@@ -358,6 +388,7 @@ template size_t StringDictionary::getBulk(const std::vector<std::string>& string
358
388
template <class T , class String >
359
389
void StringDictionary::getOrAddBulk (const std::vector<String>& input_strings,
360
390
T* output_string_ids) {
391
+ CHECK (!base_dict_) << " Not implemented" ;
361
392
if (g_enable_stringdict_parallel) {
362
393
getOrAddBulkParallel (input_strings, output_string_ids);
363
394
return ;
@@ -414,6 +445,7 @@ void StringDictionary::getOrAddBulk(const std::vector<String>& input_strings,
414
445
template <class T , class String >
415
446
void StringDictionary::getOrAddBulkParallel (const std::vector<String>& input_strings,
416
447
T* output_string_ids) {
448
+ CHECK (!base_dict_) << " Not implemented" ;
417
449
// Compute hashes of the input strings up front, and in parallel,
418
450
// as the string hashing does not need to be behind the subsequent write_lock
419
451
std::vector<uint32_t > input_strings_hashes (input_strings.size ());
@@ -504,42 +536,77 @@ template void StringDictionary::getOrAddBulk(
504
536
505
537
template <class String >
506
538
int32_t StringDictionary::getIdOfString (const String& str) const {
507
- mapd_shared_lock<mapd_shared_mutex> read_lock (rw_mutex_);
508
- return getUnlocked (str);
539
+ return getIdOfString (str, hash_string (str));
509
540
}
510
541
511
542
template int32_t StringDictionary::getIdOfString (const std::string&) const ;
512
543
template int32_t StringDictionary::getIdOfString (const std::string_view&) const ;
513
544
545
+ template <class String >
546
+ int32_t StringDictionary::getIdOfString (const String& str, const uint32_t hash) const {
547
+ mapd_shared_lock<mapd_shared_mutex> read_lock (rw_mutex_);
548
+ return getUnlocked (str, hash);
549
+ }
550
+
551
+ template int32_t StringDictionary::getIdOfString (const std::string&,
552
+ const uint32_t ) const ;
553
+ template int32_t StringDictionary::getIdOfString (const std::string_view&,
554
+ const uint32_t ) const ;
555
+
514
556
int32_t StringDictionary::getUnlocked (const std::string_view sv) const noexcept {
515
- const uint32_t hash = hash_string (sv);
557
+ return getUnlocked (sv, hash_string (sv));
558
+ }
559
+
560
+ int32_t StringDictionary::getUnlocked (const std::string_view sv,
561
+ const uint32_t hash) const noexcept {
562
+ if (base_dict_) {
563
+ auto base_res = base_dict_->getIdOfString (sv, hash);
564
+ if (base_res != INVALID_STR_ID && base_res < base_generation_) {
565
+ return base_res;
566
+ }
567
+ }
516
568
auto str_id = string_id_uint32_table_[computeBucket (hash, sv, string_id_uint32_table_)];
517
569
return str_id;
518
570
}
519
571
520
572
std::string StringDictionary::getString (int32_t string_id) const {
573
+ if (inline_int_null_value<int32_t >() == string_id) {
574
+ return " " ;
575
+ }
576
+ if (string_id < base_generation_) {
577
+ return base_dict_->getString (string_id);
578
+ }
521
579
mapd_shared_lock<mapd_shared_mutex> read_lock (rw_mutex_);
522
- return getStringUnlocked (string_id);
580
+ return getOwnedStringChecked (string_id);
523
581
}
524
582
525
583
std::string StringDictionary::getStringUnlocked (int32_t string_id) const noexcept {
526
- CHECK_LT (string_id, static_cast <int32_t >(str_count_));
527
- return getStringChecked (string_id);
584
+ if (string_id < base_generation_) {
585
+ return base_dict_->getString (string_id);
586
+ }
587
+ return getOwnedStringChecked (string_id);
528
588
}
529
589
530
590
std::pair<char *, size_t > StringDictionary::getStringBytes (
531
591
int32_t string_id) const noexcept {
592
+ if (string_id < base_generation_) {
593
+ return base_dict_->getStringBytes (string_id);
594
+ }
532
595
mapd_shared_lock<mapd_shared_mutex> read_lock (rw_mutex_);
533
596
CHECK_LE (0 , string_id);
534
- CHECK_LT (string_id, static_cast <int32_t >(str_count_));
535
- return getStringBytesChecked (string_id);
597
+ return getOwnedStringBytesChecked (string_id);
536
598
}
537
599
538
600
size_t StringDictionary::storageEntryCount () const {
539
601
mapd_shared_lock<mapd_shared_mutex> read_lock (rw_mutex_);
540
602
return str_count_;
541
603
}
542
604
605
+ size_t StringDictionary::entryCount () const {
606
+ mapd_shared_lock<mapd_shared_mutex> read_lock (rw_mutex_);
607
+ return str_count_ + base_generation_;
608
+ }
609
+
543
610
namespace {
544
611
545
612
bool is_like (const std::string& str,
@@ -571,6 +638,7 @@ std::vector<int32_t> StringDictionary::getLike(const std::string& pattern,
571
638
const bool is_simple,
572
639
const char escape,
573
640
const size_t generation) const {
641
+ CHECK (!base_dict_) << " Not implemented" ;
574
642
mapd_lock_guard<mapd_shared_mutex> write_lock (rw_mutex_);
575
643
const auto cache_key = std::make_tuple (pattern, icase, is_simple, escape);
576
644
const auto it = like_cache_.find (cache_key);
@@ -619,6 +687,7 @@ std::vector<int32_t> StringDictionary::getLike(const std::string& pattern,
619
687
std::vector<int32_t > StringDictionary::getEquals (std::string pattern,
620
688
std::string comp_operator,
621
689
size_t generation) {
690
+ CHECK (!base_dict_) << " Not implemented" ;
622
691
std::vector<int32_t > result;
623
692
auto eq_id_itr = equal_cache_.find (pattern);
624
693
int32_t eq_id = MAX_STRLEN + 1 ;
@@ -679,6 +748,7 @@ std::vector<int32_t> StringDictionary::getEquals(std::string pattern,
679
748
std::vector<int32_t > StringDictionary::getCompare (const std::string& pattern,
680
749
const std::string& comp_operator,
681
750
const size_t generation) {
751
+ CHECK (!base_dict_) << " Not implemented" ;
682
752
mapd_lock_guard<mapd_shared_mutex> write_lock (rw_mutex_);
683
753
std::vector<int32_t > ret;
684
754
if (str_count_ == 0 ) {
@@ -837,6 +907,7 @@ bool is_regexp_like(const std::string& str,
837
907
std::vector<int32_t > StringDictionary::getRegexpLike (const std::string& pattern,
838
908
const char escape,
839
909
const size_t generation) const {
910
+ CHECK (!base_dict_) << " Not implemented" ;
840
911
mapd_lock_guard<mapd_shared_mutex> write_lock (rw_mutex_);
841
912
const auto cache_key = std::make_pair (pattern, escape);
842
913
const auto it = regex_cache_.find (cache_key);
@@ -879,6 +950,7 @@ std::vector<int32_t> StringDictionary::getRegexpLike(const std::string& pattern,
879
950
}
880
951
881
952
std::vector<std::string> StringDictionary::copyStrings () const {
953
+ CHECK (!base_dict_) << " Not implemented" ;
882
954
mapd_lock_guard<mapd_shared_mutex> write_lock (rw_mutex_);
883
955
884
956
if (strings_cache_) {
@@ -936,15 +1008,15 @@ void StringDictionary::increaseHashTableCapacity() noexcept {
936
1008
for (size_t i = 0 ; i != str_count_; ++i) {
937
1009
const uint32_t hash = hash_cache_[i];
938
1010
const uint32_t bucket = computeUniqueBucketWithHash (hash, new_str_ids);
939
- new_str_ids[bucket] = i ;
1011
+ new_str_ids[bucket] = indexToId (i) ;
940
1012
}
941
1013
hash_cache_.resize (hash_cache_.size () * 2 );
942
1014
} else {
943
1015
for (size_t i = 0 ; i != str_count_; ++i) {
944
- const auto str = getStringChecked (i );
1016
+ const auto str = getOwnedStringChecked ( indexToId (i) );
945
1017
const uint32_t hash = hash_string (str);
946
1018
const uint32_t bucket = computeUniqueBucketWithHash (hash, new_str_ids);
947
- new_str_ids[bucket] = i ;
1019
+ new_str_ids[bucket] = indexToId (i) ;
948
1020
}
949
1021
}
950
1022
string_id_uint32_table_.swap (new_str_ids);
@@ -958,6 +1030,7 @@ void StringDictionary::increaseHashTableCapacityFromStorageAndMemory(
958
1030
const std::vector<String>& input_strings,
959
1031
const std::vector<size_t >& string_memory_ids,
960
1032
const std::vector<uint32_t >& input_strings_hashes) noexcept {
1033
+ CHECK (!base_dict_) << " Not implemented" ;
961
1034
std::vector<int32_t > new_str_ids (string_id_uint32_table_.size () * 2 , INVALID_STR_ID);
962
1035
if (materialize_hashes_) {
963
1036
for (size_t i = 0 ; i != str_count; ++i) {
@@ -968,7 +1041,7 @@ void StringDictionary::increaseHashTableCapacityFromStorageAndMemory(
968
1041
hash_cache_.resize (hash_cache_.size () * 2 );
969
1042
} else {
970
1043
for (size_t storage_idx = 0 ; storage_idx != storage_high_water_mark; ++storage_idx) {
971
- const auto storage_string = getStringChecked (storage_idx);
1044
+ const auto storage_string = getOwnedStringChecked (storage_idx);
972
1045
const uint32_t hash = hash_string (storage_string);
973
1046
const uint32_t bucket = computeUniqueBucketWithHash (hash, new_str_ids);
974
1047
new_str_ids[bucket] = storage_idx;
@@ -983,13 +1056,13 @@ void StringDictionary::increaseHashTableCapacityFromStorageAndMemory(
983
1056
string_id_uint32_table_.swap (new_str_ids);
984
1057
}
985
1058
986
- std::string StringDictionary::getStringChecked (const int string_id) const noexcept {
1059
+ std::string StringDictionary::getOwnedStringChecked (const int string_id) const noexcept {
987
1060
const auto str_canary = getStringFromStorage (string_id);
988
1061
CHECK (!str_canary.canary );
989
1062
return std::string (str_canary.c_str_ptr , str_canary.size );
990
1063
}
991
1064
992
- std::pair<char *, size_t > StringDictionary::getStringBytesChecked (
1065
+ std::pair<char *, size_t > StringDictionary::getOwnedStringBytesChecked (
993
1066
const int string_id) const noexcept {
994
1067
const auto str_canary = getStringFromStorage (string_id);
995
1068
CHECK (!str_canary.canary );
@@ -1009,7 +1082,7 @@ uint32_t StringDictionary::computeBucket(
1009
1082
INVALID_STR_ID) { // In this case it means the slot is available for use
1010
1083
break ;
1011
1084
}
1012
- if ((materialize_hashes_ && hash == hash_cache_[ candidate_string_id] ) ||
1085
+ if ((materialize_hashes_ && hash == hashById ( candidate_string_id) ) ||
1013
1086
!materialize_hashes_) {
1014
1087
const auto candidate_string = getStringFromStorageFast (candidate_string_id);
1015
1088
if (input_string.size () == candidate_string.size () &&
@@ -1034,6 +1107,7 @@ uint32_t StringDictionary::computeBucketFromStorageAndMemory(
1034
1107
const size_t storage_high_water_mark,
1035
1108
const std::vector<String>& input_strings,
1036
1109
const std::vector<size_t >& string_memory_ids) const noexcept {
1110
+ CHECK (!base_dict_) << " Not implemented" ;
1037
1111
uint32_t bucket = input_string_hash & (string_id_uint32_table.size () - 1 );
1038
1112
while (true ) {
1039
1113
const int32_t candidate_string_id = string_id_uint32_table[bucket];
@@ -1153,14 +1227,13 @@ void StringDictionary::appendToStorageBulk(
1153
1227
1154
1228
std::string_view StringDictionary::getStringFromStorageFast (
1155
1229
const int string_id) const noexcept {
1156
- const StringIdxEntry* str_meta = offset_map_ + string_id;
1230
+ const StringIdxEntry* str_meta = offset_map_ + idToIndex ( string_id) ;
1157
1231
return {payload_map_ + str_meta->off , str_meta->size };
1158
1232
}
1159
1233
1160
1234
StringDictionary::PayloadString StringDictionary::getStringFromStorage (
1161
1235
const int string_id) const noexcept {
1162
- CHECK_GE (string_id, 0 );
1163
- const StringIdxEntry* str_meta = offset_map_ + string_id;
1236
+ const StringIdxEntry* str_meta = offset_map_ + idToIndex (string_id);
1164
1237
if (str_meta->size == 0xffff ) {
1165
1238
// hit the canary
1166
1239
return {nullptr , 0 , true };
@@ -1213,6 +1286,7 @@ void StringDictionary::invalidateInvertedIndex() noexcept {
1213
1286
}
1214
1287
1215
1288
void StringDictionary::buildSortedCache () {
1289
+ CHECK (!base_dict_) << " Not implemented" ;
1216
1290
// This method is not thread-safe.
1217
1291
const auto cur_cache_size = sorted_cache.size ();
1218
1292
std::vector<int32_t > temp_sorted_cache;
@@ -1224,6 +1298,7 @@ void StringDictionary::buildSortedCache() {
1224
1298
}
1225
1299
1226
1300
void StringDictionary::sortCache (std::vector<int32_t >& cache) {
1301
+ CHECK (!base_dict_) << " Not implemented" ;
1227
1302
// This method is not thread-safe.
1228
1303
1229
1304
// this boost sort is creating some problems when we use UTF-8 encoded strings.
@@ -1237,6 +1312,7 @@ void StringDictionary::sortCache(std::vector<int32_t>& cache) {
1237
1312
}
1238
1313
1239
1314
void StringDictionary::mergeSortedCache (std::vector<int32_t >& temp_sorted_cache) {
1315
+ CHECK (!base_dict_) << " Not implemented" ;
1240
1316
// this method is not thread safe
1241
1317
std::vector<int32_t > updated_cache (temp_sorted_cache.size () + sorted_cache.size ());
1242
1318
size_t t_idx = 0 , s_idx = 0 , idx = 0 ;
@@ -1266,6 +1342,8 @@ std::vector<int32_t> StringDictionaryTranslator::buildDictionaryTranslationMap(
1266
1342
const std::shared_ptr<StringDictionary> source_dict,
1267
1343
const std::shared_ptr<StringDictionary> dest_dict,
1268
1344
StringLookupCallback const & dest_transient_lookup_callback) {
1345
+ CHECK (!source_dict->getBaseDictionary ());
1346
+ CHECK (!dest_dict->getBaseDictionary ());
1269
1347
auto timer = DEBUG_TIMER (__func__);
1270
1348
const size_t num_source_strings = source_dict->storageEntryCount ();
1271
1349
const size_t num_dest_strings = dest_dict->storageEntryCount ();
@@ -1290,6 +1368,8 @@ size_t StringDictionaryTranslator::buildDictionaryTranslationMap(
1290
1368
const int64_t dest_generation,
1291
1369
const bool dest_has_transients,
1292
1370
StringLookupCallback const & dest_transient_lookup_callback) {
1371
+ CHECK (!source_dict->getBaseDictionary ());
1372
+ CHECK (!dest_dict->getBaseDictionary ());
1293
1373
auto timer = DEBUG_TIMER (__func__);
1294
1374
CHECK_GE (source_generation, 0L );
1295
1375
CHECK_GE (dest_generation, 0L );
0 commit comments