@@ -144,26 +144,6 @@ void StringDictionary::eachStringSerially(int64_t const generation,
144
144
}
145
145
}
146
146
147
- void StringDictionary::processDictionaryFutures (
148
- std::vector<std::future<std::vector<std::pair<string_dict_hash_t , unsigned int >>>>&
149
- dictionary_futures) {
150
- for (auto & dictionary_future : dictionary_futures) {
151
- dictionary_future.wait ();
152
- const auto hashVec = dictionary_future.get ();
153
- for (const auto & hash : hashVec) {
154
- const uint32_t bucket =
155
- computeUniqueBucketWithHash (hash.first , string_id_string_dict_hash_table_);
156
- payload_file_off_ += hash.second ;
157
- string_id_string_dict_hash_table_[bucket] = static_cast <int32_t >(str_count_);
158
- if (materialize_hashes_) {
159
- hash_cache_[str_count_] = hash.first ;
160
- }
161
- ++str_count_;
162
- }
163
- }
164
- dictionary_futures.clear ();
165
- }
166
-
167
147
int32_t StringDictionary::getDbId () const noexcept {
168
148
return dict_ref_.dbId ;
169
149
}
@@ -172,36 +152,6 @@ int32_t StringDictionary::getDictId() const noexcept {
172
152
return dict_ref_.dictId ;
173
153
}
174
154
175
- /* *
176
- * Method to retrieve number of strings in storage via a binary search for the first
177
- * canary
178
- * @param storage_slots number of storage entries we should search to find the minimum
179
- * canary
180
- * @return number of strings in storage
181
- */
182
- size_t StringDictionary::getNumStringsFromStorage (
183
- const size_t storage_slots) const noexcept {
184
- if (storage_slots == 0 ) {
185
- return 0 ;
186
- }
187
- // Must use signed integers since final binary search step can wrap to max size_t value
188
- // if dictionary is empty
189
- int64_t min_bound = 0 ;
190
- int64_t max_bound = storage_slots - 1 ;
191
- int64_t guess{0 };
192
- while (min_bound <= max_bound) {
193
- guess = (max_bound + min_bound) / 2 ;
194
- CHECK_GE (guess, 0 );
195
- if (getStringFromStorage (guess).canary ) {
196
- max_bound = guess - 1 ;
197
- } else {
198
- min_bound = guess + 1 ;
199
- }
200
- }
201
- CHECK_GE (guess + (min_bound > guess ? 1 : 0 ), 0 );
202
- return guess + (min_bound > guess ? 1 : 0 );
203
- }
204
-
205
155
StringDictionary::~StringDictionary () noexcept {
206
156
free (CANARY_BUFFER);
207
157
if (payload_map_) {
@@ -236,9 +186,7 @@ int32_t StringDictionary::getOrAdd(const std::string_view& str) noexcept {
236
186
if (string_id_string_dict_hash_table_[bucket] == INVALID_STR_ID) {
237
187
CHECK_LT (str_count_, MAX_STRCOUNT)
238
188
<< " Maximum number (" << str_count_
239
- << " ) of Dictionary encoded Strings reached for this column, offset path "
240
- " for column is "
241
- << offsets_path_;
189
+ << " ) of Dictionary encoded Strings reached for this column" ;
242
190
appendToStorage (str);
243
191
string_id_string_dict_hash_table_[bucket] = static_cast <int32_t >(str_count_);
244
192
if (materialize_hashes_) {
@@ -473,8 +421,7 @@ void StringDictionary::getOrAddBulk(const std::vector<String>& input_strings,
473
421
CHECK_LT (str_count_, MAX_STRCOUNT)
474
422
<< " Maximum number (" << str_count_
475
423
<< " ) of Dictionary encoded Strings reached for this column, offset path "
476
- " for column is "
477
- << offsets_path_;
424
+ " for column is" ;
478
425
if (fillRateIsHigh (str_count_)) {
479
426
// resize when more than 50% is full
480
427
increaseHashTableCapacity ();
@@ -557,9 +504,7 @@ void StringDictionary::getOrAddBulkParallel(const std::vector<String>& input_str
557
504
CHECK_LT (shadow_str_count, MAX_STRCOUNT)
558
505
<< " Maximum number (" << shadow_str_count
559
506
<< " ) of Dictionary encoded Strings reached for this column, offset path "
560
- " for column is "
561
- << offsets_path_;
562
-
507
+ " for column is " ;
563
508
string_memory_ids.push_back (input_string_idx);
564
509
sum_new_string_lengths += input_string.size ();
565
510
string_id_string_dict_hash_table_[hash_bucket] =
@@ -1354,71 +1299,6 @@ void StringDictionary::mergeSortedCache(std::vector<int32_t>& temp_sorted_cache)
1354
1299
sorted_cache.swap (updated_cache);
1355
1300
}
1356
1301
1357
- void StringDictionary::populate_string_ids (
1358
- std::vector<int32_t >& dest_ids,
1359
- StringDictionary* dest_dict,
1360
- const std::vector<int32_t >& source_ids,
1361
- const StringDictionary* source_dict,
1362
- const std::vector<std::string const *>& transient_string_vec) {
1363
- std::vector<std::string> strings;
1364
-
1365
- for (const int32_t source_id : source_ids) {
1366
- if (source_id == std::numeric_limits<int32_t >::min ()) {
1367
- strings.emplace_back (" " );
1368
- } else if (source_id < 0 ) {
1369
- unsigned const string_index = StringDictionaryProxy::transientIdToIndex (source_id);
1370
- CHECK_LT (string_index, transient_string_vec.size ()) << " source_id=" << source_id;
1371
- strings.emplace_back (*transient_string_vec[string_index]);
1372
- } else {
1373
- strings.push_back (source_dict->getString (source_id));
1374
- }
1375
- }
1376
-
1377
- dest_ids.resize (strings.size ());
1378
- dest_dict->getOrAddBulk (strings, &dest_ids[0 ]);
1379
- }
1380
-
1381
- void StringDictionary::populate_string_array_ids (
1382
- std::vector<std::vector<int32_t >>& dest_array_ids,
1383
- StringDictionary* dest_dict,
1384
- const std::vector<std::vector<int32_t >>& source_array_ids,
1385
- const StringDictionary* source_dict) {
1386
- dest_array_ids.resize (source_array_ids.size ());
1387
-
1388
- std::atomic<size_t > row_idx{0 };
1389
- auto processor = [&row_idx, &dest_array_ids, dest_dict, &source_array_ids, source_dict](
1390
- int thread_id) {
1391
- for (;;) {
1392
- auto row = row_idx.fetch_add (1 );
1393
-
1394
- if (row >= dest_array_ids.size ()) {
1395
- return ;
1396
- }
1397
- const auto & source_ids = source_array_ids[row];
1398
- auto & dest_ids = dest_array_ids[row];
1399
- populate_string_ids (dest_ids, dest_dict, source_ids, source_dict);
1400
- }
1401
- };
1402
-
1403
- const int num_worker_threads = std::thread::hardware_concurrency ();
1404
-
1405
- if (source_array_ids.size () / num_worker_threads > 10 ) {
1406
- std::vector<std::future<void >> worker_threads;
1407
- for (int i = 0 ; i < num_worker_threads; ++i) {
1408
- worker_threads.push_back (std::async (std::launch::async, processor, i));
1409
- }
1410
-
1411
- for (auto & child : worker_threads) {
1412
- child.wait ();
1413
- }
1414
- for (auto & child : worker_threads) {
1415
- child.get ();
1416
- }
1417
- } else {
1418
- processor (0 );
1419
- }
1420
- }
1421
-
1422
1302
std::vector<std::string_view> StringDictionary::getStringViews (
1423
1303
const size_t generation) const {
1424
1304
auto timer = DEBUG_TIMER (__func__);
0 commit comments