@@ -1005,52 +1005,77 @@ std::vector<int32_t> StringDictionary::getRegexpLike(const std::string& pattern,
1005
1005
return result;
1006
1006
}
1007
1007
1008
// Diff hunk: the parameterless copyStrings() ('-' lines) is replaced by
// copyStrings(int64_t generation) ('+' lines). The bare numbers between code
// lines are the old/new line numbers carried over from the diff view.
// The removed version CHECKed that base_dict_ was unset and took rw_mutex_
// before touching strings_cache_.
- std::vector<std::string> StringDictionary::copyStrings () const {
1009
- CHECK (!base_dict_) << " Not implemented" ;
1010
- mapd_lock_guard<mapd_shared_mutex> write_lock (rw_mutex_);
1008
// Returns a copy of the first `generation` strings of the cache.
// strings_cache_ is created on first use and extended whenever it holds fewer
// than `generation` entries.
//
// @param generation  Number of leading strings wanted. A negative value
//                    selects entryCount() strings; a value above entryCount()
//                    is clamped down to entryCount().
// @return A new vector holding copies of strings_cache_[0, generation).
//
// NOTE(review): unlike the removed version, this function assigns and appends
// to strings_cache_ without taking rw_mutex_ itself (only the range overload
// locks, and only around its local reads) -- confirm callers serialize access.
+ std::vector<std::string> StringDictionary::copyStrings (int64_t generation) const {
1009
// Clamp the request into [0, entryCount()].
// NOTE(review): entryCount() is evaluated separately for the clamp, the
// reserve and the copy below; if entries can be added in between, these reads
// may disagree -- confirm.
+ generation = generation >= 0 ? std::min (generation, static_cast <int64_t >(entryCount ()))
1010
+ : static_cast <int64_t >(entryCount ());
1011
// No cache yet: build it from every entry, not just the first `generation`.
+ if (!strings_cache_) {
1012
+ strings_cache_ = std::make_shared<std::vector<std::string>>();
1013
+ strings_cache_->reserve (entryCount ());
1014
+ copyStrings (0 , entryCount (), *strings_cache_);
1015
// Cache is shorter than the request: append ids from its current size up to
// entryCount().
+ } else if (strings_cache_->size () < static_cast <size_t >(generation)) {
1016
+ auto start = strings_cache_->size ();
1017
+ strings_cache_->reserve (entryCount ());
1018
+ copyStrings (start, entryCount (), *strings_cache_);
1019
+ }
1011
1020
1012
- if (strings_cache_) {
1013
- return *strings_cache_;
1021
// Return by value a fresh vector built from the cache prefix, so the result
// is independent of later appends to the cache.
+ return std::vector<std::string>(strings_cache_->begin (),
1022
+ strings_cache_->begin () + generation);
1023
+ }
1024
+
1025
// Appends the strings with ids in [string_id_start, string_id_end) to out_vec.
// Ids below base_generation_ are delegated to base_dict_ when it is set; the
// remaining ids are read from this dictionary while holding rw_mutex_.
//
// @param string_id_start  First id to copy; CHECKed to be >= 0.
// @param string_id_end    One past the last id to copy; CHECKed to be
//                         <= entryCount().
// @param out_vec          Destination; strings are appended, existing
//                         contents are kept.
//
// The '-' lines interleaved below are the body of the removed parameterless
// copyStrings(), which filled strings_cache_ directly from ids [0, str_count_).
+ void StringDictionary::copyStrings (int64_t string_id_start,
1026
+ int64_t string_id_end,
1027
+ std::vector<std::string>& out_vec) const {
1028
+ CHECK_GE (string_id_start, 0 );
1029
+ CHECK_LE (string_id_end, static_cast <int64_t >(entryCount ()));
1030
+
1031
// The range starts inside the base dictionary: copy
// [string_id_start, min(base_generation_, string_id_end)) from it first, so
// the base strings land in out_vec ahead of the local ones.
+ if (base_dict_ && string_id_start < base_generation_) {
1032
+ base_dict_->copyStrings (
1033
+ string_id_start, std::min (base_generation_, string_id_end), out_vec);
1014
1034
}
1015
1035
1016
- strings_cache_ = std::make_shared<std::vector<std::string>>();
1017
- strings_cache_->reserve (str_count_);
1018
- const bool multithreaded = str_count_ > 10000 ;
1019
- const auto worker_count =
1020
- multithreaded ? static_cast <size_t >(cpu_threads ()) : size_t (1 );
1021
- CHECK_GT (worker_count, 0UL );
1022
- std::vector<std::vector<std::string>> worker_results (worker_count);
1036
// Ids at or above base_generation_ are served by this dictionary.
// NOTE(review): the start is clamped to base_generation_ even when base_dict_
// is null -- presumably base_generation_ is <= 0 in that case; confirm.
+ int64_t local_string_id_start = std::max (string_id_start, base_generation_);
1037
+ int64_t local_string_id_end = string_id_end;
// Nothing left to copy locally (the whole range was in the base dictionary,
// or the range is empty).
1038
+ if (local_string_id_start >= local_string_id_end) {
1039
+ return ;
1040
+ }
1041
+
1042
// Lock is taken only here, after the base_dict_ call above has returned, and
// is held for all local reads below (including the worker tasks).
+ mapd_lock_guard<mapd_shared_mutex> write_lock (rw_mutex_);
1043
// Fan out to worker tasks only when more than 10000 ids are requested.
+ const bool multithreaded = (local_string_id_end - local_string_id_start) > 10000 ;
1023
1044
// Appends the strings for ids [start_id, end_id) to str_list using
// getStringUnlocked (name suggests it expects the caller to hold the lock --
// confirm against its definition).
auto copy = [this ](std::vector<std::string>& str_list,
1024
- const size_t start_id,
1025
- const size_t end_id) {
1045
+ const int64_t start_id,
1046
+ const int64_t end_id) {
1026
1047
CHECK_LE (start_id, end_id);
1027
1048
str_list.reserve (end_id - start_id);
1028
- for (size_t string_id = start_id; string_id < end_id; ++string_id) {
1049
+ for (int64_t string_id = start_id; string_id < end_id; ++string_id) {
1029
1050
str_list.push_back (getStringUnlocked (string_id));
1030
1051
}
1031
1052
};
1032
1053
if (multithreaded) {
1054
// One result vector per worker, so no two tasks append to the same vector.
+ const auto worker_count = cpu_threads ();
1055
+ CHECK_GT (worker_count, 0 );
1056
+ std::vector<std::vector<std::string>> worker_results (worker_count);
1033
1057
std::vector<std::future<void >> workers;
1034
- const auto stride = (str_count_ + (worker_count - 1 )) / worker_count;
1035
- for (size_t worker_idx = 0 , start = 0 , end = std::min (start + stride, str_count_);
1036
- worker_idx < worker_count && start < str_count_;
1037
- ++worker_idx, start += stride, end = std::min (start + stride, str_count_)) {
1058
// Ceiling division: number of ids handed to each worker.
+ const auto stride =
1059
+ (local_string_id_end - local_string_id_start + (worker_count - 1 )) / worker_count;
// Worker i gets the i-th slice of `stride` ids; the final slice is clamped to
// local_string_id_end, and the loop stops early once the range is exhausted.
1060
+ for (int64_t worker_idx = 0 ,
1061
+ start = local_string_id_start,
1062
+ end = std::min (start + stride, local_string_id_end);
1063
+ worker_idx < worker_count && start < local_string_id_end;
1064
+ ++worker_idx,
1065
+ start += stride,
1066
+ end = std::min (start + stride, local_string_id_end)) {
1038
1067
workers.push_back (std::async (
1039
1068
std::launch::async, copy, std::ref (worker_results[worker_idx]), start, end));
1040
1069
}
// Wait for every task to finish before reading worker_results.
1041
1070
for (auto & worker : workers) {
1042
1071
worker.get ();
1043
1072
}
// Concatenate in worker-index order, which is ascending id order.
// NOTE(review): this copies every string although worker_results is discarded
// right after; moving the elements (e.g. std::make_move_iterator) and
// reserving out_vec once up front would avoid the extra copies/reallocations.
1073
+ for (const auto & worker_result : worker_results) {
1074
+ out_vec.insert (out_vec.end (), worker_result.begin (), worker_result.end ());
1075
+ }
1044
1076
} else {
1045
- CHECK_EQ (worker_results.size (), size_t (1 ));
1046
- copy (worker_results[0 ], 0 , str_count_);
1047
- }
1048
-
1049
- for (const auto & worker_result : worker_results) {
1050
- strings_cache_->insert (
1051
- strings_cache_->end (), worker_result.begin (), worker_result.end ());
1077
// Small range: append straight into out_vec on the calling thread.
+ copy (out_vec, local_string_id_start, local_string_id_end);
1052
1078
}
1053
- return *strings_cache_;
1054
1079
}
1055
1080
1056
1081
bool StringDictionary::fillRateIsHigh (const size_t num_strings) const noexcept {
0 commit comments