Skip to content
This repository was archived by the owner on May 9, 2024. It is now read-only.

Commit a38b9e1

Browse files
committed
Remove storage from StringDictionary
1 parent 60d60c4 commit a38b9e1

File tree

7 files changed

+106
-422
lines changed

7 files changed

+106
-422
lines changed

omniscidb/ArrowStorage/ArrowStorage.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -360,8 +360,8 @@ TableInfoPtr ArrowStorage::createTable(const std::string& table_name,
360360
int dict_id = addSchemaIdChecked(next_dict_id_++, schema_id_);
361361
auto dict_desc = std::make_unique<DictDescriptor>(
362362
db_id_, dict_id, col.name, 32, true, 1, table_name, true);
363-
dict_desc->stringDict = std::make_shared<StringDictionary>(
364-
DictRef{db_id_, dict_id}, table_name + "."s + col.name, true, false);
363+
dict_desc->stringDict =
364+
std::make_shared<StringDictionary>(DictRef{db_id_, dict_id});
365365
if (sharing_id < 0) {
366366
dict_ids.emplace(sharing_id, dict_id);
367367
}

omniscidb/ResultSet/RowSetMemoryOwner.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ StringDictionaryProxy* RowSetMemoryOwner::getOrAddStringDictProxy(
2323
CHECK_EQ(dict_id, DictRef::literalsDictId);
2424
if (!lit_str_dict_proxy_) {
2525
DictRef literal_dict_ref(DictRef::invalidDbId, DictRef::literalsDictId);
26-
std::shared_ptr<StringDictionary> tsd = std::make_shared<StringDictionary>(
27-
literal_dict_ref, "", false, true, g_cache_string_hash);
26+
std::shared_ptr<StringDictionary> tsd =
27+
std::make_shared<StringDictionary>(literal_dict_ref, g_cache_string_hash);
2828
lit_str_dict_proxy_ =
2929
std::make_shared<StringDictionaryProxy>(tsd, literal_dict_ref.dictId, 0);
3030
}

omniscidb/StringDictionary/StringDictionary.cpp

Lines changed: 11 additions & 205 deletions
Original file line numberDiff line numberDiff line change
@@ -50,17 +50,6 @@ namespace {
5050

5151
const int SYSTEM_PAGE_SIZE = omnisci::get_page_size();
5252

53-
int checked_open(const char* path, const bool recover) {
54-
auto fd = omnisci::open(path, O_RDWR | O_CREAT | (recover ? O_APPEND : O_TRUNC), 0644);
55-
if (fd > 0) {
56-
return fd;
57-
}
58-
auto err = std::string("Dictionary path ") + std::string(path) +
59-
std::string(" does not exist.");
60-
LOG(ERROR) << err;
61-
throw DictPayloadUnavailable(err);
62-
}
63-
6453
const uint64_t round_up_p2(const uint64_t num) {
6554
uint64_t in = num;
6655
in--;
@@ -111,124 +100,21 @@ constexpr size_t StringDictionary::MAX_STRLEN;
111100
constexpr size_t StringDictionary::MAX_STRCOUNT;
112101

113102
StringDictionary::StringDictionary(const DictRef& dict_ref,
114-
const std::string& folder,
115-
const bool isTemp,
116-
const bool recover,
117103
const bool materializeHashes,
118104
size_t initial_capacity)
119105
: dict_ref_(dict_ref)
120-
, folder_(folder)
121106
, str_count_(0)
122107
, string_id_string_dict_hash_table_(initial_capacity, INVALID_STR_ID)
123108
, hash_cache_(initial_capacity)
124-
, isTemp_(isTemp)
125109
, materialize_hashes_(materializeHashes)
126-
, payload_fd_(-1)
127-
, offset_fd_(-1)
128110
, offset_map_(nullptr)
129111
, payload_map_(nullptr)
130112
, offset_file_size_(0)
131113
, payload_file_size_(0)
132114
, payload_file_off_(0)
133115
, strings_cache_(nullptr) {
134-
if (!isTemp && folder.empty()) {
135-
return;
136-
}
137-
138116
// initial capacity must be a power of two for efficient bucket computation
139117
CHECK_EQ(size_t(0), (initial_capacity & (initial_capacity - 1)));
140-
if (!isTemp_) {
141-
boost::filesystem::path storage_path(folder);
142-
offsets_path_ = (storage_path / boost::filesystem::path("DictOffsets")).string();
143-
const auto payload_path =
144-
(storage_path / boost::filesystem::path("DictPayload")).string();
145-
payload_fd_ = checked_open(payload_path.c_str(), recover);
146-
offset_fd_ = checked_open(offsets_path_.c_str(), recover);
147-
payload_file_size_ = omnisci::file_size(payload_fd_);
148-
offset_file_size_ = omnisci::file_size(offset_fd_);
149-
}
150-
bool storage_is_empty = false;
151-
if (payload_file_size_ == 0) {
152-
storage_is_empty = true;
153-
addPayloadCapacity();
154-
}
155-
if (offset_file_size_ == 0) {
156-
addOffsetCapacity();
157-
}
158-
if (!isTemp_) { // we never mmap or recover temp dictionaries
159-
payload_map_ =
160-
reinterpret_cast<char*>(omnisci::checked_mmap(payload_fd_, payload_file_size_));
161-
offset_map_ = reinterpret_cast<StringIdxEntry*>(
162-
omnisci::checked_mmap(offset_fd_, offset_file_size_));
163-
if (recover) {
164-
const size_t bytes = omnisci::file_size(offset_fd_);
165-
if (bytes % sizeof(StringIdxEntry) != 0) {
166-
LOG(WARNING) << "Offsets " << offsets_path_ << " file is truncated";
167-
}
168-
const uint64_t str_count =
169-
storage_is_empty ? 0 : getNumStringsFromStorage(bytes / sizeof(StringIdxEntry));
170-
collisions_ = 0;
171-
// at this point we know the size of the StringDict we need to load
172-
// so lets reallocate the vector to the correct size
173-
const uint64_t max_entries =
174-
std::max(round_up_p2(str_count * 2 + 1),
175-
round_up_p2(std::max(initial_capacity, static_cast<size_t>(1))));
176-
std::vector<int32_t> new_str_ids(max_entries, INVALID_STR_ID);
177-
string_id_string_dict_hash_table_.swap(new_str_ids);
178-
if (materialize_hashes_) {
179-
std::vector<string_dict_hash_t> new_hash_cache(max_entries / 2);
180-
hash_cache_.swap(new_hash_cache);
181-
}
182-
// Bail early if we know we don't have strings to add (i.e. a new or empty
183-
// dictionary)
184-
if (str_count == 0) {
185-
return;
186-
}
187-
188-
unsigned string_id = 0;
189-
mapd_lock_guard<mapd_shared_mutex> write_lock(rw_mutex_);
190-
191-
uint32_t thread_inits = 0;
192-
const auto thread_count = std::thread::hardware_concurrency();
193-
const uint32_t items_per_thread = std::max<uint32_t>(
194-
2000, std::min<uint32_t>(200000, (str_count / thread_count) + 1));
195-
std::vector<std::future<std::vector<std::pair<string_dict_hash_t, unsigned int>>>>
196-
dictionary_futures;
197-
for (string_id = 0; string_id < str_count; string_id += items_per_thread) {
198-
dictionary_futures.emplace_back(std::async(
199-
std::launch::async, [string_id, str_count, items_per_thread, this] {
200-
std::vector<std::pair<string_dict_hash_t, unsigned int>> hashVec;
201-
for (uint32_t curr_id = string_id;
202-
curr_id < string_id + items_per_thread && curr_id < str_count;
203-
curr_id++) {
204-
const auto recovered = getStringFromStorage(curr_id);
205-
if (recovered.canary) {
206-
// hit the canary, recovery finished
207-
break;
208-
} else {
209-
std::string_view temp(recovered.c_str_ptr, recovered.size);
210-
hashVec.emplace_back(std::make_pair(hash_string(temp), temp.size()));
211-
}
212-
}
213-
return hashVec;
214-
}));
215-
thread_inits++;
216-
if (thread_inits % thread_count == 0) {
217-
processDictionaryFutures(dictionary_futures);
218-
}
219-
}
220-
// gather last few threads
221-
if (dictionary_futures.size() != 0) {
222-
processDictionaryFutures(dictionary_futures);
223-
}
224-
VLOG(1) << "Opened string dictionary " << folder << " # Strings: " << str_count_
225-
<< " Hash table size: " << string_id_string_dict_hash_table_.size()
226-
<< " Fill rate: "
227-
<< static_cast<double>(str_count_) * 100.0 /
228-
string_id_string_dict_hash_table_.size()
229-
<< "% Collisions: " << collisions_;
230-
}
231-
}
232118
}
233119

234120
namespace {
@@ -319,19 +205,9 @@ size_t StringDictionary::getNumStringsFromStorage(
319205
StringDictionary::~StringDictionary() noexcept {
320206
free(CANARY_BUFFER);
321207
if (payload_map_) {
322-
if (!isTemp_) {
323-
CHECK(offset_map_);
324-
omnisci::checked_munmap(payload_map_, payload_file_size_);
325-
omnisci::checked_munmap(offset_map_, offset_file_size_);
326-
CHECK_GE(payload_fd_, 0);
327-
omnisci::close(payload_fd_);
328-
CHECK_GE(offset_fd_, 0);
329-
omnisci::close(offset_fd_);
330-
} else {
331-
CHECK(offset_map_);
332-
free(payload_map_);
333-
free(offset_map_);
334-
}
208+
CHECK(offset_map_);
209+
free(payload_map_);
210+
free(offset_map_);
335211
}
336212
}
337213

@@ -1318,17 +1194,8 @@ void StringDictionary::checkAndConditionallyIncreasePayloadCapacity(
13181194
if (payload_file_off_ + write_length > payload_file_size_) {
13191195
const size_t min_capacity_needed =
13201196
write_length - (payload_file_size_ - payload_file_off_);
1321-
if (!isTemp_) {
1322-
CHECK_GE(payload_fd_, 0);
1323-
omnisci::checked_munmap(payload_map_, payload_file_size_);
1324-
addPayloadCapacity(min_capacity_needed);
1325-
CHECK(payload_file_off_ + write_length <= payload_file_size_);
1326-
payload_map_ =
1327-
reinterpret_cast<char*>(omnisci::checked_mmap(payload_fd_, payload_file_size_));
1328-
} else {
1329-
addPayloadCapacity(min_capacity_needed);
1330-
CHECK(payload_file_off_ + write_length <= payload_file_size_);
1331-
}
1197+
addPayloadCapacity(min_capacity_needed);
1198+
CHECK(payload_file_off_ + write_length <= payload_file_size_);
13321199
}
13331200
}
13341201

@@ -1338,17 +1205,8 @@ void StringDictionary::checkAndConditionallyIncreaseOffsetCapacity(
13381205
if (offset_file_off + write_length >= offset_file_size_) {
13391206
const size_t min_capacity_needed =
13401207
write_length - (offset_file_size_ - offset_file_off);
1341-
if (!isTemp_) {
1342-
CHECK_GE(offset_fd_, 0);
1343-
omnisci::checked_munmap(offset_map_, offset_file_size_);
1344-
addOffsetCapacity(min_capacity_needed);
1345-
CHECK(offset_file_off + write_length <= offset_file_size_);
1346-
offset_map_ = reinterpret_cast<StringIdxEntry*>(
1347-
omnisci::checked_mmap(offset_fd_, offset_file_size_));
1348-
} else {
1349-
addOffsetCapacity(min_capacity_needed);
1350-
CHECK(offset_file_off + write_length <= offset_file_size_);
1351-
}
1208+
addOffsetCapacity(min_capacity_needed);
1209+
CHECK(offset_file_off + write_length <= offset_file_size_);
13521210
}
13531211
}
13541212

@@ -1395,10 +1253,6 @@ std::string_view StringDictionary::getStringFromStorageFast(
13951253

13961254
StringDictionary::PayloadString StringDictionary::getStringFromStorage(
13971255
const int string_id) const noexcept {
1398-
if (!isTemp_) {
1399-
CHECK_GE(payload_fd_, 0);
1400-
CHECK_GE(offset_fd_, 0);
1401-
}
14021256
CHECK_GE(string_id, 0);
14031257
const StringIdxEntry* str_meta = offset_map_ + string_id;
14041258
if (str_meta->size == 0xffff) {
@@ -1409,42 +1263,13 @@ StringDictionary::PayloadString StringDictionary::getStringFromStorage(
14091263
}
14101264

14111265
void StringDictionary::addPayloadCapacity(const size_t min_capacity_requested) noexcept {
1412-
if (!isTemp_) {
1413-
payload_file_size_ += addStorageCapacity(payload_fd_, min_capacity_requested);
1414-
} else {
1415-
payload_map_ = static_cast<char*>(
1416-
addMemoryCapacity(payload_map_, payload_file_size_, min_capacity_requested));
1417-
}
1266+
payload_map_ = static_cast<char*>(
1267+
addMemoryCapacity(payload_map_, payload_file_size_, min_capacity_requested));
14181268
}
14191269

14201270
void StringDictionary::addOffsetCapacity(const size_t min_capacity_requested) noexcept {
1421-
if (!isTemp_) {
1422-
offset_file_size_ += addStorageCapacity(offset_fd_, min_capacity_requested);
1423-
} else {
1424-
offset_map_ = static_cast<StringIdxEntry*>(
1425-
addMemoryCapacity(offset_map_, offset_file_size_, min_capacity_requested));
1426-
}
1427-
}
1428-
1429-
size_t StringDictionary::addStorageCapacity(
1430-
int fd,
1431-
const size_t min_capacity_requested) noexcept {
1432-
const size_t canary_buff_size_to_add =
1433-
std::max(static_cast<size_t>(1024 * SYSTEM_PAGE_SIZE),
1434-
(min_capacity_requested / SYSTEM_PAGE_SIZE + 1) * SYSTEM_PAGE_SIZE);
1435-
1436-
if (canary_buffer_size < canary_buff_size_to_add) {
1437-
CANARY_BUFFER = static_cast<char*>(realloc(CANARY_BUFFER, canary_buff_size_to_add));
1438-
canary_buffer_size = canary_buff_size_to_add;
1439-
CHECK(CANARY_BUFFER);
1440-
memset(CANARY_BUFFER, 0xff, canary_buff_size_to_add);
1441-
}
1442-
1443-
CHECK_NE(lseek(fd, 0, SEEK_END), -1);
1444-
const auto write_return = write(fd, CANARY_BUFFER, canary_buff_size_to_add);
1445-
CHECK(write_return > 0 &&
1446-
(static_cast<size_t>(write_return) == canary_buff_size_to_add));
1447-
return canary_buff_size_to_add;
1271+
offset_map_ = static_cast<StringIdxEntry*>(
1272+
addMemoryCapacity(offset_map_, offset_file_size_, min_capacity_requested));
14481273
}
14491274

14501275
void* StringDictionary::addMemoryCapacity(void* addr,
@@ -1481,22 +1306,6 @@ void StringDictionary::invalidateInvertedIndex() noexcept {
14811306
compare_cache_.invalidateInvertedIndex();
14821307
}
14831308

1484-
// TODO 5 Mar 2021 Nothing will undo the writes to dictionary currently on a failed
1485-
// load. The next write to the dictionary that does checkpoint will make the
1486-
// uncheckpointed data be written to disk. Only option is a table truncate, and thats
1487-
// assuming not replicated dictionary
1488-
bool StringDictionary::checkpoint() noexcept {
1489-
CHECK(!isTemp_);
1490-
bool ret = true;
1491-
ret = ret &&
1492-
(omnisci::msync((void*)offset_map_, offset_file_size_, /*async=*/false) == 0);
1493-
ret = ret &&
1494-
(omnisci::msync((void*)payload_map_, payload_file_size_, /*async=*/false) == 0);
1495-
ret = ret && (omnisci::fsync(offset_fd_) == 0);
1496-
ret = ret && (omnisci::fsync(payload_fd_) == 0);
1497-
return ret;
1498-
}
1499-
15001309
void StringDictionary::buildSortedCache() {
15011310
// This method is not thread-safe.
15021311
const auto cur_cache_size = sorted_cache.size();
@@ -1703,9 +1512,6 @@ size_t StringDictionary::buildDictionaryTranslationMap(
17031512
return 0;
17041513
}
17051514

1706-
// Sort this/source dict and dest dict on folder_ so we can enforce
1707-
// lock ordering and avoid deadlocks
1708-
17091515
const int32_t dest_db_id = dest_dict->getDbId();
17101516
const int32_t dest_dict_id = dest_dict->getDictId();
17111517
if (getDbId() == dest_db_id && getDictId() == dest_dict_id) {

omniscidb/StringDictionary/StringDictionary.h

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,6 @@ using StringLookupCallback = std::function<bool(std::string_view, int32_t string
4545
class StringDictionary {
4646
public:
4747
StringDictionary(const DictRef& dict_ref,
48-
const std::string& folder,
49-
const bool isTemp,
50-
const bool recover,
5148
const bool materializeHashes = false,
5249
size_t initial_capacity = 256);
5350
~StringDictionary() noexcept;
@@ -119,8 +116,6 @@ class StringDictionary {
119116
const bool dest_has_transients,
120117
StringLookupCallback const& dest_transient_lookup_callback) const;
121118

122-
bool checkpoint() noexcept;
123-
124119
/**
125120
* @brief Populates provided \p dest_ids vector with string ids corresponding to given
126121
* source strings
@@ -226,7 +221,6 @@ class StringDictionary {
226221
std::string_view getStringFromStorageFast(const int string_id) const noexcept;
227222
void addPayloadCapacity(const size_t min_capacity_requested = 0) noexcept;
228223
void addOffsetCapacity(const size_t min_capacity_requested = 0) noexcept;
229-
size_t addStorageCapacity(int fd, const size_t min_capacity_requested = 0) noexcept;
230224
void* addMemoryCapacity(void* addr,
231225
size_t& mem_size,
232226
const size_t min_capacity_requested = 0) noexcept;
@@ -241,7 +235,6 @@ class StringDictionary {
241235
compare_cache_value_t* binary_search_cache(const std::string& pattern) const;
242236

243237
const DictRef dict_ref_;
244-
const std::string folder_;
245238
size_t str_count_;
246239
size_t collisions_;
247240
std::vector<int32_t> string_id_string_dict_hash_table_;
@@ -250,8 +243,6 @@ class StringDictionary {
250243
bool isTemp_;
251244
bool materialize_hashes_;
252245
std::string offsets_path_;
253-
int payload_fd_;
254-
int offset_fd_;
255246
StringIdxEntry* offset_map_;
256247
char* payload_map_;
257248
size_t offset_file_size_;

omniscidb/Tests/ResultSetTest.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -879,8 +879,7 @@ bool approx_eq(const double v, const double target, const double eps = EPS) {
879879
return target - eps < v && v < target + eps;
880880
}
881881

882-
std::shared_ptr<StringDictionary> g_sd =
883-
std::make_shared<StringDictionary>(DictRef(), "", false, true);
882+
std::shared_ptr<StringDictionary> g_sd = std::make_shared<StringDictionary>(DictRef());
884883

885884
void test_iterate(const std::vector<TargetInfo>& target_infos,
886885
const QueryMemoryDescriptor& query_mem_desc) {

0 commit comments

Comments
 (0)