Skip to content

Commit 0ee679c

Browse files
Mark inoperable cache as corrupted (#1203)
A cache can become inoperable for several reasons: (1) an interrupted compaction leaves files at level-0, which leads to excessive memory usage during read operations; (2) various IO errors in the protected cache cannot be fixed with RepairDB because that cache is read-only (for the mutable cache a repair is attempted). Do not remove a corrupted cache; just notify the user about it. Relates-To: OLPSUP-14088 Signed-off-by: Andrey Kashcheev <[email protected]>
1 parent 8c1dad3 commit 0ee679c

File tree

4 files changed

+83
-19
lines changed

4 files changed

+83
-19
lines changed

olp-cpp-sdk-core/include/olp/core/cache/DefaultCache.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,10 @@ class CORE_API DefaultCache : public KeyValueCache {
6060
* @brief The storage open result type.
6161
*/
6262
enum StorageOpenResult {
63-
Success, /*!< The operation succeeded. */
64-
OpenDiskPathFailure, /*!< The disk cache failure. */
65-
NotReady /*!< The DefaultCache is closed. */
63+
Success, /*!< The operation succeeded. */
64+
OpenDiskPathFailure, /*!< The disk cache failure. */
65+
ProtectedCacheCorrupted, /*!< The protected disk cache is corrupted. */
66+
NotReady /*!< The DefaultCache is closed. */
6667
};
6768

6869
/**
@@ -140,6 +141,10 @@ class CORE_API DefaultCache : public KeyValueCache {
140141
* operation in parallel for the time of the compacting operation. Be aware
141142
* that automatic asynchronous compacting operation is triggered internally
142143
* once the database size exceeds the CacheSettings::max_disk_storage size.
144+
*
145+
* @note After the compaction finishes, the cache is checked for remaining
146+
* level-0 files. If any are still present, another round of compaction
147+
* is performed.
143148
*/
144149
void Compact();
145150

olp-cpp-sdk-core/src/cache/DefaultCacheImpl.cpp

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131

3232
namespace {
3333
using CacheType = olp::cache::DefaultCache::CacheType;
34+
using StorageOpenResult = olp::cache::DefaultCache::StorageOpenResult;
3435

3536
constexpr auto kLogTag = "DefaultCache";
3637
constexpr auto kExpirySuffix = "::expiry";
@@ -119,6 +120,21 @@ int64_t GetElapsedTime(std::chrono::steady_clock::time_point start) {
119120
bool IsInternalKey(const std::string& key) {
120121
return key.find(kInternalKeysPrefix) == 0u;
121122
}
123+
124+
/// Translates the low-level disk cache open status into the public
/// DefaultCache::StorageOpenResult value.
///
/// @param input The status returned by the disk cache open call.
///
/// @return OpenDiskPathFailure for OpenResult::Fail, ProtectedCacheCorrupted
/// for OpenResult::Corrupted, and Success for both OpenResult::Repaired and
/// OpenResult::Success. Any value outside of the known enumerators is reported
/// as OpenDiskPathFailure.
olp::cache::DefaultCache::StorageOpenResult ToStorageOpenResult(
    olp::cache::OpenResult input) {
  switch (input) {
    case olp::cache::OpenResult::Fail:
      return StorageOpenResult::OpenDiskPathFailure;
    case olp::cache::OpenResult::Corrupted:
      return StorageOpenResult::ProtectedCacheCorrupted;
    case olp::cache::OpenResult::Repaired:
    case olp::cache::OpenResult::Success:
      return StorageOpenResult::Success;
  }

  // No `default:` label above, so the compiler can still warn when a new
  // OpenResult enumerator is added. An unrecognized status must not be
  // reported as a successful open (a value-initialized result would be
  // Success), so fail closed here.
  return StorageOpenResult::OpenDiskPathFailure;
}
122138
} // namespace
123139

124140
namespace olp {
@@ -817,13 +833,13 @@ DefaultCache::StorageOpenResult DefaultCacheImpl::SetupProtectedCache() {
817833
auto status = protected_cache_->Open(
818834
settings_.disk_path_protected.get(), settings_.disk_path_protected.get(),
819835
protected_storage_settings, OpenOptions::ReadOnly);
820-
if (status == OpenResult::Fail) {
821-
OLP_SDK_LOG_ERROR_F(kLogTag, "Failed to reopen protected cache %s",
836+
if (status != OpenResult::Success) {
837+
OLP_SDK_LOG_ERROR_F(kLogTag, "Failed to open protected cache %s",
822838
settings_.disk_path_protected.get().c_str());
823839

824840
protected_cache_.reset();
825841
settings_.disk_path_protected = boost::none;
826-
return DefaultCache::OpenDiskPathFailure;
842+
return ToStorageOpenResult(status);
827843
}
828844

829845
return DefaultCache::Success;
@@ -836,13 +852,15 @@ DefaultCache::StorageOpenResult DefaultCacheImpl::SetupMutableCache() {
836852
auto status = mutable_cache_->Open(settings_.disk_path_mutable.get(),
837853
settings_.disk_path_mutable.get(),
838854
storage_settings, OpenOptions::Default);
839-
if (status == OpenResult::Fail) {
855+
if (status == OpenResult::Repaired) {
856+
OLP_SDK_LOG_INFO(kLogTag, "Mutable cache was repaired");
857+
} else if (status == OpenResult::Fail) {
840858
OLP_SDK_LOG_ERROR_F(kLogTag, "Failed to open the mutable cache %s",
841859
settings_.disk_path_mutable.get().c_str());
842860

843861
mutable_cache_.reset();
844862
settings_.disk_path_mutable = boost::none;
845-
return DefaultCache::OpenDiskPathFailure;
863+
return StorageOpenResult::OpenDiskPathFailure;
846864
}
847865

848866
// read protected keys

olp-cpp-sdk-core/src/cache/DiskCache.cpp

Lines changed: 48 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ namespace cache {
4242

4343
namespace {
4444
constexpr auto kLogTag = "DiskCache";
45+
constexpr auto kLevelDbLostFolder = "/lost";
4546

4647
leveldb::Slice ToLeveldbSlice(const std::string& slice) {
4748
return leveldb::Slice(slice);
@@ -52,7 +53,13 @@ static bool RepairCache(const std::string& data_path) {
5253
auto status = leveldb::RepairDB(data_path, leveldb::Options());
5354
if (status.ok()) {
5455
OLP_SDK_LOG_INFO(kLogTag, "RepairCache: repaired - " << data_path);
55-
DiskCacheEnv::Env()->DeleteDir(data_path + "/lost");
56+
const auto lost_folder_path = data_path + kLevelDbLostFolder;
57+
if (utils::Dir::Exists(lost_folder_path)) {
58+
OLP_SDK_LOG_INFO_F(
59+
kLogTag, "RepairCache: some data may have been lost - deleting '%s'",
60+
kLevelDbLostFolder);
61+
utils::Dir::Remove(lost_folder_path);
62+
}
5663
return true;
5764
}
5865
OLP_SDK_LOG_ERROR(kLogTag,
@@ -120,6 +127,22 @@ client::ApiError GetApiError(const leveldb::Status& status) {
120127
return client::ApiError(code, status.ToString());
121128
}
122129

130+
/// Checks whether the database still has files at level-0, which is used by
/// the callers as the sign of an unfinished (or interrupted) compaction.
///
/// @param db The opened database to query.
///
/// @return true when LevelDB reports zero level-0 files, false when at least
/// one level-0 file is present. When the property cannot be read or parsed,
/// true is returned: the state is unknown, and reporting "not finished" would
/// keep a retry loop on this predicate spinning forever.
bool CheckCompactionFinished(leveldb::DB& db) {
  std::string property_result;
  const bool property_known =
      db.GetProperty("leveldb.num-files-at-level0", &property_result);
  if (!property_known || property_result.empty()) {
    OLP_SDK_LOG_ERROR(
        kLogTag,
        "CheckCompactionFinished: failed to read the level-0 file count");
    return true;
  }

  int files_at_level0 = 0;
  try {
    files_at_level0 = std::stoi(property_result);
  } catch (...) {
    // std::stoi throws on non-numeric or out-of-range input; an exception
    // must not escape from the cache open/compact paths.
    OLP_SDK_LOG_ERROR(
        kLogTag,
        "CheckCompactionFinished: unexpected level-0 file count value");
    return true;
  }

  if (files_at_level0 == 0) {
    return true;
  }

  OLP_SDK_LOG_INFO_F(
      kLogTag, "CheckCompactionFinished: L0 files present, files_at_level0=%d",
      files_at_level0);

  return false;
}
145+
123146
} // anonymous namespace
124147

125148
DiskCache::DiskCache() = default;
@@ -153,8 +176,13 @@ void DiskCache::Compact() {
153176
// doing it already. We don't need two at the same time.
154177
if (database_ && !compacting_.exchange(true)) {
155178
OLP_SDK_LOG_INFO(kLogTag, "Compact: Compacting database started");
156-
database_->CompactRange(nullptr, nullptr);
179+
180+
do {
181+
database_->CompactRange(nullptr, nullptr);
182+
} while (!CheckCompactionFinished(*database_));
183+
157184
compacting_ = false;
185+
158186
OLP_SDK_LOG_INFO(kLogTag, "Compact: Compacting database finished");
159187
}
160188
}
@@ -216,13 +244,18 @@ OpenResult DiskCache::Open(const std::string& data_path,
216244
status = leveldb::DB::Open(open_options, versioned_data_path, &db);
217245
}
218246

219-
// If the database is r/w and corrupted, attempt to repair & reopen
220-
if ((status.IsCorruption() || status.IsIOError()) &&
221-
RepairCache(versioned_data_path) == true) {
222-
status = leveldb::DB::Open(open_options, versioned_data_path, &db);
223-
if (status.ok()) {
224-
database_.reset(db);
225-
return OpenResult::Repaired;
247+
if (status.IsCorruption() || status.IsIOError()) {
248+
if (is_read_only) {
249+
OLP_SDK_LOG_ERROR_F(
250+
kLogTag, "Open: cache corrupted, cache_path='%s', error='%s'",
251+
versioned_data_path.c_str(), status.ToString().c_str());
252+
return OpenResult::Corrupted;
253+
} else if (RepairCache(versioned_data_path)) {
254+
status = leveldb::DB::Open(open_options, versioned_data_path, &db);
255+
if (status.ok()) {
256+
database_.reset(db);
257+
return OpenResult::Repaired;
258+
}
226259
}
227260
}
228261

@@ -235,7 +268,13 @@ OpenResult DiskCache::Open(const std::string& data_path,
235268
error_ = NoError{};
236269
}
237270

271+
if (is_read_only && !CheckCompactionFinished(*db)) {
272+
OLP_SDK_LOG_ERROR(kLogTag, "Open: interrupted compaction detected");
273+
return OpenResult::Corrupted;
274+
}
275+
238276
database_.reset(db);
277+
239278
return OpenResult::Success;
240279
}
241280

olp-cpp-sdk-core/src/cache/DiskCache.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919

2020
#pragma once
2121

22-
#include <time.h>
2322
#include <atomic>
23+
#include <ctime>
2424
#include <functional>
2525
#include <limits>
2626
#include <map>
@@ -55,6 +55,8 @@ class SizeCountingEnv;
5555
enum class OpenResult {
5656
/// Opening the store failed. Use openError() for details.
5757
Fail,
58+
/// The store was corrupted or store compaction was interrupted.
59+
Corrupted,
5860
/// The store was corrupted and has been repaired. Internal integrity might be
5961
/// broken.
6062
Repaired,
@@ -138,7 +140,7 @@ class DiskCache {
138140
/// scans.
139141
std::unique_ptr<leveldb::Iterator> NewIterator(leveldb::ReadOptions options);
140142

141-
/// Allow batch writting so that we can delete and write multiple values at
143+
/// Allow batch writing so that we can delete and write multiple values at
142144
/// the same time.
143145
OperationOutcome ApplyBatch(std::unique_ptr<leveldb::WriteBatch> batch);
144146

0 commit comments

Comments
 (0)