Skip to content

Commit 8a72bb1

Browse files
committed
More fixes to auto-GarbageCollect in BackupEngine (facebook#6023)
Summary: Production: * Fixes GarbageCollect (and auto-GC triggered by PurgeOldBackups, DeleteBackup, or CreateNewBackup) to clean up backup directory independent of current settings (except max_valid_backups_to_open; see issue facebook#4997) and prior settings used with same backup directory. * Fixes GarbageCollect (and auto-GC) not to attempt to remove "." and ".." entries from directories. * Clarifies contract with users in modifying BackupEngine operations. In short, leftovers from any incomplete operation are cleaned up by any subsequent call to that same kind of operation (PurgeOldBackups and DeleteBackup considered the same kind of operation). GarbageCollect is available to clean up after all kinds. (NB: right now PurgeOldBackups and DeleteBackup will clean up after incomplete CreateNewBackup, but we aren't promising to continue that behavior.) Pull Request resolved: facebook#6023 Test Plan: * Refactors open parameters to use an option enum, for readability, etc. (Also fixes an unused parameter bug in the redundant OpenDBAndBackupEngineShareWithChecksum.) * Fixes an apparent bug in ShareTableFilesWithChecksumsTransition in which old backup data was destroyed in the transition to be tested. That test is now augmented to ensure GarbageCollect (or auto-GC) does not remove shared files when BackupEngine is opened with share_table_files=false. * Augments DeleteTmpFiles test to ensure that CreateNewBackup does auto-GC when an incompletely created backup is detected. Differential Revision: D18453559 Pulled By: pdillinger fbshipit-source-id: 5e54e7b08d711b161bc9c656181012b69a8feac4
1 parent a6d4183 commit 8a72bb1

File tree

3 files changed

+162
-95
lines changed

3 files changed

+162
-95
lines changed

include/rocksdb/utilities/backupable_db.h

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -276,19 +276,23 @@ class BackupEngine {
276276
progress_callback);
277277
}
278278

279-
// deletes old backups, keeping latest num_backups_to_keep alive
279+
// Deletes old backups, keeping latest num_backups_to_keep alive.
280+
// See also DeleteBackup.
280281
virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0;
281282

282-
// deletes a specific backup
283+
// Deletes a specific backup. If this operation (or PurgeOldBackups)
284+
// is not completed due to crash, power failure, etc. the state
285+
// will be cleaned up the next time you call DeleteBackup,
286+
// PurgeOldBackups, or GarbageCollect.
283287
virtual Status DeleteBackup(BackupID backup_id) = 0;
284288

285289
// Call this from another thread if you want to stop the backup
286290
// that is currently happening. It will return immediatelly, will
287291
// not wait for the backup to stop.
288292
// The backup will stop ASAP and the call to CreateNewBackup will
289293
// return Status::Incomplete(). It will not clean up after itself, but
290-
// the state will remain consistent. The state will be cleaned up
291-
// next time you create BackupableDB or RestoreBackupableDB.
294+
// the state will remain consistent. The state will be cleaned up the
295+
// next time you call CreateNewBackup or GarbageCollect.
292296
virtual void StopBackup() = 0;
293297

294298
// Returns info about backups in backup_info
@@ -323,12 +327,13 @@ class BackupEngine {
323327
// Returns Status::OK() if all checks are good
324328
virtual Status VerifyBackup(BackupID backup_id) = 0;
325329

326-
// Will delete all the files we don't need anymore
327-
// It will do the full scan of the files/ directory and delete all the
328-
// files that are not referenced. PurgeOldBackups() and DeleteBackup()
329-
// will do a similar operation as needed to clean up from any incomplete
330-
// deletions, so this function is not really needed if calling one of
331-
// those.
330+
// Will delete any files left over from incomplete creation or deletion of
331+
// a backup. This is not normally needed as those operations also clean up
332+
// after prior incomplete calls to the same kind of operation (create or
333+
// delete).
334+
// NOTE: This is not designed to delete arbitrary files added to the backup
335+
// directory outside of BackupEngine, and clean-up is always subject to
336+
// permissions on and availability of the underlying filesystem.
332337
virtual Status GarbageCollect() = 0;
333338
};
334339

utilities/backupable/backupable_db.cc

Lines changed: 54 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -763,7 +763,11 @@ Status BackupEngineImpl::CreateNewBackupWithMetadata(
763763
if (s.ok()) {
764764
// maybe last backup failed and left partial state behind, clean it up.
765765
// need to do this before updating backups_ such that a private dir
766-
// named after new_backup_id will be cleaned up
766+
// named after new_backup_id will be cleaned up.
767+
// (If an incomplete new backup is followed by an incomplete delete
768+
// of the latest full backup, then there could be more than one next
769+
// id with a private dir, the last thing to be deleted in delete
770+
// backup, but all will be cleaned up with a GarbageCollect.)
767771
s = GarbageCollect();
768772
} else if (s.IsNotFound()) {
769773
// normal case, the new backup's private dir doesn't exist yet
@@ -1571,53 +1575,57 @@ Status BackupEngineImpl::GarbageCollect() {
15711575
"constrains how many backups the engine knows about");
15721576
}
15731577

1574-
if (options_.share_table_files &&
1575-
options_.max_valid_backups_to_open == port::kMaxInt32) {
1578+
if (options_.max_valid_backups_to_open == port::kMaxInt32) {
15761579
// delete obsolete shared files
15771580
// we cannot do this when BackupEngine has `max_valid_backups_to_open` set
15781581
// as those engines don't know about all shared files.
1579-
std::vector<std::string> shared_children;
1580-
{
1581-
std::string shared_path;
1582-
if (options_.share_files_with_checksum) {
1583-
shared_path = GetAbsolutePath(GetSharedFileWithChecksumRel());
1584-
} else {
1585-
shared_path = GetAbsolutePath(GetSharedFileRel());
1586-
}
1587-
auto s = backup_env_->FileExists(shared_path);
1588-
if (s.ok()) {
1589-
s = backup_env_->GetChildren(shared_path, &shared_children);
1590-
} else if (s.IsNotFound()) {
1591-
s = Status::OK();
1592-
}
1593-
if (!s.ok()) {
1594-
overall_status = s;
1595-
// Trying again later might work
1596-
might_need_garbage_collect_ = true;
1597-
}
1598-
}
1599-
for (auto& child : shared_children) {
1600-
std::string rel_fname;
1601-
if (options_.share_files_with_checksum) {
1602-
rel_fname = GetSharedFileWithChecksumRel(child);
1603-
} else {
1604-
rel_fname = GetSharedFileRel(child);
1605-
}
1606-
auto child_itr = backuped_file_infos_.find(rel_fname);
1607-
// if it's not refcounted, delete it
1608-
if (child_itr == backuped_file_infos_.end() ||
1609-
child_itr->second->refs == 0) {
1610-
// this might be a directory, but DeleteFile will just fail in that
1611-
// case, so we're good
1612-
Status s = backup_env_->DeleteFile(GetAbsolutePath(rel_fname));
1613-
ROCKS_LOG_INFO(options_.info_log, "Deleting %s -- %s",
1614-
rel_fname.c_str(), s.ToString().c_str());
1615-
backuped_file_infos_.erase(rel_fname);
1582+
for (bool with_checksum : {false, true}) {
1583+
std::vector<std::string> shared_children;
1584+
{
1585+
std::string shared_path;
1586+
if (with_checksum) {
1587+
shared_path = GetAbsolutePath(GetSharedFileWithChecksumRel());
1588+
} else {
1589+
shared_path = GetAbsolutePath(GetSharedFileRel());
1590+
}
1591+
auto s = backup_env_->FileExists(shared_path);
1592+
if (s.ok()) {
1593+
s = backup_env_->GetChildren(shared_path, &shared_children);
1594+
} else if (s.IsNotFound()) {
1595+
s = Status::OK();
1596+
}
16161597
if (!s.ok()) {
1598+
overall_status = s;
16171599
// Trying again later might work
16181600
might_need_garbage_collect_ = true;
16191601
}
16201602
}
1603+
for (auto& child : shared_children) {
1604+
if (child == "." || child == "..") {
1605+
continue;
1606+
}
1607+
std::string rel_fname;
1608+
if (with_checksum) {
1609+
rel_fname = GetSharedFileWithChecksumRel(child);
1610+
} else {
1611+
rel_fname = GetSharedFileRel(child);
1612+
}
1613+
auto child_itr = backuped_file_infos_.find(rel_fname);
1614+
// if it's not refcounted, delete it
1615+
if (child_itr == backuped_file_infos_.end() ||
1616+
child_itr->second->refs == 0) {
1617+
// this might be a directory, but DeleteFile will just fail in that
1618+
// case, so we're good
1619+
Status s = backup_env_->DeleteFile(GetAbsolutePath(rel_fname));
1620+
ROCKS_LOG_INFO(options_.info_log, "Deleting %s -- %s",
1621+
rel_fname.c_str(), s.ToString().c_str());
1622+
backuped_file_infos_.erase(rel_fname);
1623+
if (!s.ok()) {
1624+
// Trying again later might work
1625+
might_need_garbage_collect_ = true;
1626+
}
1627+
}
1628+
}
16211629
}
16221630
}
16231631

@@ -1633,6 +1641,9 @@ Status BackupEngineImpl::GarbageCollect() {
16331641
}
16341642
}
16351643
for (auto& child : private_children) {
1644+
if (child == "." || child == "..") {
1645+
continue;
1646+
}
16361647
// it's ok to do this when BackupEngine has `max_valid_backups_to_open` set
16371648
// as the engine always knows all valid backup numbers.
16381649
BackupID backup_id = 0;
@@ -1649,6 +1660,9 @@ Status BackupEngineImpl::GarbageCollect() {
16491660
std::vector<std::string> subchildren;
16501661
backup_env_->GetChildren(full_private_path, &subchildren);
16511662
for (auto& subchild : subchildren) {
1663+
if (subchild == "." || subchild == "..") {
1664+
continue;
1665+
}
16521666
Status s = backup_env_->DeleteFile(full_private_path + subchild);
16531667
ROCKS_LOG_INFO(options_.info_log, "Deleting %s -- %s",
16541668
(full_private_path + subchild).c_str(),

utilities/backupable/backupable_db_test.cc

Lines changed: 93 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010
#if !defined(ROCKSDB_LITE) && !defined(OS_WIN)
1111

1212
#include <algorithm>
13+
#include <limits>
1314
#include <string>
15+
#include <utility>
1416

1517
#include "db/db_impl/db_impl.h"
1618
#include "env/env_chroot.h"
@@ -516,6 +518,15 @@ static void AssertEmpty(DB* db, int from, int to) {
516518

517519
class BackupableDBTest : public testing::Test {
518520
public:
521+
enum ShareOption {
522+
kNoShare,
523+
kShareNoChecksum,
524+
kShareWithChecksum,
525+
};
526+
527+
const std::vector<ShareOption> kAllShareOptions = {kNoShare, kShareNoChecksum,
528+
kShareWithChecksum};
529+
519530
BackupableDBTest() {
520531
// set up files
521532
std::string db_chroot = test::PerThreadDBPath("backupable_db");
@@ -561,15 +572,8 @@ class BackupableDBTest : public testing::Test {
561572
return db;
562573
}
563574

564-
void OpenDBAndBackupEngineShareWithChecksum(
565-
bool destroy_old_data = false, bool dummy = false,
566-
bool /*share_table_files*/ = true, bool share_with_checksums = false) {
567-
backupable_options_->share_files_with_checksum = share_with_checksums;
568-
OpenDBAndBackupEngine(destroy_old_data, dummy, share_with_checksums);
569-
}
570-
571575
void OpenDBAndBackupEngine(bool destroy_old_data = false, bool dummy = false,
572-
bool share_table_files = true) {
576+
ShareOption shared_option = kShareNoChecksum) {
573577
// reset all the defaults
574578
test_backup_env_->SetLimitWrittenFiles(1000000);
575579
test_db_env_->SetLimitWrittenFiles(1000000);
@@ -584,7 +588,9 @@ class BackupableDBTest : public testing::Test {
584588
}
585589
db_.reset(db);
586590
backupable_options_->destroy_old_data = destroy_old_data;
587-
backupable_options_->share_table_files = share_table_files;
591+
backupable_options_->share_table_files = shared_option != kNoShare;
592+
backupable_options_->share_files_with_checksum =
593+
shared_option == kShareWithChecksum;
588594
BackupEngine* backup_engine;
589595
ASSERT_OK(BackupEngine::Open(test_db_env_.get(), *backupable_options_,
590596
&backup_engine));
@@ -1205,7 +1211,7 @@ TEST_F(BackupableDBTest, FailOverwritingBackups) {
12051211

12061212
TEST_F(BackupableDBTest, NoShareTableFiles) {
12071213
const int keys_iteration = 5000;
1208-
OpenDBAndBackupEngine(true, false, false);
1214+
OpenDBAndBackupEngine(true, false, kNoShare);
12091215
for (int i = 0; i < 5; ++i) {
12101216
FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
12111217
ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(i % 2)));
@@ -1221,7 +1227,7 @@ TEST_F(BackupableDBTest, NoShareTableFiles) {
12211227
// Verify that you can backup and restore with share_files_with_checksum on
12221228
TEST_F(BackupableDBTest, ShareTableFilesWithChecksums) {
12231229
const int keys_iteration = 5000;
1224-
OpenDBAndBackupEngineShareWithChecksum(true, false, true, true);
1230+
OpenDBAndBackupEngine(true, false, kShareWithChecksum);
12251231
for (int i = 0; i < 5; ++i) {
12261232
FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
12271233
ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(i % 2)));
@@ -1239,7 +1245,7 @@ TEST_F(BackupableDBTest, ShareTableFilesWithChecksums) {
12391245
TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsTransition) {
12401246
const int keys_iteration = 5000;
12411247
// set share_files_with_checksum to false
1242-
OpenDBAndBackupEngineShareWithChecksum(true, false, true, false);
1248+
OpenDBAndBackupEngine(true, false, kShareNoChecksum);
12431249
for (int i = 0; i < 5; ++i) {
12441250
FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
12451251
ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
@@ -1252,65 +1258,107 @@ TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsTransition) {
12521258
}
12531259

12541260
// set share_files_with_checksum to true and do some more backups
1255-
OpenDBAndBackupEngineShareWithChecksum(true, false, true, true);
1261+
OpenDBAndBackupEngine(false /* destroy_old_data */, false,
1262+
kShareWithChecksum);
12561263
for (int i = 5; i < 10; ++i) {
12571264
FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
12581265
ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
12591266
}
12601267
CloseDBAndBackupEngine();
12611268

1262-
for (int i = 0; i < 5; ++i) {
1263-
AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 5 + 1),
1269+
// Verify first (about to delete)
1270+
AssertBackupConsistency(1, 0, keys_iteration, keys_iteration * 11);
1271+
1272+
// For an extra challenge, make sure that GarbageCollect / DeleteBackup
1273+
// is OK even if we open without share_table_files
1274+
OpenDBAndBackupEngine(false /* destroy_old_data */, false, kNoShare);
1275+
backup_engine_->DeleteBackup(1);
1276+
backup_engine_->GarbageCollect();
1277+
CloseDBAndBackupEngine();
1278+
1279+
// Verify rest (not deleted)
1280+
for (int i = 1; i < 10; ++i) {
1281+
AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
12641282
keys_iteration * 11);
12651283
}
12661284
}
12671285

1286+
// This test simulates cleaning up after aborted or incomplete creation
1287+
// of a new backup.
12681288
TEST_F(BackupableDBTest, DeleteTmpFiles) {
1269-
for (int cleanup_fn : {1, 2, 3}) {
1270-
for (bool shared_checksum : {false, true}) {
1271-
OpenDBAndBackupEngineShareWithChecksum(
1272-
false /* destroy_old_data */, false /* dummy */,
1273-
true /* share_table_files */, shared_checksum);
1289+
for (int cleanup_fn : {1, 2, 3, 4}) {
1290+
for (ShareOption shared_option : kAllShareOptions) {
1291+
OpenDBAndBackupEngine(false /* destroy_old_data */, false /* dummy */,
1292+
shared_option);
1293+
ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
1294+
BackupID next_id = 1;
1295+
BackupID oldest_id = std::numeric_limits<BackupID>::max();
1296+
{
1297+
std::vector<BackupInfo> backup_info;
1298+
backup_engine_->GetBackupInfo(&backup_info);
1299+
for (const auto& bi : backup_info) {
1300+
next_id = std::max(next_id, bi.backup_id + 1);
1301+
oldest_id = std::min(oldest_id, bi.backup_id);
1302+
}
1303+
}
12741304
CloseDBAndBackupEngine();
1275-
std::string shared_tmp = backupdir_;
1276-
if (shared_checksum) {
1277-
shared_tmp += "/shared_checksum";
1278-
} else {
1279-
shared_tmp += "/shared";
1305+
1306+
// An aborted or incomplete new backup will always be in the next
1307+
// id (maybe more)
1308+
std::string next_private = "private/" + std::to_string(next_id);
1309+
1310+
// NOTE: both shared and shared_checksum should be cleaned up
1311+
// regardless of how the backup engine is opened.
1312+
std::vector<std::string> tmp_files_and_dirs;
1313+
for (const auto& dir_and_file : {
1314+
std::make_pair(std::string("shared"),
1315+
std::string(".00006.sst.tmp")),
1316+
std::make_pair(std::string("shared_checksum"),
1317+
std::string(".00007.sst.tmp")),
1318+
std::make_pair(next_private, std::string("00003.sst")),
1319+
}) {
1320+
std::string dir = backupdir_ + "/" + dir_and_file.first;
1321+
file_manager_->CreateDir(dir);
1322+
ASSERT_OK(file_manager_->FileExists(dir));
1323+
1324+
std::string file = dir + "/" + dir_and_file.second;
1325+
file_manager_->WriteToFile(file, "tmp");
1326+
ASSERT_OK(file_manager_->FileExists(file));
1327+
1328+
tmp_files_and_dirs.push_back(file);
12801329
}
1281-
shared_tmp += "/.00006.sst.tmp";
1282-
std::string private_tmp_dir = backupdir_ + "/private/10";
1283-
std::string private_tmp_file = private_tmp_dir + "/00003.sst";
1284-
file_manager_->WriteToFile(shared_tmp, "tmp");
1285-
file_manager_->CreateDir(private_tmp_dir);
1286-
file_manager_->WriteToFile(private_tmp_file, "tmp");
1287-
ASSERT_OK(file_manager_->FileExists(private_tmp_dir));
1288-
if (shared_checksum) {
1289-
OpenDBAndBackupEngineShareWithChecksum(
1290-
false /* destroy_old_data */, false /* dummy */,
1291-
true /* share_table_files */, true /* share_with_checksums */);
1292-
} else {
1293-
OpenDBAndBackupEngine();
1330+
if (cleanup_fn != /*CreateNewBackup*/ 4) {
1331+
// This exists after CreateNewBackup because it's deleted then
1332+
// re-created.
1333+
tmp_files_and_dirs.push_back(backupdir_ + "/" + next_private);
12941334
}
1335+
1336+
OpenDBAndBackupEngine(false /* destroy_old_data */, false /* dummy */,
1337+
shared_option);
12951338
// Need to call one of these explicitly to delete tmp files
12961339
switch (cleanup_fn) {
12971340
case 1:
1298-
(void)backup_engine_->GarbageCollect();
1341+
ASSERT_OK(backup_engine_->GarbageCollect());
12991342
break;
13001343
case 2:
1301-
(void)backup_engine_->DeleteBackup(1);
1344+
ASSERT_OK(backup_engine_->DeleteBackup(oldest_id));
13021345
break;
13031346
case 3:
1304-
(void)backup_engine_->PurgeOldBackups(1);
1347+
ASSERT_OK(backup_engine_->PurgeOldBackups(1));
1348+
break;
1349+
case 4:
1350+
// Does a garbage collect if it sees that next private dir exists
1351+
ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
13051352
break;
13061353
default:
13071354
assert(false);
13081355
}
13091356
CloseDBAndBackupEngine();
1310-
ASSERT_EQ(Status::NotFound(), file_manager_->FileExists(shared_tmp));
1311-
ASSERT_EQ(Status::NotFound(),
1312-
file_manager_->FileExists(private_tmp_file));
1313-
ASSERT_EQ(Status::NotFound(), file_manager_->FileExists(private_tmp_dir));
1357+
for (std::string file_or_dir : tmp_files_and_dirs) {
1358+
if (file_manager_->FileExists(file_or_dir) != Status::NotFound()) {
1359+
FAIL() << file_or_dir << " was expected to be deleted." << cleanup_fn;
1360+
}
1361+
}
13141362
}
13151363
}
13161364
}

0 commit comments

Comments
 (0)