Skip to content

Commit 11086d8

Browse files
authored
Merge pull request ceph#60556 from aclamk/wip-aclamk-bluefs-truncate-allocations-main
os/bluestore: Make truncate() drop unused allocations - addendum
2 parents 91591a6 + 612f24b commit 11086d8

File tree

2 files changed

+110
-31
lines changed

2 files changed

+110
-31
lines changed

src/os/bluestore/BlueFS.cc

Lines changed: 29 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -3794,7 +3794,7 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/
37943794
if (offset > fnode.size) {
37953795
ceph_abort_msg("truncate up not supported");
37963796
}
3797-
ceph_assert(offset <= fnode.size);
3797+
37983798
_flush_bdev(h);
37993799
{
38003800
std::lock_guard ll(log.lock);
@@ -3803,44 +3803,42 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/
38033803
vselector->sub_usage(h->file->vselector_hint, fnode);
38043804
uint64_t x_off = 0;
38053805
auto p = fnode.seek(offset, &x_off);
3806-
uint64_t cut_off =
3807-
(p == fnode.extents.end()) ? 0 : p2roundup(x_off, alloc_size[p->bdev]);
3808-
uint64_t new_allocated;
3809-
if (0 == cut_off) {
3810-
// whole pextent to remove
3811-
changed_extents = true;
3812-
new_allocated = offset;
3813-
} else if (cut_off < p->length) {
3814-
dirty.pending_release[p->bdev].insert(p->offset + cut_off, p->length - cut_off);
3815-
new_allocated = (offset - x_off) + cut_off;
3816-
p->length = cut_off;
3817-
changed_extents = true;
3818-
++p;
3819-
} else {
3820-
ceph_assert(cut_off >= p->length);
3821-
new_allocated = (offset - x_off) + p->length;
3822-
// just leave it here
3823-
++p;
3824-
}
3825-
while (p != fnode.extents.end()) {
3826-
dirty.pending_release[p->bdev].insert(p->offset, p->length);
3827-
p = fnode.extents.erase(p);
3828-
changed_extents = true;
3806+
if (p != fnode.extents.end()) {
3807+
uint64_t cut_off = p2roundup(x_off, alloc_size[p->bdev]);
3808+
if (0 == cut_off) {
3809+
// whole pextent to remove
3810+
fnode.allocated = offset;
3811+
changed_extents = true;
3812+
} else if (cut_off < p->length) {
3813+
dirty.pending_release[p->bdev].insert(p->offset + cut_off,
3814+
p->length - cut_off);
3815+
fnode.allocated = (offset - x_off) + cut_off;
3816+
p->length = cut_off;
3817+
changed_extents = true;
3818+
++p;
3819+
} else {
3820+
// cut_off > p->length means that we misaligned the extent
3821+
ceph_assert(cut_off == p->length);
3822+
fnode.allocated = (offset - x_off) + p->length;
3823+
++p; // leave extent untouched
3824+
}
3825+
while (p != fnode.extents.end()) {
3826+
dirty.pending_release[p->bdev].insert(p->offset, p->length);
3827+
p = fnode.extents.erase(p);
3828+
changed_extents = true;
3829+
}
38293830
}
38303831
if (changed_extents) {
38313832
fnode.size = offset;
3832-
fnode.allocated = new_allocated;
38333833
fnode.reset_delta();
38343834
fnode.recalc_allocated();
38353835
log.t.op_file_update(fnode);
38363836
// sad, but is_dirty must be set to signal flushing of the log
38373837
h->file->is_dirty = true;
3838-
} else {
3839-
if (offset != fnode.size) {
3840-
fnode.size = offset;
3841-
//skipping log.t.op_file_update_inc, it will be done by flush()
3842-
h->file->is_dirty = true;
3843-
}
3838+
} else if (offset != fnode.size) {
3839+
fnode.size = offset;
3840+
// skipping log.t.op_file_update_inc, it will be done by flush()
3841+
h->file->is_dirty = true;
38443842
}
38453843
vselector->add_usage(h->file->vselector_hint, fnode);
38463844
}

src/test/objectstore/test_bluefs.cc

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1426,6 +1426,87 @@ TEST(BlueFS, test_concurrent_dir_link_and_compact_log_56210) {
14261426
}
14271427
}
14281428

1429+
// Checks that BlueFS::truncate() gives back the allocations that are no longer
// needed after a file is shortened.
//
// Every scenario builds a fresh BlueFS instance, optionally preallocates space
// for a file, appends `write_size` bytes, truncates the file to `truncate_to`
// and finally compares the growth of fs.get_used() with the expected amount of
// space that should still be allocated to the file.
TEST(BlueFS, truncate_drops_allocations) {
  constexpr uint64_t K = 1024;
  constexpr uint64_t M = 1024 * K;
  uuid_d fsid;
  const char* DIR_NAME="dir";
  const char* FILE_NAME="file1";
  struct {
    uint64_t preallocated_size;         // passed to fs.preallocate() before writing
    uint64_t write_size;                // number of bytes appended to the file
    uint64_t truncate_to;               // offset passed to fs.truncate()
    uint64_t allocated_after_truncate;  // expected growth of fs.get_used()
    uint64_t slow_size = 0;             // size of the BDEV_SLOW device; 0 = not added
    uint64_t slow_alloc_size = 64*K;    // value for bluefs_shared_alloc_size
    uint64_t db_size = 128*M;           // size of the BDEV_DB device; 0 = not added
    uint64_t db_alloc_size = 1*M;       // value for bluefs_alloc_size
  } scenarios [] = {
    // on DB(which is SLOW) : 1 => 1, 64K remains
    { 1*M, 1, 1, 64*K },
    // on DB(which is SLOW), alloc 4K : 1 => 1, 4K remains
    { 1*M, 1, 1, 4*K, 0, 4*K },
    // on DB(which is SLOW), truncation on AU boundary : 128K => 128K, 128K remains
    { 1*M, 128*K, 128*K, 128*K },
    // on DB(which is SLOW), no prealloc, truncation to 0 : 1666K => 0, 0 remains
    { 0, 1666*K, 0, 0 },
    // on DB, truncate to 123K, expect 1M occupied
    { 1234*K, 123*K, 123*K, 1*M, 128*M, 64*K, 10*M, 1*M },
    // on DB, truncate to 0, expect 0 occupied
    { 1234*K, 345*K, 0, 0, 128*M, 64*K, 10*M, 1*M },
    // on DB, truncate to AU boundary, expect exactly 1M occupied
    { 1234*K, 1123*K, 1*M, 1*M, 128*M, 64*K, 10*M, 1*M },
    // on DB and SLOW, truncate only data on SLOW
    { 0, 10*M+1, 10*M+1, 10*M+64*K, 128*M, 64*K, 10*M, 1*M },
    // on DB and SLOW, preallocate and truncate only data on SLOW
    { 6*M, 12*M, 10*M+1, 10*M+64*K, 128*M, 64*K, 10*M, 1*M },
    // on DB and SLOW, preallocate and truncate all in SLOW and some on DB
    // note! prealloc 6M is important, one allocation for 12M will fall back to SLOW
    // in 6M + 6M we can be sure that 6M is on DB and 6M is on SLOW
    { 6*M, 12*M, 3*M+1, 4*M, 128*M, 64*K, 11*M, 1*M },
  };
  size_t scenario_no = 0;
  for (const auto& s : scenarios) {
    // Tag every failure below with the index of the scenario that produced it;
    // without this a failing row of the table cannot be identified.
    SCOPED_TRACE("scenario #" + std::to_string(scenario_no++));

    ConfSaver conf(g_ceph_context->_conf);
    conf.SetVal("bluefs_shared_alloc_size", stringify(s.slow_alloc_size).c_str());
    conf.SetVal("bluefs_alloc_size", stringify(s.db_alloc_size).c_str());

    // NOTE(review): these two calls set the same keys to the same values as the
    // SetVal() calls above and look redundant - confirm against ConfSaver
    // before removing them.
    g_ceph_context->_conf.set_val("bluefs_shared_alloc_size", stringify(s.slow_alloc_size));
    g_ceph_context->_conf.set_val("bluefs_alloc_size", stringify(s.db_alloc_size));
    TempBdev bdev_db{s.db_size};
    TempBdev bdev_slow{s.slow_size};

    BlueFS fs(g_ceph_context);
    if (s.db_size != 0) {
      ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_db.path, false, 0));
    }
    if (s.slow_size != 0) {
      ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_SLOW, bdev_slow.path, false, 0));
    }

    ASSERT_EQ(0, fs.mkfs(fsid, {BlueFS::BDEV_DB, false, false}));
    ASSERT_EQ(0, fs.mount());
    ASSERT_EQ(0, fs.maybe_verify_layout({BlueFS::BDEV_DB, false, false}));
    BlueFS::FileWriter *h;
    // Use the same constant for creating the directory and for opening the
    // file inside it.
    ASSERT_EQ(0, fs.mkdir(DIR_NAME));
    ASSERT_EQ(0, fs.open_for_write(DIR_NAME, FILE_NAME, &h, false));

    // Usage baseline, taken before the file owns any space.
    const uint64_t pre = fs.get_used();
    ASSERT_EQ(0, fs.preallocate(h->file, 0, s.preallocated_size));
    const std::string content(s.write_size, 'x');
    h->append(content.c_str(), content.length());
    fs.fsync(h);
    ASSERT_EQ(0, fs.truncate(h, s.truncate_to));
    fs.fsync(h);
    const uint64_t post = fs.get_used();
    fs.close_writer(h);

    // Written as an addition so that a failure prints the real usage values
    // instead of a wrapped-around unsigned difference.
    EXPECT_EQ(pre + s.allocated_after_truncate, post);

    fs.umount();
  }
}
1506+
1507+
1508+
1509+
14291510
TEST(BlueFS, test_log_runway) {
14301511
uint64_t max_log_runway = 65536;
14311512
ConfSaver conf(g_ceph_context->_conf);

0 commit comments

Comments
 (0)