Skip to content

Commit 8cc05b1

Browse files
committed
mds: add blockdiff operation support
Fixes: http://tracker.ceph.com/issues/69791 Signed-off-by: Venky Shankar <[email protected]>
1 parent ab05bf9 commit 8cc05b1

File tree

9 files changed

+388
-1
lines changed

9 files changed

+388
-1
lines changed

src/common/ceph_strings.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,7 @@ const char *ceph_mds_op_name(int op)
328328
case CEPH_MDS_OP_QUIESCE_INODE: return "quiesce_inode";
329329
case CEPH_MDS_OP_LOCK_PATH: return "lock_path";
330330
case CEPH_MDS_OP_UNINLINE_DATA: return "uninline_data";
331+
case CEPH_MDS_OP_FILE_BLOCKDIFF: return "blockdiff";
331332
}
332333
return "???";
333334
}

src/common/options/mds.yaml.in

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1754,3 +1754,12 @@ options:
17541754
services:
17551755
- mds
17561756
min: 4
1757+
- name: mds_file_blockdiff_max_concurrent_object_scans
1758+
type: uint
1759+
level: advanced
1760+
desc: maximum number of concurrent object scans
1761+
long_desc: Maximum number of concurrent listsnaps operations sent to RADOS.
1762+
default: 16
1763+
services:
1764+
- mds
1765+
min: 1

src/include/ceph_fs.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,7 @@ enum {
428428
CEPH_MDS_OP_LSSNAP = 0x00402,
429429
CEPH_MDS_OP_RENAMESNAP = 0x01403,
430430
CEPH_MDS_OP_READDIR_SNAPDIFF = 0x01404,
431+
CEPH_MDS_OP_FILE_BLOCKDIFF = 0x01405,
431432

432433
// internal op
433434
CEPH_MDS_OP_FRAGMENTDIR= 0x01500,
@@ -650,6 +651,12 @@ union ceph_mds_request_args {
650651
__le32 offset_hash;
651652
__le64 snap_other;
652653
} __attribute__ ((packed)) snapdiff;
654+
// Arguments for CEPH_MDS_OP_FILE_BLOCKDIFF. Packed, fixed-width
// little-endian fields; the layout must not change.
struct {
655+
// scan cursor: presumably the index of the next data object to scan,
// so that a client can resume where the previous invocation stopped
// (NOTE(review): confirm against the request handler).
656+
__le64 scan_idx;
657+
// how many data objects to scan in one invocation (capped by the mds).
658+
__le64 max_objects;
659+
} __attribute__ ((packed)) blockdiff;
653660
} __attribute__ ((packed));
654661

655662
#define CEPH_MDS_REQUEST_HEAD_VERSION 3

src/mds/MDCache.cc

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14470,3 +14470,202 @@ void MDCache::upkeep_main(void)
1447014470
upkeep_cvar.wait_for(lock, interval);
1447114471
}
1447214472
}
14473+
14474+
struct C_ListSnapsAggregator : public MDSIOContext {
14475+
C_ListSnapsAggregator(MDSRank *mds, CInode *in1, CInode *in2, BlockDiff *block_diff,
14476+
Context *on_finish)
14477+
: MDSIOContext(mds),
14478+
in1(in1),
14479+
in2(in2),
14480+
block_diff(block_diff),
14481+
on_finish(on_finish) {
14482+
}
14483+
14484+
void finish(int r) override {
14485+
mds->mdcache->aggregate_snap_sets(snap_set_context, in1, in2,
14486+
block_diff, on_finish);
14487+
}
14488+
14489+
virtual void print(std::ostream& os) const {
14490+
os << "listsnaps";
14491+
}
14492+
14493+
void add_snap_set_context(std::unique_ptr<MDCache::SnapSetContext> ssc) {
14494+
snap_set_context.push_back(std::move(ssc));
14495+
}
14496+
14497+
CInode *in1;
14498+
CInode *in2;
14499+
BlockDiff *block_diff;
14500+
Context *on_finish;
14501+
std::vector<std::unique_ptr<MDCache::SnapSetContext>> snap_set_context;
14502+
};
14503+
14504+
/**
 * Scan the next batch of data objects of a file to find the blocks that
 * differ between two of its snapshots.
 *
 * @param in1         inode of the first snapshot (in1->last <= in2->last)
 * @param in2         inode of the second snapshot
 * @param block_diff  in/out: `scan_idx` is the object index to start from;
 *                    `blocks` receives the differing extents
 * @param max_objects upper bound on the objects scanned by this call; it is
 *                    further capped by
 *                    mds_file_blockdiff_max_concurrent_object_scans
 * @param ctx         completed with 0 directly when there is nothing (left)
 *                    to scan; otherwise completed from
 *                    aggregate_snap_sets() once all listsnaps reads return
 */
void MDCache::file_blockdiff(CInode *in1, CInode *in2, BlockDiff *block_diff, uint64_t max_objects,
                             MDSContext *ctx) {
  ceph_assert(in1->last <= in2->last);

  // I think this is not required since the MDS disallows setting
  // layout when truncate_seq > 1.
  if (in1->get_inode()->layout != in2->get_inode()->layout) {
    dout(20) << __func__ << ": snaps have different layout: " << in1->get_inode()->layout
             << " vs " << in2->get_inode()->layout << dendl;
    // Report the whole file as changed. An empty file has nothing to
    // report, and a zero-length interval must not be inserted.
    const uint64_t size2 = in2->get_inode()->size;
    if (size2 > 0) {
      block_diff->blocks.union_insert(0, size2);
    }
    ctx->complete(0);
    return;
  }

  uint64_t scan_idx = block_diff->scan_idx;
  const uint64_t num_objects1 = Striper::get_num_objects(in1->get_inode()->layout,
                                                         in1->get_inode()->size);
  const uint64_t num_objects2 = Striper::get_num_objects(in2->get_inode()->layout,
                                                         in2->get_inode()->size);
  // Clamp instead of subtracting blindly: a scan_idx beyond the object
  // count would otherwise wrap around to a huge "pending" value and make
  // us scan objects past EOF.
  const uint64_t num_objects_pending1 =
    num_objects1 > scan_idx ? num_objects1 - scan_idx : 0;
  const uint64_t num_objects_pending2 =
    num_objects2 > scan_idx ? num_objects2 - scan_idx : 0;

  uint64_t scans = std::min({
    num_objects_pending1,
    num_objects_pending2,
    g_conf().get_val<uint64_t>("mds_file_blockdiff_max_concurrent_object_scans"),
    max_objects});

  dout(20) << __func__ << ": scanning " << scans << " objects" << dendl;
  if (scans == 0) {
    // we ran out of objects to scan - figure which ones
    if (num_objects_pending1 == 0 && num_objects_pending2 == 0) {
      // easy - both snaps have same number of objects
      dout(20) << __func__ << ": equal extent" << dendl;
    } else if (num_objects_pending1 == 0) {
      // first snapshot has lesser number of objects - return
      // an extent covering EOF.
      dout(20) << __func__ << ": EOF extent" << dendl;
      const uint64_t offset = Striper::get_file_offset(g_ceph_context, &(in2->get_inode()->layout),
                                                       scan_idx, 0);
      const uint64_t size2 = in2->get_inode()->size;
      // guard against an empty (or wrapped) trailing extent
      if (size2 > offset) {
        block_diff->blocks.union_insert(offset, size2 - offset);
      }
    } else {
      // num_objects_pending2 == 0
      dout(20) << __func__ << ": truncated extent" << dendl;
    }

    ctx->complete(0);
    return;
  }

  C_ListSnapsAggregator *on_finish = new C_ListSnapsAggregator(mds, in1, in2, block_diff, ctx);
  MDSGatherBuilder gather_ctx(g_ceph_context, on_finish);

  // Issue one listsnaps read per object. Each read writes its snap set and
  // return code into its own SnapSetContext, which the aggregator keeps
  // alive until every read has completed.
  for (; scans > 0; --scans, ++scan_idx) {
    ObjectOperation op;
    auto ssc = std::make_unique<SnapSetContext>();
    op.list_snaps(&ssc->snaps, &ssc->r);
    ssc->objectid = scan_idx;

    mds->objecter->read(file_object_t(in1->ino(), scan_idx),
                        OSDMap::file_to_object_locator(in2->get_inode()->layout),
                        op, LIBRADOS_SNAP_DIR, nullptr, 0, gather_ctx.new_sub());
    on_finish->add_snap_set_context(std::move(ssc));
  }

  gather_ctx.activate();
}
14576+
14577+
void MDCache::aggregate_snap_sets(const std::vector<std::unique_ptr<SnapSetContext>> &snap_set_ctx,
14578+
CInode *in1, CInode *in2, BlockDiff *block_diff, Context *on_finish) {
14579+
dout(20) << __func__ << dendl;
14580+
14581+
// always signal to the client to request again since request
14582+
// completion is signalled in file_blockdiff().
14583+
int r = 1;
14584+
snapid_t snapid1 = in1->last;
14585+
snapid_t snapid2 = in2->last;
14586+
uint64_t scans = snap_set_ctx.size();
14587+
14588+
interval_set<uint64_t> extents;
14589+
for (auto &snap_set : snap_set_ctx) {
14590+
dout(20) << __func__ << ": objectid=" << snap_set->objectid << ", r=" << snap_set->r
14591+
<< dendl;
14592+
if (snap_set->r != 0 && snap_set->r != -ENOENT) {
14593+
derr << ": failed to get snap set for objectid=" << snap_set->objectid
14594+
<< ", r=" << snap_set->r << dendl;
14595+
r = snap_set->r;
14596+
break;
14597+
}
14598+
14599+
if (snap_set->r == 0) {
14600+
auto &clones = snap_set->snaps.clones;
14601+
auto it1 = std::find_if(clones.begin(), clones.end(),
14602+
[snapid1](const librados::clone_info_t &clone)
14603+
{
14604+
return snapid1 == clone.cloneid ||
14605+
(std::find(clone.snaps.begin(), clone.snaps.end(), snapid1) != clone.snaps.end());
14606+
});
14607+
// point to "head" if not found
14608+
if (it1 == clones.end()) {
14609+
it1 = std::prev(it1);
14610+
}
14611+
auto it2 = std::find_if(clones.begin(), clones.end(),
14612+
[snapid2](const librados::clone_info_t &clone)
14613+
{
14614+
return snapid2 == clone.cloneid ||
14615+
(std::find(clone.snaps.begin(), clone.snaps.end(), snapid2) != clone.snaps.end());
14616+
});
14617+
// point to "head" if not found
14618+
if (it2 == clones.end()) {
14619+
it2 = std::prev(it2);
14620+
}
14621+
14622+
if (it1 == it2) {
14623+
dout(10) << __func__ << ": both snaps in same clone" << dendl;
14624+
continue;
14625+
}
14626+
14627+
interval_set<uint64_t> extent;
14628+
uint64_t offset = Striper::get_file_offset(g_ceph_context, &(in2->get_inode()->layout),
14629+
snap_set->objectid, 0);
14630+
14631+
for (auto hops = std::distance(it1, it2); hops > 0; --hops) {
14632+
dout(20) << __func__ << ": [cloneid: " << it1->cloneid << " snaps: " << it1->snaps
14633+
<< " overlap: " << it1->overlap << "]" << dendl;
14634+
auto next_it = it1 + 1;
14635+
dout(20) << __func__ << ": [next cloneid: " << next_it->cloneid << " snaps: " << next_it->snaps
14636+
<< " overlap: " << next_it->overlap << "]" << dendl;
14637+
auto sz = next_it->size;
14638+
if (sz == 0) {
14639+
// this object is a hole in the file.
14640+
// TODO: report holes in blockdiff strucuter. that way,
14641+
// caller can optimize and punch holes rather than writing
14642+
// zeros.
14643+
dout(10) << __func__ << ": hole: [" << offset << "~" << it1->size << "]" << dendl;
14644+
dout(10) << __func__ << ": adding whole extent - reader will read zeros" << dendl;
14645+
sz = it1->size;
14646+
}
14647+
14648+
extent.clear();
14649+
extent.union_insert(offset, sz);
14650+
for (auto &overlap_region : it1->overlap) {
14651+
uint64_t overlap_offset = Striper::get_file_offset(g_ceph_context, &(in2->get_inode()->layout),
14652+
snap_set->objectid, overlap_region.first);
14653+
extent.erase(overlap_offset, overlap_region.second);
14654+
}
14655+
14656+
dout(20) << __func__ << ": (non overlapping) extent=" << extent << dendl;
14657+
extents.union_of(extent);
14658+
dout(20) << __func__ << ": (modified) extents=" << extents << dendl;
14659+
++it1;
14660+
}
14661+
}
14662+
}
14663+
14664+
block_diff->rval = r;
14665+
if (r >= 0) {
14666+
r = 0;
14667+
block_diff->scan_idx += scans;
14668+
block_diff->blocks = extents;
14669+
}
14670+
on_finish->complete(r);
14671+
}

src/mds/MDCache.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include "include/types.h"
2727
#include "include/filepath.h"
2828
#include "include/elist.h"
29+
#include "include/rados/rados_types.hpp"
2930

3031
#include "messages/MCacheExpire.h"
3132
#include "messages/MClientQuota.h"
@@ -1160,6 +1161,17 @@ class MDCache {
11601161

11611162
double export_ephemeral_random_max = 0.0;
11621163

1164+
struct SnapSetContext { /* not to be confused with SnapContext */
1165+
int r;
1166+
uint64_t objectid;
1167+
librados::snap_set_t snaps;
1168+
};
1169+
1170+
// Scan the next batch of data objects (at most max_objects, starting at
// block_diff->scan_idx) for differences between the snapshots of in1 and
// in2. ctx is completed with 0 when there is nothing to scan; otherwise
// the result is produced via aggregate_snap_sets().
void file_blockdiff(CInode *in1, CInode *in2, BlockDiff *block_diff, uint64_t max_objects,
1171+
MDSContext *ctx);
1172+
// Fold the per-object listsnaps results in snap_set_ctx into block_diff
// (rval, scan_idx, blocks) and complete on_finish.
void aggregate_snap_sets(const std::vector<std::unique_ptr<SnapSetContext>> &snap_set_ctx,
1173+
CInode *in1, CInode *in2, BlockDiff *block_diff, Context *on_finish);
1174+
11631175
protected:
11641176
// track leader requests whose peers haven't acknowledged commit
11651177
struct uleader {

0 commit comments

Comments
 (0)