Skip to content

Commit 2e9c723

Browse files
committed
osd: improve OSD robustness.
Achieved by 1. osd superblock data is replicated in onode's OMAP - hence one can recover from that after onode's content is corrupted. 2. pg_num_history object gets full overwrite which eliminates the need to merge with previous data (and hence reading corrupted data wouldn't kill OSD). Signed-off-by: Igor Fedotov <[email protected]>
1 parent 0ad6038 commit 2e9c723

File tree

3 files changed

+67
-31
lines changed

3 files changed

+67
-31
lines changed

src/osd/OSD.cc

Lines changed: 63 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -2056,6 +2056,22 @@ int heap(CephContext& cct,
20562056

20572057
} // namespace ceph::osd_cmds
20582058

2059+
void OSD::write_superblock(CephContext* cct, OSDSuperblock& sb, ObjectStore::Transaction& t)
2060+
{
2061+
dout(10) << "write_superblock " << sb << dendl;
2062+
2063+
//hack: at minimum it's using the baseline feature set
2064+
if (!sb.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
2065+
sb.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
2066+
2067+
bufferlist bl;
2068+
encode(sb, bl);
2069+
t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
2070+
std::map<std::string, ceph::buffer::list> attrs;
2071+
attrs.emplace(OSD_SUPERBLOCK_OMAP_KEY, bl);
2072+
t.omap_setkeys(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, attrs);
2073+
}
2074+
20592075
int OSD::mkfs(CephContext *cct,
20602076
std::unique_ptr<ObjectStore> store,
20612077
uuid_d fsid,
@@ -2117,15 +2133,11 @@ int OSD::mkfs(CephContext *cct,
21172133
sb.osd_fsid = store->get_fsid();
21182134
sb.whoami = whoami;
21192135
sb.compat_features = get_osd_initial_compat_set();
2120-
2121-
bufferlist bl;
2122-
encode(sb, bl);
2123-
21242136
ObjectStore::CollectionHandle ch = store->create_new_collection(
21252137
coll_t::meta());
21262138
ObjectStore::Transaction t;
21272139
t.create_collection(coll_t::meta(), 0);
2128-
t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
2140+
write_superblock(cct, sb, t);
21292141
ret = store->queue_transaction(ch, std::move(t));
21302142
if (ret) {
21312143
derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
@@ -3726,7 +3738,7 @@ int OSD::init()
37263738
}
37273739

37283740
ObjectStore::Transaction t;
3729-
write_superblock(t);
3741+
write_superblock(cct, superblock, t);
37303742
r = store->queue_transaction(service.meta_ch, std::move(t));
37313743
if (r < 0)
37323744
goto out;
@@ -4536,7 +4548,7 @@ int OSD::shutdown()
45364548
superblock.mounted = service.get_boot_epoch();
45374549
superblock.clean_thru = get_osdmap_epoch();
45384550
ObjectStore::Transaction t;
4539-
write_superblock(t);
4551+
write_superblock(cct, superblock, t);
45404552
int r = store->queue_transaction(service.meta_ch, std::move(t));
45414553
if (r) {
45424554
derr << "OSD::shutdown: error writing superblock: "
@@ -4733,25 +4745,35 @@ int OSD::update_crush_device_class()
47334745
}
47344746
}
47354747

4736-
void OSD::write_superblock(ObjectStore::Transaction& t)
4737-
{
4738-
dout(10) << "write_superblock " << superblock << dendl;
4739-
4740-
//hack: at minimum it's using the baseline feature set
4741-
if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4742-
superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4743-
4744-
bufferlist bl;
4745-
encode(superblock, bl);
4746-
t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4747-
}
47484748

47494749
int OSD::read_superblock()
47504750
{
47514751
bufferlist bl;
4752-
int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
4753-
if (r < 0)
4754-
return r;
4752+
4753+
set<string> keys;
4754+
keys.insert(OSD_SUPERBLOCK_OMAP_KEY);
4755+
map<string, bufferlist> vals;
4756+
// Let's read from OMAP first to be able to better handle
4757+
// "recover-after-an-error" case when main OSD volume data
4758+
// is partially corrupted (csums don't match for a bunch of onodes).
4759+
// As a result we might want to set bluestore_ignore_csum_error option which
4760+
// will silence disk read errors.
4761+
// Clearly such a reading from corrupted superblock will miss an error as well
4762+
// and it wouldn't attempt to use still valid OMAP's replica.
4763+
// Hence preferring omap reading over disk one.
4764+
int r = store->omap_get_values(
4765+
service.meta_ch, OSD_SUPERBLOCK_GOBJECT, keys, &vals);
4766+
if (r < 0 || vals.size() == 0) {
4767+
dout(10) << __func__ << " attempt reading from disk replica" << dendl;
4768+
4769+
r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
4770+
if (r < 0) {
4771+
return -ENOENT;
4772+
}
4773+
dout(10) << __func__ << " got disk replica" << dendl;
4774+
} else {
4775+
std::swap(bl, vals.begin()->second);
4776+
}
47554777

47564778
auto p = bl.cbegin();
47574779
decode(superblock, p);
@@ -6659,7 +6681,7 @@ void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
66596681
m->purged_snaps);
66606682
}
66616683
superblock.purged_snaps_last = m->last;
6662-
write_superblock(t);
6684+
write_superblock(cct, superblock, t);
66636685
store->queue_transaction(
66646686
service.meta_ch,
66656687
std::move(t));
@@ -7143,7 +7165,7 @@ void OSD::scrub_purged_snaps()
71437165
dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
71447166
ObjectStore::Transaction t;
71457167
superblock.last_purged_snaps_scrub = ceph_clock_now();
7146-
write_superblock(t);
7168+
write_superblock(cct, superblock, t);
71477169
int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
71487170
ceph_assert(tr == 0);
71497171
if (is_active()) {
@@ -7856,7 +7878,7 @@ void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
78567878
num++;
78577879
if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
78587880
service.publish_superblock(superblock);
7859-
write_superblock(t);
7881+
write_superblock(cct, superblock, t);
78607882
int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
78617883
ceph_assert(tr == 0);
78627884
num = 0;
@@ -7872,7 +7894,7 @@ void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
78727894
}
78737895
if (num > 0) {
78747896
service.publish_superblock(superblock);
7875-
write_superblock(t);
7897+
write_superblock(cct, superblock, t);
78767898
int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
78777899
ceph_assert(tr == 0);
78787900
}
@@ -8184,7 +8206,19 @@ void OSD::handle_osd_map(MOSDMap *m)
81848206
{
81858207
bufferlist bl;
81868208
::encode(pg_num_history, bl);
8187-
t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
8209+
auto oid = make_pg_num_history_oid();
8210+
t.truncate(coll_t::meta(), oid, 0); // we don't need bytes left if new data
8211+
// block is shorter than the previous
8212+
// one. And better to trim them, e.g.
8213+
// this allows to avoid csum errors
8214+
// when issuing overwrite
8215+
// (which happens to be partial)
8216+
// and original data is corrupted.
8217+
// Another side effect is that the
8218+
// pg_num_history object is not permanently
8219+
// anchored to a fixed disk location
8220+
// any more.
8221+
t.write(coll_t::meta(), oid, 0, bl.length(), bl);
81888222
dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
81898223
}
81908224

@@ -8204,7 +8238,7 @@ void OSD::handle_osd_map(MOSDMap *m)
82048238
}
82058239

82068240
// superblock and commit
8207-
write_superblock(t);
8241+
write_superblock(cct, superblock, t);
82088242
t.register_on_commit(new C_OnMapCommit(this, start, last, m));
82098243
store->queue_transaction(
82108244
service.meta_ch,
@@ -8522,7 +8556,7 @@ void OSD::check_osdmap_features()
85228556
dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
85238557
superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
85248558
ObjectStore::Transaction t;
8525-
write_superblock(t);
8559+
write_superblock(cct, superblock, t);
85268560
int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
85278561
ceph_assert(err == 0);
85288562
}

src/osd/OSD.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1217,8 +1217,9 @@ class OSD : public Dispatcher,
12171217
// -- superblock --
12181218
OSDSuperblock superblock;
12191219

1220-
void write_superblock();
1221-
void write_superblock(ObjectStore::Transaction& t);
1220+
static void write_superblock(CephContext* cct,
1221+
OSDSuperblock& sb,
1222+
ObjectStore::Transaction& t);
12221223
int read_superblock();
12231224

12241225
void clear_temp_objects();

src/osd/osd_types.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,7 @@ enum {
373373
// pg stuff
374374

375375
#define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))
376+
#define OSD_SUPERBLOCK_OMAP_KEY "osd_superblock"
376377

377378
// placement seed (a hash value)
378379
typedef uint32_t ps_t;

0 commit comments

Comments
 (0)