Skip to content

Commit f4d9ed8

Browse files
authored
Merge pull request ceph#50326 from ifed01/wip-ifed-better-osd-robust
osd: improve OSD robustness. Reviewed-by: Adam Kupczyk <[email protected]>
2 parents 60367d5 + e7c08ec commit f4d9ed8

File tree

3 files changed

+110
-34
lines changed

3 files changed

+110
-34
lines changed

src/osd/OSD.cc

Lines changed: 106 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2098,6 +2098,22 @@ int heap(CephContext& cct,
20982098

20992099
} // namespace ceph::osd_cmds
21002100

2101+
void OSD::write_superblock(CephContext* cct, OSDSuperblock& sb, ObjectStore::Transaction& t)
2102+
{
2103+
dout(10) << "write_superblock " << sb << dendl;
2104+
2105+
//hack: at minimum it's using the baseline feature set
2106+
if (!sb.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
2107+
sb.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
2108+
2109+
bufferlist bl;
2110+
encode(sb, bl);
2111+
t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
2112+
std::map<std::string, ceph::buffer::list> attrs;
2113+
attrs.emplace(OSD_SUPERBLOCK_OMAP_KEY, bl);
2114+
t.omap_setkeys(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, attrs);
2115+
}
2116+
21012117
int OSD::mkfs(CephContext *cct,
21022118
std::unique_ptr<ObjectStore> store,
21032119
uuid_d fsid,
@@ -2159,15 +2175,11 @@ int OSD::mkfs(CephContext *cct,
21592175
sb.osd_fsid = store->get_fsid();
21602176
sb.whoami = whoami;
21612177
sb.compat_features = get_osd_initial_compat_set();
2162-
2163-
bufferlist bl;
2164-
encode(sb, bl);
2165-
21662178
ObjectStore::CollectionHandle ch = store->create_new_collection(
21672179
coll_t::meta());
21682180
ObjectStore::Transaction t;
21692181
t.create_collection(coll_t::meta(), 0);
2170-
t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
2182+
write_superblock(cct, sb, t);
21712183
ret = store->queue_transaction(ch, std::move(t));
21722184
if (ret) {
21732185
derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
@@ -3768,7 +3780,7 @@ int OSD::init()
37683780
}
37693781

37703782
ObjectStore::Transaction t;
3771-
write_superblock(t);
3783+
write_superblock(cct, superblock, t);
37723784
r = store->queue_transaction(service.meta_ch, std::move(t));
37733785
if (r < 0)
37743786
goto out;
@@ -4578,7 +4590,7 @@ int OSD::shutdown()
45784590
superblock.mounted = service.get_boot_epoch();
45794591
superblock.clean_thru = get_osdmap_epoch();
45804592
ObjectStore::Transaction t;
4581-
write_superblock(t);
4593+
write_superblock(cct, superblock, t);
45824594
int r = store->queue_transaction(service.meta_ch, std::move(t));
45834595
if (r) {
45844596
derr << "OSD::shutdown: error writing superblock: "
@@ -4775,31 +4787,81 @@ int OSD::update_crush_device_class()
47754787
}
47764788
}
47774789

4778-
void OSD::write_superblock(ObjectStore::Transaction& t)
4779-
{
4780-
dout(10) << "write_superblock " << superblock << dendl;
4781-
4782-
//hack: at minimum it's using the baseline feature set
4783-
if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4784-
superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4785-
4786-
bufferlist bl;
4787-
encode(superblock, bl);
4788-
t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4789-
}
47904790

47914791
int OSD::read_superblock()
47924792
{
4793+
// Read superblock from both object data and omap metadata
4794+
// for better robustness.
4795+
// Use the most recent superblock replica if obtained versions
4796+
// mismatch.
47934797
bufferlist bl;
4794-
int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
4795-
if (r < 0)
4796-
return r;
47974798

4798-
auto p = bl.cbegin();
4799-
decode(superblock, p);
4799+
set<string> keys;
4800+
keys.insert(OSD_SUPERBLOCK_OMAP_KEY);
4801+
map<string, bufferlist> vals;
4802+
OSDSuperblock super_omap;
4803+
OSDSuperblock super_disk;
4804+
int r_omap = store->omap_get_values(
4805+
service.meta_ch, OSD_SUPERBLOCK_GOBJECT, keys, &vals);
4806+
if (r_omap >= 0 && vals.size() > 0) {
4807+
try {
4808+
auto p = vals.begin()->second.cbegin();
4809+
decode(super_omap, p);
4810+
} catch(...) {
4811+
derr << __func__ << " omap replica is corrupted."
4812+
<< dendl;
4813+
r_omap = -EFAULT;
4814+
}
4815+
} else {
4816+
derr << __func__ << " omap replica is missing."
4817+
<< dendl;
4818+
r_omap = -ENOENT;
4819+
}
4820+
int r_disk = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
4821+
if (r_disk >= 0) {
4822+
try {
4823+
auto p = bl.cbegin();
4824+
decode(super_disk, p);
4825+
} catch(...) {
4826+
derr << __func__ << " disk replica is corrupted."
4827+
<< dendl;
4828+
r_disk = -EFAULT;
4829+
}
4830+
} else {
4831+
derr << __func__ << " disk replica is missing."
4832+
<< dendl;
4833+
r_disk = -ENOENT;
4834+
}
48004835

4801-
dout(10) << "read_superblock " << superblock << dendl;
4836+
if (r_omap >= 0 && r_disk < 0) {
4837+
std::swap(superblock, super_omap);
4838+
dout(1) << __func__ << " got omap replica but failed to get disk one."
4839+
<< dendl;
4840+
} else if (r_omap < 0 && r_disk >= 0) {
4841+
std::swap(superblock, super_disk);
4842+
dout(1) << __func__ << " got disk replica but failed to get omap one."
4843+
<< dendl;
4844+
} else if (r_omap < 0 && r_disk < 0) {
4845+
// error to be logged by the caller
4846+
return -ENOENT;
4847+
} else {
4848+
std::swap(superblock, super_omap); // let omap be the primary source
4849+
if (superblock.current_epoch != super_disk.current_epoch) {
4850+
derr << __func__ << " got mismatching superblocks, omap:"
4851+
<< superblock << " vs. disk:" << super_disk
4852+
<< dendl;
4853+
if (superblock.current_epoch < super_disk.current_epoch) {
4854+
std::swap(superblock, super_disk);
4855+
dout(0) << __func__ << " using disk superblock"
4856+
<< dendl;
4857+
} else {
4858+
dout(0) << __func__ << " using omap superblock"
4859+
<< dendl;
4860+
}
4861+
}
4862+
}
48024863

4864+
dout(10) << "read_superblock " << superblock << dendl;
48034865
return 0;
48044866
}
48054867

@@ -6701,7 +6763,7 @@ void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
67016763
m->purged_snaps);
67026764
}
67036765
superblock.purged_snaps_last = m->last;
6704-
write_superblock(t);
6766+
write_superblock(cct, superblock, t);
67056767
store->queue_transaction(
67066768
service.meta_ch,
67076769
std::move(t));
@@ -7185,7 +7247,7 @@ void OSD::scrub_purged_snaps()
71857247
dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
71867248
ObjectStore::Transaction t;
71877249
superblock.last_purged_snaps_scrub = ceph_clock_now();
7188-
write_superblock(t);
7250+
write_superblock(cct, superblock, t);
71897251
int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
71907252
ceph_assert(tr == 0);
71917253
if (is_active()) {
@@ -7899,7 +7961,7 @@ void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
78997961
num++;
79007962
if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
79017963
service.publish_superblock(superblock);
7902-
write_superblock(t);
7964+
write_superblock(cct, superblock, t);
79037965
int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
79047966
ceph_assert(tr == 0);
79057967
num = 0;
@@ -7915,7 +7977,7 @@ void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
79157977
}
79167978
if (num > 0) {
79177979
service.publish_superblock(superblock);
7918-
write_superblock(t);
7980+
write_superblock(cct, superblock, t);
79197981
int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
79207982
ceph_assert(tr == 0);
79217983
}
@@ -8231,7 +8293,19 @@ void OSD::handle_osd_map(MOSDMap *m)
82318293
{
82328294
bufferlist bl;
82338295
::encode(pg_num_history, bl);
8234-
t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
8296+
auto oid = make_pg_num_history_oid();
8297+
t.truncate(coll_t::meta(), oid, 0); // we don't need bytes left if new data
8298+
// block is shorter than the previous
8299+
// one. And better to trim them, e.g.
8300+
// this allows to avoid csum errors
8301+
// when issuing overwrite
8302+
// (which happens to be partial)
8303+
// and original data is corrupted.
8304+
// Another side effect is that the
8305+
// superblock is not permanently
8306+
// anchored to a fixed disk location
8307+
// any more.
8308+
t.write(coll_t::meta(), oid, 0, bl.length(), bl);
82358309
dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
82368310
}
82378311

@@ -8251,7 +8325,7 @@ void OSD::handle_osd_map(MOSDMap *m)
82518325
}
82528326

82538327
// superblock and commit
8254-
write_superblock(t);
8328+
write_superblock(cct, superblock, t);
82558329
t.register_on_commit(new C_OnMapCommit(this, start, last, m));
82568330
store->queue_transaction(
82578331
service.meta_ch,
@@ -8569,7 +8643,7 @@ void OSD::check_osdmap_features()
85698643
dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
85708644
superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
85718645
ObjectStore::Transaction t;
8572-
write_superblock(t);
8646+
write_superblock(cct, superblock, t);
85738647
int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
85748648
ceph_assert(err == 0);
85758649
}

src/osd/OSD.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1235,8 +1235,9 @@ class OSD : public Dispatcher,
12351235
// -- superblock --
12361236
OSDSuperblock superblock;
12371237

1238-
void write_superblock();
1239-
void write_superblock(ObjectStore::Transaction& t);
1238+
static void write_superblock(CephContext* cct,
1239+
OSDSuperblock& sb,
1240+
ObjectStore::Transaction& t);
12401241
int read_superblock();
12411242

12421243
void clear_temp_objects();

src/osd/osd_types.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,7 @@ enum {
374374
// pg stuff
375375

376376
#define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))
377+
#define OSD_SUPERBLOCK_OMAP_KEY "osd_superblock"
377378

378379
// placement seed (a hash value)
379380
typedef uint32_t ps_t;

0 commit comments

Comments
 (0)