@@ -2098,6 +2098,22 @@ int heap(CephContext& cct,
20982098
20992099} // namespace ceph::osd_cmds
21002100
// Stage a superblock update in transaction `t`: the encoded `sb` is stored
// twice on OSD_SUPERBLOCK_GOBJECT in coll_t::meta(), once as object data and
// once as an omap value under OSD_SUPERBLOCK_OMAP_KEY.
//
// cct: not referenced by name in this body; presumably consumed by the
//      dout logging macro -- TODO confirm.
// sb:  superblock to persist. Taken by non-const reference because
//      CEPH_OSD_FEATURE_INCOMPAT_BASE is inserted into its incompat set
//      when absent, so the caller's object may be modified.
// t:   transaction that receives the ops. Nothing is queued here; the call
//      sites visible in this change queue `t` themselves afterwards.
2101+ void OSD::write_superblock (CephContext* cct, OSDSuperblock& sb, ObjectStore::Transaction& t)
2102+ {
2103+ dout (10 ) << " write_superblock " << sb << dendl;
2104+
2105+ // hack: at minimum it's using the baseline feature set
2106+ if (!sb.compat_features .incompat .contains (CEPH_OSD_FEATURE_INCOMPAT_BASE))
2107+ sb.compat_features .incompat .insert (CEPH_OSD_FEATURE_INCOMPAT_BASE);
2108+
// Encode once; the same buffer feeds both the data write and the omap key.
2109+ bufferlist bl;
2110+ encode (sb, bl);
// NOTE(review): this write starts at offset 0 and is not preceded by a
// truncate. The handle_osd_map hunk in this change adds a truncate before
// rewriting the pg_num_history object, with a comment that talks about the
// superblock -- confirm whether the same truncate was intended here.
2111+ t.write (coll_t::meta (), OSD_SUPERBLOCK_GOBJECT, 0 , bl.length (), bl);
// Second copy of the encoded superblock, kept in the object's omap.
2112+ std::map<std::string, ceph::buffer::list> attrs;
2113+ attrs.emplace (OSD_SUPERBLOCK_OMAP_KEY, bl);
2114+ t.omap_setkeys (coll_t::meta (), OSD_SUPERBLOCK_GOBJECT, attrs);
2115+ }
2116+
21012117int OSD::mkfs (CephContext *cct,
21022118 std::unique_ptr<ObjectStore> store,
21032119 uuid_d fsid,
@@ -2159,15 +2175,11 @@ int OSD::mkfs(CephContext *cct,
21592175 sb.osd_fsid = store->get_fsid ();
21602176 sb.whoami = whoami;
21612177 sb.compat_features = get_osd_initial_compat_set ();
2162-
2163- bufferlist bl;
2164- encode (sb, bl);
2165-
21662178 ObjectStore::CollectionHandle ch = store->create_new_collection (
21672179 coll_t::meta ());
21682180 ObjectStore::Transaction t;
21692181 t.create_collection (coll_t::meta (), 0 );
2170- t. write ( coll_t::meta (), OSD_SUPERBLOCK_GOBJECT, 0 , bl. length (), bl );
2182+ write_superblock (cct, sb, t );
21712183 ret = store->queue_transaction (ch, std::move (t));
21722184 if (ret) {
21732185 derr << " OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
@@ -3768,7 +3780,7 @@ int OSD::init()
37683780 }
37693781
37703782 ObjectStore::Transaction t;
3771- write_superblock (t);
3783+ write_superblock (cct, superblock, t);
37723784 r = store->queue_transaction (service.meta_ch , std::move (t));
37733785 if (r < 0 )
37743786 goto out;
@@ -4578,7 +4590,7 @@ int OSD::shutdown()
45784590 superblock.mounted = service.get_boot_epoch ();
45794591 superblock.clean_thru = get_osdmap_epoch ();
45804592 ObjectStore::Transaction t;
4581- write_superblock (t);
4593+ write_superblock (cct, superblock, t);
45824594 int r = store->queue_transaction (service.meta_ch , std::move (t));
45834595 if (r) {
45844596 derr << " OSD::shutdown: error writing superblock: "
@@ -4775,31 +4787,81 @@ int OSD::update_crush_device_class()
47754787 }
47764788}
47774789
4778- void OSD::write_superblock (ObjectStore::Transaction& t)
4779- {
4780- dout (10 ) << " write_superblock " << superblock << dendl;
4781-
4782- // hack: at minimum it's using the baseline feature set
4783- if (!superblock.compat_features .incompat .contains (CEPH_OSD_FEATURE_INCOMPAT_BASE))
4784- superblock.compat_features .incompat .insert (CEPH_OSD_FEATURE_INCOMPAT_BASE);
4785-
4786- bufferlist bl;
4787- encode (superblock, bl);
4788- t.write (coll_t::meta (), OSD_SUPERBLOCK_GOBJECT, 0 , bl.length (), bl);
4789- }
47904790
// Load the OSD superblock into `superblock` from OSD_SUPERBLOCK_GOBJECT,
// trying two copies: the omap value stored under OSD_SUPERBLOCK_OMAP_KEY and
// the object's data.
//
// Returns 0 when at least one copy was read and decoded, -ENOENT when
// neither was usable. When both decode, the omap copy is used unless the
// data copy has a larger current_epoch.
//
// NOTE(review): the both-failed path returns -ENOENT even if one or both
// copies were present but failed to decode (-EFAULT) -- confirm callers do
// not need to tell "missing" from "corrupted".
47914791int OSD::read_superblock ()
47924792{
4793+ // Read superblock from both object data and omap metadata
4794+ // for better robustness.
4795+ // Use the most recent superblock replica if obtained versions
4796+ // mismatch.
47934797 bufferlist bl;
4794- int r = store->read (service.meta_ch , OSD_SUPERBLOCK_GOBJECT, 0 , 0 , bl);
4795- if (r < 0 )
4796- return r;
47974798
4798- auto p = bl.cbegin ();
4799- decode (superblock, p);
4799+ set<string> keys;
4800+ keys.insert (OSD_SUPERBLOCK_OMAP_KEY);
4801+ map<string, bufferlist> vals;
// Each copy is decoded into its own temporary; `superblock` is only
// touched once a winner has been chosen below.
4802+ OSDSuperblock super_omap;
4803+ OSDSuperblock super_disk;
// --- copy 1: omap value ---
4804+ int r_omap = store->omap_get_values (
4805+ service.meta_ch , OSD_SUPERBLOCK_GOBJECT, keys, &vals);
4806+ if (r_omap >= 0 && vals.size () > 0 ) {
4807+ try {
4808+ auto p = vals.begin ()->second .cbegin ();
4809+ decode (super_omap, p);
// Any exception thrown by decode is treated as corruption; the exception
// itself is not inspected or logged.
4810+ } catch (...) {
4811+ derr << __func__ << " omap replica is corrupted."
4812+ << dendl;
4813+ r_omap = -EFAULT;
4814+ }
4815+ } else {
// Reached both for a negative return from omap_get_values and for a
// successful call that yielded no values; the store's own error code, if
// any, is replaced by -ENOENT.
4816+ derr << __func__ << " omap replica is missing."
4817+ << dendl;
4818+ r_omap = -ENOENT;
4819+ }
// --- copy 2: object data ---
// Offset 0 / length 0: presumably "read the whole object" -- TODO confirm
// against the ObjectStore::read contract.
4820+ int r_disk = store->read (service.meta_ch , OSD_SUPERBLOCK_GOBJECT, 0 , 0 , bl);
4821+ if (r_disk >= 0 ) {
4822+ try {
4823+ auto p = bl.cbegin ();
4824+ decode (super_disk, p);
4825+ } catch (...) {
4826+ derr << __func__ << " disk replica is corrupted."
4827+ << dendl;
4828+ r_disk = -EFAULT;
4829+ }
4830+ } else {
// Any negative return from store->read is reported as "missing" and
// recorded as -ENOENT, whatever the original error code was.
4831+ derr << __func__ << " disk replica is missing."
4832+ << dendl;
4833+ r_disk = -ENOENT;
4834+ }
48004835
4801- dout (10 ) << " read_superblock " << superblock << dendl;
// --- pick the copy to use ---
4836+ if (r_omap >= 0 && r_disk < 0 ) {
4837+ std::swap (superblock, super_omap);
4838+ dout (1 ) << __func__ << " got omap replica but failed to get disk one."
4839+ << dendl;
4840+ } else if (r_omap < 0 && r_disk >= 0 ) {
4841+ std::swap (superblock, super_disk);
4842+ dout (1 ) << __func__ << " got disk replica but failed to get omap one."
4843+ << dendl;
4844+ } else if (r_omap < 0 && r_disk < 0 ) {
4845+ // error to be logged by the caller
4846+ return -ENOENT;
4847+ } else {
4848+ std::swap (superblock, super_omap); // let omap be the primary source
// Only current_epoch is compared; other fields that differ between the two
// copies are not examined here.
4849+ if (superblock.current_epoch != super_disk.current_epoch ) {
4850+ derr << __func__ << " got mismatching superblocks, omap:"
4851+ << superblock << " vs. disk:" << super_disk
4852+ << dendl;
4853+ if (superblock.current_epoch < super_disk.current_epoch ) {
4854+ std::swap (superblock, super_disk);
4855+ dout (0 ) << __func__ << " using disk superblock"
4856+ << dendl;
4857+ } else {
4858+ dout (0 ) << __func__ << " using omap superblock"
4859+ << dendl;
4860+ }
4861+ }
4862+ }
48024863
4864+ dout (10 ) << " read_superblock " << superblock << dendl;
48034865 return 0 ;
48044866}
48054867
@@ -6701,7 +6763,7 @@ void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
67016763 m->purged_snaps );
67026764 }
67036765 superblock.purged_snaps_last = m->last ;
6704- write_superblock (t);
6766+ write_superblock (cct, superblock, t);
67056767 store->queue_transaction (
67066768 service.meta_ch ,
67076769 std::move (t));
@@ -7185,7 +7247,7 @@ void OSD::scrub_purged_snaps()
71857247 dout (10 ) << __func__ << " done queueing pgs, updating superblock" << dendl;
71867248 ObjectStore::Transaction t;
71877249 superblock.last_purged_snaps_scrub = ceph_clock_now ();
7188- write_superblock (t);
7250+ write_superblock (cct, superblock, t);
71897251 int tr = store->queue_transaction (service.meta_ch , std::move (t), nullptr );
71907252 ceph_assert (tr == 0 );
71917253 if (is_active ()) {
@@ -7899,7 +7961,7 @@ void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
78997961 num++;
79007962 if (num >= cct->_conf ->osd_target_transaction_size && num >= nreceived) {
79017963 service.publish_superblock (superblock);
7902- write_superblock (t);
7964+ write_superblock (cct, superblock, t);
79037965 int tr = store->queue_transaction (service.meta_ch , std::move (t), nullptr );
79047966 ceph_assert (tr == 0 );
79057967 num = 0 ;
@@ -7915,7 +7977,7 @@ void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
79157977 }
79167978 if (num > 0 ) {
79177979 service.publish_superblock (superblock);
7918- write_superblock (t);
7980+ write_superblock (cct, superblock, t);
79197981 int tr = store->queue_transaction (service.meta_ch , std::move (t), nullptr );
79207982 ceph_assert (tr == 0 );
79217983 }
@@ -8231,7 +8293,19 @@ void OSD::handle_osd_map(MOSDMap *m)
82318293 {
82328294 bufferlist bl;
82338295 ::encode (pg_num_history, bl);
8234- t.write (coll_t::meta (), make_pg_num_history_oid (), 0 , bl.length (), bl);
8296+ auto oid = make_pg_num_history_oid ();
8297+ t.truncate (coll_t::meta (), oid, 0 ); // we don't need bytes left if new data
8298+ // block is shorter than the previous
8299+ // one. And better to trim them, e.g.
8300+ // this allows to avoid csum errors
8301+ // when issuing overwrite
8302+ // (which happens to be partial)
8303+ // and original data is corrupted.
8304+ // Another side effect is that the
8305+ // superblock is not permanently
8306+ // anchored to a fixed disk location
8307+ // any more.
8308+ t.write (coll_t::meta (), oid, 0 , bl.length (), bl);
82358309 dout (20 ) << __func__ << " pg_num_history " << pg_num_history << dendl;
82368310 }
82378311
@@ -8251,7 +8325,7 @@ void OSD::handle_osd_map(MOSDMap *m)
82518325 }
82528326
82538327 // superblock and commit
8254- write_superblock (t);
8328+ write_superblock (cct, superblock, t);
82558329 t.register_on_commit (new C_OnMapCommit (this , start, last, m));
82568330 store->queue_transaction (
82578331 service.meta_ch ,
@@ -8569,7 +8643,7 @@ void OSD::check_osdmap_features()
85698643 dout (0 ) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
85708644 superblock.compat_features .incompat .insert (CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
85718645 ObjectStore::Transaction t;
8572- write_superblock (t);
8646+ write_superblock (cct, superblock, t);
85738647 int err = store->queue_transaction (service.meta_ch , std::move (t), NULL );
85748648 ceph_assert (err == 0 );
85758649 }
0 commit comments