@@ -2056,6 +2056,22 @@ int heap(CephContext& cct,
20562056
20572057} // namespace ceph::osd_cmds
20582058
2059+ void OSD::write_superblock (CephContext* cct, OSDSuperblock& sb, ObjectStore::Transaction& t) // Encodes sb once and stages two copies of it in t (object data + omap value); it does not queue t itself.
2060+ { // cct is never named in this body; presumably the dout/dendl logging macros pick it up -- TODO confirm.
2061+ dout (10 ) << " write_superblock " << sb << dendl;
2062+
2063+ // hack: at minimum it's using the baseline feature set
2064+ if (!sb.compat_features .incompat .contains (CEPH_OSD_FEATURE_INCOMPAT_BASE)) // sb is a non-const reference, so the caller's superblock is modified here
2065+ sb.compat_features .incompat .insert (CEPH_OSD_FEATURE_INCOMPAT_BASE);
2066+
2067+ bufferlist bl;
2068+ encode (sb, bl); // single encoding, reused for both copies staged below
2069+ t.write (coll_t::meta (), OSD_SUPERBLOCK_GOBJECT, 0 , bl.length (), bl); // copy 1: object data at offset 0. NOTE(review): no truncate precedes this write; confirm whether leftover tail bytes matter if the encoding ever shrinks.
2070+ std::map<std::string, ceph::buffer::list> attrs;
2071+ attrs.emplace (OSD_SUPERBLOCK_OMAP_KEY, bl); // copy 2: the same encoded buffer, keyed for the object's omap
2072+ t.omap_setkeys (coll_t::meta (), OSD_SUPERBLOCK_GOBJECT, attrs);
2073+ }
2074+
20592075int OSD::mkfs (CephContext *cct,
20602076 std::unique_ptr<ObjectStore> store,
20612077 uuid_d fsid,
@@ -2117,15 +2133,11 @@ int OSD::mkfs(CephContext *cct,
21172133 sb.osd_fsid = store->get_fsid ();
21182134 sb.whoami = whoami;
21192135 sb.compat_features = get_osd_initial_compat_set ();
2120-
2121- bufferlist bl;
2122- encode (sb, bl);
2123-
21242136 ObjectStore::CollectionHandle ch = store->create_new_collection (
21252137 coll_t::meta ());
21262138 ObjectStore::Transaction t;
21272139 t.create_collection (coll_t::meta (), 0 );
2128- t. write ( coll_t::meta (), OSD_SUPERBLOCK_GOBJECT, 0 , bl. length (), bl );
2140+ write_superblock (cct, sb, t );
21292141 ret = store->queue_transaction (ch, std::move (t));
21302142 if (ret) {
21312143 derr << " OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
@@ -3726,7 +3738,7 @@ int OSD::init()
37263738 }
37273739
37283740 ObjectStore::Transaction t;
3729- write_superblock (t);
3741+ write_superblock (cct, superblock, t);
37303742 r = store->queue_transaction (service.meta_ch , std::move (t));
37313743 if (r < 0 )
37323744 goto out;
@@ -4536,7 +4548,7 @@ int OSD::shutdown()
45364548 superblock.mounted = service.get_boot_epoch ();
45374549 superblock.clean_thru = get_osdmap_epoch ();
45384550 ObjectStore::Transaction t;
4539- write_superblock (t);
4551+ write_superblock (cct, superblock, t);
45404552 int r = store->queue_transaction (service.meta_ch , std::move (t));
45414553 if (r) {
45424554 derr << " OSD::shutdown: error writing superblock: "
@@ -4733,25 +4745,35 @@ int OSD::update_crush_device_class()
47334745 }
47344746}
47354747
4736- void OSD::write_superblock (ObjectStore::Transaction& t)
4737- {
4738- dout (10 ) << " write_superblock " << superblock << dendl;
4739-
4740- // hack: at minimum it's using the baseline feature set
4741- if (!superblock.compat_features .incompat .contains (CEPH_OSD_FEATURE_INCOMPAT_BASE))
4742- superblock.compat_features .incompat .insert (CEPH_OSD_FEATURE_INCOMPAT_BASE);
4743-
4744- bufferlist bl;
4745- encode (superblock, bl);
4746- t.write (coll_t::meta (), OSD_SUPERBLOCK_GOBJECT, 0 , bl.length (), bl);
4747- }
47484748
47494749int OSD::read_superblock ()
47504750{
47514751 bufferlist bl;
4752- int r = store->read (service.meta_ch , OSD_SUPERBLOCK_GOBJECT, 0 , 0 , bl);
4753- if (r < 0 )
4754- return r;
4752+
4753+ set<string> keys;
4754+ keys.insert (OSD_SUPERBLOCK_OMAP_KEY);
4755+ map<string, bufferlist> vals;
4756+ // Read from OMAP first to better handle the
4757+ // "recover-after-an-error" case, where the main OSD volume data
4758+ // is partially corrupted (csums don't match for a bunch of onodes).
4759+ // In that case one might set the bluestore_ignore_csum_error option, which
4760+ // silences disk read errors.
4761+ // A read of a corrupted on-disk superblock would then miss the error as well
4762+ // and would never attempt to use the still-valid OMAP replica.
4763+ // Hence we prefer the OMAP read over the disk one.
4764+ int r = store->omap_get_values (
4765+ service.meta_ch , OSD_SUPERBLOCK_GOBJECT, keys, &vals);
4766+ if (r < 0 || vals.size () == 0 ) {
4767+ dout (10 ) << __func__ << " attempt reading from disk replica" << dendl;
4768+
4769+ r = store->read (service.meta_ch , OSD_SUPERBLOCK_GOBJECT, 0 , 0 , bl);
4770+ if (r < 0 ) {
4771+ return -ENOENT;
4772+ }
4773+ dout (10 ) << __func__ << " got disk replica" << dendl;
4774+ } else {
4775+ std::swap (bl, vals.begin ()->second );
4776+ }
47554777
47564778 auto p = bl.cbegin ();
47574779 decode (superblock, p);
@@ -6659,7 +6681,7 @@ void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
66596681 m->purged_snaps );
66606682 }
66616683 superblock.purged_snaps_last = m->last ;
6662- write_superblock (t);
6684+ write_superblock (cct, superblock, t);
66636685 store->queue_transaction (
66646686 service.meta_ch ,
66656687 std::move (t));
@@ -7143,7 +7165,7 @@ void OSD::scrub_purged_snaps()
71437165 dout (10 ) << __func__ << " done queueing pgs, updating superblock" << dendl;
71447166 ObjectStore::Transaction t;
71457167 superblock.last_purged_snaps_scrub = ceph_clock_now ();
7146- write_superblock (t);
7168+ write_superblock (cct, superblock, t);
71477169 int tr = store->queue_transaction (service.meta_ch , std::move (t), nullptr );
71487170 ceph_assert (tr == 0 );
71497171 if (is_active ()) {
@@ -7856,7 +7878,7 @@ void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
78567878 num++;
78577879 if (num >= cct->_conf ->osd_target_transaction_size && num >= nreceived) {
78587880 service.publish_superblock (superblock);
7859- write_superblock (t);
7881+ write_superblock (cct, superblock, t);
78607882 int tr = store->queue_transaction (service.meta_ch , std::move (t), nullptr );
78617883 ceph_assert (tr == 0 );
78627884 num = 0 ;
@@ -7872,7 +7894,7 @@ void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
78727894 }
78737895 if (num > 0 ) {
78747896 service.publish_superblock (superblock);
7875- write_superblock (t);
7897+ write_superblock (cct, superblock, t);
78767898 int tr = store->queue_transaction (service.meta_ch , std::move (t), nullptr );
78777899 ceph_assert (tr == 0 );
78787900 }
@@ -8184,7 +8206,19 @@ void OSD::handle_osd_map(MOSDMap *m)
81848206 {
81858207 bufferlist bl;
81868208 ::encode (pg_num_history, bl);
8187- t.write (coll_t::meta (), make_pg_num_history_oid (), 0 , bl.length (), bl);
8209+ auto oid = make_pg_num_history_oid ();
8210+ t.truncate (coll_t::meta (), oid, 0 ); // we don't need bytes left if new data
8211+ // block is shorter than the previous
8212+ // one. And better to trim them, e.g.
8213+ // this allows to avoid csum errors
8214+ // when issuing overwrite
8215+ // (which happens to be partial)
8216+ // and original data is corrupted.
8217+ // Another side effect is that the
8218+ // superblock is not permanently
8219+ // anchored to a fixed disk location
8220+ // any more.
8221+ t.write (coll_t::meta (), oid, 0 , bl.length (), bl);
81888222 dout (20 ) << __func__ << " pg_num_history " << pg_num_history << dendl;
81898223 }
81908224
@@ -8204,7 +8238,7 @@ void OSD::handle_osd_map(MOSDMap *m)
82048238 }
82058239
82068240 // superblock and commit
8207- write_superblock (t);
8241+ write_superblock (cct, superblock, t);
82088242 t.register_on_commit (new C_OnMapCommit (this , start, last, m));
82098243 store->queue_transaction (
82108244 service.meta_ch ,
@@ -8522,7 +8556,7 @@ void OSD::check_osdmap_features()
85228556 dout (0 ) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
85238557 superblock.compat_features .incompat .insert (CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
85248558 ObjectStore::Transaction t;
8525- write_superblock (t);
8559+ write_superblock (cct, superblock, t);
85268560 int err = store->queue_transaction (service.meta_ch , std::move (t), NULL );
85278561 ceph_assert (err == 0 );
85288562 }
0 commit comments