@@ -4748,38 +4748,78 @@ int OSD::update_crush_device_class()
47484748
47494749int OSD::read_superblock ()
47504750{
4751+ // Read superblock from both object data and omap metadata
4752+ // for better robustness.
4753+ // Use the most recent superblock replica if obtained versions
4754+ // mismatch.
47514755 bufferlist bl;
4752-
4756+
47534757 set<string> keys;
47544758 keys.insert (OSD_SUPERBLOCK_OMAP_KEY);
47554759 map<string, bufferlist> vals;
4756- // Let's read from OMAP first to be able to better handle
4757- // "recover-after-an-error' case when main OSD volume data
4758- // is partially corrupted (csums don't match for a bunch of onodes).
4759- // As a result we might want to set bluestore_ignore_csum_error option which
4760- // will silent disk read errors.
4761- // Clearly such a reading from corrupted superblock will miss an error as well
4762- // and it wouldn't attempt to use still valid OMAP's replica.
4763- // Hence preferring omap reading over disk one.
4764- int r = store->omap_get_values (
4760+ OSDSuperblock super_omap;
4761+ OSDSuperblock super_disk;
4762+ int r_omap = store->omap_get_values (
47654763 service.meta_ch , OSD_SUPERBLOCK_GOBJECT, keys, &vals);
4766- if (r < 0 || vals.size () == 0 ) {
4767- dout (10 ) << __func__ << " attempt reading from disk replica" << dendl;
4768-
4769- r = store->read (service.meta_ch , OSD_SUPERBLOCK_GOBJECT, 0 , 0 , bl);
4770- if (r < 0 ) {
4771- return -ENOENT;
4764+ if (r_omap >= 0 && vals.size () > 0 ) {
4765+ try {
4766+ auto p = vals.begin ()->second .cbegin ();
4767+ decode (super_omap, p);
4768+ } catch (...) {
4769+ derr << __func__ << " omap replica is corrupted."
4770+ << dendl;
4771+ r_omap = -EFAULT;
4772+ }
4773+ } else {
4774+ derr << __func__ << " omap replica is missing."
4775+ << dendl;
4776+ r_omap = -ENOENT;
4777+ }
4778+ int r_disk = store->read (service.meta_ch , OSD_SUPERBLOCK_GOBJECT, 0 , 0 , bl);
4779+ if (r_disk >= 0 ) {
4780+ try {
4781+ auto p = bl.cbegin ();
4782+ decode (super_disk, p);
4783+ } catch (...) {
4784+ derr << __func__ << " disk replica is corrupted."
4785+ << dendl;
4786+ r_disk = -EFAULT;
47724787 }
4773- dout (10 ) << __func__ << " got disk replica" << dendl;
47744788 } else {
4775- std::swap (bl, vals.begin ()->second );
4789+ derr << __func__ << " disk replica is missing."
4790+ << dendl;
4791+ r_disk = -ENOENT;
47764792 }
47774793
4778- auto p = bl.cbegin ();
4779- decode (superblock, p);
4794+ if (r_omap >= 0 && r_disk < 0 ) {
4795+ std::swap (superblock, super_omap);
4796+ dout (1 ) << __func__ << " got omap replica but failed to get disk one."
4797+ << dendl;
4798+ } else if (r_omap < 0 && r_disk >= 0 ) {
4799+ std::swap (superblock, super_disk);
4800+ dout (1 ) << __func__ << " got disk replica but failed to get omap one."
4801+ << dendl;
4802+ } else if (r_omap < 0 && r_disk < 0 ) {
4803+ // error to be logged by the caller
4804+ return -ENOENT;
4805+ } else {
4806+ std::swap (superblock, super_omap); // let omap be the primary source
4807+ if (superblock.current_epoch != super_disk.current_epoch ) {
4808+ derr << __func__ << " got mismatching superblocks, omap:"
4809+ << superblock << " vs. disk:" << super_disk
4810+ << dendl;
4811+ if (superblock.current_epoch < super_disk.current_epoch ) {
4812+ std::swap (superblock, super_disk);
4813+ dout (0 ) << __func__ << " using disk superblock"
4814+ << dendl;
4815+ } else {
4816+ dout (0 ) << __func__ << " using omap superblock"
4817+ << dendl;
4818+ }
4819+ }
4820+ }
47804821
47814822 dout (10 ) << " read_superblock " << superblock << dendl;
4782-
47834823 return 0 ;
47844824}
47854825
0 commit comments