@@ -99,15 +99,30 @@ Store::Store(
9999{
100100 ceph_assert (t);
101101
102- const auto err_obj = pgid.make_temp_ghobject (fmt::format (" scrub_{}" , pgid));
103- t->touch (coll, err_obj);
104- errors_db.emplace (pgid, err_obj, OSDriver{&object_store, coll, err_obj});
102+ // shallow errors DB object
103+ const auto sh_err_obj =
104+ pgid.make_temp_ghobject (fmt::format (" scrub_{}" , pgid));
105+ t->touch (coll, sh_err_obj);
106+ shallow_db.emplace (
107+ pgid, sh_err_obj, OSDriver{&object_store, coll, sh_err_obj});
108+
109+ // and the DB for deep errors
110+ const auto dp_err_obj =
111+ pgid.make_temp_ghobject (fmt::format (" deep_scrub_{}" , pgid));
112+ t->touch (coll, dp_err_obj);
113+ deep_db.emplace (pgid, dp_err_obj, OSDriver{&object_store, coll, dp_err_obj});
114+
115+ dout (20 ) << fmt::format (
116+ " created Scrub::Store for pg[{}], shallow: {}, deep: {}" ,
117+ pgid, sh_err_obj, dp_err_obj)
118+ << dendl;
105119}
106120
107121
108122Store::~Store ()
109123{
110- ceph_assert (!errors_db || errors_db->results .empty ());
124+ ceph_assert (!shallow_db || shallow_db->results .empty ());
125+ ceph_assert (!deep_db || deep_db->results .empty ());
111126}
112127
113128
@@ -127,12 +142,49 @@ void Store::add_error(int64_t pool, const inconsistent_obj_wrapper& e)
127142 add_object_error (pool, e);
128143}
129144
145+ namespace {
146+
147+ inconsistent_obj_wrapper create_filtered_copy (
148+ const inconsistent_obj_wrapper& obj,
149+ uint64_t obj_err_mask,
150+ uint64_t shard_err_mask)
151+ {
152+ inconsistent_obj_wrapper dup = obj;
153+ dup.errors &= obj_err_mask;
154+ for (auto & [shard, si] : dup.shards ) {
155+ si.errors &= shard_err_mask;
156+ }
157+ return dup;
158+ }
159+
160+ } // namespace
161+
162+
130163void Store::add_object_error (int64_t pool, const inconsistent_obj_wrapper& e)
131164{
132165 const auto key = to_object_key (pool, e.object );
133- bufferlist bl;
134- e.encode (bl);
135- errors_db->results [key] = bl;
166+ dout (20 ) << fmt::format (
167+ " adding error for object {} ({}). Errors: {} ({}/{}) wr:{}" ,
168+ e.object , key, librados::err_t {e.errors },
169+ librados::err_t {e.errors & librados::err_t ::SHALLOW_ERRORS},
170+ librados::err_t {e.errors & librados::err_t ::DEEP_ERRORS}, e)
171+ << dendl;
172+
173+ // divide the errors & shard errors into shallow and deep.
174+ {
175+ bufferlist bl;
176+ create_filtered_copy (
177+ e, librados::obj_err_t ::SHALLOW_ERRORS, librados::err_t ::SHALLOW_ERRORS)
178+ .encode (bl);
179+ shallow_db->results [key] = bl;
180+ }
181+ {
182+ bufferlist bl;
183+ create_filtered_copy (
184+ e, librados::obj_err_t ::DEEP_ERRORS, librados::err_t ::DEEP_ERRORS)
185+ .encode (bl);
186+ deep_db->results [key] = bl;
187+ }
136188}
137189
138190
@@ -144,23 +196,29 @@ void Store::add_error(int64_t pool, const inconsistent_snapset_wrapper& e)
144196
145197void Store::add_snap_error (int64_t pool, const inconsistent_snapset_wrapper& e)
146198{
147- errors_db->results [to_snap_key (pool, e.object )] = e.encode ();
199+ // note: snap errors are only placed in the shallow store
200+ shallow_db->results [to_snap_key (pool, e.object )] = e.encode ();
148201}
149202
150203
151204bool Store::is_empty () const
152205{
153- return !errors_db || errors_db->results .empty ();
206+ return (!shallow_db || shallow_db->results .empty ()) &&
207+ (!deep_db || deep_db->results .empty ());
154208}
155209
156210
157211void Store::flush (ObjectStore::Transaction* t)
158212{
159213 if (t) {
160- auto txn = errors_db->driver .get_transaction (t);
161- errors_db->backend .set_keys (errors_db->results , &txn);
214+ auto txn = shallow_db->driver .get_transaction (t);
215+ shallow_db->backend .set_keys (shallow_db->results , &txn);
216+ txn = deep_db->driver .get_transaction (t);
217+ deep_db->backend .set_keys (deep_db->results , &txn);
162218 }
163- errors_db->results .clear ();
219+
220+ shallow_db->results .clear ();
221+ deep_db->results .clear ();
164222}
165223
166224
@@ -184,18 +242,23 @@ void Store::clear_level_db(
184242
185243void Store::reinit (
186244 ObjectStore::Transaction* t,
187- [[maybe_unused]] scrub_level_t level)
245+ scrub_level_t level)
188246{
247+ // Note: only one caller, and it creates the transaction passed to reinit().
248+ // No need to assert on 't'
189249 dout (20 ) << fmt::format (
190250 " re-initializing the Scrub::Store (for {} scrub)" ,
191251 (level == scrub_level_t ::deep ? " deep" : " shallow" ))
192252 << dendl;
193253
194- // Note: only one caller, and it creates the transaction passed to reinit().
195- // No need to assert on 't'
196-
197- if (errors_db) {
198- clear_level_db (t, *errors_db, " scrub" );
254+ // always clear the known shallow errors DB (as both shallow and deep scrubs
255+ // would recreate it)
256+ if (shallow_db) {
257+ clear_level_db (t, *shallow_db, " shallow" );
258+ }
259+ // only a deep scrub recreates the deep errors DB
260+ if (level == scrub_level_t ::deep && deep_db) {
261+ clear_level_db (t, *deep_db, " deep" );
199262 }
200263}
201264
@@ -204,8 +267,10 @@ void Store::cleanup(ObjectStore::Transaction* t)
204267{
205268 dout (20 ) << " discarding error DBs" << dendl;
206269 ceph_assert (t);
207- if (errors_db)
208- t->remove (coll, errors_db->errors_hoid );
270+ if (shallow_db)
271+ t->remove (coll, shallow_db->errors_hoid );
272+ if (deep_db)
273+ t->remove (coll, deep_db->errors_hoid );
209274}
210275
211276
@@ -214,42 +279,180 @@ std::vector<bufferlist> Store::get_snap_errors(
214279 const librados::object_id_t & start,
215280 uint64_t max_return) const
216281{
217- const string begin = (start.name .empty () ?
218- first_snap_key (pool) : to_snap_key (pool, start));
282+ vector<bufferlist> errors;
283+ const string begin =
284+ (start.name .empty () ? first_snap_key (pool) : to_snap_key (pool, start));
219285 const string end = last_snap_key (pool);
220- return get_errors (begin, end, max_return);
286+
287+ // the snap errors are stored only in the shallow store
288+ ExpCacherPosData latest_sh = shallow_db->backend .get_1st_after_key (begin);
289+
290+ while (max_return-- && latest_sh.has_value () && latest_sh->last_key < end) {
291+ errors.push_back (latest_sh->data );
292+ latest_sh = shallow_db->backend .get_1st_after_key (latest_sh->last_key );
293+ }
294+
295+ return errors;
221296}
222297
223- std::vector<bufferlist>
224- Store::get_object_errors (int64_t pool,
225- const librados::object_id_t & start,
226- uint64_t max_return) const
298+
299+ std::vector<bufferlist> Store::get_object_errors (
300+ int64_t pool,
301+ const librados::object_id_t & start,
302+ uint64_t max_return) const
227303{
228- const string begin = (start.name .empty () ?
229- first_object_key (pool) : to_object_key (pool, start));
304+ const string begin =
305+ (start.name .empty () ? first_object_key (pool)
306+ : to_object_key (pool, start));
230307 const string end = last_object_key (pool);
308+ dout (20 ) << fmt::format (" fetching errors, from {} to {}" , begin, end)
309+ << dendl;
231310 return get_errors (begin, end, max_return);
232311}
233312
234- std::vector<bufferlist>
235- Store::get_errors ( const string& begin,
236- const string& end ,
237- uint64_t max_return) const
313+
314+ inline void decode (
315+ librados:: inconsistent_obj_t & obj ,
316+ ceph::buffer::list::const_iterator& bp)
238317{
318+ reinterpret_cast <inconsistent_obj_wrapper&>(obj).decode (bp);
319+ }
320+
321+
322+ inconsistent_obj_wrapper decode_wrapper (
323+ hobject_t obj,
324+ ceph::buffer::list::const_iterator bp)
325+ {
326+ inconsistent_obj_wrapper iow{obj};
327+ iow.decode (bp);
328+ return iow;
329+ }
330+
331+
332+ void Store::collect_specific_store (
333+ MapCacher::MapCacher<std::string, ceph::buffer::list>& backend,
334+ Store::ExpCacherPosData& latest,
335+ std::vector<bufferlist>& errors,
336+ std::string_view end_key,
337+ uint64_t max_return) const
338+ {
339+ while (max_return-- && latest.has_value () &&
340+ latest.value ().last_key < end_key) {
341+ errors.push_back (latest->data );
342+ latest = backend.get_1st_after_key (latest->last_key );
343+ }
344+ }
345+
346+
347+ bufferlist Store::merge_encoded_error_wrappers (
348+ hobject_t obj,
349+ ExpCacherPosData& latest_sh,
350+ ExpCacherPosData& latest_dp) const
351+ {
352+ // decode both error wrappers
353+ auto sh_wrap = decode_wrapper (obj, latest_sh->data .cbegin ());
354+ auto dp_wrap = decode_wrapper (obj, latest_dp->data .cbegin ());
355+ dout (20 ) << fmt::format (
356+ " merging errors {}. Shallow: {}-({}), Deep: {}-({})" ,
357+ sh_wrap.object , sh_wrap.errors , dp_wrap.errors , sh_wrap,
358+ dp_wrap)
359+ << dendl;
360+
361+ // merge the object errors (a simple OR of the two error bit-sets)
362+ sh_wrap.errors |= dp_wrap.errors ;
363+
364+ // merge the two shard error maps
365+ for (const auto & [shard, si] : dp_wrap.shards ) {
366+ dout (20 ) << fmt::format (
367+ " shard {} dp-errors: {} sh-errors:{}" , shard, si.errors ,
368+ sh_wrap.shards [shard].errors )
369+ << dendl;
370+ // note: we may be creating the shallow shard entry here. This is OK
371+ sh_wrap.shards [shard].errors |= si.errors ;
372+ }
373+
374+ return sh_wrap.encode ();
375+ }
376+
377+
378+ // a better way to implement get_errors(): use two generators, one for each store.
379+ // and sort-merge the results. Almost like a merge-sort, but with equal
380+ // keys combined. 'todo' once 'ranges' are really working.
381+
382+ std::vector<bufferlist> Store::get_errors (
383+ const std::string& from_key,
384+ const std::string& end_key,
385+ uint64_t max_return) const
386+ {
387+ // merge the input from the two sorted DBs into 'errors' (until
388+ // enough errors are collected)
239389 vector<bufferlist> errors;
240- if (!errors_db )
241- return errors ;
390+ dout ( 20 ) << fmt::format ( " getting errors from {} to {} " , from_key, end_key )
391+ << dendl ;
242392
243- auto next = std::make_pair (begin, bufferlist{});
244- while (max_return && !errors_db->backend .get_next (next.first , &next)) {
245- if (next.first >= end)
393+ ceph_assert (shallow_db);
394+ ceph_assert (deep_db);
395+ ExpCacherPosData latest_sh = shallow_db->backend .get_1st_after_key (from_key);
396+ ExpCacherPosData latest_dp = deep_db->backend .get_1st_after_key (from_key);
397+
398+ while (max_return) {
399+ dout (20 ) << fmt::format (
400+ " n:{} latest_sh: {}, latest_dp: {}" , max_return,
401+ (latest_sh ? latest_sh->last_key : " (none)" ),
402+ (latest_dp ? latest_dp->last_key : " (none)" ))
403+ << dendl;
404+
405+ // keys not smaller than end_key are not interesting
406+ if (latest_sh.has_value () && latest_sh->last_key >= end_key) {
407+ latest_sh = tl::unexpected (-EINVAL);
408+ }
409+ if (latest_dp.has_value () && latest_dp->last_key >= end_key) {
410+ latest_dp = tl::unexpected (-EINVAL);
411+ }
412+
413+ if (!latest_sh && !latest_dp) {
414+ // both stores are exhausted
415+ break ;
416+ }
417+ if (!latest_sh.has_value ()) {
418+ // continue with the deep store
419+ dout (10 ) << fmt::format (" collecting from deep store" ) << dendl;
420+ collect_specific_store (
421+ deep_db->backend , latest_dp, errors, end_key, max_return);
246422 break ;
247- errors.push_back (next.second );
423+ }
424+ if (!latest_dp.has_value ()) {
425+ // continue with the shallow store
426+ dout (10 ) << fmt::format (" collecting from shallow store" ) << dendl;
427+ collect_specific_store (
428+ shallow_db->backend , latest_sh, errors, end_key, max_return);
429+ break ;
430+ }
431+
432+ // we have results from both stores. Select the one with a lower key.
433+ // If the keys are equal, combine the errors.
434+ if (latest_sh->last_key == latest_dp->last_key ) {
435+ auto bl = merge_encoded_error_wrappers (
436+ shallow_db->errors_hoid .hobj , latest_sh, latest_dp);
437+ errors.push_back (bl);
438+ latest_sh = shallow_db->backend .get_1st_after_key (latest_sh->last_key );
439+ latest_dp = deep_db->backend .get_1st_after_key (latest_dp->last_key );
440+
441+ } else if (latest_sh->last_key < latest_dp->last_key ) {
442+ dout (20 ) << fmt::format (" shallow store element ({})" , latest_sh->last_key )
443+ << dendl;
444+ errors.push_back (latest_sh->data );
445+ latest_sh = shallow_db->backend .get_1st_after_key (latest_sh->last_key );
446+ } else {
447+ dout (20 ) << fmt::format (" deep store element ({})" , latest_dp->last_key )
448+ << dendl;
449+ errors.push_back (latest_dp->data );
450+ latest_dp = deep_db->backend .get_1st_after_key (latest_dp->last_key );
451+ }
248452 max_return--;
249453 }
250454
251455 dout (10 ) << fmt::format (" {} errors reported" , errors.size ()) << dendl;
252456 return errors;
253457}
254-
255- } // namespace Scrub
458+ } // namespace Scrub
0 commit comments