Skip to content

Commit daf848f

Browse files
committed
osd/scrub: separate shallow vs deep errors storage
The ScrubStore now holds two ScrubStore::at_level_t objects: one for the shallow errors and one for the deep errors. The shallow errors DB is recreated at the start of every scrub, while the deep errors DB is only recreated at the start of a deep scrub. When queried by the operator for known scrub errors, the ScrubStore returns the union of the errors from both DBs.

Signed-off-by: Ronen Friedman <[email protected]>
1 parent 031580f commit daf848f

File tree

3 files changed

+285
-44
lines changed

3 files changed

+285
-44
lines changed

src/osd/scrubber/ScrubStore.cc

Lines changed: 244 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -99,15 +99,30 @@ Store::Store(
9999
{
100100
ceph_assert(t);
101101

102-
const auto err_obj = pgid.make_temp_ghobject(fmt::format("scrub_{}", pgid));
103-
t->touch(coll, err_obj);
104-
errors_db.emplace(pgid, err_obj, OSDriver{&object_store, coll, err_obj});
102+
// shallow errors DB object
103+
const auto sh_err_obj =
104+
pgid.make_temp_ghobject(fmt::format("scrub_{}", pgid));
105+
t->touch(coll, sh_err_obj);
106+
shallow_db.emplace(
107+
pgid, sh_err_obj, OSDriver{&object_store, coll, sh_err_obj});
108+
109+
// and the DB for deep errors
110+
const auto dp_err_obj =
111+
pgid.make_temp_ghobject(fmt::format("deep_scrub_{}", pgid));
112+
t->touch(coll, dp_err_obj);
113+
deep_db.emplace(pgid, dp_err_obj, OSDriver{&object_store, coll, dp_err_obj});
114+
115+
dout(20) << fmt::format(
116+
"created Scrub::Store for pg[{}], shallow: {}, deep: {}",
117+
pgid, sh_err_obj, dp_err_obj)
118+
<< dendl;
105119
}
106120

107121

108122
// Destructor. Asserts that each errors DB is either not instantiated or
// holds no pending entries in its in-memory 'results' map (the map that
// flush() clears).
// Diff view: the 'errors_db' assert is the removed single-DB check; the
// 'shallow_db' / 'deep_db' asserts are its replacements.
Store::~Store()
109123
{
110-
ceph_assert(!errors_db || errors_db->results.empty());
124+
ceph_assert(!shallow_db || shallow_db->results.empty());
125+
ceph_assert(!deep_db || deep_db->results.empty());
111126
}
112127

113128

@@ -127,12 +142,49 @@ void Store::add_error(int64_t pool, const inconsistent_obj_wrapper& e)
127142
add_object_error(pool, e);
128143
}
129144

145+
namespace {
146+
147+
// Returns a copy of 'obj' in which the object-level error bits are masked
// with 'obj_err_mask', and the error bits of every shard entry are masked
// with 'shard_err_mask'. The input wrapper itself is not modified.
inconsistent_obj_wrapper create_filtered_copy(
148+
const inconsistent_obj_wrapper& obj,
149+
uint64_t obj_err_mask,
150+
uint64_t shard_err_mask)
151+
{
152+
// work on a private copy; 'obj' is a const reference
inconsistent_obj_wrapper dup = obj;
153+
// keep only the object-level bits selected by the mask
dup.errors &= obj_err_mask;
154+
// 'si' is bound by reference, so the shard entries of the copy are
// updated in place
for (auto& [shard, si] : dup.shards) {
155+
si.errors &= shard_err_mask;
156+
}
157+
return dup;
158+
}
159+
160+
} // namespace
161+
162+
130163
// Records an object-level inconsistency. The key is derived from the pool
// and the object; the wrapper is then split into two filtered copies - one
// keeping only the shallow-scrub error bits, one keeping only the
// deep-scrub error bits - and each encoded copy is staged in the in-memory
// 'results' map of the matching DB.
// Diff view: the three 'errors_db' lines below are the removed single-DB code.
void Store::add_object_error(int64_t pool, const inconsistent_obj_wrapper& e)
131164
{
132165
const auto key = to_object_key(pool, e.object);
133-
bufferlist bl;
134-
e.encode(bl);
135-
errors_db->results[key] = bl;
166+
// NOTE(review): the log line masks the object-level 'e.errors' with the
// librados::err_t constants, while the filtering below masks object-level
// errors with the librados::obj_err_t constants - confirm which is intended.
dout(20) << fmt::format(
167+
"adding error for object {} ({}). Errors: {} ({}/{}) wr:{}",
168+
e.object, key, librados::err_t{e.errors},
169+
librados::err_t{e.errors & librados::err_t::SHALLOW_ERRORS},
170+
librados::err_t{e.errors & librados::err_t::DEEP_ERRORS}, e)
171+
<< dendl;
172+
173+
// NOTE(review): both stores get an entry for 'key' unconditionally, even
// if the filtered copy retains no error bits - confirm this is intended.
// divide the errors & shard errors into shallow and deep.
{
175+
bufferlist bl;
176+
create_filtered_copy(
177+
e, librados::obj_err_t::SHALLOW_ERRORS, librados::err_t::SHALLOW_ERRORS)
178+
.encode(bl);
179+
shallow_db->results[key] = bl;
180+
}
181+
{
182+
bufferlist bl;
183+
create_filtered_copy(
184+
e, librados::obj_err_t::DEEP_ERRORS, librados::err_t::DEEP_ERRORS)
185+
.encode(bl);
186+
deep_db->results[key] = bl;
187+
}
136188
}
137189

138190

@@ -144,23 +196,29 @@ void Store::add_error(int64_t pool, const inconsistent_snapset_wrapper& e)
144196

145197
// Stages the encoded snapset inconsistency, keyed by pool and object, in
// the in-memory 'results' map of the shallow errors DB.
// 'shallow_db' is dereferenced without a check; the constructor shown in
// this diff emplaces it.
// Diff view: the 'errors_db' line is the removed single-DB code.
void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e)
146198
{
147-
errors_db->results[to_snap_key(pool, e.object)] = e.encode();
199+
// note: snap errors are only placed in the shallow store
shallow_db->results[to_snap_key(pool, e.object)] = e.encode();
148201
}
149202

150203

151204
// True when neither DB has staged entries: each of the two DBs is either
// not instantiated or has an empty in-memory 'results' map.
// Diff view: the 'errors_db' return is the removed single-DB check.
bool Store::is_empty() const
152205
{
153-
return !errors_db || errors_db->results.empty();
206+
return (!shallow_db || shallow_db->results.empty()) &&
207+
(!deep_db || deep_db->results.empty());
154208
}
155209

156210

157211
// Hands the staged results of both DBs to their backends and then empties
// the in-memory maps.
// If 't' is non-null, each DB's 'results' are passed to its backend's
// set_keys() using a transaction object obtained from that DB's driver.
// If 't' is null nothing is written, but the staged results are still
// cleared.
// NOTE(review): unlike is_empty() and reinit(), both DBs are dereferenced
// here without a check - presumably they always exist once the constructor
// has run; confirm.
// Diff view: the 'errors_db' lines are the removed single-DB code.
void Store::flush(ObjectStore::Transaction* t)
158212
{
159213
if (t) {
160-
auto txn = errors_db->driver.get_transaction(t);
161-
errors_db->backend.set_keys(errors_db->results, &txn);
214+
auto txn = shallow_db->driver.get_transaction(t);
215+
shallow_db->backend.set_keys(shallow_db->results, &txn);
216+
// the same local is re-assigned with a transaction object from the deep
// DB's driver
txn = deep_db->driver.get_transaction(t);
217+
deep_db->backend.set_keys(deep_db->results, &txn);
162218
}
163-
errors_db->results.clear();
219+
220+
shallow_db->results.clear();
221+
deep_db->results.clear();
164222
}
165223

166224

@@ -184,18 +242,23 @@ void Store::clear_level_db(
184242

185243
// Prepares the store for a new scrub of the given level. The shallow errors
// DB (if instantiated) is always cleared via clear_level_db(); the deep
// errors DB is cleared only when 'level' is scrub_level_t::deep.
// Diff view: the lines that follow '-' markers (the [[maybe_unused]]
// parameter, the second copy of the 'only one caller' note, and the
// 'errors_db' branch) are the removed pre-change code.
void Store::reinit(
186244
ObjectStore::Transaction* t,
187-
[[maybe_unused]] scrub_level_t level)
245+
scrub_level_t level)
188246
{
247+
// Note: only one caller, and it creates the transaction passed to reinit().
248+
// No need to assert on 't'
189249
dout(20) << fmt::format(
190250
"re-initializing the Scrub::Store (for {} scrub)",
191251
(level == scrub_level_t::deep ? "deep" : "shallow"))
192252
<< dendl;
193253

194-
// Note: only one caller, and it creates the transaction passed to reinit().
195-
// No need to assert on 't'
196-
197-
if (errors_db) {
198-
clear_level_db(t, *errors_db, "scrub");
254+
// always clear the known shallow errors DB (as both shallow and deep scrubs
255+
// would recreate it)
256+
if (shallow_db) {
257+
clear_level_db(t, *shallow_db, "shallow");
258+
}
259+
// only a deep scrub recreates the deep errors DB
260+
if (level == scrub_level_t::deep && deep_db) {
261+
clear_level_db(t, *deep_db, "deep");
199262
}
200263
}
201264

@@ -204,8 +267,10 @@ void Store::cleanup(ObjectStore::Transaction* t)
204267
{
205268
dout(20) << "discarding error DBs" << dendl;
206269
ceph_assert(t);
207-
if (errors_db)
208-
t->remove(coll, errors_db->errors_hoid);
270+
if (shallow_db)
271+
t->remove(coll, shallow_db->errors_hoid);
272+
if (deep_db)
273+
t->remove(coll, deep_db->errors_hoid);
209274
}
210275

211276

@@ -214,42 +279,180 @@ std::vector<bufferlist> Store::get_snap_errors(
214279
const librados::object_id_t& start,
215280
uint64_t max_return) const
216281
{
217-
const string begin = (start.name.empty() ?
218-
first_snap_key(pool) : to_snap_key(pool, start));
282+
vector<bufferlist> errors;
283+
const string begin =
284+
(start.name.empty() ? first_snap_key(pool) : to_snap_key(pool, start));
219285
const string end = last_snap_key(pool);
220-
return get_errors(begin, end, max_return);
286+
287+
// the snap errors are stored only in the shallow store
288+
ExpCacherPosData latest_sh = shallow_db->backend.get_1st_after_key(begin);
289+
290+
while (max_return-- && latest_sh.has_value() && latest_sh->last_key < end) {
291+
errors.push_back(latest_sh->data);
292+
latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key);
293+
}
294+
295+
return errors;
221296
}
222297

223-
std::vector<bufferlist>
224-
Store::get_object_errors(int64_t pool,
225-
const librados::object_id_t& start,
226-
uint64_t max_return) const
298+
299+
// Returns encoded object errors for 'pool', limited by 'max_return'.
// The scan starts at the key of 'start' (or at the pool's first object key
// when start.name is empty), is bounded by the pool's last object key, and
// is delegated to get_errors().
// Diff view: the two-line 'begin' initialisation after the '-' markers is
// the removed formatting of the same expression.
std::vector<bufferlist> Store::get_object_errors(
300+
int64_t pool,
301+
const librados::object_id_t& start,
302+
uint64_t max_return) const
227303
{
228-
const string begin = (start.name.empty() ?
229-
first_object_key(pool) : to_object_key(pool, start));
304+
const string begin =
305+
(start.name.empty() ? first_object_key(pool)
306+
: to_object_key(pool, start));
230307
const string end = last_object_key(pool);
308+
dout(20) << fmt::format("fetching errors, from {} to {}", begin, end)
309+
<< dendl;
231310
return get_errors(begin, end, max_return);
232311
}
233312

234-
std::vector<bufferlist>
235-
Store::get_errors(const string& begin,
236-
const string& end,
237-
uint64_t max_return) const
313+
314+
// Decodes into 'obj' from 'bp' by invoking inconsistent_obj_wrapper::decode()
// on it. 'bp' is taken by reference and handed to that decode() call.
// NOTE(review): the reinterpret_cast treats the librados::inconsistent_obj_t
// as an inconsistent_obj_wrapper - presumably the wrapper derives from it
// and adds no state of its own; confirm before relying on this.
inline void decode(
315+
librados::inconsistent_obj_t& obj,
316+
ceph::buffer::list::const_iterator& bp)
238317
{
318+
reinterpret_cast<inconsistent_obj_wrapper&>(obj).decode(bp);
319+
}
320+
321+
322+
// Builds an inconsistent_obj_wrapper for 'obj' and decodes its contents
// from 'bp'. Both parameters are taken by value, so the caller's iterator
// is not advanced.
inconsistent_obj_wrapper decode_wrapper(
323+
hobject_t obj,
324+
ceph::buffer::list::const_iterator bp)
325+
{
326+
inconsistent_obj_wrapper iow{obj};
327+
iow.decode(bp);
328+
return iow;
329+
}
330+
331+
332+
// Drains a single store: appends latest->data to 'errors' and advances
// 'latest' with backend.get_1st_after_key(), for as long as the count
// allows, 'latest' holds a value, and its key is smaller than 'end_key'.
// 'latest' is an in/out parameter: on return it holds whatever the last
// get_1st_after_key() call produced (or is unchanged if nothing was
// collected).
// 'max_return' is a by-value copy, so the post-decrement in the loop
// condition does not affect the caller's counter.
void Store::collect_specific_store(
333+
MapCacher::MapCacher<std::string, ceph::buffer::list>& backend,
334+
Store::ExpCacherPosData& latest,
335+
std::vector<bufferlist>& errors,
336+
std::string_view end_key,
337+
uint64_t max_return) const
338+
{
339+
while (max_return-- && latest.has_value() &&
340+
latest.value().last_key < end_key) {
341+
errors.push_back(latest->data);
342+
latest = backend.get_1st_after_key(latest->last_key);
343+
}
344+
}
345+
346+
347+
// Combines the shallow and the deep records of one key into a single
// encoded wrapper: both buffers are decoded, the object-level error bits
// are OR-ed together, the per-shard error bits of the deep record are OR-ed
// into the shallow record, and the merged shallow record is re-encoded and
// returned.
// Both 'latest_sh' and 'latest_dp' are dereferenced unchecked, so the
// caller must pass entries that hold a value.
bufferlist Store::merge_encoded_error_wrappers(
348+
hobject_t obj,
349+
ExpCacherPosData& latest_sh,
350+
ExpCacherPosData& latest_dp) const
351+
{
352+
// decode both error wrappers
353+
auto sh_wrap = decode_wrapper(obj, latest_sh->data.cbegin());
354+
auto dp_wrap = decode_wrapper(obj, latest_dp->data.cbegin());
355+
// NOTE(review): the format string reads "Shallow: {}-({}), Deep: {}-({})"
// but the arguments are ordered (shallow errors, deep errors, shallow
// wrapper, deep wrapper) - labels and values look mismatched; confirm.
dout(20) << fmt::format(
356+
"merging errors {}. Shallow: {}-({}), Deep: {}-({})",
357+
sh_wrap.object, sh_wrap.errors, dp_wrap.errors, sh_wrap,
358+
dp_wrap)
359+
<< dendl;
360+
361+
// merge the object errors (a simple OR of the two error bit-sets)
362+
sh_wrap.errors |= dp_wrap.errors;
363+
364+
// merge the two shard error maps
365+
for (const auto& [shard, si] : dp_wrap.shards) {
366+
// NOTE(review): the log statement also indexes sh_wrap.shards[shard] -
// presumably a map operator[] that may create the entry, same as the
// merge statement below; confirm.
dout(20) << fmt::format(
367+
"shard {} dp-errors: {} sh-errors:{}", shard, si.errors,
368+
sh_wrap.shards[shard].errors)
369+
<< dendl;
370+
// note: we may be creating the shallow shard entry here. This is OK
371+
sh_wrap.shards[shard].errors |= si.errors;
372+
}
373+
374+
return sh_wrap.encode();
375+
}
376+
377+
378+
// a better way to implement get_errors(): use two generators, one for each store.
379+
// and sort-merge the results. Almost like a merge-sort, but with equal
380+
// keys combined. 'todo' once 'ranges' are really working.
381+
382+
// Returns encoded error records, limited by 'max_return', merged from the
// shallow and the deep DBs. Each backend is positioned with
// get_1st_after_key(from_key) and then advanced entry by entry; entries
// whose key is not smaller than 'end_key' are ignored.
//  - when only one store still has an entry in range, the remainder is
//    taken from it via collect_specific_store() and the loop ends;
//  - when both have an entry, the one with the smaller key is emitted;
//  - when the keys are equal, the two records are merged into one.
// Both DBs must be instantiated (asserted).
// Diff view: the lines that follow '-' markers ('errors_db', 'next',
// 'begin' / 'end') are the removed single-DB implementation.
std::vector<bufferlist> Store::get_errors(
383+
const std::string& from_key,
384+
const std::string& end_key,
385+
uint64_t max_return) const
386+
{
387+
// merge the input from the two sorted DBs into 'errors' (until
388+
// enough errors are collected)
239389
vector<bufferlist> errors;
240-
if (!errors_db)
241-
return errors;
390+
dout(20) << fmt::format("getting errors from {} to {}", from_key, end_key)
391+
<< dendl;
242392

243-
auto next = std::make_pair(begin, bufferlist{});
244-
while (max_return && !errors_db->backend.get_next(next.first, &next)) {
245-
if (next.first >= end)
393+
ceph_assert(shallow_db);
394+
ceph_assert(deep_db);
395+
ExpCacherPosData latest_sh = shallow_db->backend.get_1st_after_key(from_key);
396+
ExpCacherPosData latest_dp = deep_db->backend.get_1st_after_key(from_key);
397+
398+
while (max_return) {
399+
dout(20) << fmt::format(
400+
"n:{} latest_sh: {}, latest_dp: {}", max_return,
401+
(latest_sh ? latest_sh->last_key : "(none)"),
402+
(latest_dp ? latest_dp->last_key : "(none)"))
403+
<< dendl;
404+
405+
// an out-of-range entry is replaced by an error value, so that the checks
// below see that store as having nothing more to offer
// keys not smaller than end_key are not interesting
406+
if (latest_sh.has_value() && latest_sh->last_key >= end_key) {
407+
latest_sh = tl::unexpected(-EINVAL);
408+
}
409+
if (latest_dp.has_value() && latest_dp->last_key >= end_key) {
410+
latest_dp = tl::unexpected(-EINVAL);
411+
}
412+
413+
if (!latest_sh && !latest_dp) {
414+
// both stores are exhausted
415+
break;
416+
}
417+
if (!latest_sh.has_value()) {
418+
// continue with the deep store
419+
dout(10) << fmt::format("collecting from deep store") << dendl;
420+
collect_specific_store(
421+
deep_db->backend, latest_dp, errors, end_key, max_return);
246422
break;
247-
errors.push_back(next.second);
423+
}
424+
if (!latest_dp.has_value()) {
425+
// continue with the shallow store
426+
dout(10) << fmt::format("collecting from shallow store") << dendl;
427+
collect_specific_store(
428+
shallow_db->backend, latest_sh, errors, end_key, max_return);
429+
break;
430+
}
431+
432+
// we have results from both stores. Select the one with a lower key.
433+
// If the keys are equal, combine the errors.
434+
if (latest_sh->last_key == latest_dp->last_key) {
435+
// NOTE(review): the hobject passed here is the one of the shallow error
// store itself ('errors_hoid'), not of the object the record describes -
// presumably decode() overwrites it from the encoded data; confirm.
auto bl = merge_encoded_error_wrappers(
436+
shallow_db->errors_hoid.hobj, latest_sh, latest_dp);
437+
errors.push_back(bl);
438+
// both stores contributed to this record, so both are advanced
latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key);
439+
latest_dp = deep_db->backend.get_1st_after_key(latest_dp->last_key);
440+
441+
} else if (latest_sh->last_key < latest_dp->last_key) {
442+
dout(20) << fmt::format("shallow store element ({})", latest_sh->last_key)
443+
<< dendl;
444+
errors.push_back(latest_sh->data);
445+
latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key);
446+
} else {
447+
dout(20) << fmt::format("deep store element ({})", latest_dp->last_key)
448+
<< dendl;
449+
errors.push_back(latest_dp->data);
450+
latest_dp = deep_db->backend.get_1st_after_key(latest_dp->last_key);
451+
}
248452
// exactly one record (plain or merged) was appended in this iteration
max_return--;
249453
}
250454

251455
dout(10) << fmt::format("{} errors reported", errors.size()) << dendl;
252456
return errors;
253457
}
254-
255-
} // namespace Scrub
458+
} // namespace Scrub

0 commit comments

Comments
 (0)