Skip to content

Commit 0b7128e

Browse files
authored
Merge pull request ceph#60872 from ronen-fr/wip-rf-repaironly
osd/scrub: remove config option osd_repair_during_recovery Reviewed-by: Samuel Just <[email protected]>
2 parents 64849a8 + 50c2f07 commit 0b7128e

File tree

6 files changed

+95
-60
lines changed

6 files changed

+95
-60
lines changed

qa/standalone/scrub/osd-scrub-repair.sh

Lines changed: 73 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,30 @@ function TEST_corrupt_and_repair_replicated() {
107107
corrupt_and_repair_one $dir $poolname $(get_primary $poolname SOMETHING) || return 1
108108
}
109109

110+
#
111+
# Allow operator-initiated scrubs to be scheduled even when some recovery is still
112+
# ongoing on the same OSD
113+
#
114+
function TEST_allow_oper_initiated_scrub_during_recovery() {
115+
local dir=$1
116+
local -A cluster_conf=(
117+
['osds_num']="2"
118+
['pgs_in_pool']="4"
119+
['pool_name']="nopool"
120+
['pool_default_size']="2"
121+
['extras']="--osd_scrub_during_recovery=false \
122+
--osd_debug_pretend_recovery_active=true"
123+
)
124+
125+
standard_scrub_cluster $dir cluster_conf
126+
local poolname=rbd
127+
create_rbd_pool || return 1
128+
wait_for_clean || return 1
129+
130+
add_something $dir $poolname || return 1
131+
oper_scrub_and_schedule $dir $poolname $(get_not_primary $poolname SOMETHING) || return 1
132+
}
133+
110134
#
111135
# Allow repair to be scheduled when some recovery is still ongoing on the same OSD
112136
#
@@ -117,10 +141,8 @@ function TEST_allow_repair_during_recovery() {
117141
run_mon $dir a --osd_pool_default_size=2 || return 1
118142
run_mgr $dir x || return 1
119143
run_osd $dir 0 --osd_scrub_during_recovery=false \
120-
--osd_repair_during_recovery=true \
121144
--osd_debug_pretend_recovery_active=true || return 1
122145
run_osd $dir 1 --osd_scrub_during_recovery=false \
123-
--osd_repair_during_recovery=true \
124146
--osd_debug_pretend_recovery_active=true || return 1
125147
create_rbd_pool || return 1
126148
wait_for_clean || return 1
@@ -132,25 +154,62 @@ function TEST_allow_repair_during_recovery() {
132154
#
133155
# Skip non-repair scrub correctly during recovery
134156
#
157+
# Note: forgoing the automatic creation of a pool in standard_scrub_cluster as
158+
# the test requires a specific RBD pool.
135159
function TEST_skip_non_repair_during_recovery() {
136160
local dir=$1
137-
local poolname=rbd
161+
local -A cluster_conf=(
162+
['osds_num']="2"
163+
['pgs_in_pool']="4"
164+
['pool_name']="nopool"
165+
['pool_default_size']="2"
166+
['extras']="--osd_scrub_during_recovery=false --osd_debug_pretend_recovery_active=true"
167+
)
138168

139-
run_mon $dir a --osd_pool_default_size=2 || return 1
140-
run_mgr $dir x || return 1
141-
run_osd $dir 0 --osd_scrub_during_recovery=false \
142-
--osd_repair_during_recovery=true \
143-
--osd_debug_pretend_recovery_active=true || return 1
144-
run_osd $dir 1 --osd_scrub_during_recovery=false \
145-
--osd_repair_during_recovery=true \
146-
--osd_debug_pretend_recovery_active=true || return 1
169+
standard_scrub_cluster $dir cluster_conf
170+
local poolname=rbd
147171
create_rbd_pool || return 1
148172
wait_for_clean || return 1
149173

150174
add_something $dir $poolname || return 1
151175
scrub_and_not_schedule $dir $poolname $(get_not_primary $poolname SOMETHING) || return 1
152176
}
153177

178+
179+
function oper_scrub_and_schedule() {
180+
local dir=$1
181+
local poolname=$2
182+
local osd=$3
183+
184+
#
185+
# 1) start an operator-initiated scrub
186+
#
187+
local pg=$(get_pg $poolname SOMETHING)
188+
local last_scrub=$(get_last_scrub_stamp $pg)
189+
ceph tell $pg scrub
190+
191+
#
192+
# 2) Assure the scrub was executed
193+
#
194+
sleep 3
195+
for ((i=0; i < 3; i++)); do
196+
if test "$(get_last_scrub_stamp $pg)" '>' "$last_scrub" ; then
197+
break
198+
fi
199+
if test "$(get_last_scrub_stamp $pg)" '==' "$last_scrub" ; then
200+
return 1
201+
fi
202+
sleep 1
203+
done
204+
205+
#
206+
# 3) Access to the file must be OK
207+
#
208+
objectstore_tool $dir $osd SOMETHING list-attrs || return 1
209+
rados --pool $poolname get SOMETHING $dir/COPY || return 1
210+
diff $dir/ORIGINAL $dir/COPY || return 1
211+
}
212+
154213
function scrub_and_not_schedule() {
155214
local dir=$1
156215
local poolname=$2
@@ -166,6 +225,7 @@ function scrub_and_not_schedule() {
166225
#
167226
# 2) Assure the scrub is not scheduled
168227
#
228+
sleep 3
169229
for ((i=0; i < 3; i++)); do
170230
if test "$(get_last_scrub_stamp $pg)" '>' "$last_scrub" ; then
171231
return 1
@@ -218,9 +278,9 @@ function corrupt_and_repair_two() {
218278

219279
#
220280
# 1) add an object
221-
# 2) remove the corresponding file from a designated OSD
281+
# 2) remove the corresponding object from a designated OSD
222282
# 3) repair the PG
223-
# 4) check that the file has been restored in the designated OSD
283+
# 4) check that the object has been restored in the designated OSD
224284
#
225285
function corrupt_and_repair_one() {
226286
local dir=$1

qa/standalone/scrub/scrub-helpers.sh

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -231,16 +231,21 @@ function standard_scrub_cluster() {
231231

232232
local OSDS=${args['osds_num']:-"3"}
233233
local pg_num=${args['pgs_in_pool']:-"8"}
234+
234235
local poolname="${args['pool_name']:-test}"
235236
args['pool_name']=$poolname
237+
238+
local pool_default_size=${args['pool_default_size']:-3}
239+
args['pool_default_size']=$pool_default_size
240+
236241
local extra_pars=${args['extras']}
237242
local debug_msg=${args['msg']:-"dbg"}
238243

239244
# turn off '-x' (but remember previous state)
240245
local saved_echo_flag=${-//[^x]/}
241246
set +x
242247

243-
run_mon $dir a --osd_pool_default_size=3 || return 1
248+
run_mon $dir a --osd_pool_default_size=$pool_default_size || return 1
244249
run_mgr $dir x --mgr_stats_period=1 || return 1
245250

246251
local ceph_osd_args="--osd_deep_scrub_randomize_ratio=0 \
@@ -254,22 +259,26 @@ function standard_scrub_cluster() {
254259
--osd_scrub_retry_after_noscrub=5 \
255260
--osd_scrub_retry_pg_state=5 \
256261
--osd_scrub_retry_delay=3 \
257-
--osd_pool_default_size=3 \
262+
--osd_pool_default_size=$pool_default_size \
258263
$extra_pars"
259264

260265
for osd in $(seq 0 $(expr $OSDS - 1))
261266
do
262267
run_osd $dir $osd $(echo $ceph_osd_args) || return 1
263268
done
264269

265-
create_pool $poolname $pg_num $pg_num
266-
wait_for_clean || return 1
270+
if [[ "$poolname" != "nopool" ]]; then
271+
create_pool $poolname $pg_num $pg_num
272+
wait_for_clean || return 1
273+
fi
267274

268275
# update the in/out 'args' with the ID of the new pool
269276
sleep 1
270-
name_n_id=`ceph osd dump | awk '/^pool.*'$poolname'/ { gsub(/'"'"'/," ",$3); print $3," ", $2}'`
271-
echo "standard_scrub_cluster: $debug_msg: test pool is $name_n_id"
272-
args['pool_id']="${name_n_id##* }"
277+
if [[ "$poolname" != "nopool" ]]; then
278+
name_n_id=`ceph osd dump | awk '/^pool.*'$poolname'/ { gsub(/'"'"'/," ",$3); print $3," ", $2}'`
279+
echo "standard_scrub_cluster: $debug_msg: test pool is $name_n_id"
280+
args['pool_id']="${name_n_id##* }"
281+
fi
273282
args['osd_args']=$ceph_osd_args
274283
if [[ -n "$saved_echo_flag" ]]; then set -x; fi
275284
}

src/common/options/osd.yaml.in

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -211,12 +211,6 @@ options:
211211
level: advanced
212212
desc: Asserts that no clone-objects were added to a snap after we start trimming it
213213
default: false
214-
- name: osd_repair_during_recovery
215-
type: bool
216-
level: advanced
217-
desc: Allow requested repairing when PGs on the OSD are undergoing recovery
218-
default: false
219-
with_legacy: true
220214
- name: osd_scrub_begin_hour
221215
type: int
222216
level: advanced

src/osd/scrubber/osd_scrub.cc

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -201,18 +201,9 @@ Scrub::OSDRestrictions OsdScrub::restrictions_on_scrubbing(
201201
env_conditions.random_backoff_active = true;
202202

203203
} else if (is_recovery_active && !conf->osd_scrub_during_recovery) {
204-
if (conf->osd_repair_during_recovery) {
205-
dout(15)
206-
<< "will only schedule explicitly requested repair due to active "
207-
"recovery"
208-
<< dendl;
209-
env_conditions.allow_requested_repair_only = true;
210-
211-
} else {
212-
dout(15) << "recovery in progress. Operator-initiated scrubs only."
213-
<< dendl;
214-
env_conditions.recovery_in_progress = true;
215-
}
204+
dout(15) << "recovery in progress. Operator-initiated scrubs only."
205+
<< dendl;
206+
env_conditions.recovery_in_progress = true;
216207
} else {
217208

218209
// regular, i.e. non-high-priority scrubs are allowed

src/osd/scrubber/pg_scrubber.cc

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2256,20 +2256,6 @@ Scrub::schedule_result_t PgScrubber::start_scrub_session(
22562256
}
22572257
}
22582258

2259-
// if only explicitly requested repairing is allowed - skip other types
2260-
// of scrubbing
2261-
if (osd_restrictions.allow_requested_repair_only &&
2262-
ScrubJob::observes_recovery(trgt.urgency())) {
2263-
dout(10) << __func__
2264-
<< ": skipping this PG as repairing was not explicitly "
2265-
"requested for it"
2266-
<< dendl;
2267-
requeue_penalized(
2268-
s_or_d, delay_both_targets_t::yes, delay_cause_t::scrub_params,
2269-
clock_now);
2270-
return schedule_result_t::target_specific_failure;
2271-
}
2272-
22732259
// try to reserve the local OSD resources. If failing: no harm. We will
22742260
// be retried by the OSD later on.
22752261
if (!reserve_local(trgt)) {

src/osd/scrubber_common.h

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -89,17 +89,13 @@ struct OSDRestrictions {
8989
/// rolled a dice, and decided not to scrub in this tick
9090
bool random_backoff_active{false};
9191

92-
/// the OSD is performing recovery & osd_repair_during_recovery is 'true'
93-
bool allow_requested_repair_only:1{false};
94-
9592
/// the CPU load is high. No regular scrubs are allowed.
9693
bool cpu_overloaded:1{false};
9794

9895
/// outside of allowed scrubbing hours/days
9996
bool restricted_time:1{false};
10097

101-
/// the OSD is performing a recovery, osd_scrub_during_recovery is 'false',
102-
/// and so is osd_repair_during_recovery
98+
/// the OSD is performing a recovery & osd_scrub_during_recovery is 'false'
10399
bool recovery_in_progress:1{false};
104100
};
105101
static_assert(sizeof(Scrub::OSDRestrictions) <= sizeof(uint32_t));
@@ -193,13 +189,12 @@ struct formatter<Scrub::OSDRestrictions> {
193189
auto format(const Scrub::OSDRestrictions& conds, FormatContext& ctx) const
194190
{
195191
return fmt::format_to(
196-
ctx.out(), "<{}.{}.{}.{}.{}.{}>",
192+
ctx.out(), "<{}.{}.{}.{}.{}>",
197193
conds.max_concurrency_reached ? "max-scrubs" : "",
198194
conds.random_backoff_active ? "backoff" : "",
199195
conds.cpu_overloaded ? "high-load" : "",
200196
conds.restricted_time ? "time-restrict" : "",
201-
conds.recovery_in_progress ? "recovery" : "",
202-
conds.allow_requested_repair_only ? "repair-only" : "");
197+
conds.recovery_in_progress ? "recovery" : "");
203198
}
204199
};
205200

0 commit comments

Comments
 (0)