Skip to content

Commit 3a03c67

Browse files
committed
Merge tag 'ceph-for-5.14-rc6' of git://github.com/ceph/ceph-client
Pull ceph fixes from Ilya Dryomov: "A patch to avoid a soft lockup in ceph_check_delayed_caps() from Luis, and a reference-handling fix from Jeff that should address some memory-corruption reports in the snaprealm area. Both are marked for stable." * tag 'ceph-for-5.14-rc6' of git://github.com/ceph/ceph-client: ceph: take snap_empty_lock atomically with snaprealm refcount change; ceph: reduce contention in ceph_check_delayed_caps()
2 parents 82cce5f + 8434ffe commit 3a03c67

File tree

4 files changed

+50
-28
lines changed

4 files changed

+50
-28
lines changed

fs/ceph/caps.c

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4150,18 +4150,31 @@ void ceph_handle_caps(struct ceph_mds_session *session,
41504150

41514151
/*
41524152
* Delayed work handler to process end of delayed cap release LRU list.
4153+
*
4154+
* If new caps are added to the list while processing it, these won't get
4155+
* processed in this run. In this case, the ci->i_hold_caps_max will be
4156+
* returned so that the work can be scheduled accordingly.
41534157
*/
4154-
void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
4158+
unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
41554159
{
41564160
struct inode *inode;
41574161
struct ceph_inode_info *ci;
4162+
struct ceph_mount_options *opt = mdsc->fsc->mount_options;
4163+
unsigned long delay_max = opt->caps_wanted_delay_max * HZ;
4164+
unsigned long loop_start = jiffies;
4165+
unsigned long delay = 0;
41584166

41594167
dout("check_delayed_caps\n");
41604168
spin_lock(&mdsc->cap_delay_lock);
41614169
while (!list_empty(&mdsc->cap_delay_list)) {
41624170
ci = list_first_entry(&mdsc->cap_delay_list,
41634171
struct ceph_inode_info,
41644172
i_cap_delay_list);
4173+
if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) {
4174+
dout("%s caps added recently. Exiting loop", __func__);
4175+
delay = ci->i_hold_caps_max;
4176+
break;
4177+
}
41654178
if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
41664179
time_before(jiffies, ci->i_hold_caps_max))
41674180
break;
@@ -4177,6 +4190,8 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
41774190
}
41784191
}
41794192
spin_unlock(&mdsc->cap_delay_lock);
4193+
4194+
return delay;
41804195
}
41814196

41824197
/*

fs/ceph/mds_client.c

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4490,22 +4490,29 @@ void inc_session_sequence(struct ceph_mds_session *s)
44904490
}
44914491

44924492
/*
4493-
* delayed work -- periodically trim expired leases, renew caps with mds
4493+
* delayed work -- periodically trim expired leases, renew caps with mds. If
4494+
* the @delay parameter is set to 0 or if it's more than 5 secs, the default
4495+
* workqueue delay value of 5 secs will be used.
44944496
*/
4495-
static void schedule_delayed(struct ceph_mds_client *mdsc)
4497+
static void schedule_delayed(struct ceph_mds_client *mdsc, unsigned long delay)
44964498
{
4497-
int delay = 5;
4498-
unsigned hz = round_jiffies_relative(HZ * delay);
4499-
schedule_delayed_work(&mdsc->delayed_work, hz);
4499+
unsigned long max_delay = HZ * 5;
4500+
4501+
/* 5 secs default delay */
4502+
if (!delay || (delay > max_delay))
4503+
delay = max_delay;
4504+
schedule_delayed_work(&mdsc->delayed_work,
4505+
round_jiffies_relative(delay));
45004506
}
45014507

45024508
static void delayed_work(struct work_struct *work)
45034509
{
4504-
int i;
45054510
struct ceph_mds_client *mdsc =
45064511
container_of(work, struct ceph_mds_client, delayed_work.work);
4512+
unsigned long delay;
45074513
int renew_interval;
45084514
int renew_caps;
4515+
int i;
45094516

45104517
dout("mdsc delayed_work\n");
45114518

@@ -4545,15 +4552,15 @@ static void delayed_work(struct work_struct *work)
45454552
}
45464553
mutex_unlock(&mdsc->mutex);
45474554

4548-
ceph_check_delayed_caps(mdsc);
4555+
delay = ceph_check_delayed_caps(mdsc);
45494556

45504557
ceph_queue_cap_reclaim_work(mdsc);
45514558

45524559
ceph_trim_snapid_map(mdsc);
45534560

45544561
maybe_recover_session(mdsc);
45554562

4556-
schedule_delayed(mdsc);
4563+
schedule_delayed(mdsc, delay);
45574564
}
45584565

45594566
int ceph_mdsc_init(struct ceph_fs_client *fsc)
@@ -5030,7 +5037,7 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
50305037
mdsc->mdsmap->m_epoch);
50315038

50325039
mutex_unlock(&mdsc->mutex);
5033-
schedule_delayed(mdsc);
5040+
schedule_delayed(mdsc, 0);
50345041
return;
50355042

50365043
bad_unlock:

fs/ceph/snap.c

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -67,19 +67,19 @@ void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
6767
{
6868
lockdep_assert_held(&mdsc->snap_rwsem);
6969

70-
dout("get_realm %p %d -> %d\n", realm,
71-
atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
7270
/*
73-
* since we _only_ increment realm refs or empty the empty
74-
* list with snap_rwsem held, adjusting the empty list here is
75-
* safe. we do need to protect against concurrent empty list
76-
* additions, however.
71+
* The 0->1 and 1->0 transitions must take the snap_empty_lock
72+
* atomically with the refcount change. Go ahead and bump the
73+
* nref here, unless it's 0, in which case we take the spinlock
74+
* and then do the increment and remove it from the list.
7775
*/
78-
if (atomic_inc_return(&realm->nref) == 1) {
79-
spin_lock(&mdsc->snap_empty_lock);
76+
if (atomic_inc_not_zero(&realm->nref))
77+
return;
78+
79+
spin_lock(&mdsc->snap_empty_lock);
80+
if (atomic_inc_return(&realm->nref) == 1)
8081
list_del_init(&realm->empty_item);
81-
spin_unlock(&mdsc->snap_empty_lock);
82-
}
82+
spin_unlock(&mdsc->snap_empty_lock);
8383
}
8484

8585
static void __insert_snap_realm(struct rb_root *root,
@@ -208,28 +208,28 @@ static void __put_snap_realm(struct ceph_mds_client *mdsc,
208208
{
209209
lockdep_assert_held_write(&mdsc->snap_rwsem);
210210

211-
dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
212-
atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
211+
/*
212+
* We do not require the snap_empty_lock here, as any caller that
213+
* increments the value must hold the snap_rwsem.
214+
*/
213215
if (atomic_dec_and_test(&realm->nref))
214216
__destroy_snap_realm(mdsc, realm);
215217
}
216218

217219
/*
218-
* caller needn't hold any locks
220+
* See comments in ceph_get_snap_realm. Caller needn't hold any locks.
219221
*/
220222
void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
221223
struct ceph_snap_realm *realm)
222224
{
223-
dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
224-
atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
225-
if (!atomic_dec_and_test(&realm->nref))
225+
if (!atomic_dec_and_lock(&realm->nref, &mdsc->snap_empty_lock))
226226
return;
227227

228228
if (down_write_trylock(&mdsc->snap_rwsem)) {
229+
spin_unlock(&mdsc->snap_empty_lock);
229230
__destroy_snap_realm(mdsc, realm);
230231
up_write(&mdsc->snap_rwsem);
231232
} else {
232-
spin_lock(&mdsc->snap_empty_lock);
233233
list_add(&realm->empty_item, &mdsc->snap_empty);
234234
spin_unlock(&mdsc->snap_empty_lock);
235235
}

fs/ceph/super.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1167,7 +1167,7 @@ extern void ceph_flush_snaps(struct ceph_inode_info *ci,
11671167
extern bool __ceph_should_report_size(struct ceph_inode_info *ci);
11681168
extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
11691169
struct ceph_mds_session *session);
1170-
extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
1170+
extern unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
11711171
extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
11721172
extern int ceph_drop_caps_for_unlink(struct inode *inode);
11731173
extern int ceph_encode_inode_release(void **p, struct inode *inode,

0 commit comments

Comments (0)