Skip to content

Commit 62eec75

Browse files
committed
Merge patch series "fs: introduce file_ref_t"
Christian Brauner <[email protected]> says: As atomic_inc_not_zero() is implemented with a try_cmpxchg() loop it has O(N^2) behaviour under contention with N concurrent operations and it is in a hot path in __fget_files_rcu(). The rcuref infrastructure remedies this problem by using an unconditional increment relying on safe- and dead zones to make this work and requiring rcu protection for the data structure in question. This not only scales better, it also introduces overflow protection. However, in contrast to generic rcuref, files require a memory barrier and thus cannot rely on *_relaxed() atomic operations and also need to be built on atomic_long_t, as having massive amounts of references isn't unheard of even if it is just an attack. As suggested by Linus, add a file specific variant instead of making this a generic library. I've been testing this with will-it-scale using a multi-threaded fstat() on the same file descriptor on a machine that Jens gave me access to (thank you very much!): processor : 511 vendor_id : AuthenticAMD cpu family : 25 model : 160 model name : AMD EPYC 9754 128-Core Processor and I consistently get a 3-5% improvement on workloads with 256+ threads, comparing v6.12-rc1 as base with and without these patches applied. * patches from https://lore.kernel.org/r/[email protected]: fs: port files to file_ref fs: add file_ref fs: protect backing files with rcu Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Christian Brauner <[email protected]>
2 parents 8cf0b93 + 90ee6ed commit 62eec75

File tree

7 files changed

+292
-27
lines changed

7 files changed

+292
-27
lines changed

drivers/gpu/drm/i915/gt/shmem_utils.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ struct file *shmem_create_from_object(struct drm_i915_gem_object *obj)
4040

4141
if (i915_gem_object_is_shmem(obj)) {
4242
file = obj->base.filp;
43-
atomic_long_inc(&file->f_count);
43+
get_file(file);
4444
return file;
4545
}
4646

drivers/gpu/drm/vmwgfx/ttm_object.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,7 @@ void ttm_object_device_release(struct ttm_object_device **p_tdev)
471471
*/
472472
static bool __must_check get_dma_buf_unless_doomed(struct dma_buf *dmabuf)
473473
{
474-
return atomic_long_inc_not_zero(&dmabuf->file->f_count) != 0L;
474+
return file_ref_get(&dmabuf->file->f_ref);
475475
}
476476

477477
/**

fs/eventpoll.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1002,7 +1002,7 @@ static struct file *epi_fget(const struct epitem *epi)
10021002
struct file *file;
10031003

10041004
file = epi->ffd.file;
1005-
if (!atomic_long_inc_not_zero(&file->f_count))
1005+
if (!file_ref_get(&file->f_ref))
10061006
file = NULL;
10071007
return file;
10081008
}

fs/file.c

Lines changed: 70 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,73 @@
2020
#include <linux/spinlock.h>
2121
#include <linux/rcupdate.h>
2222
#include <linux/close_range.h>
23+
#include <linux/file_ref.h>
2324
#include <net/sock.h>
2425

2526
#include "internal.h"
2627

28+
/**
29+
* __file_ref_put - Slowpath of file_ref_put()
30+
* @ref: Pointer to the reference count
31+
* @cnt: Current reference count
32+
*
33+
* Invoked when the reference count is outside of the valid zone.
34+
*
35+
* Return:
36+
* True if this was the last reference with no future references
37+
* possible. This signals the caller that it can safely schedule the
38+
* object, which is protected by the reference counter, for
39+
* deconstruction.
40+
*
41+
* False if there are still active references or the put() raced
42+
* with a concurrent get()/put() pair. Caller is not allowed to
43+
* deconstruct the protected object.
44+
*/
45+
bool __file_ref_put(file_ref_t *ref, unsigned long cnt)
46+
{
47+
/* Did this drop the last reference? */
48+
if (likely(cnt == FILE_REF_NOREF)) {
49+
/*
50+
* Carefully try to set the reference count to FILE_REF_DEAD.
51+
*
52+
* This can fail if a concurrent get() operation has
53+
* elevated it again or the corresponding put() even marked
54+
* it dead already. Both are valid situations and do not
55+
* require a retry. If this fails the caller is not
56+
* allowed to deconstruct the object.
57+
*/
58+
if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD))
59+
return false;
60+
61+
/*
62+
* The caller can safely schedule the object for
63+
* deconstruction. Provide acquire ordering.
64+
*/
65+
smp_acquire__after_ctrl_dep();
66+
return true;
67+
}
68+
69+
/*
70+
* If the reference count was already in the dead zone, then this
71+
* put() operation is imbalanced. Warn, put the reference count back to
72+
* DEAD and tell the caller to not deconstruct the object.
73+
*/
74+
if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) {
75+
atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
76+
return false;
77+
}
78+
79+
/*
80+
* This is a put() operation on a saturated refcount. Restore the
81+
* mean saturation value and tell the caller to not deconstruct the
82+
* object.
83+
*/
84+
if (cnt > FILE_REF_MAXREF)
85+
atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);
86+
return false;
87+
}
88+
EXPORT_SYMBOL_GPL(__file_ref_put);
89+
2790
unsigned int sysctl_nr_open __read_mostly = 1024*1024;
2891
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
2992
/* our min() is unusable in constant expressions ;-/ */
@@ -839,7 +902,7 @@ static struct file *__get_file_rcu(struct file __rcu **f)
839902
if (!file)
840903
return NULL;
841904

842-
if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
905+
if (unlikely(!file_ref_get(&file->f_ref)))
843906
return ERR_PTR(-EAGAIN);
844907

845908
file_reloaded = rcu_dereference_raw(*f);
@@ -853,8 +916,8 @@ static struct file *__get_file_rcu(struct file __rcu **f)
853916
OPTIMIZER_HIDE_VAR(file_reloaded_cmp);
854917

855918
/*
856-
* atomic_long_inc_not_zero() above provided a full memory
857-
* barrier when we acquired a reference.
919+
* file_ref_get() above provided a full memory barrier when we
920+
* acquired a reference.
858921
*
859922
* This is paired with the write barrier from assigning to the
860923
* __rcu protected file pointer so that if that pointer still
@@ -952,11 +1015,11 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
9521015
* We need to confirm it by incrementing the refcount
9531016
* and then check the lookup again.
9541017
*
955-
* atomic_long_inc_not_zero() gives us a full memory
956-
* barrier. We only really need an 'acquire' one to
957-
* protect the loads below, but we don't have that.
1018+
* file_ref_get() gives us a full memory barrier. We
1019+
* only really need an 'acquire' one to protect the
1020+
* loads below, but we don't have that.
9581021
*/
959-
if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
1022+
if (unlikely(!file_ref_get(&file->f_ref)))
9601023
continue;
9611024

9621025
/*

fs/file_table.c

Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,17 @@ static struct files_stat_struct files_stat = {
4040

4141
/* SLAB cache for file structures */
4242
static struct kmem_cache *filp_cachep __ro_after_init;
43+
static struct kmem_cache *bfilp_cachep __ro_after_init;
4344

4445
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
4546

4647
/* Container for backing file with optional user path */
4748
struct backing_file {
4849
struct file file;
49-
struct path user_path;
50+
union {
51+
struct path user_path;
52+
freeptr_t bf_freeptr;
53+
};
5054
};
5155

5256
static inline struct backing_file *backing_file(struct file *f)
@@ -68,7 +72,7 @@ static inline void file_free(struct file *f)
6872
put_cred(f->f_cred);
6973
if (unlikely(f->f_mode & FMODE_BACKING)) {
7074
path_put(backing_file_user_path(f));
71-
kfree(backing_file(f));
75+
kmem_cache_free(bfilp_cachep, backing_file(f));
7276
} else {
7377
kmem_cache_free(filp_cachep, f);
7478
}
@@ -165,16 +169,32 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
165169
* the respective member when opening the file.
166170
*/
167171
mutex_init(&f->f_pos_lock);
168-
f->f_flags = flags;
169-
f->f_mode = OPEN_FMODE(flags);
170-
/* f->f_version: 0 */
172+
memset(&f->f_path, 0, sizeof(f->f_path));
173+
memset(&f->f_ra, 0, sizeof(f->f_ra));
174+
175+
f->f_flags = flags;
176+
f->f_mode = OPEN_FMODE(flags);
177+
178+
f->f_op = NULL;
179+
f->f_mapping = NULL;
180+
f->private_data = NULL;
181+
f->f_inode = NULL;
182+
f->f_owner = NULL;
183+
#ifdef CONFIG_EPOLL
184+
f->f_ep = NULL;
185+
#endif
186+
187+
f->f_iocb_flags = 0;
188+
f->f_pos = 0;
189+
f->f_wb_err = 0;
190+
f->f_sb_err = 0;
171191

172192
/*
173193
* We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
174194
* fget-rcu pattern users need to be able to handle spurious
175195
* refcount bumps we should reinitialize the reused file first.
176196
*/
177-
atomic_long_set(&f->f_count, 1);
197+
file_ref_init(&f->f_ref, 1);
178198
return 0;
179199
}
180200

@@ -206,7 +226,7 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
206226
goto over;
207227
}
208228

209-
f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
229+
f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
210230
if (unlikely(!f))
211231
return ERR_PTR(-ENOMEM);
212232

@@ -240,7 +260,7 @@ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
240260
struct file *f;
241261
int error;
242262

243-
f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
263+
f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
244264
if (unlikely(!f))
245265
return ERR_PTR(-ENOMEM);
246266

@@ -267,13 +287,13 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
267287
struct backing_file *ff;
268288
int error;
269289

270-
ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL);
290+
ff = kmem_cache_alloc(bfilp_cachep, GFP_KERNEL);
271291
if (unlikely(!ff))
272292
return ERR_PTR(-ENOMEM);
273293

274294
error = init_file(&ff->file, flags, cred);
275295
if (unlikely(error)) {
276-
kfree(ff);
296+
kmem_cache_free(bfilp_cachep, ff);
277297
return ERR_PTR(error);
278298
}
279299

@@ -479,7 +499,7 @@ static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
479499

480500
void fput(struct file *file)
481501
{
482-
if (atomic_long_dec_and_test(&file->f_count)) {
502+
if (file_ref_put(&file->f_ref)) {
483503
struct task_struct *task = current;
484504

485505
if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
@@ -512,7 +532,7 @@ void fput(struct file *file)
512532
*/
513533
void __fput_sync(struct file *file)
514534
{
515-
if (atomic_long_dec_and_test(&file->f_count))
535+
if (file_ref_put(&file->f_ref))
516536
__fput(file);
517537
}
518538

@@ -529,6 +549,11 @@ void __init files_init(void)
529549
filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args,
530550
SLAB_HWCACHE_ALIGN | SLAB_PANIC |
531551
SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
552+
553+
args.freeptr_offset = offsetof(struct backing_file, bf_freeptr);
554+
bfilp_cachep = kmem_cache_create("bfilp", sizeof(struct backing_file),
555+
&args, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
556+
SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
532557
percpu_counter_init(&nr_files, 0, GFP_KERNEL);
533558
}
534559

0 commit comments

Comments (0)