Skip to content

Commit 5b8fea6

Browse files
amir73iljankara
authored andcommitted
fanotify: configurable limits via sysfs
fanotify has some hardcoded limits. The only APIs to escape those limits are FAN_UNLIMITED_QUEUE and FAN_UNLIMITED_MARKS. Allow finer grained tuning of the system limits via sysfs tunables under /proc/sys/fs/fanotify, similar to tunables under /proc/sys/fs/inotify, with some minor differences. - max_queued_events - global system tunable for group queue size limit. Like the inotify tunable with the same name, it defaults to 16384 and applies on initialization of a new group. - max_user_marks - user ns tunable for marks limit per user. Like the inotify tunable named max_user_watches, on a machine with sufficient RAM it defaults to 1048576 in init userns and can be further limited per containing user ns. - max_user_groups - user ns tunable for number of groups per user. Like the inotify tunable named max_user_instances, it defaults to 128 in init userns and can be further limited per containing user ns. The slightly different tunable names used for fanotify are derived from the "group" and "mark" terminology used in the fanotify man pages and throughout the code. Considering the fact that the default value for the inotify max_user_watches tunable was increased in kernel v5.10 from 8192 to 1048576, leaving the legacy fanotify limit of 8192 marks per group in addition to the max_user_marks limit makes little sense, so the per group marks limit has been removed. Note that when a group is initialized with FAN_UNLIMITED_MARKS, its own marks are not accounted in the per user marks account, so in effect the limit of max_user_marks is only for the collection of groups that are not initialized with FAN_UNLIMITED_MARKS. Link: https://lore.kernel.org/r/[email protected] Suggested-by: Jan Kara <[email protected]> Signed-off-by: Amir Goldstein <[email protected]> Signed-off-by: Jan Kara <[email protected]>
1 parent b8cd0ee commit 5b8fea6

File tree

10 files changed

+137
-39
lines changed

10 files changed

+137
-39
lines changed

fs/notify/fanotify/fanotify.c

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -801,12 +801,10 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
801801

802802
static void fanotify_free_group_priv(struct fsnotify_group *group)
803803
{
804-
struct user_struct *user;
805-
806804
kfree(group->fanotify_data.merge_hash);
807-
user = group->fanotify_data.user;
808-
atomic_dec(&user->fanotify_listeners);
809-
free_uid(user);
805+
if (group->fanotify_data.ucounts)
806+
dec_ucount(group->fanotify_data.ucounts,
807+
UCOUNT_FANOTIFY_GROUPS);
810808
}
811809

812810
static void fanotify_free_path_event(struct fanotify_event *event)
@@ -862,6 +860,13 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event)
862860
}
863861
}
864862

863+
static void fanotify_freeing_mark(struct fsnotify_mark *mark,
864+
struct fsnotify_group *group)
865+
{
866+
if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
867+
dec_ucount(group->fanotify_data.ucounts, UCOUNT_FANOTIFY_MARKS);
868+
}
869+
865870
static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
866871
{
867872
kmem_cache_free(fanotify_mark_cache, fsn_mark);
@@ -871,5 +876,6 @@ const struct fsnotify_ops fanotify_fsnotify_ops = {
871876
.handle_event = fanotify_handle_event,
872877
.free_group_priv = fanotify_free_group_priv,
873878
.free_event = fanotify_free_event,
879+
.freeing_mark = fanotify_freeing_mark,
874880
.free_mark = fanotify_free_mark,
875881
};

fs/notify/fanotify/fanotify_user.c

Lines changed: 103 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,61 @@
2727
#include "fanotify.h"
2828

2929
#define FANOTIFY_DEFAULT_MAX_EVENTS 16384
30-
#define FANOTIFY_DEFAULT_MAX_MARKS 8192
31-
#define FANOTIFY_DEFAULT_MAX_LISTENERS 128
30+
#define FANOTIFY_OLD_DEFAULT_MAX_MARKS 8192
31+
#define FANOTIFY_DEFAULT_MAX_GROUPS 128
32+
33+
/*
34+
* The legacy fanotify marks limit (8192) is per group and we introduced a tunable
35+
* limit of marks per user, similar to inotify. Effectively, the legacy limit
36+
* of fanotify marks per user is <max marks per group> * <max groups per user>.
37+
* This default limit (1M) also happens to match the increased limit of inotify
38+
* max_user_watches since v5.10.
39+
*/
40+
#define FANOTIFY_DEFAULT_MAX_USER_MARKS \
41+
(FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS)
42+
43+
/*
44+
* Most of the memory cost of adding an inode mark is pinning the marked inode.
45+
* The size of the filesystem inode struct is not uniform across filesystems,
46+
* so double the size of a VFS inode is used as a conservative approximation.
47+
*/
48+
#define INODE_MARK_COST (2 * sizeof(struct inode))
49+
50+
/* configurable via /proc/sys/fs/fanotify/ */
51+
static int fanotify_max_queued_events __read_mostly;
52+
53+
#ifdef CONFIG_SYSCTL
54+
55+
#include <linux/sysctl.h>
56+
57+
struct ctl_table fanotify_table[] = {
58+
{
59+
.procname = "max_user_groups",
60+
.data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
61+
.maxlen = sizeof(int),
62+
.mode = 0644,
63+
.proc_handler = proc_dointvec_minmax,
64+
.extra1 = SYSCTL_ZERO,
65+
},
66+
{
67+
.procname = "max_user_marks",
68+
.data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS],
69+
.maxlen = sizeof(int),
70+
.mode = 0644,
71+
.proc_handler = proc_dointvec_minmax,
72+
.extra1 = SYSCTL_ZERO,
73+
},
74+
{
75+
.procname = "max_queued_events",
76+
.data = &fanotify_max_queued_events,
77+
.maxlen = sizeof(int),
78+
.mode = 0644,
79+
.proc_handler = proc_dointvec_minmax,
80+
.extra1 = SYSCTL_ZERO
81+
},
82+
{ }
83+
};
84+
#endif /* CONFIG_SYSCTL */
3285

3386
/*
3487
* All flags that may be specified in parameter event_f_flags of fanotify_init.
@@ -847,24 +900,38 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
847900
unsigned int type,
848901
__kernel_fsid_t *fsid)
849902
{
903+
struct ucounts *ucounts = group->fanotify_data.ucounts;
850904
struct fsnotify_mark *mark;
851905
int ret;
852906

853-
if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
907+
/*
908+
* Enforce marks limits per user in all containing user ns.
909+
* A group with FAN_UNLIMITED_MARKS does not contribute to mark count
910+
* in the limited groups account.
911+
*/
912+
if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) &&
913+
!inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
854914
return ERR_PTR(-ENOSPC);
855915

856916
mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
857-
if (!mark)
858-
return ERR_PTR(-ENOMEM);
917+
if (!mark) {
918+
ret = -ENOMEM;
919+
goto out_dec_ucounts;
920+
}
859921

860922
fsnotify_init_mark(mark, group);
861923
ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid);
862924
if (ret) {
863925
fsnotify_put_mark(mark);
864-
return ERR_PTR(ret);
926+
goto out_dec_ucounts;
865927
}
866928

867929
return mark;
930+
931+
out_dec_ucounts:
932+
if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
933+
dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS);
934+
return ERR_PTR(ret);
868935
}
869936

870937

@@ -963,7 +1030,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
9631030
{
9641031
struct fsnotify_group *group;
9651032
int f_flags, fd;
966-
struct user_struct *user;
9671033
unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
9681034
unsigned int class = flags & FANOTIFY_CLASS_BITS;
9691035

@@ -1002,12 +1068,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
10021068
if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID))
10031069
return -EINVAL;
10041070

1005-
user = get_current_user();
1006-
if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
1007-
free_uid(user);
1008-
return -EMFILE;
1009-
}
1010-
10111071
f_flags = O_RDWR | FMODE_NONOTIFY;
10121072
if (flags & FAN_CLOEXEC)
10131073
f_flags |= O_CLOEXEC;
@@ -1017,13 +1077,19 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
10171077
/* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
10181078
group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops);
10191079
if (IS_ERR(group)) {
1020-
free_uid(user);
10211080
return PTR_ERR(group);
10221081
}
10231082

1024-
group->fanotify_data.user = user;
1083+
/* Enforce groups limits per user in all containing user ns */
1084+
group->fanotify_data.ucounts = inc_ucount(current_user_ns(),
1085+
current_euid(),
1086+
UCOUNT_FANOTIFY_GROUPS);
1087+
if (!group->fanotify_data.ucounts) {
1088+
fd = -EMFILE;
1089+
goto out_destroy_group;
1090+
}
1091+
10251092
group->fanotify_data.flags = flags;
1026-
atomic_inc(&user->fanotify_listeners);
10271093
group->memcg = get_mem_cgroup_from_mm(current->mm);
10281094

10291095
group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
@@ -1064,16 +1130,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
10641130
goto out_destroy_group;
10651131
group->max_events = UINT_MAX;
10661132
} else {
1067-
group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS;
1133+
group->max_events = fanotify_max_queued_events;
10681134
}
10691135

10701136
if (flags & FAN_UNLIMITED_MARKS) {
10711137
fd = -EPERM;
10721138
if (!capable(CAP_SYS_ADMIN))
10731139
goto out_destroy_group;
1074-
group->fanotify_data.max_marks = UINT_MAX;
1075-
} else {
1076-
group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
10771140
}
10781141

10791142
if (flags & FAN_ENABLE_AUDIT) {
@@ -1357,6 +1420,21 @@ SYSCALL32_DEFINE6(fanotify_mark,
13571420
*/
13581421
static int __init fanotify_user_setup(void)
13591422
{
1423+
struct sysinfo si;
1424+
int max_marks;
1425+
1426+
si_meminfo(&si);
1427+
/*
1428+
* Allow up to 1% of addressable memory to be accounted for per user
1429+
* marks limited to the range [8192, 1048576]. mount and sb marks are
1430+
* a lot cheaper than inode marks, but there is no reason for a user
1431+
* to have many of those, so calculate by the cost of inode marks.
1432+
*/
1433+
max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) /
1434+
INODE_MARK_COST;
1435+
max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
1436+
FANOTIFY_DEFAULT_MAX_USER_MARKS);
1437+
13601438
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10);
13611439
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
13621440

@@ -1371,6 +1449,11 @@ static int __init fanotify_user_setup(void)
13711449
KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
13721450
}
13731451

1452+
fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
1453+
init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
1454+
FANOTIFY_DEFAULT_MAX_GROUPS;
1455+
init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks;
1456+
13741457
return 0;
13751458
}
13761459
device_initcall(fanotify_user_setup);

fs/notify/group.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,6 @@ static struct fsnotify_group *__fsnotify_alloc_group(
122122

123123
/* set to 0 when there are no external references to this group */
124124
refcount_set(&group->refcnt, 1);
125-
atomic_set(&group->num_marks, 0);
126125
atomic_set(&group->user_waits, 0);
127126

128127
spin_lock_init(&group->notification_lock);

fs/notify/mark.c

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -391,8 +391,6 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
391391
list_del_init(&mark->g_list);
392392
spin_unlock(&mark->lock);
393393

394-
atomic_dec(&group->num_marks);
395-
396394
/* Drop mark reference acquired in fsnotify_add_mark_locked() */
397395
fsnotify_put_mark(mark);
398396
}
@@ -656,7 +654,6 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
656654
mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED;
657655

658656
list_add(&mark->g_list, &group->marks_list);
659-
atomic_inc(&group->num_marks);
660657
fsnotify_get_mark(mark); /* for g_list */
661658
spin_unlock(&mark->lock);
662659

@@ -674,7 +671,6 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
674671
FSNOTIFY_MARK_FLAG_ATTACHED);
675672
list_del_init(&mark->g_list);
676673
spin_unlock(&mark->lock);
677-
atomic_dec(&group->num_marks);
678674

679675
fsnotify_put_mark(mark);
680676
return ret;

include/linux/fanotify.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@
22
#ifndef _LINUX_FANOTIFY_H
33
#define _LINUX_FANOTIFY_H
44

5+
#include <linux/sysctl.h>
56
#include <uapi/linux/fanotify.h>
67

8+
extern struct ctl_table fanotify_table[]; /* for sysctl */
9+
710
#define FAN_GROUP_FLAG(group, flag) \
811
((group)->fanotify_data.flags & (flag))
912

include/linux/fsnotify_backend.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -206,9 +206,6 @@ struct fsnotify_group {
206206

207207
/* stores all fastpath marks assoc with this group so they can be cleaned on unregister */
208208
struct mutex mark_mutex; /* protect marks_list */
209-
atomic_t num_marks; /* 1 for each mark and 1 for not being
210-
* past the point of no return when freeing
211-
* a group */
212209
atomic_t user_waits; /* Number of tasks waiting for user
213210
* response */
214211
struct list_head marks_list; /* all inode marks for this group */
@@ -240,8 +237,7 @@ struct fsnotify_group {
240237
wait_queue_head_t access_waitq;
241238
int flags; /* flags from fanotify_init() */
242239
int f_flags; /* event_f_flags from fanotify_init() */
243-
unsigned int max_marks;
244-
struct user_struct *user;
240+
struct ucounts *ucounts;
245241
} fanotify_data;
246242
#endif /* CONFIG_FANOTIFY */
247243
};

include/linux/sched/user.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,6 @@ struct user_struct {
1414
refcount_t __count; /* reference count */
1515
atomic_t processes; /* How many processes does this user have? */
1616
atomic_t sigpending; /* How many pending signals does this user have? */
17-
#ifdef CONFIG_FANOTIFY
18-
atomic_t fanotify_listeners;
19-
#endif
2017
#ifdef CONFIG_EPOLL
2118
atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
2219
#endif

include/linux/user_namespace.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,10 @@ enum ucount_type {
4949
#ifdef CONFIG_INOTIFY_USER
5050
UCOUNT_INOTIFY_INSTANCES,
5151
UCOUNT_INOTIFY_WATCHES,
52+
#endif
53+
#ifdef CONFIG_FANOTIFY
54+
UCOUNT_FANOTIFY_GROUPS,
55+
UCOUNT_FANOTIFY_MARKS,
5256
#endif
5357
UCOUNT_COUNTS,
5458
};

kernel/sysctl.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,9 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
148148
#ifdef CONFIG_INOTIFY_USER
149149
#include <linux/inotify.h>
150150
#endif
151+
#ifdef CONFIG_FANOTIFY
152+
#include <linux/fanotify.h>
153+
#endif
151154

152155
#ifdef CONFIG_PROC_SYSCTL
153156

@@ -3258,7 +3261,14 @@ static struct ctl_table fs_table[] = {
32583261
.mode = 0555,
32593262
.child = inotify_table,
32603263
},
3261-
#endif
3264+
#endif
3265+
#ifdef CONFIG_FANOTIFY
3266+
{
3267+
.procname = "fanotify",
3268+
.mode = 0555,
3269+
.child = fanotify_table,
3270+
},
3271+
#endif
32623272
#ifdef CONFIG_EPOLL
32633273
{
32643274
.procname = "epoll",

kernel/ucount.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,10 @@ static struct ctl_table user_table[] = {
7373
#ifdef CONFIG_INOTIFY_USER
7474
UCOUNT_ENTRY("max_inotify_instances"),
7575
UCOUNT_ENTRY("max_inotify_watches"),
76+
#endif
77+
#ifdef CONFIG_FANOTIFY
78+
UCOUNT_ENTRY("max_fanotify_groups"),
79+
UCOUNT_ENTRY("max_fanotify_marks"),
7680
#endif
7781
{ }
7882
};

0 commit comments

Comments
 (0)