Skip to content

Commit a13ae69

Browse files
committed
proc: Dentry flushing without proc_mnt
Cleanly handling proc mount options requires the internal mount of proc to be removed (so mount options are not ignored), and quite possibly multiple proc superblocks per pid namespace (so a second mount of proc does not silently get the mount options of the first mount of proc). In either case being able to flush proc dentries on process exit needs to be made to work without going through proc_mnt. After several discussions this is the set of changes that work and no one objects to. --- I have addressed all of the review comments as I understand them, and fixed the small oversight the kernel test robot was able to find. (I had failed to initialize the new field pid->inodes). I did not hear any concerns from the 10,000 foot level last time so I am assuming this set of changes (barring bugs) is good to go. Unless some new issues appear my plan is to put this in my tree and get this into linux-next. Which will give Alexey something to build his changes on. I tested this set of changes by running: (while ls -1 -f /proc > /dev/null ; do :; done ) & And monitoring the amount of free memory. With the flushing disabled I saw the used memory in the system grow by 20M before the shrinker would bring it back down to where it started. With the patch applied I saw the memory usage stay essentially fixed. So flushing definitely keeps things working better. Eric W. 
Biederman (6): proc: Rename in proc_inode rename sysctl_inodes sibling_inodes proc: Generalize proc_sys_prune_dcache into proc_prune_siblings_dcache proc: In proc_prune_siblings_dcache cache an aquired super block proc: Use d_invalidate in proc_prune_siblings_dcache proc: Clear the pieces of proc_inode that proc_evict_inode cares about proc: Use a list of inodes to flush from proc fs/proc/base.c | 111 ++++++++++++++++-------------------------------- fs/proc/inode.c | 73 ++++++++++++++++++++++++++++--- fs/proc/internal.h | 4 +- fs/proc/proc_sysctl.c | 45 +++----------------- include/linux/pid.h | 1 + include/linux/proc_fs.h | 4 +- kernel/exit.c | 4 +- kernel/pid.c | 1 + 8 files changed, 120 insertions(+), 123 deletions(-) Link: https://lore.kernel.org/lkml/[email protected]/ Signed-off-by: "Eric W. Biederman" <[email protected]> Merge branch 'proc-dentry-flushing-without-proc-mnt-v2' into HEAD
2 parents 11a48a5 + 7bc3e6e commit a13ae69

File tree

8 files changed

+120
-123
lines changed

8 files changed

+120
-123
lines changed

fs/proc/base.c

Lines changed: 36 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -1834,11 +1834,25 @@ void task_dump_owner(struct task_struct *task, umode_t mode,
18341834
*rgid = gid;
18351835
}
18361836

1837+
void proc_pid_evict_inode(struct proc_inode *ei)
1838+
{
1839+
struct pid *pid = ei->pid;
1840+
1841+
if (S_ISDIR(ei->vfs_inode.i_mode)) {
1842+
spin_lock(&pid->wait_pidfd.lock);
1843+
hlist_del_init_rcu(&ei->sibling_inodes);
1844+
spin_unlock(&pid->wait_pidfd.lock);
1845+
}
1846+
1847+
put_pid(pid);
1848+
}
1849+
18371850
struct inode *proc_pid_make_inode(struct super_block * sb,
18381851
struct task_struct *task, umode_t mode)
18391852
{
18401853
struct inode * inode;
18411854
struct proc_inode *ei;
1855+
struct pid *pid;
18421856

18431857
/* We need a new inode */
18441858

@@ -1856,10 +1870,18 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
18561870
/*
18571871
* grab the reference to task.
18581872
*/
1859-
ei->pid = get_task_pid(task, PIDTYPE_PID);
1860-
if (!ei->pid)
1873+
pid = get_task_pid(task, PIDTYPE_PID);
1874+
if (!pid)
18611875
goto out_unlock;
18621876

1877+
/* Let the pid remember us for quick removal */
1878+
ei->pid = pid;
1879+
if (S_ISDIR(mode)) {
1880+
spin_lock(&pid->wait_pidfd.lock);
1881+
hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
1882+
spin_unlock(&pid->wait_pidfd.lock);
1883+
}
1884+
18631885
task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
18641886
security_task_to_inode(task, inode);
18651887

@@ -3230,90 +3252,29 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
32303252
.permission = proc_pid_permission,
32313253
};
32323254

3233-
static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
3234-
{
3235-
struct dentry *dentry, *leader, *dir;
3236-
char buf[10 + 1];
3237-
struct qstr name;
3238-
3239-
name.name = buf;
3240-
name.len = snprintf(buf, sizeof(buf), "%u", pid);
3241-
/* no ->d_hash() rejects on procfs */
3242-
dentry = d_hash_and_lookup(mnt->mnt_root, &name);
3243-
if (dentry) {
3244-
d_invalidate(dentry);
3245-
dput(dentry);
3246-
}
3247-
3248-
if (pid == tgid)
3249-
return;
3250-
3251-
name.name = buf;
3252-
name.len = snprintf(buf, sizeof(buf), "%u", tgid);
3253-
leader = d_hash_and_lookup(mnt->mnt_root, &name);
3254-
if (!leader)
3255-
goto out;
3256-
3257-
name.name = "task";
3258-
name.len = strlen(name.name);
3259-
dir = d_hash_and_lookup(leader, &name);
3260-
if (!dir)
3261-
goto out_put_leader;
3262-
3263-
name.name = buf;
3264-
name.len = snprintf(buf, sizeof(buf), "%u", pid);
3265-
dentry = d_hash_and_lookup(dir, &name);
3266-
if (dentry) {
3267-
d_invalidate(dentry);
3268-
dput(dentry);
3269-
}
3270-
3271-
dput(dir);
3272-
out_put_leader:
3273-
dput(leader);
3274-
out:
3275-
return;
3276-
}
3277-
32783255
/**
3279-
* proc_flush_task - Remove dcache entries for @task from the /proc dcache.
3280-
* @task: task that should be flushed.
3256+
* proc_flush_pid - Remove dcache entries for @pid from the /proc dcache.
3257+
* @pid: pid that should be flushed.
32813258
*
3282-
* When flushing dentries from proc, one needs to flush them from global
3283-
* proc (proc_mnt) and from all the namespaces' procs this task was seen
3284-
* in. This call is supposed to do all of this job.
3285-
*
3286-
* Looks in the dcache for
3287-
* /proc/@pid
3288-
* /proc/@tgid/task/@pid
3289-
* if either directory is present flushes it and all of it'ts children
3290-
* from the dcache.
3259+
* This function walks a list of inodes (that belong to any proc
3260+
* filesystem) that are attached to the pid and flushes them from
3261+
* the dentry cache.
32913262
*
32923263
* It is safe and reasonable to cache /proc entries for a task until
32933264
* that task exits. After that they just clog up the dcache with
32943265
* useless entries, possibly causing useful dcache entries to be
3295-
* flushed instead. This routine is proved to flush those useless
3296-
* dcache entries at process exit time.
3266+
* flushed instead. This routine is provided to flush those useless
3267+
* dcache entries when a process is reaped.
32973268
*
32983269
* NOTE: This routine is just an optimization so it does not guarantee
3299-
* that no dcache entries will exist at process exit time it
3300-
* just makes it very unlikely that any will persist.
3270+
* that no dcache entries will exist after a process is reaped
3271+
* it just makes it very unlikely that any will persist.
33013272
*/
33023273

3303-
void proc_flush_task(struct task_struct *task)
3274+
void proc_flush_pid(struct pid *pid)
33043275
{
3305-
int i;
3306-
struct pid *pid, *tgid;
3307-
struct upid *upid;
3308-
3309-
pid = task_pid(task);
3310-
tgid = task_tgid(task);
3311-
3312-
for (i = 0; i <= pid->level; i++) {
3313-
upid = &pid->numbers[i];
3314-
proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
3315-
tgid->numbers[i].nr);
3316-
}
3276+
proc_invalidate_siblings_dcache(&pid->inodes, &pid->wait_pidfd.lock);
3277+
put_pid(pid);
33173278
}
33183279

33193280
static struct dentry *proc_pid_instantiate(struct dentry * dentry,

fs/proc/inode.c

Lines changed: 68 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,21 +33,27 @@ static void proc_evict_inode(struct inode *inode)
3333
{
3434
struct proc_dir_entry *de;
3535
struct ctl_table_header *head;
36+
struct proc_inode *ei = PROC_I(inode);
3637

3738
truncate_inode_pages_final(&inode->i_data);
3839
clear_inode(inode);
3940

4041
/* Stop tracking associated processes */
41-
put_pid(PROC_I(inode)->pid);
42+
if (ei->pid) {
43+
proc_pid_evict_inode(ei);
44+
ei->pid = NULL;
45+
}
4246

4347
/* Let go of any associated proc directory entry */
44-
de = PDE(inode);
45-
if (de)
48+
de = ei->pde;
49+
if (de) {
4650
pde_put(de);
51+
ei->pde = NULL;
52+
}
4753

48-
head = PROC_I(inode)->sysctl;
54+
head = ei->sysctl;
4955
if (head) {
50-
RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
56+
RCU_INIT_POINTER(ei->sysctl, NULL);
5157
proc_sys_evict_inode(inode, head);
5258
}
5359
}
@@ -68,6 +74,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
6874
ei->pde = NULL;
6975
ei->sysctl = NULL;
7076
ei->sysctl_entry = NULL;
77+
INIT_HLIST_NODE(&ei->sibling_inodes);
7178
ei->ns_ops = NULL;
7279
return &ei->vfs_inode;
7380
}
@@ -102,6 +109,62 @@ void __init proc_init_kmemcache(void)
102109
BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE);
103110
}
104111

112+
void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
113+
{
114+
struct inode *inode;
115+
struct proc_inode *ei;
116+
struct hlist_node *node;
117+
struct super_block *old_sb = NULL;
118+
119+
rcu_read_lock();
120+
for (;;) {
121+
struct super_block *sb;
122+
node = hlist_first_rcu(inodes);
123+
if (!node)
124+
break;
125+
ei = hlist_entry(node, struct proc_inode, sibling_inodes);
126+
spin_lock(lock);
127+
hlist_del_init_rcu(&ei->sibling_inodes);
128+
spin_unlock(lock);
129+
130+
inode = &ei->vfs_inode;
131+
sb = inode->i_sb;
132+
if ((sb != old_sb) && !atomic_inc_not_zero(&sb->s_active))
133+
continue;
134+
inode = igrab(inode);
135+
rcu_read_unlock();
136+
if (sb != old_sb) {
137+
if (old_sb)
138+
deactivate_super(old_sb);
139+
old_sb = sb;
140+
}
141+
if (unlikely(!inode)) {
142+
rcu_read_lock();
143+
continue;
144+
}
145+
146+
if (S_ISDIR(inode->i_mode)) {
147+
struct dentry *dir = d_find_any_alias(inode);
148+
if (dir) {
149+
d_invalidate(dir);
150+
dput(dir);
151+
}
152+
} else {
153+
struct dentry *dentry;
154+
while ((dentry = d_find_alias(inode))) {
155+
d_invalidate(dentry);
156+
dput(dentry);
157+
}
158+
}
159+
iput(inode);
160+
161+
rcu_read_lock();
162+
}
163+
rcu_read_unlock();
164+
if (old_sb)
165+
deactivate_super(old_sb);
166+
}
167+
105168
static int proc_show_options(struct seq_file *seq, struct dentry *root)
106169
{
107170
struct super_block *sb = root->d_sb;

fs/proc/internal.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ struct proc_inode {
9191
struct proc_dir_entry *pde;
9292
struct ctl_table_header *sysctl;
9393
struct ctl_table *sysctl_entry;
94-
struct hlist_node sysctl_inodes;
94+
struct hlist_node sibling_inodes;
9595
const struct proc_ns_operations *ns_ops;
9696
struct inode vfs_inode;
9797
} __randomize_layout;
@@ -158,6 +158,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
158158
extern const struct dentry_operations pid_dentry_operations;
159159
extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int);
160160
extern int proc_setattr(struct dentry *, struct iattr *);
161+
extern void proc_pid_evict_inode(struct proc_inode *);
161162
extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
162163
extern void pid_update_inode(struct task_struct *, struct inode *);
163164
extern int pid_delete_dentry(const struct dentry *);
@@ -210,6 +211,7 @@ extern const struct inode_operations proc_pid_link_inode_operations;
210211
extern const struct super_operations proc_sops;
211212

212213
void proc_init_kmemcache(void);
214+
void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock);
213215
void set_proc_pid_nlink(void);
214216
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
215217
extern void proc_entry_rundown(struct proc_dir_entry *);

fs/proc/proc_sysctl.c

Lines changed: 6 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -267,42 +267,9 @@ static void unuse_table(struct ctl_table_header *p)
267267
complete(p->unregistering);
268268
}
269269

270-
static void proc_sys_prune_dcache(struct ctl_table_header *head)
270+
static void proc_sys_invalidate_dcache(struct ctl_table_header *head)
271271
{
272-
struct inode *inode;
273-
struct proc_inode *ei;
274-
struct hlist_node *node;
275-
struct super_block *sb;
276-
277-
rcu_read_lock();
278-
for (;;) {
279-
node = hlist_first_rcu(&head->inodes);
280-
if (!node)
281-
break;
282-
ei = hlist_entry(node, struct proc_inode, sysctl_inodes);
283-
spin_lock(&sysctl_lock);
284-
hlist_del_init_rcu(&ei->sysctl_inodes);
285-
spin_unlock(&sysctl_lock);
286-
287-
inode = &ei->vfs_inode;
288-
sb = inode->i_sb;
289-
if (!atomic_inc_not_zero(&sb->s_active))
290-
continue;
291-
inode = igrab(inode);
292-
rcu_read_unlock();
293-
if (unlikely(!inode)) {
294-
deactivate_super(sb);
295-
rcu_read_lock();
296-
continue;
297-
}
298-
299-
d_prune_aliases(inode);
300-
iput(inode);
301-
deactivate_super(sb);
302-
303-
rcu_read_lock();
304-
}
305-
rcu_read_unlock();
272+
proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock);
306273
}
307274

308275
/* called under sysctl_lock, will reacquire if has to wait */
@@ -324,10 +291,10 @@ static void start_unregistering(struct ctl_table_header *p)
324291
spin_unlock(&sysctl_lock);
325292
}
326293
/*
327-
* Prune dentries for unregistered sysctls: namespaced sysctls
294+
* Invalidate dentries for unregistered sysctls: namespaced sysctls
328295
* can have duplicate names and contaminate dcache very badly.
329296
*/
330-
proc_sys_prune_dcache(p);
297+
proc_sys_invalidate_dcache(p);
331298
/*
332299
* do not remove from the list until nobody holds it; walking the
333300
* list in do_sysctl() relies on that.
@@ -483,7 +450,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
483450
}
484451
ei->sysctl = head;
485452
ei->sysctl_entry = table;
486-
hlist_add_head_rcu(&ei->sysctl_inodes, &head->inodes);
453+
hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes);
487454
head->count++;
488455
spin_unlock(&sysctl_lock);
489456

@@ -514,7 +481,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
514481
void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
515482
{
516483
spin_lock(&sysctl_lock);
517-
hlist_del_init_rcu(&PROC_I(inode)->sysctl_inodes);
484+
hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes);
518485
if (!--head->count)
519486
kfree_rcu(head, rcu);
520487
spin_unlock(&sysctl_lock);

include/linux/pid.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ struct pid
6262
unsigned int level;
6363
/* lists of tasks that use this pid */
6464
struct hlist_head tasks[PIDTYPE_MAX];
65+
struct hlist_head inodes;
6566
/* wait queue for pidfd notifications */
6667
wait_queue_head_t wait_pidfd;
6768
struct rcu_head rcu;

include/linux/proc_fs.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ struct proc_ops {
3232
typedef int (*proc_write_t)(struct file *, char *, size_t);
3333

3434
extern void proc_root_init(void);
35-
extern void proc_flush_task(struct task_struct *);
35+
extern void proc_flush_pid(struct pid *);
3636

3737
extern struct proc_dir_entry *proc_symlink(const char *,
3838
struct proc_dir_entry *, const char *);
@@ -105,7 +105,7 @@ static inline void proc_root_init(void)
105105
{
106106
}
107107

108-
static inline void proc_flush_task(struct task_struct *task)
108+
static inline void proc_flush_pid(struct pid *pid)
109109
{
110110
}
111111

0 commit comments

Comments
 (0)