Skip to content

Commit 0aecba6

Browse files
committed
Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull vfs d_inode/d_flags memory ordering fixes from Al Viro: "Fallout from tree-wide audit for ->d_inode/->d_flags barriers use. Basically, the problem is that negative pinned dentries require careful treatment - unless ->d_lock is locked or parent is held at least shared, another thread can make them positive right under us. Most of the uses turned out to be safe - the main surprises as far as filesystems are concerned were - race in dget_parent() fastpath, that might end up with the caller observing the returned dentry _negative_, due to insufficient barriers. It is positive in memory, but we could end up seeing the wrong value of ->d_inode in CPU cache. Fixed. - manual checks that result of lookup_one_len_unlocked() is positive (and rejection of negatives). Again, insufficient barriers (we might end up with inconsistent observed values of ->d_inode and ->d_flags). Fixed by switching to a new primitive that does the checks itself and returns ERR_PTR(-ENOENT) instead of a negative dentry. That way we get rid of boilerplate converting negatives into ERR_PTR(-ENOENT) in the callers and have a single place to deal with the barrier-related mess - inside fs/namei.c rather than in every caller out there. The guts of pathname resolution *do* need to be careful - the race found by Ritesh is real, as well as several similar races. Fortunately, it turns out that we can take care of that with fairly local changes in there. The tree-wide audit had not been fun, and I hate the idea of repeating it. I think the right approach would be to annotate the places where we are _not_ guaranteed ->d_inode/->d_flags stability and have sparse catch regressions. But I'm still not sure what would be the least invasive way of doing that and it's clearly the next cycle fodder" * 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: fs/namei.c: fix missing barriers when checking positivity fix dget_parent() fastpath race new helper: lookup_positive_unlocked() fs/namei.c: pull positivity check into follow_managed()
2 parents b0d4bea + 2fa6b1e commit 0aecba6

File tree

11 files changed

+56
-74
lines changed

11 files changed

+56
-74
lines changed

fs/cifs/cifsfs.c

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -730,11 +730,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
730730
struct inode *dir = d_inode(dentry);
731731
struct dentry *child;
732732

733-
if (!dir) {
734-
dput(dentry);
735-
dentry = ERR_PTR(-ENOENT);
736-
break;
737-
}
738733
if (!S_ISDIR(dir->i_mode)) {
739734
dput(dentry);
740735
dentry = ERR_PTR(-ENOTDIR);
@@ -751,7 +746,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
751746
while (*s && *s != sep)
752747
s++;
753748

754-
child = lookup_one_len_unlocked(p, dentry, s - p);
749+
child = lookup_positive_unlocked(p, dentry, s - p);
755750
dput(dentry);
756751
dentry = child;
757752
} while (!IS_ERR(dentry));

fs/dcache.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -319,7 +319,7 @@ static inline void __d_set_inode_and_type(struct dentry *dentry,
319319
flags = READ_ONCE(dentry->d_flags);
320320
flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
321321
flags |= type_flags;
322-
WRITE_ONCE(dentry->d_flags, flags);
322+
smp_store_release(&dentry->d_flags, flags);
323323
}
324324

325325
static inline void __d_clear_type_and_inode(struct dentry *dentry)
@@ -903,17 +903,19 @@ struct dentry *dget_parent(struct dentry *dentry)
903903
{
904904
int gotref;
905905
struct dentry *ret;
906+
unsigned seq;
906907

907908
/*
908909
* Do optimistic parent lookup without any
909910
* locking.
910911
*/
911912
rcu_read_lock();
913+
seq = raw_seqcount_begin(&dentry->d_seq);
912914
ret = READ_ONCE(dentry->d_parent);
913915
gotref = lockref_get_not_zero(&ret->d_lockref);
914916
rcu_read_unlock();
915917
if (likely(gotref)) {
916-
if (likely(ret == READ_ONCE(dentry->d_parent)))
918+
if (!read_seqcount_retry(&dentry->d_seq, seq))
917919
return ret;
918920
dput(ret);
919921
}

fs/debugfs/inode.c

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -299,13 +299,9 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent)
299299
if (!parent)
300300
parent = debugfs_mount->mnt_root;
301301

302-
dentry = lookup_one_len_unlocked(name, parent, strlen(name));
302+
dentry = lookup_positive_unlocked(name, parent, strlen(name));
303303
if (IS_ERR(dentry))
304304
return NULL;
305-
if (!d_really_is_positive(dentry)) {
306-
dput(dentry);
307-
return NULL;
308-
}
309305
return dentry;
310306
}
311307
EXPORT_SYMBOL_GPL(debugfs_lookup);

fs/kernfs/mount.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
223223
dput(dentry);
224224
return ERR_PTR(-EINVAL);
225225
}
226-
dtmp = lookup_one_len_unlocked(kntmp->name, dentry,
226+
dtmp = lookup_positive_unlocked(kntmp->name, dentry,
227227
strlen(kntmp->name));
228228
dput(dentry);
229229
if (IS_ERR(dtmp))

fs/namei.c

Lines changed: 32 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1210,25 +1210,25 @@ static int follow_automount(struct path *path, struct nameidata *nd,
12101210
* - Flagged as automount point
12111211
*
12121212
* This may only be called in refwalk mode.
1213+
* On success path->dentry is known positive.
12131214
*
12141215
* Serialization is taken care of in namespace.c
12151216
*/
12161217
static int follow_managed(struct path *path, struct nameidata *nd)
12171218
{
12181219
struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
1219-
unsigned managed;
1220+
unsigned flags;
12201221
bool need_mntput = false;
12211222
int ret = 0;
12221223

12231224
/* Given that we're not holding a lock here, we retain the value in a
12241225
* local variable for each dentry as we look at it so that we don't see
12251226
* the components of that value change under us */
1226-
while (managed = READ_ONCE(path->dentry->d_flags),
1227-
managed &= DCACHE_MANAGED_DENTRY,
1228-
unlikely(managed != 0)) {
1227+
while (flags = smp_load_acquire(&path->dentry->d_flags),
1228+
unlikely(flags & DCACHE_MANAGED_DENTRY)) {
12291229
/* Allow the filesystem to manage the transit without i_mutex
12301230
* being held. */
1231-
if (managed & DCACHE_MANAGE_TRANSIT) {
1231+
if (flags & DCACHE_MANAGE_TRANSIT) {
12321232
BUG_ON(!path->dentry->d_op);
12331233
BUG_ON(!path->dentry->d_op->d_manage);
12341234
ret = path->dentry->d_op->d_manage(path, false);
@@ -1237,7 +1237,7 @@ static int follow_managed(struct path *path, struct nameidata *nd)
12371237
}
12381238

12391239
/* Transit to a mounted filesystem. */
1240-
if (managed & DCACHE_MOUNTED) {
1240+
if (flags & DCACHE_MOUNTED) {
12411241
struct vfsmount *mounted = lookup_mnt(path);
12421242
if (mounted) {
12431243
dput(path->dentry);
@@ -1256,7 +1256,7 @@ static int follow_managed(struct path *path, struct nameidata *nd)
12561256
}
12571257

12581258
/* Handle an automount point */
1259-
if (managed & DCACHE_NEED_AUTOMOUNT) {
1259+
if (flags & DCACHE_NEED_AUTOMOUNT) {
12601260
ret = follow_automount(path, nd, &need_mntput);
12611261
if (ret < 0)
12621262
break;
@@ -1269,10 +1269,12 @@ static int follow_managed(struct path *path, struct nameidata *nd)
12691269

12701270
if (need_mntput && path->mnt == mnt)
12711271
mntput(path->mnt);
1272-
if (ret == -EISDIR || !ret)
1273-
ret = 1;
12741272
if (need_mntput)
12751273
nd->flags |= LOOKUP_JUMPED;
1274+
if (ret == -EISDIR || !ret)
1275+
ret = 1;
1276+
if (ret > 0 && unlikely(d_flags_negative(flags)))
1277+
ret = -ENOENT;
12761278
if (unlikely(ret < 0))
12771279
path_put_conditional(path, nd);
12781280
return ret;
@@ -1621,10 +1623,6 @@ static int lookup_fast(struct nameidata *nd,
16211623
dput(dentry);
16221624
return status;
16231625
}
1624-
if (unlikely(d_is_negative(dentry))) {
1625-
dput(dentry);
1626-
return -ENOENT;
1627-
}
16281626

16291627
path->mnt = mnt;
16301628
path->dentry = dentry;
@@ -1811,11 +1809,6 @@ static int walk_component(struct nameidata *nd, int flags)
18111809
if (unlikely(err < 0))
18121810
return err;
18131811

1814-
if (unlikely(d_is_negative(path.dentry))) {
1815-
path_to_nameidata(&path, nd);
1816-
return -ENOENT;
1817-
}
1818-
18191812
seq = 0; /* we are already out of RCU mode */
18201813
inode = d_backing_inode(path.dentry);
18211814
}
@@ -2568,6 +2561,26 @@ struct dentry *lookup_one_len_unlocked(const char *name,
25682561
}
25692562
EXPORT_SYMBOL(lookup_one_len_unlocked);
25702563

2564+
/*
2565+
* Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
2566+
* on negatives. Returns known positive or ERR_PTR(); that's what
2567+
* most of the users want. Note that pinned negative with unlocked parent
2568+
* _can_ become positive at any time, so callers of lookup_one_len_unlocked()
2569+
* need to be very careful; pinned positives have ->d_inode stable, so
2570+
* this one avoids such problems.
2571+
*/
2572+
struct dentry *lookup_positive_unlocked(const char *name,
2573+
struct dentry *base, int len)
2574+
{
2575+
struct dentry *ret = lookup_one_len_unlocked(name, base, len);
2576+
if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
2577+
dput(ret);
2578+
ret = ERR_PTR(-ENOENT);
2579+
}
2580+
return ret;
2581+
}
2582+
EXPORT_SYMBOL(lookup_positive_unlocked);
2583+
25712584
#ifdef CONFIG_UNIX98_PTYS
25722585
int path_pts(struct path *path)
25732586
{
@@ -2662,7 +2675,7 @@ mountpoint_last(struct nameidata *nd)
26622675
return PTR_ERR(path.dentry);
26632676
}
26642677
}
2665-
if (d_is_negative(path.dentry)) {
2678+
if (d_flags_negative(smp_load_acquire(&path.dentry->d_flags))) {
26662679
dput(path.dentry);
26672680
return -ENOENT;
26682681
}
@@ -3356,11 +3369,6 @@ static int do_last(struct nameidata *nd,
33563369
if (unlikely(error < 0))
33573370
return error;
33583371

3359-
if (unlikely(d_is_negative(path.dentry))) {
3360-
path_to_nameidata(&path, nd);
3361-
return -ENOENT;
3362-
}
3363-
33643372
/*
33653373
* create/update audit record if it already exists.
33663374
*/

fs/nfsd/nfs3xdr.c

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -863,13 +863,11 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
863863
} else
864864
dchild = dget(dparent);
865865
} else
866-
dchild = lookup_one_len_unlocked(name, dparent, namlen);
866+
dchild = lookup_positive_unlocked(name, dparent, namlen);
867867
if (IS_ERR(dchild))
868868
return rv;
869869
if (d_mountpoint(dchild))
870870
goto out;
871-
if (d_really_is_negative(dchild))
872-
goto out;
873871
if (dchild->d_inode->i_ino != ino)
874872
goto out;
875873
rv = fh_compose(fhp, exp, dchild, &cd->fh);

fs/nfsd/nfs4xdr.c

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2991,18 +2991,9 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
29912991
__be32 nfserr;
29922992
int ignore_crossmnt = 0;
29932993

2994-
dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
2994+
dentry = lookup_positive_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
29952995
if (IS_ERR(dentry))
29962996
return nfserrno(PTR_ERR(dentry));
2997-
if (d_really_is_negative(dentry)) {
2998-
/*
2999-
* we're not holding the i_mutex here, so there's
3000-
* a window where this directory entry could have gone
3001-
* away.
3002-
*/
3003-
dput(dentry);
3004-
return nfserr_noent;
3005-
}
30062997

30072998
exp_get(exp);
30082999
/*

fs/overlayfs/namei.c

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -200,16 +200,14 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
200200
int err;
201201
bool last_element = !post[0];
202202

203-
this = lookup_one_len_unlocked(name, base, namelen);
203+
this = lookup_positive_unlocked(name, base, namelen);
204204
if (IS_ERR(this)) {
205205
err = PTR_ERR(this);
206206
this = NULL;
207207
if (err == -ENOENT || err == -ENAMETOOLONG)
208208
goto out;
209209
goto out_err;
210210
}
211-
if (!this->d_inode)
212-
goto put_and_out;
213211

214212
if (ovl_dentry_weird(this)) {
215213
/* Don't support traversing automounts and other weirdness */
@@ -651,17 +649,15 @@ struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh)
651649
if (err)
652650
return ERR_PTR(err);
653651

654-
index = lookup_one_len_unlocked(name.name, ofs->indexdir, name.len);
652+
index = lookup_positive_unlocked(name.name, ofs->indexdir, name.len);
655653
kfree(name.name);
656654
if (IS_ERR(index)) {
657655
if (PTR_ERR(index) == -ENOENT)
658656
index = NULL;
659657
return index;
660658
}
661659

662-
if (d_is_negative(index))
663-
err = 0;
664-
else if (ovl_is_whiteout(index))
660+
if (ovl_is_whiteout(index))
665661
err = -ESTALE;
666662
else if (ovl_dentry_weird(index))
667663
err = -EIO;
@@ -685,7 +681,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
685681
if (err)
686682
return ERR_PTR(err);
687683

688-
index = lookup_one_len_unlocked(name.name, ofs->indexdir, name.len);
684+
index = lookup_positive_unlocked(name.name, ofs->indexdir, name.len);
689685
if (IS_ERR(index)) {
690686
err = PTR_ERR(index);
691687
if (err == -ENOENT) {
@@ -700,9 +696,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
700696
}
701697

702698
inode = d_inode(index);
703-
if (d_is_negative(index)) {
704-
goto out_dput;
705-
} else if (ovl_is_whiteout(index) && !verify) {
699+
if (ovl_is_whiteout(index) && !verify) {
706700
/*
707701
* When index lookup is called with !verify for decoding an
708702
* overlay file handle, a whiteout index implies that decode
@@ -1131,7 +1125,7 @@ bool ovl_lower_positive(struct dentry *dentry)
11311125
struct dentry *this;
11321126
struct dentry *lowerdir = poe->lowerstack[i].dentry;
11331127

1134-
this = lookup_one_len_unlocked(name->name, lowerdir,
1128+
this = lookup_positive_unlocked(name->name, lowerdir,
11351129
name->len);
11361130
if (IS_ERR(this)) {
11371131
switch (PTR_ERR(this)) {
@@ -1148,10 +1142,8 @@ bool ovl_lower_positive(struct dentry *dentry)
11481142
break;
11491143
}
11501144
} else {
1151-
if (this->d_inode) {
1152-
positive = !ovl_is_whiteout(this);
1153-
done = true;
1154-
}
1145+
positive = !ovl_is_whiteout(this);
1146+
done = true;
11551147
dput(this);
11561148
}
11571149
}

fs/quota/dquot.c

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2487,21 +2487,15 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
24872487
struct dentry *dentry;
24882488
int error;
24892489

2490-
dentry = lookup_one_len_unlocked(qf_name, sb->s_root, strlen(qf_name));
2490+
dentry = lookup_positive_unlocked(qf_name, sb->s_root, strlen(qf_name));
24912491
if (IS_ERR(dentry))
24922492
return PTR_ERR(dentry);
24932493

2494-
if (d_really_is_negative(dentry)) {
2495-
error = -ENOENT;
2496-
goto out;
2497-
}
2498-
24992494
error = security_quota_on(dentry);
25002495
if (!error)
25012496
error = dquot_load_quota_inode(d_inode(dentry), type, format_id,
25022497
DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
25032498

2504-
out:
25052499
dput(dentry);
25062500
return error;
25072501
}

include/linux/dcache.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,11 @@ static inline bool d_is_negative(const struct dentry *dentry)
440440
return d_is_miss(dentry);
441441
}
442442

443+
static inline bool d_flags_negative(unsigned flags)
444+
{
445+
return (flags & DCACHE_ENTRY_TYPE) == DCACHE_MISS_TYPE;
446+
}
447+
443448
static inline bool d_is_positive(const struct dentry *dentry)
444449
{
445450
return !d_is_negative(dentry);

0 commit comments

Comments
 (0)