Skip to content

Commit 65c9146

Browse files
committed
contenthash: improve the correctness of needsScan
Commit f724d6f ("contenthash: implement proper Linux symlink semantics for needsScan") fixed issues with needScan's handling of symlinks, but the logic used to figure out if a parent path is in the cache was incorrect in two ways: 1. The optimisations in getFollowSymlinksCallback to avoid looking up / or no-op components in the cache lead to the callback not being called when we go through / (both in for the initial currentPath=/ state and for absolute symlinks). The upshot is that needsScan(/non-existent-path) would always return true because we didn't check if / has been scanned. There were also some issues with the wrong cache record being returned if you hit a symlink to only /. These optimisations only make sense if are only returning a path (as in rootPath) and not anything else. 2. Because needsScan would only store the _last_ good path, cases with symlink jumps to non-existent paths within directories already scanned would result in a re-scan that isn't necessary. Fix this by saving a set of prefix paths we have seen. This change also means that we can also simplify the logic in the needsScan callback. Note that in combination with (1) and (2), if / has been scanned then needsScan will always return false now (because the / prefix is always checked against, and every path has it as a parent). The pathSet structure is the "dumb" way of doing it. We could use a radix tree and LongestPrefix() to implement it in a smarter way, but in practice the list will be very small and is both short-lived and write-many-read-few, so go-immutable-radix's node copy cost probably makes the array better in practice anyway. Fixes: f724d6f ("contenthash: implement proper Linux symlink semantics for needsScan") Signed-off-by: Aleksa Sarai <[email protected]>
1 parent 9aa07e6 commit 65c9146

File tree

1 file changed

+67
-34
lines changed

1 file changed

+67
-34
lines changed

cache/contenthash/checksum.go

Lines changed: 67 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -928,31 +928,64 @@ func (cc *cacheContext) checksum(ctx context.Context, root *iradix.Node[*CacheRe
928928
return cr2, true, nil
929929
}
930930

931+
// pathSet is a set of path prefixes that can be used to see if a given path is
932+
// lexically a child of any path in the set. All paths provided to this set
933+
// MUST be absolute and use / as the separator.
934+
type pathSet struct {
935+
// prefixes contains paths of the form "/a/b/", so that we correctly detect
936+
// /a/b as being a parent of /a/b/c but not /a/bc.
937+
prefixes []string
938+
}
939+
940+
// add a path to the set. This is a no-op if includes(path) == true.
941+
func (s *pathSet) add(p string) {
942+
// Ensure the path is absolute and clean.
943+
p = path.Join("/", p)
944+
if !s.includes(p) {
945+
if p != "/" {
946+
p += "/"
947+
}
948+
s.prefixes = append(s.prefixes, p)
949+
}
950+
}
951+
952+
// includes returns true iff there is a path in the pathSet which is a lexical
953+
// parent of the given path. The provided path MUST be an absolute path and
954+
// MUST NOT contain any ".." components, as they will be path.Clean'd.
955+
func (s pathSet) includes(p string) bool {
956+
// Ensure the path is absolute and clean.
957+
p = path.Join("/", p)
958+
if p != "/" {
959+
p += "/"
960+
}
961+
for _, prefix := range s.prefixes {
962+
if strings.HasPrefix(p, prefix) {
963+
return true
964+
}
965+
}
966+
return false
967+
}
968+
931969
// needsScan returns false if path is in the tree or a parent path is in tree
932970
// and subpath is missing.
933971
func (cc *cacheContext) needsScan(root *iradix.Node[*CacheRecord], path string, followTrailing bool) (bool, error) {
934972
var (
935-
lastGoodPath string
973+
goodPaths pathSet
936974
hasParentInTree bool
937975
)
938976
k := convertPathToKey(path)
939977
_, cr, err := getFollowLinksCallback(root, k, followTrailing, func(subpath string, cr *CacheRecord) error {
978+
// If we found a path that exists in the cache, add it to the set of
979+
// known-scanned paths. Otherwise, verify whether the not-found subpath
980+
// is inside a known-scanned path (we might have hit a "..", taking us
981+
// out of the scanned paths, or we might hit a non-existent path inside
982+
// a scanned path). getFollowLinksCallback iterates left-to-right, so
983+
// we will always hit ancestors first.
940984
if cr != nil {
941-
// If the path is not a symlink, then for now we have a parent in
942-
// the tree. Otherwise, we reset hasParentInTree because we
943-
// might've jumped to a part of the tree that hasn't been scanned.
944-
hasParentInTree = (cr.Type != CacheRecordTypeSymlink)
945-
if hasParentInTree {
946-
lastGoodPath = subpath
947-
}
948-
} else if hasParentInTree {
949-
// If the current component was something like ".." and the subpath
950-
// couldn't be found, then we need to invalidate hasParentInTree.
951-
// In practice this means that our subpath needs to be prefixed by
952-
// the last good path. We add a trailing slash to make sure the
953-
// prefix is a proper lexical prefix (as opposed to /a/b being seen
954-
// as a prefix of /a/bc).
955-
hasParentInTree = strings.HasPrefix(subpath+"/", lastGoodPath+"/")
985+
hasParentInTree = cr.Type != CacheRecordTypeSymlink
986+
goodPaths.add(subpath)
987+
} else {
988+
hasParentInTree = goodPaths.includes(subpath)
956989
}
957990
return nil
958991
})
@@ -1005,11 +1038,7 @@ func (cc *cacheContext) scanPath(ctx context.Context, m *mount, p string, follow
10051038
if err != nil {
10061039
return err
10071040
}
1008-
p := path.Join("/", filepath.ToSlash(rel))
1009-
if p == "/" {
1010-
p = ""
1011-
}
1012-
k := convertPathToKey(p)
1041+
k := convertPathToKey(keyPath(rel))
10131042
if _, ok := n.Get(k); !ok {
10141043
cr := &CacheRecord{
10151044
Type: CacheRecordTypeFile,
@@ -1060,9 +1089,18 @@ func getFollowLinks(root *iradix.Node[*CacheRecord], k []byte, followTrailing bo
10601089
// Linux APIs), followTrailing is obeyed even if the key has a trailing slash
10611090
// (though paths like "foo/link/." will cause the link to be resolved).
10621091
//
1063-
// The callback cb is called after each cache lookup done by
1064-
// getFollowLinksCallback, except for the first lookup where the verbatim key
1065-
// is looked up in the cache.
1092+
// cb is a callback that is called for each path component encountered during
1093+
// path resolution (after the path component is looked up in the cache). This
1094+
// means for a path like /a/b/c, the callback will be called for at least
1095+
//
1096+
// {/, /a, /a/b, /a/b/c}
1097+
//
1098+
// Note that if any of the components are symlinks, the paths will depend on
1099+
// the symlink contents and there will be more callbacks. If the requested key
1100+
// has a trailing slash, the callback will also be called for the final
1101+
// trailing-slash lookup (/a/b/c/ in the above example). Note that
1102+
// getFollowLinksCallback will try to look up the original key directly first
1103+
// and the callback is not called for this first lookup.
10661104
func getFollowLinksCallback(root *iradix.Node[*CacheRecord], k []byte, followTrailing bool, cb followLinksCallback) ([]byte, *CacheRecord, error) {
10671105
v, ok := root.Get(k)
10681106
if ok && (!followTrailing || v.Type != CacheRecordTypeSymlink) {
@@ -1093,16 +1131,11 @@ func getFollowLinksCallback(root *iradix.Node[*CacheRecord], k []byte, followTra
10931131
// Apply the component to the path. Since it is a single component, and
10941132
// our current path contains no symlinks, we can just apply it
10951133
// leixically.
1096-
nextPath := path.Join("/", currentPath, part)
1097-
if nextPath == "/" {
1098-
// If we hit the root, just treat it as a directory.
1099-
currentPath = "/"
1100-
continue
1101-
}
1102-
if nextPath == currentPath {
1103-
// noop component
1104-
continue
1105-
}
1134+
nextPath := keyPath(path.Join("/", currentPath, part))
1135+
// In contrast to rootPath, we don't skip lookups for no-op components
1136+
// or / because we need to call the callback for every path component
1137+
// we hit (including /) and we need to make sure that the CacheRecord
1138+
// we return is correct after every iteration.
11061139

11071140
cr, ok = root.Get(convertPathToKey(nextPath))
11081141
if cb != nil {

0 commit comments

Comments
 (0)