Skip to content

Commit ab9666d

Browse files
ayushr2gvisor-bot
authored andcommitted
Handle non-empty EmptyDirs used by GCS Fuse CSI Driver.
Some CSI drivers, like GCS Fuse CSI driver, inject EmptyDirs into sidecar containers and communicate with the container using files in the EmptyDir. In gVisor terminology, such an EmptyDir is being used as a shared bind (gofer) mount. It is not exclusive to the sandbox. This breaks a fundamental assumption gVisor makes about EmptyDirs; it assumes that they are exclusive to the sandbox and that it has no external observers. So as an optimization, gVisor converts EmptyDir volumes into gVisor-internal tmpfs filesystems that are mounted into all the containers that are using that EmptyDir. As a result: - Any files in the host EmptyDir directory is not reflected within the sandbox. - Any changes made by the sandbox in the EmptyDir are not reflcted on the host. This change uses the heuristic that if the EmptyDir volume's host directory is not empty at sandbox creation time, then it is being shared with some external component which is interacting with the sandbox. We have observed that the GCS Fuse CSI Driver populates the /gcsfuse-tmp EmptyDir with a UDS at path `.volumes/gcsfuse-mount/socket`. PiperOrigin-RevId: 761839333
1 parent a178e09 commit ab9666d

File tree

5 files changed

+168
-23
lines changed

5 files changed

+168
-23
lines changed

pkg/shim/v1/utils/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ go_library(
1717
],
1818
deps = [
1919
"//runsc/specutils",
20+
"@com_github_containerd_log//:go_default_library",
2021
"@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
2122
],
2223
)

pkg/shim/v1/utils/volumes.go

Lines changed: 66 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,12 @@ package utils
1616

1717
import (
1818
"fmt"
19+
"io"
20+
"os"
1921
"path/filepath"
2022
"strings"
2123

24+
"github.com/containerd/log"
2225
specs "github.com/opencontainers/runtime-spec/specs-go"
2326
"gvisor.dev/gvisor/runsc/specutils"
2427
)
@@ -33,6 +36,10 @@ const (
3336
// emptyDirVolumesDir is the directory inside kubeletPodsDir/{uid}/volumes/
3437
// that hosts all the EmptyDir volumes used by the pod.
3538
emptyDirVolumesDir = "kubernetes.io~empty-dir"
39+
40+
// selfFilestorePrefix is the prefix for the filestore files used for
41+
// self-backed mounts.
42+
selfFilestorePrefix = ".gvisor.filestore."
3643
)
3744

3845
// The directory structure for volumes is as follows:
@@ -81,6 +88,11 @@ func volumeSourceKey(volume string) string {
8188
return volumeKeyPrefix + volume + ".source"
8289
}
8390

91+
// volumeShareKey constructs the annotation key for volume share type.
92+
func volumeShareKey(volume string) string {
93+
return volumeKeyPrefix + volume + ".share"
94+
}
95+
8496
// volumePath searches the volume path in the kubelet pod directory.
8597
func volumePath(volume, uid string) (string, error) {
8698
// TODO: Support subpath when gvisor supports pod volume bind mount.
@@ -121,17 +133,12 @@ func isVolumePath(volume, path string) (bool, error) {
121133
// runsc should use these two setting to infer EmptyDir medium:
122134
// - tmpfs annotation type + tmpfs mount type = memory-backed EmptyDir
123135
// - tmpfs annotation type + bind mount type = disk-backed EmptyDir
136+
//
137+
// NOTE(b/416567832): Some CSI drivers (like GCS FUSE driver) use EmptyDirs to
138+
// communicate with the Pod over a UDS. While not foolproof, we detect such
139+
// EmptyDirs by checking if the host directory is not empty and turn off the
140+
// EmptyDir optimization for them by configuring them as normal bind mounts.
124141
func UpdateVolumeAnnotations(s *specs.Spec) (bool, error) {
125-
var uid string
126-
if IsSandbox(s) {
127-
var err error
128-
uid, err = podUID(s)
129-
if err != nil {
130-
// Skip if we can't get pod UID, because this doesn't work
131-
// for containerd 1.1.
132-
return false, nil
133-
}
134-
}
135142
updated := false
136143
for k, v := range s.Annotations {
137144
if !isVolumeKey(k) {
@@ -141,34 +148,51 @@ func UpdateVolumeAnnotations(s *specs.Spec) (bool, error) {
141148
continue
142149
}
143150
volume := volumeName(k)
144-
if uid != "" {
151+
if IsSandbox(s) {
145152
// This is the root (first) container. Mount annotations are only
146153
// consumed from this container's spec. So fix mount annotations by:
147154
// 1. Adding source annotation.
148155
// 2. Fixing type annotation.
156+
uid, err := podUID(s)
157+
if err != nil {
158+
// Skip if we can't get pod UID, because this doesn't work
159+
// for containerd 1.1.
160+
return false, nil
161+
}
149162
path, err := volumePath(volume, uid)
150163
if err != nil {
151164
return false, fmt.Errorf("get volume path for %q: %w", volume, err)
152165
}
153166
s.Annotations[volumeSourceKey(volume)] = path
154167
if strings.Contains(path, emptyDirVolumesDir) {
155-
s.Annotations[k] = "tmpfs" // See note about EmptyDir.
168+
if isEmptyDirEmpty(path) {
169+
s.Annotations[k] = "tmpfs" // See note about EmptyDir.
170+
} else {
171+
// This is a non-empty EmptyDir volume. Configure it as a bind mount.
172+
log.L.Infof("Non-empty EmptyDir volume %q, configuring bind mount annotations", volume)
173+
s.Annotations[k] = "bind"
174+
s.Annotations[volumeShareKey(volume)] = "shared"
175+
}
156176
}
157177
updated = true
158178
} else {
159179
// This is a sub-container. Mount annotations are ignored. So no need to
160-
// bother fixing those.
180+
// bother fixing those. An error is returned for sandbox if source
181+
// annotation is not successfully applied, so it is guaranteed that the
182+
// source annotation for sandbox has already been successfully applied at
183+
// this point. Update mount type in spec.Mounts if required.
161184
for i := range s.Mounts {
162-
// An error is returned for sandbox if source annotation is not
163-
// successfully applied, so it is guaranteed that the source annotation
164-
// for sandbox has already been successfully applied at this point.
165-
//
166185
// The volume name is unique inside a pod, so matching without podUID
167186
// is fine here.
168187
//
169188
// TODO: Pass podUID down to shim for containers to do more accurate
170189
// matching.
171190
if yes, _ := isVolumePath(volume, s.Mounts[i].Source); yes {
191+
if strings.Contains(s.Mounts[i].Source, emptyDirVolumesDir) && !isEmptyDirEmpty(s.Mounts[i].Source) {
192+
// This is a non-empty EmptyDir volume. Don't change the mount type.
193+
log.L.Infof("Non-empty EmptyDir volume %q, not changing its mount type", volume)
194+
continue
195+
}
172196
// Container mount type must match the mount type specified by
173197
// admission controller. See note about EmptyDir.
174198
specutils.ChangeMountType(&s.Mounts[i], v)
@@ -187,6 +211,31 @@ func UpdateVolumeAnnotations(s *specs.Spec) (bool, error) {
187211
return updated, nil
188212
}
189213

214+
func isEmptyDirEmpty(path string) bool {
215+
f, err := os.Open(path)
216+
if err != nil {
217+
log.L.Warningf("failed to open %q to check if it is empty: %v", path, err)
218+
return true
219+
}
220+
defer f.Close()
221+
222+
names, err := f.Readdirnames(2)
223+
if len(names) == 0 && err == io.EOF {
224+
return true
225+
}
226+
if err != io.EOF && err != nil {
227+
log.L.Warningf("failed to readdirnames %q to check if it is empty: %v", path, err)
228+
return true
229+
}
230+
if len(names) == 1 && strings.HasPrefix(names[0], selfFilestorePrefix) {
231+
// The gVisor filestore file is the only file in the directory. This means
232+
// that a previous container already created a shared mount for this
233+
// EmptyDir. This is expected and should be considered empty.
234+
return true
235+
}
236+
return false
237+
}
238+
190239
// configureShm sets up annotations to mount /dev/shm as a pod shared tmpfs
191240
// mount inside containers.
192241
//

pkg/shim/v1/utils/volumes_test.go

Lines changed: 94 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,17 +32,25 @@ func TestUpdateVolumeAnnotations(t *testing.T) {
3232
kubeletPodsDir = dir
3333

3434
const (
35-
testPodUID = "testuid"
36-
testVolumeName = "testvolume"
37-
testLogDirPath = "/var/log/pods/testns_testname_" + testPodUID
38-
testLegacyLogDirPath = "/var/log/pods/" + testPodUID
35+
testPodUID = "testuid"
36+
testVolumeName = "testvolume"
37+
testNonEmptyVolumeName = "nonemptyvolume"
38+
testLogDirPath = "/var/log/pods/testns_testname_" + testPodUID
39+
testLegacyLogDirPath = "/var/log/pods/" + testPodUID
3940
)
4041
testVolumePath := fmt.Sprintf("%s/%s/volumes/%s/%s", dir, testPodUID, emptyDirVolumesDir, testVolumeName)
41-
4242
if err := os.MkdirAll(testVolumePath, 0755); err != nil {
4343
t.Fatalf("Create test volume: %v", err)
4444
}
4545

46+
testNonEmptyVolumePath := fmt.Sprintf("%s/%s/volumes/%s/%s", dir, testPodUID, emptyDirVolumesDir, testNonEmptyVolumeName)
47+
if err := os.MkdirAll(testNonEmptyVolumePath, 0755); err != nil {
48+
t.Fatalf("Create test volume: %v", err)
49+
}
50+
if err := os.WriteFile(testNonEmptyVolumePath+"/file", []byte("hello"), 0644); err != nil {
51+
t.Fatalf("Create test volume: %v", err)
52+
}
53+
4654
for _, test := range []struct {
4755
name string
4856
spec *specs.Spec
@@ -144,6 +152,87 @@ func TestUpdateVolumeAnnotations(t *testing.T) {
144152
},
145153
expectUpdate: true,
146154
},
155+
{
156+
name: "non-empty volume for sandbox",
157+
spec: &specs.Spec{
158+
Annotations: map[string]string{
159+
sandboxLogDirAnnotation: testLogDirPath,
160+
ContainerTypeAnnotation: containerTypeSandbox,
161+
volumeKeyPrefix + testNonEmptyVolumeName + ".share": "pod",
162+
volumeKeyPrefix + testNonEmptyVolumeName + ".type": "tmpfs",
163+
volumeKeyPrefix + testNonEmptyVolumeName + ".options": "ro",
164+
},
165+
},
166+
expected: &specs.Spec{
167+
Annotations: map[string]string{
168+
sandboxLogDirAnnotation: testLogDirPath,
169+
ContainerTypeAnnotation: containerTypeSandbox,
170+
volumeKeyPrefix + testNonEmptyVolumeName + ".share": "shared",
171+
volumeKeyPrefix + testNonEmptyVolumeName + ".type": "bind",
172+
volumeKeyPrefix + testNonEmptyVolumeName + ".options": "ro",
173+
volumeKeyPrefix + testNonEmptyVolumeName + ".source": testNonEmptyVolumePath,
174+
},
175+
},
176+
expectUpdate: true,
177+
},
178+
{
179+
name: "non-empty volume for container",
180+
spec: &specs.Spec{
181+
Mounts: []specs.Mount{
182+
{
183+
Destination: "/test",
184+
Type: "bind",
185+
Source: testNonEmptyVolumePath,
186+
Options: []string{"ro"},
187+
},
188+
},
189+
Annotations: map[string]string{
190+
ContainerTypeAnnotation: ContainerTypeContainer,
191+
volumeKeyPrefix + testNonEmptyVolumeName + ".share": "pod",
192+
volumeKeyPrefix + testNonEmptyVolumeName + ".type": "tmpfs",
193+
volumeKeyPrefix + testNonEmptyVolumeName + ".options": "ro",
194+
},
195+
},
196+
expected: &specs.Spec{
197+
Mounts: []specs.Mount{
198+
{
199+
Destination: "/test",
200+
Type: "bind",
201+
Source: testNonEmptyVolumePath,
202+
Options: []string{"ro"},
203+
},
204+
},
205+
Annotations: map[string]string{
206+
ContainerTypeAnnotation: ContainerTypeContainer,
207+
volumeKeyPrefix + testNonEmptyVolumeName + ".share": "pod",
208+
volumeKeyPrefix + testNonEmptyVolumeName + ".type": "tmpfs",
209+
volumeKeyPrefix + testNonEmptyVolumeName + ".options": "ro",
210+
},
211+
},
212+
},
213+
{
214+
name: "bind: volume annotations for sandbox",
215+
spec: &specs.Spec{
216+
Annotations: map[string]string{
217+
sandboxLogDirAnnotation: testLogDirPath,
218+
ContainerTypeAnnotation: containerTypeSandbox,
219+
volumeKeyPrefix + testVolumeName + ".share": "container",
220+
volumeKeyPrefix + testVolumeName + ".type": "bind",
221+
volumeKeyPrefix + testVolumeName + ".options": "ro",
222+
},
223+
},
224+
expected: &specs.Spec{
225+
Annotations: map[string]string{
226+
sandboxLogDirAnnotation: testLogDirPath,
227+
ContainerTypeAnnotation: containerTypeSandbox,
228+
volumeKeyPrefix + testVolumeName + ".share": "container",
229+
volumeKeyPrefix + testVolumeName + ".type": "tmpfs",
230+
volumeKeyPrefix + testVolumeName + ".options": "ro",
231+
volumeKeyPrefix + testVolumeName + ".source": testVolumePath,
232+
},
233+
},
234+
expectUpdate: true,
235+
},
147236
{
148237
name: "bind: volume annotations for container",
149238
spec: &specs.Spec{

runsc/boot/mount_hints.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,12 @@ func (m *MountHint) ShouldShareMount() bool {
185185
(m.Share == container || m.Share == pod)
186186
}
187187

188+
// IsSandboxLocal returns true if this mount is only used by the sandbox and
189+
// has no external observers.
190+
func (m *MountHint) IsSandboxLocal() bool {
191+
return m.Share == container || m.Share == pod
192+
}
193+
188194
// checkCompatible verifies that shared mount is compatible with master.
189195
// Master options must be the same or less restrictive than the container mount,
190196
// e.g. master can be 'rw' while container mounts as 'ro'.

runsc/container/container.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -965,7 +965,7 @@ func (c *Container) initGoferConfs(ovlConf config.Overlay2, mountHints *boot.Pod
965965
if specutils.IsReadonlyMount(c.Spec.Mounts[i].Options) {
966966
overlayMedium = config.NoOverlay
967967
}
968-
if hint := mountHints.FindMount(c.Spec.Mounts[i].Source); hint != nil {
968+
if hint := mountHints.FindMount(c.Spec.Mounts[i].Source); hint != nil && hint.IsSandboxLocal() {
969969
// Note that we want overlayMedium=self even if this is a read-only mount so that
970970
// the shared mount is created correctly. Future containers may mount this writably.
971971
overlayMedium = config.SelfOverlay

0 commit comments

Comments
 (0)