Skip to content

Commit 23278c8

Browse files
committed
*: introduce image_pull_with_sync_fs in CRI
It's to ensure the data integrity during unexpected power failure. Background: Since release 1.3, in Linux system, containerD unpacks and writes files into overlayfs snapshot directly. It doesn’t involve any mount-umount operations so that the performance of pulling image has been improved. As we know, the umount syscall for overlayfs will force kernel to flush all the dirty pages into disk. Without umount syscall, the files’ data relies on kernel’s writeback threads or filesystem's commit setting (for instance, ext4 filesystem). The files in committed snapshot can be loss after unexpected power failure. However, the snapshot has been committed and the metadata also has been fsynced. There is data inconsistency between snapshot metadata and files in that snapshot. We, containerd, received several issues about data loss after unexpected power failure. * containerd#5854 * containerd#3369 (comment) Solution: * Option 1: SyncFs after unpack Linux platform provides [syncfs][syncfs] syscall to synchronize just the filesystem containing a given file. * Option 2: Fsync directories recursively and fsync on regular file The fsync doesn't support symlink/block device/char device files. We need to use fsync the parent directory to ensure that entry is persisted. However, based on [xfstest-dev][xfstest-dev], there is no case to ensure fsync-on-parent can persist the special file's metadata, for example, uid/gid, access mode. Checkout [generic/690][generic/690]: Syncing parent dir can persist symlink. But for f2fs, it needs special mount option. And it doesn't say that uid/gid can be persisted. All the details are behind the implemetation. > NOTE: All the related test cases has `_flakey_drop_and_remount` in [xfstest-dev]. Based on discussion about [Documenting the crash-recovery guarantees of Linux file systems][kernel-crash-recovery-data-integrity], we can't rely on Fsync-on-parent. * Option 1 is winner This patch is using option 1. There is test result based on [test-tool][test-tool]. All the networking traffic created by pull is local. * Image: docker.io/library/golang:1.19.4 (992 MiB) * Current: 5.446738579s * WIOS=21081, WBytes=1329741824, RIOS=79, RBytes=1197056 * Option 1: 6.239686088s * WIOS=34804, WBytes=1454845952, RIOS=79, RBytes=1197056 * Option 2: 1m30.510934813s * WIOS=42143, WBytes=1471397888, RIOS=82, RBytes=1209344 * Image: docker.io/tensorflow/tensorflow:latest (1.78 GiB, ~32590 Inodes) * Current: 8.852718042s * WIOS=39417, WBytes=2412818432, RIOS=2673, RBytes=335987712 * Option 1: 9.683387174s * WIOS=42767, WBytes=2431750144, RIOS=89, RBytes=1238016 * Option 2: 1m54.302103719s * WIOS=54403, WBytes=2460528640, RIOS=1709, RBytes=208237568 The Option 1 will increase `wios`. So, the `image_pull_with_sync_fs` is option in CRI plugin. [syncfs]: <https://man7.org/linux/man-pages/man2/syncfs.2.html> [xfstest-dev]: <https://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git> [generic/690]: <https://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git/tree/tests/generic/690?h=v2023.11.19> [kernel-crash-recovery-data-integrity]: <https://lore.kernel.org/linux-fsdevel/[email protected]/> [test-tool]: <https://github.com/fuweid/go-dmflakey/blob/a17fb2010db22654b3e54cf506b0dbb5ef7b33ca/contrib/syncfs/containerd/main_test.go#L51> Signed-off-by: Wei Fu <[email protected]>
1 parent bd5c602 commit 23278c8

File tree

12 files changed

+62
-4
lines changed

12 files changed

+62
-4
lines changed

client/image.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,14 @@ func WithUnpackDuplicationSuppressor(suppressor kmutex.KeyedLocker) UnpackOpt {
279279
}
280280
}
281281

282+
// WithUnpackApplyOpts appends new apply options on the UnpackConfig.
283+
func WithUnpackApplyOpts(opts ...diff.ApplyOpt) UnpackOpt {
284+
return func(ctx context.Context, uc *UnpackConfig) error {
285+
uc.ApplyOpts = append(uc.ApplyOpts, opts...)
286+
return nil
287+
}
288+
}
289+
282290
func (i *image) Unpack(ctx context.Context, snapshotterName string, opts ...UnpackOpt) error {
283291
ctx, done, err := i.client.WithLease(ctx)
284292
if err != nil {

contrib/diffservice/service.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ func (s *service) Apply(ctx context.Context, er *diffapi.ApplyRequest) (*diffapi
6060
}
6161
opts = append(opts, diff.WithPayloads(payloads))
6262
}
63+
opts = append(opts, diff.WithSyncFs(er.SyncFs))
6364

6465
ocidesc, err = s.applier.Apply(ctx, desc, mounts, opts...)
6566
if err != nil {

diff/apply/apply.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ func (s *fsApplier) Apply(ctx context.Context, desc ocispec.Descriptor, mounts [
9292
r: io.TeeReader(processor, digester.Hash()),
9393
}
9494

95-
if err := apply(ctx, mounts, rc); err != nil {
95+
if err := apply(ctx, mounts, rc, config.SyncFs); err != nil {
9696
return emptyDesc, err
9797
}
9898

diff/apply/apply_darwin.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ import (
2525
"github.com/containerd/containerd/v2/mount"
2626
)
2727

28-
func apply(ctx context.Context, mounts []mount.Mount, r io.Reader) error {
28+
func apply(ctx context.Context, mounts []mount.Mount, r io.Reader, _sync bool) error {
2929
// We currently do not support mounts nor bind mounts on MacOS in the containerd daemon.
3030
// Using this as an exception to enable native snapshotter and allow further research.
3131
if len(mounts) == 1 && mounts[0].Type == "bind" {
@@ -38,6 +38,8 @@ func apply(ctx context.Context, mounts []mount.Mount, r io.Reader) error {
3838
path := mounts[0].Source
3939
_, err := archive.Apply(ctx, path, r, opts...)
4040
return err
41+
42+
// TODO: Do we need to sync all the filesystems?
4143
}
4244

4345
return mount.WithTempMount(ctx, mounts, func(root string) error {

diff/apply/apply_linux.go

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,18 @@ import (
2020
"context"
2121
"fmt"
2222
"io"
23+
"os"
2324
"strings"
2425

2526
"github.com/containerd/containerd/v2/archive"
2627
"github.com/containerd/containerd/v2/errdefs"
2728
"github.com/containerd/containerd/v2/mount"
2829
"github.com/containerd/containerd/v2/pkg/userns"
30+
31+
"golang.org/x/sys/unix"
2932
)
3033

31-
func apply(ctx context.Context, mounts []mount.Mount, r io.Reader) error {
34+
func apply(ctx context.Context, mounts []mount.Mount, r io.Reader, sync bool) (retErr error) {
3235
switch {
3336
case len(mounts) == 1 && mounts[0].Type == "overlay":
3437
// OverlayConvertWhiteout (mknod c 0 0) doesn't work in userns.
@@ -50,7 +53,18 @@ func apply(ctx context.Context, mounts []mount.Mount, r io.Reader) error {
5053
opts = append(opts, archive.WithParents(parents))
5154
}
5255
_, err = archive.Apply(ctx, path, r, opts...)
56+
if err == nil && sync {
57+
err = doSyncFs(path)
58+
}
5359
return err
60+
case sync && len(mounts) == 1 && mounts[0].Type == "bind":
61+
defer func() {
62+
if retErr != nil {
63+
return
64+
}
65+
66+
retErr = doSyncFs(mounts[0].Source)
67+
}()
5468
}
5569
return mount.WithTempMount(ctx, mounts, func(root string) error {
5670
_, err := archive.Apply(ctx, root, r)
@@ -75,3 +89,17 @@ func getOverlayPath(options []string) (upper string, lower []string, err error)
7589

7690
return
7791
}
92+
93+
func doSyncFs(file string) error {
94+
fd, err := os.Open(file)
95+
if err != nil {
96+
return fmt.Errorf("failed to open %s: %w", file, err)
97+
}
98+
defer fd.Close()
99+
100+
_, _, errno := unix.Syscall(unix.SYS_SYNCFS, fd.Fd(), 0, 0)
101+
if errno != 0 {
102+
return fmt.Errorf("failed to syncfs for %s: %w", file, errno)
103+
}
104+
return nil
105+
}

diff/apply/apply_other.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ import (
2626
"github.com/containerd/containerd/v2/mount"
2727
)
2828

29-
func apply(ctx context.Context, mounts []mount.Mount, r io.Reader) error {
29+
func apply(ctx context.Context, mounts []mount.Mount, r io.Reader, _sync bool) error {
30+
// TODO: for windows, how to sync?
3031
return mount.WithTempMount(ctx, mounts, func(root string) error {
3132
_, err := archive.Apply(ctx, root, r)
3233
return err

diff/diff.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ type Comparer interface {
6767
type ApplyConfig struct {
6868
// ProcessorPayloads specifies the payload sent to various processors
6969
ProcessorPayloads map[string]typeurl.Any
70+
// SyncFs is to synchronize the underlying filesystem containing files
71+
SyncFs bool
7072
}
7173

7274
// ApplyOpt is used to configure an Apply operation
@@ -125,6 +127,14 @@ func WithPayloads(payloads map[string]typeurl.Any) ApplyOpt {
125127
}
126128
}
127129

130+
// WithSyncFs sets sync flag to the config.
131+
func WithSyncFs(sync bool) ApplyOpt {
132+
return func(_ context.Context, _ ocispec.Descriptor, c *ApplyConfig) error {
133+
c.SyncFs = sync
134+
return nil
135+
}
136+
}
137+
128138
// WithSourceDateEpoch specifies the timestamp used to provide control for reproducibility.
129139
// See also https://reproducible-builds.org/docs/source-date-epoch/ .
130140
//

diff/proxy/differ.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ func (r *diffRemote) Apply(ctx context.Context, desc ocispec.Descriptor, mounts
6161
Diff: oci.DescriptorToProto(desc),
6262
Mounts: mount.ToProto(mounts),
6363
Payloads: payloads,
64+
SyncFs: config.SyncFs,
6465
}
6566
resp, err := r.client.Apply(ctx, req)
6667
if err != nil {

pkg/cri/config/config.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,9 @@ type PluginConfig struct {
357357
//
358358
// For example, the value can be '5h', '2h30m', '10s'.
359359
DrainExecSyncIOTimeout string `toml:"drain_exec_sync_io_timeout" json:"drainExecSyncIOTimeout"`
360+
// ImagePullWithSyncFs is an experimental setting. It's to force sync
361+
// filesystem during unpacking to ensure that data integrity.
362+
ImagePullWithSyncFs bool `toml:"image_pull_with_sync_fs" json:"imagePullWithSyncFs"`
360363
}
361364

362365
// X509KeyPairStreaming contains the x509 configuration for streaming

pkg/cri/config/config_unix.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ func DefaultConfig() PluginConfig {
100100
CDISpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
101101
ImagePullProgressTimeout: defaultImagePullProgressTimeoutDuration.String(),
102102
DrainExecSyncIOTimeout: "0s",
103+
ImagePullWithSyncFs: false,
103104
EnableUnprivilegedPorts: true,
104105
EnableUnprivilegedICMP: true,
105106
}

0 commit comments

Comments
 (0)