Skip to content

Commit 9772b9c

Browse files
committed
Allow writable /var and /etc directories
Binaries in the uVM can require writing to (or be configured by) files in `/var` (or `/etc`, respectively). An LCOW uVM VHD-backed rootfs, however, is read-only (as opposed to WCOW, which creates a new snapshot, and therefore a scratch VHD, per uVM). Remedy this by creating `overlay` mounts for the two directories, enabled by the `/init` flag `-w`. Use `overlay` instead of creating a `tmpfs` mount directly over the directories (which is done for `/run` and `/tmp`) to preserve existing content in the rootfs. Add a new `WritableOverlayDirs` annotation to enable the feature. Make the annotation internal (unpublished) since: - it is implementation dependent (i.e., how LCOW uVMs are run could conceivably change in the future); and - the feature is aimed at more advanced use cases where users are modifying the LCOW uVM's rootfs and should be familiar with `hcsshim`'s inner workings. Signed-off-by: Hamza El-Saawy <[email protected]>
1 parent 296144f commit 9772b9c

File tree

6 files changed

+306
-18
lines changed

6 files changed

+306
-18
lines changed

init/init.c

Lines changed: 67 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <net/if.h>
77
#include <netinet/ip.h>
88
#include <signal.h>
9+
#include <stdbool.h>
910
#include <stdio.h>
1011
#include <stdlib.h>
1112
#include <string.h>
@@ -138,6 +139,52 @@ const struct InitOp ops[] = {
138139
{OpMount, .mount = {"cgroup_root", "/sys/fs/cgroup", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, "mode=0755"}},
139140
};
140141

142+
/*
143+
rootfs VHDs are mounted as read-only, which can cause issues for binaries running in the
144+
uVM (e.g., syslogd, (GPU) drivers) that expect to be able to write to /etc/
145+
(e.g., syslogd is configured by /etc/syslog.conf) or /var/ (e.g., syslogd (typically) writes to /var/log/messages).
146+
147+
Make /var and /etc writable by creating an overlay with tmpfs-backed upper (and work) directories.
148+
149+
Use /run for overlay directories since that shouldn't be as volatile as /tmp.
150+
/run is already tmpfs backed, but create a new (smaller) tmpfs mount to prevent contention
151+
with container-specific files under /run/gcs/c/ (e.g., the container config file and overlay work directory).
152+
153+
Note: tmpfs is backed by virtual memory and can be swapped out, but the uVM is, itself, virtual memory
154+
backed on the host.
155+
Hence limiting the total size of tmpfs mounts will prevent the virtual machine's worker
156+
thread on the host from growing egregiously.
157+
158+
See:
159+
- https://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch03s07.html
160+
- https://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch05.html
161+
- https://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch05s10.html
162+
- https://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch03s15.html
163+
*/
164+
#define OVERLAY_PATH "/run/over"
165+
#define VAR_OVERLAY_PATH OVERLAY_PATH "/var"
166+
#define ETC_OVERLAY_PATH OVERLAY_PATH "/etc"
167+
168+
const struct InitOp overlay_ops[] = {
169+
// /run should already exist
170+
{OpMkdir, .mkdir = {OVERLAY_PATH, 0755}},
171+
{OpMount, .mount = {"tmpfs", OVERLAY_PATH, "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, "size=40\%,mode=0755"}},
172+
173+
// /etc
174+
{OpMkdir, .mkdir = {ETC_OVERLAY_PATH, 0755}},
175+
{OpMkdir, .mkdir = {(ETC_OVERLAY_PATH "/upper"), 0755}},
176+
{OpMkdir, .mkdir = {(ETC_OVERLAY_PATH "/work"), 0755}},
177+
{OpMount, .mount = {"overlay", "/etc", "overlay", MS_NODEV | MS_NOSUID | MS_NOEXEC,
178+
"lowerdir=/etc,upperdir=" ETC_OVERLAY_PATH "/upper,workdir=" ETC_OVERLAY_PATH "/work"}},
179+
180+
// /var
181+
{OpMkdir, .mkdir = {VAR_OVERLAY_PATH, 0755}},
182+
{OpMkdir, .mkdir = {VAR_OVERLAY_PATH "/upper", 0755}},
183+
{OpMkdir, .mkdir = {VAR_OVERLAY_PATH "/work", 0755}},
184+
{OpMount, .mount = {"overlay", "/var", "overlay", MS_NODEV | MS_NOSUID, // allow execs from the /var
185+
"lowerdir=/var,upperdir=" VAR_OVERLAY_PATH "/upper,workdir=" VAR_OVERLAY_PATH "/work"}},
186+
};
187+
141188
void warn(const char* msg) {
142189
int error = errno;
143190
perror(msg);
@@ -592,6 +639,8 @@ int debug_main(int argc, char** argv) {
592639
close(sockets[i]);
593640
}
594641
}
642+
643+
return 0;
595644
}
596645
#endif
597646

@@ -637,24 +686,27 @@ void start_services() {
637686

638687
int main(int argc, char** argv) {
639688
#ifdef DEBUG
640-
debug_main(argc, argv);
689+
if (debug_main(argc, argv) != 0) {
690+
dmesgWarn("failed to connect debug sockets");
691+
}
641692
printf("Running init\n");
642693
#endif
643694
char* debug_shell = NULL;
644695
int entropy_port = 0;
696+
bool overlay_mount = false;
645697
if (argc <= 1) {
646698
argv = (char**)default_argv;
647699
argc = sizeof(default_argv) / sizeof(default_argv[0]);
648700
optind = 0;
649701
debug_shell = (char*)default_shell;
650702
} else {
651-
for (int opt; (opt = getopt(argc, argv, "+d:e:")) >= 0;) {
703+
for (int opt; (opt = getopt(argc, argv, "+d:e:w")) >= 0;) {
652704
switch (opt) {
653-
case 'd':
705+
case 'd': // [d]ebug
654706
debug_shell = optarg;
655707
break;
656708

657-
case 'e':
709+
case 'e': // [e]ntropy port
658710
entropy_port = atoi(optarg);
659711
#ifdef DEBUG
660712
printf("entropy port %d\n", entropy_port);
@@ -666,6 +718,10 @@ int main(int argc, char** argv) {
666718

667719
break;
668720

721+
case 'w': // [w]ritable overlay mounts
722+
overlay_mount = true;
723+
break;
724+
669725
default:
670726
exit(1);
671727
}
@@ -702,6 +758,13 @@ int main(int argc, char** argv) {
702758
#endif
703759
init_fs(ops, sizeof(ops) / sizeof(ops[0]));
704760

761+
if (overlay_mount) {
762+
#ifdef DEBUG
763+
printf("init_fs for overlay mounts\n");
764+
#endif
765+
init_fs(overlay_ops, sizeof(overlay_ops) / sizeof(overlay_ops[0]));
766+
}
767+
705768
#ifdef DEBUG
706769
printf("init_cgroups\n");
707770
#endif

internal/annotations/annotations.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,11 @@ const (
3939
// ExtraVSockPorts adds additional ports to the list of ports that the UVM is allowed to use.
4040
ExtraVSockPorts = "io.microsoft.virtualmachine.lcow.extra-vsock-ports"
4141

42+
// WritableOverlayDirs creates writable overlay mounts for the /var and /etc directories.
43+
//
44+
// This is a no-op if the LCOW uVM rootfs is already writable (e.g., initramfs-backed initrd).
45+
WritableOverlayDirs = "io.microsoft.virtualmachine.lcow.writable-overlay-directories"
46+
4247
// NetworkingPolicyBasedRouting toggles on the ability to set policy based routing in the
4348
// guest for LCOW.
4449
//

internal/oci/uvm.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,7 @@ func specToUVMCreateOptionsCommon(ctx context.Context, opts *uvm.Options, s *spe
318318
opts.ProcessDumpLocation = ParseAnnotationsString(s.Annotations, annotations.ContainerProcessDumpLocation, opts.ProcessDumpLocation)
319319
opts.NoWritableFileShares = ParseAnnotationsBool(ctx, s.Annotations, annotations.DisableWritableFileShares, opts.NoWritableFileShares)
320320
opts.DumpDirectoryPath = ParseAnnotationsString(s.Annotations, annotations.DumpDirectoryPath, opts.DumpDirectoryPath)
321+
opts.ConsolePipe = ParseAnnotationsString(s.Annotations, iannotations.UVMConsolePipe, opts.ConsolePipe)
321322

322323
// NUMA settings
323324
opts.MaxProcessorsPerNumaNode = ParseAnnotationsUint32(ctx, s.Annotations, annotations.NumaMaximumProcessorsPerNode, opts.MaxProcessorsPerNumaNode)
@@ -330,7 +331,6 @@ func specToUVMCreateOptionsCommon(ctx context.Context, opts *uvm.Options, s *spe
330331
opts.NumaProcessorCounts)
331332
opts.NumaMemoryBlocksCounts = ParseAnnotationCommaSeparatedUint64(ctx, s.Annotations, annotations.NumaCountOfMemoryBlocks,
332333
opts.NumaMemoryBlocksCounts)
333-
opts.ConsolePipe = ParseAnnotationsString(s.Annotations, iannotations.UVMConsolePipe, opts.ConsolePipe)
334334

335335
maps.Copy(opts.AdditionalHyperVConfig, parseHVSocketServiceTable(ctx, s.Annotations))
336336

@@ -377,6 +377,7 @@ func SpecToUVMCreateOpts(ctx context.Context, s *specs.Spec, id, owner string) (
377377
lopts.UVMReferenceInfoFile = ParseAnnotationsString(s.Annotations, annotations.LCOWReferenceInfoFile, lopts.UVMReferenceInfoFile)
378378
lopts.KernelBootOptions = ParseAnnotationsString(s.Annotations, annotations.KernelBootOptions, lopts.KernelBootOptions)
379379
lopts.DisableTimeSyncService = ParseAnnotationsBool(ctx, s.Annotations, annotations.DisableLCOWTimeSyncService, lopts.DisableTimeSyncService)
380+
lopts.WritableOverlayDirs = ParseAnnotationsBool(ctx, s.Annotations, iannotations.WritableOverlayDirs, lopts.WritableOverlayDirs)
380381
handleAnnotationPreferredRootFSType(ctx, s.Annotations, lopts)
381382
handleAnnotationKernelDirectBoot(ctx, s.Annotations, lopts)
382383
handleAnnotationFullyPhysicallyBacked(ctx, s.Annotations, lopts)

internal/uvm/create_lcow.go

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ type OptionsLCOW struct {
132132
ExtraVSockPorts []uint32 // Extra vsock ports to allow
133133
AssignedDevices []VPCIDeviceID // AssignedDevices are devices to add on pod boot
134134
PolicyBasedRouting bool // Whether we should use policy based routing when configuring net interfaces in guest
135+
WritableOverlayDirs bool // Whether init should create writable overlay mounts for /var and /etc
135136
}
136137

137138
// defaultLCOWOSBootFilesPath returns the default path used to locate the LCOW
@@ -579,7 +580,9 @@ Example JSON document produced once the hcsschema.ComputeSytem returned by makeL
579580

580581
// Make the ComputeSystem document object that will be serialized to json to be presented to the HCS api.
581582
func makeLCOWDoc(ctx context.Context, opts *OptionsLCOW, uvm *UtilityVM) (_ *hcsschema.ComputeSystem, err error) {
582-
logrus.Tracef("makeLCOWDoc %v\n", opts)
583+
if logrus.IsLevelEnabled(logrus.TraceLevel) {
584+
log.G(ctx).WithField("options", log.Format(ctx, opts)).Trace("makeLCOWDoc")
585+
}
583586

584587
kernelFullPath := filepath.Join(opts.BootFilesPath, opts.KernelFile)
585588
if _, err := os.Stat(kernelFullPath); os.IsNotExist(err) {
@@ -868,10 +871,20 @@ func makeLCOWDoc(ctx context.Context, opts *OptionsLCOW, uvm *UtilityVM) (_ *hcs
868871
execCmdArgs += " -core-dump-location " + opts.ProcessDumpLocation
869872
}
870873

871-
initArgs := fmt.Sprintf("%s %s", entropyArgs, execCmdArgs)
874+
initArgs := entropyArgs
875+
if opts.WritableOverlayDirs {
876+
switch opts.PreferredRootFSType {
877+
case PreferredRootFSTypeInitRd:
878+
log.G(ctx).Warn("ignoring `WritableOverlayDirs` option since rootfs is already writable")
879+
case PreferredRootFSTypeVHD:
880+
initArgs += " -w"
881+
}
882+
}
872883
if vmDebugging {
873884
// Launch a shell on the console.
874-
initArgs = entropyArgs + ` sh -c "` + execCmdArgs + ` & exec sh"`
885+
initArgs += ` sh -c "` + execCmdArgs + ` & exec sh"`
886+
} else {
887+
initArgs += " " + execCmdArgs
875888
}
876889

877890
kernelArgs += fmt.Sprintf(" nr_cpus=%d", opts.ProcessorCount)
@@ -915,7 +928,9 @@ func CreateLCOW(ctx context.Context, opts *OptionsLCOW) (_ *UtilityVM, err error
915928
}
916929

917930
span.AddAttributes(trace.StringAttribute(logfields.UVMID, opts.ID))
918-
log.G(ctx).WithField("options", log.Format(ctx, opts)).Debug("uvm::CreateLCOW options")
931+
if logrus.IsLevelEnabled(logrus.DebugLevel) {
932+
log.G(ctx).WithField("options", log.Format(ctx, opts)).Debug("uvm::CreateLCOW options")
933+
}
919934

920935
// We don't serialize OutputHandlerCreator so if it is missing we need to put it back to the default.
921936
if opts.OutputHandlerCreator == nil {
@@ -960,10 +975,20 @@ func CreateLCOW(ctx context.Context, opts *OptionsLCOW) (_ *UtilityVM, err error
960975
var doc *hcsschema.ComputeSystem
961976
if opts.SecurityPolicyEnabled {
962977
doc, err = makeLCOWSecurityDoc(ctx, opts, uvm)
963-
log.G(ctx).Tracef("create_lcow::CreateLCOW makeLCOWSecurityDoc result doc: %v err %v", doc, err)
978+
if logrus.IsLevelEnabled(logrus.TraceLevel) {
979+
log.G(ctx).WithFields(logrus.Fields{
980+
"doc": log.Format(ctx, doc),
981+
logrus.ErrorKey: err,
982+
}).Trace("create_lcow::CreateLCOW makeLCOWSecurityDoc result")
983+
}
964984
} else {
965985
doc, err = makeLCOWDoc(ctx, opts, uvm)
966-
log.G(ctx).Tracef("create_lcow::CreateLCOW makeLCOWDoc result doc: %v err %v", doc, err)
986+
if logrus.IsLevelEnabled(logrus.TraceLevel) {
987+
log.G(ctx).WithFields(logrus.Fields{
988+
"doc": log.Format(ctx, doc),
989+
logrus.ErrorKey: err,
990+
}).Trace("create_lcow::CreateLCOW makeLCOWDoc result")
991+
}
967992
}
968993
if err != nil {
969994
return nil, err
@@ -972,7 +997,9 @@ func CreateLCOW(ctx context.Context, opts *OptionsLCOW) (_ *UtilityVM, err error
972997
if err = uvm.create(ctx, doc); err != nil {
973998
return nil, fmt.Errorf("error while creating the compute system: %w", err)
974999
}
975-
log.G(ctx).WithField("uvm", uvm).Trace("create_lcow::CreateLCOW uvm.create result")
1000+
if logrus.IsLevelEnabled(logrus.TraceLevel) {
1001+
log.G(ctx).WithField("uvm", log.Format(ctx, uvm)).Trace("create_lcow::CreateLCOW uvm.create result")
1002+
}
9761003

9771004
// Create a socket to inject entropy during boot.
9781005
uvm.entropyListener, err = uvm.listenVsock(entropyVsockPort)

0 commit comments

Comments
 (0)