
Commit b0a9d5f

Abhishek Singh (Manifold) authored and helsaawy committed
multi pod changes for GCS
Introduce cgroup changes and per-pod mount changes to support multiple pods.

(cherry picked from commit 7170f3fae8d26fef6a975cdff8300f9ca67691d1)
Signed-off-by: Hamza El-Saawy <[email protected]>
1 parent d4c4622 commit b0a9d5f
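
The description above is terse, so here is an illustrative sketch (not code from this commit) of the idea the diffs below implement: a sandbox spec carrying a virtual-pod ID annotation is steered into a shared /containers/virtual-pods/{virtualSandboxID} cgroup instead of the per-container /containers/{id} cgroup, and its sandbox state directories are likewise nested per pod. The annotation key string below is a placeholder; the commit itself uses the annotations.VirtualPodID constant from hcsshim.

package main

import (
	"fmt"

	oci "github.com/opencontainers/runtime-spec/specs-go"
)

// virtualPodIDAnnotation is a stand-in for hcsshim's annotations.VirtualPodID;
// the real key string is defined in the hcsshim annotations package.
const virtualPodIDAnnotation = "example.placeholder/virtual-pod-id"

// cgroupPathFor mirrors the selection logic added to sandbox_container.go below.
func cgroupPathFor(spec *oci.Spec, containerID string) string {
	if vpID := spec.Annotations[virtualPodIDAnnotation]; vpID != "" {
		return "/containers/virtual-pods/" + vpID
	}
	return "/containers/" + containerID
}

func main() {
	spec := &oci.Spec{Annotations: map[string]string{virtualPodIDAnnotation: "pod-a"}}
	fmt.Println(cgroupPathFor(spec, "sandbox-1")) // /containers/virtual-pods/pod-a
	fmt.Println(cgroupPathFor(&oci.Spec{}, "sandbox-2")) // /containers/sandbox-2
}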

File tree

7 files changed: +617 -119 lines changed

cmd/gcs/main.go

Lines changed: 63 additions & 34 deletions
@@ -10,6 +10,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"strings"
 	"syscall"
 	"time"

@@ -67,7 +68,12 @@ func readMemoryEvents(startTime time.Time, efdFile *os.File, cgName string, thre
 		}

 		count++
-		msg := "memory usage for cgroup exceeded threshold"
+		var msg string
+		if strings.HasPrefix(cgName, "/virtual-pods") {
+			msg = "memory usage for virtual pods cgroup exceeded threshold"
+		} else {
+			msg = "memory usage for cgroup exceeded threshold"
+		}
 		entry := logrus.WithFields(logrus.Fields{
 			"gcsStartTime": startTime,
 			"time": time.Now(),
@@ -294,40 +300,9 @@ func main() {
 	// Continuously log /dev/kmsg
 	go kmsg.ReadForever(kmsg.LogLevel(*kmsgLogLevel))

-	tport := &transport.VsockTransport{}
-	rtime, err := runc.NewRuntime(baseLogPath)
-	if err != nil {
-		logrus.WithError(err).Fatal("failed to initialize new runc runtime")
-	}
-	mux := bridge.NewBridgeMux()
-	b := bridge.Bridge{
-		Handler: mux,
-		EnableV4: *v4,
-	}
-	h := hcsv2.NewHost(rtime, tport, initialEnforcer, logWriter)
-	b.AssignHandlers(mux, h)
-
-	var bridgeIn io.ReadCloser
-	var bridgeOut io.WriteCloser
-	if *useInOutErr {
-		bridgeIn = os.Stdin
-		bridgeOut = os.Stdout
-	} else {
-		const commandPort uint32 = 0x40000000
-		bridgeCon, err := tport.Dial(commandPort)
-		if err != nil {
-			logrus.WithFields(logrus.Fields{
-				"port": commandPort,
-				logrus.ErrorKey: err,
-			}).Fatal("failed to dial host vsock connection")
-		}
-		bridgeIn = bridgeCon
-		bridgeOut = bridgeCon
-	}
-
 	// Setup the UVM cgroups to protect against a workload taking all available
-	// memory and causing the GCS to malfunction we create two cgroups: gcs,
-	// containers.
+	// memory and causing the GCS to malfunction we create cgroups: gcs,
+	// containers, and virtual-pods for multi-pod support.
 	//

 	// Write 1 to memory.use_hierarchy on the root cgroup to enable hierarchy
@@ -357,6 +332,18 @@ func main() {
 	}
 	defer containersControl.Delete() //nolint:errcheck

+	// Create virtual-pods cgroup hierarchy for multi-pod support
+	// This will be the parent for all virtual pod cgroups: /containers/virtual-pods/{virtualSandboxID}
+	virtualPodsControl, err := cgroups.New(cgroups.StaticPath("/containers/virtual-pods"), &oci.LinuxResources{
+		Memory: &oci.LinuxMemory{
+			Limit: &containersLimit, // Share the same limit as containers
+		},
+	})
+	if err != nil {
+		logrus.WithError(err).Fatal("failed to create containers/virtual-pods cgroup")
+	}
+	defer virtualPodsControl.Delete() //nolint:errcheck
+
 	gcsControl, err := cgroups.New(cgroups.StaticPath("/gcs"), &oci.LinuxResources{})
 	if err != nil {
 		logrus.WithError(err).Fatal("failed to create gcs cgroup")
@@ -366,6 +353,39 @@ func main() {
 		logrus.WithError(err).Fatal("failed add gcs pid to gcs cgroup")
 	}

+	tport := &transport.VsockTransport{}
+	rtime, err := runc.NewRuntime(baseLogPath)
+	if err != nil {
+		logrus.WithError(err).Fatal("failed to initialize new runc runtime")
+	}
+	mux := bridge.NewBridgeMux()
+	b := bridge.Bridge{
+		Handler: mux,
+		EnableV4: *v4,
+	}
+	h := hcsv2.NewHost(rtime, tport, initialEnforcer, logWriter)
+	// Initialize virtual pod support in the host
+	h.InitializeVirtualPodSupport(virtualPodsControl)
+	b.AssignHandlers(mux, h)
+
+	var bridgeIn io.ReadCloser
+	var bridgeOut io.WriteCloser
+	if *useInOutErr {
+		bridgeIn = os.Stdin
+		bridgeOut = os.Stdout
+	} else {
+		const commandPort uint32 = 0x40000000
+		bridgeCon, err := tport.Dial(commandPort)
+		if err != nil {
+			logrus.WithFields(logrus.Fields{
+				"port": commandPort,
+				logrus.ErrorKey: err,
+			}).Fatal("failed to dial host vsock connection")
+		}
+		bridgeIn = bridgeCon
+		bridgeOut = bridgeCon
+	}
+
 	event := cgroups.MemoryThresholdEvent(*gcsMemLimitBytes, false)
 	gefd, err := gcsControl.RegisterMemoryEvent(event)
 	if err != nil {
@@ -381,6 +401,14 @@ func main() {
 	oomFile := os.NewFile(oom, "cefd")
 	defer oomFile.Close()

+	// Setup OOM monitoring for virtual-pods cgroup
+	virtualPodsOom, err := virtualPodsControl.OOMEventFD()
+	if err != nil {
+		logrus.WithError(err).Fatal("failed to retrieve the virtual-pods cgroups oom eventfd")
+	}
+	virtualPodsOomFile := os.NewFile(virtualPodsOom, "vp-oomfd")
+	defer virtualPodsOomFile.Close()
+
 	// time synchronization service
 	if !(*disableTimeSync) {
 		if err = startTimeSyncService(); err != nil {
@@ -390,6 +418,7 @@ func main() {

 	go readMemoryEvents(startTime, gefdFile, "/gcs", int64(*gcsMemLimitBytes), gcsControl)
 	go readMemoryEvents(startTime, oomFile, "/containers", containersLimit, containersControl)
+	go readMemoryEvents(startTime, virtualPodsOomFile, "/containers/virtual-pods", containersLimit, virtualPodsControl)
 	err = b.ListenAndServe(bridgeIn, bridgeOut)
 	if err != nil {
 		logrus.WithFields(logrus.Fields{
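
main.go now hands the virtual-pods parent cgroup to the bridge host via h.InitializeVirtualPodSupport(virtualPodsControl). The hcsv2 side of that call lives in one of the four changed files not shown on this page; below is a minimal hypothetical sketch of what it could look like, assuming the Host only needs to keep a handle on the parent cgroup so per-pod cgroups can later be created beneath it (the field name is a guess, not the commit's actual implementation).

package hcsv2

import "github.com/containerd/cgroups"

// Host fields other than the new one are elided; this is a hypothetical
// sketch, not the commit's actual implementation.
type Host struct {
	// ... existing fields ...
	virtualPodsCgroup cgroups.Cgroup // parent of /containers/virtual-pods/{virtualSandboxID}
}

// InitializeVirtualPodSupport records the parent cgroup created in main().
func (h *Host) InitializeVirtualPodSupport(vp cgroups.Cgroup) {
	h.virtualPodsCgroup = vp
}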

internal/guest/runtime/hcsv2/container.go

Lines changed: 13 additions & 4 deletions
@@ -30,6 +30,7 @@ import (
 	"github.com/Microsoft/hcsshim/internal/oc"
 	"github.com/Microsoft/hcsshim/internal/protocol/guestrequest"
 	"github.com/Microsoft/hcsshim/internal/protocol/guestresource"
+	"github.com/Microsoft/hcsshim/pkg/annotations"
 )

 // containerStatus has been introduced to enable parallel container creation
@@ -193,13 +194,21 @@ func (c *Container) Delete(ctx context.Context) error {
 	entity := log.G(ctx).WithField(logfields.ContainerID, c.id)
 	entity.Info("opengcs::Container::Delete")
 	if c.isSandbox {
-		// remove user mounts in sandbox container
-		if err := storage.UnmountAllInPath(ctx, specGuest.SandboxMountsDir(c.id), true); err != nil {
+		// Check if this is a virtual pod
+		virtualSandboxID := ""
+		if c.spec != nil && c.spec.Annotations != nil {
+			virtualSandboxID = c.spec.Annotations[annotations.VirtualPodID]
+		}
+
+		// remove user mounts in sandbox container - use virtual pod aware paths
+		mountsDir := specGuest.VirtualPodAwareSandboxMountsDir(c.id, virtualSandboxID)
+		if err := storage.UnmountAllInPath(ctx, mountsDir, true); err != nil {
 			entity.WithError(err).Error("failed to unmount sandbox mounts")
 		}

-		// remove hugepages mounts in sandbox container
-		if err := storage.UnmountAllInPath(ctx, specGuest.HugePagesMountsDir(c.id), true); err != nil {
+		// remove hugepages mounts in sandbox container - use virtual pod aware paths
+		hugePagesDir := specGuest.VirtualPodAwareHugePagesMountsDir(c.id, virtualSandboxID)
+		if err := storage.UnmountAllInPath(ctx, hugePagesDir, true); err != nil {
 			entity.WithError(err).Error("failed to unmount hugepages mounts")
 		}
 	}
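
Container.Delete above resolves its cleanup paths through specGuest.VirtualPodAwareSandboxMountsDir and specGuest.VirtualPodAwareHugePagesMountsDir, which live in the guest spec package, another of the changed files not shown here. A plausible shape for these helpers is sketched below, assuming they fall back to the legacy per-sandbox layout when no virtual pod ID is set and otherwise nest the container under its virtual pod directory; the base path and directory names are assumptions, not the commit's actual values.

package spec

import "path/filepath"

// guestRootDir is an assumed base path for per-container state in the UVM.
const guestRootDir = "/run/gcs/c"

// VirtualPodAwareSandboxRootDir is a hypothetical sketch of the helper used
// by the commit; the real implementation is not shown on this page.
func VirtualPodAwareSandboxRootDir(id, virtualSandboxID string) string {
	if virtualSandboxID == "" {
		return filepath.Join(guestRootDir, id) // legacy single-pod layout
	}
	// per-pod layout: nest the container under its virtual pod directory
	return filepath.Join(guestRootDir, virtualSandboxID, id)
}

func VirtualPodAwareSandboxMountsDir(id, virtualSandboxID string) string {
	return filepath.Join(VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "sandboxMounts")
}

func VirtualPodAwareHugePagesMountsDir(id, virtualSandboxID string) string {
	return filepath.Join(VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "hugepages")
}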

internal/guest/runtime/hcsv2/sandbox_container.go

Lines changed: 60 additions & 22 deletions
@@ -15,6 +15,7 @@ import (

 	"github.com/Microsoft/hcsshim/internal/guest/network"
 	specGuest "github.com/Microsoft/hcsshim/internal/guest/spec"
+	"github.com/Microsoft/hcsshim/internal/log"
 	"github.com/Microsoft/hcsshim/internal/oc"
 	"github.com/Microsoft/hcsshim/pkg/annotations"
 )
@@ -23,22 +24,37 @@ func getSandboxHostnamePath(id string) string {
 	return filepath.Join(specGuest.SandboxRootDir(id), "hostname")
 }

+func getVirtualPodAwareSandboxHostnamePath(id, virtualSandboxID string) string {
+	return filepath.Join(specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "hostname")
+}
+
 func getSandboxHostsPath(id string) string {
 	return filepath.Join(specGuest.SandboxRootDir(id), "hosts")
 }

+func getVirtualPodAwareSandboxHostsPath(id, virtualSandboxID string) string {
+	return filepath.Join(specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "hosts")
+}
+
 func getSandboxResolvPath(id string) string {
 	return filepath.Join(specGuest.SandboxRootDir(id), "resolv.conf")
 }

+func getVirtualPodAwareSandboxResolvPath(id, virtualSandboxID string) string {
+	return filepath.Join(specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "resolv.conf")
+}
+
 func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) (err error) {
 	ctx, span := oc.StartSpan(ctx, "hcsv2::setupSandboxContainerSpec")
 	defer span.End()
 	defer func() { oc.SetSpanStatus(span, err) }()
 	span.AddAttributes(trace.StringAttribute("cid", id))

-	// Generate the sandbox root dir
-	rootDir := specGuest.SandboxRootDir(id)
+	// Check if this is a virtual pod to use appropriate root directory
+	virtualSandboxID := spec.Annotations[annotations.VirtualPodID]
+
+	// Generate the sandbox root dir - virtual pod aware
+	rootDir := specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID)
 	if err := os.MkdirAll(rootDir, 0755); err != nil {
 		return errors.Wrapf(err, "failed to create sandbox root directory %q", rootDir)
 	}
@@ -58,39 +74,55 @@ func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) (
 		}
 	}

-	sandboxHostnamePath := getSandboxHostnamePath(id)
+	sandboxHostnamePath := getVirtualPodAwareSandboxHostnamePath(id, virtualSandboxID)
 	if err := os.WriteFile(sandboxHostnamePath, []byte(hostname+"\n"), 0644); err != nil {
 		return errors.Wrapf(err, "failed to write hostname to %q", sandboxHostnamePath)
 	}

 	// Write the hosts
 	sandboxHostsContent := network.GenerateEtcHostsContent(ctx, hostname)
-	sandboxHostsPath := getSandboxHostsPath(id)
+	sandboxHostsPath := getVirtualPodAwareSandboxHostsPath(id, virtualSandboxID)
 	if err := os.WriteFile(sandboxHostsPath, []byte(sandboxHostsContent), 0644); err != nil {
 		return errors.Wrapf(err, "failed to write sandbox hosts to %q", sandboxHostsPath)
 	}

+	log.G(ctx).Debug("quick setup network namespace, cflick")
+	// Check if this is a virtual pod sandbox container by comparing container ID with virtual pod ID
+	isVirtualPodSandbox := virtualSandboxID != "" && id == virtualSandboxID
+	if strings.EqualFold(spec.Annotations[annotations.SkipPodNetworking], "true") || isVirtualPodSandbox {
+		ns := GetOrAddNetworkNamespace(specGuest.GetNetworkNamespaceID(spec))
+		err := ns.Sync(ctx)
+		if err != nil {
+			return err
+		}
+	}
 	// Write resolv.conf
+	log.G(ctx).Debug("sandbox resolv.conf, cflick")
 	ns, err := getNetworkNamespace(specGuest.GetNetworkNamespaceID(spec))
 	if err != nil {
-		return err
-	}
-	var searches, servers []string
-	for _, n := range ns.Adapters() {
-		if len(n.DNSSuffix) > 0 {
-			searches = network.MergeValues(searches, strings.Split(n.DNSSuffix, ","))
+		if !strings.EqualFold(spec.Annotations[annotations.SkipPodNetworking], "true") {
+			return err
 		}
-		if len(n.DNSServerList) > 0 {
-			servers = network.MergeValues(servers, strings.Split(n.DNSServerList, ","))
+		// Networking is skipped, do not error out
+		log.G(ctx).Infof("setupSandboxContainerSpec: Did not find NS spec %v, err %v", spec, err)
+	} else {
+		var searches, servers []string
+		for _, n := range ns.Adapters() {
+			if len(n.DNSSuffix) > 0 {
+				searches = network.MergeValues(searches, strings.Split(n.DNSSuffix, ","))
+			}
+			if len(n.DNSServerList) > 0 {
+				servers = network.MergeValues(servers, strings.Split(n.DNSServerList, ","))
+			}
+		}
+		resolvContent, err := network.GenerateResolvConfContent(ctx, searches, servers, nil)
+		if err != nil {
+			return errors.Wrap(err, "failed to generate sandbox resolv.conf content")
+		}
+		sandboxResolvPath := getVirtualPodAwareSandboxResolvPath(id, virtualSandboxID)
+		if err := os.WriteFile(sandboxResolvPath, []byte(resolvContent), 0644); err != nil {
+			return errors.Wrap(err, "failed to write sandbox resolv.conf")
 		}
-	}
-	resolvContent, err := network.GenerateResolvConfContent(ctx, searches, servers, nil)
-	if err != nil {
-		return errors.Wrap(err, "failed to generate sandbox resolv.conf content")
-	}
-	sandboxResolvPath := getSandboxResolvPath(id)
-	if err := os.WriteFile(sandboxResolvPath, []byte(resolvContent), 0644); err != nil {
-		return errors.Wrap(err, "failed to write sandbox resolv.conf")
 	}

 	// User.Username is generally only used on Windows, but as there's no (easy/fast at least) way to grab
@@ -113,8 +145,14 @@ func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) (
 	// also has a concept of a sandbox/shm file when the IPC NamespaceMode !=
 	// NODE.

-	// Force the parent cgroup into our /containers root
-	spec.Linux.CgroupsPath = "/containers/" + id
+	// Set cgroup path - check if this is a virtual pod
+	if virtualSandboxID != "" {
+		// Virtual pod sandbox gets its own cgroup under /containers/virtual-pods using the virtual pod ID
+		spec.Linux.CgroupsPath = "/containers/virtual-pods/" + virtualSandboxID
+	} else {
+		// Traditional sandbox goes under /containers
+		spec.Linux.CgroupsPath = "/containers/" + id
+	}

 	// Clear the windows section as we dont want to forward to runc
 	spec.Windows = nil

0 commit comments