Skip to content

Commit 22f9510

Browse files
committed
[ms/release/0.1]Backport annotation, vNUMA, and GPU device changes (#2493)
* Organize annotations; change annotation expansions. (#2449) Break out `pkg\annotations\annotations.go` constants into sections for easier searching and readability. Deprecate `AnnotationExpansions` and instead provide `AnnotationExpansionMap()`, which returns the same value, but provides a new copy every call, so the `map` cannot be modified. Cannot delete it or change its type, since it is technically public. Signed-off-by: Hamza El-Saawy <[email protected]> (cherry picked from commit ffcf48b) Signed-off-by: Hamza El-Saawy <[email protected]> * Warn on incomplete vNUMA setting, clarify field names (#2466) Warn if vNUMA is not completely specified in uVM creation options, as this is likely a user error. Rename `"uvm".Opts.MaxSizePerNode` to `MaxMemorySizePerNumaNode` and clarify that it is measured in MiB. Similarly, rename `"annotations".NumaMaximumSizePerNode` to `NumaMaximumMemorySizePerNode`. Format `prepareVNumaTopology` doc comment to display appropriately. Related: switch to using `"logrus".IsLevelEnabled` rather than explicit logging level comparison, and fix bug where `--debug` flag was not added to runc if logging level is greater than `Debug` (i.e., `Trace`). Signed-off-by: Hamza El-Saawy <[email protected]> (cherry picked from commit 0842153) Signed-off-by: Hamza El-Saawy <[email protected]> * Fix CUDA for non-privileged containers (#2492) CUDA initialization for GPUs fails for non-privileged containers. Experimenting shows that adding `rw` for all character devices fixes the error, so expand the [default `c *:* m` permissions](https://github.com/opencontainers/runc/blob/6bae6cad4759a5b3537d550f43ea37d51c6b518a/libcontainer/specconv/spec_linux.go#L205-L222) to `c *:* rwm`. Add `"gpu"` string constant and streamline device assignment logic. Signed-off-by: Hamza El-Saawy <[email protected]> (cherry picked from commit 144c633) Signed-off-by: Hamza El-Saawy <[email protected]> --------- Signed-off-by: Hamza El-Saawy <[email protected]>
1 parent 6f9af87 commit 22f9510

File tree

17 files changed

+394
-258
lines changed

17 files changed

+394
-258
lines changed

cmd/runhcs/container.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ func launchShim(cmd, pidFile, logFile string, args []string, data interface{}) (
153153
}
154154

155155
fullargs = append(fullargs, "--log-format", logFormat)
156-
if logrus.GetLevel() == logrus.DebugLevel {
156+
if logrus.IsLevelEnabled(logrus.DebugLevel) {
157157
fullargs = append(fullargs, "--debug")
158158
}
159159
}

internal/annotations/annotations.go

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,7 @@
55
// Do not rely on these annotations to customize production workload behavior.
66
package annotations
77

8-
// uVM specific annotations
9-
8+
// uVM annotations.
109
const (
1110
// UVMHyperVSocketConfigPrefix is the prefix of an annotation to map a [hyper-v socket] service GUID
1211
// to a JSON-encoded string of its [configuration].
@@ -30,24 +29,15 @@ const (
3029
// [configuration]: https://learn.microsoft.com/en-us/virtualization/api/hcs/schemareference#HvSocketServiceConfig
3130
UVMHyperVSocketConfigPrefix = "io.microsoft.virtualmachine.hv-socket.service-table."
3231

33-
// AdditionalRegistryValues specifies additional registry keys and their values to set in the WCOW UVM.
34-
// The format is a JSON-encoded string of an array containing [HCS RegistryValue] objects.
35-
//
36-
// Registry values will be available under `HKEY_LOCAL_MACHINE` root key.
37-
//
38-
// For example:
39-
//
40-
// "[{\"Key\": {\"Hive\": \"System\", \"Name\": \"registry\\key\\path"}, \"Name\": \"ValueName\", \"Type\": \"String\", \"StringValue\": \"value\"}]"
41-
//
42-
// [HCS RegistryValue]: https://learn.microsoft.com/en-us/virtualization/api/hcs/schemareference#registryvalue
43-
AdditionalRegistryValues = "io.microsoft.virtualmachine.wcow.additional-reg-keys"
44-
45-
// ExtraVSockPorts adds additional ports to the list of ports that the UVM is allowed to use.
46-
ExtraVSockPorts = "io.microsoft.virtualmachine.lcow.extra-vsock-ports"
47-
4832
// UVMConsolePipe is the name of the named pipe that the UVM console is connected to. This works only for non-SNP
4933
// scenario, for SNP use a debugger.
5034
UVMConsolePipe = "io.microsoft.virtualmachine.console.pipe"
35+
)
36+
37+
// LCOW uVM annotations.
38+
const (
39+
// ExtraVSockPorts adds additional ports to the list of ports that the UVM is allowed to use.
40+
ExtraVSockPorts = "io.microsoft.virtualmachine.lcow.extra-vsock-ports"
5141

5242
// NetworkingPolicyBasedRouting toggles on the ability to set policy based routing in the
5343
// guest for LCOW.
@@ -57,3 +47,18 @@ const (
5747
// LCOW scenarios. Ideally, this annotation should be removed if no issues are found.
5848
NetworkingPolicyBasedRouting = "io.microsoft.virtualmachine.lcow.network.policybasedrouting"
5949
)
50+
51+
// WCOW uVM annotations.
52+
const (
53+
// AdditionalRegistryValues specifies additional registry keys and their values to set in the WCOW UVM.
54+
// The format is a JSON-encoded string of an array containing [HCS RegistryValue] objects.
55+
//
56+
// Registry values will be available under `HKEY_LOCAL_MACHINE` root key.
57+
//
58+
// For example:
59+
//
60+
// "[{\"Key\": {\"Hive\": \"System\", \"Name\": \"registry\\key\\path"}, \"Name\": \"ValueName\", \"Type\": \"String\", \"StringValue\": \"value\"}]"
61+
//
62+
// [HCS RegistryValue]: https://learn.microsoft.com/en-us/virtualization/api/hcs/schemareference#registryvalue
63+
AdditionalRegistryValues = "io.microsoft.virtualmachine.wcow.additional-reg-keys"
64+
)

internal/gcs/bridge.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -404,7 +404,7 @@ func (brdg *bridge) writeMessage(buf *bytes.Buffer, enc *json.Encoder, typ msgTy
404404
// Update the message header with the size.
405405
binary.LittleEndian.PutUint32(buf.Bytes()[hdrOffSize:], uint32(buf.Len()))
406406

407-
if brdg.log.Logger.GetLevel() > logrus.DebugLevel {
407+
if brdg.log.Logger.IsLevelEnabled(logrus.TraceLevel) {
408408
b := buf.Bytes()[hdrSize:]
409409
switch typ {
410410
// container environment vars are in rpCreate for linux; rpcExecuteProcess for windows

internal/guest/bridge/bridge.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,7 @@ func (b *Bridge) ListenAndServe(bridgeIn io.ReadCloser, bridgeOut io.WriteCloser
310310
trace.StringAttribute("cid", base.ContainerID))
311311

312312
entry := log.G(ctx)
313-
if entry.Logger.GetLevel() > logrus.DebugLevel {
313+
if entry.Logger.IsLevelEnabled(logrus.TraceLevel) {
314314
var err error
315315
var msgBytes []byte
316316
switch header.Type {

internal/guest/network/netns.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ func NetNSConfig(ctx context.Context, ifStr string, nsPid int, adapter *guestres
170170
}
171171

172172
// Add some debug logging
173-
if entry.Logger.GetLevel() >= logrus.DebugLevel {
173+
if entry.Logger.IsLevelEnabled(logrus.DebugLevel) {
174174
curNS, _ := netns.Get()
175175
// Refresh link attributes/state
176176
link, _ = netlink.LinkByIndex(link.Attrs().Index)

internal/guest/runtime/hcsv2/nvidia_utils.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,9 @@ import (
2323
const nvidiaDebugFilePath = "nvidia-container.log"
2424
const nvidiaToolBinary = "nvidia-container-cli"
2525

26-
// described here: https://github.com/opencontainers/runtime-spec/blob/39c287c415bf86fb5b7506528d471db5405f8ca8/config.md#posix-platform-hooks
27-
// addNvidiaDeviceHook builds the arguments for nvidia-container-cli and creates the prestart hook
26+
// addNvidiaDeviceHook builds the arguments for nvidia-container-cli and creates the createRuntime [OCI hooks].
27+
//
28+
// [OCI hooks]: https://github.com/opencontainers/runtime-spec/blob/39c287c415bf86fb5b7506528d471db5405f8ca8/config.md#posix-platform-hooks
2829
func addNvidiaDeviceHook(ctx context.Context, spec *oci.Spec, ociBundlePath string) error {
2930
genericHookBinary := "generichook"
3031
genericHookPath, err := exec.LookPath(genericHookBinary)

internal/guest/runtime/hcsv2/workload_container.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,25 @@ func setupWorkloadContainerSpec(ctx context.Context, sbid, id string, spec *oci.
220220
if err := addNvidiaDeviceHook(ctx, spec, ociBundlePath); err != nil {
221221
return err
222222
}
223+
224+
// The NVIDIA device hook `nvidia-container-cli` adds `rw` permissions for the
225+
// GPU and ctl nodes (`c 195:*`) to the devices allow list, but CUDA apparently also
226+
// needs `rwm` permission for other device nodes (e.g., `c 235`)
227+
//
228+
// Grant `rwm` to all character devices (`c *:* rwm`) to avoid hard coding exact node
229+
// numbers, which are unknown before the driver runs (GPU devices are presented as I2C
230+
// devices initially) or could change with driver implementation.
231+
//
232+
// Note: runc already grants mknod, `c *:* m`, so this really adds `rw` permissions for
233+
// all character devices:
234+
// https://github.com/opencontainers/runc/blob/6bae6cad4759a5b3537d550f43ea37d51c6b518a/libcontainer/specconv/spec_linux.go#L205-L222
235+
spec.Linux.Resources.Devices = append(spec.Linux.Resources.Devices,
236+
oci.LinuxDeviceCgroup{
237+
Allow: true,
238+
Type: "c",
239+
Access: "rwm",
240+
},
241+
)
223242
}
224243
// add other assigned devices to the spec
225244
if err := specGuest.AddAssignedDevice(ctx, spec); err != nil {

internal/guest/spec/spec_devices.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,13 @@ import (
1010
"strings"
1111
"time"
1212

13-
"github.com/Microsoft/hcsshim/internal/guest/storage/pci"
14-
"github.com/Microsoft/hcsshim/internal/log"
1513
"github.com/opencontainers/runc/libcontainer/devices"
1614
oci "github.com/opencontainers/runtime-spec/specs-go"
1715
"github.com/pkg/errors"
16+
"github.com/sirupsen/logrus"
17+
18+
"github.com/Microsoft/hcsshim/internal/guest/storage/pci"
19+
"github.com/Microsoft/hcsshim/internal/log"
1820
)
1921

2022
const (
@@ -23,13 +25,17 @@ const (
2325
charType = "char"
2426
blockType = "block"
2527

28+
// TODO: consolidate with `internal\uvm\virtual_device.go` and use in both locations
29+
gpuDeviceIDType = "gpu"
2630
vpciDeviceIDTypeLegacy = "vpci"
2731
vpciDeviceIDType = "vpci-instance-id"
2832
)
2933

3034
// AddAssignedDevice goes through the assigned devices that have been enumerated
3135
// on the spec and updates the spec so that the correct device nodes can be mounted
3236
// into the resulting container by the runtime.
37+
//
38+
// GPU devices are skipped, since they are handled in [addNvidiaDeviceHook].
3339
func AddAssignedDevice(ctx context.Context, spec *oci.Spec) error {
3440
// Add an explicit timeout before we try to find the dev nodes so we
3541
// aren't waiting forever.
@@ -52,6 +58,12 @@ func AddAssignedDevice(ctx context.Context, spec *oci.Spec) error {
5258
for _, dev := range devs {
5359
AddLinuxDeviceToSpec(ctx, dev, spec, true)
5460
}
61+
case gpuDeviceIDType:
62+
default:
63+
log.G(ctx).WithFields(logrus.Fields{
64+
"type": d.IDType,
65+
"id": d.ID,
66+
}).Warn("unknown device type")
5567
}
5668
}
5769

internal/hcsoci/devices.go

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -174,21 +174,21 @@ func handleAssignedDevicesLCOW(
174174

175175
// assign device into UVM and create corresponding spec windows devices
176176
for _, d := range specDevs {
177-
if uvm.IsValidDeviceType(d.IDType) {
178-
pciID, index := devices.GetDeviceInfoFromPath(d.ID)
179-
vpci, err := vm.AssignDevice(ctx, pciID, index, "")
180-
if err != nil {
181-
return resultDevs, closers, errors.Wrapf(err, "failed to assign device %s, function %d to pod %s", pciID, index, vm.ID())
182-
}
183-
closers = append(closers, vpci)
184-
185-
// update device ID on the spec to the assigned device's resulting vmbus guid so gcs knows which devices to
186-
// map into the container
187-
d.ID = vpci.VMBusGUID
188-
resultDevs = append(resultDevs, d)
189-
} else {
177+
if !uvm.IsValidDeviceType(d.IDType) {
190178
return resultDevs, closers, errors.Errorf("specified device %s has unsupported type %s", d.ID, d.IDType)
191179
}
180+
181+
pciID, index := devices.GetDeviceInfoFromPath(d.ID)
182+
vpci, err := vm.AssignDevice(ctx, pciID, index, "")
183+
if err != nil {
184+
return resultDevs, closers, errors.Wrapf(err, "failed to assign device %s, function %d to pod %s", pciID, index, vm.ID())
185+
}
186+
closers = append(closers, vpci)
187+
188+
// update device ID on the spec to the assigned device's resulting vmbus guid so gcs knows which devices to
189+
// map into the container
190+
d.ID = vpci.VMBusGUID
191+
resultDevs = append(resultDevs, d)
192192
}
193193

194194
return resultDevs, closers, nil

internal/oci/annotations.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ func ProcessAnnotations(ctx context.Context, s *specs.Spec) error {
3434

3535
// expand annotations
3636
var errs []error
37-
for key, exps := range annotations.AnnotationExpansions {
37+
for key, exps := range annotations.AnnotationExpansionMap() {
3838
// check if annotation is present
3939
if val, ok := s.Annotations[key]; ok {
4040
// ideally, some normalization would occur here (ie, "True" -> "true")

0 commit comments

Comments
 (0)