Skip to content

Commit 0f694c4

Browse files
Merge pull request openshift#8765 from nutanix-cloud-native/nutanix-gpu-datadisks
CORS-3546: Nutanix: add gpus and dataDisks support
2 parents 7e98755 + fc00e04 commit 0f694c4

File tree

12 files changed

+1139
-53
lines changed

12 files changed

+1139
-53
lines changed

data/data/install.openshift.io_installconfigs.yaml

Lines changed: 540 additions & 1 deletion
Large diffs are not rendered by default.

pkg/asset/installconfig/nutanix/validation.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,26 @@ func ValidateForProvisioning(ic *types.InstallConfig) error {
7777
fmt.Sprintf("the failure domain %s configured subnet UUID does not correspond to a valid subnet in Prism: %v", fd.Name, err)))
7878
}
7979
}
80+
81+
// validate each configured dataSource image exists
82+
for _, dsImgRef := range fd.DataSourceImages {
83+
switch {
84+
case dsImgRef.UUID != "":
85+
if _, err = nc.V3.GetImage(ctx, dsImgRef.UUID); err != nil {
86+
errMsg := fmt.Sprintf("failureDomain %q: failed to find the dataSource image with uuid %s: %v", fd.Name, dsImgRef.UUID, err)
87+
errList = append(errList, field.Invalid(parentPath.Child("failureDomains", "dataSourceImage", "uuid"), dsImgRef.UUID, errMsg))
88+
}
89+
case dsImgRef.Name != "":
90+
if dsImgUUID, err := nutanixtypes.FindImageUUIDByName(ctx, nc, dsImgRef.Name); err != nil {
91+
errMsg := fmt.Sprintf("failureDomain %q: failed to find the dataSource image with name %q: %v", fd.Name, dsImgRef.UUID, err)
92+
errList = append(errList, field.Invalid(parentPath.Child("failureDomains", "dataSourceImage", "name"), dsImgRef.Name, errMsg))
93+
} else {
94+
dsImgRef.UUID = *dsImgUUID
95+
}
96+
default:
97+
errList = append(errList, field.Required(parentPath.Child("failureDomains", "dataSourceImage"), "both the dataSourceImage's uuid and name are empty, you need to configure one."))
98+
}
99+
}
80100
}
81101

82102
return errList.ToAggregate()

pkg/asset/machines/clusterapi.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -450,7 +450,7 @@ func (c *ClusterAPI) Generate(ctx context.Context, dependencies asset.Parents) e
450450
mpool.NumCPUs = 8
451451
mpool.Set(ic.Platform.Nutanix.DefaultMachinePlatform)
452452
mpool.Set(pool.Platform.Nutanix)
453-
if err = mpool.ValidateConfig(ic.Platform.Nutanix); err != nil {
453+
if err = mpool.ValidateConfig(ic.Platform.Nutanix, "master"); err != nil {
454454
return fmt.Errorf("failed to generate Cluster API machine manifests for control-plane: %w", err)
455455
}
456456
pool.Platform.Nutanix = &mpool

pkg/asset/machines/master.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -498,7 +498,7 @@ func (m *Master) Generate(ctx context.Context, dependencies asset.Parents) error
498498
mpool.NumCPUs = 8
499499
mpool.Set(ic.Platform.Nutanix.DefaultMachinePlatform)
500500
mpool.Set(pool.Platform.Nutanix)
501-
if err = mpool.ValidateConfig(ic.Platform.Nutanix); err != nil {
501+
if err = mpool.ValidateConfig(ic.Platform.Nutanix, "master"); err != nil {
502502
return errors.Wrap(err, "failed to create master machine objects")
503503
}
504504
pool.Platform.Nutanix = &mpool

pkg/asset/machines/nutanix/machines.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"k8s.io/apimachinery/pkg/api/resource"
99
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1010
"k8s.io/apimachinery/pkg/runtime"
11+
"k8s.io/utils/ptr"
1112

1213
configv1 "github.com/openshift/api/config/v1"
1314
machinev1 "github.com/openshift/api/machine/v1"
@@ -181,6 +182,7 @@ func provider(clusterID string, platform *nutanix.Platform, mpool *nutanix.Machi
181182
UUID: &peUUID,
182183
},
183184
SystemDiskSize: resource.MustParse(fmt.Sprintf("%dGi", mpool.OSDisk.DiskSizeGiB)),
185+
GPUs: mpool.GPUs,
184186
}
185187

186188
// FailureDomain
@@ -203,6 +205,49 @@ func provider(clusterID string, platform *nutanix.Platform, mpool *nutanix.Machi
203205
providerCfg.Categories = mpool.Categories
204206
}
205207

208+
for _, disk := range mpool.DataDisks {
209+
providerDisk := machinev1.NutanixVMDisk{
210+
DiskSize: disk.DiskSize,
211+
DeviceProperties: disk.DeviceProperties,
212+
}
213+
214+
if disk.StorageConfig != nil {
215+
providerDisk.StorageConfig = &machinev1.NutanixVMStorageConfig{
216+
DiskMode: disk.StorageConfig.DiskMode,
217+
}
218+
219+
if disk.StorageConfig.StorageContainer != nil {
220+
scRef := disk.StorageConfig.StorageContainer
221+
if scRef.ReferenceName != "" && failureDomain != nil {
222+
if scRef, err := platform.GetStorageContainerFromFailureDomain(failureDomain.Name, scRef.ReferenceName); err != nil {
223+
return nil, fmt.Errorf("not found storage container with reference name %q in failureDomain %q", scRef.ReferenceName, failureDomain.Name)
224+
}
225+
}
226+
227+
providerDisk.StorageConfig.StorageContainer = &machinev1.NutanixStorageResourceIdentifier{
228+
Type: machinev1.NutanixIdentifierUUID,
229+
UUID: ptr.To(scRef.UUID),
230+
}
231+
}
232+
}
233+
234+
if disk.DataSourceImage != nil {
235+
imgRef := disk.DataSourceImage
236+
if imgRef.ReferenceName != "" && failureDomain != nil {
237+
if imgRef, err := platform.GetDataSourceImageFromFailureDomain(failureDomain.Name, imgRef.ReferenceName); err != nil {
238+
return nil, fmt.Errorf("not found dataSource image with reference name %q in failureDomain %q", imgRef.ReferenceName, failureDomain.Name)
239+
}
240+
}
241+
242+
providerDisk.DataSource = &machinev1.NutanixResourceIdentifier{
243+
Type: machinev1.NutanixIdentifierUUID,
244+
UUID: ptr.To(imgRef.UUID),
245+
}
246+
}
247+
248+
providerCfg.DataDisks = append(providerCfg.DataDisks, providerDisk)
249+
}
250+
206251
return providerCfg, nil
207252
}
208253

pkg/asset/machines/worker.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -648,8 +648,8 @@ func (w *Worker) Generate(ctx context.Context, dependencies asset.Parents) error
648648
mpool := defaultNutanixMachinePoolPlatform()
649649
mpool.Set(ic.Platform.Nutanix.DefaultMachinePlatform)
650650
mpool.Set(pool.Platform.Nutanix)
651-
if err = mpool.ValidateConfig(ic.Platform.Nutanix); err != nil {
652-
return errors.Wrap(err, "failed to create master machine objects")
651+
if err = mpool.ValidateConfig(ic.Platform.Nutanix, "worker"); err != nil {
652+
return errors.Wrap(err, "failed to create worker machine objects")
653653
}
654654
pool.Platform.Nutanix = &mpool
655655
imageName := nutanixtypes.RHCOSImageName(clusterID.InfraID)

pkg/types/nutanix/helpers.go

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ import (
1414
"github.com/nutanix-cloud-native/prism-go-client/utils"
1515
nutanixclientv3 "github.com/nutanix-cloud-native/prism-go-client/v3"
1616
"github.com/pkg/errors"
17+
18+
machinev1 "github.com/openshift/api/machine/v1"
1719
)
1820

1921
const (
@@ -194,3 +196,94 @@ func CategoryKey(infraID string) string {
194196
categoryKey := fmt.Sprintf("%s%s", categoryKeyPrefix, infraID)
195197
return categoryKey
196198
}
199+
200+
// GetGPUList returns a list of VMGpus for the given list of GPU identifiers in the Prism Element (uuid).
201+
func GetGPUList(ctx context.Context, client *nutanixclientv3.Client, gpus []machinev1.NutanixGPU, peUUID string) ([]*nutanixclientv3.VMGpu, error) {
202+
vmGPUs := make([]*nutanixclientv3.VMGpu, 0)
203+
204+
if len(gpus) == 0 {
205+
return vmGPUs, nil
206+
}
207+
208+
peGPUs, err := GetGPUsForPE(ctx, client, peUUID)
209+
if err != nil {
210+
return nil, fmt.Errorf("failed to retrieve GPUs of the Prism Element cluster (uuid: %v): %w", peUUID, err)
211+
}
212+
if len(peGPUs) == 0 {
213+
return nil, fmt.Errorf("no available GPUs found in Prism Element cluster (uuid: %s)", peUUID)
214+
}
215+
216+
for _, gpu := range gpus {
217+
foundGPU, err := GetGPUFromList(ctx, client, gpu, peGPUs)
218+
if err != nil {
219+
return nil, err
220+
}
221+
vmGPUs = append(vmGPUs, foundGPU)
222+
}
223+
224+
return vmGPUs, nil
225+
}
226+
227+
// GetGPUFromList returns the VMGpu matching the input reqirements from the provided list of GPU devices.
228+
func GetGPUFromList(ctx context.Context, client *nutanixclientv3.Client, gpu machinev1.NutanixGPU, gpuDevices []*nutanixclientv3.GPU) (*nutanixclientv3.VMGpu, error) {
229+
for _, gd := range gpuDevices {
230+
if gd.Status != "UNUSED" {
231+
continue
232+
}
233+
234+
if (gpu.Type == machinev1.NutanixGPUIdentifierDeviceID && gd.DeviceID != nil && *gpu.DeviceID == int32(*gd.DeviceID)) ||
235+
(gpu.Type == machinev1.NutanixGPUIdentifierName && *gpu.Name == gd.Name) {
236+
return &nutanixclientv3.VMGpu{
237+
DeviceID: gd.DeviceID,
238+
Mode: &gd.Mode,
239+
Vendor: &gd.Vendor,
240+
}, nil
241+
}
242+
}
243+
244+
return nil, fmt.Errorf("no available GPU found that matches required GPU inputs")
245+
}
246+
247+
// GetGPUsForPE returns all the GPU devices for the given Prism Element (uuid).
248+
func GetGPUsForPE(ctx context.Context, client *nutanixclientv3.Client, peUUID string) ([]*nutanixclientv3.GPU, error) {
249+
gpus := make([]*nutanixclientv3.GPU, 0)
250+
hosts, err := client.V3.ListAllHost(ctx)
251+
if err != nil {
252+
return gpus, fmt.Errorf("failed to get hosts from Prism Central: %w", err)
253+
}
254+
255+
for _, host := range hosts.Entities {
256+
if host == nil ||
257+
host.Status == nil ||
258+
host.Status.ClusterReference == nil ||
259+
host.Status.ClusterReference.UUID != peUUID ||
260+
host.Status.Resources == nil ||
261+
len(host.Status.Resources.GPUList) == 0 {
262+
continue
263+
}
264+
265+
for _, peGpu := range host.Status.Resources.GPUList {
266+
if peGpu != nil {
267+
gpus = append(gpus, peGpu)
268+
}
269+
}
270+
}
271+
272+
return gpus, nil
273+
}
274+
275+
// FindImageUUIDByName retrieves the image resource uuid by the given image name from PC.
276+
func FindImageUUIDByName(ctx context.Context, ntnxclient *nutanixclientv3.Client, imageName string) (*string, error) {
277+
res, err := ntnxclient.V3.ListImage(ctx, &nutanixclientv3.DSMetadata{
278+
Filter: utils.StringPtr(fmt.Sprintf("name==%s", imageName)),
279+
})
280+
if err != nil || len(res.Entities) == 0 {
281+
return nil, fmt.Errorf("failed to find image by name %q. err: %w", imageName, err)
282+
}
283+
284+
if len(res.Entities) > 1 {
285+
return nil, fmt.Errorf("found more than one (%v) images with name %q", len(res.Entities), imageName)
286+
}
287+
288+
return res.Entities[0].Metadata.UUID, nil
289+
}

0 commit comments

Comments
 (0)