Skip to content

Commit ca15528

Browse files
chethanv28, deepakkinni, kavyashree-r, sipriyaa, divyenpatel
authored
Cherry pick commits from master - 2e42ee to 08a7bd8 (#3642)
* Initialize volume manager in WORKLOAD cluster mode (#3598) Signed-off-by: Deepak Kinni <[email protected]> * removing static-prov tests from cf (#3606) * Adding vks testcases to EF column for n-1 and n-2 tkg versions (#3601) * fix createSnapshotWithTransaction workflow (#3597) * Surface NotSupported error message during AttachVolume call (#3595) Signed-off-by: Deepak Kinni <[email protected]> * vmservice - remove IP check if private network (#3610) * Added wait time before querying vm images (#3612) * Updated controller-gen version (#3614) * Fix incorrect volume attach error handling in CNSNodeVMAttachment reconciler (#3590) * wcp-snapshot test fix (#3620) * pq tag on block vanilla tests (#3619) * Allow devops users to attach GC RWX file volume with VM service VMs (#3611) * adding wcp-vsan-stretch-vmsvc-tests to pq (#3627) * adding disruptive testcases to PQ UTS section (#3624) * Remove validations for batch attach (#3622) * Wait until topology annotation gets added on supervisor PVCs while adding node affinity rules on PVs in guest cluster (#3588) * Adding vSAN Stretch TKG to UTS PQ (#3631) * Ensure that during detach PVC has a unique entry in volumeStatus (#3630) * Update vm-operator api version and parallelize sync volume api calls (#3604) * Fix CnsVolumeOperationRequest update logic when transaction support is enabled (#3623) Signed-off-by: Deepak Kinni <[email protected]> * Following enhancements made in the list-view task monitoring (#3584) Before recreating a ListView, destroy the old ListView to avoid leaks. 
Sends error results to pending tasks without blocking - change made in reportErrorOnAllPendingTasks. Uses a non-blocking send for the task result channel in processTaskUpdate. The TaskResult channel is now buffered (size 1) - change made in waitOnTask. * fixes (#3637) --------- Signed-off-by: Deepak Kinni <[email protected]> Co-authored-by: Deepak Kinni <[email protected]> Co-authored-by: kavyashree-r <[email protected]> Co-authored-by: sipriyaa <[email protected]> Co-authored-by: Divyen Patel <[email protected]> Co-authored-by: Satyanarayana Kolluri <[email protected]> Co-authored-by: Raj Kumar Gupta <[email protected]> Co-authored-by: skogta <[email protected]> Co-authored-by: Vipul Kotkar <[email protected]> Co-authored-by: nikhilbarge <[email protected]>
1 parent 593a6fa commit ca15528

File tree

64 files changed

+1973
-864
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

64 files changed

+1973
-864
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ ifeq (, $(shell which controller-gen))
135135
CONTROLLER_GEN_TMP_DIR=$$(mktemp -d) ;\
136136
cd $$CONTROLLER_GEN_TMP_DIR ;\
137137
go mod init tmp ;\
138-
go install sigs.k8s.io/controller-tools/cmd/controller-gen@v0.14.0 ;\
138+
go install sigs.k8s.io/controller-tools/cmd/controller-gen@v0.19.0 ;\
139139
rm -rf $$CONTROLLER_GEN_TMP_DIR ;\
140140
}
141141
CONTROLLER_GEN=$(GOBIN)/controller-gen

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ require (
2323
github.com/pkg/sftp v1.13.6
2424
github.com/prometheus/client_golang v1.22.0
2525
github.com/stretchr/testify v1.10.0
26-
github.com/vmware-tanzu/vm-operator/api v1.8.7-0.20250820184450-53a697d52f9c
26+
github.com/vmware-tanzu/vm-operator/api v1.9.1-0.20250923172217-bf5a74e51c65
2727
github.com/vmware-tanzu/vm-operator/external/byok v0.0.0-20250509154507-b93e51fc90fa
2828
github.com/vmware/govmomi v0.52.0-alpha.0.0.20250807230438-0eee109f1f2c
2929
go.uber.org/zap v1.27.0

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -287,8 +287,8 @@ github.com/thecodeteam/gofsutil v0.1.2 h1:FL87mBzZeeuDMZm8hpYLFcYylQdq6bbm8UQ1oc
287287
github.com/thecodeteam/gofsutil v0.1.2/go.mod h1:7bDOpr2aMnmdm9RTdxBEeqdOr+8RpnQhsB/VUEI3DgM=
288288
github.com/tmc/grpc-websocket-proxy v0.0.0-20220101234140-673ab2c3ae75 h1:6fotK7otjonDflCTK0BCfls4SPy3NcCVb5dqqmbRknE=
289289
github.com/tmc/grpc-websocket-proxy v0.0.0-20220101234140-673ab2c3ae75/go.mod h1:KO6IkyS8Y3j8OdNO85qEYBsRPuteD+YciPomcXdrMnk=
290-
github.com/vmware-tanzu/vm-operator/api v1.8.7-0.20250820184450-53a697d52f9c h1:+lECoxyxQcMsxU2CJ53YvaLEwqfGduS/3yQaLLRDYDo=
291-
github.com/vmware-tanzu/vm-operator/api v1.8.7-0.20250820184450-53a697d52f9c/go.mod h1:hkc/QZCSHcosWWMPS6VWWR12WenZcNE3BaTJ/8A8sNE=
290+
github.com/vmware-tanzu/vm-operator/api v1.9.1-0.20250923172217-bf5a74e51c65 h1:oby5iJHxU5KgtajXxT8B8VxUUPH20Zh0KTYlI+18AYs=
291+
github.com/vmware-tanzu/vm-operator/api v1.9.1-0.20250923172217-bf5a74e51c65/go.mod h1:nWTPpxfe4gHuuYuFcrs86+NMxfkqPk3a3IlvI8TCWak=
292292
github.com/vmware-tanzu/vm-operator/external/byok v0.0.0-20250509154507-b93e51fc90fa h1:4MKu14YJ7J54O6QKmT4ds5EUpysWLLtQRMff73cVkmU=
293293
github.com/vmware-tanzu/vm-operator/external/byok v0.0.0-20250509154507-b93e51fc90fa/go.mod h1:8tiuyYslzjLIUmOlXZuGKQdQP2ZgWGCVhVeyptmZYnk=
294294
github.com/vmware/govmomi v0.52.0-alpha.0.0.20250807230438-0eee109f1f2c h1:1nMVFr1CBMSNLLjsfx3QPfZ5k0R1/O29QX/A2X0w3RQ=

manifests/supervisorcluster/1.29/cns-csi.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -732,6 +732,9 @@ rules:
732732
- apiGroups: ["cns.vmware.com"]
733733
resources: ["cnsfileaccessconfigs"]
734734
verbs: ["get", "list", "update"]
735+
- apiGroups: ["vmoperator.vmware.com"]
736+
resources: ["virtualmachines"]
737+
verbs: ["get", "list"]
735738
---
736739
kind: ClusterRoleBinding
737740
apiVersion: rbac.authorization.k8s.io/v1

manifests/supervisorcluster/1.30/cns-csi.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -737,6 +737,9 @@ rules:
737737
- apiGroups: ["cns.vmware.com"]
738738
resources: ["cnsfileaccessconfigs"]
739739
verbs: ["get", "list", "update"]
740+
- apiGroups: ["vmoperator.vmware.com"]
741+
resources: ["virtualmachines"]
742+
verbs: ["get", "list"]
740743
---
741744
kind: ClusterRoleBinding
742745
apiVersion: rbac.authorization.k8s.io/v1

manifests/supervisorcluster/1.31/cns-csi.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -737,6 +737,9 @@ rules:
737737
- apiGroups: ["cns.vmware.com"]
738738
resources: ["cnsfileaccessconfigs"]
739739
verbs: ["get", "list", "update"]
740+
- apiGroups: ["vmoperator.vmware.com"]
741+
resources: ["virtualmachines"]
742+
verbs: ["get", "list"]
740743
---
741744
kind: ClusterRoleBinding
742745
apiVersion: rbac.authorization.k8s.io/v1

manifests/supervisorcluster/1.32/cns-csi.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -737,6 +737,9 @@ rules:
737737
- apiGroups: ["cns.vmware.com"]
738738
resources: ["cnsfileaccessconfigs"]
739739
verbs: ["get", "list", "update"]
740+
- apiGroups: ["vmoperator.vmware.com"]
741+
resources: ["virtualmachines"]
742+
verbs: ["get", "list"]
740743
---
741744
kind: ClusterRoleBinding
742745
apiVersion: rbac.authorization.k8s.io/v1

pkg/apis/cnsoperator/cnsnodevmbatchattachment/v1alpha1/cnsnodebatchvmattachment_types.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ const (
2929
IndependentPersistent DiskMode = "independent_persistent"
3030
// Changes are immediately and permanently written to the virtual disk.
3131
Persistent DiskMode = "persistent"
32+
// Changes to virtual disk are made to a redo log and discarded at power off.
33+
// It is not affected by snapshots.
34+
IndependentNonPersistent = "independent_nonpersistent"
3235
)
3336

3437
// The sharing mode of the virtual disk.

pkg/common/cns-lib/volume/listview.go

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -324,13 +324,23 @@ func (l *ListViewImpl) listenToTaskUpdates() {
324324
l.mu.Lock()
325325
log.Infof("acquired lock before re-creating listview")
326326
if recreateView {
327+
if l.listView != nil {
328+
destroyListviewErr := l.listView.Destroy(l.ctx)
329+
if destroyListviewErr != nil {
330+
// ignoring the error and re-creating the list view
331+
log.Errorf("failed to destroy listview object. err: %v", destroyListviewErr)
332+
} else {
333+
log.Info("successfully destroyed existing listview")
334+
}
335+
}
327336
log.Info("re-creating the listView object")
328337
err := l.createListView(l.ctx, nil)
329338
if err != nil {
330339
log.Errorf("failed to create a ListView. error: %+v", err)
331340
l.mu.Unlock()
332341
continue
333342
}
343+
log.Info("successfully created listview")
334344

335345
filter = getListViewWaitFilter(l.listView)
336346
l.waitForUpdatesContext, l.waitForUpdatesCancelFunc = context.WithCancel(context.Background())
@@ -395,12 +405,19 @@ func (l *ListViewImpl) listenToTaskUpdates() {
395405

396406
// reportErrorOnAllPendingTasks returns failure to all pending tasks in the map in case of vc failure
397407
func (l *ListViewImpl) reportErrorOnAllPendingTasks(err error) {
408+
log := logger.GetLogger(context.Background())
398409
for _, taskDetails := range l.taskMap.GetAll() {
399410
result := TaskResult{
400411
TaskInfo: nil,
401412
Err: err,
402413
}
403-
taskDetails.ResultCh <- result
414+
// Non-blocking send
415+
select {
416+
case taskDetails.ResultCh <- result:
417+
log.Infof("reported error for task %+v", taskDetails.Reference)
418+
default:
419+
log.Warnf("failed to report error for task %+v: channel blocked", taskDetails.Reference)
420+
}
404421
}
405422
}
406423

@@ -431,8 +448,16 @@ func (l *ListViewImpl) processTaskUpdate(prop types.PropertyChange) {
431448
result.TaskInfo = &taskInfo
432449
result.Err = nil
433450
}
434-
435-
taskDetails.ResultCh <- result
451+
// Use a non-blocking send to prevent deadlocks when multiple goroutines
452+
// try to send to the same channel (e.g., due to duplicate task updates from vSphere)
453+
select {
454+
case taskDetails.ResultCh <- result:
455+
log.Infof("Successfully sent task result for task %+v", taskInfo.Task)
456+
default:
457+
// Channel is full/blocked, which means another goroutine already sent the result
458+
// This can happen when vSphere sends duplicate task update events
459+
log.Warnf("result channel full for task %+v, ignoring duplicate update", taskInfo.Task)
460+
}
436461
}
437462

438463
// RemoveTasksMarkedForDeletion goes over the list of tasks in the map

pkg/common/cns-lib/volume/manager.go

Lines changed: 70 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -827,9 +827,9 @@ func (m *defaultManager) waitOnTask(csiOpContext context.Context,
827827
return nil, err
828828
}
829829
}
830-
ch := make(chan TaskResult)
830+
ch := make(chan TaskResult, 1)
831831
err := m.listViewIf.AddTask(csiOpContext, taskMoRef, ch)
832-
if errors.Unwrap(err) == ErrListViewTaskAddition {
832+
if errors.Is(err, ErrListViewTaskAddition) {
833833
return nil, logger.LogNewErrorf(log, "%s. err: %v", listviewAdditionError, err)
834834
} else if err != nil {
835835
// in case the task is not found in VC, we are returning a ManagedObjectNotFound error wrapped as a soap fault
@@ -1119,7 +1119,7 @@ func (m *defaultManager) AttachVolume(ctx context.Context,
11191119
}
11201120

11211121
volumeOperationRes := taskResult.GetCnsVolumeOperationResult()
1122-
if volumeOperationRes.Fault != nil {
1122+
if volumeOperationRes.Fault != nil && volumeOperationRes.Fault.Fault != nil {
11231123
faultType = ExtractFaultTypeFromVolumeResponseResult(ctx, volumeOperationRes)
11241124
_, isResourceInUseFault := volumeOperationRes.Fault.Fault.(*vim25types.ResourceInUse)
11251125
if isResourceInUseFault {
@@ -1133,6 +1133,37 @@ func (m *defaultManager) AttachVolume(ctx context.Context,
11331133
return diskUUID, "", nil
11341134
}
11351135
}
1136+
1137+
// Check if this is a CnsFault with NotSupported fault cause
1138+
if cnsFault, isCnsFault := volumeOperationRes.Fault.Fault.(*cnstypes.CnsFault); isCnsFault {
1139+
if cnsFault.FaultCause != nil {
1140+
notSupportedFault, isNotSupportedFault := cnsFault.FaultCause.Fault.(*vim25types.NotSupported)
1141+
if isNotSupportedFault {
1142+
log.Infof("observed CnsFault with NotSupported fault cause while attaching volume: %q with vm: %q",
1143+
volumeID, vm.String())
1144+
1145+
// Extract the specific error message from NotSupported fault's FaultMessage array
1146+
var errorMessages []string
1147+
for _, faultMsg := range notSupportedFault.FaultMessage {
1148+
if faultMsg.Message != "" {
1149+
errorMessages = append(errorMessages, faultMsg.Message)
1150+
}
1151+
}
1152+
1153+
if len(errorMessages) > 0 {
1154+
extractedMessage := strings.Join(errorMessages, " - ")
1155+
log.Infof("NotSupported fault extracted message: %s", extractedMessage)
1156+
return "", faultType, logger.LogNewErrorf(log,
1157+
"%q Failed to attach cns volume: %q to node vm: %q. fault: %q. opId: %q",
1158+
extractedMessage, volumeID, vm.String(), spew.Sdump(volumeOperationRes.Fault), taskInfo.ActivationId)
1159+
}
1160+
1161+
// Fallback to detailed dump for debugging
1162+
log.Debugf("NotSupported fault details: %+v", spew.Sdump(cnsFault.FaultCause))
1163+
}
1164+
}
1165+
}
1166+
11361167
return "", faultType, logger.LogNewErrorf(log, "failed to attach cns volume: %q to node vm: %q. fault: %q. opId: %q",
11371168
volumeID, vm.String(), spew.Sdump(volumeOperationRes.Fault), taskInfo.ActivationId)
11381169
}
@@ -2795,20 +2826,29 @@ func (m *defaultManager) createSnapshotWithImprovedIdempotencyCheck(ctx context.
27952826
// This function ensures that no orphaned snapshots are left behind on the vSphere backend
27962827
// in case of failures during the snapshot creation process
27972828
func (m *defaultManager) createSnapshotWithTransaction(ctx context.Context, volumeID string,
2798-
snapshotID string, extraParams interface{}) (*CnsSnapshotInfo, string, error) {
2829+
snapshotName string, extraParams interface{}) (*CnsSnapshotInfo, string, error) {
27992830
log := logger.GetLogger(ctx)
28002831
var (
28012832
// Reference to the CreateSnapshot task on CNS.
28022833
createSnapshotsTask *object.Task
28032834
// Name of the CnsVolumeOperationRequest instance.
2804-
instanceName = snapshotID + "-" + volumeID
2835+
instanceName = snapshotName + "-" + volumeID
28052836
// Local instance of CreateSnapshot details that needs to be persisted.
28062837
volumeOperationDetails *cnsvolumeoperationrequest.VolumeOperationRequestDetails
28072838
// error
28082839
err error
28092840
quotaInfo *cnsvolumeoperationrequest.QuotaDetails
28102841
isStorageQuotaM2FSSEnabled bool
28112842
)
2843+
// By default, external-snapshotter sets the snapshot name prefix to "snapshot-".
2844+
// This logic will break if the prefix configuration is changed.
2845+
// In Supervisor deployments, we assume this configuration remains unchanged by admin/DevOps.
2846+
// In Vanilla deployments, we publish the deployment manifest with the default configuration to ensure consistency.
2847+
if !strings.HasPrefix(snapshotName, "snapshot-") {
2848+
return nil, csifault.CSIInternalFault,
2849+
logger.LogNewErrorf(log, "invalid snapshotName %q: must start with 'snapshot-'", snapshotName)
2850+
}
2851+
snapshotID := strings.TrimPrefix(snapshotName, "snapshot-")
28122852
if extraParams != nil {
28132853
createSnapParams, ok := extraParams.(*CreateSnapshotExtraParams)
28142854
if !ok {
@@ -2870,6 +2910,14 @@ func (m *defaultManager) createSnapshotWithTransaction(ctx context.Context, volu
28702910
faultType := ExtractFaultTypeFromErr(ctx, err)
28712911
return nil, faultType, logger.LogNewErrorf(log, "failed to create snapshot with error: %v", err)
28722912
}
2913+
// Persist the volume operation details.
2914+
volumeOperationDetails = createRequestDetails(instanceName, volumeID, "", 0, quotaInfo,
2915+
volumeOperationDetails.OperationDetails.TaskInvocationTimestamp,
2916+
createSnapshotsTask.Reference().Value, "", "", taskInvocationStatusInProgress, "")
2917+
if err := m.operationStore.StoreRequestDetails(ctx, volumeOperationDetails); err != nil {
2918+
// Don't return if CreateSnapshot details can't be stored.
2919+
log.Warnf("failed to store CreateSnapshot details with error: %v", err)
2920+
}
28732921

28742922
var createSnapshotsTaskInfo *vim25types.TaskInfo
28752923
var faultType string
@@ -2888,8 +2936,20 @@ func (m *defaultManager) createSnapshotWithTransaction(ctx context.Context, volu
28882936
"from vCenter %q with err: %v", m.virtualCenter.Config.Host, err)
28892937
}
28902938
log.Infof("CreateSnapshots: VolumeID: %q, opId: %q", volumeID, createSnapshotsTaskInfo.ActivationId)
2891-
2892-
snapshotCreateResult := interface{}(createSnapshotsTaskInfo).(*cnstypes.CnsSnapshotCreateResult)
2939+
createSnapshotsTaskResult, err := cns.GetTaskResult(ctx, createSnapshotsTaskInfo)
2940+
if err != nil || createSnapshotsTaskResult == nil {
2941+
return nil, "", logger.LogNewErrorf(log, "unable to find the task result for CreateSnapshots task: %q "+
2942+
"from vCenter %q with err: %v", createSnapshotsTaskInfo.Task.Value, m.virtualCenter.Config.Host, err)
2943+
}
2944+
snapshotCreateResult, ok := createSnapshotsTaskResult.(*cnstypes.CnsSnapshotCreateResult)
2945+
if !ok || snapshotCreateResult == nil {
2946+
return nil, "", logger.LogNewErrorf(log,
2947+
"invalid task result: got %T with value %+v", createSnapshotsTaskResult, createSnapshotsTaskResult)
2948+
}
2949+
if snapshotCreateResult.Fault != nil {
2950+
return nil, "", logger.LogNewErrorf(log, "failed to create snapshot %q on volume %q with fault: %+v",
2951+
instanceName, volumeID, snapshotCreateResult.Fault)
2952+
}
28932953
cnsSnapshotInfo := &CnsSnapshotInfo{
28942954
SnapshotID: snapshotCreateResult.Snapshot.SnapshotId.Id,
28952955
SourceVolumeID: snapshotCreateResult.Snapshot.VolumeId.Id,
@@ -2947,14 +3007,15 @@ func (m *defaultManager) CreateSnapshot(
29473007
}
29483008
}
29493009
if createSnapParams != nil && createSnapParams.IsCSITransactionSupportEnabled {
2950-
var snapcontentPrefix = "snapcontent-"
29513010
cnssnapshotInfo, fault, err := m.createSnapshotWithTransaction(ctx, volumeID,
2952-
strings.TrimPrefix(snapshotName, snapcontentPrefix), extraParams)
3011+
snapshotName, extraParams)
29533012
if err != nil {
29543013
if IsNotSupportedFaultType(ctx, fault) {
29553014
log.Infof("Creating Snapshot with Transaction is not supported. " +
29563015
"Re-creating Snapshot without setting Snapshot ID in the spec")
29573016
return m.createSnapshotWithImprovedIdempotencyCheck(ctx, volumeID, snapshotName, extraParams)
3017+
} else {
3018+
return nil, logger.LogNewErrorf(log, "failed to create snapshot. error :%+v", err)
29583019
}
29593020
}
29603021
return cnssnapshotInfo, nil

0 commit comments

Comments
 (0)