Skip to content

Commit 0e3619d

Browse files
pranavsriram8l-technicore
authored andcommitted
NSG bugfixes with respect to pushing metrics, nil checks,
follow-up improvements for the nsg rule management feature
1 parent db7995f commit 0e3619d

File tree

14 files changed

+327
-187
lines changed

14 files changed

+327
-187
lines changed

pkg/cloudprovider/providers/oci/instances_test.go

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -278,18 +278,6 @@ func (MockSecurityListManager) Delete(ctx context.Context, lbSubnets []*core.Sub
278278

279279
type MockSecurityListManagerFactory func(mode string) MockSecurityListManager
280280

281-
type MockNsgManager struct{}
282-
283-
func (MockNsgManager) Add(ctx context.Context, lbService serviceComponents) error {
284-
return nil
285-
}
286-
287-
func (MockNsgManager) Delete(ctx context.Context, lbService serviceComponents) error {
288-
return nil
289-
}
290-
291-
type MockNsgManagerFactory func(mode string) MockNsgManager
292-
293281
type MockOCIClient struct{}
294282

295283
func (MockOCIClient) Compute() client.ComputeInterface {

pkg/cloudprovider/providers/oci/load_balancer.go

Lines changed: 119 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
"k8s.io/apimachinery/pkg/labels"
3131
"k8s.io/apimachinery/pkg/util/sets"
3232
k8sports "k8s.io/kubernetes/pkg/cluster/ports"
33+
"k8s.io/utils/pointer"
3334

3435
providercfg "github.com/oracle/oci-cloud-controller-manager/pkg/cloudprovider/providers/oci/config"
3536
"github.com/oracle/oci-cloud-controller-manager/pkg/metrics"
@@ -474,7 +475,7 @@ func (cp *CloudProvider) EnsureLoadBalancer(ctx context.Context, clusterName str
474475
startTime := time.Now()
475476
lbName := GetLoadBalancerName(service)
476477
loadBalancerType := getLoadBalancerType(service)
477-
logger := cp.logger.With("loadBalancerName", lbName, "serviceName", service.Name, "loadBalancerType", loadBalancerType)
478+
logger := cp.logger.With("loadBalancerName", lbName, "serviceName", service.Name, "loadBalancerType", loadBalancerType, "serviceUid", service.UID)
478479
if sa, useWI := service.Annotations[ServiceAnnotationServiceAccountName]; useWI { // When using Workload Identity
479480
logger = logger.With("serviceAccount", sa, "nameSpace", service.Namespace)
480481
}
@@ -522,7 +523,7 @@ func (cp *CloudProvider) EnsureLoadBalancer(ctx context.Context, clusterName str
522523
metrics.SendMetricData(cp.metricPusher, getMetric(loadBalancerType, Update), time.Since(startTime).Seconds(), dimensionsMap)
523524
return nil, err
524525
}
525-
exists := !client.IsNotFound(err)
526+
lbExists := !client.IsNotFound(err)
526527
lbOCID := ""
527528
if lb != nil && lb.Id != nil {
528529
lbOCID = *lb.Id
@@ -609,6 +610,20 @@ func (cp *CloudProvider) EnsureLoadBalancer(ctx context.Context, clusterName str
609610
frontendNsgId := ""
610611
backendNsgs := spec.ManagedNetworkSecurityGroup.backendNsgId
611612

613+
// Check if there are any NSGs which are created by CCM (and use that), but didn't get attached to LB because the LB creation failed.
614+
if !lbExists {
615+
frontendNsgId, _, err = cp.getFrontendNsgByName(ctx, logger, generateNsgName(service), cp.config.CompartmentID, cp.config.VCNID, fmt.Sprintf("%s", service.UID))
616+
if err != nil {
617+
return nil, err
618+
}
619+
logger.Infof("found managed NSG %s", frontendNsgId)
620+
if frontendNsgId != "" {
621+
spec, err = addFrontendNsgToSpec(spec, frontendNsgId)
622+
if err != nil {
623+
return nil, err
624+
}
625+
}
626+
}
612627
if lb != nil && lb.Id != nil && lb.NetworkSecurityGroupIds != nil {
613628
nsgs := lb.NetworkSecurityGroupIds
614629
for _, id := range nsgs {
@@ -620,32 +635,51 @@ func (cp *CloudProvider) EnsureLoadBalancer(ctx context.Context, clusterName str
620635
metrics.SendMetricData(cp.metricPusher, getMetric(util.NSGType, Get), time.Since(startTime).Seconds(), dimensionsMap)
621636
}
622637
if frontendNsgId != "" {
623-
if !contains(spec.NetworkSecurityGroupIds, frontendNsgId) {
624-
spec.NetworkSecurityGroupIds = append(spec.NetworkSecurityGroupIds, frontendNsgId)
638+
spec, err = addFrontendNsgToSpec(spec, frontendNsgId)
639+
if err != nil {
640+
return nil, err
625641
}
626642
logger.With("loadBalancerID", *lb.Id).Infof("using existing frontendNsg %s", frontendNsgId)
627643
break
628644
}
629645
}
646+
647+
if frontendNsgId == "" {
648+
// Check if there are any CCM created NSGs which might be manually removed by customer causing a dirty LB
649+
logger.Info("Check if managed NSGs present in VCN")
650+
frontendNsgId, _, err = cp.getFrontendNsgByName(ctx, logger, generateNsgName(service), cp.config.CompartmentID, cp.config.VCNID, fmt.Sprintf("%s", service.UID))
651+
if err != nil {
652+
return nil, err
653+
}
654+
logger.Infof("found managed NSG %s", frontendNsgId)
655+
if frontendNsgId != "" {
656+
spec, err = addFrontendNsgToSpec(spec, frontendNsgId)
657+
if err != nil {
658+
return nil, err
659+
}
660+
}
661+
}
630662
}
631-
logger = logger.With("serviceId", service.UID)
663+
632664
// Create the NSG and add it to the LbSpec
633665
if frontendNsgId == "" {
634666
if len(spec.NetworkSecurityGroupIds) >= MaxNsgPerVnic {
635-
return nil, fmt.Errorf("invalid number of Network Security Groups (Max: 5) including CCM managed nsg")
667+
return nil, fmt.Errorf("invalid number of Network Security Groups (Max: 5) including managed nsg")
636668
}
637669
resp, err := cp.client.Networking().CreateNetworkSecurityGroup(ctx, cp.config.CompartmentID, cp.config.VCNID, generateNsgName(service), fmt.Sprintf("%s", service.UID))
638670
if err != nil {
639671
logger.With(zap.Error(err)).Error("Failed to create nsg")
640672
errorType = util.GetError(err)
641673
nsgMetricDimension = util.GetMetricDimensionForComponent(errorType, util.NSGType)
642674
dimensionsMap[metrics.ComponentDimension] = nsgMetricDimension
643-
dimensionsMap[metrics.ResourceOCIDDimension] = *resp.Id
644675
metrics.SendMetricData(cp.metricPusher, getMetric(util.NSGType, Create), time.Since(startTime).Seconds(), dimensionsMap)
645676
return nil, err
646677
}
647678
frontendNsgId = *resp.Id
648-
spec.NetworkSecurityGroupIds = append(spec.NetworkSecurityGroupIds, frontendNsgId)
679+
spec, err = addFrontendNsgToSpec(spec, frontendNsgId)
680+
if err != nil {
681+
return nil, err
682+
}
649683
logger.With("frontendNsgId", *resp.Id).
650684
Info("Successfully created nsg")
651685
nsgMetricDimension = util.GetMetricDimensionForComponent(util.Success, util.NSGType)
@@ -662,24 +696,27 @@ func (cp *CloudProvider) EnsureLoadBalancer(ctx context.Context, clusterName str
662696
errorType = util.GetError(err)
663697
nsgMetricDimension = util.GetMetricDimensionForComponent(errorType, util.NSGType)
664698
dimensionsMap[metrics.ComponentDimension] = nsgMetricDimension
665-
dimensionsMap[metrics.ResourceOCIDDimension] = *resp.Id
666699
metrics.SendMetricData(cp.metricPusher, getMetric(util.NSGType, Get), time.Since(startTime).Seconds(), dimensionsMap)
667700
return nil, err
668701
}
669702
freeformTags := resp.FreeformTags
670703
if _, ok := freeformTags["ManagedBy"]; !ok {
671704
if etag != nil {
672-
freeformTags["ManagedBy"] = "OKE-CCM"
705+
freeformTags["ManagedBy"] = "CCM"
673706
response, err := cp.client.Networking().UpdateNetworkSecurityGroup(ctx, nsg, *etag, freeformTags)
674707
if err != nil {
675-
logger.With(zap.Error(err)).Error("Failed to get nsg")
708+
logger.With(zap.Error(err)).Errorf("Failed to update nsg %s", nsg)
676709
errorType = util.GetError(err)
677710
nsgMetricDimension = util.GetMetricDimensionForComponent(errorType, util.NSGType)
678711
dimensionsMap[metrics.ComponentDimension] = nsgMetricDimension
679-
dimensionsMap[metrics.ResourceOCIDDimension] = *response.Id
712+
dimensionsMap[metrics.ResourceOCIDDimension] = nsg
680713
metrics.SendMetricData(cp.metricPusher, getMetric(util.NSGType, Update), time.Since(startTime).Seconds(), dimensionsMap)
681714
return nil, err
682715
}
716+
nsgMetricDimension = util.GetMetricDimensionForComponent(util.Success, util.NSGType)
717+
dimensionsMap[metrics.ComponentDimension] = nsgMetricDimension
718+
dimensionsMap[metrics.ResourceOCIDDimension] = *response.Id
719+
metrics.SendMetricData(cp.metricPusher, getMetric(util.NSGType, Update), time.Since(startTime).Seconds(), dimensionsMap)
683720
}
684721
}
685722
}
@@ -698,7 +735,7 @@ func (cp *CloudProvider) EnsureLoadBalancer(ctx context.Context, clusterName str
698735
}
699736
}
700737

701-
if !exists {
738+
if !lbExists {
702739
lbStatus, newLBOCID, err := lbProvider.createLoadBalancer(ctx, spec)
703740
if err != nil {
704741
logger.With(zap.Error(err)).Error("Failed to provision LoadBalancer")
@@ -728,7 +765,10 @@ func (cp *CloudProvider) EnsureLoadBalancer(ctx context.Context, clusterName str
728765
// was just created then these values would be equal; however, if the load balancer
729766
// already existed and the default subnet ids changed, then this would ensure
730767
// we are setting the security rules on the correct subnets.
731-
spec.Subnets = lb.SubnetIds
768+
spec, err = updateSpecWithLbSubnets(spec, lb.SubnetIds)
769+
if err != nil {
770+
return nil, err
771+
}
732772

733773
// If the load balancer needs an SSL cert ensure it is present.
734774
if requiresCertificate(service) {
@@ -1139,6 +1179,15 @@ func (cp *CloudProvider) EnsureLoadBalancerDeleted(ctx context.Context, clusterN
11391179
var nsgMetricDimension string
11401180

11411181
dimensionsMap := make(map[string]string)
1182+
var frontendNsgId = ""
1183+
uid := fmt.Sprintf("%s", service.UID)
1184+
var etag *string
1185+
1186+
securityRuleManagementMode, nsg, err := getRuleManagementMode(service)
1187+
if err != nil {
1188+
logger.With(zap.Error(err)).Error("failed to get rule management mode")
1189+
return errors.Wrap(err, "failed to get rule management mode")
1190+
}
11421191

11431192
lbProvider, err := cp.getLoadBalancerProvider(ctx, service)
11441193
if err != nil {
@@ -1148,6 +1197,33 @@ func (cp *CloudProvider) EnsureLoadBalancerDeleted(ctx context.Context, clusterN
11481197
if err != nil {
11491198
if client.IsNotFound(err) {
11501199
logger.Info("Could not find load balancer. Nothing to do.")
1200+
if securityRuleManagementMode == NSG {
1201+
displayName := generateNsgName(service)
1202+
nsg.frontendNsgId, etag, err = cp.getFrontendNsgByName(ctx, logger, displayName, cp.config.CompartmentID, cp.config.VCNID, uid)
1203+
if err != nil {
1204+
return errors.Wrap(err, "failed to get frontend NSG")
1205+
}
1206+
// Delete of NSG happens if NSG was created but LB creation fails
1207+
if nsg != nil && nsg.nsgRuleManagementMode == RuleManagementModeNsg && nsg.frontendNsgId != "" {
1208+
if etag != nil {
1209+
logger = logger.With("frontendNsgId", nsg.frontendNsgId)
1210+
logger.Infof("deleting frontend nsg %s", nsg.frontendNsgId)
1211+
nsgDeleted, err := cp.deleteNsg(ctx, logger, nsg.frontendNsgId, *etag)
1212+
if !nsgDeleted || err != nil {
1213+
logger.With(zap.Error(err)).Error("failed to delete nsg")
1214+
errorType = util.GetError(err)
1215+
nsgMetricDimension = util.GetMetricDimensionForComponent(errorType, util.NSGType)
1216+
dimensionsMap[metrics.ComponentDimension] = nsgMetricDimension
1217+
metrics.SendMetricData(cp.metricPusher, getMetric(util.NSGType, Delete), time.Since(startTime).Seconds(), dimensionsMap)
1218+
return err
1219+
}
1220+
nsgMetricDimension = util.GetMetricDimensionForComponent(util.Success, util.NSGType)
1221+
dimensionsMap[metrics.ComponentDimension] = nsgMetricDimension
1222+
metrics.SendMetricData(cp.metricPusher, getMetric(util.NSGType, Delete), time.Since(startTime).Seconds(), dimensionsMap)
1223+
logger.Infof("Managed nsg with id %s deleted", nsg.frontendNsgId)
1224+
}
1225+
}
1226+
}
11511227
return nil
11521228
}
11531229
errorType = util.GetError(err)
@@ -1162,18 +1238,10 @@ func (cp *CloudProvider) EnsureLoadBalancerDeleted(ctx context.Context, clusterN
11621238
id := *lb.Id
11631239
dimensionsMap[metrics.ResourceOCIDDimension] = id
11641240
logger = logger.With("loadBalancerID", id, "loadBalancerType", getLoadBalancerType(service))
1165-
securityRuleManagementMode, nsg, err := getRuleManagementMode(service)
1166-
if err != nil {
1167-
logger.With(zap.Error(err)).Error("failed to get rule management mode")
1168-
return errors.Wrap(err, "failed to get rule management mode")
1169-
}
11701241

1171-
var etag *string
1172-
var frontendNsgId = ""
11731242
if securityRuleManagementMode == NSG {
11741243
// List network security groups
11751244
nsgs := lb.NetworkSecurityGroupIds
1176-
uid := fmt.Sprintf("%s", service.UID)
11771245
for _, nsgId := range nsgs {
11781246
frontendNsgId, etag, err = cp.getFrontendNsg(ctx, logger, nsgId, uid)
11791247
if err != nil {
@@ -1183,7 +1251,7 @@ func (cp *CloudProvider) EnsureLoadBalancerDeleted(ctx context.Context, clusterN
11831251
metrics.SendMetricData(cp.metricPusher, getMetric(util.NSGType, Get), time.Since(startTime).Seconds(), dimensionsMap)
11841252
}
11851253
if frontendNsgId != "" {
1186-
logger.With("frontendNsgId", frontendNsgId).Infof("queue frontend nsg for delete %s", frontendNsgId)
1254+
logger = logger.With("frontendNsgId", frontendNsgId)
11871255
nsg.frontendNsgId = frontendNsgId
11881256
break
11891257
}
@@ -1231,8 +1299,10 @@ func (cp *CloudProvider) EnsureLoadBalancerDeleted(ctx context.Context, clusterN
12311299
metrics.SendMetricData(cp.metricPusher, getMetric(loadBalancerType, Delete), time.Since(startTime).Seconds(), dimensionsMap)
12321300

12331301
// Delete of NSG happens after delete of the Loadbalancer
1234-
if nsg != nil && nsg.nsgRuleManagementMode == RuleManagementModeNsg && frontendNsgId != "" {
1302+
if nsg != nil && nsg.nsgRuleManagementMode == RuleManagementModeNsg && nsg.frontendNsgId != "" {
12351303
if etag != nil {
1304+
logger = logger.With("frontendNsgId", nsg.frontendNsgId)
1305+
logger.Infof("deleting frontend nsg %s", nsg.frontendNsgId)
12361306
nsgDeleted, err := cp.deleteNsg(ctx, logger, nsg.frontendNsgId, *etag)
12371307
if !nsgDeleted || err != nil {
12381308
logger.With(zap.Error(err)).Error("failed to delete nsg")
@@ -1245,7 +1315,7 @@ func (cp *CloudProvider) EnsureLoadBalancerDeleted(ctx context.Context, clusterN
12451315
nsgMetricDimension = util.GetMetricDimensionForComponent(util.Success, util.NSGType)
12461316
dimensionsMap[metrics.ComponentDimension] = nsgMetricDimension
12471317
metrics.SendMetricData(cp.metricPusher, getMetric(util.NSGType, Delete), time.Since(startTime).Seconds(), dimensionsMap)
1248-
logger.With("frontendNsgId", nsg.frontendNsgId).Infof("ccm managed nsg with id %s deleted", nsg.frontendNsgId)
1318+
logger.Infof("managed nsg with id %s deleted", nsg.frontendNsgId)
12491319
}
12501320
}
12511321

@@ -1450,26 +1520,41 @@ func (cp *CloudProvider) checkAllBackendNodesNotReady(nodeList []*v1.Node) bool
14501520
func (cp *CloudProvider) deleteNsg(ctx context.Context, logger *zap.SugaredLogger, id, etag string) (bool, error) {
14511521
opcRequestId, err := cp.client.Networking().DeleteNetworkSecurityGroup(ctx, id, etag)
14521522
if err != nil {
1453-
logger.Errorf("failed to delete nsg %s OpcRequestId %s", id, *opcRequestId)
1523+
logger.Errorf("failed to delete nsg %s OpcRequestId %s", id, pointer.StringDeref(opcRequestId, ""))
14541524
return false, err
14551525
}
1456-
logger.Infof("delete nsg OpcRequestId %s", *opcRequestId)
1526+
logger.Infof("delete nsg OpcRequestId %s", pointer.StringDeref(opcRequestId, ""))
14571527
return true, nil
14581528
}
14591529

1460-
func (cp *CloudProvider) getFrontendNsg(ctx context.Context, logger *zap.SugaredLogger, id, uid string) (string, *string, error) {
1530+
func (cp *CloudProvider) getFrontendNsg(ctx context.Context, logger *zap.SugaredLogger, id, uid string) (frontendNsgId string, etag *string, err error) {
14611531
nsg, etag, err := cp.client.Networking().GetNetworkSecurityGroup(ctx, id)
1462-
if err != nil || etag == nil {
1532+
if err != nil || nsg == nil || etag == nil {
14631533
logger.Errorf("failed to get nsg %s", id)
14641534
return "", nil, err
14651535
}
1466-
logger.Infof("Get nsg %s", *nsg)
1467-
freeFormTags := map[string]string{"CreatedBy": "OKE-CCM", "ServiceUid": uid}
1536+
freeFormTags := map[string]string{"CreatedBy": "CCM", "ServiceUid": uid}
14681537
if reflect.DeepEqual(nsg.FreeformTags, freeFormTags) {
1469-
logger.Infof("Found OKE created frontend nsg %s", *nsg.Id)
1470-
return *nsg.Id, etag, nil
1538+
nsgId := pointer.StringDeref(nsg.Id, "")
1539+
logger.Infof("Found managed frontend nsg %s", nsgId)
1540+
return nsgId, etag, nil
14711541
} else {
1472-
logger.Infof("Found attached nsg %s but not OKE created", *nsg.Id)
1542+
logger.Infof("Found attached nsgs %s but not managed", pointer.StringDeref(nsg.Id, ""))
14731543
return "", nil, nil
14741544
}
14751545
}
1546+
1547+
func (cp *CloudProvider) getFrontendNsgByName(ctx context.Context, logger *zap.SugaredLogger, displayName, compartmentId, vcnId, uid string) (frontendNsgId string, etag *string, err error) {
1548+
nsgs, err := cp.client.Networking().ListNetworkSecurityGroups(ctx, displayName, compartmentId, vcnId)
1549+
for _, nsg := range nsgs {
1550+
frontendNsgId, etag, err = cp.getFrontendNsg(ctx, logger, pointer.StringDeref(nsg.Id, ""), uid)
1551+
if err != nil {
1552+
return "", nil, err
1553+
}
1554+
if frontendNsgId != "" {
1555+
logger.Infof("found frontend NSG %s", frontendNsgId)
1556+
return frontendNsgId, etag, nil
1557+
}
1558+
}
1559+
return "", nil, nil
1560+
}

0 commit comments

Comments
 (0)