Skip to content

Commit 27376dd

Browse files
ramiro-gamarrajpayne3506
authored andcommitted
CNS - Ensuring no stale NCs during NNC reconcile (#2192)
* ensuring no stale ncs during nnc reconcile * only save state if mutated * ensuring we only remove stale ncs if none of their ips are assigned
1 parent 0255a9c commit 27376dd

File tree

4 files changed

+282
-5
lines changed

4 files changed

+282
-5
lines changed

cns/kubecontroller/nodenetworkconfig/reconciler.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424

2525
type cnsClient interface {
2626
CreateOrUpdateNetworkContainerInternal(*cns.CreateNetworkContainerRequest) cnstypes.ResponseCode
27+
MustEnsureNoStaleNCs(validNCIDs []string)
2728
}
2829

2930
type nodeNetworkConfigListener interface {
@@ -74,6 +75,15 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco
7475

7576
ipAssignments := 0
7677

78+
// during node upgrades, an nnc may be updated with new ncs. at any given time, only the ncs
79+
// that exist in the nnc are valid. any others that may have been previously created and no
80+
// longer exist in the nnc should be considered stale.
81+
validNCIDs := make([]string, len(nnc.Status.NetworkContainers))
82+
for i := range nnc.Status.NetworkContainers {
83+
validNCIDs[i] = nnc.Status.NetworkContainers[i].ID
84+
}
85+
r.cnscli.MustEnsureNoStaleNCs(validNCIDs)
86+
7787
// for each NC, parse it in to a CreateNCRequest and forward it to the appropriate Listener
7888
for i := range nnc.Status.NetworkContainers {
7989
// check if this NC matches the Node IP if we have one to check against

cns/kubecontroller/nodenetworkconfig/reconciler_test.go

Lines changed: 84 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ import (
1818
)
1919

2020
type cnsClientState struct {
21-
req *cns.CreateNetworkContainerRequest
22-
nnc *v1alpha.NodeNetworkConfig
21+
reqsByNCID map[string]*cns.CreateNetworkContainerRequest
22+
nnc *v1alpha.NodeNetworkConfig
2323
}
2424

2525
type mockCNSClient struct {
@@ -29,10 +29,23 @@ type mockCNSClient struct {
2929
}
3030

3131
func (m *mockCNSClient) CreateOrUpdateNetworkContainerInternal(req *cns.CreateNetworkContainerRequest) cnstypes.ResponseCode {
32-
m.state.req = req
32+
m.state.reqsByNCID[req.NetworkContainerid] = req
3333
return m.createOrUpdateNC(req)
3434
}
3535

36+
func (m *mockCNSClient) MustEnsureNoStaleNCs(validNCIDs []string) {
37+
valid := make(map[string]struct{})
38+
for _, ncID := range validNCIDs {
39+
valid[ncID] = struct{}{}
40+
}
41+
42+
for ncID := range m.state.reqsByNCID {
43+
if _, ok := valid[ncID]; !ok {
44+
delete(m.state.reqsByNCID, ncID)
45+
}
46+
}
47+
}
48+
3649
func (m *mockCNSClient) Update(nnc *v1alpha.NodeNetworkConfig) error {
3750
m.state.nnc = nnc
3851
return m.update(nnc)
@@ -112,7 +125,7 @@ func TestReconcile(t *testing.T) {
112125
},
113126
wantErr: true,
114127
wantCNSClientState: cnsClientState{
115-
req: validSwiftRequest,
128+
reqsByNCID: map[string]*cns.CreateNetworkContainerRequest{validSwiftRequest.NetworkContainerid: validSwiftRequest},
116129
},
117130
},
118131
{
@@ -137,7 +150,7 @@ func TestReconcile(t *testing.T) {
137150
},
138151
wantErr: false,
139152
wantCNSClientState: cnsClientState{
140-
req: validSwiftRequest,
153+
reqsByNCID: map[string]*cns.CreateNetworkContainerRequest{validSwiftRequest.NetworkContainerid: validSwiftRequest},
141154
nnc: &v1alpha.NodeNetworkConfig{
142155
Status: validSwiftStatus,
143156
Spec: v1alpha.NodeNetworkConfigSpec{
@@ -173,6 +186,11 @@ func TestReconcile(t *testing.T) {
173186
}
174187
for _, tt := range tests {
175188
tt := tt
189+
tt.cnsClient.state.reqsByNCID = make(map[string]*cns.CreateNetworkContainerRequest)
190+
if tt.wantCNSClientState.reqsByNCID == nil {
191+
tt.wantCNSClientState.reqsByNCID = make(map[string]*cns.CreateNetworkContainerRequest)
192+
}
193+
176194
t.Run(tt.name, func(t *testing.T) {
177195
r := NewReconciler(&tt.cnsClient, &tt.cnsClient, tt.nodeIP)
178196
r.nnccli = &tt.ncGetter
@@ -187,3 +205,64 @@ func TestReconcile(t *testing.T) {
187205
})
188206
}
189207
}
208+
209+
func TestReconcileStaleNCs(t *testing.T) {
210+
logger.InitLogger("", 0, 0, "")
211+
212+
cnsClient := mockCNSClient{
213+
state: cnsClientState{reqsByNCID: make(map[string]*cns.CreateNetworkContainerRequest)},
214+
createOrUpdateNC: func(*cns.CreateNetworkContainerRequest) cnstypes.ResponseCode { return cnstypes.Success },
215+
update: func(*v1alpha.NodeNetworkConfig) error { return nil },
216+
}
217+
218+
nodeIP := "10.0.0.10"
219+
220+
nncv1 := v1alpha.NodeNetworkConfig{
221+
Status: v1alpha.NodeNetworkConfigStatus{
222+
NetworkContainers: []v1alpha.NetworkContainer{
223+
{ID: "nc1", PrimaryIP: "10.1.0.10", SubnetAddressSpace: "10.1.0.0/24", NodeIP: nodeIP},
224+
{ID: "nc2", PrimaryIP: "10.1.0.11", SubnetAddressSpace: "10.1.0.0/24", NodeIP: nodeIP},
225+
},
226+
},
227+
Spec: v1alpha.NodeNetworkConfigSpec{RequestedIPCount: 10},
228+
}
229+
230+
nncv2 := v1alpha.NodeNetworkConfig{
231+
Status: v1alpha.NodeNetworkConfigStatus{
232+
NetworkContainers: []v1alpha.NetworkContainer{
233+
{ID: "nc3", PrimaryIP: "10.1.0.12", SubnetAddressSpace: "10.1.0.0/24", NodeIP: nodeIP},
234+
{ID: "nc4", PrimaryIP: "10.1.0.13", SubnetAddressSpace: "10.1.0.0/24", NodeIP: nodeIP},
235+
},
236+
},
237+
Spec: v1alpha.NodeNetworkConfigSpec{RequestedIPCount: 10},
238+
}
239+
240+
i := 0
241+
nncIterator := func(context.Context, types.NamespacedName) (*v1alpha.NodeNetworkConfig, error) {
242+
nncLog := []v1alpha.NodeNetworkConfig{nncv1, nncv2}
243+
for i < len(nncLog) {
244+
j := i
245+
i++
246+
return &nncLog[j], nil
247+
}
248+
249+
return &nncLog[len(nncLog)-1], nil
250+
}
251+
252+
r := NewReconciler(&cnsClient, &cnsClient, nodeIP)
253+
r.nnccli = &mockNCGetter{get: nncIterator}
254+
255+
_, err := r.Reconcile(context.Background(), reconcile.Request{})
256+
require.NoError(t, err)
257+
258+
assert.Contains(t, cnsClient.state.reqsByNCID, "nc1")
259+
assert.Contains(t, cnsClient.state.reqsByNCID, "nc2")
260+
261+
_, err = r.Reconcile(context.Background(), reconcile.Request{})
262+
require.NoError(t, err)
263+
264+
assert.NotContains(t, cnsClient.state.reqsByNCID, "nc1")
265+
assert.NotContains(t, cnsClient.state.reqsByNCID, "nc2")
266+
assert.Contains(t, cnsClient.state.reqsByNCID, "nc3")
267+
assert.Contains(t, cnsClient.state.reqsByNCID, "nc4")
268+
}

cns/restserver/internalapi.go

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,43 @@ func (service *HTTPRestService) DeleteNetworkContainerInternal(
362362
return types.Success
363363
}
364364

365+
func (service *HTTPRestService) MustEnsureNoStaleNCs(validNCIDs []string) {
366+
valid := make(map[string]struct{})
367+
for _, ncID := range validNCIDs {
368+
valid[ncID] = struct{}{}
369+
}
370+
371+
service.Lock()
372+
defer service.Unlock()
373+
374+
ncIDToAssignedIPs := make(map[string][]cns.IPConfigurationStatus)
375+
for _, ipInfo := range service.PodIPConfigState { // nolint:gocritic // copy is fine; it's a larger change to modify the map to hold pointers
376+
if ipInfo.GetState() == types.Assigned {
377+
ncIDToAssignedIPs[ipInfo.NCID] = append(ncIDToAssignedIPs[ipInfo.NCID], ipInfo)
378+
}
379+
}
380+
381+
mutated := false
382+
for ncID := range service.state.ContainerStatus {
383+
if _, ok := valid[ncID]; !ok {
384+
// stale NCs with assigned IPs are an unexpected CNS state which we need to alert on.
385+
if assignedIPs, hasAssignedIPs := ncIDToAssignedIPs[ncID]; hasAssignedIPs {
386+
msg := fmt.Sprintf("Unexpected state: found stale NC ID %s in CNS state with %d assigned IPs: %+v", ncID, len(assignedIPs), assignedIPs)
387+
logger.Errorf(msg)
388+
panic(msg)
389+
}
390+
391+
logger.Errorf("[Azure CNS] Found stale NC ID %s in CNS state. Removing...", ncID)
392+
delete(service.state.ContainerStatus, ncID)
393+
mutated = true
394+
}
395+
}
396+
397+
if mutated {
398+
_ = service.saveState()
399+
}
400+
}
401+
365402
// This API will be called by CNS RequestController on CRD update.
366403
func (service *HTTPRestService) CreateOrUpdateNetworkContainerInternal(req *cns.CreateNetworkContainerRequest) types.ResponseCode {
367404
if req.NetworkContainerid == "" {

cns/restserver/internalapi_test.go

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"github.com/Azure/azure-container-networking/cns/types"
2020
"github.com/Azure/azure-container-networking/crd/nodenetworkconfig/api/v1alpha"
2121
nma "github.com/Azure/azure-container-networking/nmagent"
22+
"github.com/Azure/azure-container-networking/store"
2223
"github.com/google/uuid"
2324
"github.com/pkg/errors"
2425
"github.com/stretchr/testify/assert"
@@ -1065,3 +1066,153 @@ func TestCNIConflistGenerationOnNMAError(t *testing.T) {
10651066
time.Sleep(time.Second)
10661067
assert.Equal(t, 1, mockgen.getGeneratedCount())
10671068
}
1069+
1070+
func TestMustEnsureNoStaleNCs(t *testing.T) {
1071+
tests := []struct {
1072+
name string
1073+
storedNCs map[string]containerstatus
1074+
ipStates map[string]cns.IPConfigurationStatus
1075+
ipStatesOverrideFunc func(map[string]cns.IPConfigurationStatus) // defaults to available
1076+
ncsFromReconcile []string
1077+
expectedRemovedNCs []string
1078+
}{
1079+
{
1080+
name: "normal reconcile, single nc",
1081+
storedNCs: map[string]containerstatus{
1082+
"nc1": {},
1083+
},
1084+
ipStates: map[string]cns.IPConfigurationStatus{
1085+
"aaaa": {IPAddress: "10.0.0.10", NCID: "nc1"},
1086+
},
1087+
ipStatesOverrideFunc: func(ipStates map[string]cns.IPConfigurationStatus) {
1088+
for k, is := range ipStates {
1089+
is.SetState(types.Assigned)
1090+
ipStates[k] = is
1091+
}
1092+
},
1093+
ncsFromReconcile: []string{"nc1"},
1094+
expectedRemovedNCs: []string{},
1095+
},
1096+
{
1097+
name: "normal reconcile, dual stack ncs",
1098+
storedNCs: map[string]containerstatus{
1099+
"nc1": {},
1100+
"nc2": {},
1101+
},
1102+
ipStates: map[string]cns.IPConfigurationStatus{
1103+
"aaaa": {IPAddress: "10.0.0.10", NCID: "nc1"},
1104+
"aabb": {IPAddress: "fe80::1ff:fe23:4567:890a", NCID: "nc2"},
1105+
},
1106+
ipStatesOverrideFunc: func(ipStates map[string]cns.IPConfigurationStatus) {
1107+
for k, is := range ipStates {
1108+
is.SetState(types.Assigned)
1109+
ipStates[k] = is
1110+
}
1111+
},
1112+
ncsFromReconcile: []string{"nc1", "nc2"},
1113+
expectedRemovedNCs: []string{},
1114+
},
1115+
{
1116+
name: "stale nc",
1117+
storedNCs: map[string]containerstatus{
1118+
"nc1": {},
1119+
},
1120+
ipStates: map[string]cns.IPConfigurationStatus{
1121+
"aaaa": {IPAddress: "10.0.0.10", NCID: "nc1"},
1122+
},
1123+
ncsFromReconcile: []string{"nc2"},
1124+
expectedRemovedNCs: []string{"nc1"},
1125+
},
1126+
{
1127+
name: "stale dual stack ncs",
1128+
storedNCs: map[string]containerstatus{
1129+
"nc1": {},
1130+
"nc2": {},
1131+
},
1132+
ipStates: map[string]cns.IPConfigurationStatus{
1133+
"aaaa": {IPAddress: "10.0.0.10", NCID: "nc1"},
1134+
"aabb": {IPAddress: "fe80::1ff:fe23:4567:890a", NCID: "nc2"},
1135+
},
1136+
ncsFromReconcile: []string{"nc3", "nc4"},
1137+
expectedRemovedNCs: []string{"nc1", "nc2"},
1138+
},
1139+
{
1140+
name: "stale ncs, ips in flight",
1141+
storedNCs: map[string]containerstatus{
1142+
"nc1": {},
1143+
"nc2": {},
1144+
},
1145+
ipStates: map[string]cns.IPConfigurationStatus{
1146+
"aaaa": {IPAddress: "10.0.0.10", NCID: "nc1"},
1147+
"aabb": {IPAddress: "fe80::1ff:fe23:4567:890a", NCID: "nc2"},
1148+
},
1149+
ipStatesOverrideFunc: func(m map[string]cns.IPConfigurationStatus) {
1150+
ip1 := m["aaaa"]
1151+
ip1.SetState(types.PendingRelease)
1152+
m["aaaa"] = ip1
1153+
1154+
ip2 := m["aabb"]
1155+
ip2.SetState(types.PendingProgramming)
1156+
m["aabb"] = ip2
1157+
},
1158+
ncsFromReconcile: []string{"nc3", "nc4"},
1159+
expectedRemovedNCs: []string{"nc1", "nc2"},
1160+
},
1161+
}
1162+
1163+
for _, tt := range tests {
1164+
tt := tt
1165+
t.Run(tt.name, func(t *testing.T) {
1166+
if tt.ipStatesOverrideFunc != nil {
1167+
tt.ipStatesOverrideFunc(tt.ipStates)
1168+
} else {
1169+
for k, is := range tt.ipStates {
1170+
is.SetState(types.Available)
1171+
tt.ipStates[k] = is
1172+
}
1173+
}
1174+
1175+
svc := HTTPRestService{
1176+
store: store.NewMockStore("foo"),
1177+
state: &httpRestServiceState{ContainerStatus: tt.storedNCs},
1178+
PodIPConfigState: tt.ipStates,
1179+
}
1180+
1181+
require.NotPanics(t, func() {
1182+
svc.MustEnsureNoStaleNCs(tt.ncsFromReconcile)
1183+
})
1184+
1185+
for _, expectedRemovedNCID := range tt.expectedRemovedNCs {
1186+
assert.NotContains(t, svc.state.ContainerStatus, expectedRemovedNCID)
1187+
}
1188+
})
1189+
}
1190+
}
1191+
1192+
func TestMustEnsureNoStaleNCs_PanicsWhenIPsFromStaleNCAreAssigned(t *testing.T) {
1193+
staleNC1 := "nc1"
1194+
staleNC2 := "nc2"
1195+
1196+
ncs := map[string]containerstatus{staleNC1: {}, staleNC2: {}}
1197+
1198+
ipStates := map[string]cns.IPConfigurationStatus{
1199+
"aaaa": {IPAddress: "10.0.0.10", NCID: staleNC1},
1200+
"aabb": {IPAddress: "10.0.0.11", NCID: staleNC1},
1201+
"aacc": {IPAddress: "10.0.0.12", NCID: staleNC2},
1202+
"aadd": {IPAddress: "10.0.0.13", NCID: staleNC2},
1203+
}
1204+
for k, is := range ipStates {
1205+
is.SetState(types.Assigned)
1206+
ipStates[k] = is
1207+
}
1208+
1209+
svc := HTTPRestService{
1210+
store: store.NewMockStore("foo"),
1211+
state: &httpRestServiceState{ContainerStatus: ncs},
1212+
PodIPConfigState: ipStates,
1213+
}
1214+
1215+
require.Panics(t, func() {
1216+
svc.MustEnsureNoStaleNCs([]string{"nc3", "nc4"})
1217+
})
1218+
}

0 commit comments

Comments
 (0)