Skip to content

Commit 666f36c

Browse files
CNS - Ensuring no stale NCs during NNC reconcile (#2192)
* ensuring no stale ncs during nnc reconcile * only save state if mutated * ensuring we only remove stale ncs if none of their ips are assigned
1 parent 1360e02 commit 666f36c

File tree

4 files changed

+282
-5
lines changed

4 files changed

+282
-5
lines changed

cns/kubecontroller/nodenetworkconfig/reconciler.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424

2525
type cnsClient interface {
2626
CreateOrUpdateNetworkContainerInternal(*cns.CreateNetworkContainerRequest) cnstypes.ResponseCode
27+
MustEnsureNoStaleNCs(validNCIDs []string)
2728
}
2829

2930
type nodeNetworkConfigListener interface {
@@ -74,6 +75,15 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco
7475

7576
ipAssignments := 0
7677

78+
// during node upgrades, an nnc may be updated with new ncs. at any given time, only the ncs
79+
// that exist in the nnc are valid. any others that may have been previously created and no
80+
// longer exist in the nnc should be considered stale.
81+
validNCIDs := make([]string, len(nnc.Status.NetworkContainers))
82+
for i := range nnc.Status.NetworkContainers {
83+
validNCIDs[i] = nnc.Status.NetworkContainers[i].ID
84+
}
85+
r.cnscli.MustEnsureNoStaleNCs(validNCIDs)
86+
7787
// for each NC, parse it in to a CreateNCRequest and forward it to the appropriate Listener
7888
for i := range nnc.Status.NetworkContainers {
7989
// check if this NC matches the Node IP if we have one to check against

cns/kubecontroller/nodenetworkconfig/reconciler_test.go

Lines changed: 84 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ import (
1818
)
1919

2020
type cnsClientState struct {
21-
req *cns.CreateNetworkContainerRequest
22-
nnc *v1alpha.NodeNetworkConfig
21+
reqsByNCID map[string]*cns.CreateNetworkContainerRequest
22+
nnc *v1alpha.NodeNetworkConfig
2323
}
2424

2525
type mockCNSClient struct {
@@ -29,10 +29,23 @@ type mockCNSClient struct {
2929
}
3030

3131
func (m *mockCNSClient) CreateOrUpdateNetworkContainerInternal(req *cns.CreateNetworkContainerRequest) cnstypes.ResponseCode {
32-
m.state.req = req
32+
m.state.reqsByNCID[req.NetworkContainerid] = req
3333
return m.createOrUpdateNC(req)
3434
}
3535

36+
func (m *mockCNSClient) MustEnsureNoStaleNCs(validNCIDs []string) {
37+
valid := make(map[string]struct{})
38+
for _, ncID := range validNCIDs {
39+
valid[ncID] = struct{}{}
40+
}
41+
42+
for ncID := range m.state.reqsByNCID {
43+
if _, ok := valid[ncID]; !ok {
44+
delete(m.state.reqsByNCID, ncID)
45+
}
46+
}
47+
}
48+
3649
func (m *mockCNSClient) Update(nnc *v1alpha.NodeNetworkConfig) error {
3750
m.state.nnc = nnc
3851
return m.update(nnc)
@@ -112,7 +125,7 @@ func TestReconcile(t *testing.T) {
112125
},
113126
wantErr: true,
114127
wantCNSClientState: cnsClientState{
115-
req: validSwiftRequest,
128+
reqsByNCID: map[string]*cns.CreateNetworkContainerRequest{validSwiftRequest.NetworkContainerid: validSwiftRequest},
116129
},
117130
},
118131
{
@@ -137,7 +150,7 @@ func TestReconcile(t *testing.T) {
137150
},
138151
wantErr: false,
139152
wantCNSClientState: cnsClientState{
140-
req: validSwiftRequest,
153+
reqsByNCID: map[string]*cns.CreateNetworkContainerRequest{validSwiftRequest.NetworkContainerid: validSwiftRequest},
141154
nnc: &v1alpha.NodeNetworkConfig{
142155
Status: validSwiftStatus,
143156
Spec: v1alpha.NodeNetworkConfigSpec{
@@ -173,6 +186,11 @@ func TestReconcile(t *testing.T) {
173186
}
174187
for _, tt := range tests {
175188
tt := tt
189+
tt.cnsClient.state.reqsByNCID = make(map[string]*cns.CreateNetworkContainerRequest)
190+
if tt.wantCNSClientState.reqsByNCID == nil {
191+
tt.wantCNSClientState.reqsByNCID = make(map[string]*cns.CreateNetworkContainerRequest)
192+
}
193+
176194
t.Run(tt.name, func(t *testing.T) {
177195
r := NewReconciler(&tt.cnsClient, &tt.cnsClient, tt.nodeIP)
178196
r.nnccli = &tt.ncGetter
@@ -187,3 +205,64 @@ func TestReconcile(t *testing.T) {
187205
})
188206
}
189207
}
208+
209+
func TestReconcileStaleNCs(t *testing.T) {
210+
logger.InitLogger("", 0, 0, "")
211+
212+
cnsClient := mockCNSClient{
213+
state: cnsClientState{reqsByNCID: make(map[string]*cns.CreateNetworkContainerRequest)},
214+
createOrUpdateNC: func(*cns.CreateNetworkContainerRequest) cnstypes.ResponseCode { return cnstypes.Success },
215+
update: func(*v1alpha.NodeNetworkConfig) error { return nil },
216+
}
217+
218+
nodeIP := "10.0.0.10"
219+
220+
nncv1 := v1alpha.NodeNetworkConfig{
221+
Status: v1alpha.NodeNetworkConfigStatus{
222+
NetworkContainers: []v1alpha.NetworkContainer{
223+
{ID: "nc1", PrimaryIP: "10.1.0.10", SubnetAddressSpace: "10.1.0.0/24", NodeIP: nodeIP},
224+
{ID: "nc2", PrimaryIP: "10.1.0.11", SubnetAddressSpace: "10.1.0.0/24", NodeIP: nodeIP},
225+
},
226+
},
227+
Spec: v1alpha.NodeNetworkConfigSpec{RequestedIPCount: 10},
228+
}
229+
230+
nncv2 := v1alpha.NodeNetworkConfig{
231+
Status: v1alpha.NodeNetworkConfigStatus{
232+
NetworkContainers: []v1alpha.NetworkContainer{
233+
{ID: "nc3", PrimaryIP: "10.1.0.12", SubnetAddressSpace: "10.1.0.0/24", NodeIP: nodeIP},
234+
{ID: "nc4", PrimaryIP: "10.1.0.13", SubnetAddressSpace: "10.1.0.0/24", NodeIP: nodeIP},
235+
},
236+
},
237+
Spec: v1alpha.NodeNetworkConfigSpec{RequestedIPCount: 10},
238+
}
239+
240+
i := 0
241+
nncIterator := func(context.Context, types.NamespacedName) (*v1alpha.NodeNetworkConfig, error) {
242+
nncLog := []v1alpha.NodeNetworkConfig{nncv1, nncv2}
243+
for i < len(nncLog) {
244+
j := i
245+
i++
246+
return &nncLog[j], nil
247+
}
248+
249+
return &nncLog[len(nncLog)-1], nil
250+
}
251+
252+
r := NewReconciler(&cnsClient, &cnsClient, nodeIP)
253+
r.nnccli = &mockNCGetter{get: nncIterator}
254+
255+
_, err := r.Reconcile(context.Background(), reconcile.Request{})
256+
require.NoError(t, err)
257+
258+
assert.Contains(t, cnsClient.state.reqsByNCID, "nc1")
259+
assert.Contains(t, cnsClient.state.reqsByNCID, "nc2")
260+
261+
_, err = r.Reconcile(context.Background(), reconcile.Request{})
262+
require.NoError(t, err)
263+
264+
assert.NotContains(t, cnsClient.state.reqsByNCID, "nc1")
265+
assert.NotContains(t, cnsClient.state.reqsByNCID, "nc2")
266+
assert.Contains(t, cnsClient.state.reqsByNCID, "nc3")
267+
assert.Contains(t, cnsClient.state.reqsByNCID, "nc4")
268+
}

cns/restserver/internalapi.go

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -475,6 +475,43 @@ func (service *HTTPRestService) DeleteNetworkContainerInternal(
475475
return types.Success
476476
}
477477

478+
func (service *HTTPRestService) MustEnsureNoStaleNCs(validNCIDs []string) {
479+
valid := make(map[string]struct{})
480+
for _, ncID := range validNCIDs {
481+
valid[ncID] = struct{}{}
482+
}
483+
484+
service.Lock()
485+
defer service.Unlock()
486+
487+
ncIDToAssignedIPs := make(map[string][]cns.IPConfigurationStatus)
488+
for _, ipInfo := range service.PodIPConfigState { // nolint:gocritic // copy is fine; it's a larger change to modify the map to hold pointers
489+
if ipInfo.GetState() == types.Assigned {
490+
ncIDToAssignedIPs[ipInfo.NCID] = append(ncIDToAssignedIPs[ipInfo.NCID], ipInfo)
491+
}
492+
}
493+
494+
mutated := false
495+
for ncID := range service.state.ContainerStatus {
496+
if _, ok := valid[ncID]; !ok {
497+
// stale NCs with assigned IPs are an unexpected CNS state which we need to alert on.
498+
if assignedIPs, hasAssignedIPs := ncIDToAssignedIPs[ncID]; hasAssignedIPs {
499+
msg := fmt.Sprintf("Unexpected state: found stale NC ID %s in CNS state with %d assigned IPs: %+v", ncID, len(assignedIPs), assignedIPs)
500+
logger.Errorf(msg)
501+
panic(msg)
502+
}
503+
504+
logger.Errorf("[Azure CNS] Found stale NC ID %s in CNS state. Removing...", ncID)
505+
delete(service.state.ContainerStatus, ncID)
506+
mutated = true
507+
}
508+
}
509+
510+
if mutated {
511+
_ = service.saveState()
512+
}
513+
}
514+
478515
// This API will be called by CNS RequestController on CRD update.
479516
func (service *HTTPRestService) CreateOrUpdateNetworkContainerInternal(req *cns.CreateNetworkContainerRequest) types.ResponseCode {
480517
if req.NetworkContainerid == "" {

cns/restserver/internalapi_test.go

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"github.com/Azure/azure-container-networking/cns/types"
2121
"github.com/Azure/azure-container-networking/crd/nodenetworkconfig/api/v1alpha"
2222
nma "github.com/Azure/azure-container-networking/nmagent"
23+
"github.com/Azure/azure-container-networking/store"
2324
"github.com/google/uuid"
2425
"github.com/pkg/errors"
2526
"github.com/stretchr/testify/assert"
@@ -1275,3 +1276,153 @@ func TestCNIConflistGenerationOnNMAError(t *testing.T) {
12751276
time.Sleep(time.Second)
12761277
assert.Equal(t, 1, mockgen.getGeneratedCount())
12771278
}
1279+
1280+
func TestMustEnsureNoStaleNCs(t *testing.T) {
1281+
tests := []struct {
1282+
name string
1283+
storedNCs map[string]containerstatus
1284+
ipStates map[string]cns.IPConfigurationStatus
1285+
ipStatesOverrideFunc func(map[string]cns.IPConfigurationStatus) // defaults to available
1286+
ncsFromReconcile []string
1287+
expectedRemovedNCs []string
1288+
}{
1289+
{
1290+
name: "normal reconcile, single nc",
1291+
storedNCs: map[string]containerstatus{
1292+
"nc1": {},
1293+
},
1294+
ipStates: map[string]cns.IPConfigurationStatus{
1295+
"aaaa": {IPAddress: "10.0.0.10", NCID: "nc1"},
1296+
},
1297+
ipStatesOverrideFunc: func(ipStates map[string]cns.IPConfigurationStatus) {
1298+
for k, is := range ipStates {
1299+
is.SetState(types.Assigned)
1300+
ipStates[k] = is
1301+
}
1302+
},
1303+
ncsFromReconcile: []string{"nc1"},
1304+
expectedRemovedNCs: []string{},
1305+
},
1306+
{
1307+
name: "normal reconcile, dual stack ncs",
1308+
storedNCs: map[string]containerstatus{
1309+
"nc1": {},
1310+
"nc2": {},
1311+
},
1312+
ipStates: map[string]cns.IPConfigurationStatus{
1313+
"aaaa": {IPAddress: "10.0.0.10", NCID: "nc1"},
1314+
"aabb": {IPAddress: "fe80::1ff:fe23:4567:890a", NCID: "nc2"},
1315+
},
1316+
ipStatesOverrideFunc: func(ipStates map[string]cns.IPConfigurationStatus) {
1317+
for k, is := range ipStates {
1318+
is.SetState(types.Assigned)
1319+
ipStates[k] = is
1320+
}
1321+
},
1322+
ncsFromReconcile: []string{"nc1", "nc2"},
1323+
expectedRemovedNCs: []string{},
1324+
},
1325+
{
1326+
name: "stale nc",
1327+
storedNCs: map[string]containerstatus{
1328+
"nc1": {},
1329+
},
1330+
ipStates: map[string]cns.IPConfigurationStatus{
1331+
"aaaa": {IPAddress: "10.0.0.10", NCID: "nc1"},
1332+
},
1333+
ncsFromReconcile: []string{"nc2"},
1334+
expectedRemovedNCs: []string{"nc1"},
1335+
},
1336+
{
1337+
name: "stale dual stack ncs",
1338+
storedNCs: map[string]containerstatus{
1339+
"nc1": {},
1340+
"nc2": {},
1341+
},
1342+
ipStates: map[string]cns.IPConfigurationStatus{
1343+
"aaaa": {IPAddress: "10.0.0.10", NCID: "nc1"},
1344+
"aabb": {IPAddress: "fe80::1ff:fe23:4567:890a", NCID: "nc2"},
1345+
},
1346+
ncsFromReconcile: []string{"nc3", "nc4"},
1347+
expectedRemovedNCs: []string{"nc1", "nc2"},
1348+
},
1349+
{
1350+
name: "stale ncs, ips in flight",
1351+
storedNCs: map[string]containerstatus{
1352+
"nc1": {},
1353+
"nc2": {},
1354+
},
1355+
ipStates: map[string]cns.IPConfigurationStatus{
1356+
"aaaa": {IPAddress: "10.0.0.10", NCID: "nc1"},
1357+
"aabb": {IPAddress: "fe80::1ff:fe23:4567:890a", NCID: "nc2"},
1358+
},
1359+
ipStatesOverrideFunc: func(m map[string]cns.IPConfigurationStatus) {
1360+
ip1 := m["aaaa"]
1361+
ip1.SetState(types.PendingRelease)
1362+
m["aaaa"] = ip1
1363+
1364+
ip2 := m["aabb"]
1365+
ip2.SetState(types.PendingProgramming)
1366+
m["aabb"] = ip2
1367+
},
1368+
ncsFromReconcile: []string{"nc3", "nc4"},
1369+
expectedRemovedNCs: []string{"nc1", "nc2"},
1370+
},
1371+
}
1372+
1373+
for _, tt := range tests {
1374+
tt := tt
1375+
t.Run(tt.name, func(t *testing.T) {
1376+
if tt.ipStatesOverrideFunc != nil {
1377+
tt.ipStatesOverrideFunc(tt.ipStates)
1378+
} else {
1379+
for k, is := range tt.ipStates {
1380+
is.SetState(types.Available)
1381+
tt.ipStates[k] = is
1382+
}
1383+
}
1384+
1385+
svc := HTTPRestService{
1386+
store: store.NewMockStore("foo"),
1387+
state: &httpRestServiceState{ContainerStatus: tt.storedNCs},
1388+
PodIPConfigState: tt.ipStates,
1389+
}
1390+
1391+
require.NotPanics(t, func() {
1392+
svc.MustEnsureNoStaleNCs(tt.ncsFromReconcile)
1393+
})
1394+
1395+
for _, expectedRemovedNCID := range tt.expectedRemovedNCs {
1396+
assert.NotContains(t, svc.state.ContainerStatus, expectedRemovedNCID)
1397+
}
1398+
})
1399+
}
1400+
}
1401+
1402+
func TestMustEnsureNoStaleNCs_PanicsWhenIPsFromStaleNCAreAssigned(t *testing.T) {
1403+
staleNC1 := "nc1"
1404+
staleNC2 := "nc2"
1405+
1406+
ncs := map[string]containerstatus{staleNC1: {}, staleNC2: {}}
1407+
1408+
ipStates := map[string]cns.IPConfigurationStatus{
1409+
"aaaa": {IPAddress: "10.0.0.10", NCID: staleNC1},
1410+
"aabb": {IPAddress: "10.0.0.11", NCID: staleNC1},
1411+
"aacc": {IPAddress: "10.0.0.12", NCID: staleNC2},
1412+
"aadd": {IPAddress: "10.0.0.13", NCID: staleNC2},
1413+
}
1414+
for k, is := range ipStates {
1415+
is.SetState(types.Assigned)
1416+
ipStates[k] = is
1417+
}
1418+
1419+
svc := HTTPRestService{
1420+
store: store.NewMockStore("foo"),
1421+
state: &httpRestServiceState{ContainerStatus: ncs},
1422+
PodIPConfigState: ipStates,
1423+
}
1424+
1425+
require.Panics(t, func() {
1426+
svc.MustEnsureNoStaleNCs([]string{"nc3", "nc4"})
1427+
})
1428+
}

0 commit comments

Comments
 (0)