Skip to content

Commit 764d89e

Browse files
authored
Swift v2 CNS design (#2093)
* draft cns swiftv2 changes Signed-off-by: Evan Baker <[email protected]> * revisions per design review Signed-off-by: Evan Baker <[email protected]> --------- Signed-off-by: Evan Baker <[email protected]>
1 parent 14b916b commit 764d89e

File tree

2 files changed

+262
-0
lines changed

2 files changed

+262
-0
lines changed
File renamed without changes.

docs/feature/swift-v2/cns.md

Lines changed: 262 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,262 @@
1+
# Swift V2 CNS implementation
2+
3+
## Introduction
4+
Swift V2 is a multitenant scenario where CNS handles IPAM for both a host cluster (2P subnet) and CX Pods (3P subnet). The CX pods are networked via secondary NICs and CNS IPAM maps those interfaces to those pods. CNI queries CNS during CX netns bring-up and is told which interface to inject into the CX pod.
5+
6+
The control boundaries for CNS are its CRDs and IPAM API which the CNI communicates over.
7+
CRDs are:
8+
- PodNetwork (represents a subnet delegation/registration)
9+
- PodNetworkInstance (represents a reservation set)
10+
- NodeInfo (used to pass VM Unique ID from CNS to DNC-RC)
11+
- MultiTenantPodNetworkConfig (represents a NetworkContainer)
12+
13+
The IPAM APIs are:
14+
- requestIPAddress
15+
- releaseIPAddress
16+
17+
#### Requirements/Notes
18+
- CNS must run with ManagedEndpointState in the MTv2 Scenario.
19+
- CNS must wait until the MT interface is programmed before responding to CNI requests with the MT interface IP.
20+
21+
22+
### Implementation
23+
24+
#### NodeInfo
25+
26+
At startup, CNS will detect that it is running in V2 mode based on Node labels.
27+
28+
CNS will check for the existence of the `NodeInfo` CRD and will create an instance of a `NodeInfo` associated with its current Node. This CRD is used to pass the VM Unique ID from CNS to DNC-RC. The CRD will have an OwnerReference set to the Kubernetes Node object, so that it is automatically GC'd when the Node is deleted.
29+
30+
31+
```mermaid
32+
sequenceDiagram
33+
participant Kubernetes
34+
participant CNS
35+
CNS->>+Kubernetes: Get current Node
36+
Kubernetes->>-CNS: Node Object
37+
Note over CNS: Build NodeInfo CRD
38+
CNS->>+Kubernetes: Create NodeInfo CRD
39+
```
40+
41+
```yaml
42+
apiVersion: acn.azure.com/v1alpha1
43+
kind: NodeInfo
44+
metadata:
45+
name: <node-name>
46+
ownerReferences:
47+
- apiVersion: v1
48+
blockOwnerDeletion: false
49+
kind: Node
50+
name: <node-name>
51+
uid: <node-uid>
52+
status:
53+
vmUniqueID: <vm-unique-id>
54+
```
55+
56+
#### Pod Network
57+
58+
CNS will watch Pods on its Node via controller-runtime. Controller-runtime will locally cache all known Pods and CNS will simply `GET` using a cache-aware client to read the latest Pod state.
59+
60+
When it receives an IP Request for a Pod, it will look up the Pod to determine if it is a V2 Multitenant Pod. If it is, CNS will cross-reference the Pod with a MultiTenantPodNetworkConfig (waiting for the controlplane to create the MTPNC if necessary).
61+
62+
63+
```yaml
64+
apiVersion: acn.azure.com/v1alpha1
65+
kind: MultiTenantPodNetworkConfig
66+
metadata:
67+
name: <pod-name>
68+
namespace: <pod-namespace>
69+
ownerReferences:
70+
- apiVersion: v1
71+
controller: true
72+
kind: Pod
73+
name: <pod-name>
74+
uid: <pod-uid>
75+
spec:
76+
podNetwork: <podnetwork-name>
77+
[podNetworkInstance: <podnetworkinstance-name>]
78+
status:
79+
uuid: <uuid>
80+
primaryIP: <primary-ip>
81+
macAddress: <mac-address>
82+
status: [provisioned]
83+
```
84+
85+
86+
<details>
87+
<summary>PodWatcher Example Implementation</summary>
88+
89+
```go
90+
package podwatcher
91+
92+
import (
93+
"context"
94+
95+
"github.com/pkg/errors"
96+
v1 "k8s.io/api/core/v1"
97+
"k8s.io/apimachinery/pkg/fields"
98+
ctrl "sigs.k8s.io/controller-runtime"
99+
"sigs.k8s.io/controller-runtime/pkg/client"
100+
"sigs.k8s.io/controller-runtime/pkg/event"
101+
"sigs.k8s.io/controller-runtime/pkg/predicate"
102+
"sigs.k8s.io/controller-runtime/pkg/reconcile"
103+
)
104+
105+
type podcli interface {
106+
List(ctx context.Context, list client.ObjectList, opts ...client.ListOption) error
107+
}
108+
109+
type podWatcher struct {
110+
cli podcli
111+
sink chan<- []v1.Pod
112+
listOpt client.ListOption
113+
}
114+
115+
func New(nodename string, podsink chan<- []v1.Pod) *podWatcher {
116+
return &podWatcher{
117+
sink: podsink,
118+
listOpt: &client.ListOptions{FieldSelector: fields.SelectorFromSet(fields.Set{"spec.nodeName": nodename})},
119+
}
120+
}
121+
122+
func (p *podWatcher) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
123+
podList := &v1.PodList{}
124+
if err := p.cli.List(ctx, podList, p.listOpt); err != nil {
125+
return reconcile.Result{}, errors.Wrap(err, "failed to list pods")
126+
}
127+
p.sink <- podList.Items
128+
return reconcile.Result{}, nil
129+
}
130+
131+
// SetupWithManager Sets up the reconciler with a new manager.
132+
func (p *podWatcher) SetupWithManager(mgr ctrl.Manager) error {
133+
p.cli = mgr.GetClient()
134+
err := ctrl.NewControllerManagedBy(mgr).
135+
For(&v1.Pod{}).
136+
WithEventFilter(predicate.Funcs{
137+
// ignore Status only changes - they don't update the generation
138+
UpdateFunc: func(ue event.UpdateEvent) bool {
139+
return ue.ObjectOld.GetGeneration() != ue.ObjectNew.GetGeneration()
140+
},
141+
}).
142+
Complete(p)
143+
if err != nil {
144+
return errors.Wrap(err, "failed to set up pod watcher with manager")
145+
}
146+
return nil
147+
}
148+
```
149+
150+
</details>
151+
152+
#### Changes to RequestIPAddress
153+
154+
CNS needs context about the Pod (labels) to be able to make IPAM decisions when a RequestIPAddress call is received.
155+
156+
The RequestIPAddress handler chain will be modified so that the Request is enriched as it comes in based on cached Pod data. If the Request is for a V2 Pod, the Response will contain the IP assignment from the MTPNC. If the Request is for an infrastructure Pod, it will be handled by the normal IPAM flow. If the request is for an unknown Pod (not in cache), we will return an error and let the CNI retry.
157+
158+
```mermaid
159+
sequenceDiagram
160+
participant Kubernetes
161+
participant CNS
162+
participant CNI
163+
loop
164+
Kubernetes->>CNS: Pod Create Event
165+
Note right of CNS: Pod cache updated
166+
end
167+
CNI->>+CNS: RequestIPAddress
168+
alt Known Pod
169+
alt V2 Pod
170+
CNS->>CNI: Assign IP from MTPNC
171+
else V1 Pod
172+
CNS->>CNI: Assign IP from IPAM
173+
end
174+
else Unknown Pod
175+
CNS->>-CNI: Retry
176+
end
177+
178+
```
179+
180+
181+
<details>
182+
<summary>HTTPRestService</summary>
183+
184+
```diff
185+
+type IPConfigValidator func(ipConfigsRequest cns.IPConfigsRequest) (cns.PodInfo, types.ResponseCode, string)
186+
187+
type HTTPRestService struct {
188+
*cns.Service
189+
dockerClient *dockerclient.Client
190+
wscli interfaceGetter
191+
ipamClient *ipamclient.IpamClient
192+
nma nmagentClient
193+
wsproxy wireserverProxy
194+
homeAzMonitor *HomeAzMonitor
195+
networkContainer *networkcontainers.NetworkContainers
196+
PodIPIDByPodInterfaceKey map[string][]string // PodInterfaceId is key and value is slice of Pod IP (SecondaryIP) uuids.
197+
PodIPConfigState map[string]cns.IPConfigurationStatus // Secondary IP ID(uuid) is key
198+
IPAMPoolMonitor cns.IPAMPoolMonitor
199+
routingTable *routes.RoutingTable
200+
store store.KeyValueStore
201+
state *httpRestServiceState
202+
podsPendingIPAssignment *bounded.TimedSet
203+
sync.RWMutex
204+
dncPartitionKey string
205+
EndpointState map[string]*EndpointInfo // key : container id
206+
EndpointStateStore store.KeyValueStore
207+
cniConflistGenerator CNIConflistGenerator
208+
generateCNIConflistOnce sync.Once
209+
+ ipConfigsValidators []IPConfigValidator
210+
+
211+
}
212+
```
213+
214+
</details>
215+
216+
217+
<details>
218+
<summary>requestIPConfigHandlerHelper</summary>
219+
220+
```go
221+
// requestIPConfigHandlerHelper validates the request, assigns IPs, and returns a response
222+
func (service *HTTPRestService) requestIPConfigHandlerHelper(ipconfigsRequest cns.IPConfigsRequest) (*cns.IPConfigsResponse, error) {
223+
podInfo, returnCode, returnMessage := service.validateIPConfigsRequest(ipconfigsRequest)
224+
if returnCode != types.Success {
225+
return &cns.IPConfigsResponse{
226+
Response: cns.Response{
227+
ReturnCode: returnCode,
228+
Message: returnMessage,
229+
},
230+
}, errors.New("failed to validate ip config request")
231+
}
232+
233+
//[...]
234+
}
235+
```
236+
237+
</details>
238+
239+
240+
<details>
241+
<summary> validateIPConfigsRequest </summary>
242+
243+
```go
244+
func (service *HTTPRestService) v2KnownPodValidator(ipConfigsRequest cns.IPConfigsRequest) (cns.PodInfo, types.ResponseCode, string) {
245+
pod, ok := service.knownPods[ipConfigsRequest.OrchestratorContext.NamespacedName]
246+
if !ok {
247+
return UnknownPodError
248+
}
249+
return [...]
250+
}
251+
252+
253+
func (service *HTTPRestService) validateIPConfigsRequest(ipConfigsRequest cns.IPConfigsRequest) (cns.PodInfo, types.ResponseCode, string) {
254+
for _, v := range service.ipConfigsValidators {
255+
if err := v(ipConfigsRequest); err != nil {
256+
// todo handle error
257+
}
258+
}
259+
}
260+
```
261+
262+
</details>

0 commit comments

Comments
 (0)