diff --git a/cns/healthserver/healthz.go b/cns/healthserver/healthz.go
new file mode 100644
index 0000000000..45bc2f303c
--- /dev/null
+++ b/cns/healthserver/healthz.go
@@ -0,0 +1,66 @@
+package healthserver
+
+import (
+	"net/http"
+
+	"github.com/Azure/azure-container-networking/cns"
+	"github.com/Azure/azure-container-networking/cns/configuration"
+	"github.com/Azure/azure-container-networking/crd/nodenetworkconfig/api/v1alpha"
+	"github.com/pkg/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/healthz"
+)
+
+// scheme is the runtime scheme handed to the Kubernetes client built for the health
+// checks; init registers the v1alpha NodeNetworkConfig API types on it.
+var scheme = runtime.NewScheme()
+
+func init() {
+	utilruntime.Must(v1alpha.AddToScheme(scheme))
+}
+
+// NewHealthzHandlerWithChecks returns an [http.Handler] for CNS's /healthz endpoint.
+// The checks registered on the handler depend on what the [configuration.CNSConfig]
+// says CNS must be able to read. For example, in ChannelMode: CRD the handler gets an
+// "nnc" check that passes only if CNS can List NodeNetworkConfigs in kube-system.
+// With any other ChannelMode no checks are registered.
+//
+// An error is returned if the kubeconfig cannot be loaded or the client cannot be built.
+func NewHealthzHandlerWithChecks(cnsConfig *configuration.CNSConfig) (http.Handler, error) {
+	checks := make(map[string]healthz.Checker)
+	if cnsConfig.ChannelMode == cns.CRD {
+		cfg, err := ctrl.GetConfig()
+		if err != nil {
+			return nil, errors.Wrap(err, "failed to get kubeconfig")
+		}
+		cli, err := client.New(cfg, client.Options{
+			Scheme: scheme,
+		})
+		if err != nil {
+			return nil, errors.Wrap(err, "failed to build client")
+		}
+
+		checks["nnc"] = func(req *http.Request) error {
+			ctx := req.Context()
+			// we just care that we're allowed to List NNCs so set limit to 1 to minimize
+			// additional load on apiserver
+			if err := cli.List(ctx, &v1alpha.NodeNetworkConfigList{}, &client.ListOptions{
+				Namespace: metav1.NamespaceSystem,
+				Limit:     1,
+			}); err != nil {
+				return errors.Wrap(err, "failed to list NodeNetworkConfig")
+			}
+			return nil
+		}
+	}
+
+	// strip prefix so that it runs through all checks registered on the handler.
+	// otherwise it will look for a check named "healthz" and return a 404 if not there.
+	return http.StripPrefix("/healthz", &healthz.Handler{
+		Checks: checks,
+	}), nil
+}
diff --git a/cns/healthserver/healthz_test.go b/cns/healthserver/healthz_test.go
new file mode 100644
index 0000000000..805509a1f4
--- /dev/null
+++ b/cns/healthserver/healthz_test.go
@@ -0,0 +1,313 @@
+package healthserver
+
+import (
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"testing"
+
+	"github.com/Azure/azure-container-networking/cns/configuration"
+	"github.com/stretchr/testify/require"
+)
+
+// nncCRD is the API discovery document (APIResourceList) the mock apiserver
+// returns for the acn.azure.com/v1alpha group version.
+const nncCRD = `{
+  "kind": "APIResourceList",
+  "apiVersion": "v1",
+  "groupVersion": "acn.azure.com/v1alpha",
+  "resources": [
+    {
+      "name": "nodenetworkconfigs",
+      "singularName": "nodenetworkconfig",
+      "namespaced": true,
+      "kind": "NodeNetworkConfig",
+      "verbs": [
+        "delete",
+        "deletecollection",
+        "get",
+        "list",
+        "patch",
+        "create",
+        "update",
+        "watch"
+      ],
+      "shortNames": [
+        "nnc"
+      ],
+      "storageVersionHash": "aGVsbG93cmxk"
+    },
+    {
+      "name": "nodenetworkconfigs/status",
+      "singularName": "",
+      "namespaced": true,
+      "kind": "NodeNetworkConfig",
+      "verbs": [
+        "get",
+        "patch",
+        "update"
+      ]
+    }
+  ]
+}`
+
+// nncResult is the NodeNetworkConfigList body the mock apiserver returns when
+// listing NNCs in kube-system succeeds.
+const nncResult = `{
+  "apiVersion": "acn.azure.com/v1alpha",
+  "items": [
+    {
+      "apiVersion": "acn.azure.com/v1alpha",
+      "kind": "NodeNetworkConfig",
+      "metadata": {
+        "creationTimestamp": "2024-12-04T20:42:17Z",
+        "finalizers": [
+          "finalizers.acn.azure.com/dnc-operations"
+        ],
+        "generation": 1,
+        "labels": {
+          "kubernetes.azure.com/podnetwork-delegationguid": "",
+          "kubernetes.azure.com/podnetwork-subnet": "",
+          "kubernetes.azure.com/podnetwork-type": "overlay",
+          "managed": "true",
+          "owner": "aks-nodepool1-1234567-vmss000000"
+        },
+        "managedFields": [
+          {
+            "apiVersion": "acn.azure.com/v1alpha",
+            "fieldsType": "FieldsV1",
+            "fieldsV1": {
+              "f:metadata": {
+                "f:finalizers": {
+                  ".": {},
+                  "v:\"finalizers.acn.azure.com/dnc-operations\"": {}
+                },
+                "f:labels": {
+                  ".": {},
+                  "f:kubernetes.azure.com/podnetwork-delegationguid": {},
+                  "f:kubernetes.azure.com/podnetwork-subnet": {},
+                  "f:kubernetes.azure.com/podnetwork-type": {},
+                  "f:managed": {},
+                  "f:owner": {}
+                },
+                "f:ownerReferences": {
+                  ".": {},
+                  "k:{\"uid\":\"f5117020-bbc5-11ef-8433-1b9e59caeb1d\"}": {}
+                }
+              },
+              "f:spec": {
+                ".": {},
+                "f:requestedIPCount": {}
+              }
+            },
+            "manager": "dnc-rc",
+            "operation": "Update",
+            "time": "2024-12-04T20:42:17Z"
+          },
+          {
+            "apiVersion": "acn.azure.com/v1alpha",
+            "fieldsType": "FieldsV1",
+            "fieldsV1": {
+              "f:status": {
+                ".": {},
+                "f:assignedIPCount": {},
+                "f:networkContainers": {}
+              }
+            },
+            "manager": "dnc-rc",
+            "operation": "Update",
+            "subresource": "status",
+            "time": "2024-12-04T20:42:18Z"
+          }
+        ],
+        "name": "aks-nodepool1-1234567-vmss000000",
+        "namespace": "kube-system",
+        "ownerReferences": [
+          {
+            "apiVersion": "v1",
+            "blockOwnerDeletion": true,
+            "controller": true,
+            "kind": "Node",
+            "name": "aks-nodepool1-1234567-vmss000000",
+            "uid": "02df1fcc-bbc6-11ef-a76a-4b1af8d399a2"
+          }
+        ],
+        "resourceVersion": "123456789",
+        "uid": "0dc75e5e-bbc6-11ef-878f-ab45432262d6"
+      },
+      "spec": {
+        "requestedIPCount": 0
+      },
+      "status": {
+        "assignedIPCount": 256,
+        "networkContainers": [
+          {
+            "assignmentMode": "static",
+            "id": "13f630c0-bbc6-11ef-b3b7-bb8e46de5973",
+            "nodeIP": "10.224.0.4",
+            "primaryIP": "10.244.2.0/24",
+            "subnetAddressSpace": "10.244.0.0/16",
+            "subnetName": "routingdomain_1f7eb6ba-bbc6-11ef-8c54-7b2c1e3cbbe4_overlaysubnet",
+            "type": "overlay",
+            "version": 0
+          }
+        ]
+      }
+    }
+  ],
+  "kind": "NodeNetworkConfigList",
+  "metadata": {
+    "continue": "",
+    "resourceVersion": "9876543210"
+  }
+}`
+
+// TestNewHealthzHandlerWithChecks verifies that /healthz follows the apiserver's answer
+// to an NNC list in CRD mode, and stays healthy regardless of it in other channel modes.
+func TestNewHealthzHandlerWithChecks(t *testing.T) {
+	tests := []struct {
+		name            string
+		cnsConfig       *configuration.CNSConfig
+		apiStatusCode   int
+		expectedHealthy bool
+	}{
+		{
+			name: "list NNC gives 200 should indicate healthy",
+			cnsConfig: &configuration.CNSConfig{
+				ChannelMode: "CRD",
+			},
+			apiStatusCode:   http.StatusOK,
+			expectedHealthy: true,
+		},
+		{
+			name: "unauthorized (401) from apiserver should be unhealthy",
+			cnsConfig: &configuration.CNSConfig{
+				ChannelMode: "CRD",
+			},
+			apiStatusCode:   http.StatusUnauthorized,
+			expectedHealthy: false,
+		},
+		{
+			name: "channel nodesubnet should not call apiserver so it doesn't matter if the status code is a 401",
+			cnsConfig: &configuration.CNSConfig{
+				ChannelMode: "AzureHost",
+			},
+			apiStatusCode:   http.StatusUnauthorized,
+			expectedHealthy: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			configureLocalAPIServer(t, tt.apiStatusCode)
+
+			responseRecorder := httptest.NewRecorder()
+			healthHandler, err := NewHealthzHandlerWithChecks(tt.cnsConfig)
+			require.NoError(t, err)
+
+			healthHandler.ServeHTTP(responseRecorder, httptest.NewRequest(http.MethodGet, "/healthz", http.NoBody))
+
+			require.Equal(t, tt.expectedHealthy, responseRecorder.Code == http.StatusOK)
+		})
+	}
+}
+
+// configureLocalAPIServer starts a mock apiserver that answers NNC list requests
+// with expectedNNCStatusCode and points KUBECONFIG at it for the duration of the
+// test. Everything it creates is torn down through t's cleanup hooks.
+func configureLocalAPIServer(t *testing.T, expectedNNCStatusCode int) {
+	t.Helper()
+
+	// register the shutdown right away so the server is not leaked if a later
+	// setup step fails the test.
+	server := setupMockAPIServer(expectedNNCStatusCode)
+	t.Cleanup(server.Close)
+
+	// write kubeConfig for test server
+	kubeConfigFile, err := writeTmpKubeConfig(server.URL)
+	require.NoError(t, err)
+	t.Cleanup(func() {
+		// best effort: a leftover temp file must not fail the test.
+		_ = os.Remove(kubeConfigFile)
+	})
+
+	// t.Setenv restores the previous KUBECONFIG value (instead of unconditionally
+	// unsetting it) once the test finishes.
+	t.Setenv("KUBECONFIG", kubeConfigFile)
+}
+
+// writeTmpKubeConfig writes a minimal kubeconfig pointing at host to a temp file
+// and returns the file's path. The caller is responsible for removing the file.
+func writeTmpKubeConfig(host string) (string, error) {
+	tempKubeConfig := `
+apiVersion: v1
+clusters:
+- cluster:
+    server: ` + host + `
+  name: test-cluster
+contexts:
+- context:
+    cluster: test-cluster
+    user: test-user
+  name: test-context
+current-context: test-context
+kind: Config
+preferences: {}
+users:
+- name: test-user
+  user:
+    token: test-token
+`
+	kubeConfigFile, err := os.CreateTemp("", "kubeconfig")
+	if err != nil {
+		return "", fmt.Errorf("failed to create temp kubeconfig file: %w", err)
+	}
+
+	if _, err = kubeConfigFile.WriteString(tempKubeConfig); err != nil {
+		// don't leak the descriptor or the half-written file on failure.
+		_ = kubeConfigFile.Close()
+		_ = os.Remove(kubeConfigFile.Name())
+		return "", fmt.Errorf("failed to write kubeconfig to temp file: %w", err)
+	}
+	if err = kubeConfigFile.Close(); err != nil {
+		_ = os.Remove(kubeConfigFile.Name())
+		return "", fmt.Errorf("failed to close temp kubeconfig file: %w", err)
+	}
+	return kubeConfigFile.Name(), nil
+}
+
+// setupMockAPIServer starts an httptest server that serves the NNC discovery
+// document, answers NNC list requests in kube-system with nncResult when code is
+// 200 (or with the bare status code otherwise), and returns 404 for anything else.
+// The caller must Close the returned server.
+func setupMockAPIServer(code int) *httptest.Server {
+	// Start a mock HTTP server
+	mockServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// Handle requests based on the path
+		switch r.URL.Path {
+		case "/apis/acn.azure.com/v1alpha":
+			_, err := w.Write([]byte(nncCRD))
+			if err != nil {
+				http.Error(w, err.Error(), http.StatusInternalServerError)
+				return
+			}
+		case "/apis/acn.azure.com/v1alpha/namespaces/kube-system/nodenetworkconfigs":
+			if code == http.StatusOK {
+				w.Header().Set("Cache-Control", "no-cache, private")
+				w.Header().Set("Content-Type", "application/json")
+				_, err := w.Write([]byte(nncResult))
+				if err != nil {
+					http.Error(w, err.Error(), http.StatusInternalServerError)
+					return
+				}
+			} else {
+				w.WriteHeader(code)
+			}
+		default:
+			w.WriteHeader(http.StatusNotFound)
+		}
+	}))
+
+	return mockServer
+}
diff --git a/cns/service/main.go b/cns/service/main.go
index 36f24dffa7..b5ef339f95 100644
--- a/cns/service/main.go
+++ b/cns/service/main.go
@@ -642,7 +642,14 @@ func main() {
 			return nil
 		}),
 	}
-	go healthserver.Start(z, cnsconfig.MetricsBindAddress, &healthz.Handler{}, readyChecker)
+
+	// build the /healthz handler first; bail out of startup if its checks cannot be set up.
+	healthzHandler, err := healthserver.NewHealthzHandlerWithChecks(cnsconfig)
+	if err != nil {
+		logger.Errorf("unable to initialize a healthz handler: %v", err)
+		return
+	}
+	go healthserver.Start(z, cnsconfig.MetricsBindAddress, healthzHandler, readyChecker)
 
 	nmaConfig, err := nmagent.NewConfig(cnsconfig.WireserverIP)
 	if err != nil {