Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions cns/healthserver/healthz.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package healthserver

import (
"net/http"

"github.com/Azure/azure-container-networking/cns"
"github.com/Azure/azure-container-networking/cns/configuration"
"github.com/Azure/azure-container-networking/crd/nodenetworkconfig/api/v1alpha"
"github.com/pkg/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/healthz"
)

// scheme is the runtime scheme used by the health check's Kubernetes client.
// It is populated with the NodeNetworkConfig v1alpha types in init.
var scheme = runtime.NewScheme()

// init registers the v1alpha API types on scheme; utilruntime.Must panics
// if the registration returns an error.
func init() {
	err := v1alpha.AddToScheme(scheme)
	utilruntime.Must(err)
}

// NewHealthzHandlerWithChecks builds the [http.Handler] that serves CNS's /healthz endpoint.
// Which checks get registered depends on the supplied [configuration.CNSConfig]: when
// ChannelMode is CRD, an "nnc" check is added that passes only if CNS can list
// NodeNetworkConfigs in the kube-system namespace. In any other channel mode the handler
// is returned with no checks registered.
//
// An error is returned when the kubeconfig cannot be loaded or the Kubernetes client
// cannot be constructed.
func NewHealthzHandlerWithChecks(cnsConfig *configuration.CNSConfig) (http.Handler, error) {
	registered := map[string]healthz.Checker{}

	if cnsConfig.ChannelMode == cns.CRD {
		restCfg, err := ctrl.GetConfig()
		if err != nil {
			return nil, errors.Wrap(err, "failed to get kubeconfig")
		}

		clientOpts := client.Options{Scheme: scheme}
		nncClient, err := client.New(restCfg, clientOpts)
		if err != nil {
			return nil, errors.Wrap(err, "failed to build client")
		}

		registered["nnc"] = func(req *http.Request) error {
			// Only the permission to List NNCs matters here, so cap the page size
			// at a single item to keep the extra apiserver load minimal.
			listOpts := &client.ListOptions{
				Namespace: metav1.NamespaceSystem,
				Limit:     1,
			}
			var nncs v1alpha.NodeNetworkConfigList
			listErr := nncClient.List(req.Context(), &nncs, listOpts)
			if listErr != nil {
				return errors.Wrap(listErr, "failed to list NodeNetworkConfig")
			}
			return nil
		}
	}

	// The "/healthz" prefix is stripped so the handler evaluates every registered
	// check; without it the handler would look up a check named "healthz" and
	// answer 404 when none exists.
	aggregate := &healthz.Handler{Checks: registered}
	return http.StripPrefix("/healthz", aggregate), nil
}
290 changes: 290 additions & 0 deletions cns/healthserver/healthz_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,290 @@
package healthserver

import (
"fmt"
"net/http"
"net/http/httptest"
"os"
"testing"

"github.com/Azure/azure-container-networking/cns/configuration"
"github.com/stretchr/testify/require"
)

// nncCRD is the canned APIResourceList for the acn.azure.com/v1alpha group
// version. The mock API server in this file returns it for requests to
// /apis/acn.azure.com/v1alpha. It lists the nodenetworkconfigs resource and
// its status subresource.
const nncCRD = `{
"kind": "APIResourceList",
"apiVersion": "v1",
"groupVersion": "acn.azure.com/v1alpha",
"resources": [
{
"name": "nodenetworkconfigs",
"singularName": "nodenetworkconfig",
"namespaced": true,
"kind": "NodeNetworkConfig",
"verbs": [
"delete",
"deletecollection",
"get",
"list",
"patch",
"create",
"update",
"watch"
],
"shortNames": [
"nnc"
],
"storageVersionHash": "aGVsbG93cmxk"
},
{
"name": "nodenetworkconfigs/status",
"singularName": "",
"namespaced": true,
"kind": "NodeNetworkConfig",
"verbs": [
"get",
"patch",
"update"
]
}
]
}`

// nncResult is the canned NodeNetworkConfigList body, containing a single
// NodeNetworkConfig in the kube-system namespace. The mock API server in this
// file returns it for the kube-system nodenetworkconfigs list path when it is
// configured to answer with 200 OK.
const nncResult = `{
"apiVersion": "acn.azure.com/v1alpha",
"items": [
{
"apiVersion": "acn.azure.com/v1alpha",
"kind": "NodeNetworkConfig",
"metadata": {
"creationTimestamp": "2024-12-04T20:42:17Z",
"finalizers": [
"finalizers.acn.azure.com/dnc-operations"
],
"generation": 1,
"labels": {
"kubernetes.azure.com/podnetwork-delegationguid": "",
"kubernetes.azure.com/podnetwork-subnet": "",
"kubernetes.azure.com/podnetwork-type": "overlay",
"managed": "true",
"owner": "aks-nodepool1-1234567-vmss000000"
},
"managedFields": [
{
"apiVersion": "acn.azure.com/v1alpha",
"fieldsType": "FieldsV1",
"fieldsV1": {
"f:metadata": {
"f:finalizers": {
".": {},
"v:\"finalizers.acn.azure.com/dnc-operations\"": {}
},
"f:labels": {
".": {},
"f:kubernetes.azure.com/podnetwork-delegationguid": {},
"f:kubernetes.azure.com/podnetwork-subnet": {},
"f:kubernetes.azure.com/podnetwork-type": {},
"f:managed": {},
"f:owner": {}
},
"f:ownerReferences": {
".": {},
"k:{\"uid\":\"f5117020-bbc5-11ef-8433-1b9e59caeb1d\"}": {}
}
},
"f:spec": {
".": {},
"f:requestedIPCount": {}
}
},
"manager": "dnc-rc",
"operation": "Update",
"time": "2024-12-04T20:42:17Z"
},
{
"apiVersion": "acn.azure.com/v1alpha",
"fieldsType": "FieldsV1",
"fieldsV1": {
"f:status": {
".": {},
"f:assignedIPCount": {},
"f:networkContainers": {}
}
},
"manager": "dnc-rc",
"operation": "Update",
"subresource": "status",
"time": "2024-12-04T20:42:18Z"
}
],
"name": "aks-nodepool1-1234567-vmss000000",
"namespace": "kube-system",
"ownerReferences": [
{
"apiVersion": "v1",
"blockOwnerDeletion": true,
"controller": true,
"kind": "Node",
"name": "aks-nodepool1-1234567-vmss000000",
"uid": "02df1fcc-bbc6-11ef-a76a-4b1af8d399a2"
}
],
"resourceVersion": "123456789",
"uid": "0dc75e5e-bbc6-11ef-878f-ab45432262d6"
},
"spec": {
"requestedIPCount": 0
},
"status": {
"assignedIPCount": 256,
"networkContainers": [
{
"assignmentMode": "static",
"id": "13f630c0-bbc6-11ef-b3b7-bb8e46de5973",
"nodeIP": "10.224.0.4",
"primaryIP": "10.244.2.0/24",
"subnetAddressSpace": "10.244.0.0/16",
"subnetName": "routingdomain_1f7eb6ba-bbc6-11ef-8c54-7b2c1e3cbbe4_overlaysubnet",
"type": "overlay",
"version": 0
}
]
}
}
],
"kind": "NodeNetworkConfigList",
"metadata": {
"continue": "",
"resourceVersion": "9876543210"
}
}`

// TestNewHealthzHandlerWithChecks exercises the /healthz handler against a mock
// apiserver. Each case sets the channel mode and the status code the mock returns
// for the NNC list call, then asserts whether /healthz reports healthy (HTTP 200).
func TestNewHealthzHandlerWithChecks(t *testing.T) {
	type testCase struct {
		name            string
		cnsConfig       *configuration.CNSConfig
		apiStatusCode   int
		expectedHealthy bool
	}

	cases := []testCase{
		{
			name:            "list NNC gives 200 should indicate healthy",
			cnsConfig:       &configuration.CNSConfig{ChannelMode: "CRD"},
			apiStatusCode:   http.StatusOK,
			expectedHealthy: true,
		},
		{
			name:            "unauthorized (401) from apiserver should be unhealthy",
			cnsConfig:       &configuration.CNSConfig{ChannelMode: "CRD"},
			apiStatusCode:   http.StatusUnauthorized,
			expectedHealthy: false,
		},
		{
			name:            "channel nodesubnet should not call apiserver so it doesn't matter if the status code is a 401",
			cnsConfig:       &configuration.CNSConfig{ChannelMode: "AzureHost"},
			apiStatusCode:   http.StatusUnauthorized,
			expectedHealthy: true,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// Point KUBECONFIG at a mock apiserver that answers the NNC list
			// with the status code under test.
			configureLocalAPIServer(t, tc.apiStatusCode)

			handler, err := NewHealthzHandlerWithChecks(tc.cnsConfig)
			require.NoError(t, err)

			recorder := httptest.NewRecorder()
			request := httptest.NewRequest("GET", "/healthz", http.NoBody)
			handler.ServeHTTP(recorder, request)

			reportedHealthy := recorder.Code == http.StatusOK
			require.Equal(t, tc.expectedHealthy, reportedHealthy)
		})
	}
}

// configureLocalAPIServer starts a mock apiserver that answers the NNC list call
// with expectedNNCStatusCode, writes a temporary kubeconfig pointing at it, and
// sets KUBECONFIG to that file for the duration of the test.
//
// All teardown is registered on t: the server is closed, the kubeconfig file is
// removed, and KUBECONFIG is restored to whatever value it had before the call.
func configureLocalAPIServer(t *testing.T, expectedNNCStatusCode int) {
	t.Helper()

	// setup apiserver; register its shutdown right away so it is not leaked if
	// a later setup step fails the test.
	server := setupMockAPIServer(expectedNNCStatusCode)
	t.Cleanup(server.Close)

	// write kubeConfig for test server
	kubeConfigFile, err := writeTmpKubeConfig(server.URL)
	require.NoError(t, err)
	t.Cleanup(func() {
		// best effort: a leftover temp file must not fail the test
		_ = os.Remove(kubeConfigFile)
	})

	// t.Setenv restores the previous KUBECONFIG value (or unsets it) when the
	// test finishes, instead of unconditionally clearing the caller's setting.
	t.Setenv("KUBECONFIG", kubeConfigFile)
}

// writeTmpKubeConfig writes a minimal kubeconfig whose single cluster points at
// host into a new temporary file and returns that file's path. The caller is
// responsible for removing the file once it is no longer needed.
//
// On any failure the partially written temp file is closed and removed, and an
// empty path is returned together with the error.
func writeTmpKubeConfig(host string) (string, error) {
	// The nesting below is significant YAML: server belongs to the cluster
	// entry, cluster/user to the context entry, and token to the user entry.
	tempKubeConfig := `
apiVersion: v1
clusters:
- cluster:
    server: ` + host + `
  name: test-cluster
contexts:
- context:
    cluster: test-cluster
    user: test-user
  name: test-context
current-context: test-context
kind: Config
preferences: {}
users:
- name: test-user
  user:
    token: test-token
`
	kubeConfigFile, err := os.CreateTemp("", "kubeconfig")
	if err != nil {
		return "", fmt.Errorf("failed to create temp kubeconfig file: %w", err)
	}
	path := kubeConfigFile.Name()

	if _, err := kubeConfigFile.WriteString(tempKubeConfig); err != nil {
		// best-effort cleanup; the write error is the one worth reporting
		_ = kubeConfigFile.Close()
		_ = os.Remove(path)
		return "", fmt.Errorf("failed to write kubeconfig to temp file: %w", err)
	}

	// Close can surface a deferred write failure, so it must be checked.
	if err := kubeConfigFile.Close(); err != nil {
		_ = os.Remove(path)
		return "", fmt.Errorf("failed to close temp kubeconfig file: %w", err)
	}
	return path, nil
}

// setupMockAPIServer starts an httptest server that imitates the two apiserver
// endpoints the NNC health check touches:
//
//   - the acn.azure.com/v1alpha discovery path, which always returns nncCRD;
//   - the kube-system nodenetworkconfigs list path, which returns nncResult as
//     JSON when code is 200 and otherwise replies with just the status code.
//
// Every other path gets a 404. The caller must Close the returned server.
func setupMockAPIServer(code int) *httptest.Server {
	// writeBody sends payload and falls back to reporting a 500 if the write fails.
	writeBody := func(w http.ResponseWriter, payload string) {
		if _, err := w.Write([]byte(payload)); err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
		}
	}

	handle := func(w http.ResponseWriter, r *http.Request) {
		requested := r.URL.Path

		if requested == "/apis/acn.azure.com/v1alpha" {
			writeBody(w, nncCRD)
			return
		}

		if requested != "/apis/acn.azure.com/v1alpha/namespaces/kube-system/nodenetworkconfigs" {
			w.WriteHeader(http.StatusNotFound)
			return
		}

		// NNC list endpoint: any non-200 code is returned bare, with no body.
		if code != http.StatusOK {
			w.WriteHeader(code)
			return
		}

		headers := w.Header()
		headers.Set("Cache-Control", "no-cache, private")
		headers.Set("Content-Type", "application/json")
		writeBody(w, nncResult)
	}

	return httptest.NewServer(http.HandlerFunc(handle))
}
12 changes: 9 additions & 3 deletions cns/service/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -642,7 +642,13 @@ func main() {
return nil
}),
}
go healthserver.Start(z, cnsconfig.MetricsBindAddress, &healthz.Handler{}, readyChecker)

healthzHandler, err := healthserver.NewHealthzHandlerWithChecks(cnsconfig)
if err != nil {
logger.Errorf("unable to initialize a healthz handler: %v", err)
return
}
go healthserver.Start(z, cnsconfig.MetricsBindAddress, healthzHandler, readyChecker)

nmaConfig, err := nmagent.NewConfig(cnsconfig.WireserverIP)
if err != nil {
Expand Down Expand Up @@ -981,7 +987,7 @@ func main() {
// Start fs watcher here
z.Info("AsyncPodDelete is enabled")
logger.Printf("AsyncPodDelete is enabled")
cnsclient, err := cnsclient.New("", cnsReqTimeout) //nolint
cnsclient, err := cnsclient.New("", cnsReqTimeout) // nolint
if err != nil {
z.Error("failed to create cnsclient", zap.Error(err))
}
Expand Down Expand Up @@ -1482,7 +1488,7 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
// wait for the Reconciler to run once on a NNC that was made for this Node.
// the nncReadyCtx has a timeout of 15 minutes, after which we will consider
// this false and the NNC Reconciler stuck/failed, log and retry.
nncReadyCtx, cancel := context.WithTimeout(ctx, 15*time.Minute) //nolint // it will time out and not leak
nncReadyCtx, cancel := context.WithTimeout(ctx, 15*time.Minute) // nolint // it will time out and not leak
if started, err := nncReconciler.Started(nncReadyCtx); !started {
logger.Errorf("NNC reconciler has not started, does the NNC exist? err: %v", err)
nncReconcilerStartFailures.Inc()
Expand Down
Loading