Skip to content

Commit a7be139

Browse files
authored
Merge pull request #36 from monobilisim/k8s-new-checks
feat: Add Kubernetes compliance checks for topology…
2 parents d7876a9 + 3a1df4f commit a7be139

File tree

6 files changed

+341
-11
lines changed

6 files changed

+341
-11
lines changed

go.mod

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ require (
3131
github.com/valkey-io/valkey-go v1.0.64
3232
golang.org/x/crypto v0.41.0
3333
golang.org/x/net v0.43.0
34-
google.golang.org/protobuf v1.36.7
34+
google.golang.org/protobuf v1.36.10
3535
gopkg.in/gomail.v2 v2.0.0-20160411212932-81ebce5c23df
3636
gopkg.in/yaml.v3 v3.0.1
3737
gorm.io/driver/postgres v1.6.0
@@ -168,6 +168,7 @@ require (
168168
golang.org/x/time v0.12.0 // indirect
169169
golang.org/x/tools v0.36.0 // indirect
170170
google.golang.org/genproto/googleapis/rpc v0.0.0-20250804133106-a7a43d27e69b // indirect
171+
google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.6.0 // indirect
171172
gopkg.in/alexcesaro/quotedprintable.v3 v3.0.0-20150716171945-2caba252f4dc // indirect
172173
gopkg.in/ini.v1 v1.67.0 // indirect
173174
gopkg.in/natefinch/lumberjack.v2 v2.2.1 // indirect

go.sum

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -791,6 +791,8 @@ google.golang.org/grpc v1.67.1 h1:zWnc1Vrcno+lHZCOofnIMvycFcc0QRGIzm9dhnDX68E=
791791
google.golang.org/grpc v1.67.1/go.mod h1:1gLDyUQU7CTLJI90u3nXZ9ekeghjeM7pTDZlqFNg2AA=
792792
google.golang.org/grpc v1.74.2 h1:WoosgB65DlWVC9FqI82dGsZhWFNBSLjQ84bjROOpMu4=
793793
google.golang.org/grpc v1.74.2/go.mod h1:CtQ+BGjaAIXHs/5YS3i473GqwBBa1zGQNevxdeBEXrM=
794+
google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.6.0 h1:6Al3kEFFP9VJhRz3DID6quisgPnTeZVr4lep9kkxdPA=
795+
google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.6.0/go.mod h1:QLvsjh0OIR0TYBeiu2bkWGTJBUNQ64st52iWj/yA93I=
794796
google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
795797
google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
796798
google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
@@ -804,6 +806,8 @@ google.golang.org/protobuf v1.36.5 h1:tPhr+woSbjfYvY6/GPufUoYizxw1cF/yFoxJ2fmpwl
804806
google.golang.org/protobuf v1.36.5/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE=
805807
google.golang.org/protobuf v1.36.7 h1:IgrO7UwFQGJdRNXH/sQux4R1Dj1WAKcLElzeeRaXV2A=
806808
google.golang.org/protobuf v1.36.7/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY=
809+
google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE=
810+
google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
807811
gopkg.in/alexcesaro/quotedprintable.v3 v3.0.0-20150716171945-2caba252f4dc h1:2gGKlE2+asNV9m7xrywl36YYNnBG5ZQ0r/BOOxqPpmk=
808812
gopkg.in/alexcesaro/quotedprintable.v3 v3.0.0-20150716171945-2caba252f4dc/go.mod h1:m7x9LTH6d71AHyAX77c9yqWCCa3UKHcVEj9y7hAtKDk=
809813
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=

k8sHealth/k8s.go

Lines changed: 272 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
probing "github.com/prometheus-community/pro-bing"
2525
"github.com/rs/zerolog/log"
2626
"github.com/spf13/viper"
27+
appsv1 "k8s.io/api/apps/v1" // Added for Deployments and StatefulSets
2728
v1 "k8s.io/api/core/v1"
2829
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2930
"k8s.io/apimachinery/pkg/version" // Added for GetKubernetesServerVersion
@@ -321,6 +322,18 @@ func CollectK8sHealthData() *K8sHealthData {
321322
// Collect RKE2 Information
322323
healthData.RKE2Info = CollectRKE2Information() // This is from k8s.go
323324

325+
// Collect Namespace Compliance
326+
nsData := CollectNamespaceCompliance(Clientset, K8sHealthConfig.K8s.Check_namespaces)
327+
328+
// Collect Master Taint Compliance
329+
mtData := CollectMasterTaintCompliance(Clientset)
330+
331+
healthData.ComplianceChecks = nsData
332+
if healthData.ComplianceChecks == nil {
333+
healthData.ComplianceChecks = &ComplianceCheckResults{}
334+
}
335+
healthData.ComplianceChecks.MasterTaint = mtData
336+
324337
// Clean up orphaned alarm logs for pods and containers that no longer exist
325338
// For plugin context, assume cleanup is enabled (disableCleanupOrphanedAlarms = false)
326339
// If granular control is needed, this could become a config option.
@@ -778,7 +791,7 @@ func CollectCertManagerHealth() (*CertManagerHealth, error) {
778791

779792
if Clientset == nil {
780793
health.Error = "kubernetes clientset is not initialized"
781-
return health, fmt.Errorf(health.Error)
794+
return health, fmt.Errorf("%s", health.Error)
782795
}
783796

784797
// Check cert-manager namespace
@@ -801,7 +814,7 @@ func CollectCertManagerHealth() (*CertManagerHealth, error) {
801814
Msg("Error getting cert-manager namespace")
802815
health.Error = errMsg
803816
alarmCheckDown("cert_manager_namespace", errMsg, false, "", "")
804-
return health, fmt.Errorf(errMsg) // This is a more significant k8s API error.
817+
return health, fmt.Errorf("%s", errMsg) // This is a more significant k8s API error.
805818
}
806819
health.NamespaceAvailable = true
807820
alarmCheckUp("cert_manager_namespace", "cert-manager namespace exists.", false)
@@ -839,7 +852,7 @@ func CollectCertManagerHealth() (*CertManagerHealth, error) {
839852
Msg("Error parsing cert-manager Certificate JSON")
840853
health.Error = errMsg
841854
alarmCheckDown("cert_manager_json_parse", errMsg, false, "", "")
842-
return health, fmt.Errorf(errMsg) // Parsing error is more critical
855+
return health, fmt.Errorf("%s", errMsg) // Parsing error is more critical
843856
}
844857

845858
for _, item := range certManagerCR.Items {
@@ -1103,7 +1116,7 @@ func CollectKubeVipHealth() (*KubeVipHealth, error) {
11031116

11041117
if Clientset == nil {
11051118
health.Error = "kubernetes clientset is not initialized"
1106-
return health, fmt.Errorf(health.Error)
1119+
return health, fmt.Errorf("%s", health.Error)
11071120
}
11081121

11091122
var pods *v1.PodList
@@ -1259,7 +1272,7 @@ func CollectClusterApiCertHealth() (*ClusterApiCertHealth, error) {
12591272
Msg("Error reading Cluster API server certificate file")
12601273
health.Error = errMsg
12611274
alarmCheckDown("kube_apiserver_cert_read", errMsg, false, "", "")
1262-
return health, fmt.Errorf(errMsg) // This is a file read error
1275+
return health, fmt.Errorf("%s", errMsg) // This is a file read error
12631276
}
12641277

12651278
block, _ := pem.Decode(certFileContent)
@@ -1272,7 +1285,7 @@ func CollectClusterApiCertHealth() (*ClusterApiCertHealth, error) {
12721285
Msg("Failed to parse PEM block from Cluster API server certificate file")
12731286
health.Error = errMsg
12741287
alarmCheckDown("kube_apiserver_cert_parse", errMsg, false, "", "")
1275-
return health, fmt.Errorf(errMsg)
1288+
return health, fmt.Errorf("%s", errMsg)
12761289
}
12771290

12781291
cert, err := x509.ParseCertificate(block.Bytes)
@@ -1285,7 +1298,7 @@ func CollectClusterApiCertHealth() (*ClusterApiCertHealth, error) {
12851298
Msg("Error parsing Cluster API server certificate")
12861299
health.Error = errMsg
12871300
alarmCheckDown("kube_apiserver_cert_parse", errMsg, false, "", "")
1288-
return health, fmt.Errorf(errMsg)
1301+
return health, fmt.Errorf("%s", errMsg)
12891302
}
12901303

12911304
health.NotAfter = cert.NotAfter
@@ -1854,3 +1867,255 @@ func alarmCheckDown(service, message string, noInterval bool, customStream, cust
18541867
}
18551868
common.AlarmCheckDown(service, message, noInterval, customStream, customTopic)
18561869
}
1870+
1871+
// CollectNamespaceCompliance checks for compliance rules in configured namespaces.
1872+
func CollectNamespaceCompliance(clientset kubernetes.Interface, namespaces []string) *ComplianceCheckResults {
1873+
results := &ComplianceCheckResults{
1874+
TopologySkew: []ComplianceItem{},
1875+
ReplicaCount: []ComplianceItem{},
1876+
ImagePull: []ComplianceItem{},
1877+
}
1878+
1879+
if len(namespaces) == 0 {
1880+
return results
1881+
}
1882+
1883+
log.Debug().
1884+
Str("component", "k8sHealth").
1885+
Str("operation", "collect_namespace_compliance").
1886+
Strs("namespaces", namespaces).
1887+
Msg("Starting namespace compliance checks")
1888+
1889+
if clientset == nil {
1890+
log.Error().
1891+
Str("component", "k8sHealth").
1892+
Str("operation", "collect_namespace_compliance").
1893+
Msg("Kubernetes clientset not initialized")
1894+
return results
1895+
}
1896+
1897+
// Count total worker nodes for replica compliance
1898+
workerCount := 0
1899+
nodes, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
1900+
if err != nil {
1901+
log.Error().
1902+
Str("component", "k8sHealth").
1903+
Str("operation", "collect_namespace_compliance").
1904+
Err(err).
1905+
Msg("Error listing nodes for replica compliance")
1906+
} else {
1907+
for _, node := range nodes.Items {
1908+
if !isMaster(node) {
1909+
workerCount++
1910+
}
1911+
}
1912+
}
1913+
1914+
for _, ns := range namespaces {
1915+
// Check Deployments
1916+
deployments, err := clientset.AppsV1().Deployments(ns).List(context.TODO(), metav1.ListOptions{})
1917+
if err != nil {
1918+
log.Error().
1919+
Str("component", "k8sHealth").
1920+
Str("operation", "collect_namespace_compliance").
1921+
Str("namespace", ns).
1922+
Err(err).
1923+
Msg("Error listing deployments")
1924+
alarmCheckDown("k8s_compliance_list_deployments_"+ns, fmt.Sprintf("Error listing deployments in namespace %s: %v", ns, err), false, "", "")
1925+
continue
1926+
}
1927+
1928+
for _, deploy := range deployments.Items {
1929+
// Update checkTopologySpreadConstraints to return item
1930+
if item := checkTopologySpreadConstraints(&deploy); item != nil {
1931+
results.TopologySkew = append(results.TopologySkew, *item)
1932+
}
1933+
// Update checkReplicaCount to return item, passing workerCount
1934+
if item := checkReplicaCount(&deploy, workerCount); item != nil {
1935+
results.ReplicaCount = append(results.ReplicaCount, *item)
1936+
}
1937+
// Update checkImagePullPolicy to return items
1938+
if items := checkImagePullPolicy(&deploy.Spec.Template.Spec, "deployment", deploy.Namespace, deploy.Name); len(items) > 0 {
1939+
results.ImagePull = append(results.ImagePull, items...)
1940+
}
1941+
}
1942+
}
1943+
return results
1944+
}
1945+
1946+
// Helper to determine if a node is a master/control-plane
1947+
func isMaster(node v1.Node) bool {
1948+
if _, ok := node.Labels["node-role.kubernetes.io/master"]; ok {
1949+
return true
1950+
}
1951+
if _, ok := node.Labels["node-role.kubernetes.io/control-plane"]; ok {
1952+
return true
1953+
}
1954+
if val, ok := node.Labels["kubernetes.io/role"]; ok && val == "master" {
1955+
return true
1956+
}
1957+
return false
1958+
}
1959+
1960+
// checkTopologySpreadConstraints verifies if the deployment has the correct topology spread constraints.
1961+
func checkTopologySpreadConstraints(deploy *appsv1.Deployment) *ComplianceItem {
1962+
// Check for ignore label
1963+
if val, ok := deploy.Labels["monokit.policy/topology-skew-ignored"]; ok && val == "true" {
1964+
return nil // Ignored
1965+
}
1966+
1967+
alarmKey := fmt.Sprintf("deployment_%s_%s_topology_skew", deploy.Namespace, deploy.Name)
1968+
resourceName := fmt.Sprintf("%s/%s", deploy.Namespace, deploy.Name)
1969+
1970+
TopologySpreadConstraints := deploy.Spec.Template.Spec.TopologySpreadConstraints
1971+
hasHostnameConstraint := false
1972+
isCorrect := true
1973+
var paramErrors []string
1974+
1975+
for _, constraint := range TopologySpreadConstraints {
1976+
if constraint.TopologyKey == "kubernetes.io/hostname" {
1977+
hasHostnameConstraint = true
1978+
if constraint.MaxSkew != 1 {
1979+
isCorrect = false
1980+
paramErrors = append(paramErrors, fmt.Sprintf("maxSkew is %d (expected 1)", constraint.MaxSkew))
1981+
}
1982+
if constraint.WhenUnsatisfiable != v1.DoNotSchedule {
1983+
isCorrect = false
1984+
paramErrors = append(paramErrors, fmt.Sprintf("whenUnsatisfiable is %s (expected DoNotSchedule)", constraint.WhenUnsatisfiable))
1985+
}
1986+
// Check logic for LabelSelector match (simplified: just checking if it exists)
1987+
if constraint.LabelSelector == nil {
1988+
isCorrect = false
1989+
paramErrors = append(paramErrors, "labelSelector is missing")
1990+
}
1991+
break
1992+
}
1993+
}
1994+
1995+
if !hasHostnameConstraint {
1996+
alarmCheckDown(alarmKey, fmt.Sprintf("Deployment '%s' in namespace '%s' missing topologySpreadConstraints for kubernetes.io/hostname.", deploy.Name, deploy.Namespace), false, "", "")
1997+
return &ComplianceItem{Resource: resourceName, Status: false, Message: "Missing topologySpreadConstraints for kubernetes.io/hostname"}
1998+
} else if !isCorrect {
1999+
msg := fmt.Sprintf("Deployment '%s' in namespace '%s' has invalid topologySpreadConstraints: %s", deploy.Name, deploy.Namespace, strings.Join(paramErrors, ", "))
2000+
alarmCheckDown(alarmKey, msg, false, "", "")
2001+
return &ComplianceItem{Resource: resourceName, Status: false, Message: fmt.Sprintf("Invalid topologySpreadConstraints: %s", strings.Join(paramErrors, ", "))}
2002+
} else {
2003+
alarmCheckUp(alarmKey, fmt.Sprintf("Deployment '%s' in namespace '%s' has correct topologySpreadConstraints.", deploy.Name, deploy.Namespace), false)
2004+
return &ComplianceItem{Resource: resourceName, Status: true, Message: "Correct topologySpreadConstraints"}
2005+
}
2006+
}
2007+
2008+
// checkReplicaCount verifies if the deployment's replica count matches the total number of worker nodes.
2009+
func checkReplicaCount(deploy *appsv1.Deployment, workerCount int) *ComplianceItem {
2010+
// Check for ignore label
2011+
if val, ok := deploy.Labels["monokit.policy/replica-count-ignored"]; ok && val == "true" {
2012+
return nil // Ignored
2013+
}
2014+
2015+
alarmKey := fmt.Sprintf("deployment_%s_%s_replica_count", deploy.Namespace, deploy.Name)
2016+
resourceName := fmt.Sprintf("%s/%s", deploy.Namespace, deploy.Name)
2017+
2018+
if deploy.Spec.Replicas == nil {
2019+
return nil // Should not happen usually
2020+
}
2021+
specReplicas := int(*deploy.Spec.Replicas)
2022+
2023+
// Policy Check: Spec Replicas must equal Total Worker Count
2024+
if specReplicas != workerCount {
2025+
msg := fmt.Sprintf("Deployment '%s' in namespace '%s' replica mismatch: spec=%d, expected=%d (Total Workers)", deploy.Name, deploy.Namespace, specReplicas, workerCount)
2026+
alarmCheckDown(alarmKey, msg, false, "", "")
2027+
return &ComplianceItem{Resource: resourceName, Status: false, Message: fmt.Sprintf("Mismatch: spec=%d, expected=%d", specReplicas, workerCount)}
2028+
} else {
2029+
alarmCheckUp(alarmKey, fmt.Sprintf("Deployment '%s' in namespace '%s' replica count matches worker count (%d).", deploy.Name, deploy.Namespace, workerCount), false)
2030+
return &ComplianceItem{Resource: resourceName, Status: true, Message: fmt.Sprintf("Match: %d", workerCount)}
2031+
}
2032+
}
2033+
2034+
// checkImagePullPolicy verifies if containers use imagePullPolicy: IfNotPresent
2035+
func checkImagePullPolicy(podSpec *v1.PodSpec, kind, namespace, name string) []ComplianceItem {
2036+
var items []ComplianceItem
2037+
alarmKeyBase := fmt.Sprintf("%s_%s_%s_image_pull", kind, namespace, name)
2038+
resourceName := fmt.Sprintf("%s/%s", namespace, name)
2039+
2040+
for _, container := range podSpec.Containers {
2041+
containerName := container.Name
2042+
alarmKey := fmt.Sprintf("%s_%s", alarmKeyBase, containerName)
2043+
2044+
if container.ImagePullPolicy != v1.PullIfNotPresent {
2045+
msg := fmt.Sprintf("%s '%s' container '%s' ImagePullPolicy is '%s' (expected IfNotPresent).", strings.Title(kind), name, containerName, container.ImagePullPolicy)
2046+
alarmCheckDown(alarmKey, msg, false, "", "")
2047+
items = append(items, ComplianceItem{
2048+
Resource: fmt.Sprintf("%s [%s]", resourceName, containerName),
2049+
Status: false,
2050+
Message: fmt.Sprintf("Policy is %s (expected IfNotPresent)", container.ImagePullPolicy),
2051+
})
2052+
} else {
2053+
alarmCheckUp(alarmKey, fmt.Sprintf("%s '%s' container '%s' has correct ImagePullPolicy.", strings.Title(kind), name, containerName), false)
2054+
items = append(items, ComplianceItem{
2055+
Resource: fmt.Sprintf("%s [%s]", resourceName, containerName),
2056+
Status: true,
2057+
Message: "Correct ImagePullPolicy",
2058+
})
2059+
}
2060+
}
2061+
return items
2062+
}
2063+
2064+
// CollectMasterTaintCompliance checks if master nodes have the correct taints.
2065+
func CollectMasterTaintCompliance(client kubernetes.Interface) []ComplianceItem {
2066+
var results []ComplianceItem
2067+
2068+
if client == nil {
2069+
return results
2070+
}
2071+
2072+
nodes, err := client.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
2073+
if err != nil {
2074+
log.Error().Err(err).Msg("Error listing nodes for taint compliance")
2075+
return results
2076+
}
2077+
2078+
for _, node := range nodes.Items {
2079+
isMaster := false
2080+
if _, ok := node.Labels["node-role.kubernetes.io/master"]; ok {
2081+
isMaster = true
2082+
} else if _, ok := node.Labels["node-role.kubernetes.io/control-plane"]; ok {
2083+
isMaster = true
2084+
}
2085+
2086+
if !isMaster {
2087+
continue
2088+
}
2089+
2090+
alarmKey := fmt.Sprintf("node_%s_master_taint", node.Name)
2091+
hasTaint := false
2092+
for _, taint := range node.Spec.Taints {
2093+
if taint.Key == "node-role.kubernetes.io/master" && taint.Effect == v1.TaintEffectNoSchedule {
2094+
hasTaint = true
2095+
break
2096+
}
2097+
// Also check control-plane taint which is standard in newer k8s
2098+
if taint.Key == "node-role.kubernetes.io/control-plane" && taint.Effect == v1.TaintEffectNoSchedule {
2099+
hasTaint = true
2100+
break
2101+
}
2102+
}
2103+
2104+
if hasTaint {
2105+
alarmCheckUp(alarmKey, fmt.Sprintf("Master node '%s' has correct NoSchedule taint.", node.Name), false)
2106+
results = append(results, ComplianceItem{
2107+
Resource: node.Name,
2108+
Status: true,
2109+
Message: "Correct NoSchedule taint found",
2110+
})
2111+
} else {
2112+
alarmCheckDown(alarmKey, fmt.Sprintf("Master node '%s' missing NoSchedule taint (node-role.kubernetes.io/master:NoSchedule).", node.Name), false, "", "")
2113+
results = append(results, ComplianceItem{
2114+
Resource: node.Name,
2115+
Status: false,
2116+
Message: "Missing NoSchedule taint",
2117+
})
2118+
}
2119+
}
2120+
return results
2121+
}

k8sHealth/kubernetes/configmap.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ metadata:
55
data:
66
k8s.yml: |
77
k8s:
8+
namespaces:
9+
- YOUR_NAMESPACE_1
810
floating_ips:
911
- YOUR_FLOATING_IP_1
1012
ingress_floating_ips:

0 commit comments

Comments (0)