@@ -24,6 +24,7 @@ import (
2424 probing "github.com/prometheus-community/pro-bing"
2525 "github.com/rs/zerolog/log"
2626 "github.com/spf13/viper"
27+ appsv1 "k8s.io/api/apps/v1" // Added for Deployments and StatefulSets
2728 v1 "k8s.io/api/core/v1"
2829 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2930 "k8s.io/apimachinery/pkg/version" // Added for GetKubernetesServerVersion
@@ -321,6 +322,18 @@ func CollectK8sHealthData() *K8sHealthData {
321322 // Collect RKE2 Information
322323 healthData .RKE2Info = CollectRKE2Information () // This is from k8s.go
323324
325+ // Collect Namespace Compliance
326+ nsData := CollectNamespaceCompliance (Clientset , K8sHealthConfig .K8s .Check_namespaces )
327+
328+ // Collect Master Taint Compliance
329+ mtData := CollectMasterTaintCompliance (Clientset )
330+
331+ healthData .ComplianceChecks = nsData
332+ if healthData .ComplianceChecks == nil {
333+ healthData .ComplianceChecks = & ComplianceCheckResults {}
334+ }
335+ healthData .ComplianceChecks .MasterTaint = mtData
336+
324337 // Clean up orphaned alarm logs for pods and containers that no longer exist
325338 // For plugin context, assume cleanup is enabled (disableCleanupOrphanedAlarms = false)
326339 // If granular control is needed, this could become a config option.
@@ -778,7 +791,7 @@ func CollectCertManagerHealth() (*CertManagerHealth, error) {
778791
779792 if Clientset == nil {
780793 health .Error = "kubernetes clientset is not initialized"
781- return health , fmt .Errorf (health .Error )
794+ return health , fmt .Errorf ("%s" , health .Error )
782795 }
783796
784797 // Check cert-manager namespace
@@ -801,7 +814,7 @@ func CollectCertManagerHealth() (*CertManagerHealth, error) {
801814 Msg ("Error getting cert-manager namespace" )
802815 health .Error = errMsg
803816 alarmCheckDown ("cert_manager_namespace" , errMsg , false , "" , "" )
804- return health , fmt .Errorf (errMsg ) // This is a more significant k8s API error.
817+ return health , fmt .Errorf ("%s" , errMsg ) // This is a more significant k8s API error.
805818 }
806819 health .NamespaceAvailable = true
807820 alarmCheckUp ("cert_manager_namespace" , "cert-manager namespace exists." , false )
@@ -839,7 +852,7 @@ func CollectCertManagerHealth() (*CertManagerHealth, error) {
839852 Msg ("Error parsing cert-manager Certificate JSON" )
840853 health .Error = errMsg
841854 alarmCheckDown ("cert_manager_json_parse" , errMsg , false , "" , "" )
842- return health , fmt .Errorf (errMsg ) // Parsing error is more critical
855+ return health , fmt .Errorf ("%s" , errMsg ) // Parsing error is more critical
843856 }
844857
845858 for _ , item := range certManagerCR .Items {
@@ -1103,7 +1116,7 @@ func CollectKubeVipHealth() (*KubeVipHealth, error) {
11031116
11041117 if Clientset == nil {
11051118 health .Error = "kubernetes clientset is not initialized"
1106- return health , fmt .Errorf (health .Error )
1119+ return health , fmt .Errorf ("%s" , health .Error )
11071120 }
11081121
11091122 var pods * v1.PodList
@@ -1259,7 +1272,7 @@ func CollectClusterApiCertHealth() (*ClusterApiCertHealth, error) {
12591272 Msg ("Error reading Cluster API server certificate file" )
12601273 health .Error = errMsg
12611274 alarmCheckDown ("kube_apiserver_cert_read" , errMsg , false , "" , "" )
1262- return health , fmt .Errorf (errMsg ) // This is a file read error
1275+ return health , fmt .Errorf ("%s" , errMsg ) // This is a file read error
12631276 }
12641277
12651278 block , _ := pem .Decode (certFileContent )
@@ -1272,7 +1285,7 @@ func CollectClusterApiCertHealth() (*ClusterApiCertHealth, error) {
12721285 Msg ("Failed to parse PEM block from Cluster API server certificate file" )
12731286 health .Error = errMsg
12741287 alarmCheckDown ("kube_apiserver_cert_parse" , errMsg , false , "" , "" )
1275- return health , fmt .Errorf (errMsg )
1288+ return health , fmt .Errorf ("%s" , errMsg )
12761289 }
12771290
12781291 cert , err := x509 .ParseCertificate (block .Bytes )
@@ -1285,7 +1298,7 @@ func CollectClusterApiCertHealth() (*ClusterApiCertHealth, error) {
12851298 Msg ("Error parsing Cluster API server certificate" )
12861299 health .Error = errMsg
12871300 alarmCheckDown ("kube_apiserver_cert_parse" , errMsg , false , "" , "" )
1288- return health , fmt .Errorf (errMsg )
1301+ return health , fmt .Errorf ("%s" , errMsg )
12891302 }
12901303
12911304 health .NotAfter = cert .NotAfter
@@ -1854,3 +1867,255 @@ func alarmCheckDown(service, message string, noInterval bool, customStream, cust
18541867 }
18551868 common .AlarmCheckDown (service , message , noInterval , customStream , customTopic )
18561869}
1870+
1871+ // CollectNamespaceCompliance checks for compliance rules in configured namespaces.
1872+ func CollectNamespaceCompliance (clientset kubernetes.Interface , namespaces []string ) * ComplianceCheckResults {
1873+ results := & ComplianceCheckResults {
1874+ TopologySkew : []ComplianceItem {},
1875+ ReplicaCount : []ComplianceItem {},
1876+ ImagePull : []ComplianceItem {},
1877+ }
1878+
1879+ if len (namespaces ) == 0 {
1880+ return results
1881+ }
1882+
1883+ log .Debug ().
1884+ Str ("component" , "k8sHealth" ).
1885+ Str ("operation" , "collect_namespace_compliance" ).
1886+ Strs ("namespaces" , namespaces ).
1887+ Msg ("Starting namespace compliance checks" )
1888+
1889+ if clientset == nil {
1890+ log .Error ().
1891+ Str ("component" , "k8sHealth" ).
1892+ Str ("operation" , "collect_namespace_compliance" ).
1893+ Msg ("Kubernetes clientset not initialized" )
1894+ return results
1895+ }
1896+
1897+ // Count total worker nodes for replica compliance
1898+ workerCount := 0
1899+ nodes , err := clientset .CoreV1 ().Nodes ().List (context .TODO (), metav1.ListOptions {})
1900+ if err != nil {
1901+ log .Error ().
1902+ Str ("component" , "k8sHealth" ).
1903+ Str ("operation" , "collect_namespace_compliance" ).
1904+ Err (err ).
1905+ Msg ("Error listing nodes for replica compliance" )
1906+ } else {
1907+ for _ , node := range nodes .Items {
1908+ if ! isMaster (node ) {
1909+ workerCount ++
1910+ }
1911+ }
1912+ }
1913+
1914+ for _ , ns := range namespaces {
1915+ // Check Deployments
1916+ deployments , err := clientset .AppsV1 ().Deployments (ns ).List (context .TODO (), metav1.ListOptions {})
1917+ if err != nil {
1918+ log .Error ().
1919+ Str ("component" , "k8sHealth" ).
1920+ Str ("operation" , "collect_namespace_compliance" ).
1921+ Str ("namespace" , ns ).
1922+ Err (err ).
1923+ Msg ("Error listing deployments" )
1924+ alarmCheckDown ("k8s_compliance_list_deployments_" + ns , fmt .Sprintf ("Error listing deployments in namespace %s: %v" , ns , err ), false , "" , "" )
1925+ continue
1926+ }
1927+
1928+ for _ , deploy := range deployments .Items {
1929+ // Update checkTopologySpreadConstraints to return item
1930+ if item := checkTopologySpreadConstraints (& deploy ); item != nil {
1931+ results .TopologySkew = append (results .TopologySkew , * item )
1932+ }
1933+ // Update checkReplicaCount to return item, passing workerCount
1934+ if item := checkReplicaCount (& deploy , workerCount ); item != nil {
1935+ results .ReplicaCount = append (results .ReplicaCount , * item )
1936+ }
1937+ // Update checkImagePullPolicy to return items
1938+ if items := checkImagePullPolicy (& deploy .Spec .Template .Spec , "deployment" , deploy .Namespace , deploy .Name ); len (items ) > 0 {
1939+ results .ImagePull = append (results .ImagePull , items ... )
1940+ }
1941+ }
1942+ }
1943+ return results
1944+ }
1945+
1946+ // Helper to determine if a node is a master/control-plane
1947+ func isMaster (node v1.Node ) bool {
1948+ if _ , ok := node .Labels ["node-role.kubernetes.io/master" ]; ok {
1949+ return true
1950+ }
1951+ if _ , ok := node .Labels ["node-role.kubernetes.io/control-plane" ]; ok {
1952+ return true
1953+ }
1954+ if val , ok := node .Labels ["kubernetes.io/role" ]; ok && val == "master" {
1955+ return true
1956+ }
1957+ return false
1958+ }
1959+
1960+ // checkTopologySpreadConstraints verifies if the deployment has the correct topology spread constraints.
1961+ func checkTopologySpreadConstraints (deploy * appsv1.Deployment ) * ComplianceItem {
1962+ // Check for ignore label
1963+ if val , ok := deploy .Labels ["monokit.policy/topology-skew-ignored" ]; ok && val == "true" {
1964+ return nil // Ignored
1965+ }
1966+
1967+ alarmKey := fmt .Sprintf ("deployment_%s_%s_topology_skew" , deploy .Namespace , deploy .Name )
1968+ resourceName := fmt .Sprintf ("%s/%s" , deploy .Namespace , deploy .Name )
1969+
1970+ TopologySpreadConstraints := deploy .Spec .Template .Spec .TopologySpreadConstraints
1971+ hasHostnameConstraint := false
1972+ isCorrect := true
1973+ var paramErrors []string
1974+
1975+ for _ , constraint := range TopologySpreadConstraints {
1976+ if constraint .TopologyKey == "kubernetes.io/hostname" {
1977+ hasHostnameConstraint = true
1978+ if constraint .MaxSkew != 1 {
1979+ isCorrect = false
1980+ paramErrors = append (paramErrors , fmt .Sprintf ("maxSkew is %d (expected 1)" , constraint .MaxSkew ))
1981+ }
1982+ if constraint .WhenUnsatisfiable != v1 .DoNotSchedule {
1983+ isCorrect = false
1984+ paramErrors = append (paramErrors , fmt .Sprintf ("whenUnsatisfiable is %s (expected DoNotSchedule)" , constraint .WhenUnsatisfiable ))
1985+ }
1986+ // Check logic for LabelSelector match (simplified: just checking if it exists)
1987+ if constraint .LabelSelector == nil {
1988+ isCorrect = false
1989+ paramErrors = append (paramErrors , "labelSelector is missing" )
1990+ }
1991+ break
1992+ }
1993+ }
1994+
1995+ if ! hasHostnameConstraint {
1996+ alarmCheckDown (alarmKey , fmt .Sprintf ("Deployment '%s' in namespace '%s' missing topologySpreadConstraints for kubernetes.io/hostname." , deploy .Name , deploy .Namespace ), false , "" , "" )
1997+ return & ComplianceItem {Resource : resourceName , Status : false , Message : "Missing topologySpreadConstraints for kubernetes.io/hostname" }
1998+ } else if ! isCorrect {
1999+ msg := fmt .Sprintf ("Deployment '%s' in namespace '%s' has invalid topologySpreadConstraints: %s" , deploy .Name , deploy .Namespace , strings .Join (paramErrors , ", " ))
2000+ alarmCheckDown (alarmKey , msg , false , "" , "" )
2001+ return & ComplianceItem {Resource : resourceName , Status : false , Message : fmt .Sprintf ("Invalid topologySpreadConstraints: %s" , strings .Join (paramErrors , ", " ))}
2002+ } else {
2003+ alarmCheckUp (alarmKey , fmt .Sprintf ("Deployment '%s' in namespace '%s' has correct topologySpreadConstraints." , deploy .Name , deploy .Namespace ), false )
2004+ return & ComplianceItem {Resource : resourceName , Status : true , Message : "Correct topologySpreadConstraints" }
2005+ }
2006+ }
2007+
2008+ // checkReplicaCount verifies if the deployment's replica count matches the total number of worker nodes.
2009+ func checkReplicaCount (deploy * appsv1.Deployment , workerCount int ) * ComplianceItem {
2010+ // Check for ignore label
2011+ if val , ok := deploy .Labels ["monokit.policy/replica-count-ignored" ]; ok && val == "true" {
2012+ return nil // Ignored
2013+ }
2014+
2015+ alarmKey := fmt .Sprintf ("deployment_%s_%s_replica_count" , deploy .Namespace , deploy .Name )
2016+ resourceName := fmt .Sprintf ("%s/%s" , deploy .Namespace , deploy .Name )
2017+
2018+ if deploy .Spec .Replicas == nil {
2019+ return nil // Should not happen usually
2020+ }
2021+ specReplicas := int (* deploy .Spec .Replicas )
2022+
2023+ // Policy Check: Spec Replicas must equal Total Worker Count
2024+ if specReplicas != workerCount {
2025+ msg := fmt .Sprintf ("Deployment '%s' in namespace '%s' replica mismatch: spec=%d, expected=%d (Total Workers)" , deploy .Name , deploy .Namespace , specReplicas , workerCount )
2026+ alarmCheckDown (alarmKey , msg , false , "" , "" )
2027+ return & ComplianceItem {Resource : resourceName , Status : false , Message : fmt .Sprintf ("Mismatch: spec=%d, expected=%d" , specReplicas , workerCount )}
2028+ } else {
2029+ alarmCheckUp (alarmKey , fmt .Sprintf ("Deployment '%s' in namespace '%s' replica count matches worker count (%d)." , deploy .Name , deploy .Namespace , workerCount ), false )
2030+ return & ComplianceItem {Resource : resourceName , Status : true , Message : fmt .Sprintf ("Match: %d" , workerCount )}
2031+ }
2032+ }
2033+
2034+ // checkImagePullPolicy verifies if containers use imagePullPolicy: IfNotPresent
2035+ func checkImagePullPolicy (podSpec * v1.PodSpec , kind , namespace , name string ) []ComplianceItem {
2036+ var items []ComplianceItem
2037+ alarmKeyBase := fmt .Sprintf ("%s_%s_%s_image_pull" , kind , namespace , name )
2038+ resourceName := fmt .Sprintf ("%s/%s" , namespace , name )
2039+
2040+ for _ , container := range podSpec .Containers {
2041+ containerName := container .Name
2042+ alarmKey := fmt .Sprintf ("%s_%s" , alarmKeyBase , containerName )
2043+
2044+ if container .ImagePullPolicy != v1 .PullIfNotPresent {
2045+ msg := fmt .Sprintf ("%s '%s' container '%s' ImagePullPolicy is '%s' (expected IfNotPresent)." , strings .Title (kind ), name , containerName , container .ImagePullPolicy )
2046+ alarmCheckDown (alarmKey , msg , false , "" , "" )
2047+ items = append (items , ComplianceItem {
2048+ Resource : fmt .Sprintf ("%s [%s]" , resourceName , containerName ),
2049+ Status : false ,
2050+ Message : fmt .Sprintf ("Policy is %s (expected IfNotPresent)" , container .ImagePullPolicy ),
2051+ })
2052+ } else {
2053+ alarmCheckUp (alarmKey , fmt .Sprintf ("%s '%s' container '%s' has correct ImagePullPolicy." , strings .Title (kind ), name , containerName ), false )
2054+ items = append (items , ComplianceItem {
2055+ Resource : fmt .Sprintf ("%s [%s]" , resourceName , containerName ),
2056+ Status : true ,
2057+ Message : "Correct ImagePullPolicy" ,
2058+ })
2059+ }
2060+ }
2061+ return items
2062+ }
2063+
2064+ // CollectMasterTaintCompliance checks if master nodes have the correct taints.
2065+ func CollectMasterTaintCompliance (client kubernetes.Interface ) []ComplianceItem {
2066+ var results []ComplianceItem
2067+
2068+ if client == nil {
2069+ return results
2070+ }
2071+
2072+ nodes , err := client .CoreV1 ().Nodes ().List (context .TODO (), metav1.ListOptions {})
2073+ if err != nil {
2074+ log .Error ().Err (err ).Msg ("Error listing nodes for taint compliance" )
2075+ return results
2076+ }
2077+
2078+ for _ , node := range nodes .Items {
2079+ isMaster := false
2080+ if _ , ok := node .Labels ["node-role.kubernetes.io/master" ]; ok {
2081+ isMaster = true
2082+ } else if _ , ok := node .Labels ["node-role.kubernetes.io/control-plane" ]; ok {
2083+ isMaster = true
2084+ }
2085+
2086+ if ! isMaster {
2087+ continue
2088+ }
2089+
2090+ alarmKey := fmt .Sprintf ("node_%s_master_taint" , node .Name )
2091+ hasTaint := false
2092+ for _ , taint := range node .Spec .Taints {
2093+ if taint .Key == "node-role.kubernetes.io/master" && taint .Effect == v1 .TaintEffectNoSchedule {
2094+ hasTaint = true
2095+ break
2096+ }
2097+ // Also check control-plane taint which is standard in newer k8s
2098+ if taint .Key == "node-role.kubernetes.io/control-plane" && taint .Effect == v1 .TaintEffectNoSchedule {
2099+ hasTaint = true
2100+ break
2101+ }
2102+ }
2103+
2104+ if hasTaint {
2105+ alarmCheckUp (alarmKey , fmt .Sprintf ("Master node '%s' has correct NoSchedule taint." , node .Name ), false )
2106+ results = append (results , ComplianceItem {
2107+ Resource : node .Name ,
2108+ Status : true ,
2109+ Message : "Correct NoSchedule taint found" ,
2110+ })
2111+ } else {
2112+ alarmCheckDown (alarmKey , fmt .Sprintf ("Master node '%s' missing NoSchedule taint (node-role.kubernetes.io/master:NoSchedule)." , node .Name ), false , "" , "" )
2113+ results = append (results , ComplianceItem {
2114+ Resource : node .Name ,
2115+ Status : false ,
2116+ Message : "Missing NoSchedule taint" ,
2117+ })
2118+ }
2119+ }
2120+ return results
2121+ }
0 commit comments