diff --git a/aks-flex-node-sudoers b/aks-flex-node-sudoers index 50b350b..834ae50 100644 --- a/aks-flex-node-sudoers +++ b/aks-flex-node-sudoers @@ -112,9 +112,9 @@ aks-flex-node ALL=(root) NOPASSWD:SETENV: /sbin/ip addr aks-flex-node ALL=(root) NOPASSWD:SETENV: /bin/netstat -rn # Read-only Kubernetes API check for node readiness (used by status collector) -# This is intentionally limited to 'get node' with the kubelet kubeconfig. -aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/kubectl --kubeconfig /var/lib/kubelet/kubeconfig get node * -aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/local/bin/kubectl --kubeconfig /var/lib/kubelet/kubeconfig get node * +# This is intentionally limited to 'get' with the kubelet kubeconfig. +aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/bin/kubectl --kubeconfig /var/lib/kubelet/kubeconfig get * +aks-flex-node ALL=(root) NOPASSWD:SETENV: /usr/local/bin/kubectl --kubeconfig /var/lib/kubelet/kubeconfig get * # Note: Arc agent (azcmagent) is managed by install.sh and should not be removed during unbootstrap # Unbootstrap only cleans up what AKS Flex Node created, not the underlying Arc installation diff --git a/commands.go b/commands.go index f76dda7..290bb39 100644 --- a/commands.go +++ b/commands.go @@ -6,15 +6,18 @@ import ( "fmt" "os" "path/filepath" + "strings" "time" "github.com/sirupsen/logrus" "github.com/spf13/cobra" "go.goms.io/aks/AKSFlexNode/pkg/bootstrapper" + "go.goms.io/aks/AKSFlexNode/pkg/components/kubelet" "go.goms.io/aks/AKSFlexNode/pkg/config" "go.goms.io/aks/AKSFlexNode/pkg/logger" "go.goms.io/aks/AKSFlexNode/pkg/status" + "go.goms.io/aks/AKSFlexNode/pkg/utils" ) // Version information variables (set at build time) @@ -173,6 +176,8 @@ func runDaemonLoop(ctx context.Context, cfg *config.Config) error { } else { logger.Infof("Bootstrap health check completed at %s", time.Now().Format("2006-01-02 15:04:05")) } + + checkAndReboot(ctx) } } } @@ -276,3 +281,21 @@ func handleExecutionResult(result *bootstrapper.ExecutionResult, operation strin // For bootstrap, return error on failure return fmt.Errorf("%s failed: %s", operation, result.Error) } + +// checkAndReboot is a PoC function, just demonstrate the node agent's remediation capability. +func checkAndReboot(ctx context.Context) { + logger := logger.GetLoggerFromContext(ctx) + + args := []string{"--kubeconfig", kubelet.KubeletKubeconfigPath, + "get", "events", "--sort-by=.lastTimestamp"} + + output, err := utils.RunCommandWithOutput("kubectl", args...) + if err != nil { + logger.Errorf("Failed to get last event: %s", err) + return + } + + if strings.Contains(output, "kernel NULL pointer") { + logger.Warn("Node has kernel NULL pointer error, initiating reboot...") + } +}