Skip to content

Commit 8c987d1

Browse files
authored
Log a clear error message when starting if the operator fails to connect to the Kubernetes API server (#817)
1 parent 750dbbc commit 8c987d1

File tree

6 files changed

+85
-9
lines changed

6 files changed

+85
-9
lines changed

Makefile

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,9 @@ GITHUB_REGISTRY := ghcr.io/oracle
5959
OPERATOR_IMAGE_NAME := coherence-operator
6060
OPERATOR_IMAGE_REGISTRY ?= $(ORACLE_REGISTRY)
6161
OPERATOR_IMAGE_TAG_SUFFIX ?=
62-
OPERATOR_IMAGE_TAG := $(VERSION)$(OPERATOR_IMAGE_TAG_SUFFIX)
63-
OPERATOR_IMAGE_TAG_ARM := $(VERSION)-arm64$(OPERATOR_IMAGE_TAG_SUFFIX)
64-
OPERATOR_IMAGE_TAG_AMD := $(VERSION)-amd64$(OPERATOR_IMAGE_TAG_SUFFIX)
62+
OPERATOR_IMAGE_TAG ?= $(VERSION)$(OPERATOR_IMAGE_TAG_SUFFIX)
63+
OPERATOR_IMAGE_TAG_ARM ?= $(VERSION)-arm64$(OPERATOR_IMAGE_TAG_SUFFIX)
64+
OPERATOR_IMAGE_TAG_AMD ?= $(VERSION)-amd64$(OPERATOR_IMAGE_TAG_SUFFIX)
6565
OPERATOR_IMAGE := $(OPERATOR_IMAGE_REGISTRY)/$(OPERATOR_IMAGE_NAME):$(OPERATOR_IMAGE_TAG)
6666
OPERATOR_IMAGE_ARM := $(OPERATOR_IMAGE_REGISTRY)/$(OPERATOR_IMAGE_NAME):$(OPERATOR_IMAGE_TAG_ARM)
6767
OPERATOR_IMAGE_AMD := $(OPERATOR_IMAGE_REGISTRY)/$(OPERATOR_IMAGE_NAME):$(OPERATOR_IMAGE_TAG_AMD)
@@ -1945,7 +1945,7 @@ endif
19451945

19461946

19471947
.PHONY: just-deploy
1948-
just-deploy: ensure-pull-secret ## Deploy the Coherence Operator without rebuilding anything
1948+
just-deploy: $(TOOLS_BIN)/kustomize ensure-pull-secret ## Deploy the Coherence Operator without rebuilding anything
19491949
$(call prepare_deploy,$(OPERATOR_IMAGE),$(OPERATOR_NAMESPACE))
19501950
ifeq ("$(OPERATOR_IMAGE_REGISTRY)","$(ORACLE_REGISTRY)")
19511951
$(KUSTOMIZE) build $(BUILD_DEPLOY)/default | $(KUBECTL_CMD) apply -f -
@@ -2709,7 +2709,10 @@ test-examples: build-examples
27092709
PUSH_ARGS ?=
27102710

27112711
.PHONY: push-operator-image
2712-
push-operator-image: $(BUILD_TARGETS)/build-operator
2712+
push-operator-image: $(BUILD_TARGETS)/build-operator just-push-operator-image
2713+
2714+
.PHONY: just-push-operator-image
2715+
just-push-operator-image:
27132716
ifneq ("$(OPERATOR_RELEASE_REGISTRY)","$(OPERATOR_IMAGE_REGISTRY)")
27142717
$(DOCKER_CMD) tag $(OPERATOR_IMAGE_ARM) $(OPERATOR_RELEASE_ARM)
27152718
$(DOCKER_CMD) tag $(OPERATOR_IMAGE_AMD) $(OPERATOR_RELEASE_AMD)

docs/troubleshooting/01_trouble-shooting.adoc

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,23 @@ This page will be updated and maintained over time to include common issues we s
3939
4040
== Issues
4141
42+
[#start-timeout]
43+
=== The Operator fails to start with a timeout error
44+
45+
One of the first things the operator does on start-up is to log the Kubernetes server version.
46+
This is done by making a request to the API server with a default timeout of one minute.
47+
If the network policies, firewall rules, or RBAC are blocking the operator from connecting to the API server this
48+
request will time out with an error like the one below:
49+
50+
[source]
51+
----
52+
2025-10-16T09:05:29Z INFO setup ERROR: failed to get Kubernetes server version {"Host": "https://10.96.0.1:443", "Error": "Get \"https://10.96.0.1:443/version?timeout=32s\": dial tcp 10.96.0.1:443: i/o timeout"}
53+
Error: unable to get kubernetes server version: Get "https://10.96.0.1:443/version?timeout=32s": dial tcp 10.96.0.1:443: i/o timeout
54+
----
55+
56+
The error message in the operator's logs will include the host name and port that the operator attempted to use,
57+
so this can be used to then ensure egress is allowed to the correct host and port.
58+
4259
[#no-operator]
4360
=== I Uninstalled the Operator and Cannot Delete the Coherence Clusters
4461
@@ -97,6 +114,9 @@ not cleanly shut down and will then not be able to be restarted using the persis
97114
The readiness/liveness probe used by the Operator in the Coherence Pods checks a number of things to determine whether the Pod is ready, one of these is whether the JVM is a cluster member.
98115
If your application uses a custom main class and is not properly bootstrapping Coherence then the Pod will not be ready until your application code actually touches a Coherence resource causing Coherence to start and join the cluster.
99116
117+
If you have overridden the configuration for the readiness or liveness probes for the Coherence cluster with custom endpoints,
118+
then you need to debug your own code.
119+
100120
When running in clusters with the Operator using custom main classes it is advisable to properly bootstrap Coherence
101121
from within your `main` method. This can be done using the new Coherence bootstrap API available from CE release 20.12
102122
or by calling `com.tangosol.net.DefaultCacheServer.startServerDaemon().waitForServiceStart();`

examples/095_network_policies/README.adoc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,17 @@ Configuring access to the API server is not as straight forward as other network
148148
The reason for this is that there is no Pod available with labels that can be used in the configuration,
149149
instead, the IP address of the API server itself must be used.
150150
151+
[NOTE]
152+
====
153+
If the operator cannot connect to the API server it will fail to start.
154+
One of the first things the operator does on start-up is to log the Kubernetes version that the server is running.
155+
This is done by making a request to the API server with a default timeout of one minute.
156+
If the network policies are blocking the operator from connecting to the API server this request will time out.
157+
158+
The error message in the operator's logs will include the host name and port that the operator attempted to use,
159+
so this can be used to then ensure egress is allowed to the correct host and port.
160+
====
161+
151162
There are various methods to find the IP address of the API server.
152163
The exact method required may vary depending on the type of Kubernetes cluster being used, for example a simple
153164
development cluster running in KinD on a laptop may differ from a cluster running in a cloud provider's infrastructure.

hack/buildah/run-buildah.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,6 @@ else
137137
-e HTTP_PROXY="${HTTP_PROXY}" -e HTTPS_PROXY="${HTTPS_PROXY}" -e NO_PROXY="${NO_PROXY}" \
138138
-e http_proxy="${http_proxy}" -e https_proxy="${https_proxy}" -e no_proxy="${no_proxy}" \
139139
--name buildah \
140-
quay.io/buildah/stable:v1.37.1 "${SCRIPT_NAME}"
140+
quay.io/buildah/stable:v1.41.5 "${SCRIPT_NAME}"
141141
fi
142142

pkg/operator/operator.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ const (
6565
FlagOperatorImage = "operator-image"
6666
FlagEnvVar = "env"
6767
FlagJvmArg = "jvm"
68+
FlagKubernetesCheckTimeout = "kubernetes-check-timeout"
6869

6970
// EnvVarWatchNamespace is the environment variable to use to set the watch namespace(s)
7071
EnvVarWatchNamespace = "WATCH_NAMESPACE"
@@ -85,6 +86,11 @@ const (
8586
LabelTestHostName = "coherence.oracle.com/test_hostname"
8687
// LabelTestHealthPort is a label applied to Pods to set a testing health check port
8788
LabelTestHealthPort = "coherence.oracle.com/test_health_port"
89+
90+
// DefaultKubernetesCheckTimeout is the default timeout applied to the initial Kubernetes API connection check.
91+
DefaultKubernetesCheckTimeout = time.Minute
92+
// MinKubernetesCheckTimeout is the minimum timeout applied to the initial Kubernetes API connection check.
93+
MinKubernetesCheckTimeout = 10 * time.Second
8894
)
8995

9096
var setupLog = ctrl.Log.WithName("setup")
@@ -245,6 +251,11 @@ func SetupFlags(cmd *cobra.Command, v *viper.Viper) {
245251
time.Second*20,
246252
"The duration the Operator uses for the leadership lease renewal timeout. "+
247253
"If the value entered is less than 10s, then 10s will be used")
254+
cmd.Flags().Duration(
255+
FlagKubernetesCheckTimeout,
256+
DefaultKubernetesCheckTimeout,
257+
"The duration the Operator uses for the initial Kubernetes API connection check timeout. "+
258+
"If the value entered is less than 60s, then 60s will be used")
248259

249260
// enable using dashed notation in flags and underscores in env
250261
v.SetEnvKeyReplacer(strings.NewReplacer("-", "_"))

pkg/runner/cmd_operator.go

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ package runner
99
import (
1010
"context"
1111
"crypto/tls"
12+
"encoding/json"
1213
"fmt"
1314
"net/http"
1415
"os"
@@ -25,6 +26,7 @@ import (
2526
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2627
apiruntime "k8s.io/apimachinery/pkg/runtime"
2728
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
29+
"k8s.io/apimachinery/pkg/version"
2830
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
2931
rest2 "k8s.io/client-go/rest"
3032
"k8s.io/utils/ptr"
@@ -79,8 +81,8 @@ func operatorCommand(v *viper.Viper) *cobra.Command {
7981
func execute(v *viper.Viper) error {
8082
ctrl.SetLogger(zap.New(zap.UseDevMode(true)))
8183

82-
setupLog.Info(fmt.Sprintf("Operator Coherence Image: %s", operator.GetDefaultCoherenceImage()))
8384
setupLog.Info(fmt.Sprintf("Operator Image: %s", operator.GetDefaultOperatorImage()))
85+
setupLog.Info(fmt.Sprintf("Default Coherence Image (only used when no image is specified for a Coherence resource): %s", operator.GetDefaultCoherenceImage()))
8486

8587
// if the enable-http2 flag is false (the default), http/2 should be disabled
8688
// due to its vulnerabilities. More specifically, disabling http/2 will
@@ -118,12 +120,18 @@ func execute(v *viper.Viper) error {
118120
if err != nil {
119121
return errors.Wrap(err, "unable to create client set")
120122
}
123+
setupLog.Info("Successfully created kubernetes client", "Host", cfg.Host)
121124

122-
version, err := cs.DiscoveryClient.ServerVersion()
125+
// Get and display the k8s version of the server.
126+
// This will also verify that we can actually talk to the k8s API server.
127+
// For example, incorrectly configured network policies or RBAC rules can prevent us from talking to the server.
128+
sv, err := getServerVersion(cs, v)
123129
if err != nil {
130+
setupLog.Info("ERROR: failed to get the Kubernetes server version. This could be cause by misconfigured network policies, RBAC rules or firewalls, etc.",
131+
"Host", cfg.Host, "Error", err.Error())
124132
return errors.Wrap(err, "unable to get kubernetes server version")
125133
}
126-
setupLog.Info("Kubernetes server version", "Major", version.Major, "Minor", version.Minor, "Platform", version.Platform)
134+
setupLog.Info("Kubernetes server version", "Major", sv.Major, "Minor", sv.Minor, "Platform", sv.Platform, "Host", cfg.Host)
127135

128136
// The Operator web-hook server has been removed, so we need to delete any existing web-hooks
129137
setupLog.Info("Ensuring any existing webhook configurations are removed")
@@ -286,3 +294,26 @@ func execute(v *viper.Viper) error {
286294

287295
return nil
288296
}
297+
298+
// getServerVersion fetches the Kubernetes server version using the provided ClientSet and returns it as a version.Info struct.
299+
// It uses the discovery client's REST client to send a GET request to the "/version" endpoint and unmarshals the raw JSON response into version.Info.
300+
// The request is bounded by the timeout configured via operator.FlagKubernetesCheckTimeout (default one minute; can be overridden, e.g. by setting the environment variable KUBERNETES_CHECK_TIMEOUT); values below operator.MinKubernetesCheckTimeout are raised to that minimum.
301+
// Returns an error if the request fails or if the JSON response cannot be parsed.
302+
func getServerVersion(cs clients.ClientSet, v *viper.Viper) (*version.Info, error) {
303+
timeout := v.GetDuration(operator.FlagKubernetesCheckTimeout)
304+
if timeout < operator.MinKubernetesCheckTimeout {
305+
timeout = operator.MinKubernetesCheckTimeout
306+
}
307+
ctx, cancel := context.WithTimeout(context.Background(), timeout)
308+
defer cancel()
309+
body, err := cs.DiscoveryClient.RESTClient().Get().AbsPath("/version").Do(ctx).Raw()
310+
if err != nil {
311+
return nil, err
312+
}
313+
var info version.Info
314+
err = json.Unmarshal(body, &info)
315+
if err != nil {
316+
return nil, fmt.Errorf("unable to parse the server version: %v", err)
317+
}
318+
return &info, nil
319+
}

0 commit comments

Comments
 (0)