Skip to content

Commit 0d70dc1

Browse files
authored
Make the Operator deployment HA by default (#535)
1 parent bde6053 commit 0d70dc1

File tree

22 files changed

+301
-94
lines changed

22 files changed

+301
-94
lines changed

.github/workflows/k8s-matrix.yaml

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,13 @@ jobs:
3535
fail-fast: false
3636
matrix:
3737
k8s:
38-
- kindest/node:v1.23.4@sha256:0e34f0d0fd448aa2f2819cfd74e99fe5793a6e4938b328f657c8e3f81ee0dfb9
39-
- kindest/node:v1.22.7@sha256:1dfd72d193bf7da64765fd2f2898f78663b9ba366c2aa74be1fd7498a1873166
40-
- kindest/node:v1.21.10@sha256:84709f09756ba4f863769bdcabe5edafc2ada72d3c8c44d6515fc581b66b029c
41-
- kindest/node:v1.20.15@sha256:393bb9096c6c4d723bb17bceb0896407d7db581532d11ea2839c80b28e5d8deb
42-
- kindest/node:v1.19.16@sha256:81f552397c1e6c1f293f967ecb1344d8857613fb978f963c30e907c32f598467
43-
- kindest/node:v1.18.20@sha256:e3dca5e16116d11363e31639640042a9b1bd2c90f85717a7fc66be34089a8169
44-
- kindest/node:v1.17.17@sha256:e477ee64df5731aa4ef4deabbafc34e8d9a686b49178f726563598344a3898d5
45-
- kindest/node:v1.16.15@sha256:64bac16b83b6adfd04ea3fbcf6c9b5b893277120f2b2cbf9f5fa3e5d4c2260cc
38+
- kindest/node:v1.24.0@sha256:0866296e693efe1fed79d5e6c7af8df71fc73ae45e3679af05342239cdc5bc8e
39+
- kindest/node:v1.23.6@sha256:b1fa224cc6c7ff32455e0b1fd9cbfd3d3bc87ecaa8fcb06961ed1afb3db0f9ae
40+
- kindest/node:v1.22.9@sha256:8135260b959dfe320206eb36b3aeda9cffcb262f4b44cda6b33f7bb73f453105
41+
- kindest/node:v1.21.12@sha256:f316b33dd88f8196379f38feb80545ef3ed44d9197dca1bfd48bcb1583210207
42+
- kindest/node:v1.20.15@sha256:6f2d011dffe182bad80b85f6c00e8ca9d86b5b8922cdf433d53575c4c5212248
43+
- kindest/node:v1.19.16@sha256:d9c819e8668de8d5030708e484a9fdff44d95ec4675d136ef0a0a584e587f65c
44+
- kindest/node:v1.18.20@sha256:738cdc23ed4be6cc0b7ea277a2ebcc454c8373d7d8fb991a7fcdbd126188e6d7
4645

4746
steps:
4847
- uses: actions/checkout@v2
@@ -104,7 +103,7 @@ jobs:
104103
- name: Start KinD Cluster
105104
shell: bash
106105
run: |
107-
sh ./hack/kind.sh --image ${{ matrix.k8s }}
106+
make kind KIND_IMAGE=${{ matrix.k8s }}
108107
kubectl version
109108
kubectl get nodes
110109
docker pull gcr.io/distroless/java

Makefile

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,7 @@ override BUILD_MANIFESTS := $(BUILD_OUTPUT)/manifests
236236
override BUILD_MANIFESTS_PKG := $(BUILD_OUTPUT)/coherence-operator-manifests.tar.gz
237237
override BUILD_PROPS := $(BUILD_OUTPUT)/build.properties
238238
override BUILD_TARGETS := $(BUILD_OUTPUT)/targets
239+
override SCRIPTS_DIR := $(CURRDIR)/hack
239240
override TEST_LOGS_DIR := $(BUILD_OUTPUT)/test-logs
240241
override TANZU_DIR := $(BUILD_OUTPUT)/tanzu
241242
override TANZU_PACKAGE_DIR := $(BUILD_OUTPUT)/tanzu/package
@@ -521,7 +522,7 @@ $(BUILD_BIN)/manager: $(BUILD_PROPS) $(GOS) $(BUILD_TARGETS)/generate $(BUILD_TA
521522
.PHONY: ensure-sdk
522523
ensure-sdk:
523524
@echo "Ensuring Operator SDK is present at version $(OPERATOR_SDK_VERSION)"
524-
./hack/ensure-sdk.sh $(OPERATOR_SDK_VERSION) $(OPERATOR_SDK_HOME)
525+
$(SCRIPTS_DIR)/ensure-sdk.sh $(OPERATOR_SDK_VERSION) $(OPERATOR_SDK_HOME)
525526

526527
# ----------------------------------------------------------------------------------------------------------------------
527528
# Internal make step that builds the Operator runner artifacts utility
@@ -647,7 +648,7 @@ docs/about/04_coherence_spec.adoc: $(API_GO_FILES)
647648
# ----------------------------------------------------------------------------------------------------------------------
648649
$(BUILD_OUTPUT)/certs:
649650
@echo "Generating test keys and certs"
650-
./hack/keys.sh
651+
$(SCRIPTS_DIR)/keys.sh
651652

652653
# ----------------------------------------------------------------------------------------------------------------------
653654
# Executes the code review targets.
@@ -779,7 +780,7 @@ run-debug-clean: reset-namespace run-debug ## run the Operator locally with Delv
779780
# ----------------------------------------------------------------------------------------------------------------------
780781
.PHONY: stop
781782
stop: ## kill any locally running operator process
782-
./hack/kill-local.sh
783+
$(SCRIPTS_DIR)/kill-local.sh
783784

784785
# ======================================================================================================================
785786
# Targets related to Operator Lifecycle Manager and the Operator SDK
@@ -1244,15 +1245,16 @@ uninstall-crds: $(BUILD_TARGETS)/manifests ## Uninstall the CRDs
12441245
.PHONY: deploy-and-wait
12451246
deploy-and-wait: deploy wait-for-deploy ## Deploy the Coherence Operator and wait for the Operator Pod to be ready
12461247

1247-
OPERATOR_HA ?= false
1248+
# The Operator is deployed HA by default
1249+
OPERATOR_HA ?= true
12481250

12491251
.PHONY: deploy
12501252
deploy: prepare-deploy create-namespace kustomize ## Deploy the Coherence Operator
12511253
ifneq (,$(WATCH_NAMESPACE))
12521254
cd $(BUILD_DEPLOY)/manager && $(KUSTOMIZE) edit add configmap env-vars --from-literal WATCH_NAMESPACE=$(WATCH_NAMESPACE)
12531255
endif
1254-
ifeq (true,$(OPERATOR_HA))
1255-
cd $(BUILD_DEPLOY)/manager && $(KUSTOMIZE) edit add patch --kind Deployment --name controller-manager --path ha-patch.yaml
1256+
ifeq (false,$(OPERATOR_HA))
1257+
cd $(BUILD_DEPLOY)/manager && $(KUSTOMIZE) edit add patch --kind Deployment --name controller-manager --path single-replica-patch.yaml
12561258
endif
12571259
kubectl -n $(OPERATOR_NAMESPACE) create secret generic coherence-webhook-server-cert || true
12581260
$(KUSTOMIZE) build $(BUILD_DEPLOY)/default | kubectl apply -f -
@@ -1466,26 +1468,25 @@ KIND_IMAGE ?= "kindest/node:v1.24.0@sha256:0866296e693efe1fed79d5e6c7af8df71fc
14661468
# ----------------------------------------------------------------------------------------------------------------------
14671469
.PHONY: kind
14681470
kind: ## Run a default KinD cluster
1469-
./hack/kind.sh --wait 10m --image $(KIND_IMAGE)
1470-
./hack/kind-label-node.sh
1471+
kind create cluster --name $(KIND_CLUSTER) --wait 10m --config $(SCRIPTS_DIR)/kind-config.yaml --image $(KIND_IMAGE)
1472+
$(SCRIPTS_DIR)/kind-label-node.sh
14711473

14721474
# ----------------------------------------------------------------------------------------------------------------------
14731475
# Start a Kind cluster
14741476
# ----------------------------------------------------------------------------------------------------------------------
14751477
.PHONY: kind-single-worker
1476-
kind-single-worker: export KIND_CONFIG=./hack/kind-config-single.yaml
14771478
kind-single-worker: ## Run a KinD cluster with a single worker node
1478-
./hack/kind.sh --wait 10m --image $(KIND_IMAGE)
1479-
./hack/kind-label-node.sh
1479+
kind create cluster --name $(KIND_CLUSTER) --wait 10m --config $(SCRIPTS_DIR)/kind-config-single.yaml --image $(KIND_IMAGE)
1480+
$(SCRIPTS_DIR)/kind-label-node.sh
14801481

14811482
# ----------------------------------------------------------------------------------------------------------------------
14821483
# Start a Kind cluster with Calico
14831484
# ----------------------------------------------------------------------------------------------------------------------
14841485
.PHONY: kind-calico
1485-
kind-calico: export KIND_CONFIG=./hack/kind-config-calico.yaml
1486+
kind-calico: export KIND_CONFIG=$(SCRIPTS_DIR)/kind-config-calico.yaml
14861487
kind-calico: ## Run a KinD cluster with Calico
1487-
./hack/kind.sh --image $(KIND_IMAGE)
1488-
./hack/kind-label-node.sh
1488+
kind create cluster --name $(KIND_CLUSTER) --wait 10m --config $(SCRIPTS_DIR)/kind-config-calico.yaml --image $(KIND_IMAGE)
1489+
$(SCRIPTS_DIR)/kind-label-node.sh
14891490
curl -sL https://docs.projectcalico.org/manifests/calico.yaml | kubectl apply -f -
14901491
kubectl -n kube-system set env daemonset/calico-node FELIX_IGNORELOOSERPF=true
14911492
kubectl -n kube-system wait --for condition=ready --timeout=300s -l k8s-app=calico-node pod
@@ -1568,7 +1569,7 @@ uninstall-cert-manager: ## Uninstall Cert manager from the Kubernetes cluster
15681569
TANZU = $(shell which tanzu)
15691570
.PHONY: get-tanzu
15701571
get-tanzu: $(BUILD_PROPS)
1571-
./hack/get-tanzu.sh "$(TANZU_VERSION)" "$(TOOLS_DIRECTORY)"
1572+
$(SCRIPTS_DIR)/get-tanzu.sh "$(TANZU_VERSION)" "$(TOOLS_DIRECTORY)"
15721573

15731574
.PHONY: tanzu-create-cluster
15741575
tanzu-create-cluster: ## Create a local Tanzu unmanaged cluster named "$(KIND_CLUSTER)" (default "operator")
@@ -2083,7 +2084,7 @@ uninstall-istio: get-istio ## Uninstall Istio from k8s
20832084
# ----------------------------------------------------------------------------------------------------------------------
20842085
.PHONY: get-istio
20852086
get-istio: $(BUILD_PROPS)
2086-
./hack/get-istio-latest.sh "$(ISTIO_VERSION)" "$(TOOLS_DIRECTORY)"
2087+
$(SCRIPTS_DIR)/get-istio-latest.sh "$(ISTIO_VERSION)" "$(TOOLS_DIRECTORY)"
20872088
$(eval ISTIO_HOME := $(shell find $(TOOLS_DIRECTORY) -maxdepth 1 -type d | grep istio))
20882089
@echo "Istio installed at $(ISTIO_HOME)"
20892090

api/v1/zz_generated.deepcopy.go

Lines changed: 10 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/manager/manager.yaml

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ spec:
1414
selector:
1515
matchLabels:
1616
control-plane: coherence
17-
replicas: 1
17+
replicas: 3
1818
template:
1919
metadata:
2020
labels:
@@ -82,3 +82,24 @@ spec:
8282
defaultMode: 420
8383
secretName: coherence-webhook-server-cert
8484
optional: true
85+
affinity:
86+
podAntiAffinity:
87+
preferredDuringSchedulingIgnoredDuringExecution:
88+
- podAffinityTerm:
89+
topologyKey: "topology.kubernetes.io/zone"
90+
labelSelector:
91+
matchLabels:
92+
control-plane: coherence
93+
app.kubernetes.io/name: coherence-operator
94+
app.kubernetes.io/instance: coherence-operator-manager
95+
app.kubernetes.io/version: "3.2.7"
96+
weight: 50
97+
- podAffinityTerm:
98+
topologyKey: "oci.oraclecloud.com/fault-domain"
99+
labelSelector:
100+
matchLabels:
101+
control-plane: coherence
102+
app.kubernetes.io/name: coherence-operator
103+
app.kubernetes.io/instance: coherence-operator-manager
104+
app.kubernetes.io/version: "3.2.7"
105+
weight: 1
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
- op: replace
2+
path: /spec/replicas
3+
value: 3

config/manifests/bases/coherence-operator.clusterserviceversion.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ spec:
5858
- name: Coherence Operator
5959
url: https://oracle.github.io/coherence-operator/docs/latest
6060
maturity: alpha
61-
minKubeVersion: "1.16"
61+
minKubeVersion: "1.18"
6262
maintainers:
6363
- name: Jonathan Knight
6464

docs/installation/01_installation.adoc

Lines changed: 62 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,20 +17,33 @@ easily be installed into a Kubernetes cluster.
1717
The prerequisites apply to all installation methods.
1818
1919
* Access to Oracle Coherence Operator images.
20-
* Access to a Kubernetes v1.16.0+ cluster. The Operator test pipeline is run using Kubernetes versions v1.16 upto v1.23
20+
* Access to a Kubernetes v1.18.0+ cluster. The Operator test pipeline is run using Kubernetes versions v1.18 up to v1.24
2121
* A Coherence application image using Coherence version 12.2.1.3 or later. Note that some functionality (e.g. metrics) is only
2222
available in Coherence 12.2.1.4 and later.
2323
2424
NOTE: ARM Support: As of version 3.2.0, the Coherence Operator is built as a multi-architecture image that supports running in Kubernetes on both Linux/amd64 and Linux/arm64. The prerequisite is that the Coherence application image used has been built to support ARM.
2525
26-
There are a few ways to install the Coherence Operator documented below:
26+
There are a number of ways to install the Coherence Operator documented below:
2727
2828
* <<manifest,Simple installation using Kubectl>>
2929
* <<helm,Install the Helm chart>>
3030
* <<kubectl,Kubectl with Kustomize>>
3131
* <<tanzu,VMWare Tanzu Package (kapp-controller)>>
3232
33-
NOTE: Installing the Coherence Operator using the methods above will create a number of `ClusterRole` RBAC resources.
33+
=== High Availability
34+
35+
The Coherence Operator runs in HA mode by default. The `Deployment` created by the installation will have a replica count of 3.
36+
In reduced capacity Kubernetes clusters, for example, local laptop development and test, the replica count can be reduced. It is recommended to leave the default of 3 for production environments.
37+
Instructions on how to change the replica count for the different install methods are included below.
38+
39+
The Coherence Operator runs a REST server that the Coherence cluster members will query to discover the site and rack names that should be used by Coherence. If the Coherence Operator is not running when a Coherence Pod starts, then the Coherence member in that Pod will be unable to properly configure its site and rack names, possibly leading to data that is not safely distributed over sites. In production, and in Kubernetes clusters that are spread over multiple availability zones and failure domains, it is important to run the Operator in HA mode.
40+
41+
The Operator yaml files and Helm chart include a default Pod scheduling configuration that uses anti-affinity to distribute the three replicas onto nodes that have different `topology.kubernetes.io/zone` labels. This label is a standard Kubernetes label used to describe the zone the node is running in, and is typically applied by Kubernetes cloud vendors.
42+
43+
44+
=== Notes
45+
46+
NOTE: Installing the Coherence Operator using the methods below will create a number of `ClusterRole` RBAC resources.
3447
Some corporate security policies do not like to give cluster wide roles to third-party products.
3548
To help in this situation the operator can be installed without cluster roles, but with caveats
3649
(see the <<docs/installation/09_RBAC.adoc,RBAC>> documentation) for more details.
@@ -56,7 +69,8 @@ If no image is specified in the `Coherence` yaml, then the default Coherence ima
5669
5770
* `{coherence-image}` - The default Coherence image.
5871
59-
If using a private image registry then these images will all need to be pushed to that registry for the Operator to work.
72+
If using a private image registry then these images will all need to be pushed to that registry for the Operator to work. The default Coherence image may be omitted if all Coherence applications will use custom Coherence images.
73+
6074
6175
[#manifest]
6276
== Default Install with Kubectl
@@ -89,6 +103,21 @@ Then download with:
89103
kubectl apply -f https://github.com/oracle/coherence-operator/releases/download/${VERSION}/coherence-operator.yaml
90104
----
91105
106+
=== Change the Operator Replica Count
107+
108+
When installing with the single manifest yaml file, the replica count can be changed by editing the yaml file itself to change the occurrence of `replicas: 3` in the manifest yaml to `replicas: 1`.
109+
110+
For example, this could be done using `sed`
111+
[source,bash]
112+
----
113+
sed -i -e 's/replicas: 3/replicas: 1/g' coherence-operator.yaml
114+
----
115+
116+
Or on MacOS, where `sed` is slightly different:
117+
[source,bash]
118+
----
119+
sed -i '' -e 's/replicas: 3/replicas: 1/g' coherence-operator.yaml
120+
----
92121
93122
94123
== Installing With Helm
@@ -108,7 +137,7 @@ helm repo add coherence https://oracle.github.io/coherence-operator/charts
108137
helm repo update
109138
----
110139
111-
NOTE: To avoid confusion, the URL `https://oracle.github.io/coherence-operator/charts` is a Helm repo, it is not a web site you open in a browser. You may think we shouldn't have to say this, but you'd be surprised.
140+
NOTE: To avoid confusion, the URL `https://oracle.github.io/coherence-operator/charts` is a Helm repo, it is not a website you open in a browser. You may think we shouldn't have to say this, but you'd be surprised.
112141
113142
=== Install the Coherence Operator Helm chart
114143
@@ -126,6 +155,20 @@ helm install \
126155
<1> where `<namespace>` is the namespace that the Coherence Operator will be installed into.
127156
<2> `coherence` is the name of this Helm installation.
128157
158+
=== Change the Operator Replica Count
159+
160+
To change the replica count when installing the Operator using Helm, the `replicas` value can be set.
161+
162+
For example, to change the replica count from 3 to 1, the `--set replicas=1` option can be used.
163+
[source,bash]
164+
----
165+
helm install \
166+
--namespace <namespace> \
167+
--set replicas=1 \
168+
coherence \
169+
coherence/coherence-operator
170+
----
171+
129172
130173
==== Uninstall the Coherence Operator Helm chart
131174
@@ -213,7 +256,7 @@ helm install \
213256
----
214257
<1> the `private-repo-values.yaml` values file will be used by Helm to inject the settings into the Operator deployment
215258
216-
==== Add Pull Secrets Using --Set
259+
==== Add Pull Secrets Using --set
217260
218261
Although the `imagePullSecrets` field in the values file is an array of `name` to value pairs it is possible to set
219262
these values with the normal Helm `--set` parameter.
@@ -273,6 +316,19 @@ NOTE: All the commands below are run from a console in the `manifests/` director
273316
If you have Kustomize installed (or can install it from https://github.com/kubernetes-sigs/kustomize) you can use
274317
Kustomize to configure the yaml and install.
275318
319+
==== Change the Operator Replica Count
320+
321+
To change the replica count using Kustomize a patch file needs to be applied.
322+
The Operator manifests include a patch file, named `manager/single-replica-patch.yaml`, that changes the replica count from 3 to 1. This patch can be applied with the following Kustomize command.
323+
324+
[source,bash]
325+
----
326+
cd ./manager && kustomize edit add patch \
327+
--kind Deployment --name controller-manager \
328+
--path single-replica-patch.yaml
329+
----
330+
331+
276332
==== Set Image Names
277333
If you need to use different image names from the defaults, `kustomize` can be used to specify different names:
278334

docs/installation/09_RBAC.adoc

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,9 @@ You MUST ensure that the CRD manifests match the version of the Operator being i
4242
4343
* Download the manifests and unpack them.
4444
45-
* In the directory that the .tar.gz file was unpacked to will be two versions of the CRDs.
46-
The directory `crd/` contains the `apiextensions.k8s.io/v1` version, which must be installed into Kubernetes cluster from k8s v1.16.x and above. The `crd-v1beta1/` directory contains the `apiextensions.k8s.io/v1beta1` version, which must be installed into Kubernetes cluster of k8s v1.15.x and below.
45+
* In the directory that the .tar.gz file was unpacked to, the `crd/` directory will contain the Coherence CRD.
46+
The CRD can be installed with kubectl
4747
48-
The required CRD can be installed with kubectl
4948
[source,bash]
5049
----
5150
kubectl create -f crd/coherence.oracle.com_coherence.yaml

docs/scaling/010_overview.adoc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ behaviour. The scaling policy has three possible values:
5656
|`ParallelUpSafeDown`
5757
|This is the default scaling policy.
5858
With this policy when scaling up `Pods` are added in parallel (the same as using the `Parallel` `podManagementPolicy`
59-
in a https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.16/#statefulsetspec-v1-apps[StatefulSet]) and
59+
in a https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#statefulsetspec-v1-apps[StatefulSet]) and
6060
when scaling down `Pods` are removed one at a time (the same as the `OrderedReady` `podManagementPolicy` for a
6161
StatefulSet). When scaling down a check is done to ensure that the members of the cluster have a safe StatusHA value
6262
before a `Pod` is removed (i.e. none of the Coherence cache services have an endangered status).
@@ -65,7 +65,7 @@ adding members, but offers safe, albeit slower, scaling down as `Pods` are remo
6565
6666
|`Parallel`
6767
|With this policy when scaling up `Pods` are added in parallel (the same as using the `Parallel` `podManagementPolicy`
68-
in a https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.16/#statefulsetspec-v1-apps[StatefulSet]).
68+
in a https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#statefulsetspec-v1-apps[StatefulSet]).
6969
With this policy no StatusHA check is performed either when scaling up or when scaling down.
7070
This policy allows faster start and scaling times but at the cost of no data safety; it is ideal for deployments that are
7171
storage disabled.

docs/troubleshooting/02_heap_dump.adoc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ Heap dumps can be very useful when debugging but generating and downloading a he
1414
1515
== Ephemeral Containers
1616
17-
Ephemeral containers were introduced in k8s v1.16 and at the time of writing are still in alpha.
17+
Ephemeral containers were introduced in Kubernetes v1.16 and moved to beta in v1.23.
1818
Ephemeral containers is a feature gate that must be enabled for your cluster.
1919
If you have the `EphemeralContainers` feature gate enabled, then obtaining a heap dump is not so difficult.
2020

0 commit comments

Comments
 (0)