Skip to content

Commit 483cd20

Browse files
committed
scripts: add migration script from public operator to cloud operator.
Check in a reference implementation for migrating from statefulsets managed by the public operator to the cloud operator. Note that this process involves some manual steps, and we may want to automate and test it further.
1 parent ebd8f6f commit 483cd20

File tree

9 files changed

+641
-2
lines changed

9 files changed

+641
-2
lines changed

cmd/main.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@ limitations under the License.
1616

1717
package main
1818

19-
import cmd "github.com/cockroachdb/helm-charts/cmd/self-signer"
19+
import (
20+
cmd "github.com/cockroachdb/helm-charts/cmd/self-signer"
21+
)
2022

2123
func main() {
2224
cmd.Execute()

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ require (
66
github.com/Masterminds/semver/v3 v3.2.1
77
github.com/cenkalti/backoff v2.2.1+incompatible
88
github.com/cockroachdb/cockroach-operator v0.0.0-20230531051823-2cb3e2e676f4
9+
github.com/cockroachdb/errors v1.8.0
910
github.com/google/martian v2.1.1-0.20190517191504-25dcb96d9e51+incompatible
1011
github.com/gruntwork-io/terratest v0.41.26
1112
github.com/mitchellh/hashstructure/v2 v2.0.2
@@ -37,7 +38,6 @@ require (
3738
github.com/beorn7/perks v1.0.1 // indirect
3839
github.com/boombuler/barcode v1.0.1-0.20190219062509-6c824513bacc // indirect
3940
github.com/cespare/xxhash/v2 v2.1.1 // indirect
40-
github.com/cockroachdb/errors v1.8.0 // indirect
4141
github.com/cockroachdb/logtags v0.0.0-20190617123548-eb05cc24525f // indirect
4242
github.com/cockroachdb/redact v1.0.6 // indirect
4343
github.com/cockroachdb/sentry-go v0.6.1-cockroachdb.2 // indirect

migrate/main.go

Lines changed: 235 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,235 @@
1+
package main
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"os"
7+
"path/filepath"
8+
"strings"
9+
10+
"github.com/cockroachdb/errors"
11+
"github.com/spf13/cobra"
12+
corev1 "k8s.io/api/core/v1"
13+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
14+
"k8s.io/apimachinery/pkg/runtime"
15+
"k8s.io/apimachinery/pkg/runtime/schema"
16+
"k8s.io/client-go/dynamic"
17+
"k8s.io/client-go/kubernetes"
18+
"k8s.io/client-go/tools/clientcmd"
19+
"k8s.io/client-go/util/homedir"
20+
"sigs.k8s.io/yaml"
21+
22+
publicv1 "github.com/cockroachdb/cockroach-operator/apis/v1alpha1"
23+
"github.com/cockroachdb/helm-charts/pkg/upstream/cockroach-operator/api/v1alpha1"
24+
)
25+
26+
func main() {
27+
var (
28+
cloudProvider string
29+
cloudRegion string
30+
crdbCluster string
31+
namespace string
32+
kubeconfig string
33+
outputDir string
34+
)
35+
36+
cmd := &cobra.Command{
37+
Use: "build-manifests",
38+
RunE: func(cmd *cobra.Command, args []string) error {
39+
ctx := context.TODO()
40+
41+
config, err := clientcmd.BuildConfigFromFlags("", kubeconfig)
42+
if err != nil {
43+
return errors.Wrap(err, "building k8s config")
44+
}
45+
clientset, err := kubernetes.NewForConfig(config)
46+
if err != nil {
47+
return errors.Wrap(err, "building k8s clientset")
48+
}
49+
dynamicClient, err := dynamic.NewForConfig(config)
50+
if err != nil {
51+
return errors.Wrap(err, "building k8s dynamic client")
52+
}
53+
54+
gvr := schema.GroupVersionResource{
55+
Group: "crdb.cockroachlabs.com",
56+
Version: "v1alpha1",
57+
Resource: "crdbclusters",
58+
}
59+
cr, err := dynamicClient.Resource(gvr).Namespace(namespace).Get(ctx, crdbCluster, metav1.GetOptions{})
60+
if err != nil {
61+
return errors.Wrap(err, "fetching public crdbcluster object")
62+
}
63+
64+
publicCluster := publicv1.CrdbCluster{}
65+
if err := runtime.DefaultUnstructuredConverter.FromUnstructured(cr.Object, &publicCluster); err != nil {
66+
return errors.Wrap(err, "unmarshalling public crdbcluster object")
67+
}
68+
69+
sts, err := clientset.AppsV1().StatefulSets(namespace).Get(ctx, crdbCluster, metav1.GetOptions{})
70+
if err != nil {
71+
return errors.Wrap(err, "fetching statefulset")
72+
}
73+
74+
grpcPort := publicCluster.Spec.GRPCPort
75+
joinAddrs := []string{}
76+
for nodeIdx := range *sts.Spec.Replicas {
77+
joinAddrs = append(joinAddrs, fmt.Sprintf("%s-%d.%s.%s:%d", crdbCluster, nodeIdx, crdbCluster, namespace, grpcPort))
78+
}
79+
joinString := strings.Join(joinAddrs, ",")
80+
81+
buildNodeSpec := func() v1alpha1.CrdbNodeSpec {
82+
return v1alpha1.CrdbNodeSpec{
83+
PodLabels: sts.Spec.Template.Labels,
84+
DataStore: v1alpha1.DataStore{
85+
VolumeClaimTemplate: &corev1.PersistentVolumeClaim{
86+
ObjectMeta: metav1.ObjectMeta{
87+
Name: "datadir",
88+
},
89+
Spec: corev1.PersistentVolumeClaimSpec{
90+
AccessModes: []corev1.PersistentVolumeAccessMode{
91+
corev1.ReadWriteOnce,
92+
},
93+
Resources: sts.Spec.VolumeClaimTemplates[0].Spec.Resources,
94+
StorageClassName: sts.Spec.VolumeClaimTemplates[0].Spec.StorageClassName,
95+
},
96+
},
97+
},
98+
Domain: "",
99+
Env: []corev1.EnvVar{
100+
{
101+
Name: "HostIP",
102+
ValueFrom: &corev1.EnvVarSource{
103+
FieldRef: &corev1.ObjectFieldSelector{
104+
APIVersion: "v1",
105+
FieldPath: "status.hostIP",
106+
},
107+
},
108+
},
109+
},
110+
ResourceRequirements: sts.Spec.Template.Spec.Containers[0].Resources,
111+
Image: sts.Spec.Template.Spec.Containers[0].Image,
112+
ServiceAccountName: "cockroachdb",
113+
Join: joinString,
114+
Certificates: v1alpha1.Certificates{
115+
ExternalCertificates: &v1alpha1.ExternalCertificates{
116+
CAConfigMapName: crdbCluster + "-ca",
117+
NodeSecretName: crdbCluster + "-node-certs",
118+
RootSQLClientSecretName: crdbCluster + "-client-certs",
119+
},
120+
},
121+
}
122+
}
123+
124+
for nodeIdx := range *sts.Spec.Replicas {
125+
podName := fmt.Sprintf("%s-%d", crdbCluster, nodeIdx)
126+
pod, err := clientset.CoreV1().Pods(namespace).Get(ctx, podName, metav1.GetOptions{})
127+
if err != nil {
128+
return errors.Newf("couldn't find crdb pod %s", podName)
129+
}
130+
131+
if pod.Spec.NodeName == "" {
132+
return errors.Newf("pod %s isn't scheduled to a node", podName)
133+
}
134+
135+
nodeSpec := buildNodeSpec()
136+
nodeSpec.NodeName = pod.Spec.NodeName
137+
138+
crdbNode := v1alpha1.CrdbNode{
139+
TypeMeta: metav1.TypeMeta{
140+
Kind: "CrdbNode",
141+
APIVersion: "crdb.cockroachlabs.com/v1alpha1",
142+
},
143+
ObjectMeta: metav1.ObjectMeta{
144+
Name: fmt.Sprintf("%s-%d", crdbCluster, nodeIdx),
145+
Namespace: namespace,
146+
GenerateName: "",
147+
Labels: map[string]string{
148+
"app": "cockroachdb",
149+
"svc": "cockroachdb",
150+
"crdb.cockroachlabs.com/cluster": crdbCluster,
151+
},
152+
Annotations: map[string]string{
153+
"crdb.cockroachlabs.com/cloudProvider": cloudProvider,
154+
},
155+
Finalizers: []string{"crdbnode.crdb.cockroachlabs.com/finalizer"},
156+
},
157+
Spec: nodeSpec,
158+
}
159+
yamlToDisk(filepath.Join(outputDir, fmt.Sprintf("crdbnode-%d.yaml", nodeIdx)), crdbNode)
160+
}
161+
162+
helmValues := buildHelmValues(publicCluster, cloudProvider, cloudRegion, namespace)
163+
164+
yamlToDisk(filepath.Join(outputDir, "values.yaml"), helmValues)
165+
166+
return nil
167+
},
168+
}
169+
170+
cmd.PersistentFlags().StringVar(&cloudProvider, "cloud-provider", "", "name of cloud provider")
171+
cmd.PersistentFlags().StringVar(&cloudRegion, "cloud-region", "", "name of cloud provider region")
172+
cmd.PersistentFlags().StringVar(&crdbCluster, "crdb-cluster", "", "name of crdbcluster resource")
173+
cmd.PersistentFlags().StringVar(&namespace, "namespace", "default", "namespace of crdbcluster resource")
174+
cmd.PersistentFlags().StringVar(&kubeconfig, "kubeconfig", filepath.Join(homedir.HomeDir(), ".kube", "config"), "path to kubeconfig file")
175+
cmd.PersistentFlags().StringVar(&outputDir, "output-dir", "./manifests", "manifest output directory")
176+
cmd.MarkPersistentFlagRequired("cloud-provider")
177+
cmd.MarkPersistentFlagRequired("cloud-region")
178+
cmd.MarkPersistentFlagRequired("crdb-cluster")
179+
180+
if err := cmd.Execute(); err != nil {
181+
fmt.Println(err)
182+
os.Exit(1)
183+
}
184+
}
185+
186+
// To returns a pointer to v. It is a convenience helper for taking the
// address of a literal or expression (e.g. To(int32(3))).
func To[T any](v T) *T {
	p := new(T)
	*p = v
	return p
}
189+
190+
func yamlToDisk(path string, data any) error {
191+
file, err := os.Create(path)
192+
if err != nil {
193+
return errors.Wrap(err, "creating file")
194+
}
195+
bytes, err := yaml.Marshal(data)
196+
if err != nil {
197+
return errors.Wrap(err, "marshalling yaml")
198+
}
199+
if _, err := file.Write(bytes); err != nil {
200+
return errors.Wrap(err, "writing yaml")
201+
}
202+
return nil
203+
}
204+
205+
func buildHelmValues(cluster publicv1.CrdbCluster, cloudProvider string, cloudRegion string, namespace string) map[string]interface{} {
206+
return map[string]interface{}{
207+
"operator": map[string]interface{}{
208+
"enabled": true,
209+
"tlsEnabled": cluster.Spec.TLSEnabled,
210+
"regions": []map[string]interface{}{
211+
{
212+
"namespace": namespace,
213+
"cloudProvider": cloudProvider,
214+
"code": cloudRegion,
215+
"nodes": cluster.Spec.Nodes,
216+
"domain": "",
217+
},
218+
},
219+
"dataStore": map[string]interface{}{
220+
"volumeClaimTemplate": map[string]interface{}{
221+
"metadata": map[string]interface{}{
222+
"name": "datadir",
223+
},
224+
},
225+
},
226+
"resources": cluster.Spec.Resources,
227+
"podAnnotations": cluster.Spec.AdditionalAnnotations,
228+
"ports": map[string]interface{}{
229+
"grpcPort": cluster.Spec.GRPCPort,
230+
"httpPort": cluster.Spec.HTTPPort,
231+
"sqlPort": cluster.Spec.SQLPort,
232+
},
233+
},
234+
}
235+
}

scripts/migration/public/README.md

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
## Migrate from public operator to cloud operator
2+
3+
This guide will walk you through migrating a crdb cluster managed via the public operator to the crdb cloud operator. We assume you've created a cluster using the public operator. The goals of this process are to migrate without affecting cluster availability, and to preserve existing disks so that we don't have to replicate data into empty volumes. Note that this process scales down the statefulset by one node before adding each operator-managed pod, so cluster capacity will be reduced by one node at times.
4+
5+
Pre-requisite: Install the public operator and create an operator-managed cluster:
6+
7+
```
8+
kubectl apply -f https://raw.githubusercontent.com/cockroachdb/cockroach-operator/v2.17.0/install/crds.yaml
9+
kubectl apply -f https://raw.githubusercontent.com/cockroachdb/cockroach-operator/v2.17.0/install/operator.yaml
10+
11+
kubectl apply -f https://raw.githubusercontent.com/cockroachdb/cockroach-operator/v2.17.0/examples/example.yaml
12+
```
13+
14+
Set environment variables:
15+
16+
```
17+
export CRDBCLUSTER=cockroachdb
18+
export NAMESPACE=default
19+
export CLOUD_PROVIDER=gcp
20+
export REGION=us-central1
21+
```
22+
23+
Back up crdbcluster resource in case we need to revert:
24+
25+
```
26+
mkdir -p backup
27+
kubectl get crdbcluster -o yaml $CRDBCLUSTER > backup/crdbcluster-$CRDBCLUSTER.yaml
28+
```
29+
30+
Next, we need to re-map and generate tls certs. The crdb cloud operator uses slightly different certs than the public operator and mounts them in configmaps and secrets with different names. Run the `generate-certs.sh` script to generate and upload certs to your cluster.
31+
32+
```
33+
./generate-certs.sh
34+
```
35+
36+
Next, generate manifests for each crdbnode and the crdbcluster based on the state of the statefulset. We generate a manifest for each crdbnode because we want the crdb pods and their associated pvcs to use the same names as the original statefulset-managed pods and pvcs. This means that the new operator-managed pods will use the original pvcs, and won't have to replicate data into empty nodes.
37+
38+
```
39+
./generate-manifests.sh
```
40+
41+
The public operator and cloud operator use custom resource definitions with the same names, so we have to remove the public operator before installing the cloud operator. Uninstall the public operator, without deleting its managed pods, pvc, etc.:
42+
43+
```
44+
45+
# Ensure that operator can't accidentally delete managed k8s objects.
46+
kubectl delete clusterrolebinding cockroach-operator-rolebinding
47+
48+
# Delete public operator cr.
49+
kubectl delete crdbcluster $CRDBCLUSTER --cascade=orphan
50+
51+
# Delete public operator resources and crd.
52+
kubectl delete -f https://raw.githubusercontent.com/cockroachdb/cockroach-operator/v2.17.0/install/crds.yaml
53+
kubectl delete -f https://raw.githubusercontent.com/cockroachdb/cockroach-operator/v2.17.0/install/operator.yaml
54+
```
55+
56+
Install the cloud operator and wait for it to become ready:
57+
58+
```
59+
helm upgrade --install crdb-operator ./operator
60+
kubectl rollout status deployment/cockroach-operator --timeout=60s
61+
```
62+
63+
To migrate seamlessly from the statefulset to the cloud operator, we'll scale down statefulset-managed pods and replace them with crdbnode objects, one by one. Then we'll create the crdbcluster that manages the crdbnodes. Because of this order of operations, we need to create some objects that the crdbcluster will eventually own:
64+
65+
```
66+
kubectl create priorityclass crdb-critical --value 500000000
67+
yq '(.. | select(tag == "!!str")) |= envsubst' rbac-template.yaml > manifests/rbac.yaml
68+
kubectl apply -f manifests/rbac.yaml
69+
```
70+
71+
For each crdb pod, scale the statefulset down by one replica. For example, for a three-node cluster, first scale the statefulset down to two replicas:
72+
73+
```
74+
kubectl scale statefulset/$CRDBCLUSTER --replicas=2
75+
```
76+
77+
Then create the crdbnode corresponding to the statefulset pod you just scaled down:
78+
79+
```
80+
kubectl apply -f manifests/crdbnode-$CRDBCLUSTER-2.yaml
81+
```
82+
83+
Wait for the new pod to become ready. If it doesn't, check the cloud operator logs for errors.
84+
85+
Repeat this process for each crdb node until the statefulset has zero replicas.
86+
87+
The public operator creates a pod disruption budget that conflicts with a pod disruption budget managed by the cloud operator. Before applying the crdbcluster manifest, delete the existing pod disruption budget:
88+
89+
```
90+
kubectl delete poddisruptionbudget $CRDBCLUSTER
91+
```
92+
93+
Finally, apply the crdbcluster manifest:
94+
95+
```
96+
kubectl apply -f manifests/crdbcluster-$CRDBCLUSTER.yaml
97+
```

0 commit comments

Comments
 (0)