Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
2f12c51
[CLOUD-785] Refactor rclone credential handling for s3
maismail Oct 20, 2025
7e53a7d
[CLOUD-785] Refactor rclone credential handling for s3
maismail Oct 21, 2025
7d28198
[CLOUD-785] Refactor backups to use global config if not local config…
maismail Oct 21, 2025
ca2764f
[CLOUD-785] Generate backup ids based on the timestamp
maismail Oct 21, 2025
4df19f0
[CLOUD-785] make is possible to override enabled flag from rondb
maismail Oct 21, 2025
76e9cd8
[CLOUD-785] refactor restoreFromBackup calls to use local and if not …
maismail Oct 21, 2025
d7d6aad
[CLOUD-785] update schema
maismail Oct 22, 2025
7e80d2b
[CLOUD-785] Use backupId as string to avoid issues with scientific no…
maismail Oct 24, 2025
f1341cc
[CLOUD-785] Move backups and restoreFromBackup out of managedObjectSt…
maismail Oct 24, 2025
72101df
[CLOUD-785] Rename create-backup to create-rondb-backup
maismail Oct 28, 2025
e48f4ef
[CLOUD-785] in case of minio we should set optional since the secret …
maismail Oct 30, 2025
f381cbd
[CLOUD-785] store backups metadata in configmap
maismail Oct 30, 2025
524a2f7
[CLOUD-785] rotate configmaps for storing backups when needed
maismail Oct 31, 2025
6ecd576
[CLOUD-785] add path to backup metadata
maismail Oct 31, 2025
747338f
[CLOUD-785] add storage provider to path
maismail Nov 3, 2025
60f7928
[CLOUD-785] fix dummy lint values file
maismail Nov 12, 2025
1f643a7
[CLOUD-785] fix linting
maismail Nov 13, 2025
b90aee7
[CLOUD-785] fix tests
maismail Nov 13, 2025
4cbb63e
[CLOUD-785] test hack to cleanup test resources
maismail Nov 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/actions/remove_cluster/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ runs:
steps:
- name: Delete RonDB Helmchart
shell: bash
run: helm delete --namespace=${{ inputs.namespace }} ${{ inputs.helm_chart }}
run: helm delete --namespace=${{ inputs.namespace }} ${{ inputs.helm_chart }} || true

# Can't just use delete Hook, since we want to print the logs of the Pod
- name: Delete Helm Test Pods
Expand Down
6 changes: 2 additions & 4 deletions files/scripts/backups/metadata_upload_kubectl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
# for everything we want to back up. Root cannot be used over the network.
set -e

{{ include "rondb.createRcloneConfig" . }}

kubectl exec \
$MYSQLD_PODNAME \
-c mysqld \
Expand All @@ -23,8 +21,8 @@ kubectl cp \
-c mysqld $LOCAL_BACKUP_DIR
ls -la $LOCAL_BACKUP_DIR

{{ include "rondb.backups.defineJobNumberEnv" $ }}
REMOTE_BACKUP_DIR={{ include "rondb.rcloneBackupRemoteName" . }}:{{ .Values.backups.s3.bucketName }}/{{ include "rondb.takeBackupPathPrefix" . }}/$JOB_NUMBER
{{ include "rondb.backups.defineBackupIdEnv" $ }}
REMOTE_BACKUP_DIR={{ include "rondb.rcloneBackupRemoteName" . }}:{{include "rondb.backups.bucketName" (dict "backupConfig" .Values.backups "global" .Values.global)}}/{{ include "rondb.takeBackupPathPrefix" . }}/$BACKUP_ID
echo && rclone ls $REMOTE_BACKUP_DIR
echo "Copying backup from $LOCAL_BACKUP_DIR to $REMOTE_BACKUP_DIR"
rclone move $LOCAL_BACKUP_DIR $REMOTE_BACKUP_DIR
Expand Down
96 changes: 93 additions & 3 deletions files/scripts/backups/native_upload_kubectl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ wait_pids=()
NUM_NODE_GROUPS={{ .Values.clusterSize.numNodeGroups }}
NUM_REPLICAS={{ .Values.clusterSize.activeDataReplicas }}

{{ include "rondb.backups.defineJobNumberEnv" $ }}
SOURCE_DIR=/home/hopsworks/data/ndb/backups/BACKUP/BACKUP-$JOB_NUMBER
REMOTE_BACKUP_DIR={{ include "rondb.rcloneBackupRemoteName" . }}:{{ .Values.backups.s3.bucketName }}/{{ include "rondb.takeBackupPathPrefix" . }}/$JOB_NUMBER
{{ include "rondb.backups.defineBackupIdEnv" $ }}
SOURCE_DIR=/home/hopsworks/data/ndb/backups/BACKUP/BACKUP-$BACKUP_ID
REMOTE_BACKUP_DIR={{ include "rondb.rcloneBackupRemoteName" . }}:{{ include "rondb.backups.bucketName" (dict "backupConfig" .Values.backups "global" .Values.global) }}/{{ include "rondb.takeBackupPathPrefix" . }}/$BACKUP_ID

echo "Uploading backups from '$SOURCE_DIR' to object storage $REMOTE_BACKUP_DIR in parallel"
for ((g = 0; g < NUM_NODE_GROUPS; g++)); do
Expand Down Expand Up @@ -85,3 +85,93 @@ if [ "$FAILED" = true ]; then
exit 1
fi
echo ">>> Succeeded uploading all backups"

{{ $configMap := include "rondb.backups.metadataStore.configMapName" . }}
{{- if $configMap }}

# Rotation thresholds for the ConfigMap that stores backup metadata.
# rotate_if_needed switches to a new ConfigMap once either limit is reached.
# Maximum number of entries under .data (one entry is written per backup ID).
MAX_KEYS=5000
# Maximum size, in bytes, of the ConfigMap's .data as printed by jq.
# NOTE(review): presumably chosen to stay below the Kubernetes ConfigMap size
# limit (about 1 MiB) - confirm.
MAX_SIZE_BYTES=900000
# Name of the first metadata ConfigMap, rendered by Helm from the $configMap
# template variable. Rotated ConfigMaps are named "<BASE_CONFIGMAP>-<n>".
BASE_CONFIGMAP={{ $configMap }}

# Print the name of the ConfigMap currently labelled as the active
# backups-metadata store in the release namespace.
# Outputs: the name of the first matching ConfigMap on stdout.
# Returns: always 0. kubectl's error output is discarded and a failed lookup is
#          ignored, so callers detect "no active ConfigMap" by empty output.
get_active_configmap() {
  local -r active_selector="app=backups-metadata,service=rondb,managed-by=cronjob,active=active"
  local -r name_path='{.items[0].metadata.name}'

  kubectl get cm \
    -n {{ .Release.Namespace }} \
    -l "$active_selector" \
    -o jsonpath="$name_path" 2>/dev/null \
    || :
}

# Write a diagnostic message to stderr, leaving stdout untouched so that
# functions whose stdout is captured by a caller can still log.
# Arguments: any number of words; they are joined by the first character of
#            IFS (a space by default) and followed by a newline.
# printf is used instead of echo so that a message such as "-n" or "-e" is
# printed literally rather than being consumed as an echo option.
log_stderr() {
  printf '%s\n' "$*" >&2
}

# Create the given ConfigMap in the release namespace and label it as the
# active backups-metadata store, unless a ConfigMap with that name already
# exists (an existing ConfigMap is left untouched, labels included).
# Arguments: $1 - name of the ConfigMap
# Outputs:   nothing on stdout (kubectl's stdout is discarded because callers
#            may capture this function's stdout); progress and kubectl errors
#            go to stderr
# Returns:   0 if the ConfigMap already existed or was created and labelled,
#            1 if creating or labelling it failed
create_configmap_if_missing() {
  local name="$1"

  # Existence probe only: both streams are silenced because "not found" is an
  # expected outcome here.
  if kubectl get configmap "$name" -n {{ .Release.Namespace }} >/dev/null 2>&1; then
    return 0
  fi

  log_stderr "Creating ConfigMap $name"
  if ! kubectl create configmap "$name" -n {{ .Release.Namespace }} >/dev/null; then
    log_stderr "Failed to create ConfigMap $name"
    return 1
  fi

  # Labelling is only attempted once the ConfigMap is known to exist.
  if ! kubectl label configmap "$name" -n {{ .Release.Namespace }} \
    app=backups-metadata service=rondb managed-by=cronjob active=active \
    --overwrite >/dev/null; then
    log_stderr "Failed to label ConfigMap $name as active"
    return 1
  fi
}

# Decide which ConfigMap should receive the next backup metadata entry.
# If the given ConfigMap has reached MAX_KEYS entries or MAX_SIZE_BYTES bytes,
# a successor named "<BASE_CONFIGMAP>-<n>" is created via
# create_configmap_if_missing and the "active" label is removed from the old
# one.
# Globals:   BASE_CONFIGMAP, MAX_KEYS, MAX_SIZE_BYTES (read)
# Arguments: $1 - name of the currently active ConfigMap
# Outputs:   the name of the ConfigMap to use on stdout (and nothing else, so
#            the result can be captured); diagnostics on stderr
# Returns:   0
rotate_if_needed() {
  local cm="$1"

  # Fetch the object once so that both measurements describe the same snapshot.
  local cm_json key_count size_bytes
  cm_json=$(kubectl get configmap "$cm" -n {{ .Release.Namespace }} -o json)
  key_count=$(jq '.data | length' <<<"$cm_json")
  size_bytes=$(jq -r '.data' <<<"$cm_json" | wc -c)
  # Some wc implementations pad their output with blanks.
  size_bytes="${size_bytes//[[:space:]]/}"

  # If the ConfigMap could not be read, keep using it instead of feeding empty
  # values into the arithmetic comparison below.
  if [[ ! "$key_count" =~ ^[0-9]+$ || ! "$size_bytes" =~ ^[0-9]+$ ]]; then
    log_stderr "Could not determine usage of ConfigMap $cm, not rotating"
    echo "$cm"
    return 0
  fi

  log_stderr "ConfigMap: $cm | Keys=$key_count | Size=${size_bytes}B"

  if (( key_count < MAX_KEYS && size_bytes < MAX_SIZE_BYTES )); then
    echo "$cm"
    return 0
  fi

  log_stderr "Threshold exceeded, rotating ConfigMap..."
  local suffix next_suffix new_cm
  # Quote the base name so it is stripped as a literal prefix, not as a glob.
  suffix="${cm#"$BASE_CONFIGMAP"-}"
  if [[ "$suffix" =~ ^[0-9]+$ ]]; then
    # Force base 10: a suffix with a leading zero would otherwise be read as
    # octal.
    next_suffix=$((10#$suffix + 1))
  else
    # No numeric suffix (e.g. the base ConfigMap itself): start at 1.
    next_suffix=1
  fi
  new_cm="${BASE_CONFIGMAP}-${next_suffix}"

  # Create and label the new ConfigMap
  create_configmap_if_missing "$new_cm"

  # Remove active label from old configmap (best effort)
  kubectl label configmap "$cm" -n {{ .Release.Namespace }} active- --overwrite >/dev/null 2>&1 || true

  log_stderr "Rotated to new ConfigMap: $new_cm"
  echo "$new_cm"
}

# Resolve the ConfigMap that will receive this backup's metadata: use the one
# currently labelled active, or fall back to (and create if needed) the base
# ConfigMap when none is found.
ACTIVE_CM="$(get_active_configmap)"
[[ -n "${ACTIVE_CM}" ]] || {
  ACTIVE_CM="${BASE_CONFIGMAP}"
  echo "No active ConfigMap found. Creating ${ACTIVE_CM} ..."
  create_configmap_if_missing "${ACTIVE_CM}"
}

# rotate_if_needed prints the name to use on stdout: either the same ConfigMap
# or a freshly created successor when a threshold has been reached.
ACTIVE_CM="$(rotate_if_needed "${ACTIVE_CM}")"

# Build backup metadata info json
echo "Updating backup metadata on ConfigMap $ACTIVE_CM "

# Format an epoch timestamp with a fractional part as ISO-8601 UTC with
# millisecond precision, e.g. 1700000000.089 -> 2023-11-14T22:13:20.089Z.
# Arguments: $1 - "<seconds>.<fraction>" (or plain "<seconds>")
# Outputs:   the formatted timestamp on stdout
# The fraction is handled as text (padded/truncated to three digits) rather
# than passed through printf "%03d": bash treats numeric printf arguments as C
# constants, so a fraction such as "012" would be read as octal (printing 010)
# and "089" would be rejected as an invalid octal number.
epoch_to_iso8601_ms() {
  local ts="$1"
  local secs="${ts%%.*}"
  local frac=""
  if [[ "$ts" == *.* ]]; then
    frac="${ts#*.}"
  fi
  if [[ ! "$frac" =~ ^[0-9]+$ ]]; then
    frac="000"
  fi
  frac="${frac}000"
  frac="${frac:0:3}"
  printf '%s.%sZ\n' "$(date -u -d "@${secs}" +"%Y-%m-%dT%H:%M:%S")" "$frac"
}

# Start of the backup: modification time (whole seconds) of the backup-id file.
# NOTE(review): presumably that file is written when the backup job starts - confirm.
START_TS=$(stat -c %Y {{ include "rondb.backups.backupIdFile" . }} | awk '{printf "%.3f", $1}')
# End of the backup: now, with millisecond precision.
END_TS=$(date +%s.%3N)

DURATION_MS=$(awk -v start="$START_TS" -v end="$END_TS" 'BEGIN { printf "%.0f", (end - start) * 1000 }')

START_TIME=$(epoch_to_iso8601_ms "$START_TS")
END_TIME=$(epoch_to_iso8601_ms "$END_TS")

# Only "SUCCESS" is ever recorded here: the script exits earlier when an
# upload has failed.
STATE="SUCCESS"

# Merge-patch document that adds one entry to the ConfigMap's data, keyed by
# the backup ID. The value is itself a JSON document stored as a string
# (start_time, end_time, duration_ms, state, path). The heredoc delimiter is
# unquoted, so shell variables are expanded, while each \" stays in the text as
# an escaped quote inside that JSON string. The {{ ... }} expressions are
# rendered by Helm before the script runs.
PATCH_JSON=$(cat <<EOF
{
"data": {
"$BACKUP_ID": "{\"start_time\":\"$START_TIME\",\"end_time\":\"$END_TIME\",\"duration_ms\":$DURATION_MS,\"state\":\"$STATE\",\"path\":\"{{ include "rondb.backups.pathScheme" . }}/{{ include "rondb.backups.bucketName" (dict "backupConfig" .Values.backups "global" .Values.global) }}/{{ include "rondb.takeBackupPathPrefix" . }}/$BACKUP_ID\"}"
}
}
EOF
)

# Record the entry on the active ConfigMap; a merge patch leaves the entries of
# other backups in place.
kubectl patch configmap "$ACTIVE_CM" -n {{ .Release.Namespace }} --type merge -p "$PATCH_JSON"
{{- end }}
55 changes: 21 additions & 34 deletions templates/backups/create.yaml
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
# Copyright (c) 2024-2025 Hopsworks AB. All rights reserved.

{{ if .Values.backups.enabled -}}
{{ if include "rondb.backups.isEnabled" . -}}
apiVersion: batch/v1
kind: CronJob
metadata:
name: create-backup
name: create-rondb-backup
namespace: {{ .Release.Namespace }}
spec:
schedule: {{ .Values.backups.schedule | quote}}
schedule: {{ include "rondb.backups.schedule" . | quote}}
concurrencyPolicy: Forbid
jobTemplate:
spec:
backoffLimit: 0
template:
metadata:
labels:
Expand All @@ -20,7 +21,7 @@ spec:
{{- include "rondb.nodeSelector" (dict "nodeSelector" $.Values.nodeSelector.backup) | indent 10 }}
{{- include "rondb.tolerations" (dict "tolerations" $.Values.tolerations.backup) | indent 10 }}
serviceAccountName: rondb-backups-sa
restartPolicy: OnFailure
restartPolicy: Never
initContainers:
{{ include "rondb.apiInitContainer" . | indent 10 }}
- name: backup-metadata
Expand All @@ -32,38 +33,23 @@ spec:
- |
{{ tpl (.Files.Get "files/scripts/backups/metadata_upload_kubectl.sh") . | indent 14 }}
env:
- name: JOB_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: MYSQLD_PODNAME
value: {{ include "rondb.mysqldPodname" . }}
# On MySQLd Pod:
- name: REMOTE_BACKUP_DIR
value: /tmp/backup
- name: LOCAL_BACKUP_DIR
value: /home/hopsworks/schemata
- name: RCLONE_MOUNT_FILEPATH
value: &rawRCloneConf /home/hopsworks/rclone-raw.conf
# This will be read by rclone
- name: RCLONE_CONFIG
value: /home/hopsworks/rclone.conf
{{- if eq $.Values.backups.objectStorageProvider "s3" }}
- name: ACCESS_KEY_ID
valueFrom:
secretKeyRef:
{{- toYaml .Values.backups.s3.keyCredentialsSecret | nindent 18 }}
optional: true
- name: SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
{{- toYaml .Values.backups.s3.secretCredentialsSecret | nindent 18 }}
optional: true
{{- end }}
{{- include "rondb.backup.credentials" (dict "backupConfig" $.Values.backups "namespace" $.Release.Namespace "global" $.Values.global) | indent 12 }}
volumeMounts:
- name: rclone-configs
mountPath: *rawRCloneConf
mountPath: /home/hopsworks/rclone.conf
subPath: rclone.conf
- name: backup-id
mountPath: /home/hopsworks/backup-id
# RonDB contains a native backup protocol, which is launched through
# the mgm client. It essentially causes every datanode to create a binary
# backup that it stores locally.
Expand All @@ -74,15 +60,15 @@ spec:
- /bin/bash
- -c
- |
{{- include "rondb.backups.defineJobNumberEnv" $ | indent 14 }}
ndb_mgm --ndb-connectstring=$MGM_CONNECTSTRING -e "START BACKUP $JOB_NUMBER SNAPSHOTEND WAIT COMPLETED"
{{- include "rondb.backups.defineBackupIdEnv" $ | indent 14 }}
ndb_mgm --ndb-connectstring=$MGM_CONNECTSTRING -e "START BACKUP $BACKUP_ID SNAPSHOTEND WAIT COMPLETED"
env:
- name: JOB_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: MGM_CONNECTSTRING
value: {{ include "rondb.mgmdHostname" . }}:1186
volumeMounts:
- name: backup-id
mountPath: /home/hopsworks/backup-id
readOnly: true
containers:
- name: upload-native-backups
image: {{ include "image_address" (dict "image" $.Values.images.toolbox) }}
Expand All @@ -93,14 +79,15 @@ spec:
- -c
- |
{{ tpl (.Files.Get "files/scripts/backups/native_upload_kubectl.sh") . | indent 14 }}
env:
- name: JOB_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
volumeMounts:
- name: backup-id
mountPath: /home/hopsworks/backup-id
readOnly: true
volumes:
- name: rclone-configs
configMap:
name: rclone-configs
- name: backup-id
emptyDir: {}
---
{{- end -}}
27 changes: 7 additions & 20 deletions templates/backups/restore.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Copyright (c) 2024-2025 Hopsworks AB. All rights reserved.

{{ if .Values.restoreFromBackup.backupId -}}
{{ if include "rondb.restoreFromBackup.backupId" . -}}
apiVersion: batch/v1
kind: Job
metadata:
Expand Down Expand Up @@ -145,7 +145,7 @@ spec:
- name: MGMD_HOST
value: {{ include "rondb.mgmdHostname" . }}
- name: BACKUP_ID
value: {{ .Values.restoreFromBackup.backupId | int | quote }}
value: {{ include "rondb.restoreFromBackup.backupId" . | quote }}
resources:
limits:
cpu: {{ .Values.resources.limits.cpus.restore }}
Expand Down Expand Up @@ -200,27 +200,14 @@ spec:
- name: MGMD_HOST
value: {{ include "rondb.mgmdHostname" . }}
- name: BACKUP_ID
value: {{ .Values.restoreFromBackup.backupId | int | quote }}
{{- if eq $.Values.restoreFromBackup.objectStorageProvider "s3" }}
- name: ACCESS_KEY_ID
valueFrom:
secretKeyRef:
{{- toYaml $.Values.restoreFromBackup.s3.keyCredentialsSecret | nindent 14 }}
optional: true
- name: SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
{{- toYaml $.Values.restoreFromBackup.s3.secretCredentialsSecret | nindent 14 }}
optional: true
{{- end }}
- name: RCLONE_MOUNT_FILEPATH
value: {{ include "rondb.rawRCloneConf" $ }}
value: {{ include "rondb.restoreFromBackup.backupId" . | quote }}
{{- include "rondb.backup.credentials" (dict "backupConfig" $.Values.restoreFromBackup "namespace" $.Release.Namespace "global" $.Values.global) | indent 8 }}
# This will be read by rclone
- name: RCLONE_CONFIG
value: /home/hopsworks/rclone.conf
volumeMounts:
- name: rclone-configs
mountPath: {{ include "rondb.rawRCloneConf" $ }}
mountPath: /home/hopsworks/rclone.conf
subPath: rclone.conf
resources:
limits:
Expand Down Expand Up @@ -272,7 +259,7 @@ spec:
- name: MGMD_HOST
value: {{ include "rondb.mgmdHostname" . }}
- name: BACKUP_ID
value: {{ .Values.restoreFromBackup.backupId | int | quote }}
value: {{ include "rondb.restoreFromBackup.backupId" . | quote }}
resources:
limits:
cpu: {{ .Values.resources.limits.cpus.restore }}
Expand Down Expand Up @@ -309,7 +296,7 @@ spec:
echo "Successfully removed native backup ID $BACKUP_ID from data node Pods"
env:
- name: BACKUP_ID
value: {{ .Values.restoreFromBackup.backupId | int | quote }}
value: {{ include "rondb.restoreFromBackup.backupId" . | quote }}
resources:
limits:
cpu: {{ .Values.resources.limits.cpus.restore }}
Expand Down
39 changes: 8 additions & 31 deletions templates/backups/shared.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Copyright (c) 2024-2025 Hopsworks AB. All rights reserved.

{{ if or .Values.backups.enabled .Values.restoreFromBackup.backupId }}
{{ if or (include "rondb.backups.isEnabled" .) (include "rondb.restoreFromBackup.backupId" .) }}
apiVersion: v1
kind: ConfigMap
metadata:
Expand All @@ -9,38 +9,10 @@ metadata:
data:
rclone.conf: |
[{{ include "rondb.rcloneRestoreRemoteName" . }}]
{{- if eq $.Values.restoreFromBackup.objectStorageProvider "s3" }}
type = s3
provider = {{ .Values.restoreFromBackup.s3.provider }}
access_key_id = REG_ACCESS_KEY_ID
secret_access_key = REG_SECRET_ACCESS_KEY
env_auth = false
region = {{ .Values.restoreFromBackup.s3.region }}
{{- if .Values.restoreFromBackup.s3.serverSideEncryption }}
server_side_encryption = {{ .Values.restoreFromBackup.s3.serverSideEncryption }}
{{- end }}
storage_class = STANDARD
{{- if .Values.restoreFromBackup.s3.endpoint }}
endpoint = {{ .Values.restoreFromBackup.s3.endpoint }}
{{- end }}
{{- end }}
{{ include "rondb.rcloneConfig" (dict "backupConfig" .Values.restoreFromBackup "global" .Values.global) | nindent 8}}

[{{ include "rondb.rcloneBackupRemoteName" . }}]
{{- if eq $.Values.backups.objectStorageProvider "s3" }}
type = s3
provider = {{ .Values.backups.s3.provider }}
access_key_id = REG_ACCESS_KEY_ID
secret_access_key = REG_SECRET_ACCESS_KEY
env_auth = false
region = {{ .Values.backups.s3.region }}
{{- if .Values.backups.s3.serverSideEncryption }}
server_side_encryption = {{ .Values.backups.s3.serverSideEncryption }}
{{- end }}
storage_class = STANDARD
{{- if .Values.backups.s3.endpoint }}
endpoint = {{ .Values.backups.s3.endpoint }}
{{- end }}
{{- end }}
{{ include "rondb.rcloneConfig" (dict "backupConfig" .Values.backups "global" .Values.global) | nindent 8}}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
Expand Down Expand Up @@ -82,4 +54,9 @@ rules:
{{- end }}
{{- end }}
- {{ include "rondb.mysqldPodname" . }}
{{- if include "rondb.backups.metadataStore.configMapName" $ }}
- apiGroups: [""]
resources: ["configmaps"]
verbs: ["create", "get", "list", "patch"]
{{- end }}
{{- end }}
2 changes: 1 addition & 1 deletion templates/mysqlds/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@
echo "Waiting for {{ include "rondb.mysqldSetupJobName" . }} Job to have completed"

{{- $waitTimeoutMinutes := .Values.timeoutsMinutes.singleSetupMySQLds }}
{{- if .Values.restoreFromBackup.backupId }}
{{- if include "rondb.restoreFromBackup.backupId" . }}
{{- $waitTimeoutMinutes := (add $waitTimeoutMinutes .Values.timeoutsMinutes.restoreNativeBackup) }}
{{- end }}
(
Expand Down
2 changes: 1 addition & 1 deletion templates/mysqlds/binlog_servers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ spec:
{{- if .Values.priorityClass }}
priorityClassName: {{ .Values.priorityClass | quote }}
{{- end }}
{{- if .Values.restoreFromBackup.backupId }}
{{- if include "rondb.restoreFromBackup.backupId" . }}
# We need to wait for the restore Job
serviceAccountName: {{ include "rondb.serviceAccount.restoreWatcher" . }}
{{- end }}
Expand Down
Loading
Loading