Skip to content

Commit 6bafe5c

Browse files
authored
feat: implement safe clickhouse upgrade process + resource limits support (#129)
- Recreate deployment strategy: ensures the old pod terminates completely before the new one starts, preventing multiple pods from accessing the same persistent volume.
- imagePullPolicy: IfNotPresent: consistent image pull behavior.
- Resource limits support: allows users to configure CPU/memory to prevent OOM kills.
- Enhanced preStop hook, which includes:
  - SYSTEM STOP MOVES - stops data movement between disks
  - SYSTEM STOP MERGES - stops background merge operations
  - SYSTEM FLUSH DISTRIBUTED - flushes pending distributed table data
  - SYSTEM FLUSH LOGS - flushes buffered logs to system tables
  - a 5-second grace period before SIGTERM
- startupProbe: gives ClickHouse up to 5 minutes (30 failures × 10s) to start, which is crucial for upgrades with schema migrations.
- shutdown_wait_unfinished: 60: gives ClickHouse 60 seconds to wait for active connections to close gracefully before forcing shutdown.
- Bumps the default ClickHouse version to `v25.7`.

Ref: HDX-2569
1 parent 69c3a42 commit 6bafe5c

File tree

6 files changed

+276
-28
lines changed

6 files changed

+276
-28
lines changed

.changeset/puny-hats-turn.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"helm-charts": minor
3+
---
4+
5+
feat: implement safe clickhouse upgrade process + resource limits support

.changeset/shaggy-squids-wonder.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"helm-charts": minor
3+
---
4+
5+
chore: bump clickhouse to v25.7

charts/hdx-oss-v2/data/config.xml

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
<?xml version="1.0"?>
22
<clickhouse>
3+
<user_directories>
4+
<users_xml>
5+
<path>/etc/clickhouse-server/users.xml</path>
6+
</users_xml>
7+
</user_directories>
8+
39
<logger>
410
<level>information</level>
511
<console>true</console>
@@ -27,6 +33,9 @@
2733
<timezone>UTC</timezone>
2834
<mlock_executable>false</mlock_executable>
2935

36+
<!-- Graceful shutdown settings -->
37+
<shutdown_wait_unfinished>60</shutdown_wait_unfinished>
38+
3039
{{- if .Values.clickhouse.prometheus.enabled }}
3140
<!-- Prometheus exporter -->
3241
<prometheus>
@@ -154,4 +163,4 @@
154163
</distributed_ddl>
155164

156165
<format_schema_path>/var/lib/clickhouse/format_schemas/</format_schema_path>
157-
</clickhouse>
166+
</clickhouse>

charts/hdx-oss-v2/templates/clickhouse-deployment.yaml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ metadata:
88
app: clickhouse
99
spec:
1010
replicas: 1
11+
strategy:
12+
type: Recreate
1113
selector:
1214
matchLabels:
1315
{{- include "hdx-oss.selectorLabels" . | nindent 6 }}
@@ -18,6 +20,7 @@ spec:
1820
{{- include "hdx-oss.selectorLabels" . | nindent 8 }}
1921
app: clickhouse
2022
spec:
23+
terminationGracePeriodSeconds: {{ .Values.clickhouse.terminationGracePeriodSeconds | default 90 }}
2124
{{- if .Values.clickhouse.nodeSelector }}
2225
nodeSelector:
2326
{{- toYaml .Values.clickhouse.nodeSelector | nindent 8 }}
@@ -33,12 +36,28 @@ spec:
3336
containers:
3437
- name: clickhouse
3538
image: "{{ .Values.clickhouse.image }}"
39+
imagePullPolicy: IfNotPresent
3640
ports:
3741
- containerPort: {{ .Values.clickhouse.port }}
3842
- containerPort: {{ .Values.clickhouse.nativePort }}
3943
env:
4044
- name: CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT
4145
value: "1"
46+
{{- if .Values.clickhouse.resources }}
47+
resources:
48+
{{- toYaml .Values.clickhouse.resources | nindent 12 }}
49+
{{- end }}
50+
lifecycle:
51+
preStop:
52+
exec:
53+
command:
54+
- /bin/sh
55+
- -c
56+
- |
57+
clickhouse-client --query "SYSTEM STOP MERGES" || true
58+
clickhouse-client --query "SYSTEM STOP MOVES" || true
59+
clickhouse-client --query "SYSTEM FLUSH LOGS" || true
60+
sleep 5
4261
{{- if .Values.clickhouse.livenessProbe.enabled }}
4362
livenessProbe:
4463
httpGet:
@@ -59,6 +78,16 @@ spec:
5978
timeoutSeconds: {{ .Values.clickhouse.readinessProbe.timeoutSeconds }}
6079
failureThreshold: {{ .Values.clickhouse.readinessProbe.failureThreshold }}
6180
{{- end }}
81+
{{- if .Values.clickhouse.startupProbe.enabled }}
82+
startupProbe:
83+
httpGet:
84+
path: /ping
85+
port: {{ .Values.clickhouse.port }}
86+
initialDelaySeconds: {{ .Values.clickhouse.startupProbe.initialDelaySeconds }}
87+
periodSeconds: {{ .Values.clickhouse.startupProbe.periodSeconds }}
88+
timeoutSeconds: {{ .Values.clickhouse.startupProbe.timeoutSeconds }}
89+
failureThreshold: {{ .Values.clickhouse.startupProbe.failureThreshold }}
90+
{{- end }}
6291
volumeMounts:
6392
- name: config
6493
mountPath: /etc/clickhouse-server/config.xml

charts/hdx-oss-v2/tests/clickhouse-deployment_test.yaml

Lines changed: 169 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -384,4 +384,172 @@ tests:
384384
- documentIndex: 0
385385
equal:
386386
path: spec.template.spec.imagePullSecrets[0].name
387-
value: regcred
387+
value: regcred
388+
389+
- it: should have Recreate deployment strategy
390+
set:
391+
clickhouse:
392+
enabled: true
393+
asserts:
394+
- documentSelector: *deployment-selector
395+
equal:
396+
path: spec.strategy.type
397+
value: Recreate
398+
399+
- it: should have imagePullPolicy set to IfNotPresent
400+
set:
401+
clickhouse:
402+
enabled: true
403+
asserts:
404+
- documentSelector: *deployment-selector
405+
equal:
406+
path: spec.template.spec.containers[0].imagePullPolicy
407+
value: IfNotPresent
408+
409+
- it: should include resources when configured
410+
set:
411+
clickhouse:
412+
enabled: true
413+
resources:
414+
requests:
415+
memory: "512Mi"
416+
cpu: "500m"
417+
limits:
418+
memory: "2Gi"
419+
cpu: "2000m"
420+
asserts:
421+
- documentSelector: *deployment-selector
422+
equal:
423+
path: spec.template.spec.containers[0].resources.requests.memory
424+
value: "512Mi"
425+
- documentSelector: *deployment-selector
426+
equal:
427+
path: spec.template.spec.containers[0].resources.requests.cpu
428+
value: "500m"
429+
- documentSelector: *deployment-selector
430+
equal:
431+
path: spec.template.spec.containers[0].resources.limits.memory
432+
value: "2Gi"
433+
- documentSelector: *deployment-selector
434+
equal:
435+
path: spec.template.spec.containers[0].resources.limits.cpu
436+
value: "2000m"
437+
438+
- it: should not include resources when not configured
439+
set:
440+
clickhouse:
441+
enabled: true
442+
resources: {}
443+
asserts:
444+
- documentSelector: *deployment-selector
445+
isNull:
446+
path: spec.template.spec.containers[0].resources
447+
448+
- it: should include preStop lifecycle hook with correct commands
449+
set:
450+
clickhouse:
451+
enabled: true
452+
asserts:
453+
- documentSelector: *deployment-selector
454+
isNotNull:
455+
path: spec.template.spec.containers[0].lifecycle.preStop
456+
- documentSelector: *deployment-selector
457+
equal:
458+
path: spec.template.spec.containers[0].lifecycle.preStop.exec.command[0]
459+
value: /bin/sh
460+
- documentSelector: *deployment-selector
461+
equal:
462+
path: spec.template.spec.containers[0].lifecycle.preStop.exec.command[1]
463+
value: -c
464+
- documentSelector: *deployment-selector
465+
matchRegex:
466+
path: spec.template.spec.containers[0].lifecycle.preStop.exec.command[2]
467+
pattern: ".*SYSTEM STOP MERGES.*"
468+
- documentSelector: *deployment-selector
469+
matchRegex:
470+
path: spec.template.spec.containers[0].lifecycle.preStop.exec.command[2]
471+
pattern: ".*SYSTEM STOP MOVES.*"
472+
- documentSelector: *deployment-selector
473+
matchRegex:
474+
path: spec.template.spec.containers[0].lifecycle.preStop.exec.command[2]
475+
pattern: ".*SYSTEM FLUSH LOGS.*"
476+
477+
- it: should include startupProbe with default values when enabled
478+
set:
479+
clickhouse:
480+
enabled: true
481+
port: 8123
482+
startupProbe:
483+
enabled: true
484+
initialDelaySeconds: 5
485+
periodSeconds: 10
486+
timeoutSeconds: 5
487+
failureThreshold: 30
488+
asserts:
489+
- documentSelector: *deployment-selector
490+
isSubset:
491+
path: spec.template.spec.containers[0].startupProbe
492+
content:
493+
httpGet:
494+
path: /ping
495+
port: 8123
496+
initialDelaySeconds: 5
497+
periodSeconds: 10
498+
timeoutSeconds: 5
499+
failureThreshold: 30
500+
501+
- it: should not include startupProbe when disabled
502+
set:
503+
clickhouse:
504+
enabled: true
505+
startupProbe:
506+
enabled: false
507+
asserts:
508+
- documentSelector: *deployment-selector
509+
isNull:
510+
path: spec.template.spec.containers[0].startupProbe
511+
512+
- it: should use custom startupProbe values when provided
513+
set:
514+
clickhouse:
515+
enabled: true
516+
port: 8123
517+
startupProbe:
518+
enabled: true
519+
initialDelaySeconds: 10
520+
periodSeconds: 15
521+
timeoutSeconds: 10
522+
failureThreshold: 60
523+
asserts:
524+
- documentSelector: *deployment-selector
525+
isSubset:
526+
path: spec.template.spec.containers[0].startupProbe
527+
content:
528+
httpGet:
529+
path: /ping
530+
port: 8123
531+
initialDelaySeconds: 10
532+
periodSeconds: 15
533+
timeoutSeconds: 10
534+
failureThreshold: 60
535+
536+
- it: should have default terminationGracePeriodSeconds of 90
537+
set:
538+
clickhouse:
539+
enabled: true
540+
asserts:
541+
- documentSelector: *deployment-selector
542+
equal:
543+
path: spec.template.spec.terminationGracePeriodSeconds
544+
value: 90
545+
546+
- it: should use custom terminationGracePeriodSeconds when provided
547+
set:
548+
clickhouse:
549+
enabled: true
550+
terminationGracePeriodSeconds: 120
551+
asserts:
552+
- documentSelector: *deployment-selector
553+
equal:
554+
path: spec.template.spec.terminationGracePeriodSeconds
555+
value: 120

0 commit comments

Comments
 (0)