Skip to content

Commit 3eab5a0

Browse files
heng4fun authored and apecloud-bot committed
fix: loki can not recovery after node restart (#2267)
(cherry picked from commit 2c8f5a5)
1 parent 73d9961 commit 3eab5a0

File tree

8 files changed

+206
-14
lines changed

8 files changed

+206
-14
lines changed
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
#!/bin/sh
# check-index-gateway-ring.sh
#
# Probe helper: succeeds (exit 0) only when the local Loki HTTP endpoint
# answers /ready and its /indexgateway/ring page lists at least one ACTIVE
# instance. Used by the startupProbe and readinessProbe.
# Uses the curl binary from the tools volume (copied there by the
# initContainer).
#
# Environment:
#   SERVER_HTTP_PORT  local Loki HTTP port (default: 3100)

CURL="/kb-tools/curl"
LOCAL_PORT="${SERVER_HTTP_PORT:-3100}"
BASE_URL="http://localhost:${LOCAL_PORT}"

# Print the failure reason on stdout and end the probe with status 1.
fail() {
  echo "$1"
  exit 1
}

# The curl binary must be present and executable.
[ -x "$CURL" ] || fail "curl not found at $CURL"

# Loki itself has to report ready before the ring is inspected.
"$CURL" -sf "${BASE_URL}/ready" > /dev/null 2>&1 || fail "Loki service not ready"

# Fetch the ring status page; a failed request leaves the variable empty.
RING_HTML=$("$CURL" -sf "${BASE_URL}/indexgateway/ring" 2>/dev/null || echo "")
[ -n "$RING_HTML" ] || fail "Cannot access index gateway ring endpoint"

# Count the table cells that report an ACTIVE instance.
ACTIVE_COUNT=$(echo "$RING_HTML" | grep -o '<td>ACTIVE</td>' | wc -l || echo "0")
if [ "$ACTIVE_COUNT" -eq "0" ]; then
  fail "Index gateway ring is empty, no ACTIVE instances found"
fi

echo "Index gateway ring is ready with $ACTIVE_COUNT ACTIVE instance(s)"
exit 0
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
#!/bin/sh
# wait-index-gateway-ring.sh
#
# Wait for at least one index gateway instance to be ACTIVE in the ring.
# This script is used as an init container for read/write components.
#
# Required environment:
#   KB_CLUSTER_NAME, KB_NAMESPACE, CLUSTER_DOMAIN  - used to build the
#       backend service address
# Optional environment:
#   SERVER_HTTP_PORT  backend HTTP port (default: 3100)
#   MAX_WAIT          seconds of polling delay before giving up (default: 300)
#
# Exit status: 0 once an ACTIVE instance is seen, 1 on timeout.

# Only -e and -u are enabled. `pipefail` is not available in every /bin/sh
# (some shells abort on the unknown option), and with it enabled a grep that
# finds no match would make the counting pipeline below fail.
set -eu

BACKEND_SVC="${KB_CLUSTER_NAME}-backend"
BACKEND_PORT="${SERVER_HTTP_PORT:-3100}"
BACKEND_HOST="${BACKEND_SVC}.${KB_NAMESPACE}.svc.${CLUSTER_DOMAIN}:${BACKEND_PORT}"
MAX_WAIT="${MAX_WAIT:-300}" # 5 minutes default
POLL_INTERVAL=5
ELAPSED=0

echo "Waiting for index gateway ring to be ready..."
echo "Backend service: ${BACKEND_HOST}"
echo "Max wait time: ${MAX_WAIT} seconds"

# ELAPSED counts the sleep time only; the curl timeouts below bound how long
# a single poll can take so a hung connection cannot stall the loop forever.
while [ "$ELAPSED" -lt "$MAX_WAIT" ]; do
  # Check if backend service is accessible
  if curl -sf --connect-timeout 5 --max-time 10 \
    "http://${BACKEND_HOST}/ready" > /dev/null 2>&1; then
    # Check ring for ACTIVE instances (parse HTML); a failed request leaves
    # RING_HTML empty instead of aborting the script under `set -e`.
    RING_HTML=$(curl -sf --connect-timeout 5 --max-time 10 \
      "http://${BACKEND_HOST}/indexgateway/ring" 2>/dev/null || echo "")
    if [ -n "$RING_HTML" ]; then
      # The pipeline's status is that of `wc`, which prints 0 when grep
      # matches nothing, so no fallback value is needed.
      ACTIVE_COUNT=$(printf '%s\n' "$RING_HTML" | grep -o '<td>ACTIVE</td>' | wc -l)
      if [ "$ACTIVE_COUNT" -gt 0 ]; then
        echo "Index gateway ring is ready with $ACTIVE_COUNT ACTIVE instance(s)"
        exit 0
      fi
    fi
  fi
  echo "Waiting for index gateway ring... ($ELAPSED/$MAX_WAIT seconds)"
  sleep "$POLL_INTERVAL"
  ELAPSED=$((ELAPSED + POLL_INTERVAL))
done

echo "Timeout waiting for index gateway ring after $MAX_WAIT seconds"
exit 1

addons/loki/templates/_helpers.tpl

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ Docker image name
100100
{{- if .Values.enterprise.enabled -}}{{- include "loki.enterpriseImage" . -}}{{- else -}}{{- include "loki.lokiImage" . -}}{{- end -}}
101101
{{- end -}}
102102

103+
103104
{{/*
104105
write fullname
105106
*/}}
@@ -275,6 +276,23 @@ Define loki write component definition regular expression name prefix
275276
^loki-write-
276277
{{- end -}}
277278

279+
{{/*
280+
Define loki scripts configMap template name
281+
*/}}
282+
{{- define "loki.scriptsTemplate" -}}
283+
loki-scripts-{{ .Chart.Version }}
284+
{{- end -}}
285+
286+
{{/*
287+
Generate loki scripts configmap
288+
*/}}
289+
{{- define "loki.extend.scripts" -}}
290+
{{- range $path, $_ := $.Files.Glob "scripts/**" }}
291+
{{ $path | base }}: |-
292+
{{- $.Files.Get $path | nindent 2 }}
293+
{{- end }}
294+
{{- end }}
295+
278296
{{/*
279297
object storage serviceRef declarations
280298
*/}}

addons/loki/templates/cmpd-read.yaml

Lines changed: 48 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,26 @@ spec:
3737
topologyKey: kubernetes.io/hostname
3838
securityContext:
3939
fsGroup: 10001
40+
initContainers:
41+
- name: wait-index-gateway
42+
imagePullPolicy: {{ .Values.images.curl.pullPolicy }}
43+
command:
44+
- /bin/sh
45+
- -c
46+
- |
47+
# Copy curl to tools volume for use in probes
48+
cp /bin/curl /kb-tools/curl
49+
50+
# Execute wait script
51+
/kb-scripts/wait-index-gateway-ring.sh
52+
env:
53+
- name: MAX_WAIT
54+
value: "300"
55+
volumeMounts:
56+
- name: scripts
57+
mountPath: /kb-scripts
58+
- name: tools
59+
mountPath: /kb-tools
4060
containers:
4161
- name: read
4262
imagePullPolicy: {{ .Values.images.pullPolicy }}
@@ -63,25 +83,40 @@ spec:
6383
- containerPort: {{ .Values.server.httpMemberlistPort }}
6484
name: http-memberlist
6585
protocol: TCP
86+
startupProbe:
87+
exec:
88+
command:
89+
- /kb-scripts/check-index-gateway-ring.sh
90+
initialDelaySeconds: 10
91+
periodSeconds: 5
92+
timeoutSeconds: 3
93+
successThreshold: 1
94+
failureThreshold: 60
6695
readinessProbe:
67-
failureThreshold: 3
68-
httpGet:
69-
path: /ready
70-
port: http-metrics
71-
scheme: HTTP
96+
exec:
97+
command:
98+
- /kb-scripts/check-index-gateway-ring.sh
7299
initialDelaySeconds: 15
73100
periodSeconds: 10
101+
timeoutSeconds: 3
74102
successThreshold: 1
75-
timeoutSeconds: 1
103+
failureThreshold: 3
76104
volumeMounts:
77-
- mountPath: /etc/loki/config
105+
- mountPath: /etc/loki/config
78106
name: config
79107
- mountPath: /etc/loki/runtime-config
80108
name: runtime-config
81109
- mountPath: /tmp
82110
name: tmp
83111
- mountPath: /var/loki
84112
name: data
113+
- mountPath: /kb-scripts
114+
name: scripts
115+
- mountPath: /kb-tools
116+
name: tools
117+
volumes:
118+
- emptyDir: {}
119+
name: tools
85120
configs:
86121
- name: loki-config
87122
template: loki-tpl
@@ -93,6 +128,12 @@ spec:
93128
volumeName: runtime-config
94129
namespace: {{ .Release.Namespace }}
95130
restartOnFileChange: true
131+
scripts:
132+
- name: loki-scripts
133+
template: {{ include "loki.scriptsTemplate" . }}
134+
namespace: {{ .Release.Namespace }}
135+
volumeName: scripts
136+
defaultMode: 0555
96137
vars:
97138
- name: SERVER_HTTP_PORT
98139
value: {{ .Values.server.httpMetricsPort | quote }}

addons/loki/templates/cmpd-write.yaml

Lines changed: 48 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,26 @@ spec:
3737
topologyKey: kubernetes.io/hostname
3838
securityContext:
3939
fsGroup: 10001
40+
initContainers:
41+
- name: wait-index-gateway
42+
imagePullPolicy: {{ .Values.images.curl.pullPolicy }}
43+
command:
44+
- /bin/sh
45+
- -c
46+
- |
47+
# Copy curl to tools volume for use in probes
48+
cp /bin/curl /kb-tools/curl
49+
50+
# Execute wait script
51+
/kb-scripts/wait-index-gateway-ring.sh
52+
env:
53+
- name: MAX_WAIT
54+
value: "300"
55+
volumeMounts:
56+
- name: scripts
57+
mountPath: /kb-scripts
58+
- name: tools
59+
mountPath: /kb-tools
4060
containers:
4161
- name: write
4262
imagePullPolicy: {{ .Values.images.pullPolicy }}
@@ -60,25 +80,40 @@ spec:
6080
- containerPort: {{ .Values.server.httpMemberlistPort }}
6181
name: http-memberlist
6282
protocol: TCP
83+
startupProbe:
84+
exec:
85+
command:
86+
- /kb-scripts/check-index-gateway-ring.sh
87+
initialDelaySeconds: 10
88+
periodSeconds: 5
89+
timeoutSeconds: 3
90+
successThreshold: 1
91+
failureThreshold: 60
6392
readinessProbe:
64-
failureThreshold: 3
65-
httpGet:
66-
path: /ready
67-
port: http-metrics
68-
scheme: HTTP
93+
exec:
94+
command:
95+
- /kb-scripts/check-index-gateway-ring.sh
6996
initialDelaySeconds: 15
7097
periodSeconds: 10
98+
timeoutSeconds: 3
7199
successThreshold: 1
72-
timeoutSeconds: 1
100+
failureThreshold: 3
73101
volumeMounts:
74-
- mountPath: /etc/loki/config
102+
- mountPath: /etc/loki/config
75103
name: config
76104
- mountPath: /etc/loki/runtime-config
77105
name: runtime-config
78106
- mountPath: /tmp
79107
name: tmp
80108
- mountPath: /var/loki
81109
name: data
110+
- mountPath: /kb-scripts
111+
name: scripts
112+
- mountPath: /kb-tools
113+
name: tools
114+
volumes:
115+
- emptyDir: {}
116+
name: tools
82117
configs:
83118
- name: loki-config
84119
template: loki-tpl
@@ -90,6 +125,12 @@ spec:
90125
volumeName: runtime-config
91126
namespace: {{ .Release.Namespace }}
92127
restartOnFileChange: true
128+
scripts:
129+
- name: loki-scripts
130+
template: {{ include "loki.scriptsTemplate" . }}
131+
namespace: {{ .Release.Namespace }}
132+
volumeName: scripts
133+
defaultMode: 0555
93134
vars:
94135
- name: SERVER_HTTP_PORT
95136
value: {{ .Values.server.httpMetricsPort | quote }}

addons/loki/templates/cmpv.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,12 @@ spec:
2929
serviceVersion: 1.0.0
3030
images:
3131
write: {{ .Values.images.registry | default "docker.io" }}/{{ .Values.images.repository }}:{{ .Values.images.tag }}
32+
wait-index-gateway: {{ .Values.images.registry | default "docker.io" }}/{{ .Values.images.curl.repository }}:{{ .Values.images.curl.tag }}
3233
- name: read-1.0.0
3334
serviceVersion: 1.0.0
3435
images:
3536
read: {{ .Values.images.registry | default "docker.io" }}/{{ .Values.images.repository }}:{{ .Values.images.tag }}
37+
wait-index-gateway: {{ .Values.images.registry | default "docker.io" }}/{{ .Values.images.curl.repository }}:{{ .Values.images.curl.tag }}
3638
- name: backend-1.0.0
3739
serviceVersion: 1.0.0
3840
images:

addons/loki/templates/scripts.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
apiVersion: v1
2+
kind: ConfigMap
3+
metadata:
4+
name: {{ include "loki.scriptsTemplate" . }}
5+
labels:
6+
{{- include "loki.labels" . | nindent 4 }}
7+
annotations:
8+
{{- include "loki.annotations" . | nindent 4 }}
9+
data:
10+
{{- with include "loki.extend.scripts" . }}
11+
{{- . | nindent 2 }}
12+
{{- end }}

addons/loki/values.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,11 @@ images:
1111
tag: 1.24-alpine
1212
repository: nginxinc/nginx-unprivileged
1313
pullPolicy: IfNotPresent
14+
# Curl image for init container
15+
curl:
16+
repository: apecloud/curl-jq
17+
tag: 0.1.0
18+
pullPolicy: IfNotPresent
1419

1520
nameOverride: ""
1621
fullnameOverride: ""

0 commit comments

Comments
 (0)