Skip to content

Commit 2c8f5a5

Browse files
authored
fix: loki can not recovery after node restart (#2267)
1 parent 24c03b8 commit 2c8f5a5

File tree

8 files changed

+206
-14
lines changed

8 files changed

+206
-14
lines changed
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
#!/bin/sh
# check-index-gateway-ring.sh
#
# Check if index gateway ring has ACTIVE instances.
# Succeeds only when the local Loki HTTP endpoint answers /ready AND the
# /indexgateway/ring page contains at least one '<td>ACTIVE</td>' cell.
#
# This script is used in startupProbe and readinessProbe.
# Uses curl from tools volume (copied by initContainer).
#
# Environment:
#   SERVER_HTTP_PORT  HTTP port of the local Loki process (default: 3100)
#   CURL_MAX_TIME     per-request limit in seconds handed to curl --max-time
#                     (default: 1)
#
# Exit status:
#   0  at least one ACTIVE instance was found
#   1  curl missing, Loki not ready, ring page unreachable, or no ACTIVE entry

LOCAL_PORT="${SERVER_HTTP_PORT:-3100}"
CURL="/kb-tools/curl"
# Two requests are made per run, so each one is bounded; otherwise a hung
# endpoint would keep the probe process alive until the kubelet kills it.
CURL_MAX_TIME="${CURL_MAX_TIME:-1}"
BASE_URL="http://localhost:${LOCAL_PORT}"

# Check if curl is available
if [ ! -x "$CURL" ]; then
  echo "curl not found at $CURL"
  exit 1
fi

# Check if Loki service is ready
if ! "$CURL" -sf --max-time "$CURL_MAX_TIME" "${BASE_URL}/ready" > /dev/null 2>&1; then
  echo "Loki service not ready"
  exit 1
fi

# Check index gateway ring for ACTIVE instances.
# A failed or timed-out request leaves RING_HTML empty.
RING_HTML=$("$CURL" -sf --max-time "$CURL_MAX_TIME" "${BASE_URL}/indexgateway/ring" 2>/dev/null) || RING_HTML=""
if [ -z "$RING_HTML" ]; then
  echo "Cannot access index gateway ring endpoint"
  exit 1
fi

# Check HTML for ACTIVE status instances.
# printf is used instead of echo because some /bin/sh implementations expand
# backslash sequences in echo arguments. grep -o emits one line per match, so
# wc -l yields the number of ACTIVE cells (0 when grep finds nothing).
ACTIVE_COUNT=$(printf '%s\n' "$RING_HTML" | grep -o '<td>ACTIVE</td>' | wc -l)
if [ "$ACTIVE_COUNT" -eq 0 ]; then
  echo "Index gateway ring is empty, no ACTIVE instances found"
  exit 1
fi

echo "Index gateway ring is ready with $ACTIVE_COUNT ACTIVE instance(s)"
exit 0
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#!/bin/sh
# wait-index-gateway-ring.sh
#
# Wait for at least one index gateway instance to be ACTIVE in the ring.
# This script is used as an init container for read/write components.
#
# Polls the backend service every POLL_INTERVAL seconds: first /ready, then
# /indexgateway/ring, counting '<td>ACTIVE</td>' cells in the returned HTML.
#
# Environment:
#   KB_CLUSTER_NAME   (required) cluster name; backend service is "<name>-backend"
#   KB_NAMESPACE      (required) namespace of the backend service
#   CLUSTER_DOMAIN    (required) Kubernetes cluster DNS domain
#   SERVER_HTTP_PORT  backend HTTP port (default: 3100)
#   MAX_WAIT          seconds of polling before giving up (default: 300)
#   CURL_MAX_TIME     per-request limit in seconds for curl --max-time
#                     (default: 5)
#
# Exit status:
#   0  an ACTIVE instance was observed
#   1  timed out, or a required variable is missing

# 'pipefail' is intentionally not enabled: it is not available in every
# /bin/sh, and with it a no-match grep would fail the counting pipeline below.
set -eu

# Fail early with a clear message instead of building a broken URL.
: "${KB_CLUSTER_NAME:?KB_CLUSTER_NAME must be set}"
: "${KB_NAMESPACE:?KB_NAMESPACE must be set}"
: "${CLUSTER_DOMAIN:?CLUSTER_DOMAIN must be set}"

BACKEND_SVC="${KB_CLUSTER_NAME}-backend"
BACKEND_PORT="${SERVER_HTTP_PORT:-3100}"
MAX_WAIT="${MAX_WAIT:-300}" # 5 minutes default
CURL_MAX_TIME="${CURL_MAX_TIME:-5}"
POLL_INTERVAL=5
BACKEND_HOST="${BACKEND_SVC}.${KB_NAMESPACE}.svc.${CLUSTER_DOMAIN}:${BACKEND_PORT}"
BACKEND_URL="http://${BACKEND_HOST}"
ELAPSED=0

echo "Waiting for index gateway ring to be ready..."
echo "Backend service: ${BACKEND_HOST}"
echo "Max wait time: ${MAX_WAIT} seconds"

# NOTE: ELAPSED counts sleep time only; time spent inside curl (bounded by
# CURL_MAX_TIME per request) is not included.
while [ "$ELAPSED" -lt "$MAX_WAIT" ]; do
  # Check if backend service is accessible
  if curl -sf --max-time "$CURL_MAX_TIME" "${BACKEND_URL}/ready" > /dev/null 2>&1; then
    # Check ring for ACTIVE instances (parse HTML).
    # A failed or timed-out request leaves RING_HTML empty.
    RING_HTML=$(curl -sf --max-time "$CURL_MAX_TIME" "${BACKEND_URL}/indexgateway/ring" 2>/dev/null) || RING_HTML=""
    if [ -n "$RING_HTML" ]; then
      # grep -o prints one line per match; wc -l reports 0 when there is none.
      ACTIVE_COUNT=$(printf '%s\n' "$RING_HTML" | grep -o '<td>ACTIVE</td>' | wc -l)
      if [ "$ACTIVE_COUNT" -gt 0 ]; then
        echo "Index gateway ring is ready with $ACTIVE_COUNT ACTIVE instance(s)"
        exit 0
      fi
    fi
  fi
  echo "Waiting for index gateway ring... ($ELAPSED/$MAX_WAIT seconds)"
  sleep "$POLL_INTERVAL"
  ELAPSED=$((ELAPSED + POLL_INTERVAL))
done

echo "Timeout waiting for index gateway ring after $MAX_WAIT seconds"
exit 1

addons/loki/templates/_helpers.tpl

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ Docker image name
100100
{{- if .Values.enterprise.enabled -}}{{- include "loki.enterpriseImage" . -}}{{- else -}}{{- include "loki.lokiImage" . -}}{{- end -}}
101101
{{- end -}}
102102

103+
103104
{{/*
104105
write fullname
105106
*/}}
@@ -275,6 +276,23 @@ Define loki write component definition regular expression name prefix
275276
^loki-write-
276277
{{- end -}}
277278

279+
{{/*
280+
Define the name of the loki scripts ConfigMap template (loki-scripts-<chart version>)
281+
*/}}
282+
{{- define "loki.scriptsTemplate" -}}
283+
loki-scripts-{{ .Chart.Version }}
284+
{{- end -}}
285+
286+
{{/*
287+
Generate the loki scripts ConfigMap data: one entry per file under scripts/, keyed by the file's base name
288+
*/}}
289+
{{- define "loki.extend.scripts" -}}
290+
{{- range $path, $_ := $.Files.Glob "scripts/**" }}
291+
{{ $path | base }}: |-
292+
{{- $.Files.Get $path | nindent 2 }}
293+
{{- end }}
294+
{{- end }}
295+
278296
{{/*
279297
object storage serviceRef declarations
280298
*/}}

addons/loki/templates/cmpd-read.yaml

Lines changed: 48 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,26 @@ spec:
3737
topologyKey: kubernetes.io/hostname
3838
securityContext:
3939
fsGroup: 10001
40+
initContainers:
41+
- name: wait-index-gateway
42+
imagePullPolicy: {{ .Values.images.curl.pullPolicy }}
43+
command:
44+
- /bin/sh
45+
- -c
46+
- |
47+
# Copy curl to tools volume for use in probes
48+
cp /bin/curl /kb-tools/curl
49+
50+
# Execute wait script
51+
/kb-scripts/wait-index-gateway-ring.sh
52+
env:
53+
- name: MAX_WAIT
54+
value: "300"
55+
volumeMounts:
56+
- name: scripts
57+
mountPath: /kb-scripts
58+
- name: tools
59+
mountPath: /kb-tools
4060
containers:
4161
- name: read
4262
imagePullPolicy: {{ .Values.images.pullPolicy }}
@@ -69,25 +89,40 @@ spec:
6989
- containerPort: {{ .Values.server.httpMemberlistPort }}
7090
name: http-memberlist
7191
protocol: TCP
92+
startupProbe:
93+
exec:
94+
command:
95+
- /kb-scripts/check-index-gateway-ring.sh
96+
initialDelaySeconds: 10
97+
periodSeconds: 5
98+
timeoutSeconds: 3
99+
successThreshold: 1
100+
failureThreshold: 60
72101
readinessProbe:
73-
failureThreshold: 3
74-
httpGet:
75-
path: /ready
76-
port: http-metrics
77-
scheme: HTTP
102+
exec:
103+
command:
104+
- /kb-scripts/check-index-gateway-ring.sh
78105
initialDelaySeconds: 15
79106
periodSeconds: 10
107+
timeoutSeconds: 3
80108
successThreshold: 1
81-
timeoutSeconds: 1
109+
failureThreshold: 3
82110
volumeMounts:
83-
- mountPath: /etc/loki/config
111+
- mountPath: /etc/loki/config
84112
name: config
85113
- mountPath: /etc/loki/runtime-config
86114
name: runtime-config
87115
- mountPath: /tmp
88116
name: tmp
89117
- mountPath: /var/loki
90118
name: data
119+
- mountPath: /kb-scripts
120+
name: scripts
121+
- mountPath: /kb-tools
122+
name: tools
123+
volumes:
124+
- emptyDir: {}
125+
name: tools
91126
configs:
92127
- name: loki-config
93128
template: loki-tpl
@@ -99,6 +134,12 @@ spec:
99134
volumeName: runtime-config
100135
namespace: {{ .Release.Namespace }}
101136
restartOnFileChange: true
137+
scripts:
138+
- name: loki-scripts
139+
template: {{ include "loki.scriptsTemplate" . }}
140+
namespace: {{ .Release.Namespace }}
141+
volumeName: scripts
142+
defaultMode: 0555
102143
vars:
103144
- name: SERVER_HTTP_PORT
104145
value: {{ .Values.server.httpMetricsPort | quote }}

addons/loki/templates/cmpd-write.yaml

Lines changed: 48 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,26 @@ spec:
3737
topologyKey: kubernetes.io/hostname
3838
securityContext:
3939
fsGroup: 10001
40+
initContainers:
41+
- name: wait-index-gateway
42+
imagePullPolicy: {{ .Values.images.curl.pullPolicy }}
43+
command:
44+
- /bin/sh
45+
- -c
46+
- |
47+
# Copy curl to tools volume for use in probes
48+
cp /bin/curl /kb-tools/curl
49+
50+
# Execute wait script
51+
/kb-scripts/wait-index-gateway-ring.sh
52+
env:
53+
- name: MAX_WAIT
54+
value: "300"
55+
volumeMounts:
56+
- name: scripts
57+
mountPath: /kb-scripts
58+
- name: tools
59+
mountPath: /kb-tools
4060
containers:
4161
- name: write
4262
imagePullPolicy: {{ .Values.images.pullPolicy }}
@@ -66,25 +86,40 @@ spec:
6686
- containerPort: {{ .Values.server.httpMemberlistPort }}
6787
name: http-memberlist
6888
protocol: TCP
89+
startupProbe:
90+
exec:
91+
command:
92+
- /kb-scripts/check-index-gateway-ring.sh
93+
initialDelaySeconds: 10
94+
periodSeconds: 5
95+
timeoutSeconds: 3
96+
successThreshold: 1
97+
failureThreshold: 60
6998
readinessProbe:
70-
failureThreshold: 3
71-
httpGet:
72-
path: /ready
73-
port: http-metrics
74-
scheme: HTTP
99+
exec:
100+
command:
101+
- /kb-scripts/check-index-gateway-ring.sh
75102
initialDelaySeconds: 15
76103
periodSeconds: 10
104+
timeoutSeconds: 3
77105
successThreshold: 1
78-
timeoutSeconds: 1
106+
failureThreshold: 3
79107
volumeMounts:
80-
- mountPath: /etc/loki/config
108+
- mountPath: /etc/loki/config
81109
name: config
82110
- mountPath: /etc/loki/runtime-config
83111
name: runtime-config
84112
- mountPath: /tmp
85113
name: tmp
86114
- mountPath: /var/loki
87115
name: data
116+
- mountPath: /kb-scripts
117+
name: scripts
118+
- mountPath: /kb-tools
119+
name: tools
120+
volumes:
121+
- emptyDir: {}
122+
name: tools
88123
configs:
89124
- name: loki-config
90125
template: loki-tpl
@@ -96,6 +131,12 @@ spec:
96131
volumeName: runtime-config
97132
namespace: {{ .Release.Namespace }}
98133
restartOnFileChange: true
134+
scripts:
135+
- name: loki-scripts
136+
template: {{ include "loki.scriptsTemplate" . }}
137+
namespace: {{ .Release.Namespace }}
138+
volumeName: scripts
139+
defaultMode: 0555
99140
vars:
100141
- name: SERVER_HTTP_PORT
101142
value: {{ .Values.server.httpMetricsPort | quote }}

addons/loki/templates/cmpv.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,12 @@ spec:
2929
serviceVersion: 1.0.0
3030
images:
3131
write: {{ .Values.images.registry | default "docker.io" }}/{{ .Values.images.repository }}:{{ .Values.images.tag }}
32+
wait-index-gateway: {{ .Values.images.registry | default "docker.io" }}/{{ .Values.images.curl.repository }}:{{ .Values.images.curl.tag }}
3233
- name: read-1.0.0
3334
serviceVersion: 1.0.0
3435
images:
3536
read: {{ .Values.images.registry | default "docker.io" }}/{{ .Values.images.repository }}:{{ .Values.images.tag }}
37+
wait-index-gateway: {{ .Values.images.registry | default "docker.io" }}/{{ .Values.images.curl.repository }}:{{ .Values.images.curl.tag }}
3638
- name: backend-1.0.0
3739
serviceVersion: 1.0.0
3840
images:

addons/loki/templates/scripts.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
apiVersion: v1
2+
kind: ConfigMap
3+
metadata:
4+
name: {{ include "loki.scriptsTemplate" . }}
5+
labels:
6+
{{- include "loki.labels" . | nindent 4 }}
7+
annotations:
8+
{{- include "loki.annotations" . | nindent 4 }}
9+
data:
10+
{{- with include "loki.extend.scripts" . }}
11+
{{- . | nindent 2 }}
12+
{{- end }}

addons/loki/values.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,11 @@ images:
1111
tag: 1.24-alpine
1212
repository: nginxinc/nginx-unprivileged
1313
pullPolicy: IfNotPresent
14+
# Curl image for init container
15+
curl:
16+
repository: apecloud/curl-jq
17+
tag: 0.1.0
18+
pullPolicy: IfNotPresent
1419

1520
nameOverride: ""
1621
fullnameOverride: ""

0 commit comments

Comments
 (0)