Skip to content

Commit ab27bc8

Browse files
committed
fix: loki can not recovery after node restart (#2267)
(cherry picked from commit 2c8f5a5)
1 parent de728f7 commit ab27bc8

File tree

8 files changed

+197
-34
lines changed

8 files changed

+197
-34
lines changed
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
#!/bin/sh
# check-index-gateway-ring.sh
#
# Probe helper used by startupProbe and readinessProbe.
# Succeeds (exit 0) only when all of the following hold:
#   1. the curl binary exists and is executable at /kb-tools/curl
#      (it is copied into the tools volume by the initContainer),
#   2. the local Loki instance answers its /ready endpoint,
#   3. the /indexgateway/ring page lists at least one ACTIVE instance.
# Any other outcome prints a one-line reason on stdout and exits 1.
#
# Environment:
#   SERVER_HTTP_PORT  HTTP port of the local Loki instance (default: 3100)

CURL="/kb-tools/curl"
LOCAL_PORT="${SERVER_HTTP_PORT:-3100}"

# Print the failure reason and terminate the probe with a non-zero status.
fail() {
  echo "$1"
  exit 1
}

# Guard: the probe cannot do anything without the copied curl binary.
[ -x "$CURL" ] || fail "curl not found at $CURL"

# Guard: Loki itself must report ready before the ring is worth inspecting.
"$CURL" -sf "http://localhost:${LOCAL_PORT}/ready" > /dev/null 2>&1 \
  || fail "Loki service not ready"

# Fetch the ring status page; a failed request is collapsed to an empty string.
RING_HTML=$("$CURL" -sf "http://localhost:${LOCAL_PORT}/indexgateway/ring" 2>/dev/null || echo "")
[ -n "$RING_HTML" ] || fail "Cannot access index gateway ring endpoint"

# Count table cells that report ACTIVE; each match is emitted on its own line
# by grep -o, so wc -l yields the number of occurrences.
ACTIVE_COUNT=$(echo "$RING_HTML" | grep -o '<td>ACTIVE</td>' | wc -l || echo "0")
if [ "$ACTIVE_COUNT" -eq 0 ]; then
  fail "Index gateway ring is empty, no ACTIVE instances found"
fi

echo "Index gateway ring is ready with $ACTIVE_COUNT ACTIVE instance(s)"
exit 0
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#!/bin/sh
# wait-index-gateway-ring.sh
#
# Wait for at least one index gateway instance to be ACTIVE in the ring.
# This script is used as an init container for read/write components.
#
# Polls the backend service every 5 seconds: first its /ready endpoint, then
# the /indexgateway/ring page, and exits 0 as soon as the page contains at
# least one "<td>ACTIVE</td>" cell. Exits 1 once MAX_WAIT seconds of polling
# have elapsed without success.
#
# Required environment:
#   KB_CLUSTER_NAME   cluster name; the backend service is "<name>-backend"
#   KB_NAMESPACE      namespace of the backend service
#   CLUSTER_DOMAIN    Kubernetes cluster domain used to build the service FQDN
# Optional environment:
#   SERVER_HTTP_PORT  backend HTTP port (default: 3100)
#   MAX_WAIT          total seconds to wait (default: 300)
#   CURL_MAX_TIME     per-request curl time limit in seconds (default: 5)

# NOTE: "pipefail" is intentionally not enabled. It is not available in every
# /bin/sh, and with it a no-match grep would make the counting pipeline below
# report failure even though "wc -l" already printed a valid 0.
set -eu

# Fail fast with a clear message instead of building a malformed URL.
: "${KB_CLUSTER_NAME:?KB_CLUSTER_NAME must be set}"
: "${KB_NAMESPACE:?KB_NAMESPACE must be set}"
: "${CLUSTER_DOMAIN:?CLUSTER_DOMAIN must be set}"

BACKEND_SVC="${KB_CLUSTER_NAME}-backend"
BACKEND_PORT="${SERVER_HTTP_PORT:-3100}"
MAX_WAIT="${MAX_WAIT:-300}" # 5 minutes default
CURL_MAX_TIME="${CURL_MAX_TIME:-5}"
POLL_INTERVAL=5
BACKEND_ADDR="${BACKEND_SVC}.${KB_NAMESPACE}.svc.${CLUSTER_DOMAIN}:${BACKEND_PORT}"
ELAPSED=0

# MAX_WAIT feeds an integer comparison; reject anything that is not a number.
case "$MAX_WAIT" in
  '' | *[!0-9]*)
    echo "MAX_WAIT must be a non-negative integer, got '${MAX_WAIT}'" >&2
    exit 1
    ;;
esac

echo "Waiting for index gateway ring to be ready..."
echo "Backend service: ${BACKEND_ADDR}"
echo "Max wait time: ${MAX_WAIT} seconds"

while [ "$ELAPSED" -lt "$MAX_WAIT" ]; do
  # Check if backend service is accessible. --max-time bounds each request so
  # a hung connection cannot stall the loop far beyond MAX_WAIT.
  if curl -sf --max-time "$CURL_MAX_TIME" "http://${BACKEND_ADDR}/ready" > /dev/null 2>&1; then
    # Check ring for ACTIVE instances (parse HTML). A failed request yields an
    # empty string rather than aborting the script under "set -e".
    RING_HTML=$(curl -sf --max-time "$CURL_MAX_TIME" "http://${BACKEND_ADDR}/indexgateway/ring" 2>/dev/null || true)
    if [ -n "$RING_HTML" ]; then
      # grep -o prints one match per line; wc -l always prints a single
      # integer (0 when nothing matched) and determines the pipeline status.
      ACTIVE_COUNT=$(printf '%s\n' "$RING_HTML" | grep -o '<td>ACTIVE</td>' | wc -l)
      if [ "$ACTIVE_COUNT" -gt 0 ]; then
        echo "Index gateway ring is ready with $ACTIVE_COUNT ACTIVE instance(s)"
        exit 0
      fi
    fi
  fi
  echo "Waiting for index gateway ring... ($ELAPSED/$MAX_WAIT seconds)"
  sleep "$POLL_INTERVAL"
  ELAPSED=$((ELAPSED + POLL_INTERVAL))
done

echo "Timeout waiting for index gateway ring after $MAX_WAIT seconds"
exit 1

addons/loki/templates/_helpers.tpl

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ Docker image name
108108
{{- if .Values.enterprise.enabled -}}{{- include "loki.enterpriseImage" . -}}{{- else -}}{{- include "loki.lokiImage" . -}}{{- end -}}
109109
{{- end -}}
110110

111+
111112
{{/*
112113
write fullname
113114
*/}}
@@ -226,6 +227,16 @@ query-scheduler fullname
226227
{{ include "loki.fullname" . }}-query-scheduler
227228
{{- end }}
228229

230+
{{/*
231+
Generate loki scripts configmap
232+
*/}}
233+
{{- define "loki.extend.scripts" -}}
234+
{{- range $path, $_ := $.Files.Glob "scripts/**" }}
235+
{{ $path | base }}: |-
236+
{{- $.Files.Get $path | nindent 2 }}
237+
{{- end }}
238+
{{- end }}
239+
229240
{{/*
230241
object storage serviceRef declarations
231242
*/}}

addons/loki/templates/cmpd-read.yaml

Lines changed: 48 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -33,20 +33,28 @@ spec:
3333
matchLabels:
3434
app.kubernetes.io/component: read
3535
topologyKey: kubernetes.io/hostname
36+
securityContext:
37+
fsGroup: 10001
3638
initContainers:
37-
- name: init
38-
imagePullPolicy: {{ .Values.images.pullPolicy }}
39-
image: {{ .Values.images.registry | default "docker.io" }}/{{ .Values.images.repository }}:{{ .Values.images.tag }}
40-
securityContext:
41-
runAsUser: 0
42-
privileged: true
39+
- name: wait-index-gateway
40+
imagePullPolicy: {{ .Values.images.curl.pullPolicy }}
4341
command:
4442
- /bin/sh
4543
- -c
46-
- chown loki:loki /var/loki
44+
- |
45+
# Copy curl to tools volume for use in probes
46+
cp /bin/curl /kb-tools/curl
47+
48+
# Execute wait script
49+
/kb-scripts/wait-index-gateway-ring.sh
50+
env:
51+
- name: MAX_WAIT
52+
value: "300"
4753
volumeMounts:
48-
- mountPath: /var/loki
49-
name: data
54+
- name: scripts
55+
mountPath: /kb-scripts
56+
- name: tools
57+
mountPath: /kb-tools
5058
containers:
5159
- name: read
5260
imagePullPolicy: {{ .Values.images.pullPolicy }}
@@ -79,25 +87,40 @@ spec:
7987
- containerPort: {{ .Values.server.httpMemberlistPort }}
8088
name: http-memberlist
8189
protocol: TCP
90+
startupProbe:
91+
exec:
92+
command:
93+
- /kb-scripts/check-index-gateway-ring.sh
94+
initialDelaySeconds: 10
95+
periodSeconds: 5
96+
timeoutSeconds: 3
97+
successThreshold: 1
98+
failureThreshold: 60
8299
readinessProbe:
83-
failureThreshold: 3
84-
httpGet:
85-
path: /ready
86-
port: http-metrics
87-
scheme: HTTP
88-
initialDelaySeconds: 30
100+
exec:
101+
command:
102+
- /kb-scripts/check-index-gateway-ring.sh
103+
initialDelaySeconds: 15
89104
periodSeconds: 10
105+
timeoutSeconds: 3
90106
successThreshold: 1
91-
timeoutSeconds: 1
107+
failureThreshold: 3
92108
volumeMounts:
93-
- mountPath: /etc/loki/config
109+
- mountPath: /etc/loki/config
94110
name: config
95111
- mountPath: /etc/loki/runtime-config
96112
name: runtime-config
97113
- mountPath: /tmp
98114
name: tmp
99115
- mountPath: /var/loki
100116
name: data
117+
- mountPath: /kb-scripts
118+
name: scripts
119+
- mountPath: /kb-tools
120+
name: tools
121+
volumes:
122+
- emptyDir: {}
123+
name: tools
101124
configs:
102125
- name: loki-config
103126
templateRef: loki-tpl
@@ -108,7 +131,15 @@ spec:
108131
templateRef: loki-runtime-tpl
109132
volumeName: runtime-config
110133
namespace: {{ .Release.Namespace }}
134+
scripts:
135+
- name: loki-scripts
136+
templateRef: loki-scripts
137+
namespace: {{ .Release.Namespace }}
138+
volumeName: scripts
139+
defaultMode: 0555
111140
vars:
141+
- name: CLUSTER_DOMAIN
142+
value: {{ .Values.global.clusterDomain}}
112143
- name: SERVER_HTTP_PORT
113144
value: {{ .Values.server.httpMetricsPort | quote }}
114145
- name: SERVER_GRPC_PORT

addons/loki/templates/cmpd-write.yaml

Lines changed: 48 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -33,20 +33,28 @@ spec:
3333
matchLabels:
3434
app.kubernetes.io/component: write
3535
topologyKey: kubernetes.io/hostname
36+
securityContext:
37+
fsGroup: 10001
3638
initContainers:
37-
- name: init
38-
imagePullPolicy: {{ .Values.images.pullPolicy }}
39-
image: {{ .Values.images.registry | default "docker.io" }}/{{ .Values.images.repository }}:{{ .Values.images.tag }}
40-
securityContext:
41-
runAsUser: 0
42-
privileged: true
39+
- name: wait-index-gateway
40+
imagePullPolicy: {{ .Values.images.curl.pullPolicy }}
4341
command:
4442
- /bin/sh
4543
- -c
46-
- chown loki:loki /var/loki
44+
- |
45+
# Copy curl to tools volume for use in probes
46+
cp /bin/curl /kb-tools/curl
47+
48+
# Execute wait script
49+
/kb-scripts/wait-index-gateway-ring.sh
50+
env:
51+
- name: MAX_WAIT
52+
value: "300"
4753
volumeMounts:
48-
- mountPath: /var/loki
49-
name: data
54+
- name: scripts
55+
mountPath: /kb-scripts
56+
- name: tools
57+
mountPath: /kb-tools
5058
containers:
5159
- name: write
5260
imagePullPolicy: {{ .Values.images.pullPolicy }}
@@ -76,25 +84,40 @@ spec:
7684
- containerPort: {{ .Values.server.httpMemberlistPort }}
7785
name: http-memberlist
7886
protocol: TCP
87+
startupProbe:
88+
exec:
89+
command:
90+
- /kb-scripts/check-index-gateway-ring.sh
91+
initialDelaySeconds: 10
92+
periodSeconds: 5
93+
timeoutSeconds: 3
94+
successThreshold: 1
95+
failureThreshold: 60
7996
readinessProbe:
80-
failureThreshold: 3
81-
httpGet:
82-
path: /ready
83-
port: http-metrics
84-
scheme: HTTP
85-
initialDelaySeconds: 30
97+
exec:
98+
command:
99+
- /kb-scripts/check-index-gateway-ring.sh
100+
initialDelaySeconds: 15
86101
periodSeconds: 10
102+
timeoutSeconds: 3
87103
successThreshold: 1
88-
timeoutSeconds: 1
104+
failureThreshold: 3
89105
volumeMounts:
90-
- mountPath: /etc/loki/config
106+
- mountPath: /etc/loki/config
91107
name: config
92108
- mountPath: /etc/loki/runtime-config
93109
name: runtime-config
94110
- mountPath: /tmp
95111
name: tmp
96112
- mountPath: /var/loki
97113
name: data
114+
- mountPath: /kb-scripts
115+
name: scripts
116+
- mountPath: /kb-tools
117+
name: tools
118+
volumes:
119+
- emptyDir: {}
120+
name: tools
98121
configs:
99122
- name: loki-config
100123
templateRef: loki-tpl
@@ -105,7 +128,15 @@ spec:
105128
templateRef: loki-runtime-tpl
106129
volumeName: runtime-config
107130
namespace: {{ .Release.Namespace }}
131+
scripts:
132+
- name: loki-scripts
133+
templateRef: loki-scripts
134+
namespace: {{ .Release.Namespace }}
135+
volumeName: scripts
136+
defaultMode: 0555
108137
vars:
138+
- name: CLUSTER_DOMAIN
139+
value: {{ .Values.global.clusterDomain}}
109140
- name: SERVER_HTTP_PORT
110141
value: {{ .Values.server.httpMetricsPort | quote }}
111142
- name: SERVER_GRPC_PORT

addons/loki/templates/cmpv.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,12 @@ spec:
2727
serviceVersion: 1.0.0
2828
images:
2929
write: {{ .Values.images.registry | default "docker.io" }}/{{ .Values.images.repository }}:{{ .Values.images.tag }}
30+
wait-index-gateway: {{ .Values.images.registry | default "docker.io" }}/{{ .Values.images.curl.repository }}:{{ .Values.images.curl.tag }}
3031
- name: read-1.0.0
3132
serviceVersion: 1.0.0
3233
images:
3334
read: {{ .Values.images.registry | default "docker.io" }}/{{ .Values.images.repository }}:{{ .Values.images.tag }}
35+
wait-index-gateway: {{ .Values.images.registry | default "docker.io" }}/{{ .Values.images.curl.repository }}:{{ .Values.images.curl.tag }}
3436
- name: backend-1.0.0
3537
serviceVersion: 1.0.0
3638
images:

addons/loki/templates/scripts.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
apiVersion: v1
2+
kind: ConfigMap
3+
metadata:
4+
name: loki-scripts
5+
labels:
6+
{{- include "loki.labels" . | nindent 4 }}
7+
data:
8+
{{- with include "loki.extend.scripts" . }}
9+
{{- . | nindent 2 }}
10+
{{- end }}

addons/loki/values.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,11 @@ images:
1111
tag: 1.24-alpine
1212
repository: nginxinc/nginx-unprivileged
1313
pullPolicy: IfNotPresent
14+
# Curl image for init container
15+
curl:
16+
repository: apecloud/curl-jq
17+
tag: 0.1.0
18+
pullPolicy: IfNotPresent
1419

1520
nameOverride: ""
1621
fullnameOverride: ""

0 commit comments

Comments
 (0)