Skip to content

Commit 75f0b0a

Browse files
authored
fix(KAR-671): stale ingester pods in the ring (#9011)
* fix(KAR-671): stale ingester pod on kflux-rhel-p01 Signed-off-by: obetsun <[email protected]> Assisted-by: Cursor AI rh-pre-commit.version: 2.3.2 rh-pre-commit.check-secrets: ENABLED * add disruption budget unhealthy eviction policy to stone-prod-rh01 Signed-off-by: obetsun <[email protected]> rh-pre-commit.version: 2.3.2 rh-pre-commit.check-secrets: ENABLED * introduce autoforget_unhealthy ingesters parameter Signed-off-by: obetsun <[email protected]> rh-pre-commit.version: 2.3.2 rh-pre-commit.check-secrets: ENABLED * setup memberlist to forget unhealthy pods Assisted-by: Cursor AI Signed-off-by: obetsun <[email protected]> rh-pre-commit.version: 2.3.2 rh-pre-commit.check-secrets: ENABLED * fix(KAR-671): stale ingester pod in the ring Signed-off-by: obetsun <[email protected]> rh-pre-commit.version: 2.3.2 rh-pre-commit.check-secrets: ENABLED * add memberlist settings to the loki ingesters config Signed-off-by: obetsun <[email protected]> rh-pre-commit.version: 2.3.2 rh-pre-commit.check-secrets: ENABLED * add minio s3 settings back as filesystem is not supported in distributed mode Assisted-by: Cursor AI Signed-off-by: obetsun <[email protected]> rh-pre-commit.version: 2.3.2 rh-pre-commit.check-secrets: ENABLED * Add minio parameters to loki secret Signed-off-by: obetsun <[email protected]> rh-pre-commit.version: 2.3.2 rh-pre-commit.check-secrets: ENABLED --------- Co-authored-by: obetsun <[email protected]>
1 parent bcef3ac commit 75f0b0a

File tree

17 files changed

+407
-14
lines changed

17 files changed

+407
-14
lines changed

components/vector-kubearchive-log-collector/development/kustomization.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,17 @@ patches:
2828
target:
2929
kind: SecurityContextConstraints
3030
name: kubearchive-logging-scc
31+
# Patch all Loki PodDisruptionBudgets to allow eviction of unhealthy pods
32+
# This is critical for StatefulSets where stale pods should be evicted
33+
- patch: |
34+
- op: add
35+
path: /spec/unhealthyPodEvictionPolicy
36+
value: AlwaysAllow
37+
target:
38+
group: policy
39+
version: v1
40+
kind: PodDisruptionBudget
41+
labelSelector: app.kubernetes.io/name=loki
3142
3243
generators:
3344
- vector-helm-generator.yaml

components/vector-kubearchive-log-collector/development/loki-helm-dev-values.yaml

Lines changed: 64 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,30 @@ loki:
3232
replication_factor: 1
3333
memberlist:
3434
join_members: []
35-
dead_node_reclaim_time: 0s
35+
# How long to wait before reclaiming a dead node's tokens
36+
# Set to 2 minutes for development (previously 0s; faster cleanup with single replica)
37+
# This helps remove stale ring instances quickly when pods are restarted
38+
dead_node_reclaim_time: 2m
39+
# How often to gossip with other nodes (lower = faster detection of failures)
40+
# Keep at 2s for quick failure detection
3641
gossip_interval: 2s
37-
push_pull_interval: 10s
38-
gossip_nodes: 2
39-
gossip_to_dead_nodes_time: 15s
40-
left_ingesters_timeout: 30s
42+
# How often to do full state sync with other nodes
43+
# Reduced for development to sync faster
44+
push_pull_interval: 5s
45+
# Number of random nodes to gossip with per interval
46+
# Set to 1 for development (only 1 ingester replica)
47+
gossip_nodes: 1
48+
# How long to continue gossiping to dead nodes (helps propagate death info)
49+
# Reduced for development to propagate death info faster
50+
gossip_to_dead_nodes_time: 10s
51+
# How long to wait for an ingester to gracefully leave before considering it dead
52+
# NOTE(review): the ingester's terminationGracePeriodSeconds is 90s, longer than this 60s value; confirm the intended ordering
53+
# Reduced to 60s for development (faster cleanup)
54+
left_ingesters_timeout: 60s
55+
max_join_backoff: 1m
56+
max_join_retries: 10
57+
min_join_backoff: 1s
58+
rejoin_interval: 90s
4159
storage:
4260
bucketNames:
4361
chunks: loki-data
@@ -80,6 +98,7 @@ loki:
8098
log_stream_creation: false
8199
log_duplicate_stream_info: true
82100
ingester:
101+
autoforget_unhealthy: true
83102
chunk_encoding: snappy
84103
chunk_target_size: 3145728
85104
chunk_idle_period: 5m
@@ -123,6 +142,38 @@ ingester:
123142
podAntiAffinity:
124143
soft: {}
125144
hard: {}
145+
# AWS credentials for S3/MinIO access
146+
# These match the MinIO credentials from loki-helm-minio-values.yaml
147+
# Using secretKeyRef to satisfy kube-linter security requirements
148+
extraEnv:
149+
- name: AWS_ACCESS_KEY_ID
150+
valueFrom:
151+
secretKeyRef:
152+
name: kubearchive-loki
153+
key: AWS_ACCESS_KEY_ID
154+
- name: AWS_SECRET_ACCESS_KEY
155+
valueFrom:
156+
secretKeyRef:
157+
name: kubearchive-loki
158+
key: AWS_SECRET_ACCESS_KEY
159+
- name: AWS_DEFAULT_REGION
160+
valueFrom:
161+
secretKeyRef:
162+
name: kubearchive-loki
163+
key: AWS_DEFAULT_REGION
164+
# Graceful shutdown configuration to prevent stale ring instances
165+
# Give Loki time to flush chunks and leave the ring gracefully
166+
# Set to 90s to be longer than left_ingesters_timeout (60s) but still allow quick cleanup
167+
terminationGracePeriodSeconds: 90
168+
lifecycle:
169+
preStop:
170+
exec:
171+
# Sleep to allow readiness probe to fail, removing pod from service endpoints
172+
# This gives distributor time to stop sending new requests before shutdown
173+
command:
174+
- /bin/sh
175+
- -c
176+
- sleep 10
126177

127178
querier:
128179
replicas: 1
@@ -198,40 +249,40 @@ chunksCache:
198249
replicas: 1
199250
batchSize: 256 # Batch size for sending/receiving chunks from cache
200251
parallelism: 10 # Parallel threads for cache operations
201-
maxItemMemory: 30 # MB - Increased from 10MB to handle chunks (3MB target + compression overhead + metadata)
252+
maxItemMemory: 10 # MB
202253
defaultValidity: 12h # How long cached chunks are stored
203254

204255
resultsCache:
205256
enabled: true
206257
replicas: 1
207-
maxItemMemory: 100 # MB - Increased from 10MB to handle large query results (can be much larger than chunks)
258+
maxItemMemory: 10 # MB
208259
defaultValidity: 12h # How long cached query results are stored
209260

210261
memcached:
211262
enabled: true
212-
maxItemMemory: 30 # MB - Shared default for general memcached instances
263+
maxItemMemory: 10 # MB - Shared default for general memcached instances
213264

214265
# Shared memcached configuration (used as defaults for all memcached instances)
215266
# These don't deploy separate instances - they configure shared settings
216267
memcachedResults:
217268
enabled: true
218-
maxItemMemory: 100 # MB - For query result caching
269+
maxItemMemory: 10 # MB - For query result caching
219270

220271
memcachedChunks:
221272
enabled: true
222-
maxItemMemory: 30 # MB - For chunk caching
273+
maxItemMemory: 10 # MB - For chunk caching
223274

224275
memcachedFrontend:
225276
enabled: true
226-
maxItemMemory: 100 # MB - Frontend cache can handle large query results
277+
maxItemMemory: 10 # MB - Frontend cache can handle large query results
227278

228279
memcachedIndexQueries:
229280
enabled: true
230-
maxItemMemory: 50 # MB - Index queries can be large
281+
maxItemMemory: 10 # MB - Index queries can be large
231282

232283
memcachedIndexWrites:
233284
enabled: true
234-
maxItemMemory: 30 # MB - Index write operations
285+
maxItemMemory: 10 # MB - Index write operations
235286

236287

237288

components/vector-kubearchive-log-collector/development/loki-helm-minio-values.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
---
22
# Enable minio for storage
3+
# Required for distributed mode - filesystem storage is not supported in distributed deployments
34
minio:
4-
enabled: false
5+
enabled: true
56
rootUser: loki
67
rootPassword: supersecret
78
mode: standalone

components/vector-kubearchive-log-collector/development/loki-secret.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,6 @@ type: Opaque
1212
stringData: # Using stringData for easier management
1313
USERNAME: admin
1414
PASSWORD: devpassword123 # notsecret only for dev
15+
AWS_ACCESS_KEY_ID: loki
16+
AWS_SECRET_ACCESS_KEY: supersecret
17+
AWS_DEFAULT_REGION: us-east-1

components/vector-kubearchive-log-collector/production/base/kustomization.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,14 @@ patches:
2020
target:
2121
kind: SecurityContextConstraints
2222
name: kubearchive-logging-scc
23+
# Patch all Loki PodDisruptionBudgets to allow eviction of unhealthy pods
24+
# This is critical for StatefulSets where stale pods should be evicted
25+
- patch: |
26+
- op: add
27+
path: /spec/unhealthyPodEvictionPolicy
28+
value: AlwaysAllow
29+
target:
30+
group: policy
31+
version: v1
32+
kind: PodDisruptionBudget
33+
labelSelector: app.kubernetes.io/name=loki

components/vector-kubearchive-log-collector/production/kflux-ocp-p01/loki-helm-prod-values.yaml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,32 @@ gateway:
1616
loki:
1717
commonConfig:
1818
replication_factor: 3
19+
memberlist:
20+
join_members: []
21+
# How long to wait before reclaiming a dead node's tokens
22+
# Set to 2 minutes (same value as the development config)
23+
# This helps remove stale ring instances quickly when pods are restarted
24+
dead_node_reclaim_time: 2m
25+
# How often to gossip with other nodes (lower = faster detection of failures)
26+
# Keep at 2s for quick failure detection
27+
gossip_interval: 2s
28+
# How often to do full state sync with other nodes
29+
# Set to 5s so state syncs faster (same value as the development config)
30+
push_pull_interval: 5s
31+
# Number of random nodes to gossip with per interval
32+
# NOTE(review): 1 was chosen for the single-replica development setup; confirm it suits replication_factor 3
33+
gossip_nodes: 1
34+
# How long to continue gossiping to dead nodes (helps propagate death info)
35+
# Set to 10s to propagate death info faster (same value as the development config)
36+
gossip_to_dead_nodes_time: 10s
37+
# How long to wait for an ingester to gracefully leave before considering it dead
38+
# This should be longer than terminationGracePeriodSeconds to allow graceful shutdown
39+
# Set to 60s for faster cleanup (same value as the development config)
40+
left_ingesters_timeout: 60s
41+
max_join_backoff: 1m
42+
max_join_retries: 10
43+
min_join_backoff: 1s
44+
rejoin_interval: 90s
1945
# Required storage configuration for Helm chart
2046
storage:
2147
type: s3
@@ -54,6 +80,7 @@ loki:
5480
log_stream_creation: false
5581
log_duplicate_stream_info: true
5682
ingester:
83+
autoforget_unhealthy: true
5784
chunk_target_size: 8388608 # 8MB
5885
chunk_idle_period: 5m
5986
max_chunk_age: 2h

components/vector-kubearchive-log-collector/production/kflux-osp-p01/loki-helm-prod-values.yaml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,32 @@ gateway:
1717
loki:
1818
commonConfig:
1919
replication_factor: 3
20+
memberlist:
21+
join_members: []
22+
# How long to wait before reclaiming a dead node's tokens
23+
# Set to 2 minutes (same value as the development config)
24+
# This helps remove stale ring instances quickly when pods are restarted
25+
dead_node_reclaim_time: 2m
26+
# How often to gossip with other nodes (lower = faster detection of failures)
27+
# Keep at 2s for quick failure detection
28+
gossip_interval: 2s
29+
# How often to do full state sync with other nodes
30+
# Set to 5s so state syncs faster (same value as the development config)
31+
push_pull_interval: 5s
32+
# Number of random nodes to gossip with per interval
33+
# NOTE(review): 1 was chosen for the single-replica development setup; confirm it suits replication_factor 3
34+
gossip_nodes: 1
35+
# How long to continue gossiping to dead nodes (helps propagate death info)
36+
# Set to 10s to propagate death info faster (same value as the development config)
37+
gossip_to_dead_nodes_time: 10s
38+
# How long to wait for an ingester to gracefully leave before considering it dead
39+
# This should be longer than terminationGracePeriodSeconds to allow graceful shutdown
40+
# Set to 60s for faster cleanup (same value as the development config)
41+
left_ingesters_timeout: 60s
42+
max_join_backoff: 1m
43+
max_join_retries: 10
44+
min_join_backoff: 1s
45+
rejoin_interval: 90s
2046
# Required storage configuration for Helm chart
2147
storage:
2248
type: s3
@@ -55,6 +81,7 @@ loki:
5581
log_stream_creation: false
5682
log_duplicate_stream_info: true
5783
ingester:
84+
autoforget_unhealthy: true
5885
chunk_target_size: 8388608 # 8MB
5986
chunk_idle_period: 5m
6087
max_chunk_age: 2h

components/vector-kubearchive-log-collector/production/kflux-prd-rh02/loki-helm-prod-values.yaml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,32 @@ gateway:
1717
loki:
1818
commonConfig:
1919
replication_factor: 3
20+
memberlist:
21+
join_members: []
22+
# How long to wait before reclaiming a dead node's tokens
23+
# Set to 2 minutes (same value as the development config)
24+
# This helps remove stale ring instances quickly when pods are restarted
25+
dead_node_reclaim_time: 2m
26+
# How often to gossip with other nodes (lower = faster detection of failures)
27+
# Keep at 2s for quick failure detection
28+
gossip_interval: 2s
29+
# How often to do full state sync with other nodes
30+
# Set to 5s so state syncs faster (same value as the development config)
31+
push_pull_interval: 5s
32+
# Number of random nodes to gossip with per interval
33+
# NOTE(review): 1 was chosen for the single-replica development setup; confirm it suits replication_factor 3
34+
gossip_nodes: 1
35+
# How long to continue gossiping to dead nodes (helps propagate death info)
36+
# Set to 10s to propagate death info faster (same value as the development config)
37+
gossip_to_dead_nodes_time: 10s
38+
# How long to wait for an ingester to gracefully leave before considering it dead
39+
# This should be longer than terminationGracePeriodSeconds to allow graceful shutdown
40+
# Set to 60s for faster cleanup (same value as the development config)
41+
left_ingesters_timeout: 60s
42+
max_join_backoff: 1m
43+
max_join_retries: 10
44+
min_join_backoff: 1s
45+
rejoin_interval: 90s
2046
# Required storage configuration for Helm chart
2147
storage:
2248
type: s3
@@ -55,6 +81,7 @@ loki:
5581
log_stream_creation: false
5682
log_duplicate_stream_info: true
5783
ingester:
84+
autoforget_unhealthy: true
5885
chunk_target_size: 8388608 # 8MB
5986
chunk_idle_period: 5m
6087
max_chunk_age: 2h

components/vector-kubearchive-log-collector/production/kflux-prd-rh03/loki-helm-prod-values.yaml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,32 @@ gateway:
1717
loki:
1818
commonConfig:
1919
replication_factor: 3
20+
memberlist:
21+
join_members: []
22+
# How long to wait before reclaiming a dead node's tokens
23+
# Set to 2 minutes (same value as the development config)
24+
# This helps remove stale ring instances quickly when pods are restarted
25+
dead_node_reclaim_time: 2m
26+
# How often to gossip with other nodes (lower = faster detection of failures)
27+
# Keep at 2s for quick failure detection
28+
gossip_interval: 2s
29+
# How often to do full state sync with other nodes
30+
# Set to 5s so state syncs faster (same value as the development config)
31+
push_pull_interval: 5s
32+
# Number of random nodes to gossip with per interval
33+
# NOTE(review): 1 was chosen for the single-replica development setup; confirm it suits replication_factor 3
34+
gossip_nodes: 1
35+
# How long to continue gossiping to dead nodes (helps propagate death info)
36+
# Set to 10s to propagate death info faster (same value as the development config)
37+
gossip_to_dead_nodes_time: 10s
38+
# How long to wait for an ingester to gracefully leave before considering it dead
39+
# This should be longer than terminationGracePeriodSeconds to allow graceful shutdown
40+
# Set to 60s for faster cleanup (same value as the development config)
41+
left_ingesters_timeout: 60s
42+
max_join_backoff: 1m
43+
max_join_retries: 10
44+
min_join_backoff: 1s
45+
rejoin_interval: 90s
2046
# Required storage configuration for Helm chart
2147
storage:
2248
type: s3
@@ -55,6 +81,7 @@ loki:
5581
log_stream_creation: false
5682
log_duplicate_stream_info: true
5783
ingester:
84+
autoforget_unhealthy: true
5885
chunk_target_size: 8388608 # 8MB
5986
chunk_idle_period: 5m
6087
max_chunk_age: 2h

components/vector-kubearchive-log-collector/production/kflux-rhel-p01/kustomization.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,16 @@ resources:
1717
generators:
1818
- vector-helm-generator.yaml
1919
- loki-helm-generator.yaml
20+
21+
patches:
22+
# Patch all Loki PodDisruptionBudgets to allow eviction of unhealthy pods
23+
# This is critical for StatefulSets where stale pods should be evicted
24+
- patch: |
25+
- op: add
26+
path: /spec/unhealthyPodEvictionPolicy
27+
value: AlwaysAllow
28+
target:
29+
group: policy
30+
version: v1
31+
kind: PodDisruptionBudget
32+
labelSelector: app.kubernetes.io/name=loki

0 commit comments

Comments
 (0)