Skip to content

Commit 9a6fe2d

Browse files
Trigger full snapshot when spinning up a compaction job is unviable (#1130)
* Trigger a full snapshot when spinning up a compaction job is unviable * Add unit tests for mocking httpClient * Rename the CompactionSpec object to SnapshotCompactionSpec to be clearer about the type of compaction * Etcd status changes & integration test adaptation * Poll k8s resources to verify their state before proceeding with the next reconciliation * Initialize the logger with etcdName & etcdNamespace rather than passing them every time * Get the etcd full snapshot timeout from the backup spec
1 parent a87d6a4 commit 9a6fe2d

File tree

24 files changed

+1315
-368
lines changed

24 files changed

+1315
-368
lines changed

api/config/v1alpha1/defaults.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,8 @@ const (
129129
DefaultCompactionConcurrentSyncs = 3
130130
// DefaultCompactionEventsThreshold is the default number of events that triggers compaction.
131131
DefaultCompactionEventsThreshold = 1000000
132+
// DefaultCompactionTriggerFullSnapshotThreshold is the default upper threshold for the number of etcd events before giving up on the compaction job and triggering a full snapshot.
133+
DefaultCompactionTriggerFullSnapshotThreshold = 3000000
132134
// DefaultCompactionActiveDeadlineDuration is the default active deadline duration for compaction.
133135
DefaultCompactionActiveDeadlineDuration = 3 * time.Hour
134136
)
@@ -144,6 +146,9 @@ func SetDefaults_CompactionControllerConfiguration(compactionCtrlConfig *Compact
144146
if compactionCtrlConfig.EventsThreshold == 0 {
145147
compactionCtrlConfig.EventsThreshold = DefaultCompactionEventsThreshold
146148
}
149+
if compactionCtrlConfig.TriggerFullSnapshotThreshold == 0 {
150+
compactionCtrlConfig.TriggerFullSnapshotThreshold = DefaultCompactionTriggerFullSnapshotThreshold
151+
}
147152
if compactionCtrlConfig.ActiveDeadlineDuration == zeroDuration {
148153
compactionCtrlConfig.ActiveDeadlineDuration = metav1.Duration{Duration: DefaultCompactionActiveDeadlineDuration}
149154
}

api/config/v1alpha1/defaults_test.go

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -267,24 +267,27 @@ func TestSetDefaults_CompactionControllerConfiguration(t *testing.T) {
267267
Enabled: true,
268268
},
269269
expected: &CompactionControllerConfiguration{
270-
Enabled: true,
271-
ConcurrentSyncs: ptr.To(3),
272-
EventsThreshold: 1000000,
273-
ActiveDeadlineDuration: metav1.Duration{Duration: 3 * time.Hour},
270+
Enabled: true,
271+
ConcurrentSyncs: ptr.To(3),
272+
EventsThreshold: 1000000,
273+
TriggerFullSnapshotThreshold: 3000000,
274+
ActiveDeadlineDuration: metav1.Duration{Duration: 3 * time.Hour},
274275
},
275276
},
276277
{
277278
name: "should not overwrite already set values",
278279
config: &CompactionControllerConfiguration{
279-
Enabled: true,
280-
ConcurrentSyncs: ptr.To(5),
281-
ActiveDeadlineDuration: metav1.Duration{Duration: 1 * time.Hour},
280+
Enabled: true,
281+
ConcurrentSyncs: ptr.To(5),
282+
TriggerFullSnapshotThreshold: 2000000,
283+
ActiveDeadlineDuration: metav1.Duration{Duration: 1 * time.Hour},
282284
},
283285
expected: &CompactionControllerConfiguration{
284-
Enabled: true,
285-
ConcurrentSyncs: ptr.To(5),
286-
EventsThreshold: 1000000,
287-
ActiveDeadlineDuration: metav1.Duration{Duration: 1 * time.Hour},
286+
Enabled: true,
287+
ConcurrentSyncs: ptr.To(5),
288+
EventsThreshold: 1000000,
289+
TriggerFullSnapshotThreshold: 2000000,
290+
ActiveDeadlineDuration: metav1.Duration{Duration: 1 * time.Hour},
288291
},
289292
},
290293
}

api/config/v1alpha1/types.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,8 @@ type CompactionControllerConfiguration struct {
166166
ConcurrentSyncs *int `json:"concurrentSyncs,omitempty"`
167167
// EventsThreshold denotes total number of etcd events to be reached upon which a backup compaction job is triggered.
168168
EventsThreshold int64 `json:"eventsThreshold"`
169+
// TriggerFullSnapshotThreshold denotes the upper threshold for the number of etcd events before giving up on the compaction job and triggering a full snapshot.
170+
TriggerFullSnapshotThreshold int64 `json:"triggerFullSnapshotThreshold"`
169171
// ActiveDeadlineDuration is the duration after which a running compaction job will be killed.
170172
ActiveDeadlineDuration metav1.Duration `json:"activeDeadlineDuration"`
171173
// MetricsScrapeWaitDuration is the duration to wait for after compaction job is completed, to allow Prometheus metrics to be scraped

api/config/v1alpha1/validation/validation.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,9 @@ func validateCompactionControllerConfiguration(compactionControllerConfig druidc
9797
if compactionControllerConfig.EventsThreshold <= 0 {
9898
allErrs = append(allErrs, field.Invalid(fldPath.Child("eventsThreshold"), compactionControllerConfig.EventsThreshold, "must be greater than 0"))
9999
}
100+
if compactionControllerConfig.TriggerFullSnapshotThreshold <= 0 {
101+
allErrs = append(allErrs, field.Invalid(fldPath.Child("triggerFullSnapshotThreshold"), compactionControllerConfig.TriggerFullSnapshotThreshold, "must be greater than 0"))
102+
}
100103
return allErrs
101104
}
102105

api/config/v1alpha1/validation/validation_test.go

Lines changed: 40 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -313,14 +313,15 @@ func TestValidateEtcdControllerConfiguration(t *testing.T) {
313313

314314
func TestValidateCompactionControllerConfiguration(t *testing.T) {
315315
tests := []struct {
316-
name string
317-
enabled bool
318-
concurrentSync *int
319-
eventThreshold *int64
320-
activeDeadlineDuration *metav1.Duration
321-
metricsScrapeWaitDuration *metav1.Duration
322-
expectedErrors int
323-
matcher gomegatypes.GomegaMatcher
316+
name string
317+
enabled bool
318+
concurrentSync *int
319+
eventsThreshold *int64
320+
triggerFullSnapshotThreshold *int64
321+
activeDeadlineDuration *metav1.Duration
322+
metricsScrapeWaitDuration *metav1.Duration
323+
expectedErrors int
324+
matcher gomegatypes.GomegaMatcher
324325
}{
325326
{
326327
name: "should allow default compaction controller configuration when it is enabled",
@@ -355,12 +356,32 @@ func TestValidateCompactionControllerConfiguration(t *testing.T) {
355356
matcher: ConsistOf(PointTo(MatchFields(IgnoreExtras, Fields{"Type": Equal(field.ErrorTypeInvalid), "Field": Equal("controllers.compaction.concurrentSyncs")}))),
356357
},
357358
{
358-
name: "should forbid event threshold less than zero",
359-
enabled: true,
360-
eventThreshold: ptr.To(int64(-1)),
361-
expectedErrors: 1,
362-
363-
matcher: ConsistOf(PointTo(MatchFields(IgnoreExtras, Fields{"Type": Equal(field.ErrorTypeInvalid), "Field": Equal("controllers.compaction.eventsThreshold")}))),
359+
name: "should forbid events threshold equal to zero",
360+
enabled: true,
361+
eventsThreshold: ptr.To(int64(0)),
362+
expectedErrors: 1,
363+
matcher: ConsistOf(PointTo(MatchFields(IgnoreExtras, Fields{"Type": Equal(field.ErrorTypeInvalid), "Field": Equal("controllers.compaction.eventsThreshold")}))),
364+
},
365+
{
366+
name: "should forbid events threshold less than zero",
367+
enabled: true,
368+
eventsThreshold: ptr.To(int64(-1)),
369+
expectedErrors: 1,
370+
matcher: ConsistOf(PointTo(MatchFields(IgnoreExtras, Fields{"Type": Equal(field.ErrorTypeInvalid), "Field": Equal("controllers.compaction.eventsThreshold")}))),
371+
},
372+
{
373+
name: "should forbid trigger full snapshot threshold equal to zero",
374+
enabled: true,
375+
triggerFullSnapshotThreshold: ptr.To(int64(0)),
376+
expectedErrors: 1,
377+
matcher: ConsistOf(PointTo(MatchFields(IgnoreExtras, Fields{"Type": Equal(field.ErrorTypeInvalid), "Field": Equal("controllers.compaction.triggerFullSnapshotThreshold")}))),
378+
},
379+
{
380+
name: "should forbid trigger full snapshot threshold less than zero",
381+
enabled: true,
382+
triggerFullSnapshotThreshold: ptr.To(int64(-1)),
383+
expectedErrors: 1,
384+
matcher: ConsistOf(PointTo(MatchFields(IgnoreExtras, Fields{"Type": Equal(field.ErrorTypeInvalid), "Field": Equal("controllers.compaction.triggerFullSnapshotThreshold")}))),
364385
},
365386
{
366387
name: "should forbid active deadline duration less than zero",
@@ -390,8 +411,11 @@ func TestValidateCompactionControllerConfiguration(t *testing.T) {
390411
if test.concurrentSync != nil {
391412
controllerConfig.ConcurrentSyncs = test.concurrentSync
392413
}
393-
if test.eventThreshold != nil {
394-
controllerConfig.EventsThreshold = *test.eventThreshold
414+
if test.eventsThreshold != nil {
415+
controllerConfig.EventsThreshold = *test.eventsThreshold
416+
}
417+
if test.triggerFullSnapshotThreshold != nil {
418+
controllerConfig.TriggerFullSnapshotThreshold = *test.triggerFullSnapshotThreshold
395419
}
396420
if test.activeDeadlineDuration != nil {
397421
controllerConfig.ActiveDeadlineDuration = *test.activeDeadlineDuration

api/core/v1alpha1/constants.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@
44

55
package v1alpha1
66

7+
import (
8+
batchv1 "k8s.io/api/batch/v1"
9+
v1 "k8s.io/api/core/v1"
10+
)
11+
712
// Common label keys to be placed on all druid-managed resources
813
const (
914
// LabelAppNameKey is a label which sets the name of the resource provisioned for an etcd cluster.
@@ -38,3 +43,32 @@ const (
3843
// runtime components of the etcd cluster such as pods, PVCs, leases, RBAC resources, PDBs, services, etc.
3944
DisableEtcdRuntimeComponentCreationAnnotation = "druid.gardener.cloud/disable-etcd-runtime-component-creation"
4045
)
46+
47+
// Compaction Job/Pod reasons that are used to set the reason for a pod condition in the status of an Etcd resource.
48+
const (
49+
// PodFailureReasonPreemptionByScheduler is a reason for a pod failure that indicates that the pod was preempted by the scheduler.
50+
PodFailureReasonPreemptionByScheduler = v1.PodReasonPreemptionByScheduler
51+
// PodFailureReasonDeletionByTaintManager is a reason for a pod failure that indicates that the pod was deleted by the taint manager.
52+
PodFailureReasonDeletionByTaintManager = "DeletionByTaintManager"
53+
// PodFailureReasonEvictionByEvictionAPI is a reason for a pod failure that indicates that the pod was evicted by the eviction API.
54+
PodFailureReasonEvictionByEvictionAPI = "EvictionByEvictionAPI"
55+
// PodFailureReasonTerminationByKubelet is a reason for a pod failure that indicates that the pod was terminated by the kubelet.
56+
PodFailureReasonTerminationByKubelet = v1.PodReasonTerminationByKubelet
57+
// PodFailureReasonProcessFailure is a reason for a pod failure that indicates that the pod process failed.
58+
PodFailureReasonProcessFailure = "ProcessFailure"
59+
// PodFailureReasonUnknown is a reason for a pod failure that indicates that the reason for the pod failure is unknown.
60+
PodFailureReasonUnknown = "Unknown"
61+
62+
// PodSuccessReasonNone is a reason for a pod success that indicates that the pod has not failed.
63+
PodSuccessReasonNone = "None"
64+
65+
// JobFailureReasonDeadlineExceeded is a reason for a job failure that indicates that the job has exceeded its deadline.
66+
JobFailureReasonDeadlineExceeded = batchv1.JobReasonDeadlineExceeded
67+
// JobFailureReasonBackoffLimitExceeded is a reason for a job failure that indicates that the job has exceeded its backoff limit.
68+
JobFailureReasonBackoffLimitExceeded = batchv1.JobReasonBackoffLimitExceeded
69+
70+
// FullSnapshotSuccessReason is the reason for a successful full snapshot.
71+
FullSnapshotSuccessReason string = "FullSnapshotTakenSuccessfully"
72+
// FullSnapshotFailureReason is the reason for a failed full snapshot.
73+
FullSnapshotFailureReason string = "ErrorTriggeringFullSnapshot"
74+
)

api/core/v1alpha1/crds/druid.gardener.cloud_etcds.yaml

Lines changed: 77 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -75,67 +75,6 @@ spec:
7575
description: BackupSpec defines parameters associated with the full
7676
and delta snapshots of etcd.
7777
properties:
78-
compactionResources:
79-
description: |-
80-
CompactionResources defines compute Resources required by compaction job.
81-
More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/
82-
properties:
83-
claims:
84-
description: |-
85-
Claims lists the names of resources, defined in spec.resourceClaims,
86-
that are used by this container.
87-
88-
This is an alpha field and requires enabling the
89-
DynamicResourceAllocation feature gate.
90-
91-
This field is immutable. It can only be set for containers.
92-
items:
93-
description: ResourceClaim references one entry in PodSpec.ResourceClaims.
94-
properties:
95-
name:
96-
description: |-
97-
Name must match the name of one entry in pod.spec.resourceClaims of
98-
the Pod where this field is used. It makes that resource available
99-
inside a container.
100-
type: string
101-
request:
102-
description: |-
103-
Request is the name chosen for a request in the referenced claim.
104-
If empty, everything from the claim is made available, otherwise
105-
only the result of this request.
106-
type: string
107-
required:
108-
- name
109-
type: object
110-
type: array
111-
x-kubernetes-list-map-keys:
112-
- name
113-
x-kubernetes-list-type: map
114-
limits:
115-
additionalProperties:
116-
anyOf:
117-
- type: integer
118-
- type: string
119-
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
120-
x-kubernetes-int-or-string: true
121-
description: |-
122-
Limits describes the maximum amount of compute resources allowed.
123-
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
124-
type: object
125-
requests:
126-
additionalProperties:
127-
anyOf:
128-
- type: integer
129-
- type: string
130-
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
131-
x-kubernetes-int-or-string: true
132-
description: |-
133-
Requests describes the minimum amount of compute resources required.
134-
If Requests is omitted for a container, it defaults to Limits if that is explicitly specified,
135-
otherwise to an implementation-defined value. Requests cannot exceed Limits.
136-
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
137-
type: object
138-
type: object
13978
compression:
14079
description: SnapshotCompression defines the specification for
14180
compression of Snapshots.
@@ -286,6 +225,83 @@ spec:
286225
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
287226
type: object
288227
type: object
228+
snapshotCompaction:
229+
description: SnapshotCompaction defines the specification for
230+
compaction of backups.
231+
properties:
232+
eventsThreshold:
233+
description: EventsThreshold defines the threshold for the
234+
number of etcd events before triggering a compaction job
235+
format: int64
236+
type: integer
237+
resources:
238+
description: |-
239+
Resources defines compute Resources required by compaction job.
240+
More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/
241+
properties:
242+
claims:
243+
description: |-
244+
Claims lists the names of resources, defined in spec.resourceClaims,
245+
that are used by this container.
246+
247+
This is an alpha field and requires enabling the
248+
DynamicResourceAllocation feature gate.
249+
250+
This field is immutable. It can only be set for containers.
251+
items:
252+
description: ResourceClaim references one entry in PodSpec.ResourceClaims.
253+
properties:
254+
name:
255+
description: |-
256+
Name must match the name of one entry in pod.spec.resourceClaims of
257+
the Pod where this field is used. It makes that resource available
258+
inside a container.
259+
type: string
260+
request:
261+
description: |-
262+
Request is the name chosen for a request in the referenced claim.
263+
If empty, everything from the claim is made available, otherwise
264+
only the result of this request.
265+
type: string
266+
required:
267+
- name
268+
type: object
269+
type: array
270+
x-kubernetes-list-map-keys:
271+
- name
272+
x-kubernetes-list-type: map
273+
limits:
274+
additionalProperties:
275+
anyOf:
276+
- type: integer
277+
- type: string
278+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
279+
x-kubernetes-int-or-string: true
280+
description: |-
281+
Limits describes the maximum amount of compute resources allowed.
282+
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
283+
type: object
284+
requests:
285+
additionalProperties:
286+
anyOf:
287+
- type: integer
288+
- type: string
289+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
290+
x-kubernetes-int-or-string: true
291+
description: |-
292+
Requests describes the minimum amount of compute resources required.
293+
If Requests is omitted for a container, it defaults to Limits if that is explicitly specified,
294+
otherwise to an implementation-defined value. Requests cannot exceed Limits.
295+
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
296+
type: object
297+
type: object
298+
triggerFullSnapshotThreshold:
299+
description: TriggerFullSnapshotThreshold defines the upper
300+
threshold for the number of etcd events before giving up
301+
on compaction job and triggering a full snapshot.
302+
format: int64
303+
type: integer
304+
type: object
289305
store:
290306
description: Store defines the specification of object store provider
291307
for storing backups.

0 commit comments

Comments
 (0)