Skip to content

Commit 1c794de

Browse files
authored
fix: update crp status if there is a delay in creating the snapshot (#116)
Signed-off-by: Zhiying Lin <[email protected]>
1 parent 4ff0b9a commit 1c794de

File tree

21 files changed

+1091
-139
lines changed

21 files changed

+1091
-139
lines changed

.github/workflows/ci.yml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,9 +93,11 @@ jobs:
9393
include:
9494
- customized-settings: default
9595
# to shorten the test duration, set both the resource snapshot creation minimum interval and the resource changes collection duration to 0
96-
resource-snapshot-creation-interval: 0m
96+
resource-snapshot-creation-minimum-interval: 0m
97+
resource-changes-collection-duration: 0m
9798
- customized-settings: custom
98-
resource-snapshot-creation-interval: 1m
99+
resource-snapshot-creation-minimum-interval: 30s
100+
resource-changes-collection-duration: 15s
99101
runs-on: ubuntu-latest
100102
needs: [
101103
detect-noop,
@@ -143,5 +145,6 @@ jobs:
143145
# TO-DO (chenyu1): to ensure a vendor-neutral experience, switch to a dummy
144146
# property provider once the AKS one is split out.
145147
PROPERTY_PROVIDER: 'azure'
146-
RESOURCE_SNAPSHOT_CREATION_INTERVAL: ${{ matrix.resource-snapshot-creation-interval }}
148+
RESOURCE_SNAPSHOT_CREATION_MINIMUM_INTERVAL: ${{ matrix.resource-snapshot-creation-minimum-interval }}
149+
RESOURCE_CHANGES_COLLECTION_DURATION: ${{ matrix.resource-changes-collection-duration }}
147150

apis/placement/v1/resourcesnapshot_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ import (
4141
// Each snapshot MUST have the following labels:
4242
// - `CRPTrackingLabel` which points to its owner CRP.
4343
// - `ResourceIndexLabel` which is the index of the snapshot group.
44+
//
45+
// The first snapshot of the index group MAY have the following labels:
4446
// - `IsLatestSnapshotLabel` which indicates whether the snapshot is the latest one.
4547
//
4648
// All the snapshots within the same index group must have the same ResourceIndexLabel.
@@ -51,6 +53,9 @@ import (
5153
//
5254
// Each snapshot (excluding the first snapshot) MUST have the following annotations:
5355
// - `SubindexOfResourceSnapshotAnnotation` to store the subindex of resource snapshot in the group.
56+
//
57+
// A snapshot MAY have the following annotation to record the time at which the controller detected the next resourceSnapshot candidate:
58+
// - `NextResourceSnapshotCandidateDetectionTimeAnnotation` to store the time at which the controller detected the next resourceSnapshot candidate.
5459
type ClusterResourceSnapshot struct {
5560
metav1.TypeMeta `json:",inline"`
5661
metav1.ObjectMeta `json:"metadata,omitempty"`

apis/placement/v1beta1/resourcesnapshot_types.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ const (
4040
// SubindexOfResourceSnapshotAnnotation is the annotation to store the subindex of resource snapshot in the group.
4141
SubindexOfResourceSnapshotAnnotation = fleetPrefix + "subindex-of-resource-snapshot"
4242

43+
// NextResourceSnapshotCandidateDetectionTimeAnnotation is the annotation to store the time at which the controller detected the next resourceSnapshot candidate.
44+
NextResourceSnapshotCandidateDetectionTimeAnnotation = fleetPrefix + "next-resource-snapshot-candidate-detection-time"
45+
4346
// ResourceSnapshotNameFmt is the resource snapshot name format: {CRPName}-{resourceIndex}-snapshot.
4447
ResourceSnapshotNameFmt = "%s-%d-snapshot"
4548

@@ -119,6 +122,9 @@ type ResourceSnapshotObjList interface {
119122
//
120123
// Each snapshot (excluding the first snapshot) MUST have the following annotations:
121124
// - `SubindexOfResourceSnapshotAnnotation` to store the subindex of resource snapshot in the group.
125+
//
126+
// A snapshot MAY have the following annotation to record the time at which the controller detected the next resourceSnapshot candidate:
127+
// - `NextResourceSnapshotCandidateDetectionTimeAnnotation` to store the time at which the controller detected the next resourceSnapshot candidate.
122128
type ClusterResourceSnapshot struct {
123129
metav1.TypeMeta `json:",inline"`
124130
metav1.ObjectMeta `json:"metadata,omitempty"`
@@ -226,6 +232,8 @@ func (c *ClusterResourceSnapshotList) GetResourceSnapshotObjs() []ResourceSnapsh
226232
// Each snapshot MUST have the following labels:
227233
// - `CRPTrackingLabel` which points to its owner resource placement.
228234
// - `ResourceIndexLabel` which is the index of the snapshot group.
235+
//
236+
// The first snapshot of the index group MAY have the following labels:
229237
// - `IsLatestSnapshotLabel` which indicates whether the snapshot is the latest one.
230238
//
231239
// All the snapshots within the same index group must have the same ResourceIndexLabel.

charts/hub-agent/README.md

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -19,25 +19,26 @@ _See [helm install](https://helm.sh/docs/helm/helm_install/) for command documen
1919

2020
## Parameters
2121

22-
| Parameter | Description | Default |
23-
|:-----------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------|:-------------------------------------------------|
24-
| replicaCount | The number of hub-agent replicas to deploy | `1` |
25-
| image.repository | Image repository | `ghcr.io/azure/azure/fleet/hub-agent` |
26-
| image.pullPolicy | Image pullPolicy | `Always` |
27-
| image.tag | The image release tag to use | `v0.1.0` |
28-
| namespace | Namespace that this Helm chart is installed on | `fleet-system` |
29-
| serviceAccount.create | Whether to create service account | `true` |
30-
| serviceAccount.name | Service account name | `hub-agent-sa` |
31-
| resources | The resource request/limits for the container image | limits: 500m CPU, 1Gi, requests: 100m CPU, 128Mi |
32-
| affinity | The node affinity to use for hubagent pod | `{}` |
33-
| tolerations | The tolerations to use for hubagent pod | `[]` |
34-
| logVerbosity | Log level. Uses V logs (klog) | `5` |
35-
| enableV1Alpha1APIs | If set, the agents will watch for the v1alpha1 APIs. | `false` |
36-
| enableV1Beta1APIs | If set, the agents will watch for the v1beta1 APIs. | `true` |
37-
| hubAPIQPS | QPS to use while talking with fleet-apiserver. Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags. | `250` |
38-
| hubAPIBurst | Burst to use while talking with fleet-apiserver. Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags. | `1000` |
39-
| MaxConcurrentClusterPlacement | The max number of clusterResourcePlacement to run concurrently this fleet supports. | `100` |
40-
| ConcurrentResourceChangeSyncs | The number of resourceChange reconcilers that are allowed to run concurrently. | `20` |
41-
| logFileMaxSize | Max size of log file before rotation | `1000000` |
42-
| MaxFleetSizeSupported | The max number of member clusters this fleet supports. | `100` |
43-
| resourceSnapshotCreationInterval | The interval at which resource snapshots are created. | `1m` |
22+
| Parameter | Description | Default |
23+
|:------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------|
24+
| `replicaCount` | Number of hub-agent replicas to deploy | `1` |
25+
| `image.repository` | Image repository | `ghcr.io/azure/azure/fleet/hub-agent` |
26+
| `image.pullPolicy` | Image pull policy | `Always` |
27+
| `image.tag` | Image release tag | `v0.1.0` |
28+
| `namespace` | Namespace where this chart is installed | `fleet-system` |
29+
| `serviceAccount.create` | Whether to create a service account | `true` |
30+
| `serviceAccount.name` | Service account name | `hub-agent-sa` |
31+
| `resources` | Resource requests/limits for the container | limits: 500m CPU, 1Gi; requests: 100m CPU, 128Mi |
32+
| `affinity` | Node affinity for hub-agent pods | `{}` |
33+
| `tolerations` | Tolerations for hub-agent pods | `[]` |
34+
| `logVerbosity` | Log level (klog V logs) | `5` |
35+
| `enableV1Alpha1APIs` | Watch for v1alpha1 APIs | `false` |
36+
| `enableV1Beta1APIs` | Watch for v1beta1 APIs | `true` |
37+
| `hubAPIQPS` | QPS for fleet-apiserver (not including events/node heartbeat) | `250` |
38+
| `hubAPIBurst` | Burst for fleet-apiserver (not including events/node heartbeat) | `1000` |
39+
| `MaxConcurrentClusterPlacement` | Max concurrent ClusterResourcePlacement operations | `100` |
40+
| `ConcurrentResourceChangeSyncs` | Max concurrent resourceChange reconcilers | `20` |
41+
| `logFileMaxSize` | Max log file size before rotation | `1000000` |
42+
| `MaxFleetSizeSupported` | Max number of member clusters supported | `100` |
43+
| `resourceSnapshotCreationMinimumInterval` | The minimum interval at which resource snapshots could be created. | `30s` |
44+
| `resourceChangesCollectionDuration` | The duration for collecting resource changes into one snapshot. | `15s` |

charts/hub-agent/templates/deployment.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ spec:
4343
- --hub-api-burst={{ .Values.hubAPIBurst }}
4444
- --force-delete-wait-time={{ .Values.forceDeleteWaitTime }}
4545
- --cluster-unhealthy-threshold={{ .Values.clusterUnhealthyThreshold }}
46-
- --resource-snapshot-creation-interval={{ .Values.resourceSnapshotCreationInterval }}
46+
- --resource-snapshot-creation-minimum-interval={{ .Values.resourceSnapshotCreationMinimumInterval }}
47+
- --resource-changes-collection-duration={{ .Values.resourceChangesCollectionDuration }}
4748
ports:
4849
- name: metrics
4950
containerPort: 8080

charts/hub-agent/values.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ enableGuardRail: true
1818
webhookClientConnectionType: service
1919
forceDeleteWaitTime: 15m0s
2020
clusterUnhealthyThreshold: 3m0s
21-
resourceSnapshotCreationInterval: 1m0s
21+
resourceSnapshotCreationMinimumInterval: 30s
22+
resourceChangesCollectionDuration: 15s
2223

2324
namespace:
2425
fleet-system

cmd/hubagent/options/options.go

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,11 @@ type Options struct {
104104
PprofPort int
105105
// DenyModifyMemberClusterLabels indicates if the member cluster labels cannot be modified by groups (excluding system:masters)
106106
DenyModifyMemberClusterLabels bool
107-
// ResourceSnapshotCreationInterval is the interval at which resource snapshots are created.
108-
ResourceSnapshotCreationInterval time.Duration
107+
// ResourceSnapshotCreationMinimumInterval is the minimum interval at which resource snapshots could be created.
108+
// Whether the resource snapshot is created or not depends on both ResourceSnapshotCreationMinimumInterval and ResourceChangesCollectionDuration.
109+
ResourceSnapshotCreationMinimumInterval time.Duration
110+
// ResourceChangesCollectionDuration is the duration for collecting resource changes into one snapshot.
111+
ResourceChangesCollectionDuration time.Duration
109112
}
110113

111114
// NewOptions builds an empty options.
@@ -117,15 +120,16 @@ func NewOptions() *Options {
117120
ResourceNamespace: utils.FleetSystemNamespace,
118121
ResourceName: "136224848560.hub.fleet.azure.com",
119122
},
120-
MaxConcurrentClusterPlacement: 10,
121-
ConcurrentResourceChangeSyncs: 1,
122-
MaxFleetSizeSupported: 100,
123-
EnableV1Alpha1APIs: false,
124-
EnableClusterInventoryAPIs: true,
125-
EnableStagedUpdateRunAPIs: true,
126-
EnablePprof: false,
127-
PprofPort: 6065,
128-
ResourceSnapshotCreationInterval: 1 * time.Minute,
123+
MaxConcurrentClusterPlacement: 10,
124+
ConcurrentResourceChangeSyncs: 1,
125+
MaxFleetSizeSupported: 100,
126+
EnableV1Alpha1APIs: false,
127+
EnableClusterInventoryAPIs: true,
128+
EnableStagedUpdateRunAPIs: true,
129+
EnablePprof: false,
130+
PprofPort: 6065,
131+
ResourceSnapshotCreationMinimumInterval: 30 * time.Second,
132+
ResourceChangesCollectionDuration: 15 * time.Second,
129133
}
130134
}
131135

@@ -172,7 +176,8 @@ func (o *Options) AddFlags(flags *flag.FlagSet) {
172176
flags.BoolVar(&o.EnablePprof, "enable-pprof", false, "If set, the pprof profiling is enabled.")
173177
flags.IntVar(&o.PprofPort, "pprof-port", 6065, "The port for pprof profiling.")
174178
flags.BoolVar(&o.DenyModifyMemberClusterLabels, "deny-modify-member-cluster-labels", false, "If set, users not in the system:masters cannot modify member cluster labels.")
175-
flags.DurationVar(&o.ResourceSnapshotCreationInterval, "resource-snapshot-creation-interval", 1*time.Minute, "The interval at which resource snapshots are created.")
176-
179+
flags.DurationVar(&o.ResourceSnapshotCreationMinimumInterval, "resource-snapshot-creation-minimum-interval", 30*time.Second, "The minimum interval at which resource snapshots could be created.")
180+
flags.DurationVar(&o.ResourceChangesCollectionDuration, "resource-changes-collection-duration", 15*time.Second,
181+
"The duration for collecting resource changes into one snapshot. The default is 15 seconds, which means that the controller will collect resource changes for 15 seconds before creating a resource snapshot.")
177182
o.RateLimiterOpts.AddFlags(flags)
178183
}

cmd/hubagent/workload/setup.go

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -153,15 +153,16 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager,
153153

154154
// Set up a custom controller to reconcile cluster resource placement
155155
crpc := &clusterresourceplacement.Reconciler{
156-
Client: mgr.GetClient(),
157-
Recorder: mgr.GetEventRecorderFor(crpControllerName),
158-
RestMapper: mgr.GetRESTMapper(),
159-
InformerManager: dynamicInformerManager,
160-
ResourceConfig: resourceConfig,
161-
SkippedNamespaces: skippedNamespaces,
162-
Scheme: mgr.GetScheme(),
163-
UncachedReader: mgr.GetAPIReader(),
164-
ResourceSnapshotCreationInterval: opts.ResourceSnapshotCreationInterval,
156+
Client: mgr.GetClient(),
157+
Recorder: mgr.GetEventRecorderFor(crpControllerName),
158+
RestMapper: mgr.GetRESTMapper(),
159+
InformerManager: dynamicInformerManager,
160+
ResourceConfig: resourceConfig,
161+
SkippedNamespaces: skippedNamespaces,
162+
Scheme: mgr.GetScheme(),
163+
UncachedReader: mgr.GetAPIReader(),
164+
ResourceSnapshotCreationMinimumInterval: opts.ResourceSnapshotCreationMinimumInterval,
165+
ResourceChangesCollectionDuration: opts.ResourceChangesCollectionDuration,
165166
}
166167

167168
rateLimiter := options.DefaultControllerRateLimiter(opts.RateLimiterOpts)

0 commit comments

Comments
 (0)