Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
89c2ca8
chore: upgrade Go version from 1.23.8 to 1.24.4 (#108)
Copilot Jun 16, 2025
25d684b
chore: bump step-security/harden-runner from 2.12.0 to 2.12.1 (#105)
dependabot[bot] Jun 16, 2025
8c83fed
fix: update unexpected error and API Server log error types (#94)
britaniar Jun 17, 2025
03e265d
fix: fix updateRun handling unscheduled bindings (#95)
jwtty Jun 17, 2025
17cf340
test: fix the rollout integration tests flakiness (#110)
zhiying-lin Jun 17, 2025
23615ea
feat: bump version of Karpenter/Azure provider to unblock fix progres…
michaelawyu Jun 24, 2025
385c310
fix: add some delays when creating resourceSnapshot (#97)
zhiying-lin Jun 25, 2025
224f3c6
test: use clusterloader2 framework for large scale testing (#102)
jwtty Jun 26, 2025
dcd82cb
feat: Change AppliedWork to used foregroundDeletion (#60)
britaniar Jun 27, 2025
d83c930
feat: use bindingObj interface instead of ClusterResourceBinding in t…
ryanzhang-oss Jun 27, 2025
d437b2c
test: fix flaky capacity-based scheduling e2e (#121)
jwtty Jun 30, 2025
5d35be5
fix: address an issue where agent might panic if CSA cannot be used a…
michaelawyu Jul 1, 2025
4ff0b9a
feat: better handling of cost property calculation corner cases (#83)
michaelawyu Jul 1, 2025
1c794de
fix: update crp status if there is a delay in creating the snapshot (…
zhiying-lin Jul 2, 2025
8b60b4a
feat: allow backoff with a rate limiter in the work applier (#113)
michaelawyu Jul 2, 2025
104264c
Merge remote-tracking branch 'cncf/main' into backport
zhiying-lin Jul 2, 2025
825a798
update the crd-installer dockerfile
zhiying-lin Jul 2, 2025
723280e
Merge branch 'main' into backport
michaelawyu Jul 2, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

Large diffs are not rendered by default.

22 changes: 20 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ on:
paths-ignore: [docs/**, "**.md", "**.mdx", "**.png", "**.jpg"]

env:
GO_VERSION: '1.23.8'
GO_VERSION: '1.24.4'

jobs:
detect-noop:
Expand Down Expand Up @@ -86,6 +86,18 @@ jobs:
HUB_SERVER_URL: 'https://172.19.0.2:6443'

e2e-tests:
strategy:
fail-fast: false
matrix:
customized-settings: [default, custom]
include:
- customized-settings: default
# to shorten the test duration, set the resource snapshot creation interval to 0
resource-snapshot-creation-minimum-interval: 0m
resource-changes-collection-duration: 0m
- customized-settings: custom
resource-snapshot-creation-minimum-interval: 30s
resource-changes-collection-duration: 15s
runs-on: ubuntu-latest
needs: [
detect-noop,
Expand Down Expand Up @@ -119,7 +131,11 @@ jobs:

- name: Run e2e tests
run: |
make e2e-tests
if [ "${{ matrix.customized-settings }}" = "default" ]; then
make e2e-tests
else
make e2e-tests-custom
fi
env:
KUBECONFIG: '/home/runner/.kube/config'
HUB_SERVER_URL: 'https://172.19.0.2:6443'
Expand All @@ -129,4 +145,6 @@ jobs:
# TO-DO (chenyu1): to ensure a vendor-neutral experience, switch to a dummy
# property provider once the AKS one is split out.
PROPERTY_PROVIDER: 'azure'
RESOURCE_SNAPSHOT_CREATION_MINIMUM_INTERVAL: ${{ matrix.resource-snapshot-creation-minimum-interval }}
RESOURCE_CHANGES_COLLECTION_DURATION: ${{ matrix.resource-changes-collection-duration }}

2 changes: 1 addition & 1 deletion .github/workflows/code-lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ on:

env:
# Common versions
GO_VERSION: '1.23.8'
GO_VERSION: '1.24.4'

jobs:

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/codespell.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Harden Runner
uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
with:
egress-policy: audit

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/markdown-lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: gaurav-nelson/github-action-markdown-link-check@v1
- uses: tcort/github-action-markdown-link-check@v1
with:
# this will only show errors in the output
use-quiet-mode: 'yes'
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/trivy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ env:
MEMBER_AGENT_IMAGE_NAME: member-agent
REFRESH_TOKEN_IMAGE_NAME: refresh-token

GO_VERSION: '1.23.8'
GO_VERSION: '1.24.4'

jobs:
export-registry:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/upgrade.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ on:
paths-ignore: [docs/**, "**.md", "**.mdx", "**.png", "**.jpg"]

env:
GO_VERSION: '1.23.8'
GO_VERSION: '1.24.4'

jobs:
detect-noop:
Expand Down
2 changes: 1 addition & 1 deletion .golangci.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
run:
timeout: 15m
go: '1.23.8'
go: '1.24.4'

linters-settings:
stylecheck:
Expand Down
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,10 @@ e2e-tests-v1alpha1: create-kind-cluster run-e2e-v1alpha1

.PHONY: e2e-tests
e2e-tests: setup-clusters
cd ./test/e2e && ginkgo -v -p .
cd ./test/e2e && ginkgo --label-filter="!custom" -v -p .

e2e-tests-custom: setup-clusters
cd ./test/e2e && ginkgo --label-filter="custom" -v -p .

.PHONY: setup-clusters
setup-clusters:
Expand Down
5 changes: 5 additions & 0 deletions apis/placement/v1/resourcesnapshot_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ import (
// Each snapshot MUST have the following labels:
// - `CRPTrackingLabel` which points to its owner CRP.
// - `ResourceIndexLabel` which is the index of the snapshot group.
//
// The first snapshot of the index group MAY have the following labels:
// - `IsLatestSnapshotLabel` which indicates whether the snapshot is the latest one.
//
// All the snapshots within the same index group must have the same ResourceIndexLabel.
Expand All @@ -51,6 +53,9 @@ import (
//
// Each snapshot (excluding the first snapshot) MUST have the following annotations:
// - `SubindexOfResourceSnapshotAnnotation` to store the subindex of resource snapshot in the group.
//
// A snapshot MAY have the following annotation:
// - `NextResourceSnapshotCandidateDetectionTimeAnnotation` to store the time at which the controller detected the next resourceSnapshot candidate.
type ClusterResourceSnapshot struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
Expand Down
8 changes: 8 additions & 0 deletions apis/placement/v1beta1/resourcesnapshot_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ const (
// SubindexOfResourceSnapshotAnnotation is the annotation to store the subindex of resource snapshot in the group.
SubindexOfResourceSnapshotAnnotation = fleetPrefix + "subindex-of-resource-snapshot"

// NextResourceSnapshotCandidateDetectionTimeAnnotation is the annotation to store the time at which the controller detected the next resourceSnapshot candidate.
NextResourceSnapshotCandidateDetectionTimeAnnotation = fleetPrefix + "next-resource-snapshot-candidate-detection-time"

// ResourceSnapshotNameFmt is the resourceSnapshot name format: {CRPName}-{resourceIndex}-snapshot.
ResourceSnapshotNameFmt = "%s-%d-snapshot"

Expand Down Expand Up @@ -119,6 +122,9 @@ type ResourceSnapshotObjList interface {
//
// Each snapshot (excluding the first snapshot) MUST have the following annotations:
// - `SubindexOfResourceSnapshotAnnotation` to store the subindex of resource snapshot in the group.
//
// A snapshot MAY have the following annotation:
// - `NextResourceSnapshotCandidateDetectionTimeAnnotation` to store the time at which the controller detected the next resourceSnapshot candidate.
type ClusterResourceSnapshot struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
Expand Down Expand Up @@ -226,6 +232,8 @@ func (c *ClusterResourceSnapshotList) GetResourceSnapshotObjs() []ResourceSnapsh
// Each snapshot MUST have the following labels:
// - `CRPTrackingLabel` which points to its owner resource placement.
// - `ResourceIndexLabel` which is the index of the snapshot group.
//
// The first snapshot of the index group MAY have the following labels:
// - `IsLatestSnapshotLabel` which indicates whether the snapshot is the latest one.
//
// All the snapshots within the same index group must have the same ResourceIndexLabel.
Expand Down
44 changes: 23 additions & 21 deletions charts/hub-agent/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,24 +19,26 @@ _See [helm install](https://helm.sh/docs/helm/helm_install/) for command documen

## Parameters

| Parameter | Description | Default |
|:------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------|:-------------------------------------------------|
| replicaCount | The number of hub-agent replicas to deploy | `1` |
| image.repository | Image repository | `ghcr.io/azure/azure/fleet/hub-agent` |
| image.pullPolicy | Image pullPolicy | `Always` |
| image.tag | The image release tag to use | `v0.1.0` |
| namespace | Namespace that this Helm chart is installed on | `fleet-system` |
| serviceAccount.create | Whether to create service account | `true` |
| serviceAccount.name | Service account name | `hub-agent-sa` |
| resources | The resource request/limits for the container image | limits: 500m CPU, 1Gi, requests: 100m CPU, 128Mi |
| affinity | The node affinity to use for hubagent pod | `{}` |
| tolerations | The tolerations to use for hubagent pod | `[]` |
| logVerbosity | Log level. Uses V logs (klog) | `5` |
| enableV1Alpha1APIs | If set, the agents will watch for the v1alpha1 APIs. | `false` |
| enableV1Beta1APIs | If set, the agents will watch for the v1beta1 APIs. | `true` |
| hubAPIQPS | QPS to use while talking with fleet-apiserver. Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags. | `250` |
| hubAPIBurst | Burst to use while talking with fleet-apiserver. Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags. | `1000` |
| MaxConcurrentClusterPlacement | The max number of clusterResourcePlacement to run concurrently this fleet supports. | `100` |
| ConcurrentResourceChangeSyncs | The number of resourceChange reconcilers that are allowed to run concurrently. | `20` |
| logFileMaxSize | Max size of log file before rotation | `1000000` |
| MaxFleetSizeSupported | The max number of member clusters this fleet supports. | `100` |
| Parameter | Description | Default |
|:------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------|
| `replicaCount` | Number of hub-agent replicas to deploy | `1` |
| `image.repository` | Image repository | `ghcr.io/azure/azure/fleet/hub-agent` |
| `image.pullPolicy` | Image pull policy | `Always` |
| `image.tag` | Image release tag | `v0.1.0` |
| `namespace` | Namespace where this chart is installed | `fleet-system` |
| `serviceAccount.create` | Whether to create a service account | `true` |
| `serviceAccount.name` | Service account name | `hub-agent-sa` |
| `resources` | Resource requests/limits for the container | limits: 500m CPU, 1Gi; requests: 100m CPU, 128Mi |
| `affinity` | Node affinity for hub-agent pods | `{}` |
| `tolerations` | Tolerations for hub-agent pods | `[]` |
| `logVerbosity` | Log level (klog V logs) | `5` |
| `enableV1Alpha1APIs` | Watch for v1alpha1 APIs | `false` |
| `enableV1Beta1APIs` | Watch for v1beta1 APIs | `true` |
| `hubAPIQPS` | QPS for fleet-apiserver (not including events/node heartbeat) | `250` |
| `hubAPIBurst` | Burst for fleet-apiserver (not including events/node heartbeat) | `1000` |
| `MaxConcurrentClusterPlacement` | Max concurrent ClusterResourcePlacement operations | `100` |
| `ConcurrentResourceChangeSyncs` | Max concurrent resourceChange reconcilers | `20` |
| `logFileMaxSize` | Max log file size before rotation | `1000000` |
| `MaxFleetSizeSupported` | Max number of member clusters supported | `100` |
| `resourceSnapshotCreationMinimumInterval` | The minimum interval at which resource snapshots could be created. | `30s` |
| `resourceChangesCollectionDuration` | The duration for collecting resource changes into one snapshot. | `15s` |
2 changes: 2 additions & 0 deletions charts/hub-agent/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ spec:
- --hub-api-burst={{ .Values.hubAPIBurst }}
- --force-delete-wait-time={{ .Values.forceDeleteWaitTime }}
- --cluster-unhealthy-threshold={{ .Values.clusterUnhealthyThreshold }}
- --resource-snapshot-creation-minimum-interval={{ .Values.resourceSnapshotCreationMinimumInterval }}
- --resource-changes-collection-duration={{ .Values.resourceChangesCollectionDuration }}
ports:
- name: metrics
containerPort: 8080
Expand Down
3 changes: 3 additions & 0 deletions charts/hub-agent/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ enableGuardRail: true
webhookClientConnectionType: service
forceDeleteWaitTime: 15m0s
clusterUnhealthyThreshold: 3m0s
resourceSnapshotCreationMinimumInterval: 30s
resourceChangesCollectionDuration: 15s

namespace:
fleet-system

Expand Down
27 changes: 18 additions & 9 deletions cmd/hubagent/options/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,11 @@ type Options struct {
PprofPort int
// DenyModifyMemberClusterLabels indicates if the member cluster labels cannot be modified by groups (excluding system:masters)
DenyModifyMemberClusterLabels bool
// ResourceSnapshotCreationMinimumInterval is the minimum interval at which resource snapshots could be created.
// Whether the resource snapshot is created or not depends on both ResourceSnapshotCreationMinimumInterval and ResourceChangesCollectionDuration.
ResourceSnapshotCreationMinimumInterval time.Duration
// ResourceChangesCollectionDuration is the duration for collecting resource changes into one snapshot.
ResourceChangesCollectionDuration time.Duration
}

// NewOptions builds an empty options.
Expand All @@ -115,14 +120,16 @@ func NewOptions() *Options {
ResourceNamespace: utils.FleetSystemNamespace,
ResourceName: "136224848560.hub.fleet.azure.com",
},
MaxConcurrentClusterPlacement: 10,
ConcurrentResourceChangeSyncs: 1,
MaxFleetSizeSupported: 100,
EnableV1Alpha1APIs: false,
EnableClusterInventoryAPIs: true,
EnableStagedUpdateRunAPIs: true,
EnablePprof: false,
PprofPort: 6065,
MaxConcurrentClusterPlacement: 10,
ConcurrentResourceChangeSyncs: 1,
MaxFleetSizeSupported: 100,
EnableV1Alpha1APIs: false,
EnableClusterInventoryAPIs: true,
EnableStagedUpdateRunAPIs: true,
EnablePprof: false,
PprofPort: 6065,
ResourceSnapshotCreationMinimumInterval: 30 * time.Second,
ResourceChangesCollectionDuration: 15 * time.Second,
}
}

Expand Down Expand Up @@ -169,6 +176,8 @@ func (o *Options) AddFlags(flags *flag.FlagSet) {
flags.BoolVar(&o.EnablePprof, "enable-pprof", false, "If set, the pprof profiling is enabled.")
flags.IntVar(&o.PprofPort, "pprof-port", 6065, "The port for pprof profiling.")
flags.BoolVar(&o.DenyModifyMemberClusterLabels, "deny-modify-member-cluster-labels", false, "If set, users not in the system:masters cannot modify member cluster labels.")

flags.DurationVar(&o.ResourceSnapshotCreationMinimumInterval, "resource-snapshot-creation-minimum-interval", 30*time.Second, "The minimum interval at which resource snapshots could be created.")
flags.DurationVar(&o.ResourceChangesCollectionDuration, "resource-changes-collection-duration", 15*time.Second,
"The duration for collecting resource changes into one snapshot. The default is 15 seconds, which means that the controller will collect resource changes for 15 seconds before creating a resource snapshot.")
o.RateLimiterOpts.AddFlags(flags)
}
Loading
Loading