Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .codacy.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
exclude_paths:
- "**/zz_generated.deepcopy.go"
- "test/utils/**"
- "**/*.md"
4 changes: 4 additions & 0 deletions .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,10 @@ jobs:
run: |
python -m mkdocs build

- name: 📝 Copy Artifact Hub metadata
run: |
cp artifacthub-repo.yml ./site/

- name: 🚀 Deploy documentation
uses: peaceiris/actions-gh-pages@v4
with:
Expand Down
7 changes: 7 additions & 0 deletions artifacthub-repo.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Artifact Hub repository metadata file
# This file enables Verified Publisher status and ownership claim

repositoryID: 8e1a59f6-36d5-4e2a-805e-aeb1c38784c0
owners:
- name: Abdelrhman Hamouda
email: abdelrhman.ahmed@outlook.com
34 changes: 31 additions & 3 deletions charts/locust-k8s-operator/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,19 @@
apiVersion: v2
name: locust-k8s-operator
description: Locust Kubernetes Operator - Go v2
description: Production-ready Kubernetes operator for distributed Locust load testing with native OpenTelemetry, HA support, and automatic v1→v2 migration
icon: https://raw.githubusercontent.com/AbdelrhmanHamouda/locust-k8s-operator/master/docs/assets/images/favicon.png

# Keywords for Artifact Hub search discoverability
keywords:
- locust
- load-testing
- performance-testing
- kubernetes-operator
- distributed-testing
- opentelemetry
- observability
- stress-testing

# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
Expand All @@ -16,7 +27,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 2.0.0
version: 2.0.1

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
Expand All @@ -29,10 +40,27 @@ home: https://github.com/AbdelrhmanHamouda/locust-k8s-operator
sources:
- https://github.com/AbdelrhmanHamouda/locust-k8s-operator
maintainers:
- name: AbdelrhmanHamouda
- name: Abdelrhman Hamouda
url: https://github.com/AbdelrhmanHamouda

annotations:
# Artifact Hub specific metadata
artifacthub.io/license: Apache-2.0
artifacthub.io/operator: "true"
artifacthub.io/operatorCapabilities: Basic Install
artifacthub.io/prerelease: "false"
artifacthub.io/containsSecurityUpdates: "false"
artifacthub.io/links: |
- name: Documentation
url: https://abdelrhmanhamouda.github.io/locust-k8s-operator/
- name: Getting Started
url: https://abdelrhmanhamouda.github.io/locust-k8s-operator/getting_started/
- name: Migration Guide (v1→v2)
url: https://abdelrhmanhamouda.github.io/locust-k8s-operator/migration/
- name: GitHub Discussions
url: https://github.com/AbdelrhmanHamouda/locust-k8s-operator/discussions
- name: GitHub Issues
url: https://github.com/AbdelrhmanHamouda/locust-k8s-operator/issues
artifacthub.io/changes: |
- kind: changed
description: Rewritten for Go operator with clean-slate design
Expand Down
98 changes: 98 additions & 0 deletions charts/locust-k8s-operator/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# Locust Kubernetes Operator

[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/locust-k8s-operator)](https://artifacthub.io/packages/helm/locust-k8s-operator/locust-k8s-operator)

Production-ready Kubernetes operator for distributed Locust load testing with native OpenTelemetry support, high availability, and automatic v1→v2 migration.

## ✨ Features

- **🚀 Complete Go Rewrite**: 4x memory reduction, 60x faster startup
- **📊 Native OpenTelemetry**: Built-in metrics, traces, and optional collector deployment
- **🔒 High Availability**: Leader election support for multi-replica deployments
- **🔄 Zero-Downtime Migration**: Automatic conversion webhooks for v1→v2 upgrades
- **💚 Pod Health Monitoring**: Automatic pod recovery and health checks
- **🛡️ Production-Ready**: Comprehensive RBAC, security contexts, and resource management

## 📦 Installation

### Prerequisites

- Kubernetes 1.25+
- Helm 3.x

### Quick Start

```bash
# Add the Helm repository
helm repo add locust-k8s-operator https://abdelrhmanhamouda.github.io/locust-k8s-operator

# Update repository
helm repo update

# Install the operator
helm install my-locust-operator locust-k8s-operator/locust-k8s-operator

# Install with OpenTelemetry collector
helm install my-locust-operator locust-k8s-operator/locust-k8s-operator \
--set otelCollector.enabled=true
```

## ⚙️ Configuration

### Key Values

| Parameter | Description | Default |
|-----------|-------------|---------|
| `replicas` | Number of operator replicas | `1` |
| `resources.limits.memory` | Memory limit | `128Mi` |
| `resources.limits.cpu` | CPU limit | `500m` |
| `webhook.enabled` | Enable conversion webhooks | `false` |
| `webhook.certManager.enabled` | Use cert-manager for webhook certs | `false` |
| `otelCollector.enabled` | Deploy OTel collector sidecar | `false` |
| `leaderElection.enabled` | Enable leader election for HA | `true` |

### Example: Production Deployment with HA

```bash
helm install locust-operator locust-k8s-operator/locust-k8s-operator \
--set replicas=3 \
--set leaderElection.enabled=true \
--set webhook.enabled=true \
--set webhook.certManager.enabled=true \
--set otelCollector.enabled=true \
--set resources.limits.memory=256Mi
```

## 🔄 Migrating from v1.x

The operator includes automatic conversion webhooks for zero-downtime migration:

```bash
# Install v2 with webhooks enabled
helm upgrade locust-operator locust-k8s-operator/locust-k8s-operator \
--set webhook.enabled=true \
--set webhook.certManager.enabled=true

# Your existing v1 LocustTest resources will be automatically converted
```

For detailed migration guide, see: https://abdelrhmanhamouda.github.io/locust-k8s-operator/migration/

## 📚 Documentation

- **Full Documentation**: https://abdelrhmanhamouda.github.io/locust-k8s-operator/
- **Getting Started**: https://abdelrhmanhamouda.github.io/locust-k8s-operator/getting_started/
- **Migration Guide**: https://abdelrhmanhamouda.github.io/locust-k8s-operator/migration/
- **API Reference**: https://abdelrhmanhamouda.github.io/locust-k8s-operator/api_reference/

## 🐛 Support

- **GitHub Issues**: https://github.com/AbdelrhmanHamouda/locust-k8s-operator/issues

## 📝 License

Apache-2.0

## 🙏 Contributing

Contributions are welcome! Please see our [Contributing Guide](https://github.com/AbdelrhmanHamouda/locust-k8s-operator/blob/master/CONTRIBUTING.md).
178 changes: 84 additions & 94 deletions internal/controller/locusttest_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -258,21 +258,25 @@ func (r *LocustTestReconciler) createResource(ctx context.Context, lt *locustv2.
return nil
}

// reconcileStatus updates the LocustTest status based on owned Job states.
// Called when resources already exist and we need to track Job completion.
func (r *LocustTestReconciler) reconcileStatus(ctx context.Context, lt *locustv2.LocustTest) (ctrl.Result, error) {
// handleExternalResourceDeletion handles the case where a resource was externally deleted.
// It transitions the LocustTest to Pending phase to trigger recreation on the next reconcile.
// Returns (shouldRequeue, requeueAfter, error).
func (r *LocustTestReconciler) handleExternalResourceDeletion(
ctx context.Context,
lt *locustv2.LocustTest,
resourceName, resourceKind string,
obj client.Object,
) (bool, time.Duration, error) {
log := logf.FromContext(ctx)

// Check for externally deleted Service
masterServiceName := lt.Name + "-master"
masterService := &corev1.Service{}
if err := r.Get(ctx, client.ObjectKey{Name: masterServiceName, Namespace: lt.Namespace}, masterService); err != nil {
// Try to fetch the resource
if err := r.Get(ctx, client.ObjectKey{Name: resourceName, Namespace: lt.Namespace}, obj); err != nil {
if apierrors.IsNotFound(err) {
// Service was externally deleted — transition to Pending for recovery
log.Info("Master Service externally deleted, transitioning to Pending for recovery",
"service", masterServiceName)
// Resource was externally deleted — transition to Pending for recovery
log.Info(fmt.Sprintf("%s externally deleted, transitioning to Pending for recovery", resourceKind),
resourceKind, resourceName)
r.Recorder.Event(lt, corev1.EventTypeWarning, "ResourceDeleted",
fmt.Sprintf("Master Service %s was deleted externally, will attempt recreation", masterServiceName))
fmt.Sprintf("%s %s was deleted externally, will attempt recreation", resourceKind, resourceName))

// Reset to Pending to trigger resource recreation on next reconcile
log.Info("Attempting to update status to Pending after external deletion",
Expand All @@ -292,104 +296,90 @@ func (r *LocustTestReconciler) reconcileStatus(ctx context.Context, lt *locustv2
r.setReady(lt, false, locustv2.ReasonResourcesCreating, "Recreating externally deleted resources")
return r.Status().Update(ctx, lt)
}); err != nil {
log.Error(err, "Failed to update status after detecting Service deletion",
"service", masterServiceName,
log.Error(err, fmt.Sprintf("Failed to update status after detecting %s deletion", resourceKind),
resourceKind, resourceName,
"retryAttempts", "exhausted")
// Still requeue to retry the entire reconciliation
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
return true, 5 * time.Second, nil
}
log.Info("Successfully updated status to Pending, will recreate Service",
"service", masterServiceName)
return ctrl.Result{RequeueAfter: time.Second}, nil
log.Info(fmt.Sprintf("Successfully updated status to Pending, will recreate %s", resourceKind),
resourceKind, resourceName)
return true, time.Second, nil
}
return ctrl.Result{}, fmt.Errorf("failed to get master Service: %w", err)
return false, 0, fmt.Errorf("failed to get %s: %w", resourceKind, err)
}

// Don't update if already in terminal state (unless resources are missing — handled above)
if lt.Status.Phase == locustv2.PhaseSucceeded || lt.Status.Phase == locustv2.PhaseFailed {
return ctrl.Result{}, nil
// Resource exists, no action needed
return false, 0, nil
}

// checkResourcesExist verifies that all required resources (Service, Master Job, Worker Job) exist.
// If any resource is missing, it handles external deletion recovery.
// Returns (masterJob, workerJob, shouldRequeue, requeueAfter, error).
func (r *LocustTestReconciler) checkResourcesExist(
ctx context.Context,
lt *locustv2.LocustTest,
) (*batchv1.Job, *batchv1.Job, bool, time.Duration, error) {
// Check for externally deleted Service
masterServiceName := lt.Name + "-master"
masterService := &corev1.Service{}
if shouldRequeue, requeueAfter, err := r.handleExternalResourceDeletion(
ctx, lt, masterServiceName, "Master Service", masterService,
); err != nil {
return nil, nil, false, 0, err
} else if shouldRequeue {
return nil, nil, true, requeueAfter, nil
}

// Fetch master Job to determine status
// Check for externally deleted master Job
masterJob := &batchv1.Job{}
masterJobName := lt.Name + "-master"
if err := r.Get(ctx, client.ObjectKey{Name: masterJobName, Namespace: lt.Namespace}, masterJob); err != nil {
if apierrors.IsNotFound(err) {
// Job was externally deleted — same recovery pattern
log.Info("Master Job externally deleted, transitioning to Pending for recovery",
"job", masterJobName)
r.Recorder.Event(lt, corev1.EventTypeWarning, "ResourceDeleted",
fmt.Sprintf("Master Job %s was deleted externally, will attempt recreation", masterJobName))

log.Info("Attempting to update status to Pending after external deletion",
"currentPhase", lt.Status.Phase,
"generation", lt.Generation,
"observedGeneration", lt.Status.ObservedGeneration)
if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
if err := r.Get(ctx, client.ObjectKeyFromObject(lt), lt); err != nil {
log.Error(err, "Failed to re-fetch LocustTest during status update retry")
return err
}
log.V(1).Info("Re-fetched LocustTest for status update",
"resourceVersion", lt.ResourceVersion,
"phase", lt.Status.Phase)
lt.Status.Phase = locustv2.PhasePending
lt.Status.ObservedGeneration = lt.Generation
r.setReady(lt, false, locustv2.ReasonResourcesCreating, "Recreating externally deleted resources")
return r.Status().Update(ctx, lt)
}); err != nil {
log.Error(err, "Failed to update status after detecting master Job deletion",
"job", masterJobName,
"retryAttempts", "exhausted")
// Still requeue to retry the entire reconciliation
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
}
log.Info("Successfully updated status to Pending, will recreate master Job",
"job", masterJobName)
return ctrl.Result{RequeueAfter: time.Second}, nil
}
return ctrl.Result{}, fmt.Errorf("failed to get master Job: %w", err)
if shouldRequeue, requeueAfter, err := r.handleExternalResourceDeletion(
ctx, lt, masterJobName, "Master Job", masterJob,
); err != nil {
return nil, nil, false, 0, err
} else if shouldRequeue {
return nil, nil, true, requeueAfter, nil
}

// Fetch worker Job for worker count
// Check for externally deleted worker Job
workerJob := &batchv1.Job{}
workerJobName := lt.Name + "-worker"
if err := r.Get(ctx, client.ObjectKey{Name: workerJobName, Namespace: lt.Namespace}, workerJob); err != nil {
if apierrors.IsNotFound(err) {
// Worker Job externally deleted — recovery
log.Info("Worker Job externally deleted, transitioning to Pending for recovery",
"job", workerJobName)
r.Recorder.Event(lt, corev1.EventTypeWarning, "ResourceDeleted",
fmt.Sprintf("Worker Job %s was deleted externally, will attempt recreation", workerJobName))
if shouldRequeue, requeueAfter, err := r.handleExternalResourceDeletion(
ctx, lt, workerJobName, "Worker Job", workerJob,
); err != nil {
return nil, nil, false, 0, err
} else if shouldRequeue {
return nil, nil, true, requeueAfter, nil
}

log.Info("Attempting to update status to Pending after external deletion",
"currentPhase", lt.Status.Phase,
"generation", lt.Generation,
"observedGeneration", lt.Status.ObservedGeneration)
if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
if err := r.Get(ctx, client.ObjectKeyFromObject(lt), lt); err != nil {
log.Error(err, "Failed to re-fetch LocustTest during status update retry")
return err
}
log.V(1).Info("Re-fetched LocustTest for status update",
"resourceVersion", lt.ResourceVersion,
"phase", lt.Status.Phase)
lt.Status.Phase = locustv2.PhasePending
lt.Status.ObservedGeneration = lt.Generation
r.setReady(lt, false, locustv2.ReasonResourcesCreating, "Recreating externally deleted resources")
return r.Status().Update(ctx, lt)
}); err != nil {
log.Error(err, "Failed to update status after detecting worker Job deletion",
"job", workerJobName,
"retryAttempts", "exhausted")
// Still requeue to retry the entire reconciliation
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
}
log.Info("Successfully updated status to Pending, will recreate worker Job",
"job", workerJobName)
return ctrl.Result{RequeueAfter: time.Second}, nil
}
return ctrl.Result{}, fmt.Errorf("failed to get worker Job: %w", err)
// All resources exist
return masterJob, workerJob, false, 0, nil
}

// shouldSkipStatusUpdate reports whether the LocustTest has reached a
// terminal phase (Succeeded or Failed), in which case further status
// updates should be skipped by the reconciler.
func shouldSkipStatusUpdate(lt *locustv2.LocustTest) bool {
	switch lt.Status.Phase {
	case locustv2.PhaseSucceeded, locustv2.PhaseFailed:
		return true
	default:
		return false
	}
}

// reconcileStatus updates the LocustTest status based on owned Job states.
// Called when resources already exist and we need to track Job completion.
func (r *LocustTestReconciler) reconcileStatus(ctx context.Context, lt *locustv2.LocustTest) (ctrl.Result, error) {
log := logf.FromContext(ctx)

// Check that all resources exist, handle external deletion if needed
masterJob, workerJob, shouldRequeue, requeueAfter, err := r.checkResourcesExist(ctx, lt)
if err != nil {
return ctrl.Result{}, err
}
if shouldRequeue {
return ctrl.Result{RequeueAfter: requeueAfter}, nil
}

// Don't update if already in terminal state (unless resources are missing — handled above)
if shouldSkipStatusUpdate(lt) {
return ctrl.Result{}, nil
}

// Check pod health before updating status from Jobs
Expand Down
Loading