diff --git a/.codacy.yml b/.codacy.yml index cec35e4..d06261e 100644 --- a/.codacy.yml +++ b/.codacy.yml @@ -1,3 +1,4 @@ exclude_paths: - "**/zz_generated.deepcopy.go" - "test/utils/**" + - "**/*.md" diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 00a625a..94984a6 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -115,6 +115,10 @@ jobs: run: | python -m mkdocs build + - name: 📝 Copy Artifact Hub metadata + run: | + cp artifacthub-repo.yml ./site/ + - name: 🚀 Deploy documentation uses: peaceiris/actions-gh-pages@v4 with: diff --git a/artifacthub-repo.yml b/artifacthub-repo.yml new file mode 100644 index 0000000..cb15f05 --- /dev/null +++ b/artifacthub-repo.yml @@ -0,0 +1,7 @@ +# Artifact Hub repository metadata file +# This file enables Verified Publisher status and ownership claim + +repositoryID: 8e1a59f6-36d5-4e2a-805e-aeb1c38784c0 +owners: + - name: Abdelrhman Hamouda + email: abdelrhman.ahmed@outlook.com diff --git a/charts/locust-k8s-operator/Chart.yaml b/charts/locust-k8s-operator/Chart.yaml index e3a3ac2..b1c9a5e 100644 --- a/charts/locust-k8s-operator/Chart.yaml +++ b/charts/locust-k8s-operator/Chart.yaml @@ -1,8 +1,19 @@ apiVersion: v2 name: locust-k8s-operator -description: Locust Kubernetes Operator - Go v2 +description: Production-ready Kubernetes operator for distributed Locust load testing with native OpenTelemetry, HA support, and automatic v1→v2 migration icon: https://raw.githubusercontent.com/AbdelrhmanHamouda/locust-k8s-operator/master/docs/assets/images/favicon.png +# Keywords for Artifact Hub search discoverability +keywords: + - locust + - load-testing + - performance-testing + - kubernetes-operator + - distributed-testing + - opentelemetry + - observability + - stress-testing + # A chart can be either an 'application' or a 'library' chart. # # Application charts are a collection of templates that can be packaged into versioned archives @@ -16,7 +27,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 2.0.0 +version: 2.0.1 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to @@ -29,10 +40,27 @@ home: https://github.com/AbdelrhmanHamouda/locust-k8s-operator sources: - https://github.com/AbdelrhmanHamouda/locust-k8s-operator maintainers: - - name: AbdelrhmanHamouda + - name: Abdelrhman Hamouda url: https://github.com/AbdelrhmanHamouda annotations: + # Artifact Hub specific metadata + artifacthub.io/license: Apache-2.0 + artifacthub.io/operator: "true" + artifacthub.io/operatorCapabilities: Basic Install + artifacthub.io/prerelease: "false" + artifacthub.io/containsSecurityUpdates: "false" + artifacthub.io/links: | + - name: Documentation + url: https://abdelrhmanhamouda.github.io/locust-k8s-operator/ + - name: Getting Started + url: https://abdelrhmanhamouda.github.io/locust-k8s-operator/getting_started/ + - name: Migration Guide (v1→v2) + url: https://abdelrhmanhamouda.github.io/locust-k8s-operator/migration/ + - name: GitHub Discussions + url: https://github.com/AbdelrhmanHamouda/locust-k8s-operator/discussions + - name: GitHub Issues + url: https://github.com/AbdelrhmanHamouda/locust-k8s-operator/issues artifacthub.io/changes: | - kind: changed description: Rewritten for Go operator with clean-slate design diff --git a/charts/locust-k8s-operator/README.md b/charts/locust-k8s-operator/README.md new file mode 100644 index 0000000..bf5495f --- /dev/null +++ b/charts/locust-k8s-operator/README.md @@ -0,0 +1,98 @@ +# Locust Kubernetes Operator + +[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/locust-k8s-operator)](https://artifacthub.io/packages/helm/locust-k8s-operator/locust-k8s-operator) + +Production-ready Kubernetes operator for distributed Locust load testing with native OpenTelemetry support, high availability, and automatic v1→v2 migration. + +## ✨ Features + +- **🚀 Complete Go Rewrite**: 4x memory reduction, 60x faster startup +- **📊 Native OpenTelemetry**: Built-in metrics, traces, and optional collector deployment +- **🔒 High Availability**: Leader election support for multi-replica deployments +- **🔄 Zero-Downtime Migration**: Automatic conversion webhooks for v1→v2 upgrades +- **💚 Pod Health Monitoring**: Automatic pod recovery and health checks +- **🛡️ Production-Ready**: Comprehensive RBAC, security contexts, and resource management + +## 📦 Installation + +### Prerequisites + +- Kubernetes 1.25+ +- Helm 3.x + +### Quick Start + +```bash +# Add the Helm repository +helm repo add locust-k8s-operator https://abdelrhmanhamouda.github.io/locust-k8s-operator + +# Update repository +helm repo update + +# Install the operator +helm install my-locust-operator locust-k8s-operator/locust-k8s-operator + +# Install with OpenTelemetry collector +helm install my-locust-operator locust-k8s-operator/locust-k8s-operator \ + --set otelCollector.enabled=true +``` + +## ⚙️ Configuration + +### Key Values + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `replicas` | Number of operator replicas | `1` | +| `resources.limits.memory` | Memory limit | `128Mi` | +| `resources.limits.cpu` | CPU limit | `500m` | +| `webhook.enabled` | Enable conversion webhooks | `false` | +| `webhook.certManager.enabled` | Use cert-manager for webhook certs | `false` | +| `otelCollector.enabled` | Deploy OTel collector sidecar | `false` | +| `leaderElection.enabled` | Enable leader election for HA | `true` | + +### Example: Production Deployment with HA + +```bash +helm install locust-operator locust-k8s-operator/locust-k8s-operator \ + --set replicas=3 \ + --set leaderElection.enabled=true \ + --set webhook.enabled=true \ + --set webhook.certManager.enabled=true \ + --set otelCollector.enabled=true \ + --set resources.limits.memory=256Mi +``` + +## 🔄 Migrating from v1.x + +The operator includes automatic conversion webhooks for zero-downtime migration: + +```bash +# Install v2 with webhooks enabled +helm upgrade locust-operator locust-k8s-operator/locust-k8s-operator \ + --set webhook.enabled=true \ + --set webhook.certManager.enabled=true + +# Your existing v1 LocustTest resources will be automatically converted +``` + +For detailed migration guide, see: https://abdelrhmanhamouda.github.io/locust-k8s-operator/migration/ + +## 📚 Documentation + +- **Full Documentation**: https://abdelrhmanhamouda.github.io/locust-k8s-operator/ +- **Getting Started**: https://abdelrhmanhamouda.github.io/locust-k8s-operator/getting_started/ +- **Migration Guide**: https://abdelrhmanhamouda.github.io/locust-k8s-operator/migration/ +- **API Reference**: https://abdelrhmanhamouda.github.io/locust-k8s-operator/api_reference/ + +## 🐛 Support + +- **GitHub Issues**: https://github.com/AbdelrhmanHamouda/locust-k8s-operator/issues + +## 📝 License + +Apache-2.0 + +## 🙏 Contributing + +Contributions are welcome! Please see our [Contributing Guide](https://github.com/AbdelrhmanHamouda/locust-k8s-operator/blob/master/CONTRIBUTING.md). diff --git a/internal/controller/locusttest_controller.go b/internal/controller/locusttest_controller.go index 6751e85..040b03c 100644 --- a/internal/controller/locusttest_controller.go +++ b/internal/controller/locusttest_controller.go @@ -258,21 +258,25 @@ func (r *LocustTestReconciler) createResource(ctx context.Context, lt *locustv2. return nil } -// reconcileStatus updates the LocustTest status based on owned Job states. -// Called when resources already exist and we need to track Job completion. -func (r *LocustTestReconciler) reconcileStatus(ctx context.Context, lt *locustv2.LocustTest) (ctrl.Result, error) { +// handleExternalResourceDeletion handles the case where a resource was externally deleted. +// It transitions the LocustTest to Pending phase to trigger recreation on the next reconcile. +// Returns (shouldRequeue, requeueAfter, error). +func (r *LocustTestReconciler) handleExternalResourceDeletion( + ctx context.Context, + lt *locustv2.LocustTest, + resourceName, resourceKind string, + obj client.Object, +) (bool, time.Duration, error) { log := logf.FromContext(ctx) - // Check for externally deleted Service - masterServiceName := lt.Name + "-master" - masterService := &corev1.Service{} - if err := r.Get(ctx, client.ObjectKey{Name: masterServiceName, Namespace: lt.Namespace}, masterService); err != nil { + // Try to fetch the resource + if err := r.Get(ctx, client.ObjectKey{Name: resourceName, Namespace: lt.Namespace}, obj); err != nil { if apierrors.IsNotFound(err) { - // Service was externally deleted — transition to Pending for recovery - log.Info("Master Service externally deleted, transitioning to Pending for recovery", - "service", masterServiceName) + // Resource was externally deleted — transition to Pending for recovery + log.Info(fmt.Sprintf("%s externally deleted, transitioning to Pending for recovery", resourceKind), + resourceKind, resourceName) r.Recorder.Event(lt, corev1.EventTypeWarning, "ResourceDeleted", - fmt.Sprintf("Master Service %s was deleted externally, will attempt recreation", masterServiceName)) + fmt.Sprintf("%s %s was deleted externally, will attempt recreation", resourceKind, resourceName)) // Reset to Pending to trigger resource recreation on next reconcile log.Info("Attempting to update status to Pending after external deletion", @@ -292,104 +296,90 @@ func (r *LocustTestReconciler) reconcileStatus(ctx context.Context, lt *locustv2 r.setReady(lt, false, locustv2.ReasonResourcesCreating, "Recreating externally deleted resources") return r.Status().Update(ctx, lt) }); err != nil { - log.Error(err, "Failed to update status after detecting Service deletion", - "service", masterServiceName, + log.Error(err, fmt.Sprintf("Failed to update status after detecting %s deletion", resourceKind), + resourceKind, resourceName, "retryAttempts", "exhausted") // Still requeue to retry the entire reconciliation - return ctrl.Result{RequeueAfter: 5 * time.Second}, nil + return true, 5 * time.Second, nil } - log.Info("Successfully updated status to Pending, will recreate Service", - "service", masterServiceName) - return ctrl.Result{RequeueAfter: time.Second}, nil + log.Info(fmt.Sprintf("Successfully updated status to Pending, will recreate %s", resourceKind), + resourceKind, resourceName) + return true, time.Second, nil } - return ctrl.Result{}, fmt.Errorf("failed to get master Service: %w", err) + return false, 0, fmt.Errorf("failed to get %s: %w", resourceKind, err) } - // Don't update if already in terminal state (unless resources are missing — handled above) - if lt.Status.Phase == locustv2.PhaseSucceeded || lt.Status.Phase == locustv2.PhaseFailed { - return ctrl.Result{}, nil + // Resource exists, no action needed + return false, 0, nil +} + +// checkResourcesExist verifies that all required resources (Service, Master Job, Worker Job) exist. +// If any resource is missing, it handles external deletion recovery. +// Returns (masterJob, workerJob, shouldRequeue, requeueAfter, error). +func (r *LocustTestReconciler) checkResourcesExist( + ctx context.Context, + lt *locustv2.LocustTest, +) (*batchv1.Job, *batchv1.Job, bool, time.Duration, error) { + // Check for externally deleted Service + masterServiceName := lt.Name + "-master" + masterService := &corev1.Service{} + if shouldRequeue, requeueAfter, err := r.handleExternalResourceDeletion( + ctx, lt, masterServiceName, "Master Service", masterService, + ); err != nil { + return nil, nil, false, 0, err + } else if shouldRequeue { + return nil, nil, true, requeueAfter, nil } - // Fetch master Job to determine status + // Check for externally deleted master Job masterJob := &batchv1.Job{} masterJobName := lt.Name + "-master" - if err := r.Get(ctx, client.ObjectKey{Name: masterJobName, Namespace: lt.Namespace}, masterJob); err != nil { - if apierrors.IsNotFound(err) { - // Job was externally deleted — same recovery pattern - log.Info("Master Job externally deleted, transitioning to Pending for recovery", - "job", masterJobName) - r.Recorder.Event(lt, corev1.EventTypeWarning, "ResourceDeleted", - fmt.Sprintf("Master Job %s was deleted externally, will attempt recreation", masterJobName)) - - log.Info("Attempting to update status to Pending after external deletion", - "currentPhase", lt.Status.Phase, - "generation", lt.Generation, - "observedGeneration", lt.Status.ObservedGeneration) - if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { - if err := r.Get(ctx, client.ObjectKeyFromObject(lt), lt); err != nil { - log.Error(err, "Failed to re-fetch LocustTest during status update retry") - return err - } - log.V(1).Info("Re-fetched LocustTest for status update", - "resourceVersion", lt.ResourceVersion, - "phase", lt.Status.Phase) - lt.Status.Phase = locustv2.PhasePending - lt.Status.ObservedGeneration = lt.Generation - r.setReady(lt, false, locustv2.ReasonResourcesCreating, "Recreating externally deleted resources") - return r.Status().Update(ctx, lt) - }); err != nil { - log.Error(err, "Failed to update status after detecting master Job deletion", - "job", masterJobName, - "retryAttempts", "exhausted") - // Still requeue to retry the entire reconciliation - return ctrl.Result{RequeueAfter: 5 * time.Second}, nil - } - log.Info("Successfully updated status to Pending, will recreate master Job", - "job", masterJobName) - return ctrl.Result{RequeueAfter: time.Second}, nil - } - return ctrl.Result{}, fmt.Errorf("failed to get master Job: %w", err) + if shouldRequeue, requeueAfter, err := r.handleExternalResourceDeletion( + ctx, lt, masterJobName, "Master Job", masterJob, + ); err != nil { + return nil, nil, false, 0, err + } else if shouldRequeue { + return nil, nil, true, requeueAfter, nil } - // Fetch worker Job for worker count + // Check for externally deleted worker Job workerJob := &batchv1.Job{} workerJobName := lt.Name + "-worker" - if err := r.Get(ctx, client.ObjectKey{Name: workerJobName, Namespace: lt.Namespace}, workerJob); err != nil { - if apierrors.IsNotFound(err) { - // Worker Job externally deleted — recovery - log.Info("Worker Job externally deleted, transitioning to Pending for recovery", - "job", workerJobName) - r.Recorder.Event(lt, corev1.EventTypeWarning, "ResourceDeleted", - fmt.Sprintf("Worker Job %s was deleted externally, will attempt recreation", workerJobName)) + if shouldRequeue, requeueAfter, err := r.handleExternalResourceDeletion( + ctx, lt, workerJobName, "Worker Job", workerJob, + ); err != nil { + return nil, nil, false, 0, err + } else if shouldRequeue { + return nil, nil, true, requeueAfter, nil + } - log.Info("Attempting to update status to Pending after external deletion", - "currentPhase", lt.Status.Phase, - "generation", lt.Generation, - "observedGeneration", lt.Status.ObservedGeneration) - if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { - if err := r.Get(ctx, client.ObjectKeyFromObject(lt), lt); err != nil { - log.Error(err, "Failed to re-fetch LocustTest during status update retry") - return err - } - log.V(1).Info("Re-fetched LocustTest for status update", - "resourceVersion", lt.ResourceVersion, - "phase", lt.Status.Phase) - lt.Status.Phase = locustv2.PhasePending - lt.Status.ObservedGeneration = lt.Generation - r.setReady(lt, false, locustv2.ReasonResourcesCreating, "Recreating externally deleted resources") - return r.Status().Update(ctx, lt) - }); err != nil { - log.Error(err, "Failed to update status after detecting worker Job deletion", - "job", workerJobName, - "retryAttempts", "exhausted") - // Still requeue to retry the entire reconciliation - return ctrl.Result{RequeueAfter: 5 * time.Second}, nil - } - log.Info("Successfully updated status to Pending, will recreate worker Job", - "job", workerJobName) - return ctrl.Result{RequeueAfter: time.Second}, nil - } - return ctrl.Result{}, fmt.Errorf("failed to get worker Job: %w", err) + // All resources exist + return masterJob, workerJob, false, 0, nil +} + +// shouldSkipStatusUpdate checks if the LocustTest is in a terminal state where status updates should be skipped. +// Returns true if Phase is Succeeded or Failed (terminal states). +func shouldSkipStatusUpdate(lt *locustv2.LocustTest) bool { + return lt.Status.Phase == locustv2.PhaseSucceeded || lt.Status.Phase == locustv2.PhaseFailed +} + +// reconcileStatus updates the LocustTest status based on owned Job states. +// Called when resources already exist and we need to track Job completion. +func (r *LocustTestReconciler) reconcileStatus(ctx context.Context, lt *locustv2.LocustTest) (ctrl.Result, error) { + log := logf.FromContext(ctx) + + // Check that all resources exist, handle external deletion if needed + masterJob, workerJob, shouldRequeue, requeueAfter, err := r.checkResourcesExist(ctx, lt) + if err != nil { + return ctrl.Result{}, err + } + if shouldRequeue { + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } + + // Don't update if already in terminal state (unless resources are missing — handled above) + if shouldSkipStatusUpdate(lt) { + return ctrl.Result{}, nil } // Check pod health before updating status from Jobs diff --git a/internal/resources/job.go b/internal/resources/job.go index 1925d00..dc24320 100644 --- a/internal/resources/job.go +++ b/internal/resources/job.go @@ -18,6 +18,7 @@ package resources import ( "fmt" + "log" locustv2 "github.com/AbdelrhmanHamouda/locust-k8s-operator/api/v2" "github.com/AbdelrhmanHamouda/locust-k8s-operator/internal/config" @@ -385,21 +386,29 @@ func hasResourcesSpecified(r *corev1.ResourceRequirements) bool { // buildResourceList creates a ResourceList from CPU, memory, and ephemeral storage strings. // Empty strings are skipped (not added to the resource list). -// Safe parsing is used (errors ignored) because values are pre-validated at operator startup. +// Values are pre-validated at operator startup, but defensive logging is included to detect validation bypasses. func buildResourceList(cpu, memory, ephemeral string) corev1.ResourceList { resources := corev1.ResourceList{} if cpu != "" { - // Safe: Already validated at startup in LoadConfig - q, _ := resource.ParseQuantity(cpu) + q, err := resource.ParseQuantity(cpu) + if err != nil { + log.Printf("CRITICAL: Invalid CPU quantity %q passed startup validation: %v", cpu, err) + } resources[corev1.ResourceCPU] = q } if memory != "" { - q, _ := resource.ParseQuantity(memory) + q, err := resource.ParseQuantity(memory) + if err != nil { + log.Printf("CRITICAL: Invalid memory quantity %q passed startup validation: %v", memory, err) + } resources[corev1.ResourceMemory] = q } if ephemeral != "" { - q, _ := resource.ParseQuantity(ephemeral) + q, err := resource.ParseQuantity(ephemeral) + if err != nil { + log.Printf("CRITICAL: Invalid ephemeral storage quantity %q passed startup validation: %v", ephemeral, err) + } resources[corev1.ResourceEphemeralStorage] = q }