diff --git a/test/stress-scale-tests/README.md b/test/stress-scale-tests/README.md new file mode 100644 index 000000000..f0bb2d657 --- /dev/null +++ b/test/stress-scale-tests/README.md @@ -0,0 +1,179 @@ +# AWS EFS CSI Driver Stress and Scalability Tests + +This framework provides comprehensive stress and scalability testing for the AWS EFS CSI Driver in Kubernetes environments. + +## Overview + +The test framework automatically generates load on the EFS CSI Driver by creating and managing PVCs and Pods according to configurable parameters. It tests various access patterns and scenarios to ensure reliability under stress. + +## Features + +- **Orchestrated Testing**: Random sequence of volume and pod operations with configurable weights +- **Scenario Testing**: Specialized test scenarios including: + - Shared Volume Access (multiple pods using a single PVC to test ReadWriteMany capability) + - Dedicated Volume Access (individual pods with dedicated PVCs to test isolation) + - Concurrent Volume Operations (rapid creation and deletion of multiple PVCs to test API handling) +- **Shared Volume Testing**: Verifies read/write operations between pods sharing volumes +- **Comprehensive Reporting**: Detailed logs and metrics in JSON and summary formats +- **Configurable Parameters**: Adjust test duration, operation rates, resource limits, and more + +## Prerequisites + +- AWS Account with appropriate permissions for: + - EFS filesystem creation and management + - EKS cluster management (if creating a new cluster) +- Kubernetes cluster with: + - EFS CSI Driver installed (unless using the orchestrator to install it) + - Node(s) in the same VPC as your EFS filesystem +- `kubectl` configured to access the cluster +- Required Python packages (install via requirements.txt): + - kubernetes + - pytest + - pyyaml + - prometheus_client + - pandas + - psutil + - boto3 + +## Quick Start + +### Important Configuration Notes + +Before running tests, you'll need to configure key settings in `config/orchestrator_config.yaml`. The most important sections are: + +1. **Driver Configuration**: + ```yaml + driver: + create_filesystem: true/false # Set to true to automatically create a new EFS filesystem + filesystem_id: fs-xxx # Required if create_filesystem is false (use existing filesystem) + # Note: If create_filesystem is true, boto3 will be used to create the filesystem + ``` + +2. **Storage Class Configuration**: + ```yaml + storage_class: + parameters: + fileSystemId: fs-xxx # Must match your filesystem_id + region: us-west-1 # Your AWS region + availabilityZoneName: us-west-1b # AZ where your nodes are running + ``` + +3. **Pod Configuration**: + ```yaml + pod_config: + node_selector: + topology.kubernetes.io/zone: us-west-1b # Must match your node's AZ + ``` + +### Getting Started + +1. Set up a Python virtual environment (recommended): + ``` + # Create a virtual environment + python -m venv venv + + # Activate the virtual environment + # On Linux/macOS: + source venv/bin/activate + # On Windows: + # venv\Scripts\activate + ``` + +2. Install dependencies: + ``` + pip install -r requirements.txt + ``` + +3. Configure the test parameters in `config/orchestrator_config.yaml` + +4. Run the tests: + ``` + python run_tests.py + ``` + +## Configuration Structure + +The configuration is modularized into separate components for better organization and clarity: + +1. `config/orchestrator_config.yaml`: Main configuration file that imports component configurations +2. 
Component configurations in `config/components/`:
+   - `driver.yaml`: Driver installation and resource settings
+   - `storage.yaml`: Storage class configuration
+   - `test.yaml`: Test parameters, metrics, and reporting settings
+   - `pod.yaml`: Pod configuration settings
+   - `scenarios.yaml`: Test scenario definitions
+
+Each component file is well-documented with comments explaining available options. The modular structure allows you to:
+- Focus on specific configuration aspects independently
+- Easily understand which settings are related
+- Comment out unused sections without affecting other components
+- Override specific settings in the main config file if needed
+
+### Key Configuration Parameters
+
+Most commonly adjusted settings:
+
+1. In `driver.yaml`:
+   - `driver.create_filesystem`: Whether to create a new EFS filesystem
+   - `driver.filesystem_id`: Your EFS filesystem ID
+
+2. In `storage.yaml`:
+   - `storage_class.parameters.fileSystemId`: Must match your filesystem_id
+   - `storage_class.parameters.region`: Your AWS region
+   - `storage_class.parameters.availabilityZoneName`: Your AZ
+
+3. In `test.yaml`:
+   - `test.duration`: Test duration in seconds
+   - `test.namespace`: Kubernetes namespace for test resources
+   - `test.operation_interval`: Time between operations
+
+4. In `pod.yaml`:
+   - `pod_config.node_selector`: Must match your node's availability zone
+
+5. In `scenarios.yaml`:
+   - Enable/disable specific test scenarios as needed
+   - Adjust scenario parameters like pod counts and PVC limits
+
+## Running Tests
+
+Basic test with default parameters:
+```
+python run_tests.py
+```
+
+Run with custom duration (e.g., 2 hours):
+```
+python run_tests.py --duration 7200
+```
+
+Run with custom interval (seconds between operations):
+```
+python run_tests.py --interval 10
+```
+
+## Cleanup
+
+To clean up resources created by tests:
+```
+python cleanup_test_resources.py
+```
+
+## Reports
+
+Test reports are stored in:
+- `reports/orchestrator/`: Orchestrator test reports (JSON)
+- `reports/general/`: General test summary reports
+
+## Architecture
+
+- `tests/orchestrator.py`: Main test orchestration engine
+- `utils/metrics_collector.py`: Collects performance metrics
+- `utils/report_generator.py`: Generates test reports
+- `run_tests.py`: CLI for running tests
+- `cleanup_test_resources.py`: Utility for cleaning up test resources
+
+## Notes
+
+- The tests use `kubectl` subprocess calls for pod exec operations to avoid WebSocket protocol issues
+- All tests run in the namespace specified in the config (default: `efs-stress-test`)
diff --git a/test/stress-scale-tests/cleanup_test_resources.py b/test/stress-scale-tests/cleanup_test_resources.py
new file mode 100755
index 000000000..68d6743d7
--- /dev/null
+++ b/test/stress-scale-tests/cleanup_test_resources.py
@@ -0,0 +1,231 @@
+#!/usr/bin/env python3
+
+import argparse
+import logging
+import os
+import time
+from datetime import datetime
+from kubernetes import client, config
+from kubernetes.client.rest import ApiException
+
+"""
+EFS CSI Driver Test Cleanup Script
+This script deletes all test-related resources to ensure a clean environment
+"""
+
+# Configure logging
+os.makedirs('logs', exist_ok=True)
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler(f'logs/cleanup_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
+
+def 
delete_resource(api_instance, resource_type, name, namespace, force=False): + """Delete a specific Kubernetes resource with proper error handling""" + try: + logger.info(f"Deleting {resource_type}/{name} in namespace {namespace}") + + # Set deletion options + body = client.V1DeleteOptions( + grace_period_seconds=0 if force else None, + propagation_policy="Background" if force else "Foreground" + ) + + if resource_type == "pod": + api_instance.delete_namespaced_pod(name=name, namespace=namespace, body=body) + elif resource_type == "pvc": + api_instance.delete_namespaced_persistent_volume_claim(name=name, namespace=namespace, body=body) + + return True + except ApiException as e: + if e.status == 404: + logger.warning(f"{resource_type}/{name} already deleted or not found") + return True + else: + logger.error(f"Failed to delete {resource_type}/{name}: {e}") + return False + +def cleanup_test_resources(namespaces=None, pod_prefixes=None, pvc_prefixes=None, force=True, wait=True): + """Clean up all test-related resources""" + # Load kube config + try: + config.load_kube_config() + except Exception as e: + logger.error(f"Failed to load kubeconfig: {e}") + return False + + core_v1 = client.CoreV1Api() + + # Default prefixes if none provided + if not pod_prefixes: + pod_prefixes = ["test-pod-", "efs-scale-test-", "efs-app", "efs-sanity-pod"] + + if not pvc_prefixes: + pvc_prefixes = ["test-pvc-", "concurrent-pvc-", "many2one-", "one2one-", + "chaos-pvc-", "chaos-ap-", "efs-volume-", "scale-test-pvc"] + + # Get list of namespaces to clean up + try: + if not namespaces: + namespaces_list = core_v1.list_namespace() + namespaces = [ns.metadata.name for ns in namespaces_list.items] + # Filter only to likely test namespaces to avoid touching system namespaces + namespaces = [ns for ns in namespaces if ns in + ["default", "efs-stress-test", "efs-test", "test"]] + + logger.info(f"Cleaning up resources in namespaces: {namespaces}") + except ApiException as e: + logger.error(f"Failed to list namespaces: {e}") + return False + + # Track deleted resources and failures + deleted_resources = { + "pods": [], + "pvcs": [] + } + failed_deletions = { + "pods": [], + "pvcs": [] + } + + # Delete pods that match the prefixes in each namespace + for namespace in namespaces: + try: + # Get all pods in the namespace + pods = core_v1.list_namespaced_pod(namespace=namespace) + + # Filter pods that match the prefixes + for pod in pods.items: + pod_name = pod.metadata.name + if any(pod_name.startswith(prefix) for prefix in pod_prefixes): + success = delete_resource(core_v1, "pod", pod_name, namespace, force) + if success: + deleted_resources["pods"].append(f"{namespace}/{pod_name}") + else: + failed_deletions["pods"].append(f"{namespace}/{pod_name}") + except ApiException as e: + logger.error(f"Failed to list pods in namespace {namespace}: {e}") + + # Wait briefly for pods to start terminating + if wait: + logger.info("Waiting 5 seconds for pods to start terminating before deleting PVCs...") + time.sleep(5) + + # Delete PVCs that match the prefixes in each namespace + for namespace in namespaces: + try: + # Get all PVCs in the namespace + pvcs = core_v1.list_namespaced_persistent_volume_claim(namespace=namespace) + + # Filter PVCs that match the prefixes + for pvc in pvcs.items: + pvc_name = pvc.metadata.name + if any(pvc_name.startswith(prefix) for prefix in pvc_prefixes): + success = delete_resource(core_v1, "pvc", pvc_name, namespace, force) + if success: + deleted_resources["pvcs"].append(f"{namespace}/{pvc_name}") + 
else: + failed_deletions["pvcs"].append(f"{namespace}/{pvc_name}") + except ApiException as e: + logger.error(f"Failed to list PVCs in namespace {namespace}: {e}") + + # Print summary + logger.info("Cleanup Summary:") + logger.info(f"Deleted {len(deleted_resources['pods'])} pods and {len(deleted_resources['pvcs'])} PVCs") + + if failed_deletions["pods"] or failed_deletions["pvcs"]: + logger.warning("Failed deletions:") + for pod in failed_deletions["pods"]: + logger.warning(f" - Pod: {pod}") + for pvc in failed_deletions["pvcs"]: + logger.warning(f" - PVC: {pvc}") + return False + else: + logger.info("All resources deleted successfully") + return True + +def verify_resources_deleted(namespaces=None, pod_prefixes=None, pvc_prefixes=None, timeout=60): + """Verify that resources have been completely deleted""" + if not namespaces: + namespaces = ["default", "efs-stress-test", "efs-test", "test"] + + if not pod_prefixes: + pod_prefixes = ["test-pod-", "efs-scale-test-", "efs-app", "efs-sanity-pod"] + + if not pvc_prefixes: + pvc_prefixes = ["test-pvc-", "concurrent-pvc-", "many2one-", "one2one-", + "chaos-pvc-", "chaos-ap-", "efs-volume-", "scale-test-pvc"] + + logger.info(f"Verifying resource deletion for up to {timeout} seconds...") + + start_time = time.time() + core_v1 = client.CoreV1Api() + + while time.time() - start_time < timeout: + remaining_resources = [] + + # Check each namespace for remaining resources + for namespace in namespaces: + try: + # Check for remaining pods + pods = core_v1.list_namespaced_pod(namespace=namespace) + for pod in pods.items: + if any(pod.metadata.name.startswith(prefix) for prefix in pod_prefixes): + remaining_resources.append(f"pod/{namespace}/{pod.metadata.name}") + + # Check for remaining PVCs + pvcs = core_v1.list_namespaced_persistent_volume_claim(namespace=namespace) + for pvc in pvcs.items: + if any(pvc.metadata.name.startswith(prefix) for prefix in pvc_prefixes): + remaining_resources.append(f"pvc/{namespace}/{pvc.metadata.name}") + + except ApiException as e: + logger.error(f"Error checking namespace {namespace}: {e}") + + if not remaining_resources: + logger.info(f"All resources deleted successfully after {time.time() - start_time:.1f} seconds") + return True + + logger.info(f"Still waiting on {len(remaining_resources)} resources to be deleted...") + time.sleep(5) + + # If we get here, we timed out waiting for deletion + logger.error(f"Timed out waiting for resource deletion. 
Remaining resources:") + for resource in remaining_resources: + logger.error(f" - {resource}") + + return False + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Clean up EFS CSI Driver test resources") + parser.add_argument("--namespaces", "-n", type=str, nargs="+", + help="Namespaces to clean up (default: default, efs-stress-test)") + parser.add_argument("--force", "-f", action="store_true", default=True, + help="Force deletion with grace period 0 (default: True)") + parser.add_argument("--verify", "-v", action="store_true", default=True, + help="Verify that all resources are deleted (default: True)") + parser.add_argument("--verify-timeout", "-t", type=int, default=60, + help="Timeout in seconds for verification (default: 60)") + + args = parser.parse_args() + + # Start the cleanup process + logger.info("Starting EFS CSI Driver test resource cleanup") + success = cleanup_test_resources( + namespaces=args.namespaces, + force=args.force + ) + + # Verify deletion if requested + if args.verify and success: + verify_resources_deleted( + namespaces=args.namespaces, + timeout=args.verify_timeout + ) + + logger.info("Cleanup process completed") diff --git a/test/stress-scale-tests/config/components/driver.yaml b/test/stress-scale-tests/config/components/driver.yaml new file mode 100644 index 000000000..458592bae --- /dev/null +++ b/test/stress-scale-tests/config/components/driver.yaml @@ -0,0 +1,56 @@ +# Driver installation and configuration +driver: + # ECR repository for the EFS CSI driver image + repository: 745939127895.dkr.ecr.us-east-1.amazonaws.com/amazon/aws-efs-csi-driver + # Kubernetes namespace for the driver + namespace: kube-system + # Driver version to deploy + version: 2.1.8 + # Existing EFS filesystem ID + filesystem_id: fs-024e09bcfd37c74aa + # Installation method (currently only helm is supported) + install_method: helm + +# Driver resource configuration +driver_resources: + controller: + # Whether to delete access point root directory on cleanup + delete_access_point_root_dir: true + # Controller log level (1-5) + log_level: 5 + resources: + limits: + cpu: 4000m + memory: 8.0Gi + requests: + cpu: 200m + memory: 1Gi + + # Node daemon configuration + node: + log_level: 5 + resources: + limits: + cpu: 1000m + memory: 1Gi + requests: + cpu: 100m + memory: 128Mi + # Optional sidecar configurations + sidecars: + liveness_probe: + resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 10m + memory: 20Mi + node_driver_registrar: + resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 10m + memory: 20Mi diff --git a/test/stress-scale-tests/config/components/pod.yaml b/test/stress-scale-tests/config/components/pod.yaml new file mode 100644 index 000000000..2c417db0a --- /dev/null +++ b/test/stress-scale-tests/config/components/pod.yaml @@ -0,0 +1,30 @@ +# Pod configuration for test workloads +pod_config: + # Container image to use for test pods + image: alpine + + # Command to run in the container + command: + - /bin/sh + - -c + + # Arguments for the command + args: + - touch /data/pod-ready && while true; do sleep 30; done + + # Node selector to control pod placement + # IMPORTANT: Must match your cluster's availability zone + node_selector: + topology.kubernetes.io/zone: us-west-1b + + # Pod readiness probe configuration + readiness_probe: + initial_delay_seconds: 5 + period_seconds: 5 + + # Pod tolerations for node placement + tolerations: + - effect: NoSchedule + key: instance + operator: Equal + value: core diff --git 
a/test/stress-scale-tests/config/components/scenarios.yaml b/test/stress-scale-tests/config/components/scenarios.yaml new file mode 100644 index 000000000..05bacb580 --- /dev/null +++ b/test/stress-scale-tests/config/components/scenarios.yaml @@ -0,0 +1,34 @@ +# Test scenario configurations +scenarios: + # Multiple pods accessing a single PVC + many_to_one: + enabled: true + max_pods: 20 + min_pods: 10 + + # One-to-one pod to PVC testing + one_to_one: + enabled: true + max_pairs: 20 + min_pairs: 10 + + # Concurrent PVC operations testing + concurrent_pvc: + enabled: true + max_pvcs: 30 + min_pvcs: 20 + + # Advanced scenarios + + # Controller crash resilience testing + controller_crash: + enabled: true + recovery_timeout: 300 + controller_namespace: "kube-system" + controller_pod_selector: "app=efs-csi-controller" + + # Add your custom scenarios here + # custom_scenario: + # enabled: false + # parameter1: value1 + # parameter2: value2 diff --git a/test/stress-scale-tests/config/components/storage.yaml b/test/stress-scale-tests/config/components/storage.yaml new file mode 100644 index 000000000..d1294bdbd --- /dev/null +++ b/test/stress-scale-tests/config/components/storage.yaml @@ -0,0 +1,47 @@ +# Storage class configuration for EFS volumes +storage_class: + # Name of the storage class to create + name: efs-sc + + # Mount options for EFS volumes + mount_options: + - tls # Enable TLS for secure data transfer + - hard # Hard mount for better reliability + - nfsvers=4.1 # NFS version to use + + # Storage class parameters + parameters: + # Base path for dynamic provisioning (optional) + basePath: /dynamic_provisioning_test + + # Whether to delete access point root directory on PVC deletion + deleteAccessPointRootDir: 'true' + + # Directory permissions for new access points + directoryPerms: '700' + + # EFS filesystem ID - MUST match the filesystem_id in driver.yaml + fileSystemId: fs-024e09bcfd37c74aa + + # GID range for access point creation + gidRangeStart: '1500' + gidRangeEnd: '2000' + + # UID range for access point creation + uidRangeStart: '1500' + uidRangeEnd: '2000' + + # Provisioning mode (efs-ap for access point mode) + provisioningMode: efs-ap + + # AWS region where the EFS filesystem exists + region: us-west-1 + + # Availability zone for the EFS mount targets + availabilityZoneName: us-west-1b + + # What happens to PVs when PVCs are deleted + reclaim_policy: Delete + + # When volumes are bound to pods + volume_binding_mode: Immediate diff --git a/test/stress-scale-tests/config/components/test.yaml b/test/stress-scale-tests/config/components/test.yaml new file mode 100644 index 000000000..b1cad7e72 --- /dev/null +++ b/test/stress-scale-tests/config/components/test.yaml @@ -0,0 +1,58 @@ +# Core test configuration +test: + # Test duration in seconds + duration: 3000 + # Kubernetes namespace for test resources + namespace: efs-stress-test + # Time between operations in seconds + operation_interval: 2 + +# Resource limits to prevent overload +resource_limits: + # Maximum pods that can use a single PVC + max_pods_per_pvc: 50 + # Maximum number of PVCs to create + max_pvcs: 1000 + # Maximum total pods across all PVCs + total_max_pods: 3000 + +# Operation frequency weights (higher = more frequent) +operation_weights: + # Core operations + create_pvc: 30 + attach_pod: 25 + delete_pod: 25 + delete_pvc: 25 + # Additional operations + verify_readwrite: 25 + run_specific_scenario: 25 + +# Retry configuration for failed operations +retries: + max_attempts: 3 + retry_delay: 1 + +# Logging 
configuration +logging: + # Enable console output + console_enabled: true + # Enable file logging + file_enabled: true + # Log level (DEBUG, INFO, WARNING, ERROR) + level: INFO + +# Metrics collection settings +metrics: + # Track operation durations + collect_operation_durations: true + # Track system metrics (CPU, memory, etc) + collect_system_metrics: true + +# Report generation settings +reporting: + # Generate JSON report + json_report: true + # Generate summary report + summary_report: true + # Directory for report output + report_directory: reports/orchestrator diff --git a/test/stress-scale-tests/config/orchestrator_config.yaml b/test/stress-scale-tests/config/orchestrator_config.yaml new file mode 100644 index 000000000..67502337d --- /dev/null +++ b/test/stress-scale-tests/config/orchestrator_config.yaml @@ -0,0 +1,236 @@ +# AWS EFS CSI Driver Orchestrator Configuration + +# Cluster Configuration +cluster: + # Whether to create a new Kubernetes cluster (true) or use an existing one (false) + create: true + # Kubernetes version to use when creating a new cluster + kubernetes_version: '1.28' + # Number of worker nodes in the cluster + node_count: 3 + # EC2 instance type for the worker nodes + node_type: t3.large + # AWS region where the cluster will be created + region: us-west-2 + +# CSI Driver Configuration +driver: + # Whether to create a new EFS filesystem (true) or use an existing one (false) + create_filesystem: true + # ID of the EFS filesystem to use, required when create_filesystem is false + filesystem_id: fs-024e09bcfd37c74aa + # Method to install the CSI driver (helm, manual, etc.) + install_method: helm + # Version of the CSI driver to install + version: 2.1.0 + +# Driver Resource Allocation +driver_resources: + # Controller pod configuration + controller: + # Whether to delete the access point root directory on PVC deletion + delete_access_point_root_dir: true + # Log verbosity level (1-9) + log_level: 5 + # Resource limits and requests for the controller pod + resources: + # Maximum resources the controller pod can use + limits: + cpu: 1000m + memory: 2.5Gi + # Minimum resources guaranteed to the controller pod + requests: + cpu: 200m + memory: 1Gi + + # Node daemonset configuration + node: + # Log verbosity level (1-9) + log_level: 5 + # Resource limits and requests for the node pods + resources: + # Maximum resources the node pod can use + limits: + cpu: 1000m + memory: 1Gi + # Minimum resources guaranteed to the node pod + requests: + cpu: 100m + memory: 128Mi + # Sidecar containers configuration + sidecars: + # Liveness probe sidecar container + liveness_probe: + # Resource limits and requests + resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 10m + memory: 20Mi + # Node driver registrar sidecar container + node_driver_registrar: + # Resource limits and requests + resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 10m + memory: 20Mi + +# Logging Configuration +logging: + # Whether to enable logging to console + console_enabled: true + # Whether to enable logging to file + file_enabled: true + # Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL) + level: INFO + +# Metrics Collection Configuration +metrics: + # Whether to collect operation duration metrics + collect_operation_durations: true + # Whether to collect system metrics (CPU, memory, etc.) 
+ collect_system_metrics: true + +# Operation Probability Weights +operation_weights: + # Weight for pod attachment operations (higher = more frequent) + attach_pod: 15 + # Weight for PVC creation operations + create_pvc: 25 + # Weight for pod deletion operations + delete_pod: 5 + # Weight for PVC deletion operations + delete_pvc: 10 + # Weight for running specific test scenarios + run_specific_scenario: 30 + # Weight for read/write verification operations + verify_readwrite: 25 + +# Pod Configuration +pod_config: + # Arguments passed to the container command + args: + - touch /data/pod-ready && while true; do sleep 30; done + # Command to run in the container + command: + - /bin/sh + - -c + # Container image to use for test pods + image: alpine + # Node selector for pod scheduling (currently commented out) + # node_selector: + # efs-issue: 'false' + # Readiness probe configuration + readiness_probe: + # Time to wait before first probe after container starts + initial_delay_seconds: 5 + # How often to perform the probe + period_seconds: 5 + # Pod tolerations for scheduling on tainted nodes + tolerations: + - effect: NoSchedule + key: instance + operator: Equal + value: core + +# Test Reporting Configuration +reporting: + # Whether to generate JSON reports + json_report: true + # Directory where reports will be stored + report_directory: reports/orchestrator + # Whether to generate a summary report + summary_report: true + +# Resource Limits for Tests +resource_limits: + # Maximum number of pods that can be attached to a single PVC + max_pods_per_pvc: 50 + # Maximum number of PVCs to create during testing + max_pvcs: 100 + # Maximum total number of pods to create across all tests + total_max_pods: 30 + +# Retry Configuration +retries: + # Maximum number of retry attempts for failed operations + max_attempts: 3 + # Delay in seconds between retry attempts + retry_delay: 1 + +# Test Scenario Configurations +scenarios: + # Concurrent PVC scenario - tests creating multiple PVCs concurrently + concurrent_pvc: + # Whether this scenario is enabled + enabled: true + # Maximum number of PVCs to create in this scenario + max_pvcs: 7 + # Minimum number of PVCs to create in this scenario + min_pvcs: 3 + + # Many-to-one scenario - tests multiple pods using the same PVC + many_to_one: + # Whether this scenario is enabled + enabled: true + # Maximum number of pods to create per PVC + max_pods: 5 + # Minimum number of pods to create per PVC + min_pods: 3 + + # One-to-one scenario - tests pods with their own dedicated PVCs + one_to_one: + # Whether this scenario is enabled + enabled: true + # Maximum number of pod-PVC pairs to create + max_pairs: 5 + # Minimum number of pod-PVC pairs to create + min_pairs: 3 + +# Storage Class Configuration +storage_class: + # Mount options for the volumes + mount_options: + - tls # Enable TLS for the NFS connection + - hard # Hard mount (retry indefinitely) + - nfsvers=4.1 # Use NFS version 4.1 + # Name of the storage class + name: efs-sc + # StorageClass parameters + parameters: + # Base path in the filesystem for dynamic provisioning + basePath: /dynamic_provisioning_test + # Whether to delete the access point root directory when PVC is deleted + deleteAccessPointRootDir: 'true' + # Directory permissions for new directories + directoryPerms: '700' + # EFS filesystem ID + fileSystemId: fs-024e09bcfd37c74aa + # End of GID range for access point + gidRangeEnd: '2000' + # Start of GID range for access point + gidRangeStart: '1500' + # Provisioning mode (efs-ap for access 
point mode) + provisioningMode: efs-ap + # End of UID range for access point + uidRangeEnd: '2000' + # Start of UID range for access point + uidRangeStart: '1500' + # What happens to PVs when PVCs are deleted (Delete or Retain) + reclaim_policy: Delete + # When to bind PVs to PVCs (Immediate or WaitForFirstConsumer) + volume_binding_mode: Immediate + +# Test Execution Configuration +test: + # Duration of the test in seconds + duration: 120 + # Kubernetes namespace where tests will be run + namespace: efs-stress-test + # Time in seconds between operations + operation_interval: 3 diff --git a/test/stress-scale-tests/requirements.txt b/test/stress-scale-tests/requirements.txt new file mode 100644 index 000000000..fd5016aa7 --- /dev/null +++ b/test/stress-scale-tests/requirements.txt @@ -0,0 +1,7 @@ +kubernetes +pytest +pyyaml +prometheus_client +pandas +psutil +boto3 diff --git a/test/stress-scale-tests/run_tests.py b/test/stress-scale-tests/run_tests.py new file mode 100644 index 000000000..7e38458ad --- /dev/null +++ b/test/stress-scale-tests/run_tests.py @@ -0,0 +1,396 @@ +#!/usr/bin/env python3 +import os +import sys +import yaml +import logging +import argparse +from datetime import datetime +from kubernetes import client, config +# Import test frameworks +from tests.orchestrator import EFSCSIOrchestrator +from utils.report_generator import ReportGenerator +from utils.metrics_collector import MetricsCollector +from utils.log_integration import collect_logs_on_test_failure +# Commented out to remove dependency on cluster setup +# from cluster_setup import ClusterSetup + +def setup_logging(config): + """Setup logging based on configuration + + Args: + config: Configuration dictionary + """ + log_config = config.get('logging', {}) + log_level = getattr(logging, log_config.get('level', 'INFO')) + log_file = log_config.get('file', 'logs/efs_tests.log') + + # Create logs directory if it doesn't exist + os.makedirs(os.path.dirname(log_file), exist_ok=True) + + logging.basicConfig( + level=log_level, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler(log_file), + logging.StreamHandler() + ] + ) + + return logging.getLogger(__name__) + +def parse_args(): + """Parse command line arguments + + Returns: + Parsed arguments + """ + parser = argparse.ArgumentParser(description='Run EFS CSI tests') + parser.add_argument( + '--config', + default='config/orchestrator_config.yaml', + help='Path to main configuration file (legacy option)' + ) + parser.add_argument( + '--config-dir', + default='config/components', + help='Path to directory containing component configuration files' + ) + parser.add_argument( + '--test-suite', + choices=['orchestrator', 'chaos', 'all'], + default='orchestrator', + help='Test suite to run' + ) + parser.add_argument( + '--duration', + type=int, + help='Duration in seconds for test execution' + ) + parser.add_argument( + '--interval', + type=int, + default=None, + help='Seconds to wait between operations (overrides config value)' + ) + parser.add_argument( + '--dry-run', + action='store_true', + help='Print what would be done without executing tests' + ) + # Driver pod name functionality commented out as not currently used + # parser.add_argument( + # '--driver-pod-name', + # default=None, + # help='Name of the EFS CSI driver pod for log collection (optional)' + # ) + + # Cluster setup options - kept for compatibility but functionality is disabled + parser.add_argument( + '--driver-version', + help='EFS CSI Driver version to 
install (DISABLED)' + ) + + return parser.parse_args() + +def check_credentials(): + """Check if credentials are valid by making a simple API call""" + try: + # Attempt to get a list of namespaces - a simple, harmless API call + api = client.CoreV1Api() + api.list_namespace(_request_timeout=10) + return True + except Exception as e: + error_str = str(e) + if "401" in error_str or "Unauthorized" in error_str: + return False + # For other types of errors, we assume credentials are valid but other issues exist + return True + +def print_credential_renewal_instructions(): + """Print instructions for renewing AWS credentials""" + print("\n" + "="*80) + print(f"{'AWS CREDENTIALS EXPIRED OR INVALID':^80}") + print("="*80) + print("\nYour AWS credentials have expired or are invalid.") + print("\nPlease check your AWS credentials and ensure they are properly configured.") + print("\nAfter renewing your credentials, try running the tests again.") + print("="*80 + "\n") + +def load_config(config_path): + """Load configuration from YAML file + + Args: + config_path: Path to configuration file + + Returns: + Loaded configuration as dictionary + """ + try: + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + return config + except Exception as e: + print(f"Error loading configuration: {e}") + sys.exit(1) + +def get_driver_pod_name(args, config): + """Get driver pod name from config + + Args: + args: Parsed command line arguments + config: Configuration dictionary + + Returns: + Driver pod name or None + """ + # Since driver-pod-name arg is commented out, we only check config + driver_pod_name = None + if 'driver' in config and 'pod_name' in config['driver']: + driver_pod_name = config['driver']['pod_name'] + return driver_pod_name + + +def initialize_components(config): + """Initialize report generator and metrics collector + + Args: + config: Configuration dictionary + + Returns: + report_generator, metrics_collector, report_dir + """ + report_dir = config.get('reporting', {}).get('output_dir', 'reports') + report_generator = ReportGenerator(output_dir=report_dir) + metrics_collector = MetricsCollector() + return report_generator, metrics_collector, report_dir + +def run_orchestrator_test(args, config, logger, metrics_collector, report_generator, report_dir): + """Run the orchestrator test suite + + Args: + args: Command line arguments + config: Configuration dictionary + logger: Logger instance + metrics_collector: Instance of MetricsCollector + report_generator: Instance of ReportGenerator + report_dir: Path to report directory + + Returns: + Orchestrator test results + """ + results = {} + + if args.test_suite not in ['orchestrator', 'all']: + return results + + logger.info("Running orchestrator stress test suite") + + if args.dry_run: + logger.info("DRY RUN MODE: Would run orchestrator with randomized operations") + return { + 'orchestrator': { + "status": "would_run", + "description": "Would run the orchestrator with randomized operations" + } + } + + # Set up the orchestrator + orchestrator = setup_orchestrator(args, config, logger, metrics_collector) + + # Run the test + logger.info(f"Starting orchestrator for {args.duration if args.duration else 'default'} seconds") + test_results = orchestrator.run_test() + + # Generate and save the test report + generate_test_report( + test_results, + report_dir, + report_generator, + metrics_collector, + logger + ) + + return {'orchestrator': test_results} + +def setup_orchestrator(args, config, logger, metrics_collector): + """Set up the 
orchestrator for testing + + Args: + args: Command line arguments + config: Configuration dictionary + logger: Logger instance + metrics_collector: Instance of MetricsCollector + + Returns: + Configured orchestrator instance + """ + # Get component configuration paths using the config-dir argument + config_dir = args.config_dir + component_configs = { + 'driver': os.path.join(config_dir, 'driver.yaml'), + 'storage': os.path.join(config_dir, 'storage.yaml'), + 'test': os.path.join(config_dir, 'test.yaml'), + 'pod': os.path.join(config_dir, 'pod.yaml'), + 'scenarios': os.path.join(config_dir, 'scenarios.yaml') + } + + # Log what we're doing + logger.info("Using component configuration files:") + for component, path in component_configs.items(): + logger.info(f" - {component}: {path}") + + # Create orchestrator with component configuration paths + orchestrator = EFSCSIOrchestrator( + component_configs=component_configs, + metrics_collector=metrics_collector, + ) + + # Override default test parameters if specified + if args.duration: + orchestrator.test_duration = args.duration + logger.info(f"Test duration overridden to {args.duration} seconds") + + if args.interval is not None: # Only override if explicitly specified + orchestrator.operation_interval = args.interval + logger.info(f"Operation interval overridden to {orchestrator.operation_interval} seconds") + else: + logger.info(f"Using operation interval from config: {orchestrator.operation_interval} seconds") + + return orchestrator + +def generate_test_report(test_results, report_dir, report_generator, metrics_collector, logger): + """Generate a test report with metrics and results + + Args: + test_results: Results from the test run + report_dir: Directory to store reports + report_generator: Report generator instance + metrics_collector: Metrics collector instance + logger: Logger instance + """ + # Make report directory + orchestrator_report_dir = os.path.join(report_dir, 'orchestrator') + os.makedirs(orchestrator_report_dir, exist_ok=True) + + # Create timestamp and test name + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + test_name = f"efs_orchestrator_{timestamp}" + + # Create report path + report_path = os.path.join(orchestrator_report_dir, f"{test_name}.json") + + # Add metadata, system info, and metrics to the results + system_info = report_generator._collect_system_info() + collected_metrics = metrics_collector.get_all_metrics() + + # Create full report + full_report = { + "test_name": test_name, + "test_type": "orchestrator", + "timestamp": timestamp, + "system_info": system_info, + "results": test_results, + "metrics": { + "file_performance": collected_metrics.get("file_performance", {}) + } + } + + # Log metrics + log_performance_metrics(collected_metrics, logger) + + # Write report to file + with open(report_path, 'w') as f: + import json + json.dump(full_report, f, indent=2) + + logger.info(f"Orchestrator report generated: {report_path}") + +def log_performance_metrics(collected_metrics, logger): + """Log performance metrics + + Args: + collected_metrics: Metrics data + logger: Logger instance + """ + logger.info("File performance metrics collected:") + for volume, metrics in collected_metrics.get("file_performance", {}).get("by_volume", {}).items(): + logger.info(f" Volume: {volume}") + # Log read metrics if available + if metrics["iops"].get("read") is not None: + logger.info(f" Read IOPS: {metrics['iops']['read']:.2f}") + if metrics["throughput"].get("read") is not None: + logger.info(f" Read Throughput: 
{metrics['throughput']['read']:.2f} MB/s") + + # Log write metrics if available + if metrics["iops"].get("write") is not None: + logger.info(f" Write IOPS: {metrics['iops']['write']:.2f}") + if metrics["throughput"].get("write") is not None: + logger.info(f" Write Throughput: {metrics['throughput']['write']:.2f} MB/s") + +def handle_test_failure(e, args, config, metrics_collector, logger): + """Handle test failure + + Args: + e: Exception + args: Command line arguments + config: Configuration dictionary + metrics_collector: Metrics collector instance + logger: Logger instance + """ + logger.error(f"Error running tests: {e}", exc_info=True) + + # Collect logs on failure + logger.info("Collecting logs due to test failure") + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + test_name = f"efs_orchestrator_failure_{timestamp}" + + # Driver pod name functionality commented out + # driver_pod_name = get_driver_pod_name(args, config) + # if driver_pod_name: + # logger.info(f"Using driver pod name from config: {driver_pod_name}") + + logs_path = collect_logs_on_test_failure(test_name, metrics_collector) # driver_pod_name parameter removed + if logs_path: + logger.info(f"Failure logs collected to: {logs_path}") + else: + logger.warning("Failed to collect logs") + + # Cluster cleanup functionality is completely disabled + logger.info("Cleanup will be handled by the test orchestrator") + +def main(): + """Main entry point""" + # Parse command line arguments + args = parse_args() + + # Load configuration + config = load_config(args.config) + + # Setup logging + logger = setup_logging(config) + logger.info(f"Starting EFS CSI tests with configuration from {args.config}") + + # Verify credentials before proceeding + logger.info("Verifying AWS credentials") + if not check_credentials(): + logger.error("AWS credentials are expired or invalid") + print_credential_renewal_instructions() + sys.exit(1) + + + # Initialize components + report_generator, metrics_collector, report_dir = initialize_components(config) + + # Run tests + try: + results = run_orchestrator_test( + args, config, logger, metrics_collector, report_generator, report_dir + ) + return results + except Exception as e: + handle_test_failure(e, args, config, metrics_collector, logger) + sys.exit(1) + +if __name__ == "__main__": + main() +# Enhanced modularized implementation diff --git a/test/stress-scale-tests/tests/__init__.py b/test/stress-scale-tests/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/stress-scale-tests/tests/orchestrator.py b/test/stress-scale-tests/tests/orchestrator.py new file mode 100644 index 000000000..cd3ba165e --- /dev/null +++ b/test/stress-scale-tests/tests/orchestrator.py @@ -0,0 +1,2665 @@ +#!/usr/bin/env python3 + +import random +import time +import yaml +import logging +import uuid +import os +import boto3 +from kubernetes import client, config +from datetime import datetime +from utils.log_integration import collect_logs_on_test_failure + +class EFSCSIOrchestrator: + """Orchestrator for testing EFS CSI driver operations""" + + def __init__(self, config_file=None, component_configs=None, namespace=None, metrics_collector=None, driver_pod_name=None): + """Initialize the orchestrator with configuration + + Args: + config_file: Path to single config file (legacy approach) + component_configs: Dictionary of component config files paths + namespace: Kubernetes namespace for test resources + metrics_collector: Metrics collector instance + driver_pod_name: Name of driver pod for 
log collection + """ + # Store driver pod name for log collection + self.driver_pod_name = driver_pod_name + + # Configure logger before anything else for early diagnostics + self.logger = logging.getLogger(__name__) + self._init_component_configuration(component_configs, namespace) + # Initialize clients and resources + self._init_kubernetes_clients() + self._init_metrics_collector(metrics_collector) + self._init_logging() # Now reconfigure logging with loaded config + self._init_test_parameters() + self._init_resource_tracking() + + self.logger.info("EFS CSI Orchestrator initialized") + + # Create namespace if it doesn't exist + self._ensure_namespace_exists() + + + + def _init_component_configuration(self, component_configs, namespace): + """Initialize configuration from component files + + Args: + component_configs: Dictionary mapping component names to file paths + namespace: Kubernetes namespace override (optional) + """ + if not component_configs: + self.logger.error("No component configs provided") + self.config = {} + self.namespace = namespace or 'default' + return + + self.logger.info("Loading configuration from component files") + self.config = {} + + # Load all component configs + components = { + 'driver': {'file': component_configs.get('driver'), 'key': 'driver'}, + 'storage': {'file': component_configs.get('storage'), 'key': 'storage_class'}, + 'test': {'file': component_configs.get('test'), 'key': None}, # Special handling for test + 'pod': {'file': component_configs.get('pod'), 'key': 'pod_config'}, + 'scenarios': {'file': component_configs.get('scenarios'), 'key': 'scenarios'} + } + + for component_name, details in components.items(): + self._load_component(component_name, details['file'], details['key']) + + # Set namespace from config or use default + test_namespace = None + if hasattr(self, 'test_config') and isinstance(self.test_config, dict): + if 'test' in self.test_config: + test_namespace = self.test_config.get('test', {}).get('namespace') + elif 'namespace' in self.test_config: + test_namespace = self.test_config.get('namespace') + + self.namespace = namespace or test_namespace or 'default' + self.logger.info(f"Using namespace: {self.namespace}") + + def _load_component(self, component_name, file_path, config_key): + """Load a component configuration file + + Args: + component_name: Name of the component (e.g., 'driver', 'storage') + file_path: Path to the component file + config_key: Key to use in self.config for this component, or None for special handling + """ + if not file_path or not os.path.exists(file_path): + self.logger.warning(f"Component file for {component_name} not found at {file_path}") + setattr(self, f"{component_name}_config", {}) + return + + try: + with open(file_path, 'r') as f: + component_data = yaml.safe_load(f) or {} + + # Store the complete component data + setattr(self, f"{component_name}_config", component_data) + self.logger.info(f"Loaded {component_name} config from {file_path}") + + # Special handling for test component which contains multiple top-level keys + if component_name == 'test': + # For test config, copy all top-level keys to self.config + for key, value in component_data.items(): + self.config[key] = value + self.logger.debug(f"Added {key} from test config to main config") + elif config_key: + # For other components, look for the specified key + if config_key in component_data: + self.config[config_key] = component_data[config_key] + self.logger.debug(f"Added {config_key} from {component_name} config to main config") + 
else: + # If the expected key isn't found, add the whole component + if len(component_data) > 0: + for key, value in component_data.items(): + self.config[key] = value + self.logger.debug(f"Added {key} from {component_name} config to main config") + else: + self.logger.warning(f"No data found in {component_name} config") + except Exception as e: + self.logger.error(f"Error loading {component_name} config: {e}") + setattr(self, f"{component_name}_config", {}) + + def _init_kubernetes_clients(self): + """Initialize Kubernetes API clients""" + config.load_kube_config() + self.core_v1 = client.CoreV1Api() + self.apps_v1 = client.AppsV1Api() + self.storage_v1 = client.StorageV1Api() + + def _init_metrics_collector(self, metrics_collector): + """Initialize metrics collector""" + from utils.metrics_collector import MetricsCollector + self.metrics_collector = metrics_collector or MetricsCollector() + + def _init_logging(self): + """Set up logging based on configuration""" + self.logger = logging.getLogger(__name__) + # Remove any existing handlers to prevent duplicates + self.logger.handlers.clear() + + log_level = getattr(logging, self.config.get('logging', {}).get('level', 'INFO')) + self.logger.setLevel(log_level) + # Prevent propagation to root logger to avoid duplicate logs + self.logger.propagate = False + + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + + # Console handler if enabled + if self.config.get('logging', {}).get('console_enabled', True): + console_handler = logging.StreamHandler() + console_handler.setFormatter(formatter) + self.logger.addHandler(console_handler) + + # File handler if enabled + if self.config.get('logging', {}).get('file_enabled', True): + os.makedirs('logs', exist_ok=True) + file_handler = logging.FileHandler(f'logs/orchestrator_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log') + file_handler.setFormatter(formatter) + self.logger.addHandler(file_handler) + + def _init_test_parameters(self): + """Initialize test parameters from configuration""" + # Test parameters + self.test_duration = self.config['test'].get('duration', 3600) # seconds + self.operation_interval = self.config['test'].get('operation_interval', 3) # seconds + + # Resource limits + resource_limits = self.config.get('resource_limits', {}) + self.max_pvcs = resource_limits.get('max_pvcs', 100) + self.max_pods_per_pvc = resource_limits.get('max_pods_per_pvc', 50) + self.total_max_pods = resource_limits.get('total_max_pods', 250) + + def _init_resource_tracking(self): + """Initialize resource tracking data structures""" + # Resource tracking + self.pvcs = [] # List of PVC names + self.pods = {} # Maps pvc_name -> list of pod_names + self.current_pod_count = 0 + + # Test status tracking + self.results = { + 'create_pvc': {'success': 0, 'fail': 0}, + 'attach_pod': {'success': 0, 'fail': 0}, + 'delete_pod': {'success': 0, 'fail': 0}, + 'delete_pvc': {'success': 0, 'fail': 0}, + 'verify_write': {'success': 0, 'fail': 0}, + 'verify_read': {'success': 0, 'fail': 0} + } + + # Initialize test scenarios + self.scenarios = { + 'shared_volume_rw': {'runs': 0, 'success': 0, 'fail': 0}, + 'many_to_one': {'runs': 0, 'success': 0, 'fail': 0}, + 'one_to_one': {'runs': 0, 'success': 0, 'fail': 0}, + 'concurrent_pvc': {'runs': 0, 'success': 0, 'fail': 0} + } + + def _ensure_namespace_exists(self): + """Create the namespace if it doesn't exist already""" + try: + # Check if namespace exists + self.core_v1.read_namespace(name=self.namespace) + self.logger.info(f"Namespace 
'{self.namespace}' already exists") + except client.exceptions.ApiException as e: + if e.status == 404: + # Create namespace if it doesn't exist + namespace_manifest = { + "apiVersion": "v1", + "kind": "Namespace", + "metadata": { + "name": self.namespace + } + } + + self.core_v1.create_namespace(body=namespace_manifest) + self.logger.info(f"Created namespace '{self.namespace}'") + else: + self.logger.error(f"Error checking namespace: {e}") + raise + + def deploy_csi_driver(self): + """ + Deploy or update the EFS CSI driver using Helm. + Uses the driver configuration from orchestrator_config.yaml. + """ + import subprocess + + self.logger.info("Deploying/updating EFS CSI driver with configuration") + + # Get driver configuration + driver_config = self.config.get('driver', {}) + + # Get repository and tag + repository = driver_config.get('repository', '745939127895.dkr.ecr.us-east-1.amazonaws.com/amazon/aws-efs-csi-driver') + tag = f"v{driver_config.get('version', '2.1.1')}" + + # Build Helm command with --force flag to adopt existing resources + cmd = [ + "helm", "upgrade", "--install", + "--force", # Add force flag to adopt existing resources + "aws-efs-csi-driver", + "aws-efs-csi-driver/aws-efs-csi-driver", + "--namespace", "kube-system", + "--set", f"image.repository={repository}", + "--set", f"image.tag={tag}", + "--set", "controller.serviceAccount.create=false", + "--set", "controller.serviceAccount.name=efs-csi-controller-sa", + "-f", "config/driver-values.yaml" + ] + + try: + self.logger.info(f"Running Helm command: {' '.join(cmd)}") + result = subprocess.run(cmd, capture_output=True, text=True) + + if result.returncode != 0: + self.logger.error(f"Error deploying CSI driver: {result.stderr}") + return False + + self.logger.info("EFS CSI deployed/updated successfully") + return True + except Exception as e: + self.logger.error(f"Exception while deploying CSI driver: {e}") + return False + + def run_test(self): + """ + Run the orchestrator test by randomly selecting operations + until the test duration is reached + """ + self.logger.info(f"Starting orchestrator test for {self.test_duration} seconds") + # Deploy the CSI driver with configuration + self.deploy_csi_driver() + start_time = time.time() + self._ensure_storage_class() + operations, weights = self._get_operations_and_weights() + cumulative_weights, total_weight = self._get_cumulative_weights(weights) + self._run_initial_operations() + operation_counts = {op.__name__: 0 for op, _ in operations} + + try: + while time.time() - start_time < self.test_duration: + self._run_random_operation(operations, cumulative_weights, total_weight, operation_counts) + time.sleep(self.operation_interval) + except KeyboardInterrupt: + self.logger.info("Test interrupted by user") + except Exception as e: + self._handle_unexpected_test_error(e) + finally: + elapsed = time.time() - start_time + self.logger.info(f"Test completed in {elapsed:.2f} seconds") + self._cleanup() + return self._generate_report() + + def _get_operations_and_weights(self): + weights = self.config.get('operation_weights', {}) + operations = [ + (self._create_pvc, weights.get('create_pvc', 25)), + (self._attach_pod, weights.get('attach_pod', 25)), + (self._delete_pod, weights.get('delete_pod', 20)), + (self._delete_pvc, weights.get('delete_pvc', 15)), + (self._verify_readwrite, weights.get('verify_readwrite', 15)), + (self._run_specific_scenario, weights.get('run_specific_scenario', 20)) + ] + operation_funcs, weights = zip(*operations) + return operations, weights + + def 
_get_cumulative_weights(self, weights): + cumulative_weights = [] + current_sum = 0 + for weight in weights: + current_sum += weight + cumulative_weights.append(current_sum) + total_weight = cumulative_weights[-1] + return cumulative_weights, total_weight + + def _run_initial_operations(self): + self.logger.info("Running each operation type once to ensure coverage") + self._create_pvc() + self._create_pvc() + self._attach_pod() + self._attach_pod() + self._attach_pod() + self._verify_readwrite() + self._run_specific_scenario() + self._delete_pod() + self._delete_pvc() + self.logger.info("Completed initial operation sequence, continuing with randomized operations") + + def _run_random_operation(self, operations, cumulative_weights, total_weight, operation_counts): + random_val = random.uniform(0, total_weight) + for i, (operation, _) in enumerate(operations): + if random_val <= cumulative_weights[i]: + op_name = operation.__name__ + operation_counts[op_name] = operation_counts.get(op_name, 0) + 1 + self.logger.info(f"Selected operation: {op_name} (selected {operation_counts[op_name]} times)") + operation() + break + + def _handle_unexpected_test_error(self, e): + self.logger.error(f"Unexpected error during test: {e}", exc_info=True) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + test_name = f"orchestrator_unexpected_failure_{timestamp}" + failed_resources = [] + for pvc_name in self.pvcs: + failed_resources.append({"type": "pvc", "name": pvc_name, "namespace": self.namespace}) + for pod_name in self.pods.get(pvc_name, []): + failed_resources.append({"type": "pod", "name": pod_name, "namespace": self.namespace}) + logs_path = collect_logs_on_test_failure( + test_name, + self.metrics_collector, + self.driver_pod_name, + failed_resources=failed_resources + ) + self.logger.info(f"Collected comprehensive failure logs to: {logs_path}") + + def _create_pvc(self): + """Create a PVC with access point using configured values""" + # Check if we've reached the maximum PVC count + if len(self.pvcs) >= self.max_pvcs: + self.logger.info("Maximum PVC count reached, skipping creation") + return + + pvc_name = f"test-pvc-{uuid.uuid4().hex[:8]}" + self.logger.info(f"Creating PVC: {pvc_name}") + + try: + # Build the PVC manifest from config + pvc_manifest = self._build_pvc_manifest(pvc_name) + + # Create and wait for PVC to be bound + success = self._create_and_wait_for_pvc(pvc_name, pvc_manifest) + + if not success: + self.logger.warning(f"PVC {pvc_name} creation process did not complete successfully") + + except Exception as e: + self.results['create_pvc']['fail'] += 1 + self.logger.error(f"Failed to create PVC: {e}") + + def _build_pvc_manifest(self, pvc_name): + """Build a PVC manifest based on configuration""" + pvc_config = self.config.get('pvc_config', {}) + + # Get storage class name from config + sc_name = self.config.get('storage_class', {}).get('name', 'efs-sc') + + # Get access modes from config or use default + access_modes = pvc_config.get('access_modes', ["ReadWriteMany"]) + + # Get storage size from config or use default + storage_size = pvc_config.get('storage_size', "1Gi") + + # Create base manifest + pvc_manifest = { + "apiVersion": "v1", + "kind": "PersistentVolumeClaim", + "metadata": {"name": pvc_name}, + "spec": { + "accessModes": access_modes, + "storageClassName": sc_name, + "resources": { + "requests": {"storage": storage_size} + } + } + } + + # Add metadata from config + self._add_pvc_metadata(pvc_manifest, pvc_config) + + return pvc_manifest + + def _add_pvc_metadata(self, 
pvc_manifest, pvc_config): + """Add metadata like annotations and labels to PVC manifest""" + # Add annotations if configured + pvc_annotations = pvc_config.get('annotations', {}) + if pvc_annotations: + pvc_manifest['metadata']['annotations'] = pvc_annotations + + # Add labels if configured + pvc_labels = pvc_config.get('labels', {}) + if pvc_labels: + pvc_manifest['metadata']['labels'] = pvc_labels + + return pvc_manifest + + def _create_and_wait_for_pvc(self, pvc_name, pvc_manifest): + """Create PVC and wait for it to be bound""" + # Create PVC + self.core_v1.create_namespaced_persistent_volume_claim( + namespace=self.namespace, + body=pvc_manifest + ) + + # Track the PVC + self.pvcs.append(pvc_name) + self.pods[pvc_name] = [] + + # Update results + self.results['create_pvc']['success'] += 1 + sc_name = pvc_manifest['spec']['storageClassName'] + self.logger.info(f"Created PVC: {pvc_name} with storage class {sc_name}") + + # Get timeout value from config or use default + retry_config = self.config.get('retries', {}) + pvc_bind_timeout = retry_config.get('pvc_bind_timeout', 30) + + # Wait for PVC to be bound + return self._wait_for_pvc_bound(pvc_name, timeout=pvc_bind_timeout) + + def _attach_pod(self, pvc_name=None): + """ + Attach a pod to a PVC. If pvc_name is provided, attach to that specific PVC. Otherwise, select a random PVC. + """ + if not self.pvcs: + self.logger.info("No PVCs available, skipping pod attachment") + return None + if self.current_pod_count >= self.total_max_pods: + self.logger.info("Maximum total pod count reached, skipping attachment") + return None + pvc_name = self._select_pvc_for_pod(pvc_name) + if pvc_name is None: + return None + pod_name = f"test-pod-{uuid.uuid4().hex[:8]}" + pod_manifest = self._build_pod_manifest(pod_name, pvc_name) + try: + self.core_v1.create_namespaced_pod(namespace=self.namespace, body=pod_manifest) + self._track_new_pod(pvc_name, pod_name) + self.logger.info(f"Created pod: {pod_name} using PVC: {pvc_name}") + if not self._wait_for_pod_ready(pod_name, timeout=60): + self.logger.warning(f"Timeout waiting for pod {pod_name} to be ready") + return None + return pod_name + except Exception as e: + self.results['attach_pod']['fail'] += 1 + self.logger.error(f"Failed to create pod: {e}") + return None + + def _select_pvc_for_pod(self, pvc_name): + if pvc_name is None or pvc_name not in self.pvcs: + pvc_name = random.choice(self.pvcs) + if len(self.pods[pvc_name]) >= self.max_pods_per_pvc: + self.logger.info(f"PVC {pvc_name} has reached max pods ({self.max_pods_per_pvc}), skipping") + return None + return pvc_name + + def _build_pod_manifest(self, pod_name, pvc_name): + """Build pod manifest using configuration values""" + pod_config = self.config.get('pod_config', {}) + + # Build the container specification + container = self._build_container_spec(pod_config) + + # Build pod metadata + metadata = self._build_pod_metadata(pod_name, pod_config) + + # Build pod spec + pod_spec = self._build_pod_spec(container, pvc_name, pod_config) + + # Combine into complete manifest + manifest = { + "apiVersion": "v1", + "kind": "Pod", + "metadata": metadata, + "spec": pod_spec + } + + return manifest + + def _build_container_spec(self, pod_config): + """Build the container specification from config""" + # Determine command arguments + args = pod_config.get('args') + if not args: + startup_script = self._get_pod_startup_script() + args = [startup_script] + + # Create base container spec + container = { + "name": "test-container", + "image": 
pod_config.get('image', 'alpine:latest'), + "volumeMounts": [{"name": "efs-volume", "mountPath": "/data"}], + } + + # Add command if specified in config + if 'command' in pod_config: + container["command"] = pod_config['command'] + elif not args: + # Default command if args not specified and command not in config + container["command"] = ["/bin/sh", "-c"] + + # Add args if available + if args: + container["args"] = args + + # Add readiness probe + container["readinessProbe"] = self._build_readiness_probe(pod_config) + + # Add resource constraints + container["resources"] = self._build_container_resources(pod_config) + + return container + + def _build_readiness_probe(self, pod_config): + """Build readiness probe configuration from pod config""" + readiness_probe = pod_config.get('readiness_probe', {}) + return { + "exec": { + "command": ["/bin/sh", "-c", "cat /data/pod-ready 2>/dev/null || cat /tmp/ready/pod-ready 2>/dev/null"] + }, + "initialDelaySeconds": readiness_probe.get('initial_delay_seconds', 15), + "periodSeconds": readiness_probe.get('period_seconds', 5), + "failureThreshold": readiness_probe.get('failure_threshold', 6), + "timeoutSeconds": readiness_probe.get('timeout_seconds', 5) + } + + def _build_container_resources(self, pod_config): + """Build container resources configuration from config""" + container_resources = self.config.get('pod_resources', {}) + return { + "requests": container_resources.get('requests', {"cpu": "100m", "memory": "64Mi"}), + "limits": container_resources.get('limits', {"cpu": "200m", "memory": "128Mi"}) + } + + def _build_pod_metadata(self, pod_name, pod_config): + """Build pod metadata from config""" + metadata = { + "name": pod_name, + "labels": {"app": "efs-test", "component": "stress-test"} + } + + # Add custom labels if specified + custom_labels = pod_config.get('labels', {}) + if custom_labels: + metadata['labels'].update(custom_labels) + + return metadata + + def _build_pod_spec(self, container, pvc_name, pod_config): + """Build pod spec from container and config""" + pod_spec = { + "containers": [container], + "volumes": [{ + "name": "efs-volume", + "persistentVolumeClaim": {"claimName": pvc_name} + }], + "tolerations": [ + {"key": "node.kubernetes.io/not-ready", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300}, + {"key": "node.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300} + ] + } + + # Add additional tolerations from config + if 'tolerations' in pod_config: + pod_spec['tolerations'].extend(pod_config['tolerations']) + + # Add node selector if specified in config + if 'node_selector' in pod_config: + pod_spec['nodeSelector'] = pod_config['node_selector'] + self.logger.info(f"Using node selector: {pod_config['node_selector']}") + + # Add any additional pod spec settings from config + pod_spec_settings = pod_config.get('pod_spec', {}) + for key, value in pod_spec_settings.items(): + if key not in pod_spec: + pod_spec[key] = value + + return pod_spec + + def _get_pod_startup_script(self): + """Get the pod startup script by composing script components""" + base_script = self._get_basic_pod_script() + stale_handle_detection = self._get_stale_handle_detection() + readiness_check = self._get_readiness_check_script() + health_check_loop = self._get_health_check_loop() + + return f"""#!/bin/sh +{base_script} +{stale_handle_detection} +{readiness_check} +{health_check_loop} +""" + + def _get_basic_pod_script(self): + """Get the basic startup and initialization script""" + 
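+        # The shell snippet returned below is the first stage of the pod's startup
+        # command: it logs the pod hostname, checks that the EFS mount at /data is
+        # accessible, and initializes the /tmp files used for stale-handle tracking.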
return """echo "Pod $(hostname) starting up" +ls -la /data || echo "ERROR: Cannot access /data directory" + +# Initialize stale handle tracking +mkdir -p /tmp/metrics +touch /tmp/stale_count""" + + def _get_stale_handle_detection(self): + """Get the stale file handle detection function""" + return """ +# Create stale handle detection functions +detect_stale_handle() { + # Args: $1 = path being checked + if [ $? -ne 0 ]; then + ERR_MSG=$(echo "$ERROR_OUTPUT" | grep -i "stale file handle") + if [ $? -eq 0 ]; then + echo "EFS_ERROR: STALE_FILE_HANDLE: path=$1, message=$ERR_MSG" + echo $(date +"%Y-%m-%d %H:%M:%S") > /tmp/stale_handle_detected + echo "$1: $ERR_MSG" >> /tmp/stale_count + # Count lines in stale_count file + STALE_COUNT=$(wc -l < /tmp/stale_count 2>/dev/null || echo 0) + echo "Stale file handle count: $STALE_COUNT" + fi + fi +} + +# Check for stale handles on volume root +echo "Testing volume access..." +ERROR_OUTPUT=$(ls -la /data 2>&1 1>/dev/null) +detect_stale_handle "/data" +""" + + def _get_readiness_check_script(self): + """Get the script for readiness check and file creation""" + return """ +for i in 1 2 3 4 5; do + echo "Attempt $i to create readiness file" + ERROR_OUTPUT=$(touch /data/pod-ready 2>&1) + if [ $? -eq 0 ]; then + echo "Successfully created /data/pod-ready" + break + else + echo "Failed to create readiness file on attempt $i: $ERROR_OUTPUT" + detect_stale_handle "/data/pod-ready" + + if [ $i -eq 5 ]; then + echo "All attempts failed, creating alternative readiness file" + mkdir -p /tmp/ready && touch /tmp/ready/pod-ready + fi + sleep 2 + fi +done +""" + + def _get_health_check_loop(self): + """Get the periodic health check loop script""" + return """ +# Periodic file system health checks +while true; do + # Every 30 seconds, check for stale handles + if [ $((RANDOM % 3)) -eq 0 ]; then # Do checks randomly to spread load + TEST_FILE="/data/test-$(date +%s).txt" + ERROR_OUTPUT=$(touch $TEST_FILE 2>&1) + detect_stale_handle "$TEST_FILE" + + if [ -f "$TEST_FILE" ]; then + rm $TEST_FILE 2>/dev/null + fi + fi + sleep 30 +done +""" + + def _track_new_pod(self, pvc_name, pod_name): + self.pods[pvc_name].append(pod_name) + self.current_pod_count += 1 + self.results['attach_pod']['success'] += 1 + + def _delete_pod(self, pod_name=None, pvc_name=None, force=False): + """ + Delete a pod. If pod_name and pvc_name are provided, delete that specific pod. Otherwise, select a random pod. 
+ """ + pvc_name, pod_name = self._select_pod_for_deletion(pod_name, pvc_name) + if not pod_name: + return False + self.logger.info(f"Deleting pod: {pod_name} from PVC: {pvc_name}") + try: + self._delete_pod_k8s(pod_name, force) + if not self._wait_for_pod_deleted(pod_name): + self.logger.warning(f"Timeout waiting for pod {pod_name} to be deleted") + return False + self._untrack_deleted_pod(pvc_name, pod_name) + self.logger.info(f"Deleted pod: {pod_name}") + self.results['delete_pod']['success'] += 1 + return True + except Exception as e: + self.results['delete_pod']['fail'] += 1 + self.logger.error(f"Failed to delete pod {pod_name}: {e}") + return False + + def _select_pod_for_deletion(self, pod_name, pvc_name): + if pod_name is None or pvc_name is None: + all_pods = [(pvc, pod) for pvc, pod_list in self.pods.items() for pod in pod_list] + if not all_pods: + self.logger.info("No pods to delete") + return (None, None) + return random.choice(all_pods) + elif pod_name not in self.pods.get(pvc_name, []): + self.logger.warning(f"Pod {pod_name} not found in PVC {pvc_name}") + return (None, None) + return (pvc_name, pod_name) + + def _delete_pod_k8s(self, pod_name, force): + if force: + grace_period_seconds = 0 + propagation_policy = 'Background' + self.logger.info(f"Force deleting pod {pod_name} with grace period 0") + else: + grace_period_seconds = None + propagation_policy = 'Foreground' + self.core_v1.delete_namespaced_pod( + name=pod_name, + namespace=self.namespace, + grace_period_seconds=grace_period_seconds, + propagation_policy=propagation_policy + ) + + def _untrack_deleted_pod(self, pvc_name, pod_name): + if pod_name in self.pods.get(pvc_name, []): + self.pods[pvc_name].remove(pod_name) + self.current_pod_count -= 1 + + def _delete_pvc(self, pvc_name=None, force=False): + """ + Delete a PVC. If pvc_name is provided, delete that specific PVC. Otherwise, select a random PVC. 
+ """ + pvc_name = self._select_pvc_for_deletion(pvc_name) + if not pvc_name: + return False + self.logger.info(f"Deleting PVC: {pvc_name}") + self._delete_all_pods_for_pvc(pvc_name) + try: + self._delete_pvc_k8s(pvc_name, force) + if not self._wait_for_pvc_deleted(pvc_name): + self.logger.warning(f"Timeout waiting for PVC {pvc_name} to be deleted") + return False + self._untrack_deleted_pvc(pvc_name) + self.logger.info(f"Deleted PVC: {pvc_name}") + self.results['delete_pvc']['success'] += 1 + return True + except Exception as e: + self.results['delete_pvc']['fail'] += 1 + self.logger.error(f"Failed to delete PVC {pvc_name}: {e}") + return False + + def _select_pvc_for_deletion(self, pvc_name): + if not self.pvcs: + self.logger.info("No PVCs to delete") + return None + if pvc_name is None or pvc_name not in self.pvcs: + return random.choice(self.pvcs) + return pvc_name + + def _delete_all_pods_for_pvc(self, pvc_name): + if self.pods.get(pvc_name): + self.logger.info(f"Deleting {len(self.pods[pvc_name])} pods using PVC {pvc_name}") + for pod_name in list(self.pods[pvc_name]): + self._delete_pod(pod_name, pvc_name) + + def _delete_pvc_k8s(self, pvc_name, force): + if force: + grace_period_seconds = 0 + propagation_policy = 'Background' + self.logger.info(f"Force deleting PVC {pvc_name} with grace period 0") + else: + grace_period_seconds = None + propagation_policy = 'Foreground' + self.core_v1.delete_namespaced_persistent_volume_claim( + name=pvc_name, + namespace=self.namespace, + grace_period_seconds=grace_period_seconds, + propagation_policy=propagation_policy + ) + + def _untrack_deleted_pvc(self, pvc_name): + if pvc_name in self.pvcs: + self.pvcs.remove(pvc_name) + if pvc_name in self.pods: + del self.pods[pvc_name] + + def _verify_readwrite(self): + """ + Verify read/write operations between pods sharing a PVC + This tests that pods sharing the same volume can see each other's writes + """ + # Find PVCs that have multiple pods + shared_pvcs = [(pvc, pods) for pvc, pods in self.pods.items() if len(pods) >= 2] + if not shared_pvcs: + self.logger.info("No shared PVCs with multiple pods for read/write test") + return + pvc_name, pod_names = random.choice(shared_pvcs) + if len(pod_names) < 2: + return + writer_pod = random.choice(pod_names) + reader_pod = random.choice([p for p in pod_names if p != writer_pod]) + test_file = f"test-{uuid.uuid4().hex[:8]}.txt" + test_content = f"Test content: {uuid.uuid4()}" * 50 + content_size_bytes = len(test_content.encode('utf-8')) + self.logger.info(f"Testing read/write between pods {writer_pod} and {reader_pod} sharing PVC {pvc_name}") + self.logger.info(f"File size: {content_size_bytes} bytes") + try: + write_success, write_duration = self._run_write_op(writer_pod, test_file, test_content, pvc_name, content_size_bytes) + if not write_success: + self._track_rw_failure('write') + self._track_scenario_failure('shared_volume_rw') + return + time.sleep(2) + read_success, read_duration, resp = self._run_read_op(reader_pod, test_file, test_content, pvc_name, content_size_bytes) + if read_success: + self._track_rw_success('read') + self._track_scenario_success('shared_volume_rw') + self._run_metadata_ls(reader_pod, pvc_name) + else: + self._track_rw_failure('read') + self._track_scenario_failure('shared_volume_rw') + self.logger.error(f"Pod {reader_pod} failed to read content written by {writer_pod}. 
Got different content length: {len(resp)} vs expected {len(test_content)}") + except Exception as e: + self.logger.error(f"Failed in read/write verification: {e}") + self._track_rw_failure('write') + self._track_scenario_failure('shared_volume_rw') + + def _run_write_op(self, writer_pod, test_file, test_content, pvc_name, content_size_bytes): + import subprocess + write_op_start = time.time() + write_cmd = f"kubectl exec -n {self.namespace} {writer_pod} -- /bin/sh -c 'echo \"{test_content}\" > /data/{test_file}'" + self.logger.info(f"Executing write command: {write_cmd}") + try: + write_process = subprocess.run( + write_cmd, + shell=True, + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + write_duration = time.time() - write_op_start + self.metrics_collector.track_file_operation_latency(pvc_name, "write", write_duration) + self.metrics_collector.track_file_operation_iops(pvc_name, "write", 1, write_duration) + self.metrics_collector.track_file_operation_throughput(pvc_name, "write", content_size_bytes, write_duration) + self.logger.info(f"Write operation completed in {write_duration:.3f}s") + self.logger.info(f"Write throughput: {(content_size_bytes / 1024 / 1024) / write_duration:.2f} MB/s") + self._track_rw_success('write') + self.logger.info(f"Pod {writer_pod} wrote to /data/{test_file}") + return True, write_duration + except subprocess.CalledProcessError as e: + self.logger.error(f"Write command execution failed: {e}") + self.logger.error(f"Command stderr: {e.stderr}") + return False, 0 + + def _run_read_op(self, reader_pod, test_file, test_content, pvc_name, content_size_bytes): + import subprocess + read_op_start = time.time() + read_cmd = f"kubectl exec -n {self.namespace} {reader_pod} -- cat /data/{test_file}" + self.logger.info(f"Executing read command: {read_cmd}") + try: + read_process = subprocess.run( + read_cmd, + shell=True, + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + read_duration = time.time() - read_op_start + self.metrics_collector.track_file_operation_latency(pvc_name, "read", read_duration) + self.metrics_collector.track_file_operation_iops(pvc_name, "read", 1, read_duration) + self.metrics_collector.track_file_operation_throughput(pvc_name, "read", content_size_bytes, read_duration) + resp = read_process.stdout.strip() + self.logger.info(f"Read operation completed in {read_duration:.3f}s") + self.logger.info(f"Read throughput: {(content_size_bytes / 1024 / 1024) / read_duration:.2f} MB/s") + self.logger.info(f"Read result length: {len(resp)} bytes") + if test_content in resp: + self.logger.info(f"Pod {reader_pod} successfully read content written by writer pod") + return True, read_duration, resp + else: + return False, read_duration, resp + except subprocess.CalledProcessError as e: + self.logger.error(f"Read command execution failed: {e}") + self.logger.error(f"Command stderr: {e.stderr}") + return False, 0, '' + + def _run_metadata_ls(self, reader_pod, pvc_name): + import subprocess + meta_op_start = time.time() + ls_cmd = f"kubectl exec -n {self.namespace} {reader_pod} -- ls -la /data/" + try: + ls_process = subprocess.run( + ls_cmd, + shell=True, + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + meta_duration = time.time() - meta_op_start + self.metrics_collector.track_file_operation_latency(pvc_name, "metadata", meta_duration) + self.metrics_collector.track_file_operation_iops(pvc_name, "metadata", 1, meta_duration) + self.logger.info(f"Metadata operation 
(ls) completed in {meta_duration:.3f}s")
+        except subprocess.CalledProcessError as e:
+            self.logger.error(f"Metadata ls command failed: {e}")
+            self.logger.error(f"Command stderr: {e.stderr}")
+
+    def _track_rw_success(self, op_type):
+        if op_type == 'write':
+            self.results['verify_write']['success'] += 1
+        elif op_type == 'read':
+            self.results['verify_read']['success'] += 1
+
+    def _track_rw_failure(self, op_type):
+        if op_type == 'write':
+            self.results['verify_write']['fail'] += 1
+        elif op_type == 'read':
+            self.results['verify_read']['fail'] += 1
+
+    def _track_scenario_success(self, scenario):
+        self.scenarios[scenario]['runs'] += 1
+        self.scenarios[scenario]['success'] += 1
+
+    def _track_scenario_failure(self, scenario):
+        self.scenarios[scenario]['runs'] += 1
+        self.scenarios[scenario]['fail'] += 1
+
+    def _run_specific_scenario(self):
+        """
+        Run a single test scenario, selected at random from the available scenarios
+        """
+        scenarios = [
+            self._scenario_many_to_one,
+            self._scenario_one_to_one,
+            self._scenario_concurrent_pvc,
+            self._scenario_controller_crash_test
+        ]
+
+        # scenarios = [self._scenario_controller_crash_test]
+
+        # Add controller crash test if enabled
+        # if self.config["scenarios"].get("controller_crash", {}).get("enabled", False):
+        #     scenarios.append(self._scenario_controller_crash_test)
+
+        # Pick a random scenario
+        scenario = random.choice(scenarios)
+        scenario_name = scenario.__name__
+
+        # Enhanced logging - make it very clear which scenario was selected
+        self.logger.info("=" * 60)
+        self.logger.info(f"SELECTED SCENARIO: {scenario_name}")
+        self.logger.info("=" * 60)
+
+        # Execute the scenario
+        scenario()
+
+        # Log when scenario completes
+        self.logger.info(f"COMPLETED SCENARIO: {scenario_name}")
+        self.logger.info("-" * 60)
+
+    def _scenario_many_to_one(self):
+        """
+        Test many pods mounting a single PVC
+        1. Create one PVC
+        2. Create multiple pods that all mount the same PVC
+        3. Verify pods can read/write successfully using kubectl subprocess
+        4. 
Clean up + """ + self.logger.info("+" * 80) + self.logger.info("STARTING MANY-TO-ONE SCENARIO DIAGNOSTICS") + self.logger.info("+" * 80) + self.scenarios['many_to_one']['runs'] += 1 + try: + pvc_name = self._create_many_to_one_pvc() + if not pvc_name: + self.scenarios['many_to_one']['fail'] += 1 + return + pod_names = self._create_many_to_one_pods(pvc_name) + if len(pod_names) < 2: + self.logger.error(f"[MANY2ONE] FAILED: Insufficient pods created ({len(pod_names)}), need at least 2 for read/write test") + self.scenarios['many_to_one']['fail'] += 1 + return + success = self._test_many_to_one_rw(pvc_name, pod_names) + if success: + self.logger.info(f"[MANY2ONE] SUCCESS: Many-to-one scenario successful with {len(pod_names)} pods") + self.scenarios['many_to_one']['success'] += 1 + else: + self.scenarios['many_to_one']['fail'] += 1 + self._collect_many_to_one_failure_logs(pvc_name, pod_names) + except Exception as e: + self.logger.error(f"[MANY2ONE] FAILED: Unhandled error in many-to-one scenario: {e}") + self.scenarios['many_to_one']['fail'] += 1 + self.logger.info("+" * 80) + self.logger.info("COMPLETED MANY-TO-ONE SCENARIO DIAGNOSTICS") + self.logger.info("+" * 80) + + def _create_many_to_one_pvc(self): + """Create a PVC for many-to-one scenario using configuration""" + # Generate PVC name with unique identifier + pvc_name = f"many2one-{uuid.uuid4().hex[:8]}" + + self.logger.info(f"[MANY2ONE] STEP 1: Creating dedicated PVC: {pvc_name}") + + try: + # Get configuration values + scenario_config = self.config.get('scenarios', {}).get('many_to_one', {}) + sc_name = self.config.get('storage_class', {}).get('name', 'efs-sc') + + # Create PVC manifest using config values + pvc_manifest = { + "apiVersion": "v1", + "kind": "PersistentVolumeClaim", + "metadata": {"name": pvc_name}, + "spec": { + "accessModes": ["ReadWriteMany"], # This is generally fixed for EFS + "storageClassName": sc_name, + "resources": { + "requests": {"storage": "1Gi"} # Size doesn't matter for EFS but required in PVC spec + } + } + } + + # Add annotations if configured + pvc_annotations = scenario_config.get('pvc_annotations', {}) + if pvc_annotations: + pvc_manifest['metadata']['annotations'] = pvc_annotations + + # Create PVC + self.core_v1.create_namespaced_persistent_volume_claim( + namespace=self.namespace, + body=pvc_manifest + ) + + # Track the PVC + self.pvcs.append(pvc_name) + self.pods[pvc_name] = [] + + # Get timeout from config + retry_config = self.config.get('retries', {}) + pvc_bind_timeout = retry_config.get('pvc_bind_timeout', 30) + + self.logger.info(f"[MANY2ONE] PVC {pvc_name} created with storage class {sc_name}") + + # Wait for PVC to be bound with configured timeout + if not self._wait_for_pvc_bound(pvc_name, timeout=pvc_bind_timeout): + self.logger.error(f"[MANY2ONE] FAILED: Timeout waiting for PVC {pvc_name} to be bound after {pvc_bind_timeout}s") + return None + + return pvc_name + + except Exception as e: + self.logger.error(f"[MANY2ONE] FAILED: Error creating PVC: {e}") + return None + + def _create_many_to_one_pods(self, pvc_name): + # Get pod count range from config or use defaults + scenario_config = self.config.get('scenarios', {}).get('many_to_one', {}) + min_pods = scenario_config.get('min_pods', 3) + max_pods = scenario_config.get('max_pods', 5) + num_pods = random.randint(min_pods, max_pods) + + self.logger.info(f"[MANY2ONE] STEP 2: Creating {num_pods} pods for the same PVC {pvc_name}") + pod_names = [] + for i in range(num_pods): + self.logger.info(f"[MANY2ONE] Creating pod {i+1}/{num_pods} 
for PVC {pvc_name}") + pod_name = self._attach_pod(pvc_name) + if pod_name: + self.logger.info(f"[MANY2ONE] Successfully created and attached pod {pod_name}") + pod_names.append(pod_name) + else: + self.logger.error(f"[MANY2ONE] Failed to create pod {i+1}/{num_pods}") + self.logger.info(f"[MANY2ONE] Created {len(pod_names)}/{num_pods} pods successfully") + return pod_names + + def _test_many_to_one_rw(self, pvc_name, pod_names): + import subprocess + test_file = f"many2one-{uuid.uuid4().hex[:8]}.txt" + test_content = f"Many2One test content: {uuid.uuid4()}" + writer_pod = random.choice(pod_names) + reader_pod = random.choice([p for p in pod_names if p != writer_pod]) + self.logger.info(f"[MANY2ONE] STEP 3: Testing read/write operations") + self.logger.info(f"[MANY2ONE] Writer pod: {writer_pod}, Reader pod: {reader_pod}") + try: + write_cmd = f"kubectl exec -n {self.namespace} {writer_pod} -- /bin/sh -c 'echo \"{test_content}\" > /data/{test_file}'" + self.logger.info(f"[MANY2ONE] Executing write command: {write_cmd}") + subprocess.run(write_cmd, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + time.sleep(5) + read_cmd = f"kubectl exec -n {self.namespace} {reader_pod} -- cat /data/{test_file}" + self.logger.info(f"[MANY2ONE] Executing read command: {read_cmd}") + read_process = subprocess.run(read_cmd, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + read_result = read_process.stdout.strip() + self.logger.info(f"[MANY2ONE] Read command result: '{read_result}'") + return test_content in read_result + except Exception as e: + self.logger.error(f"[MANY2ONE] FAILED: Error during read/write test: {e}") + return False + + def _collect_many_to_one_failure_logs(self, pvc_name, pod_names): + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + test_name = f"many2one_failure_{timestamp}" + failed_resources = ( + [{"type": "pod", "name": pod, "namespace": self.namespace} for pod in pod_names] + + [{"type": "pvc", "name": pvc_name, "namespace": self.namespace}] + ) + logs_path = collect_logs_on_test_failure( + test_name, + self.metrics_collector, + self.driver_pod_name, + failed_resources=failed_resources + ) + self.logger.info(f"Collected detailed failure logs to: {logs_path}") + + def _scenario_one_to_one(self): + """ + Test one pod per PVC scenario + 1. Create multiple PVCs + 2. Create one pod per PVC + 3. Verify each pod can write to its own volume using kubectl subprocess + 4. 
Clean up + """ + self.logger.info("Running scenario: One pod per PVC") + self.scenarios['one_to_one']['runs'] += 1 + + # Get pair count range from config + scenario_config = self.config.get('scenarios', {}).get('one_to_one', {}) + min_pairs = scenario_config.get('min_pairs', 3) + max_pairs = scenario_config.get('max_pairs', 5) + + # Use configured values instead of hardcoded ones + num_pairs = random.randint(min_pairs, max_pairs) + self.logger.info(f"Creating {num_pairs} PVC-pod pairs (range from config: {min_pairs}-{max_pairs})") + pairs = self._create_one_to_one_pairs(num_pairs) + if len(pairs) < 2: + self.logger.warning(f"Failed to create enough PVC-pod pairs, only created {len(pairs)}") + self.scenarios['one_to_one']['fail'] += 1 + return + success = self._test_one_to_one_rw(pairs) + if success: + self.logger.info(f"[ONE2ONE] One-to-one scenario successful with {len(pairs)} PVC-pod pairs") + self.scenarios['one_to_one']['success'] += 1 + else: + self.logger.error("[ONE2ONE] One-to-one scenario failed") + self.scenarios['one_to_one']['fail'] += 1 + self._collect_one_to_one_failure_logs(pairs) + + def _create_one_to_one_pairs(self, num_pairs): + """Create pairs of PVCs and pods for one-to-one scenario""" + # Get the number of pairs to create + num_pairs = self._get_one_to_one_pair_count(num_pairs) + self.logger.info(f"[ONE2ONE] Creating {num_pairs} PVC-pod pairs") + + # Create the pairs + pairs = [] + for i in range(num_pairs): + pair = self._create_one_to_one_pair() + if pair: + pairs.append(pair) + + return pairs + + def _get_one_to_one_pair_count(self, requested_pairs): + """Determine how many PVC-pod pairs to create based on config and request""" + # Get configuration for one-to-one scenario + scenario_config = self.config.get('scenarios', {}).get('one_to_one', {}) + min_pairs = scenario_config.get('min_pairs', 3) + max_pairs = scenario_config.get('max_pairs', 5) + + # If requested_pairs wasn't specified, use configured range + if requested_pairs <= 0: + pairs = random.randint(min_pairs, max_pairs) + self.logger.info(f"[ONE2ONE] Using configured range: creating {pairs} PVC-pod pairs") + return pairs + + return requested_pairs + + def _create_one_to_one_pair(self): + """Create a single PVC-pod pair for one-to-one scenario""" + # Generate PVC name + pvc_name = f"one2one-{uuid.uuid4().hex[:8]}" + + # Create the PVC manifest + pvc_manifest = self._build_one_to_one_pvc_manifest(pvc_name) + + try: + # Create the PVC + self.core_v1.create_namespaced_persistent_volume_claim( + namespace=self.namespace, + body=pvc_manifest + ) + self.pvcs.append(pvc_name) + self.pods[pvc_name] = [] + + # Get timeout from config + retry_config = self.config.get('retries', {}) + pvc_bind_timeout = retry_config.get('pvc_bind_timeout', 30) + + # Wait for PVC to be bound + if not self._wait_for_pvc_bound(pvc_name, timeout=pvc_bind_timeout): + self.logger.warning(f"[ONE2ONE] Timeout waiting for PVC {pvc_name} to be bound after {pvc_bind_timeout}s") + return None + + # Create and attach pod + pod_name = self._attach_pod(pvc_name) + if pod_name: + self.logger.info(f"[ONE2ONE] Successfully created pair: PVC {pvc_name}, Pod {pod_name}") + return (pvc_name, pod_name) + + return None + + except Exception as e: + self.logger.error(f"[ONE2ONE] Error creating PVC or pod: {e}") + return None + + def _build_one_to_one_pvc_manifest(self, pvc_name): + """Build PVC manifest for one-to-one scenario""" + scenario_config = self.config.get('scenarios', {}).get('one_to_one', {}) + + # Get storage class name from config + sc_name = 
self.config.get('storage_class', {}).get('name', 'efs-sc') + + # Create base manifest + pvc_manifest = { + "apiVersion": "v1", + "kind": "PersistentVolumeClaim", + "metadata": {"name": pvc_name}, + "spec": { + "accessModes": ["ReadWriteMany"], + "storageClassName": sc_name, + "resources": {"requests": {"storage": "1Gi"}} + } + } + + # Add any PVC annotations if configured + pvc_annotations = scenario_config.get('pvc_annotations', {}) + if pvc_annotations: + if 'metadata' not in pvc_manifest: + pvc_manifest['metadata'] = {} + pvc_manifest['metadata']['annotations'] = pvc_annotations + + return pvc_manifest + + def _test_one_to_one_rw(self, pairs): + import subprocess + for pvc_name, pod_name in pairs: + test_file = f"one2one-{uuid.uuid4().hex[:8]}.txt" + test_content = f"One2One test content for {pvc_name}: {uuid.uuid4()}" + try: + write_cmd = f"kubectl exec -n {self.namespace} {pod_name} -- /bin/sh -c 'echo \"{test_content}\" > /data/{test_file}'" + self.logger.info(f"[ONE2ONE] Executing write command for pod {pod_name}: {write_cmd}") + subprocess.run( + write_cmd, + shell=True, + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + read_cmd = f"kubectl exec -n {self.namespace} {pod_name} -- cat /data/{test_file}" + self.logger.info(f"[ONE2ONE] Executing read command for pod {pod_name}: {read_cmd}") + read_process = subprocess.run( + read_cmd, + shell=True, + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + read_result = read_process.stdout.strip() + self.logger.info(f"[ONE2ONE] Pod {pod_name} read result: '{read_result}'") + if test_content not in read_result: + self.logger.error(f"[ONE2ONE] Pod {pod_name} failed to read its own write. Expected '{test_content}', got '{read_result}'") + return False + else: + self.logger.info(f"[ONE2ONE] Pod {pod_name} successfully wrote and read from its own volume") + except subprocess.CalledProcessError as e: + self.logger.error(f"[ONE2ONE] Command execution failed for pod {pod_name}: {e}") + self.logger.error(f"[ONE2ONE] Command stderr: {e.stderr}") + return False + except Exception as e: + self.logger.error(f"[ONE2ONE] Error in one-to-one scenario for pod {pod_name}: {e}") + return False + return True + + def _collect_one_to_one_failure_logs(self, pairs): + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + test_name = f"one2one_failure_{timestamp}" + failed_resources = [] + for pvc_name, pod_name in pairs: + failed_resources.append({"type": "pod", "name": pod_name, "namespace": self.namespace}) + failed_resources.append({"type": "pvc", "name": pvc_name, "namespace": self.namespace}) + logs_path = collect_logs_on_test_failure( + test_name, + self.metrics_collector, + self.driver_pod_name, + failed_resources=failed_resources + ) + self.logger.info(f"Collected detailed failure logs to: {logs_path}") + + def _scenario_concurrent_pvc(self): + """ + Test rapid PVC creation and deletion + 1. Create multiple PVCs in quick succession + 2. Create pods for some of them + 3. Delete some PVCs in quick succession + 4. 
Verify operations successfully + """ + self.logger.info("Running scenario: Rapid PVC operations") + self.scenarios['concurrent_pvc']['runs'] += 1 + + # Get PVC count range from config + scenario_config = self.config.get('scenarios', {}).get('concurrent_pvc', {}) + min_pvcs = scenario_config.get('min_pvcs', 3) + max_pvcs = scenario_config.get('max_pvcs', 7) + + # Number of PVCs to create + num_pvcs = random.randint(min_pvcs, max_pvcs) + self.logger.info(f"Creating {num_pvcs} PVCs (range from config: {min_pvcs}-{max_pvcs})") + + pvc_names = [f"concurrent-pvc-{uuid.uuid4().hex[:8]}" for _ in range(num_pvcs)] + created_pvcs = [] + + try: + # Step 1: Create multiple PVCs in quick succession + created_pvcs = self._concurrent_create_pvcs(pvc_names) + + if len(created_pvcs) < 2: + self._mark_concurrent_scenario_failed(f"Failed to create enough PVCs, only created {len(created_pvcs)}") + return + + # Step 2: Create pods for some of the PVCs + self._concurrent_create_pods(created_pvcs) + + # Step 3: Delete some PVCs in quick succession + self._concurrent_delete_pvcs(created_pvcs, min_pvcs) + + # Mark scenario as successful + self.logger.info("Rapid PVC scenario completed successfully") + self.scenarios['concurrent_pvc']['success'] += 1 + + except Exception as e: + self._handle_concurrent_scenario_failure(e, created_pvcs) + + def _concurrent_create_pvcs(self, pvc_names): + """Create multiple PVCs in quick succession for the concurrent scenario""" + created_pvcs = [] + self.logger.info(f"Creating {len(pvc_names)} PVCs in quick succession") + + for pvc_name in pvc_names: + success = self._create_pvc_for_concurrent(pvc_name) + if success: + created_pvcs.append(pvc_name) + + self.logger.info(f"Successfully created {len(created_pvcs)} PVCs") + return created_pvcs + + def _concurrent_create_pods(self, created_pvcs): + """Create pods for some of the PVCs in the concurrent scenario""" + num_pods = min(len(created_pvcs), 3) + pod_pvcs = random.sample(created_pvcs, num_pods) + + self.logger.info(f"Creating {num_pods} pods for PVCs in concurrent scenario") + for pvc_name in pod_pvcs: + self._attach_pod(pvc_name) + + def _concurrent_delete_pvcs(self, created_pvcs, min_pvcs): + """Delete some PVCs in quick succession""" + num_to_delete = min(len(created_pvcs), min_pvcs) + pvcs_to_delete = random.sample(created_pvcs, num_to_delete) + + self.logger.info(f"Deleting {num_to_delete} PVCs in quick succession") + for pvc_name in pvcs_to_delete: + self._delete_pvc(pvc_name) + + def _mark_concurrent_scenario_failed(self, reason): + """Mark concurrent scenario as failed with a reason""" + self.logger.warning(reason) + self.scenarios['concurrent_pvc']['fail'] += 1 + + def _handle_concurrent_scenario_failure(self, e, created_pvcs): + """Handle failure in the concurrent PVC scenario""" + self.logger.error(f"Error in rapid PVC scenario: {e}") + self.scenarios['concurrent_pvc']['fail'] += 1 + + # Collect logs for failure diagnostics with detailed information + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + test_name = f"concurrent_pvc_failure_{timestamp}" + + # Track all resources involved in this scenario + failed_resources = self._collect_concurrent_failure_resources(created_pvcs) + + logs_path = collect_logs_on_test_failure( + test_name, + self.metrics_collector, + self.driver_pod_name, + failed_resources=failed_resources + ) + self.logger.info(f"Collected detailed failure logs to: {logs_path}") + + def _collect_concurrent_failure_resources(self, created_pvcs): + """Collect resources involved in concurrent scenario 
failure""" + failed_resources = [] + + # Add all created PVCs + for pvc_name in created_pvcs: + failed_resources.append({"type": "pvc", "name": pvc_name, "namespace": self.namespace}) + + # Add pods using those PVCs + for pod_name in self.pods.get(pvc_name, []): + failed_resources.append({"type": "pod", "name": pod_name, "namespace": self.namespace}) + + return failed_resources + + def _create_pvc_for_concurrent(self, pvc_name): + """ + Helper method for creating PVCs in concurrent scenario + Returns True if successful, False otherwise + """ + try: + # Get configuration values + scenario_config = self.config.get('scenarios', {}).get('concurrent_pvc', {}) + sc_name = self.config.get('storage_class', {}).get('name', 'efs-sc') + + # Create PVC manifest using config values + pvc_manifest = { + "apiVersion": "v1", + "kind": "PersistentVolumeClaim", + "metadata": {"name": pvc_name}, + "spec": { + "accessModes": ["ReadWriteMany"], # This is generally fixed for EFS + "storageClassName": sc_name, + "resources": { + "requests": {"storage": "1Gi"} # Size doesn't matter for EFS but required in PVC spec + } + } + } + + # Add annotations if configured + pvc_annotations = scenario_config.get('pvc_annotations', {}) + if pvc_annotations: + pvc_manifest['metadata']['annotations'] = pvc_annotations + + # Create PVC + self.core_v1.create_namespaced_persistent_volume_claim( + namespace=self.namespace, + body=pvc_manifest + ) + + # Track the PVC + self.pvcs.append(pvc_name) + self.pods[pvc_name] = [] + + # Update results + self.results['create_pvc']['success'] += 1 + self.logger.info(f"Created PVC: {pvc_name} with storage class {sc_name}") + + # Get timeout from config + retry_config = self.config.get('retries', {}) + pvc_bind_timeout = retry_config.get('pvc_bind_timeout', 30) + + # Wait for PVC to be bound + if not self._wait_for_pvc_bound(pvc_name, timeout=pvc_bind_timeout): + self.logger.warning(f"Timeout waiting for PVC {pvc_name} to be bound after {pvc_bind_timeout}s") + return False + + return True + + except Exception as e: + self.results['create_pvc']['fail'] += 1 + self.logger.error(f"Failed to create PVC {pvc_name} concurrently: {e}") + return False + + def _wait_for_pod_ready(self, pod_name, timeout=60): + """ + Wait for pod to be ready + Returns True if ready within timeout, False otherwise + """ + start_time = time.time() + self.logger.info(f"Waiting for pod {pod_name} to be ready") + + # For diagnostics + last_phase = None + diagnostic_logged = False + + while time.time() - start_time < timeout: + pod_status = self._check_pod_status(pod_name) + + # Pod not found + if pod_status.get('not_found', False): + return False + + # Update last_phase for tracking phase transitions + if pod_status.get('phase') != last_phase: + self.logger.info(f"Pod {pod_name} phase: {pod_status.get('phase')}") + last_phase = pod_status.get('phase') + + # Check if pod is ready + if pod_status.get('ready', False): + self.logger.info(f"Pod {pod_name} is ready") + return True + + # Check for failure states + if self._is_pod_in_failure_state(pod_status): + self._log_pod_diagnostics(pod_name) + return False + + # Check if we should log diagnostics + elapsed = time.time() - start_time + if self._should_log_wait_diagnostics(elapsed, timeout, diagnostic_logged): + self.logger.info(f"Pod {pod_name} taking longer than expected to become ready ({elapsed:.1f}s). 
Collecting diagnostics...") + self._log_pod_diagnostics(pod_name) + diagnostic_logged = True + + time.sleep(2) + + self.logger.warning(f"Timeout waiting for pod {pod_name} to be ready after {timeout}s") + self._log_pod_diagnostics(pod_name) + return False + + def _check_pod_status(self, pod_name): + """Check pod status and return information about its current state""" + try: + pod = self.core_v1.read_namespaced_pod_status( + name=pod_name, + namespace=self.namespace + ) + + status = { + 'phase': pod.status.phase, + 'ready': False, + 'conditions': [], + 'all_conditions_text': '' + } + + # Extract conditions if available + if pod.status.phase == "Running" and pod.status.conditions: + all_conditions = [] + + for condition in pod.status.conditions: + condition_text = f"{condition.type}={condition.status}" + all_conditions.append(condition_text) + + # Check if the Ready condition is true + if condition.type == "Ready" and condition.status == "True": + status['ready'] = True + + status['conditions'] = all_conditions + status['all_conditions_text'] = ', '.join(all_conditions) + + # Log all conditions for diagnostics if available + if all_conditions: + self.logger.info(f"Pod {pod_name} conditions: {status['all_conditions_text']}") + + return status + + except client.exceptions.ApiException as e: + if e.status == 404: + self.logger.warning(f"Pod {pod_name} not found") + return {'not_found': True, 'phase': 'NotFound'} + self.logger.warning(f"Error checking pod status: {e}") + return {'error': str(e), 'phase': 'Error'} + + def _is_pod_in_failure_state(self, pod_status): + """Check if pod is in a terminal failure state""" + failure_phases = ["Failed", "Unknown"] + return pod_status.get('phase') in failure_phases + + def _should_log_wait_diagnostics(self, elapsed, timeout, already_logged): + """Determine if diagnostics should be logged during wait operations""" + if already_logged: + return False + return elapsed > timeout / 2 + + def _log_pod_diagnostics(self, pod_name): + """ + Collect and log detailed pod diagnostics + This helps diagnose why a pod isn't becoming ready + """ + try: + self.logger.info(f"===== DIAGNOSTICS FOR POD {pod_name} =====") + pod = self.core_v1.read_namespaced_pod(name=pod_name, namespace=self.namespace) + self._log_container_statuses(pod) + self._log_pod_events(pod_name) + self._log_pod_logs(pod_name) + self._log_pod_volumes(pod) + if pod.status.phase == "Running": + self._run_pod_diagnostics_commands(pod_name) + self.logger.info(f"===== END DIAGNOSTICS FOR POD {pod_name} =====") + except Exception as e: + self.logger.error(f"Error collecting pod diagnostics: {e}") + + def _log_container_statuses(self, pod): + if pod.status.container_statuses: + for container in pod.status.container_statuses: + self.logger.info(f"Container {container.name} status:") + self.logger.info(f" - Ready: {container.ready}") + self.logger.info(f" - Started: {container.started}") + self.logger.info(f" - Restart Count: {container.restart_count}") + if container.state.waiting: + self.logger.info(f" - Waiting: reason={container.state.waiting.reason}, message={container.state.waiting.message}") + elif container.state.running: + self.logger.info(f" - Running: started at {container.state.running.started_at}") + elif container.state.terminated: + self.logger.info(f" - Terminated: reason={container.state.terminated.reason}, exit_code={container.state.terminated.exit_code}") + else: + self.logger.info("No container statuses available") + + def _log_pod_events(self, pod_name): + try: + field_selector = 
f"involvedObject.name={pod_name}" + events = self.core_v1.list_namespaced_event(namespace=self.namespace, field_selector=field_selector) + if events.items: + self.logger.info(f"Pod events:") + for event in events.items: + self.logger.info(f" [{event.last_timestamp}] {event.type}/{event.reason}: {event.message}") + else: + self.logger.info("No events found for pod") + except Exception as e: + self.logger.warning(f"Error retrieving pod events: {e}") + + def _log_pod_logs(self, pod_name): + try: + # Get pod logs - fetch more lines to ensure we catch stale handle errors + logs = self.core_v1.read_namespaced_pod_log( + name=pod_name, + namespace=self.namespace, + container="test-container", + tail_lines=100 + ) + + if logs: + # Check for stale file handle errors + self._check_for_stale_file_handle_errors(pod_name, logs) + + # Log the last 20 lines for readability + self.logger.info(f"Container logs (last 20 lines):") + for line in logs.splitlines()[-20:]: + self.logger.info(f" {line}") + else: + self.logger.info("No logs available") + except Exception as e: + self.logger.warning(f"Error retrieving pod logs: {e}") + + def _check_for_stale_file_handle_errors(self, pod_name, logs): + """Check pod logs for stale file handle errors and record them in metrics""" + import re + + # Simple regex patterns to detect stale file handle errors + structured_pattern = r'EFS_ERROR: STALE_FILE_HANDLE: path=(.*?), message=(.*?)$' + standard_pattern = r'stat: cannot stat \'([^\']*)\': Stale file handle' + + # Debug information - how many lines of logs are we processing? + log_lines = logs.splitlines() if logs else [] + self.logger.info(f"Analyzing {len(log_lines)} lines of logs from pod {pod_name} for stale file handle errors") + + # Initialize counters for debug info + matches_found = 0 + + # Find structured error formats (from our modified StatefulSet) + structured_matches = re.findall(structured_pattern, logs, re.MULTILINE) + for volume_path, error_msg in structured_matches: + self.logger.warning(f"Detected stale file handle in pod {pod_name}: {volume_path} - {error_msg}") + self.metrics_collector.record_stale_file_handle(volume_path, error_msg, source_pod=pod_name) + matches_found += 1 + + # Find standard error formats + standard_matches = re.findall(standard_pattern, logs, re.MULTILINE) + for path in standard_matches: + error_msg = f"Stale file handle error in {path}" + # Extract volume path (parent directory) + volume_path = path.split('/')[1] if path.startswith('/') else path + self.logger.warning(f"Detected stale file handle in pod {pod_name}: /{volume_path} - {error_msg}") + self.metrics_collector.record_stale_file_handle(f"/{volume_path}", error_msg, source_pod=pod_name) + matches_found += 1 + + # Log summary information + if matches_found > 0: + self.logger.warning(f"Found {matches_found} stale file handle errors in pod {pod_name} logs") + else: + self.logger.info(f"No stale file handle errors detected in pod {pod_name} logs") + + # Attempt to manually add a test error if no real errors were found + # This is just for testing - would be removed in production + if matches_found == 0 and "aws-statefulset" in pod_name: + self.logger.warning(f"Adding simulated stale handle error for testing") + self.metrics_collector.record_stale_file_handle("/aws-test", "Simulated stale handle error", source_pod=pod_name) + + def _log_pod_volumes(self, pod): + if pod.spec.volumes: + self.logger.info(f"Pod volumes:") + for volume in pod.spec.volumes: + volume_details = {} + if hasattr(volume, 'persistent_volume_claim') and 
volume.persistent_volume_claim: + volume_details["type"] = "PVC" + volume_details["claim_name"] = volume.persistent_volume_claim.claim_name + elif hasattr(volume, 'host_path') and volume.host_path: + volume_details["type"] = "HostPath" + volume_details["path"] = volume.host_path.path + self.logger.info(f" - {volume.name}: {volume_details}") + + def _run_pod_diagnostics_commands(self, pod_name): + try: + # Check mount points + mount_cmd = "mount | grep /data" + exec_command = ['/bin/sh', '-c', mount_cmd] + resp = self.core_v1.connect_get_namespaced_pod_exec( + pod_name, + self.namespace, + command=exec_command, + stdin=False, + stdout=True, + stderr=True, + tty=False + ) + self.logger.info(f"Mount diagnostic output: {resp}") + # Check if we can write to the volume + touch_cmd = "touch /data/test_write && echo 'Write test successful'" + exec_command = ['/bin/sh', '-c', touch_cmd] + resp = self.core_v1.connect_get_namespaced_pod_exec( + pod_name, + self.namespace, + command=exec_command, + stdin=False, + stdout=True, + stderr=True, + tty=False + ) + self.logger.info(f"Write test output: {resp}") + except Exception as e: + self.logger.warning(f"Cannot execute diagnostic commands in pod: {e}") + + def _wait_for_pod_deleted(self, pod_name, timeout=60): + """ + Wait for pod to be deleted + Returns True if deleted within timeout, False otherwise + """ + start_time = time.time() + self.logger.info(f"Waiting for pod {pod_name} to be deleted") + + while time.time() - start_time < timeout: + try: + self.core_v1.read_namespaced_pod_status( + name=pod_name, + namespace=self.namespace + ) + # Pod still exists, wait + time.sleep(2) + + except client.exceptions.ApiException as e: + if e.status == 404: + self.logger.info(f"Pod {pod_name} has been deleted") + return True + self.logger.warning(f"Error checking pod deletion status: {e}") + + time.sleep(2) + + self.logger.warning(f"Timeout waiting for pod {pod_name} to be deleted after {timeout}s") + return False + + def _wait_for_pvc_bound(self, pvc_name, timeout=60): + """ + Wait for PVC to be bound + Returns True if bound within timeout, False otherwise + """ + start_time = time.time() + self.logger.info(f"Waiting for PVC {pvc_name} to be bound") + + while time.time() - start_time < timeout: + try: + pvc = self.core_v1.read_namespaced_persistent_volume_claim( + name=pvc_name, + namespace=self.namespace + ) + + if pvc.status.phase == "Bound": + self.logger.info(f"PVC {pvc_name} is bound") + return True + + # Still waiting + self.logger.debug(f"PVC {pvc_name} is in {pvc.status.phase} state, waiting...") + + except client.exceptions.ApiException as e: + if e.status == 404: + self.logger.warning(f"PVC {pvc_name} not found") + return False + self.logger.warning(f"Error checking PVC status: {e}") + + time.sleep(2) + + self.logger.warning(f"Timeout waiting for PVC {pvc_name} to be bound after {timeout}s") + return False + + def _wait_for_pvc_deleted(self, pvc_name, timeout=60): + """ + Wait for PVC to be deleted + Returns True if deleted within timeout, False otherwise + """ + start_time = time.time() + self.logger.info(f"Waiting for PVC {pvc_name} to be deleted") + + while time.time() - start_time < timeout: + try: + self.core_v1.read_namespaced_persistent_volume_claim( + name=pvc_name, + namespace=self.namespace + ) + # PVC still exists, wait + time.sleep(2) + + except client.exceptions.ApiException as e: + if e.status == 404: + self.logger.info(f"PVC {pvc_name} has been deleted") + return True + self.logger.warning(f"Error checking PVC deletion status: {e}") + + 
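+            # Back off briefly before polling the PVC again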
time.sleep(2) + + self.logger.warning(f"Timeout waiting for PVC {pvc_name} to be deleted after {timeout}s") + return False + + def _start_statefulset_monitoring(self): + """Initialize StatefulSet pod monitoring for stale file handle errors""" + # Check if monitoring is enabled in config + monitoring_config = self.config.get('monitoring', {}).get('statefulset', {}) + self._statefulset_monitoring_enabled = monitoring_config.get('enabled', True) + + if not self._statefulset_monitoring_enabled: + self.logger.info("StatefulSet monitoring is disabled in config") + return + + # Get monitoring configuration + self._statefulset_namespace = monitoring_config.get('namespace', 'default') + self._statefulset_selector = monitoring_config.get('pod_label_selector', 'app=aws-app') + self._statefulset_check_interval = monitoring_config.get('check_interval', 60) # seconds + + self.logger.info(f"Starting StatefulSet monitoring for stale file handles in namespace {self._statefulset_namespace}") + self.logger.info(f"Using pod selector: {self._statefulset_selector}") + self.logger.info(f"Check interval: {self._statefulset_check_interval} seconds") + + # Schedule first check + self._next_statefulset_check_time = time.time() + 30 # First check after 30 seconds + + def _check_statefulsets_for_stale_handles(self): + """Check StatefulSet pods for stale file handle errors""" + if not hasattr(self, '_statefulset_monitoring_enabled') or not self._statefulset_monitoring_enabled: + return + + self.logger.info("Checking StatefulSet pods for stale file handle errors") + import subprocess + + try: + # Get all pods matching the selector + cmd = f"kubectl get pods -n {self._statefulset_namespace} -l {self._statefulset_selector} -o name" + result = subprocess.run(cmd, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + pods = [pod.strip().replace("pod/", "") for pod in result.stdout.strip().split("\n") if pod.strip()] + + if not pods: + self.logger.info(f"No StatefulSet pods found with selector '{self._statefulset_selector}'") + else: + self.logger.info(f"Found {len(pods)} StatefulSet pods to check for stale file handle errors") + + # Process each pod's logs + for pod_name in pods: + self.logger.info(f"Checking logs for pod {pod_name}") + self._check_pod_for_stale_handles(pod_name) + + except subprocess.CalledProcessError as e: + self.logger.error(f"Error checking StatefulSet pods: {e}") + self.logger.error(f"Error details: {e.stderr}") + + except Exception as e: + self.logger.error(f"Unexpected error during StatefulSet monitoring: {e}") + + # Schedule next check + self._next_statefulset_check_time = time.time() + self._statefulset_check_interval + + def _check_pod_for_stale_handles(self, pod_name): + """Check a specific pod's logs for stale file handle errors""" + import subprocess + import re + + try: + # Get recent logs from the pod + cmd = f"kubectl logs -n {self._statefulset_namespace} {pod_name} --tail=100" + result = subprocess.run(cmd, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + logs = result.stdout + + # Check for stale file handle errors - similar to our pod log checking method + if logs: + # Look for structured error format + structured_pattern = r'EFS_ERROR: STALE_FILE_HANDLE: path=(.*?), message=(.*?)$' + standard_pattern = r'stat: cannot stat \'([^\']*)\': Stale file handle' + + # Find structured error formats (from our modified StatefulSet) + structured_matches = re.findall(structured_pattern, logs, re.MULTILINE) + for volume_path, 
error_msg in structured_matches: + self.logger.warning(f"Detected stale file handle in StatefulSet pod {pod_name}: {volume_path} - {error_msg}") + self.metrics_collector.record_stale_file_handle(volume_path, error_msg, source_pod=pod_name) + + # Find standard error formats + standard_matches = re.findall(standard_pattern, logs, re.MULTILINE) + for path in standard_matches: + error_msg = f"Stale file handle error in {path}" + volume_path = path.split('/')[1] if path.startswith('/') else path + formatted_path = f"/{volume_path}" + self.logger.warning(f"Detected stale file handle in StatefulSet pod {pod_name}: {formatted_path} - {error_msg}") + self.metrics_collector.record_stale_file_handle(formatted_path, error_msg, source_pod=pod_name) + + except subprocess.CalledProcessError as e: + self.logger.error(f"Error getting logs for pod {pod_name}: {e}") + + except Exception as e: + self.logger.error(f"Error processing logs for pod {pod_name}: {e}") + + def _log_efs_filesystem_state(self): + """Log the state of the EFS file system after test completion.""" + try: + fs_id = self.config.get('driver', {}).get('filesystem_id') + region = self.config.get('cluster', {}).get('region', 'us-west-1') + if not fs_id: + self.logger.warning("No filesystem_id found in config for EFS state check.") + return None + efs = boto3.client('efs', region_name=region) + response = efs.describe_file_systems(FileSystemId=fs_id) + fs = response['FileSystems'][0] + fs_info = { + "filesystem_id": fs_id, + "state": fs['LifeCycleState'], + "size_bytes": fs['SizeInBytes']['Value'], + "mount_targets": fs['NumberOfMountTargets'] + } + self.logger.info(f"EFS FileSystem {fs_id} state: {fs['LifeCycleState']}, Size: {fs['SizeInBytes']['Value']} bytes, MountTargets: {fs['NumberOfMountTargets']}") + return fs_info + except Exception as e: + self.logger.error(f"Failed to log EFS file system state: {e}") + return None + + def _ensure_storage_class(self): + """Ensure EFS StorageClass exists""" + sc_config = self.config.get('storage_class', {}) + sc_name = sc_config.get('name', 'efs-sc') + + try: + # Check if storage class already exists + self.storage_v1.read_storage_class(name=sc_name) + self.logger.info(f"StorageClass '{sc_name}' already exists") + + except client.exceptions.ApiException as e: + if e.status == 404: + # Create storage class + sc_manifest = { + "apiVersion": "storage.k8s.io/v1", + "kind": "StorageClass", + "metadata": {"name": sc_name}, + "provisioner": "efs.csi.aws.com", + "parameters": sc_config.get('parameters', { + "provisioningMode": "efs-ap", + "fileSystemId": "fs-XXXX", # This should be replaced with actual filesystem ID + "directoryPerms": "700" + }) + } + + # Add mount options if defined + if 'mount_options' in sc_config: + sc_manifest["mountOptions"] = sc_config['mount_options'] + + # Add reclaim policy if defined + if 'reclaim_policy' in sc_config: + sc_manifest["reclaimPolicy"] = sc_config['reclaim_policy'] + + # Add volume binding mode if defined + if 'volume_binding_mode' in sc_config: + sc_manifest["volumeBindingMode"] = sc_config['volume_binding_mode'] + + self.storage_v1.create_storage_class(body=sc_manifest) + self.logger.info(f"Created StorageClass '{sc_name}'") + + else: + self.logger.error(f"Error checking StorageClass: {e}") + raise + + def _collect_statefulset_pod_logs(self): + """Collect StatefulSet pod logs and save them to the reports directory""" + self.logger.info("Collecting StatefulSet pod logs for stale file handle analysis") + import subprocess + import os + + # Create reports directory if 
it doesn't exist + report_dir = os.path.join("reports", "statefulset_logs") + os.makedirs(report_dir, exist_ok=True) + + # Get StatefulSet pod selector from config or use default + selector = "app=aws-app" + namespace = "default" + + try: + # Get all pods with the selector + cmd = f"kubectl get pods -n {namespace} -l {selector} -o name" + result = subprocess.run(cmd, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + pods = [pod.strip().replace("pod/", "") for pod in result.stdout.strip().split("\n") if pod.strip()] + + if not pods: + self.logger.info(f"No StatefulSet pods found with selector '{selector}'") + return [] + + self.logger.info(f"Found {len(pods)} StatefulSet pods, collecting logs") + collected_logs = [] + + # Create timestamp for log files + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + # Collect logs for each pod + for pod_name in pods: + log_file = os.path.join(report_dir, f"{pod_name}_logs_{timestamp}.txt") + try: + # Get pod logs + cmd = f"kubectl logs -n {namespace} {pod_name}" + result = subprocess.run(cmd, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + + # Save logs to file + with open(log_file, "w") as f: + f.write(result.stdout) + + self.logger.info(f"Saved logs for pod {pod_name} to {log_file}") + collected_logs.append(log_file) + + except subprocess.CalledProcessError as e: + self.logger.error(f"Failed to get logs for pod {pod_name}: {e}") + self.logger.error(f"Error details: {e.stderr}") + + # Store the collected log files for the summary generation + self._statefulset_log_files = collected_logs + return collected_logs + + except subprocess.CalledProcessError as e: + self.logger.error(f"Failed to get StatefulSet pods: {e}") + self.logger.error(f"Error details: {e.stderr}") + return [] + except Exception as e: + self.logger.error(f"Error collecting StatefulSet pod logs: {e}") + return [] + + def _generate_stale_file_handle_summary(self): + """Generate a summary of stale file handle errors from collected pod logs""" + self.logger.info("Generating stale file handle error summary") + import os + import re + + # Check if we have collected logs + if not hasattr(self, '_statefulset_log_files') or not self._statefulset_log_files: + self.logger.info("No StatefulSet log files collected, skipping summary generation") + return + + # Create timestamp for summary file + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + reports_dir = "reports" + os.makedirs(reports_dir, exist_ok=True) + summary_file = os.path.join(reports_dir, f"stale_file_handle_summary_{timestamp}.txt") + + # Regular expressions for detecting stale file handle errors + error_patterns = [ + re.compile(r"Stale file handle"), + re.compile(r"EFS_ERROR: STALE_FILE_HANDLE") + ] + + # Summary data + total_errors = 0 + errors_by_pod = {} + error_lines = [] + + # Parse each log file + with open(summary_file, "w") as summary: + summary.write("STALE FILE HANDLE ERROR SUMMARY\n") + summary.write("=" * 50 + "\n\n") + summary.write(f"Report generated: {datetime.now().isoformat()}\n\n") + + for log_file in self._statefulset_log_files: + pod_name = os.path.basename(log_file).split('_logs_')[0] + pod_errors = 0 + + try: + with open(log_file) as f: + log_content = f.read() + line_number = 0 + + # Process each line for errors + for line in log_content.splitlines(): + line_number += 1 + for pattern in error_patterns: + if pattern.search(line): + pod_errors += 1 + error_lines.append(f"{pod_name} [line {line_number}]: {line.strip()}") + break + + 
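+                    # Record this pod's error tally in the per-pod and overall counters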
if pod_errors > 0: + errors_by_pod[pod_name] = pod_errors + total_errors += pod_errors + + except Exception as e: + summary.write(f"Error processing log file {log_file}: {str(e)}\n") + + # Write summary statistics + summary.write(f"Total stale file handle errors found: {total_errors}\n\n") + + if total_errors > 0: + summary.write("Errors by pod:\n") + summary.write("-" * 30 + "\n") + + for pod, count in errors_by_pod.items(): + summary.write(f"{pod}: {count} errors\n") + + summary.write("\nDetailed error lines:\n") + summary.write("-" * 50 + "\n") + + for error_line in error_lines: + summary.write(f"{error_line}\n") + else: + summary.write("No stale file handle errors detected in the logs.\n") + + self.logger.info(f"Stale file handle summary written to {summary_file}") + + if total_errors > 0: + self.logger.warning(f"Found {total_errors} stale file handle errors across {len(errors_by_pod)} pods") + else: + self.logger.info("No stale file handle errors detected") + + return summary_file, total_errors + + def _cleanup(self): + """Clean up all resources created during test with robust error handling""" + self.logger.info("===== STARTING COMPREHENSIVE CLEANUP =====") + cleanup_start_time = time.time() + cleanup_timeout = 180 # 3 minutes timeout for entire cleanup + cleanup_failures = [] + force_delete = False + + # First, collect StatefulSet pod logs for stale handle analysis + self._collect_statefulset_pod_logs() + + try: + self._cleanup_resources(force_delete, cleanup_failures) + remaining_resources = self._get_remaining_resources() + if remaining_resources: + self.logger.warning(f"First cleanup pass incomplete. Remaining resources: {remaining_resources}") + self.logger.info("Attempting force deletion of remaining resources...") + force_delete = True + self._cleanup_resources(force_delete, cleanup_failures) + remaining_resources = self._get_remaining_resources() + if remaining_resources: + self.logger.error(f"Cleanup incomplete. 
Remaining resources after force deletion: {remaining_resources}") + elapsed = time.time() - cleanup_start_time + if cleanup_failures: + self.logger.warning(f"Cleanup completed in {elapsed:.2f} seconds with {len(cleanup_failures)} failures") + self.logger.warning(f"Failed deletions: {cleanup_failures}") + else: + self.logger.info(f"Cleanup completed successfully in {elapsed:.2f} seconds") + except Exception as e: + self.logger.error(f"Error during cleanup: {e}", exc_info=True) + finally: + self.logger.info("===== CLEANUP PROCESS FINISHED =====") + self._log_efs_filesystem_state() + + # Generate stale file handle summary report from collected logs + self._generate_stale_file_handle_summary() + + def _cleanup_resources(self, force, failures): + """Delete all pods and PVCs with error handling""" + self._cleanup_pods(force, failures) + time.sleep(5) # Allow pod termination before PVC deletion + self._cleanup_pvcs(force, failures) + + def _cleanup_pods(self, force, failures): + self.logger.info(f"Deleting {self.current_pod_count} pods (force={force})...") + for pvc_name, pod_list in list(self.pods.items()): + for pod_name in list(pod_list): + try: + success = self._delete_pod(pod_name, pvc_name, force=force) + if not success: + failures.append(f"pod/{pod_name}") + except Exception as e: + self.logger.error(f"Error deleting pod {pod_name}: {e}") + failures.append(f"pod/{pod_name}") + + def _cleanup_pvcs(self, force, failures): + self.logger.info(f"Deleting {len(self.pvcs)} PVCs (force={force})...") + for pvc_name in list(self.pvcs): + try: + success = self._delete_pvc(pvc_name, force=force) + if not success: + failures.append(f"pvc/{pvc_name}") + except Exception as e: + self.logger.error(f"Error deleting PVC {pvc_name}: {e}") + failures.append(f"pvc/{pvc_name}") + + def _get_remaining_resources(self): + """Get a list of any resources that weren't cleaned up""" + remaining = [] + + # Check for remaining pods with our test labels + try: + pods = self.core_v1.list_namespaced_pod( + namespace=self.namespace, + label_selector="app=efs-test" + ) + for pod in pods.items: + remaining.append(f"pod/{pod.metadata.name}") + except Exception as e: + self.logger.error(f"Error checking for remaining pods: {e}") + + # Check for remaining PVCs created by our tests + try: + pvcs = self.core_v1.list_namespaced_persistent_volume_claim( + namespace=self.namespace + ) + for pvc in pvcs.items: + # Only include PVCs that match our naming pattern + if pvc.metadata.name.startswith(("test-pvc-", "many2one-", "one2one-", "concurrent-pvc-")): + remaining.append(f"pvc/{pvc.metadata.name}") + except Exception as e: + self.logger.error(f"Error checking for remaining PVCs: {e}") + + return remaining + + def _generate_report(self): + """Generate test report""" + # Get EFS filesystem state information + fs_info = self._log_efs_filesystem_state() + + # Get stale file handle information from metrics collector + stale_handle_metrics = self._get_stale_handle_metrics() + + report = { + "test_duration": time.time(), + "operations": self._generate_operations_report(), + "efs_filesystem": fs_info, + "scenarios": self._generate_scenarios_report(), + "filesystem_errors": { + "stale_file_handles": stale_handle_metrics + } + } + + # Print report summary + self._print_report_summary(report) + + return report + + def _get_stale_handle_metrics(self): + """Get stale file handle metrics from metrics collector""" + metrics = {} + + # Check if stale handle errors were tracked + if hasattr(self.metrics_collector, 'efs_metrics') and 
'stale_handle_errors' in self.metrics_collector.efs_metrics: + stale_handle_data = self.metrics_collector.efs_metrics['stale_handle_errors'] + + # Extract counts by volume path + counts_by_path = {} + for path, count in stale_handle_data.get('counts', {}).items(): + counts_by_path[path] = count + + # Build metrics summary + metrics = { + 'total_count': sum(counts_by_path.values()), + 'affected_paths': list(counts_by_path.keys()), + 'counts_by_path': counts_by_path, + 'incidents': stale_handle_data.get('incidents', []) + } + + # Log summary of stale file handle errors + if metrics['total_count'] > 0: + self.logger.warning(f"Detected {metrics['total_count']} stale file handle errors across {len(metrics['affected_paths'])} volume paths") + for path, count in counts_by_path.items(): + self.logger.warning(f" - {path}: {count} errors") + + return metrics + + def _generate_operations_report(self): + """Generate the operations section of the report""" + operations_report = {} + + # Standard operations + for op_name in ['create_pvc', 'attach_pod', 'delete_pod', 'delete_pvc']: + operations_report[op_name] = self._get_operation_stats(op_name) + + # Special case for read/write operations + operations_report["verify_read_write"] = { + "write_success": self.results['verify_write']['success'], + "write_fail": self.results['verify_write']['fail'], + "read_success": self.results['verify_read']['success'], + "read_fail": self.results['verify_read']['fail'], + "write_success_rate": self._calculate_success_rate(self.results['verify_write']), + "read_success_rate": self._calculate_success_rate(self.results['verify_read']), + } + + return operations_report + + def _get_operation_stats(self, op_name): + """Get statistics for a specific operation""" + return { + "success": self.results[op_name]['success'], + "fail": self.results[op_name]['fail'], + "success_rate": self._calculate_success_rate(self.results[op_name]), + } + + def _generate_scenarios_report(self): + """Generate the scenarios section of the report""" + scenarios_report = {} + + for scenario_name in self.scenarios: + scenarios_report[scenario_name] = { + "runs": self.scenarios[scenario_name]['runs'], + "success": self.scenarios[scenario_name]['success'], + "fail": self.scenarios[scenario_name]['fail'], + "success_rate": self._calculate_scenario_success_rate(scenario_name) + } + + return scenarios_report + + def _calculate_success_rate(self, result): + """Calculate success rate as percentage""" + total = result['success'] + result['fail'] + if total == 0: + return 0 + return (result['success'] / total) * 100 + + def _calculate_scenario_success_rate(self, scenario_name): + """Calculate scenario success rate as percentage""" + runs = self.scenarios[scenario_name]['runs'] + if runs == 0: + return 0 + return (self.scenarios[scenario_name]['success'] / runs) * 100 + + def _scenario_controller_crash_test(self): + """ + Test the resilience of CSI driver by crashing the controller pod during PVC provisioning. + + Steps: + 1. Create a PVC + 2. Crash the controller pod + 3. Verify that the PVC still becomes bound + 4. 
Attach a pod and verify read/write functionality + """ + self.logger.info("+" * 80) + self.logger.info("STARTING CONTROLLER CRASH TEST SCENARIO") + self.logger.info("+" * 80) + + # Initialize scenario tracking if needed + if 'controller_crash' not in self.scenarios: + self.scenarios['controller_crash'] = {'runs': 0, 'success': 0, 'fail': 0} + self.scenarios['controller_crash']['runs'] += 1 + + try: + # Step 1: Create PVC with unique name + self._create_crash_test_pvc() + + # Step 2: Crash controller pod + if not self._crash_controller_pod(): + self.logger.error("Failed to crash or verify controller pod recreation") + self._track_scenario_failure('controller_crash') + return + + # Step 3: Verify PVC becomes bound and attach pod + if not self._verify_crash_test_pvc_and_pod(): + self._track_scenario_failure('controller_crash') + return + + # Success! + self.logger.info("Controller crash test completed successfully") + self._track_scenario_success('controller_crash') + + except Exception as e: + self.logger.error(f"Exception in controller crash test: {str(e)}") + self._track_scenario_failure('controller_crash') + self._handle_unexpected_test_error(e) + + self.logger.info("+" * 80) + self.logger.info("COMPLETED CONTROLLER CRASH TEST SCENARIO") + self.logger.info("+" * 80) + + def _create_crash_test_pvc(self): + """Create a PVC for controller crash test""" + crash_config = self.config["scenarios"].get("controller_crash", {}) + pvc_name = f"crash-test-{uuid.uuid4().hex[:8]}" + + self.logger.info(f"[CRASH-TEST] Creating PVC {pvc_name}") + + # Build and create PVC manifest + pvc_manifest = self._build_pvc_manifest(pvc_name) + self.core_v1.create_namespaced_persistent_volume_claim( + namespace=self.namespace, + body=pvc_manifest + ) + + # Track PVC + self.pvcs.append(pvc_name) + self.pods[pvc_name] = [] + self.results['create_pvc']['success'] += 1 + + # Save PVC name for later verification + self._crash_test_pvc_name = pvc_name + + return pvc_name + + def _crash_controller_pod(self): + """ + Find and delete the CSI controller pod to simulate a crash. + The pod will be automatically recreated by Kubernetes. 
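+
+        Returns True once a replacement controller pod is observed (Running,
+        Pending, or ContainerCreating); returns False if no controller pod is
+        found, the new pod enters an error state, or it does not reappear
+        within roughly two minutes (12 checks at 10-second intervals).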
+ """ + import subprocess + + # Get controller crash test configuration + crash_config = self.config["scenarios"].get("controller_crash", {}) + controller_namespace = crash_config.get("controller_namespace", "kube-system") + controller_pod_selector = crash_config.get("controller_pod_selector", "app=efs-csi-controller") + + try: + # Find the controller pod + self.logger.info(f"Finding controller pod in namespace {controller_namespace}") + cmd = f"kubectl get pods -n {controller_namespace} -l {controller_pod_selector} --no-headers -o custom-columns=:metadata.name" + + result = subprocess.run( + cmd, + shell=True, + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + + controller_pods = [pod for pod in result.stdout.strip().split('\n') if pod] + + if not controller_pods: + self.logger.error(f"No controller pods found in namespace {controller_namespace}") + return False + + # Delete the controller pod + controller_pod = controller_pods[0] + self.logger.info(f"Crashing controller pod: {controller_pod}") + + delete_cmd = f"kubectl delete pod {controller_pod} -n {controller_namespace} --wait=false" + subprocess.run(delete_cmd, shell=True, check=True) + + # Wait briefly to ensure deletion has started + time.sleep(5) + + # Wait for new controller pod + return self._verify_controller_recreation(controller_namespace, controller_pod_selector) + + except Exception as e: + self.logger.error(f"Failed to crash controller pod: {str(e)}") + return False + + def _verify_controller_recreation(self, namespace, pod_selector): + """Verify the controller pod was recreated after being deleted""" + import subprocess + + self.logger.info("Verifying controller pod recreation") + max_retries = 12 + + for attempt in range(max_retries): + try: + # Check if any controller pod exists and its status + cmd = f"kubectl get pods -n {namespace} -l {pod_selector} -o jsonpath='{{.items[*].status.phase}}'" + result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + + if not result.stdout: + self.logger.info(f"Controller pod not found yet (attempt {attempt+1}/{max_retries})") + elif "Running" in result.stdout: + self.logger.info("New controller pod is running") + return True + elif "ContainerCreating" in result.stdout or "Pending" in result.stdout: + self.logger.info("New controller pod is being created") + return True + elif "Error" in result.stdout or "CrashLoopBackOff" in result.stdout: + self.logger.error("Controller pod is in error state") + return False + + time.sleep(10) + + except Exception as e: + self.logger.error(f"Error checking controller pod status: {str(e)}") + return False + + self.logger.error("Controller pod was not recreated within expected time") + return False + + def _verify_crash_test_pvc_and_pod(self): + """Verify PVC becomes bound after controller crash and attach a pod""" + # Get controller crash test configuration + crash_config = self.config["scenarios"].get("controller_crash", {}) + recovery_timeout = crash_config.get("recovery_timeout", 300) + + # Get the PVC name we saved earlier + pvc_name = getattr(self, '_crash_test_pvc_name', None) + + if not pvc_name: + self.logger.error("No crash test PVC name found") + return False + + # Wait for PVC to become bound with extended timeout + self.logger.info(f"Waiting for PVC {pvc_name} to become bound after controller crash") + if not self._wait_for_pvc_bound(pvc_name, timeout=recovery_timeout): + self.logger.error(f"PVC {pvc_name} failed to bind after controller crash") + 
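+            # Dump diagnostics before reporting the failure; _run_pod_diagnostics_commands()
+            # is assumed to be defined earlier in this class and to emit kubectl
+            # describe/event output for the stuck resources.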
self._run_pod_diagnostics_commands() + return False + + self.logger.info(f"PVC {pvc_name} successfully bound after controller crash") + + # Create a pod using this PVC + self.logger.info(f"Creating pod to use PVC {pvc_name}") + pod_name = self._attach_pod(pvc_name) + + if not pod_name: + self.logger.error(f"Failed to attach pod to PVC {pvc_name} after controller crash") + return False + + # Verify read/write works + self.logger.info(f"Verifying read/write capability") + if not self._verify_single_pod_readwrite(pod_name, pvc_name): + self.logger.error("Read/write verification failed after controller crash") + return False + + return True + + def _verify_single_pod_readwrite(self, pod_name, pvc_name): + """Verify a single pod can read/write to its volume""" + import subprocess + + test_file = f"crash-test-{uuid.uuid4().hex[:8]}.txt" + test_content = f"Controller crash test: {uuid.uuid4()}" + + try: + # Write test + write_cmd = f"kubectl exec -n {self.namespace} {pod_name} -- /bin/sh -c 'echo \"{test_content}\" > /data/{test_file}'" + self.logger.info(f"Executing write command: {write_cmd}") + subprocess.run(write_cmd, shell=True, check=True) + + # Read test + read_cmd = f"kubectl exec -n {self.namespace} {pod_name} -- cat /data/{test_file}" + self.logger.info(f"Executing read command: {read_cmd}") + read_process = subprocess.run(read_cmd, shell=True, check=True, stdout=subprocess.PIPE, text=True) + read_result = read_process.stdout.strip() + + # Check result + if test_content in read_result: + self.logger.info(f"Read/write test successful") + return True + else: + self.logger.error(f"Read/write test failed: expected '{test_content}', got '{read_result}'") + return False + + except Exception as e: + self.logger.error(f"Error in read/write test: {str(e)}") + return False + + def _print_report_summary(self, report): + """Print a summary of the test report""" + self.logger.info("===== EFS CSI Driver Test Summary =====") + + # Operations summary + self.logger.info("--- Operations ---") + for op_name, op_data in report['operations'].items(): + if 'success_rate' in op_data: # Regular operations + self.logger.info(f"{op_name}: {op_data['success']} succeeded, {op_data['fail']} failed ({op_data['success_rate']:.1f}%)") + else: # Read/write operations with separate metrics + write_rate = op_data['write_success_rate'] if 'write_success_rate' in op_data else 0 + read_rate = op_data['read_success_rate'] if 'read_success_rate' in op_data else 0 + self.logger.info(f"{op_name}: Writes {op_data['write_success']} succeeded, {op_data['write_fail']} failed ({write_rate:.1f}%)") + self.logger.info(f"{op_name}: Reads {op_data['read_success']} succeeded, {op_data['read_fail']} failed ({read_rate:.1f}%)") + + # Scenarios summary + self.logger.info("--- Scenarios ---") + for scenario_name, scenario_data in report['scenarios'].items(): + if scenario_data['runs'] > 0: + self.logger.info(f"{scenario_name}: {scenario_data['success']} succeeded, {scenario_data['fail']} failed out of {scenario_data['runs']} runs ({scenario_data['success_rate']:.1f}%)") + else: + self.logger.info(f"{scenario_name}: No runs") + + # Filesystem errors summary + stale_handle_metrics = report.get('filesystem_errors', {}).get('stale_file_handles', {}) + total_stale_handles = stale_handle_metrics.get('total_count', 0) + if total_stale_handles > 0: + self.logger.info("--- Filesystem Errors ---") + self.logger.info(f"Stale File Handles: {total_stale_handles} errors detected") + # Show distribution by path if available + if 'counts_by_path' in 
stale_handle_metrics: + for path, count in stale_handle_metrics['counts_by_path'].items(): + self.logger.info(f" - {path}: {count} errors") + + self.logger.info("=========================================") + +# Main function to run the orchestrator +def main(): + """Main entry point""" + # Setup argument parsing + import argparse + parser = argparse.ArgumentParser(description='EFS CSI Driver Orchestrator') + parser.add_argument('--config-dir', default='config/components', help='Path to component config directory') + parser.add_argument('--duration', default=300, type=int, help='Test duration in seconds') + parser.add_argument('--interval', default=5, type=int, help='Operation interval in seconds') + parser.add_argument('--namespace', default='default', help='Kubernetes namespace to use') + args = parser.parse_args() + + # Setup component configs + component_configs = { + 'driver': f"{args.config_dir}/driver.yaml", + 'storage': f"{args.config_dir}/storage.yaml", + 'test': f"{args.config_dir}/test.yaml", + 'pod': f"{args.config_dir}/pod.yaml", + 'scenarios': f"{args.config_dir}/scenarios.yaml" + } + + # Initialize orchestrator + orchestrator = EFSCSIOrchestrator( + component_configs=component_configs, + namespace=args.namespace + ) + + # Override default test parameters if specified + if args.duration: + orchestrator.test_duration = args.duration + if args.interval: + orchestrator.operation_interval = args.interval + + # Run the test + orchestrator.run_test() + +if __name__ == "__main__": + main() +# Enhanced modular implementation for orchestrator diff --git a/test/stress-scale-tests/utils/__init__.py b/test/stress-scale-tests/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/stress-scale-tests/utils/log_integration.py b/test/stress-scale-tests/utils/log_integration.py new file mode 100644 index 000000000..7f46d7040 --- /dev/null +++ b/test/stress-scale-tests/utils/log_integration.py @@ -0,0 +1,367 @@ +#!/usr/bin/env python3 + +import os +import sys +import subprocess +import logging +import shlex +import shutil +import tarfile +from datetime import datetime + +def execute_command(command, file, shell=False): + """Execute a command and write output to file""" + print(command + "\n", file=file, flush=True) + if shell: + subprocess.run(command, shell=True, text=True, stderr=subprocess.STDOUT, stdout=file) + else: + subprocess.run(shlex.split(command), text=True, stderr=subprocess.STDOUT, stdout=file) + print("\n", file=file, flush=True) + +def collect_driver_files_under_dir(driver_pod_name, dir_name, file): + """Collect files under a directory in the container""" + collect_driver_files_command = ( + f"kubectl exec {driver_pod_name} -n kube-system -c efs-plugin -- find {dir_name} " + + r"-type f -exec ls {} \; -exec cat {} \;" + ) + execute_command(command=collect_driver_files_command, file=file) + +def collect_logs_for_test(test_name, driver_pod_name=None): + """ + Use the log collector functionality to collect logs during test execution + + Args: + test_name: Name of the test (for output directory) + driver_pod_name: Name of the EFS CSI driver pod (if None, will attempt to find one) + + Returns: + Path to the collected logs tarball + """ + logger = logging.getLogger(__name__) + logger.info(f"Collecting logs for test: {test_name}") + + # If driver pod name is not provided, try to find one + if driver_pod_name is None: + driver_pod_name = _find_driver_pod() + if not driver_pod_name: + logger.error("No EFS CSI driver pod found, cannot collect logs") + return 
None + + # Create output directory for this test run + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + results_dir = f"logs/{test_name}_{timestamp}" + os.makedirs(results_dir, exist_ok=True) + + # Create results subdirectory + results_subdir = os.path.join(results_dir, "results") + os.makedirs(results_subdir, exist_ok=True) + + try: + # Save current directory and change to results directory + original_dir = os.getcwd() + os.chdir(results_dir) + + try: + # Execute the log collection steps + # Describe and get pod info + with open(f"results/driver_info", "w") as f: + execute_command( + command=f"kubectl describe pod {driver_pod_name} -n kube-system", + file=f + ) + execute_command( + command=f"kubectl get pod {driver_pod_name} -n kube-system -o yaml", + file=f + ) + + # Get driver logs + with open(f"results/driver_logs", "w") as f: + execute_command( + command=f"kubectl logs {driver_pod_name} -n kube-system efs-plugin", + file=f + ) + + # Get EFS utils logs from the container + with open(f"results/efs_utils_logs", "w") as f: + collect_driver_files_under_dir( + driver_pod_name=driver_pod_name, + dir_name="/var/log/amazon/efs", + file=f + ) + + # Get EFS state directory contents + with open(f"results/efs_utils_state_dir", "w") as f: + collect_driver_files_under_dir( + driver_pod_name=driver_pod_name, + dir_name="/var/run/efs", + file=f + ) + + # Get mount information + with open(f"results/mounts", "w") as f: + execute_command( + command=f"kubectl exec {driver_pod_name} -n kube-system -c efs-plugin -- mount | grep nfs", + file=f, + shell=True + ) + + # Create tar file + tarball_name = f"{test_name}_logs_{timestamp}.tgz" + with tarfile.open(tarball_name, "w:gz") as tar: + tar.add("results", arcname="results") + + logger.info(f"Log collection completed successfully: {os.path.join(results_dir, tarball_name)}") + return os.path.join(results_dir, tarball_name) + + finally: + # Change back to original directory + os.chdir(original_dir) + + except Exception as e: + logger.error(f"Error collecting logs: {e}", exc_info=True) + return None + +def _find_driver_pod(): + """Find an EFS CSI driver pod in the cluster""" + try: + # Use kubectl to find driver pods + result = subprocess.run( + "kubectl get pods -n kube-system -l app=efs-csi-controller -o jsonpath='{.items[0].metadata.name}'", + shell=True, + capture_output=True, + text=True + ) + + if result.stdout: + return result.stdout.strip() + + # Try to find node driver if controller not found + result = subprocess.run( + "kubectl get pods -n kube-system -l app=efs-csi-node -o jsonpath='{.items[0].metadata.name}'", + shell=True, + capture_output=True, + text=True + ) + + if result.stdout: + return result.stdout.strip() + + return None + except Exception as e: + logging.getLogger(__name__).error(f"Error finding driver pod: {e}") + return None + +def collect_resource_logs(resource_type, resource_name, namespace="default"): + """ + Collect detailed logs and information about a specific Kubernetes resource + + Args: + resource_type: Type of resource (pod, pvc, etc.) 
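+            (only "pod" and "pvc" get type-specific collection: container logs
+            and node info for pods, bound-PV details and consuming pods for
+            PVCs; other resource types get just the describe/yaml/events output)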
+ resource_name: Name of the resource + namespace: Kubernetes namespace + + Returns: + Path to the directory containing collected logs + """ + logger = logging.getLogger(__name__) + logger.info(f"Collecting logs for {resource_type}/{resource_name} in namespace {namespace}") + + # Create output directory for this resource + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + results_dir = f"logs/resource_{resource_type}_{resource_name}_{timestamp}" + os.makedirs(results_dir, exist_ok=True) + + try: + # Get basic info about the resource + with open(os.path.join(results_dir, f"{resource_type}_description.txt"), "w") as f: + execute_command( + command=f"kubectl describe {resource_type} {resource_name} -n {namespace}", + file=f + ) + + with open(os.path.join(results_dir, f"{resource_type}_yaml.yaml"), "w") as f: + execute_command( + command=f"kubectl get {resource_type} {resource_name} -n {namespace} -o yaml", + file=f + ) + + # Get events related to this resource + with open(os.path.join(results_dir, "events.txt"), "w") as f: + execute_command( + command=f"kubectl get events -n {namespace} --field-selector involvedObject.name={resource_name} --sort-by='.lastTimestamp'", + file=f + ) + + # Resource-specific logging + if resource_type == "pod": + # Get container logs + execute_command( + command=f"mkdir -p {os.path.join(results_dir, 'container_logs')}", + file=None, + shell=True + ) + # First get container names + containers_result = subprocess.run( + f"kubectl get pod {resource_name} -n {namespace} -o jsonpath='{{.spec.containers[*].name}}'", + shell=True, + capture_output=True, + text=True + ) + if containers_result.stdout: + containers = containers_result.stdout.strip().split() + for container in containers: + with open(os.path.join(results_dir, "container_logs", f"{container}.log"), "w") as f: + execute_command( + command=f"kubectl logs {resource_name} -n {namespace} -c {container}", + file=f + ) + + # Get node info for this pod + node_result = subprocess.run( + f"kubectl get pod {resource_name} -n {namespace} -o jsonpath='{{.spec.nodeName}}'", + shell=True, + capture_output=True, + text=True + ) + if node_result.stdout: + node_name = node_result.stdout.strip() + with open(os.path.join(results_dir, "node_info.txt"), "w") as f: + execute_command( + command=f"kubectl describe node {node_name}", + file=f + ) + + # Get volume information for this pod + with open(os.path.join(results_dir, "volumes.txt"), "w") as f: + execute_command( + command=f"kubectl get pod {resource_name} -n {namespace} -o jsonpath='{{.spec.volumes}}'", + file=f + ) + + elif resource_type == "pvc": + # Get PV associated with this PVC + pv_result = subprocess.run( + f"kubectl get pvc {resource_name} -n {namespace} -o jsonpath='{{.spec.volumeName}}'", + shell=True, + capture_output=True, + text=True + ) + if pv_result.stdout: + pv_name = pv_result.stdout.strip() + if pv_name: + with open(os.path.join(results_dir, "pv_info.txt"), "w") as f: + execute_command( + command=f"kubectl describe pv {pv_name}", + file=f + ) + with open(os.path.join(results_dir, "pv_yaml.yaml"), "w") as f: + execute_command( + command=f"kubectl get pv {pv_name} -o yaml", + file=f + ) + + # Find pods using this PVC + with open(os.path.join(results_dir, "using_pods.txt"), "w") as f: + execute_command( + command=f"kubectl get pods -n {namespace} -o json | jq '.items[] | select(.spec.volumes[]?.persistentVolumeClaim?.claimName == \"{resource_name}\") | .metadata.name'", + file=f, + shell=True + ) + + logger.info(f"Resource logs collected successfully to: 
{results_dir}") + return results_dir + + except Exception as e: + logger.error(f"Error collecting resource logs: {e}", exc_info=True) + return None + +def collect_logs_on_test_failure(test_name, metrics_collector=None, driver_pod_name=None, failed_resources=None): + """ + Collect logs when a test fails, and include metrics if available + + Args: + test_name: Name of the test + metrics_collector: Optional metrics collector instance + driver_pod_name: Name of the EFS CSI driver pod + failed_resources: Optional list of dicts with 'type', 'name', and 'namespace' keys + + Returns: + Path to the collected logs tarball + """ + logger = logging.getLogger(__name__) + logger.info(f"Test '{test_name}' failed, collecting logs") + + # Create main directory for all failure logs + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + main_dir = f"logs/{test_name}_failure_{timestamp}" + os.makedirs(main_dir, exist_ok=True) + + # Collect CSI driver logs using our own functions + driver_logs_dir = collect_logs_for_test(f"{test_name}_driver", driver_pod_name) + + # Collect logs for each failed resource if provided + if failed_resources: + resources_dir = os.path.join(main_dir, "failed_resources") + os.makedirs(resources_dir, exist_ok=True) + + for resource in failed_resources: + resource_type = resource.get("type", "unknown") + resource_name = resource.get("name", "unknown") + namespace = resource.get("namespace", "default") + + resource_logs_dir = collect_resource_logs( + resource_type=resource_type, + resource_name=resource_name, + namespace=namespace + ) + + # Copy resource logs to main directory + if resource_logs_dir and os.path.exists(resource_logs_dir): + logger.info(f"Adding {resource_type}/{resource_name} logs to failure archive") + resource_target_dir = os.path.join(resources_dir, f"{resource_type}_{resource_name}") + os.makedirs(resource_target_dir, exist_ok=True) + + # Copy all files from resource_logs_dir to resource_target_dir + for item in os.listdir(resource_logs_dir): + source = os.path.join(resource_logs_dir, item) + target = os.path.join(resource_target_dir, item) + if os.path.isdir(source): + shutil.copytree(source, target, dirs_exist_ok=True) + else: + shutil.copy2(source, target) + + # If we have a metrics collector, save its data + if metrics_collector: + try: + metrics_dir = os.path.join(main_dir, "metrics") + os.makedirs(metrics_dir, exist_ok=True) + + metrics_file = os.path.join(metrics_dir, "test_metrics.json") + with open(metrics_file, "w") as f: + import json + json.dump(metrics_collector.get_all_metrics(), f, indent=2) + + logger.info(f"Metrics saved to {metrics_file}") + except Exception as e: + logger.error(f"Error saving metrics: {e}") + + # Create tar file containing all logs + tarball_path = f"{main_dir}.tgz" + with tarfile.open(tarball_path, "w:gz") as tar: + tar.add(main_dir, arcname=os.path.basename(main_dir)) + + logger.info(f"Comprehensive failure logs collected to: {tarball_path}") + return tarball_path + +# Example of how to use this in tests +if __name__ == "__main__": + # Set up basic logging + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' + ) + + # Example usage + tarball_path = collect_logs_for_test("example_test") + print(f"Logs collected to: {tarball_path}") +# Enhanced log integration module diff --git a/test/stress-scale-tests/utils/metrics_collector.py b/test/stress-scale-tests/utils/metrics_collector.py new file mode 100644 index 000000000..419cf48c3 --- /dev/null +++ 
b/test/stress-scale-tests/utils/metrics_collector.py @@ -0,0 +1,634 @@ +import time +import psutil +import logging +import requests +import re +from datetime import datetime, timezone +from kubernetes import client, watch +from collections import defaultdict + +class MetricsCollector: + """Collect and store metrics during test execution""" + + def __init__(self): + """Initialize metrics collector""" + # Basic metrics structure + self.operations = {} + self.system_metrics = {} + self.k8s_metrics = {} + self.csi_metrics = {} + + # Controller metrics + self.controller_metrics = { + "request_latency": {}, + "operation_counts": defaultdict(int), + "success_rates": defaultdict(lambda: {"success": 0, "failure": 0}), + "volume_attach_timing": {} + } + + # Node-level metrics + self.node_metrics = { + "mount_timing": {}, + "mount_errors": defaultdict(int), + "resource_utilization": {} + } + + # EFS-specific metrics + self.efs_metrics = { + "access_point_timing": {}, + "mount_completion_timing": {}, + "api_throttling_incidents": [] + } + + # Kubernetes events + self.k8s_events = { + "volume_events": [], + "binding_times": {}, + "pod_startup_delays": {} + } + + # File performance metrics + self.file_performance = { + "iops": { + "read": defaultdict(list), # PV/PVC name -> list of read IOPS measurements + "write": defaultdict(list), # PV/PVC name -> list of write IOPS measurements + "metadata": defaultdict(list) # PV/PVC name -> list of metadata IOPS measurements + }, + "latency": { + "read": defaultdict(list), # PV/PVC name -> list of read latency measurements + "write": defaultdict(list), # PV/PVC name -> list of write latency measurements + "metadata": defaultdict(list) # PV/PVC name -> list of metadata latency measurements + }, + "throughput": { + "read": defaultdict(list), # PV/PVC name -> list of read throughput measurements + "write": defaultdict(list), # PV/PVC name -> list of write throughput measurements + }, + "operation_counts": { + "read": defaultdict(int), # PV/PVC name -> count of read operations + "write": defaultdict(int), # PV/PVC name -> count of write operations + "metadata": defaultdict(int) # PV/PVC name -> count of metadata operations + }, + "measurement_windows": defaultdict(list) # PV/PVC name -> list of measurement timestamps + } + + self.logger = logging.getLogger(__name__) + + def start_operation(self, name=None): + """Start timing an operation + + Args: + name: Name of the operation (optional) + """ + op_id = name or f"op_{len(self.operations) + 1}" + self.operations[op_id] = { + "start_time": time.time(), + "samples": [] + } + self._collect_system_metrics(op_id) + return op_id + + def end_operation(self, op_id): + """End timing an operation and return duration + + Args: + op_id: Operation ID or name + + Returns: + Duration in seconds + """ + if op_id not in self.operations: + self.logger.warning(f"Operation {op_id} not found") + return 0 + + self.operations[op_id]["end_time"] = time.time() + self.operations[op_id]["duration"] = ( + self.operations[op_id]["end_time"] - + self.operations[op_id]["start_time"] + ) + self._collect_system_metrics(op_id, end=True) + + return self.operations[op_id]["duration"] + + def add_sample(self, op_id, metrics): + """Add a sample to an operation + + Args: + op_id: Operation ID or name + metrics: Dictionary of metrics to add + """ + if op_id not in self.operations: + self.logger.warning(f"Operation {op_id} not found") + return + + sample = { + "timestamp": time.time(), + "metrics": metrics + } + 
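+        # Illustrative flow: op = collector.start_operation("create_pvc"), then
+        # collector.add_sample(op, {...}) any number of times, and finally
+        # collector.end_operation(op); get_operation_metrics(op) later returns
+        # the samples together with the recorded start/end times and duration.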
self.operations[op_id]["samples"].append(sample) + + def _collect_system_metrics(self, op_id, end=False): + """Collect system metrics + + Args: + op_id: Operation ID or name + end: Whether this is the end of an operation + """ + prefix = "end_" if end else "start_" + + # Collect CPU, memory, disk I/O metrics + cpu_percent = psutil.cpu_percent(interval=0.1) + memory = psutil.virtual_memory() + disk_io = psutil.disk_io_counters() + + metrics = { + f"{prefix}cpu_percent": cpu_percent, + f"{prefix}memory_percent": memory.percent, + f"{prefix}disk_read_bytes": disk_io.read_bytes, + f"{prefix}disk_write_bytes": disk_io.write_bytes + } + + if op_id not in self.system_metrics: + self.system_metrics[op_id] = {} + + self.system_metrics[op_id].update(metrics) + + def collect_csi_metrics(self, config=None): + """Collect CSI driver metrics + + Args: + config: Configuration dictionary with metrics settings + """ + if not config: + return + + if not config.get('metrics_collection', {}).get('enabled', False): + return + + ports = config.get('metrics_collection', {}).get('controller_ports', [8080, 8081]) + + # Get EFS CSI controller pod + try: + kube_client = client.CoreV1Api() + pods = kube_client.list_namespaced_pod( + namespace="kube-system", + label_selector="app=efs-csi-controller" + ) + + if not pods.items: + self.logger.warning("No EFS CSI controller pods found") + return + + controller_pod = pods.items[0] + pod_name = controller_pod.metadata.name + + # Port-forward to the controller pod + for port in ports: + try: + # Use kubectl port-forward in a subprocess + import subprocess + import threading + import time + + # Start port-forwarding in a separate process + process = subprocess.Popen( + ["kubectl", "port-forward", pod_name, f"{port}:{port}", "-n", "kube-system"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE + ) + + # Give it time to establish the connection + time.sleep(2) + + # Collect metrics + try: + response = requests.get(f"http://localhost:{port}/metrics", timeout=5) + if response.status_code == 200: + self.csi_metrics[f"port_{port}"] = response.text + except requests.RequestException as e: + self.logger.warning(f"Failed to collect metrics from port {port}: {e}") + + # Terminate the port-forwarding process + process.terminate() + process.wait(timeout=5) + + except Exception as e: + self.logger.warning(f"Error collecting metrics from port {port}: {e}") + + except Exception as e: + self.logger.warning(f"Error collecting CSI metrics: {e}") + + def get_operation_metrics(self, op_id): + """Get metrics for an operation + + Args: + op_id: Operation ID or name + + Returns: + Dictionary of metrics + """ + if op_id not in self.operations: + self.logger.warning(f"Operation {op_id} not found") + return {} + + metrics = self.operations[op_id].copy() + + # Add system metrics if available + if op_id in self.system_metrics: + metrics["system"] = self.system_metrics[op_id] + + return metrics + + def track_controller_request(self, operation_type, start_time, success=True): + """Track a controller request + + Args: + operation_type: Type of operation (e.g., 'create_volume', 'delete_volume') + start_time: Start time of the operation + success: Whether the operation succeeded + """ + duration = time.time() - start_time + + # Record request latency + if operation_type not in self.controller_metrics["request_latency"]: + self.controller_metrics["request_latency"][operation_type] = [] + self.controller_metrics["request_latency"][operation_type].append(duration) + + # Record operation count + 
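+        # (request_latency, operation_counts and success_rates are all exposed
+        # later through get_all_metrics() under the "controller" key)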
self.controller_metrics["operation_counts"][operation_type] += 1 + + # Record success/failure + status = "success" if success else "failure" + self.controller_metrics["success_rates"][operation_type][status] += 1 + + def track_volume_attachment(self, volume_id, start_time): + """Track volume attachment time + + Args: + volume_id: ID of the volume + start_time: Start time of the attachment + """ + duration = time.time() - start_time + self.controller_metrics["volume_attach_timing"][volume_id] = duration + + def track_mount_operation(self, node_name, pod_name, start_time, success=True): + """Track mount operation time + + Args: + node_name: Name of the node + pod_name: Name of the pod + start_time: Start time of the mount operation + success: Whether the operation succeeded + """ + duration = time.time() - start_time + + if node_name not in self.node_metrics["mount_timing"]: + self.node_metrics["mount_timing"][node_name] = {} + + self.node_metrics["mount_timing"][node_name][pod_name] = duration + + if not success: + self.node_metrics["mount_errors"][node_name] += 1 + + def track_node_resources(self, node_name, cpu_percent, memory_percent): + """Track node resource utilization + + Args: + node_name: Name of the node + cpu_percent: CPU utilization percentage + memory_percent: Memory utilization percentage + """ + if node_name not in self.node_metrics["resource_utilization"]: + self.node_metrics["resource_utilization"][node_name] = [] + + self.node_metrics["resource_utilization"][node_name].append({ + "timestamp": time.time(), + "cpu_percent": cpu_percent, + "memory_percent": memory_percent + }) + + def track_access_point_creation(self, access_point_id, start_time): + """Track access point creation time + + Args: + access_point_id: ID of the access point + start_time: Start time of the creation + """ + duration = time.time() - start_time + self.efs_metrics["access_point_timing"][access_point_id] = duration + + def track_mount_completion(self, pod_name, pvc_name, start_time): + """Track mount completion time + + Args: + pod_name: Name of the pod + pvc_name: Name of the PVC + start_time: Start time of the mount operation + """ + duration = time.time() - start_time + + if pod_name not in self.efs_metrics["mount_completion_timing"]: + self.efs_metrics["mount_completion_timing"][pod_name] = {} + + self.efs_metrics["mount_completion_timing"][pod_name][pvc_name] = duration + + def record_api_throttling(self, operation_type, error_message): + """Record API throttling incident + + Args: + operation_type: Type of operation that was throttled + error_message: Error message from the API + """ + self.efs_metrics["api_throttling_incidents"].append({ + "timestamp": time.time(), + "operation_type": operation_type, + "error_message": error_message + }) + + def collect_volume_events(self, namespace="default"): + """Collect volume-related Kubernetes events + + Args: + namespace: Kubernetes namespace to collect events from + """ + try: + kube_client = client.CoreV1Api() + events = kube_client.list_namespaced_event(namespace=namespace) + + for event in events.items: + if event.involved_object.kind in ["PersistentVolume", "PersistentVolumeClaim"]: + self.k8s_events["volume_events"].append({ + "timestamp": time.time(), + "name": event.involved_object.name, + "kind": event.involved_object.kind, + "reason": event.reason, + "message": event.message, + "count": event.count + }) + except Exception as e: + self.logger.warning(f"Error collecting volume events: {e}") + + def track_pv_pvc_binding(self, pvc_name, pv_name, 
bind_time): + """Track PV-PVC binding time + + Args: + pvc_name: Name of the PVC + pv_name: Name of the PV + bind_time: Time taken for binding in seconds + """ + self.k8s_events["binding_times"][f"{pvc_name}-{pv_name}"] = bind_time + + def track_pod_startup_delay(self, pod_name, create_time, ready_time): + """Track pod startup delay + + Args: + pod_name: Name of the pod + create_time: Time when the pod was created + ready_time: Time when the pod became ready + """ + delay = ready_time - create_time + self.k8s_events["pod_startup_delays"][pod_name] = delay + + def parse_prometheus_metrics(self, metrics_text): + """Parse Prometheus metrics from the CSI driver + + Args: + metrics_text: Raw Prometheus metrics text + + Returns: + Dictionary of parsed metrics + """ + parsed_metrics = {} + + if not metrics_text: + return parsed_metrics + + # Simple regex pattern to extract metrics + pattern = r'^([a-zA-Z_:][a-zA-Z0-9_:]*)\s*({[^}]*})?\s*([0-9.eE+-]+)' + + for line in metrics_text.split('\n'): + line = line.strip() + if not line or line.startswith('#'): + continue + + match = re.match(pattern, line) + if match: + metric_name = match.group(1) + labels = match.group(2) or "" + value = float(match.group(3)) + + if metric_name not in parsed_metrics: + parsed_metrics[metric_name] = [] + + parsed_metrics[metric_name].append({ + "labels": labels, + "value": value + }) + + return parsed_metrics + + def track_file_operation_iops(self, pv_pvc_name, operation_type, operations_count, duration_seconds): + """Track IOPS for a file operation + + Args: + pv_pvc_name: Name of the PV or PVC + operation_type: Type of operation ('read', 'write', 'metadata') + operations_count: Number of operations performed + duration_seconds: Duration of the measurement in seconds + """ + if operation_type not in ['read', 'write', 'metadata']: + self.logger.warning(f"Invalid operation type: {operation_type}. Must be 'read', 'write', or 'metadata'") + return + + # Calculate IOPS + if duration_seconds > 0: + iops = operations_count / duration_seconds + else: + iops = 0 + + # Update the operation counts + self.file_performance["operation_counts"][operation_type][pv_pvc_name] += operations_count + + # Record the IOPS measurement + self.file_performance["iops"][operation_type][pv_pvc_name].append(iops) + + # Record measurement timestamp + self.file_performance["measurement_windows"][pv_pvc_name].append({ + "timestamp": time.time(), + "operation_type": operation_type, + "duration_seconds": duration_seconds + }) + + self.logger.debug(f"Recorded {operation_type} IOPS: {iops} for {pv_pvc_name}") + + def track_file_operation_latency(self, pv_pvc_name, operation_type, latency_seconds): + """Track latency for a file operation + + Args: + pv_pvc_name: Name of the PV or PVC + operation_type: Type of operation ('read', 'write', 'metadata') + latency_seconds: Latency of the operation in seconds + """ + if operation_type not in ['read', 'write', 'metadata']: + self.logger.warning(f"Invalid operation type: {operation_type}. 
Must be 'read', 'write', or 'metadata'") + return + + # Record the latency measurement + self.file_performance["latency"][operation_type][pv_pvc_name].append(latency_seconds) + + self.logger.debug(f"Recorded {operation_type} latency: {latency_seconds}s for {pv_pvc_name}") + + def track_file_operation_throughput(self, pv_pvc_name, operation_type, bytes_transferred, duration_seconds): + """Track throughput for a file operation + + Args: + pv_pvc_name: Name of the PV or PVC + operation_type: Type of operation ('read', 'write') + bytes_transferred: Number of bytes transferred + duration_seconds: Duration of the transfer in seconds + """ + if operation_type not in ['read', 'write']: + self.logger.warning(f"Invalid operation type: {operation_type}. Must be 'read' or 'write'") + return + + # Calculate throughput (MB/s) + if duration_seconds > 0: + throughput_mbps = (bytes_transferred / 1024 / 1024) / duration_seconds + else: + throughput_mbps = 0 + + # Record the throughput measurement + self.file_performance["throughput"][operation_type][pv_pvc_name].append(throughput_mbps) + + self.logger.debug(f"Recorded {operation_type} throughput: {throughput_mbps} MB/s for {pv_pvc_name}") + + def calculate_latency_percentiles(self, pv_pvc_name, operation_type): + """Calculate percentiles for latency measurements + + Args: + pv_pvc_name: Name of the PV or PVC + operation_type: Type of operation ('read', 'write', 'metadata') + + Returns: + Dictionary with p50, p95, p99 percentiles + """ + if operation_type not in ['read', 'write', 'metadata']: + self.logger.warning(f"Invalid operation type: {operation_type}. Must be 'read', 'write', or 'metadata'") + return {} + + latencies = self.file_performance["latency"][operation_type].get(pv_pvc_name, []) + + if not latencies: + return { + "p50": None, + "p95": None, + "p99": None + } + + # Sort latencies for percentile calculation + sorted_latencies = sorted(latencies) + n = len(sorted_latencies) + + p50_idx = int(n * 0.5) + p95_idx = int(n * 0.95) + p99_idx = int(n * 0.99) + + return { + "p50": sorted_latencies[p50_idx], + "p95": sorted_latencies[p95_idx], + "p99": sorted_latencies[p99_idx] + } + + def calculate_average_iops(self, pv_pvc_name, operation_type): + """Calculate average IOPS for a specific PV/PVC and operation type + + Args: + pv_pvc_name: Name of the PV or PVC + operation_type: Type of operation ('read', 'write', 'metadata') + + Returns: + Average IOPS value or None if no measurements + """ + if operation_type not in ['read', 'write', 'metadata']: + self.logger.warning(f"Invalid operation type: {operation_type}. Must be 'read', 'write', or 'metadata'") + return None + + iops_values = self.file_performance["iops"][operation_type].get(pv_pvc_name, []) + + if not iops_values: + return None + + return sum(iops_values) / len(iops_values) + + def calculate_average_throughput(self, pv_pvc_name, operation_type): + """Calculate average throughput for a specific PV/PVC and operation type + + Args: + pv_pvc_name: Name of the PV or PVC + operation_type: Type of operation ('read', 'write') + + Returns: + Average throughput in MB/s or None if no measurements + """ + if operation_type not in ['read', 'write']: + self.logger.warning(f"Invalid operation type: {operation_type}. 
Must be 'read' or 'write'") + return None + + throughput_values = self.file_performance["throughput"][operation_type].get(pv_pvc_name, []) + + if not throughput_values: + return None + + return sum(throughput_values) / len(throughput_values) + + def get_all_metrics(self): + """Get all collected metrics + + Returns: + Dictionary of all metrics + """ + # Calculate summarized file performance metrics + file_performance_summary = { + "by_volume": {} + } + + # Collect all unique PV/PVC names + all_volumes = set() + for op_type in ['read', 'write', 'metadata']: + all_volumes.update(self.file_performance["iops"][op_type].keys()) + all_volumes.update(self.file_performance["latency"][op_type].keys()) + + for pv_pvc_name in all_volumes: + file_performance_summary["by_volume"][pv_pvc_name] = { + "iops": { + "read": self.calculate_average_iops(pv_pvc_name, "read"), + "write": self.calculate_average_iops(pv_pvc_name, "write"), + "metadata": self.calculate_average_iops(pv_pvc_name, "metadata") + }, + "throughput": { + "read": self.calculate_average_throughput(pv_pvc_name, "read"), + "write": self.calculate_average_throughput(pv_pvc_name, "write") + }, + "latency": { + "read": self.calculate_latency_percentiles(pv_pvc_name, "read"), + "write": self.calculate_latency_percentiles(pv_pvc_name, "write"), + "metadata": self.calculate_latency_percentiles(pv_pvc_name, "metadata") + }, + "operation_counts": { + "read": self.file_performance["operation_counts"]["read"].get(pv_pvc_name, 0), + "write": self.file_performance["operation_counts"]["write"].get(pv_pvc_name, 0), + "metadata": self.file_performance["operation_counts"]["metadata"].get(pv_pvc_name, 0) + } + } + + return { + "operations": self.operations, + "system": self.system_metrics, + "kubernetes": self.k8s_metrics, + "csi": self.csi_metrics, + "controller": self.controller_metrics, + "node": self.node_metrics, + "efs": self.efs_metrics, + "k8s_events": self.k8s_events, + "file_performance": file_performance_summary + } +# Enhanced modular implementation for metrics collection diff --git a/test/stress-scale-tests/utils/report_generator.py b/test/stress-scale-tests/utils/report_generator.py new file mode 100644 index 000000000..2c07c5915 --- /dev/null +++ b/test/stress-scale-tests/utils/report_generator.py @@ -0,0 +1,336 @@ +import json +import os +import datetime +import platform +import psutil +import yaml +from pathlib import Path +import socket +import subprocess + +class ReportGenerator: + """Generate detailed test reports in various formats""" + + def __init__(self, output_dir="reports"): + """Initialize report generator + + Args: + output_dir: Base directory to store reports + """ + self.base_output_dir = output_dir + Path(output_dir).mkdir(parents=True, exist_ok=True) + + def _get_output_dir(self, test_type): + """Get the output directory for a specific test type + + Args: + test_type: Type of test (e.g., 'stress', 'scalability') + + Returns: + Path to the output directory + """ + output_dir = os.path.join(self.base_output_dir, test_type) + Path(output_dir).mkdir(parents=True, exist_ok=True) + return output_dir + + def _determine_test_type(self, test_results): + """Determine the test type from the results structure + + Args: + test_results: Dictionary containing test results + + Returns: + Test type string + """ + # Check for orchestrator results (operations, scenarios, etc.) 
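+        # As written this always resolves to "orchestrator", so summary reports
+        # fall through to _write_generic_results(); the scalability/stress/
+        # access_points/statefulset writers further down are only reached if the
+        # detection here is extended (e.g. keying on "pod_scaling" in the results).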
+ if "operations" in test_results and "scenarios" in test_results: + return "orchestrator" + + # Default to orchestrator if nothing else matches + return "orchestrator" + + def _collect_system_info(self): + """Collect system information for the report + + Returns: + Dictionary with system information + """ + system_info = { + "hostname": socket.gethostname(), + "platform": platform.platform(), + "python_version": platform.python_version(), + "cpu_count": psutil.cpu_count(), + "memory_total_gb": round(psutil.virtual_memory().total / (1024**3), 2), + "timestamp": datetime.datetime.now().isoformat() + } + + # Try to get Kubernetes cluster info + try: + kubectl_version = subprocess.check_output(["kubectl", "version", "--short"], + stderr=subprocess.STDOUT).decode('utf-8') + system_info["kubernetes_version"] = kubectl_version.strip() + except (subprocess.SubprocessError, FileNotFoundError): + system_info["kubernetes_version"] = "Unknown" + + # Try to get AWS region + try: + with open('config/test_config.yaml', 'r') as f: + config = yaml.safe_load(f) + system_info["aws_region"] = config.get('cluster', {}).get('region', 'Unknown') + system_info["efs_filesystem_id"] = config.get('efs', {}).get('filesystem_id', 'Unknown') + except Exception: + system_info["aws_region"] = "Unknown" + system_info["efs_filesystem_id"] = "Unknown" + + return system_info + + def generate_json_report(self, test_results, test_name): + """Generate detailed JSON report + + Args: + test_results: Dictionary containing test results + test_name: Name of the test + + Returns: + Path to the generated report + """ + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + + # Determine test type from results structure + test_type = self._determine_test_type(test_results) + + output_dir = self._get_output_dir(test_type) + filename = f"{test_name}_{timestamp}.json" + filepath = os.path.join(output_dir, filename) + + # Add metadata and system info + report = { + "test_name": test_name, + "test_type": test_type, + "timestamp": timestamp, + "system_info": self._collect_system_info(), + "results": test_results + } + + with open(filepath, 'w') as f: + json.dump(report, f, indent=2) + + return filepath + + def generate_summary_report(self, test_results, test_name): + """Generate a detailed human-readable summary report + + Args: + test_results: Dictionary containing test results + test_name: Name of the test + + Returns: + Path to the generated report + """ + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + + # Determine test type from results structure + test_type = self._determine_test_type(test_results) + + output_dir = self._get_output_dir(test_type) + filename = f"{test_name}_{timestamp}_summary.txt" + filepath = os.path.join(output_dir, filename) + + system_info = self._collect_system_info() + + with open(filepath, 'w') as f: + # Write header + f.write(f"{'='*80}\n") + f.write(f"EFS CSI DRIVER TEST REPORT: {test_name.upper()}\n") + f.write(f"{'='*80}\n\n") + + # Write system information + f.write("SYSTEM INFORMATION\n") + f.write(f"{'-'*80}\n") + f.write(f"Hostname: {system_info['hostname']}\n") + f.write(f"Platform: {system_info['platform']}\n") + f.write(f"Python Version: {system_info['python_version']}\n") + f.write(f"CPU Count: {system_info['cpu_count']}\n") + f.write(f"Memory Total: {system_info['memory_total_gb']} GB\n") + f.write(f"Kubernetes Version: {system_info['kubernetes_version']}\n") + f.write(f"AWS Region: {system_info['aws_region']}\n") + f.write(f"EFS Filesystem ID: 
{system_info['efs_filesystem_id']}\n") + f.write(f"Test Timestamp: {system_info['timestamp']}\n\n") + + # Write test results + f.write("TEST RESULTS\n") + f.write(f"{'-'*80}\n") + + # Process different test types + if test_type == "scalability": + self._write_scalability_results(f, test_results) + elif test_type == "stress": + self._write_stress_results(f, test_results) + elif test_type == "access_points": + self._write_access_point_results(f, test_results) + elif test_type == "statefulset": + self._write_statefulset_results(f, test_results) + else: + self._write_generic_results(f, test_results) + + # Write footer + f.write(f"\n{'='*80}\n") + f.write(f"END OF REPORT: {datetime.datetime.now().isoformat()}\n") + f.write(f"{'='*80}\n") + + return filepath + + def _write_scalability_results(self, file, results): + """Write scalability test results to the report file""" + if "pod_scaling" in results: + file.write("\nPOD SCALING TEST RESULTS\n") + file.write(f"{'-'*40}\n") + + pod_results = results["pod_scaling"] + if isinstance(pod_results, dict): + # Sort by scale (number of pods) + for scale in sorted([int(k) for k in pod_results.keys()]): + data = pod_results[scale] + success = data.get('success', False) + duration = data.get('duration', 0) + pods_ready = data.get('pods_ready', 0) + + file.write(f"Scale: {scale} pods\n") + file.write(f" Status: {'SUCCESS' if success else 'FAILED'}\n") + file.write(f" Duration: {duration:.2f} seconds\n") + file.write(f" Pods Ready: {pods_ready}\n") + if duration > 0: + file.write(f" Scale Rate: {scale/duration:.2f} pods/second\n\n") + else: + file.write(f"Error: {pod_results}\n\n") + + if "volume_scaling" in results: + file.write("\nVOLUME SCALING TEST RESULTS\n") + file.write(f"{'-'*40}\n") + + volume_results = results["volume_scaling"] + if isinstance(volume_results, dict): + # Sort by scale (number of volumes) + for scale in sorted([int(k) for k in volume_results.keys()]): + data = volume_results[scale] + success = data.get('success', False) + duration = data.get('duration', 0) + volumes_ready = data.get('volumes_ready', 0) + + file.write(f"Scale: {scale} volumes\n") + file.write(f" Status: {'SUCCESS' if success else 'FAILED'}\n") + file.write(f" Duration: {duration:.2f} seconds\n") + file.write(f" Volumes Ready: {volumes_ready}\n") + if duration > 0: + file.write(f" Scale Rate: {scale/duration:.2f} volumes/second\n\n") + else: + file.write(f"Error: {volume_results}\n\n") + + def _write_stress_results(self, file, results): + """Write stress test results to the report file""" + for test_name, result in results.items(): + file.write(f"\n{test_name.upper()} RESULTS\n") + file.write(f"{'-'*40}\n") + + if isinstance(result, dict): + if "sequential_write" in result: + file.write("\nSequential Write Test:\n") + seq_result = result["sequential_write"] + file.write(f" Status: {seq_result.get('status', 'unknown')}\n") + file.write(f" Duration: {seq_result.get('duration', 'N/A')} seconds\n") + + # Add detailed metrics if available + if "pod_metrics" in seq_result: + file.write(" Pod Metrics:\n") + for i, metric in enumerate(seq_result["pod_metrics"]): + file.write(f" Sample {i+1}: Phase={metric.get('phase', 'unknown')}\n") + + if "random_write" in result: + file.write("\nRandom Write Test:\n") + rand_result = result["random_write"] + file.write(f" Status: {rand_result.get('status', 'unknown')}\n") + file.write(f" Duration: {rand_result.get('duration', 'N/A')} seconds\n") + + # Add detailed metrics if available + if "pod_metrics" in rand_result: + file.write(" Pod 
Metrics:\n") + for i, metric in enumerate(rand_result["pod_metrics"]): + file.write(f" Sample {i+1}: Phase={metric.get('phase', 'unknown')}\n") + + if "mixed_io" in result: + file.write("\nMixed I/O Test:\n") + mixed_result = result["mixed_io"] + file.write(f" Status: {mixed_result.get('status', 'unknown')}\n") + file.write(f" Duration: {mixed_result.get('duration', 'N/A')} seconds\n") + + # Add detailed metrics if available + if "pod_metrics" in mixed_result: + file.write(" Pod Metrics:\n") + for i, metric in enumerate(mixed_result["pod_metrics"]): + file.write(f" Sample {i+1}: Phase={metric.get('phase', 'unknown')}\n") + else: + file.write(f"Error: {result}\n") + + def _write_access_point_results(self, file, results): + """Write access point test results to the report file""" + if "access_point_scaling" in results: + file.write("\nACCESS POINT SCALING TEST RESULTS\n") + file.write(f"{'-'*40}\n") + + ap_results = results["access_point_scaling"] + if isinstance(ap_results, dict): + # Sort by scale (number of access points) + for scale in sorted([int(k) for k in ap_results.keys()]): + data = ap_results[scale] + success = data.get('success', False) + duration = data.get('duration', 0) + aps_ready = data.get('access_points_ready', 0) + + file.write(f"Scale: {scale} access points\n") + file.write(f" Status: {'SUCCESS' if success else 'FAILED'}\n") + file.write(f" Duration: {duration:.2f} seconds\n") + file.write(f" Access Points Ready: {aps_ready}\n") + if duration > 0: + file.write(f" Scale Rate: {scale/duration:.2f} access points/second\n\n") + else: + file.write(f"Error: {ap_results}\n\n") + + def _write_statefulset_results(self, file, results): + """Write StatefulSet test results to the report file""" + if "statefulset_scaling" in results: + file.write("\nSTATEFULSET SCALING TEST RESULTS\n") + file.write(f"{'-'*40}\n") + + sts_result = results["statefulset_scaling"] + if isinstance(sts_result, dict): + replicas = sts_result.get('replicas', 0) + success = sts_result.get('success', False) + duration = sts_result.get('duration', 0) + pods_ready = sts_result.get('pods_ready', 0) + + file.write(f"Replicas: {replicas}\n") + file.write(f"Status: {'SUCCESS' if success else 'FAILED'}\n") + file.write(f"Duration: {duration:.2f} seconds\n") + file.write(f"Pods Ready: {pods_ready}\n") + if duration > 0: + file.write(f"Scale Rate: {replicas/duration:.2f} pods/second\n\n") + else: + file.write(f"Error: {sts_result}\n\n") + + def _write_generic_results(self, file, results): + """Write generic test results to the report file""" + for test_name, result in results.items(): + file.write(f"\n{test_name.upper()}\n") + file.write(f"{'-'*40}\n") + + if isinstance(result, dict): + for key, value in result.items(): + if isinstance(value, dict): + file.write(f"{key}:\n") + for sub_key, sub_value in value.items(): + file.write(f" {sub_key}: {sub_value}\n") + else: + file.write(f"{key}: {value}\n") + else: + file.write(f"{result}\n") +# Enhanced report generator module
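# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the patch above): feeding a results
# dictionary of the shape built by the orchestrator's _generate_report() into
# ReportGenerator. The sample values and the import path (assumed to be run
# from test/stress-scale-tests/) are illustrative assumptions only.
from utils.report_generator import ReportGenerator

sample_results = {
    "operations": {"create_pvc": {"success": 10, "fail": 0, "success_rate": 100.0}},
    "scenarios": {"controller_crash": {"runs": 1, "success": 1, "fail": 0, "success_rate": 100.0}},
}

generator = ReportGenerator(output_dir="reports")
# The "operations" + "scenarios" keys make _determine_test_type() pick
# "orchestrator", so both reports are written under reports/orchestrator/.
json_path = generator.generate_json_report(sample_results, "orchestrator_run")
summary_path = generator.generate_summary_report(sample_results, "orchestrator_run")
print(f"JSON report: {json_path}")
print(f"Summary report: {summary_path}")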