From 070f1bea2b35e15adfa2c18a85c4976505634a89 Mon Sep 17 00:00:00 2001 From: kryanbeane Date: Tue, 20 May 2025 09:21:43 +0100 Subject: [PATCH 1/5] test running only failing test --- .github/workflows/e2e_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index fca6d6e7..d100241c 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -117,7 +117,7 @@ jobs: pip install poetry poetry install --with test,docs echo "Running e2e tests..." - poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1 + poetry run pytest -v -s ./tests/e2e/local_interactive_sdk_kind_test.py > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1 env: GRPC_DNS_RESOLVER: "native" From d6a07f3bd64b206176022b19379cd7b4ed272483 Mon Sep 17 00:00:00 2001 From: kryanbeane Date: Tue, 20 May 2025 09:54:14 +0100 Subject: [PATCH 2/5] add excessive amounts of logs --- tests/e2e/local_interactive_sdk_kind_test.py | 87 ++++++++++++++++++-- 1 file changed, 82 insertions(+), 5 deletions(-) diff --git a/tests/e2e/local_interactive_sdk_kind_test.py b/tests/e2e/local_interactive_sdk_kind_test.py index c20fd879..3d2365aa 100644 --- a/tests/e2e/local_interactive_sdk_kind_test.py +++ b/tests/e2e/local_interactive_sdk_kind_test.py @@ -8,37 +8,54 @@ import pytest import ray import math +import logging +import time +import os from support import * +# Configure logging +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger(__name__) + @pytest.mark.kind class TestRayLocalInteractiveOauth: def setup_method(self): + logger.info("Setting up test environment...") initialize_kubernetes_client(self) + logger.info("Kubernetes client initialized") def teardown_method(self): + logger.info("Cleaning up test environment...") delete_namespace(self) delete_kueue_resources(self) + logger.info("Cleanup completed") def test_local_interactives(self): + logger.info("Starting test_local_interactives...") self.setup_method() create_namespace(self) create_kueue_resources(self) self.run_local_interactives() + logger.info("test_local_interactives completed") @pytest.mark.nvidia_gpu def test_local_interactives_nvidia_gpu(self): + logger.info("Starting test_local_interactives_nvidia_gpu...") self.setup_method() create_namespace(self) create_kueue_resources(self) self.run_local_interactives(number_of_gpus=1) + logger.info("test_local_interactives_nvidia_gpu completed") def run_local_interactives( self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0 ): cluster_name = "test-ray-cluster-li" + logger.info(f"Starting run_local_interactives with {number_of_gpus} GPUs") + logger.info("Creating cluster configuration...") cluster = Cluster( ClusterConfiguration( name=cluster_name, @@ -57,37 +74,97 @@ def run_local_interactives( verify_tls=False, ) ) + logger.info("Cluster configuration created") + + logger.info("Starting cluster deployment...") cluster.up() + logger.info("Cluster deployment initiated") + + logger.info("Waiting for cluster to be ready...") cluster.wait_ready() + logger.info("Cluster is ready") + logger.info("Generating TLS certificates...") generate_cert.generate_tls_cert(cluster_name, self.namespace) + logger.info("TLS certificates generated") + + logger.info("Exporting environment variables...") generate_cert.export_env(cluster_name, self.namespace) + logger.info("Environment variables exported") + + client_url = cluster.local_client_url() + logger.info(f"Ray client URL: {client_url}") - print(cluster.local_client_url()) + logger.info("Checking cluster status...") + status = cluster.status() + logger.info(f"Cluster status: {status}") + logger.info("Checking cluster dashboard URI...") + dashboard_uri = cluster.cluster_dashboard_uri() + logger.info(f"Dashboard URI: {dashboard_uri}") + + logger.info("Checking cluster URI...") + cluster_uri = cluster.cluster_uri() + logger.info(f"Cluster URI: {cluster_uri}") + + logger.info("Shutting down any existing Ray connections...") ray.shutdown() - ray.init(address=cluster.local_client_url(), logging_level="DEBUG") + logger.info("Ray shutdown completed") + + logger.info("Initializing Ray connection...") + try: + ray.init(address=client_url, logging_level="DEBUG") + logger.info("Ray initialization successful") + except Exception as e: + logger.error(f"Ray initialization failed: {str(e)}") + logger.error(f"Error type: {type(e)}") + raise + + logger.info("Defining Ray remote functions...") @ray.remote(num_gpus=number_of_gpus / 2) def heavy_calculation_part(num_iterations): + logger.info( + f"Starting heavy_calculation_part with {num_iterations} iterations" + ) result = 0.0 for i in range(num_iterations): for j in range(num_iterations): for k in range(num_iterations): result += math.sin(i) * math.cos(j) * math.tan(k) + logger.info("heavy_calculation_part completed") return result @ray.remote(num_gpus=number_of_gpus / 2) def heavy_calculation(num_iterations): + logger.info(f"Starting heavy_calculation with {num_iterations} iterations") results = ray.get( [heavy_calculation_part.remote(num_iterations // 30) for _ in range(30)] ) + logger.info("heavy_calculation completed") return sum(results) + logger.info("Submitting calculation task...") ref = heavy_calculation.remote(3000) - result = ray.get(ref) - assert result == 1789.4644387076714 - ray.cancel(ref) + logger.info("Task submitted, waiting for result...") + + try: + result = ray.get(ref) + logger.info(f"Calculation completed with result: {result}") + assert result == 1789.4644387076714 + logger.info("Result assertion passed") + except Exception as e: + logger.error(f"Error during calculation: {str(e)}") + raise + finally: + logger.info("Cancelling task reference...") + ray.cancel(ref) + logger.info("Task cancelled") + + logger.info("Shutting down Ray...") ray.shutdown() + logger.info("Ray shutdown completed") + logger.info("Tearing down cluster...") cluster.down() + logger.info("Cluster teardown completed") From 6eeb49cdc7b3937481c90cb9eba33c3dfdb79a00 Mon Sep 17 00:00:00 2001 From: kryanbeane Date: Tue, 20 May 2025 10:51:41 +0100 Subject: [PATCH 3/5] test certificate changes --- .../common/utils/generate_cert.py | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/codeflare_sdk/common/utils/generate_cert.py b/src/codeflare_sdk/common/utils/generate_cert.py index 7c072da0..3a98b4e5 100644 --- a/src/codeflare_sdk/common/utils/generate_cert.py +++ b/src/codeflare_sdk/common/utils/generate_cert.py @@ -240,12 +240,27 @@ def export_env(cluster_name, namespace): Environment Variables Set: - RAY_USE_TLS: Enables TLS for Ray. - - RAY_TLS_SERVER_CERT: Path to the TLS server certificate. - - RAY_TLS_SERVER_KEY: Path to the TLS server private key. - RAY_TLS_CA_CERT: Path to the CA certificate. """ + # Assuming logger is configured elsewhere or add basicConfig here for the module + # import logging + # logger = logging.getLogger(__name__) + # logging.basicConfig(level=logging.INFO) # Or use existing logger if available + tls_dir = os.path.join(os.getcwd(), f"tls-{cluster_name}-{namespace}") os.environ["RAY_USE_TLS"] = "1" - os.environ["RAY_TLS_SERVER_CERT"] = os.path.join(tls_dir, "tls.crt") - os.environ["RAY_TLS_SERVER_KEY"] = os.path.join(tls_dir, "tls.key") + # os.environ["RAY_TLS_SERVER_CERT"] = os.path.join(tls_dir, "tls.crt") # Client usually doesn't need to present a server cert + # os.environ["RAY_TLS_SERVER_KEY"] = os.path.join(tls_dir, "tls.key") # Client usually doesn't need to present a server key + if "RAY_TLS_SERVER_CERT" in os.environ: + del os.environ["RAY_TLS_SERVER_CERT"] + if "RAY_TLS_SERVER_KEY" in os.environ: + del os.environ["RAY_TLS_SERVER_KEY"] os.environ["RAY_TLS_CA_CERT"] = os.path.join(tls_dir, "ca.crt") + + # It's better to use a logger instance if this module has one, + # otherwise, these prints will go to stdout. + # For now, using print for visibility in test logs if logger isn't set up in this exact scope. + print(f"generate_cert.export_env: RAY_USE_TLS set to: {os.environ.get('RAY_USE_TLS')}") + print(f"generate_cert.export_env: RAY_TLS_CA_CERT set to: {os.environ.get('RAY_TLS_CA_CERT')}") + print(f"generate_cert.export_env: RAY_TLS_SERVER_CERT is: {os.environ.get('RAY_TLS_SERVER_CERT')}") + print(f"generate_cert.export_env: RAY_TLS_SERVER_KEY is: {os.environ.get('RAY_TLS_SERVER_KEY')}") From 440fac259e8534c9da1f1f472e3abc335c91b486 Mon Sep 17 00:00:00 2001 From: kryanbeane Date: Tue, 20 May 2025 11:04:02 +0100 Subject: [PATCH 4/5] Revert "test certificate changes" This reverts commit 6eeb49cdc7b3937481c90cb9eba33c3dfdb79a00. --- .github/workflows/e2e_tests.yaml | 2 +- .../common/utils/generate_cert.py | 22 +++++++------------ 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index d100241c..26246c67 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -117,7 +117,7 @@ jobs: pip install poetry poetry install --with test,docs echo "Running e2e tests..." - poetry run pytest -v -s ./tests/e2e/local_interactive_sdk_kind_test.py > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1 + poetry run pytest -v -s --log-cli-level=INFO ./tests/e2e/local_interactive_sdk_kind_test.py > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1 env: GRPC_DNS_RESOLVER: "native" diff --git a/src/codeflare_sdk/common/utils/generate_cert.py b/src/codeflare_sdk/common/utils/generate_cert.py index 3a98b4e5..a0b4f8cd 100644 --- a/src/codeflare_sdk/common/utils/generate_cert.py +++ b/src/codeflare_sdk/common/utils/generate_cert.py @@ -240,27 +240,21 @@ def export_env(cluster_name, namespace): Environment Variables Set: - RAY_USE_TLS: Enables TLS for Ray. + - RAY_TLS_SERVER_CERT: Path to the TLS server certificate. + - RAY_TLS_SERVER_KEY: Path to the TLS server private key. - RAY_TLS_CA_CERT: Path to the CA certificate. + - RAY_CLIENT_SKIP_TLS_VERIFY: Skips TLS verification by the client. """ - # Assuming logger is configured elsewhere or add basicConfig here for the module - # import logging - # logger = logging.getLogger(__name__) - # logging.basicConfig(level=logging.INFO) # Or use existing logger if available - tls_dir = os.path.join(os.getcwd(), f"tls-{cluster_name}-{namespace}") os.environ["RAY_USE_TLS"] = "1" - # os.environ["RAY_TLS_SERVER_CERT"] = os.path.join(tls_dir, "tls.crt") # Client usually doesn't need to present a server cert - # os.environ["RAY_TLS_SERVER_KEY"] = os.path.join(tls_dir, "tls.key") # Client usually doesn't need to present a server key - if "RAY_TLS_SERVER_CERT" in os.environ: - del os.environ["RAY_TLS_SERVER_CERT"] - if "RAY_TLS_SERVER_KEY" in os.environ: - del os.environ["RAY_TLS_SERVER_KEY"] + os.environ["RAY_TLS_SERVER_CERT"] = os.path.join(tls_dir, "tls.crt") + os.environ["RAY_TLS_SERVER_KEY"] = os.path.join(tls_dir, "tls.key") os.environ["RAY_TLS_CA_CERT"] = os.path.join(tls_dir, "ca.crt") + os.environ["RAY_CLIENT_SKIP_TLS_VERIFY"] = "1" # Skip verification for E2E - # It's better to use a logger instance if this module has one, - # otherwise, these prints will go to stdout. - # For now, using print for visibility in test logs if logger isn't set up in this exact scope. + # Optional: Add print statements here if you still want to log them for verification print(f"generate_cert.export_env: RAY_USE_TLS set to: {os.environ.get('RAY_USE_TLS')}") print(f"generate_cert.export_env: RAY_TLS_CA_CERT set to: {os.environ.get('RAY_TLS_CA_CERT')}") print(f"generate_cert.export_env: RAY_TLS_SERVER_CERT is: {os.environ.get('RAY_TLS_SERVER_CERT')}") print(f"generate_cert.export_env: RAY_TLS_SERVER_KEY is: {os.environ.get('RAY_TLS_SERVER_KEY')}") + print(f"generate_cert.export_env: RAY_CLIENT_SKIP_TLS_VERIFY is: {os.environ.get('RAY_CLIENT_SKIP_TLS_VERIFY')}") From 5693808274bf60435388df48e5ba04e3bb00a0ad Mon Sep 17 00:00:00 2001 From: kryanbeane Date: Tue, 20 May 2025 11:59:39 +0100 Subject: [PATCH 5/5] add gpu checker --- .github/workflows/e2e_tests.yaml | 68 ++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index 26246c67..fd01a30f 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -70,6 +70,74 @@ jobs: - name: Install NVidia GPU operator for KinD uses: ./common/github-actions/nvidia-gpu-operator + - name: Verify GPU availability in KinD + run: | + echo "Checking for available GPUs in the KinD cluster..." + + # Wait for GPU operator pods to be ready (with timeout) + echo "Waiting for GPU operator pods to be ready..." + TIMEOUT=300 # 5 minutes timeout + END=$((SECONDS + TIMEOUT)) + + while [ $SECONDS -lt $END ]; do + # Get total number of pods in the namespace + TOTAL_PODS=$(kubectl get pods -n gpu-operator --no-headers | wc -l) + + # Count pods that are either running and ready or completed successfully + # Exclude pods that are still initializing + READY_PODS=$(kubectl get pods -n gpu-operator --no-headers | grep -E 'Running|Completed' | grep -v 'PodInitializing' | wc -l) + + if [ "$READY_PODS" -eq "$TOTAL_PODS" ] && [ "$TOTAL_PODS" -gt 0 ]; then + echo "All GPU operator pods are ready or completed successfully!" + break + fi + + echo "Waiting for GPU operator pods to be ready... ($READY_PODS/$TOTAL_PODS)" + echo "Pod status:" + kubectl get pods -n gpu-operator + sleep 10 + done + + if [ $SECONDS -ge $END ]; then + echo "::error::Timeout waiting for GPU operator pods to be ready" + echo "GPU operator pod status:" + kubectl get pods -n gpu-operator -o wide + echo "GPU operator pod logs:" + kubectl logs -n gpu-operator -l app.kubernetes.io/name=gpu-operator + echo "GPU operator pod events:" + kubectl get events -n gpu-operator + exit 1 + fi + + echo "Node details:" + kubectl describe nodes | grep -E 'nvidia.com/gpu|Allocatable:|Capacity:|Name:' + + # Check if GPU operator has labeled nodes + GPU_LABELS=$(kubectl describe nodes | grep -c "nvidia.com/gpu") + if [ "$GPU_LABELS" -eq 0 ]; then + echo "::error::No NVIDIA GPU labels found on nodes. GPU operator may not be running correctly." + echo "Full node descriptions for debugging:" + kubectl describe nodes + exit 1 + fi + + # Check if GPUs are actually allocatable + GPU_ALLOCATABLE=$(kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}' | tr ' ' '\n' | grep -v '^$' | wc -l) + if [ "$GPU_ALLOCATABLE" -eq 0 ]; then + echo "::error::GPU operator is running but no GPUs are allocatable. Check GPU operator logs." + echo "Checking GPU operator pods:" + kubectl get pods -n gpu-operator -o wide + echo "GPU operator pod logs:" + kubectl logs -n gpu-operator -l app.kubernetes.io/name=gpu-operator + echo "GPU operator pod events:" + kubectl get events -n gpu-operator + echo "GPU operator pod descriptions:" + kubectl describe pods -n gpu-operator + exit 1 + fi + + echo "Successfully found $GPU_ALLOCATABLE allocatable GPU(s) in the cluster." + - name: Deploy CodeFlare stack id: deploy run: |