From 070f1bea2b35e15adfa2c18a85c4976505634a89 Mon Sep 17 00:00:00 2001
From: kryanbeane <bryankeane0@gmail.com>
Date: Tue, 20 May 2025 09:21:43 +0100
Subject: [PATCH 1/5] test running only failing test

---
 .github/workflows/e2e_tests.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml
index fca6d6e7..d100241c 100644
--- a/.github/workflows/e2e_tests.yaml
+++ b/.github/workflows/e2e_tests.yaml
@@ -117,7 +117,7 @@ jobs:
           pip install poetry
           poetry install --with test,docs
           echo "Running e2e tests..."
-          poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
+          poetry run pytest -v -s ./tests/e2e/local_interactive_sdk_kind_test.py > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
         env:
           GRPC_DNS_RESOLVER: "native"
 

From d6a07f3bd64b206176022b19379cd7b4ed272483 Mon Sep 17 00:00:00 2001
From: kryanbeane <bryankeane0@gmail.com>
Date: Tue, 20 May 2025 09:54:14 +0100
Subject: [PATCH 2/5] add excessive amounts of logs

---
 tests/e2e/local_interactive_sdk_kind_test.py | 87 ++++++++++++++++++--
 1 file changed, 82 insertions(+), 5 deletions(-)

diff --git a/tests/e2e/local_interactive_sdk_kind_test.py b/tests/e2e/local_interactive_sdk_kind_test.py
index c20fd879..3d2365aa 100644
--- a/tests/e2e/local_interactive_sdk_kind_test.py
+++ b/tests/e2e/local_interactive_sdk_kind_test.py
@@ -8,37 +8,54 @@
 import pytest
 import ray
 import math
+import logging
+import time
+import os
 
 from support import *
 
+# Configure logging
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
 
 @pytest.mark.kind
 class TestRayLocalInteractiveOauth:
     def setup_method(self):
+        logger.info("Setting up test environment...")
         initialize_kubernetes_client(self)
+        logger.info("Kubernetes client initialized")
 
     def teardown_method(self):
+        logger.info("Cleaning up test environment...")
         delete_namespace(self)
         delete_kueue_resources(self)
+        logger.info("Cleanup completed")
 
     def test_local_interactives(self):
+        logger.info("Starting test_local_interactives...")
         self.setup_method()
         create_namespace(self)
         create_kueue_resources(self)
         self.run_local_interactives()
+        logger.info("test_local_interactives completed")
 
     @pytest.mark.nvidia_gpu
     def test_local_interactives_nvidia_gpu(self):
+        logger.info("Starting test_local_interactives_nvidia_gpu...")
         self.setup_method()
         create_namespace(self)
         create_kueue_resources(self)
         self.run_local_interactives(number_of_gpus=1)
+        logger.info("test_local_interactives_nvidia_gpu completed")
 
     def run_local_interactives(
         self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
     ):
         cluster_name = "test-ray-cluster-li"
+        logger.info(f"Starting run_local_interactives with {number_of_gpus} GPUs")
 
+        logger.info("Creating cluster configuration...")
         cluster = Cluster(
             ClusterConfiguration(
                 name=cluster_name,
@@ -57,37 +74,97 @@ def run_local_interactives(
                 verify_tls=False,
             )
         )
+        logger.info("Cluster configuration created")
+
+        logger.info("Starting cluster deployment...")
         cluster.up()
+        logger.info("Cluster deployment initiated")
+
+        logger.info("Waiting for cluster to be ready...")
         cluster.wait_ready()
+        logger.info("Cluster is ready")
 
+        logger.info("Generating TLS certificates...")
         generate_cert.generate_tls_cert(cluster_name, self.namespace)
+        logger.info("TLS certificates generated")
+
+        logger.info("Exporting environment variables...")
         generate_cert.export_env(cluster_name, self.namespace)
+        logger.info("Environment variables exported")
+
+        client_url = cluster.local_client_url()
+        logger.info(f"Ray client URL: {client_url}")
 
-        print(cluster.local_client_url())
+        logger.info("Checking cluster status...")
+        status = cluster.status()
+        logger.info(f"Cluster status: {status}")
 
+        logger.info("Checking cluster dashboard URI...")
+        dashboard_uri = cluster.cluster_dashboard_uri()
+        logger.info(f"Dashboard URI: {dashboard_uri}")
+
+        logger.info("Checking cluster URI...")
+        cluster_uri = cluster.cluster_uri()
+        logger.info(f"Cluster URI: {cluster_uri}")
+
+        logger.info("Shutting down any existing Ray connections...")
         ray.shutdown()
-        ray.init(address=cluster.local_client_url(), logging_level="DEBUG")
+        logger.info("Ray shutdown completed")
+
+        logger.info("Initializing Ray connection...")
+        try:
+            ray.init(address=client_url, logging_level="DEBUG")
+            logger.info("Ray initialization successful")
+        except Exception as e:
+            logger.error(f"Ray initialization failed: {str(e)}")
+            logger.error(f"Error type: {type(e)}")
+            raise
+
+        logger.info("Defining Ray remote functions...")
 
         @ray.remote(num_gpus=number_of_gpus / 2)
         def heavy_calculation_part(num_iterations):
+            logger.info(
+                f"Starting heavy_calculation_part with {num_iterations} iterations"
+            )
             result = 0.0
             for i in range(num_iterations):
                 for j in range(num_iterations):
                     for k in range(num_iterations):
                         result += math.sin(i) * math.cos(j) * math.tan(k)
+            logger.info("heavy_calculation_part completed")
             return result
 
         @ray.remote(num_gpus=number_of_gpus / 2)
         def heavy_calculation(num_iterations):
+            logger.info(f"Starting heavy_calculation with {num_iterations} iterations")
             results = ray.get(
                 [heavy_calculation_part.remote(num_iterations // 30) for _ in range(30)]
             )
+            logger.info("heavy_calculation completed")
             return sum(results)
 
+        logger.info("Submitting calculation task...")
         ref = heavy_calculation.remote(3000)
-        result = ray.get(ref)
-        assert result == 1789.4644387076714
-        ray.cancel(ref)
+        logger.info("Task submitted, waiting for result...")
+
+        try:
+            result = ray.get(ref)
+            logger.info(f"Calculation completed with result: {result}")
+            assert result == 1789.4644387076714
+            logger.info("Result assertion passed")
+        except Exception as e:
+            logger.error(f"Error during calculation: {str(e)}")
+            raise
+        finally:
+            logger.info("Cancelling task reference...")
+            ray.cancel(ref)
+            logger.info("Task cancelled")
+
+        logger.info("Shutting down Ray...")
         ray.shutdown()
+        logger.info("Ray shutdown completed")
 
+        logger.info("Tearing down cluster...")
         cluster.down()
+        logger.info("Cluster teardown completed")

From 6eeb49cdc7b3937481c90cb9eba33c3dfdb79a00 Mon Sep 17 00:00:00 2001
From: kryanbeane <bryankeane0@gmail.com>
Date: Tue, 20 May 2025 10:51:41 +0100
Subject: [PATCH 3/5] test certificate changes

---
 .../common/utils/generate_cert.py             | 23 +++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/src/codeflare_sdk/common/utils/generate_cert.py b/src/codeflare_sdk/common/utils/generate_cert.py
index 7c072da0..3a98b4e5 100644
--- a/src/codeflare_sdk/common/utils/generate_cert.py
+++ b/src/codeflare_sdk/common/utils/generate_cert.py
@@ -240,12 +240,27 @@ def export_env(cluster_name, namespace):
 
     Environment Variables Set:
         - RAY_USE_TLS: Enables TLS for Ray.
-        - RAY_TLS_SERVER_CERT: Path to the TLS server certificate.
-        - RAY_TLS_SERVER_KEY: Path to the TLS server private key.
         - RAY_TLS_CA_CERT: Path to the CA certificate.
     """
+    # Assuming logger is configured elsewhere or add basicConfig here for the module
+    # import logging
+    # logger = logging.getLogger(__name__)
+    # logging.basicConfig(level=logging.INFO) # Or use existing logger if available
+
     tls_dir = os.path.join(os.getcwd(), f"tls-{cluster_name}-{namespace}")
     os.environ["RAY_USE_TLS"] = "1"
-    os.environ["RAY_TLS_SERVER_CERT"] = os.path.join(tls_dir, "tls.crt")
-    os.environ["RAY_TLS_SERVER_KEY"] = os.path.join(tls_dir, "tls.key")
+    # os.environ["RAY_TLS_SERVER_CERT"] = os.path.join(tls_dir, "tls.crt") # Client usually doesn't need to present a server cert
+    # os.environ["RAY_TLS_SERVER_KEY"] = os.path.join(tls_dir, "tls.key")   # Client usually doesn't need to present a server key
+    if "RAY_TLS_SERVER_CERT" in os.environ:
+        del os.environ["RAY_TLS_SERVER_CERT"]
+    if "RAY_TLS_SERVER_KEY" in os.environ:
+        del os.environ["RAY_TLS_SERVER_KEY"]
     os.environ["RAY_TLS_CA_CERT"] = os.path.join(tls_dir, "ca.crt")
+
+    # It's better to use a logger instance if this module has one,
+    # otherwise, these prints will go to stdout.
+    # For now, using print for visibility in test logs if logger isn't set up in this exact scope.
+    print(f"generate_cert.export_env: RAY_USE_TLS set to: {os.environ.get('RAY_USE_TLS')}")
+    print(f"generate_cert.export_env: RAY_TLS_CA_CERT set to: {os.environ.get('RAY_TLS_CA_CERT')}")
+    print(f"generate_cert.export_env: RAY_TLS_SERVER_CERT is: {os.environ.get('RAY_TLS_SERVER_CERT')}")
+    print(f"generate_cert.export_env: RAY_TLS_SERVER_KEY is: {os.environ.get('RAY_TLS_SERVER_KEY')}")

From 440fac259e8534c9da1f1f472e3abc335c91b486 Mon Sep 17 00:00:00 2001
From: kryanbeane <bryankeane0@gmail.com>
Date: Tue, 20 May 2025 11:04:02 +0100
Subject: [PATCH 4/5] Revert "test certificate changes"

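This reverts commit 6eeb49cdc7b3937481c90cb9eba33c3dfdb79a00, restoring the
RAY_TLS_SERVER_CERT and RAY_TLS_SERVER_KEY exports in export_env.

Not a pure revert: the debug print statements are kept, export_env now also
sets RAY_CLIENT_SKIP_TLS_VERIFY=1, and the e2e workflow runs pytest with
--log-cli-level=INFO.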
---
 .github/workflows/e2e_tests.yaml              |  2 +-
 .../common/utils/generate_cert.py             | 22 +++++++------------
 2 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml
index d100241c..26246c67 100644
--- a/.github/workflows/e2e_tests.yaml
+++ b/.github/workflows/e2e_tests.yaml
@@ -117,7 +117,7 @@ jobs:
           pip install poetry
           poetry install --with test,docs
           echo "Running e2e tests..."
-          poetry run pytest -v -s ./tests/e2e/local_interactive_sdk_kind_test.py > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
+          poetry run pytest -v -s --log-cli-level=INFO ./tests/e2e/local_interactive_sdk_kind_test.py > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
         env:
           GRPC_DNS_RESOLVER: "native"
 
diff --git a/src/codeflare_sdk/common/utils/generate_cert.py b/src/codeflare_sdk/common/utils/generate_cert.py
index 3a98b4e5..a0b4f8cd 100644
--- a/src/codeflare_sdk/common/utils/generate_cert.py
+++ b/src/codeflare_sdk/common/utils/generate_cert.py
@@ -240,27 +240,21 @@ def export_env(cluster_name, namespace):
 
     Environment Variables Set:
         - RAY_USE_TLS: Enables TLS for Ray.
+        - RAY_TLS_SERVER_CERT: Path to the TLS server certificate.
+        - RAY_TLS_SERVER_KEY: Path to the TLS server private key.
         - RAY_TLS_CA_CERT: Path to the CA certificate.
+        - RAY_CLIENT_SKIP_TLS_VERIFY: Skips TLS verification by the client.
     """
-    # Assuming logger is configured elsewhere or add basicConfig here for the module
-    # import logging
-    # logger = logging.getLogger(__name__)
-    # logging.basicConfig(level=logging.INFO) # Or use existing logger if available
-
     tls_dir = os.path.join(os.getcwd(), f"tls-{cluster_name}-{namespace}")
     os.environ["RAY_USE_TLS"] = "1"
-    # os.environ["RAY_TLS_SERVER_CERT"] = os.path.join(tls_dir, "tls.crt") # Client usually doesn't need to present a server cert
-    # os.environ["RAY_TLS_SERVER_KEY"] = os.path.join(tls_dir, "tls.key")   # Client usually doesn't need to present a server key
-    if "RAY_TLS_SERVER_CERT" in os.environ:
-        del os.environ["RAY_TLS_SERVER_CERT"]
-    if "RAY_TLS_SERVER_KEY" in os.environ:
-        del os.environ["RAY_TLS_SERVER_KEY"]
+    os.environ["RAY_TLS_SERVER_CERT"] = os.path.join(tls_dir, "tls.crt")
+    os.environ["RAY_TLS_SERVER_KEY"] = os.path.join(tls_dir, "tls.key")
     os.environ["RAY_TLS_CA_CERT"] = os.path.join(tls_dir, "ca.crt")
+    os.environ["RAY_CLIENT_SKIP_TLS_VERIFY"] = "1" # Skip verification for E2E
 
-    # It's better to use a logger instance if this module has one,
-    # otherwise, these prints will go to stdout.
-    # For now, using print for visibility in test logs if logger isn't set up in this exact scope.
+    # Debug output: print the exported TLS settings so they appear in the test logs
     print(f"generate_cert.export_env: RAY_USE_TLS set to: {os.environ.get('RAY_USE_TLS')}")
     print(f"generate_cert.export_env: RAY_TLS_CA_CERT set to: {os.environ.get('RAY_TLS_CA_CERT')}")
     print(f"generate_cert.export_env: RAY_TLS_SERVER_CERT is: {os.environ.get('RAY_TLS_SERVER_CERT')}")
     print(f"generate_cert.export_env: RAY_TLS_SERVER_KEY is: {os.environ.get('RAY_TLS_SERVER_KEY')}")
+    print(f"generate_cert.export_env: RAY_CLIENT_SKIP_TLS_VERIFY is: {os.environ.get('RAY_CLIENT_SKIP_TLS_VERIFY')}")

From 5693808274bf60435388df48e5ba04e3bb00a0ad Mon Sep 17 00:00:00 2001
From: kryanbeane <bryankeane0@gmail.com>
Date: Tue, 20 May 2025 11:59:39 +0100
Subject: [PATCH 5/5] add gpu checker

---
 .github/workflows/e2e_tests.yaml | 68 ++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml
index 26246c67..fd01a30f 100644
--- a/.github/workflows/e2e_tests.yaml
+++ b/.github/workflows/e2e_tests.yaml
@@ -70,6 +70,74 @@ jobs:
       - name: Install NVidia GPU operator for KinD
         uses: ./common/github-actions/nvidia-gpu-operator
 
+      - name: Verify GPU availability in KinD
+        run: |
+          echo "Checking for available GPUs in the KinD cluster..."
+
+          # Wait (bounded by TIMEOUT) until every GPU operator pod is Ready or Completed
+          echo "Waiting for GPU operator pods to be ready..."
+          TIMEOUT=300  # 5 minutes timeout
+          END=$((SECONDS + TIMEOUT))
+          PODS_READY=false  # explicit flag: a success on the last iteration must not be read as a timeout
+          while [ "$SECONDS" -lt "$END" ]; do
+            # Snapshot the pod list once so both counts describe the same set of pods
+            PODS=$(kubectl get pods -n gpu-operator --no-headers 2>/dev/null || true)
+            TOTAL_PODS=$(printf '%s\n' "$PODS" | awk 'NF {n++} END {print n+0}')
+            # Ready = STATUS Completed, or STATUS Running with every container ready
+            # (READY column "n/n"); "Running 0/1", Init:* and PodInitializing do not count
+            READY_PODS=$(printf '%s\n' "$PODS" | awk '$3 == "Completed" || ($3 == "Running" && split($2, r, "/") == 2 && r[1] == r[2]) {n++} END {print n+0}')
+
+            if [ "$READY_PODS" -eq "$TOTAL_PODS" ] && [ "$TOTAL_PODS" -gt 0 ]; then
+              echo "All GPU operator pods are ready or completed successfully!"
+              PODS_READY=true
+              break
+            fi
+            echo "Waiting for GPU operator pods to be ready... ($READY_PODS/$TOTAL_PODS)"
+            echo "Pod status:"
+            kubectl get pods -n gpu-operator || true
+            sleep 10
+          done
+
+          if [ "$PODS_READY" != "true" ]; then
+            echo "::error::Timeout waiting for GPU operator pods to be ready"
+            echo "GPU operator pod status:"
+            kubectl get pods -n gpu-operator -o wide || true
+            echo "GPU operator pod logs:"
+            kubectl logs -n gpu-operator -l app.kubernetes.io/name=gpu-operator || true
+            echo "GPU operator pod events:"
+            kubectl get events -n gpu-operator || true
+            exit 1
+          fi
+
+          echo "Node details:"
+          kubectl describe nodes | grep -E 'nvidia.com/gpu|Allocatable:|Capacity:|Name:' || true
+
+          # Check if GPU operator has labeled nodes ("|| true": grep exits 1 on zero matches, which would abort the step under "bash -e")
+          GPU_LABELS=$(kubectl describe nodes | grep -c "nvidia.com/gpu" || true)
+          if [ "$GPU_LABELS" -eq 0 ]; then
+            echo "::error::No NVIDIA GPU labels found on nodes. GPU operator may not be running correctly."
+            echo "Full node descriptions for debugging:"
+            kubectl describe nodes
+            exit 1
+          fi
+
+          # Sum allocatable GPUs across all nodes (a node reporting 0 contributes nothing)
+          GPU_ALLOCATABLE=$(kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}' | tr ' ' '\n' | awk '{n += $1} END {print n+0}')
+          if [ "$GPU_ALLOCATABLE" -eq 0 ]; then
+            echo "::error::GPU operator is running but no GPUs are allocatable. Check GPU operator logs."
+            echo "Checking GPU operator pods:"
+            kubectl get pods -n gpu-operator -o wide || true
+            echo "GPU operator pod logs:"
+            kubectl logs -n gpu-operator -l app.kubernetes.io/name=gpu-operator || true
+            echo "GPU operator pod events:"
+            kubectl get events -n gpu-operator || true
+            echo "GPU operator pod descriptions:"
+            kubectl describe pods -n gpu-operator || true
+            exit 1
+          fi
+
+          echo "Successfully found $GPU_ALLOCATABLE allocatable GPU(s) in the cluster."
+
       - name: Deploy CodeFlare stack
         id: deploy
         run: |