88import pytest
99import ray
1010import math
11+ import logging
12+ import time
13+ import os
1114
1215from support import *
1316
# Logger for this test module, named after the module itself.
logger = logging.getLogger(__name__)
# Configure the root logger at DEBUG level. basicConfig only takes effect
# when the root logger has no handlers yet.
logging.basicConfig(level=logging.DEBUG)
20+
1421
1522@pytest .mark .kind
1623class TestRayLocalInteractiveOauth :
1724 def setup_method (self ):
25+ logger .info ("Setting up test environment..." )
1826 initialize_kubernetes_client (self )
27+ logger .info ("Kubernetes client initialized" )
1928
2029 def teardown_method (self ):
30+ logger .info ("Cleaning up test environment..." )
2131 delete_namespace (self )
2232 delete_kueue_resources (self )
33+ logger .info ("Cleanup completed" )
2334
2435 def test_local_interactives (self ):
36+ logger .info ("Starting test_local_interactives..." )
2537 self .setup_method ()
2638 create_namespace (self )
2739 create_kueue_resources (self )
2840 self .run_local_interactives ()
41+ logger .info ("test_local_interactives completed" )
2942
3043 @pytest .mark .nvidia_gpu
3144 def test_local_interactives_nvidia_gpu (self ):
45+ logger .info ("Starting test_local_interactives_nvidia_gpu..." )
3246 self .setup_method ()
3347 create_namespace (self )
3448 create_kueue_resources (self )
3549 self .run_local_interactives (number_of_gpus = 1 )
50+ logger .info ("test_local_interactives_nvidia_gpu completed" )
3651
3752 def run_local_interactives (
3853 self , gpu_resource_name = "nvidia.com/gpu" , number_of_gpus = 0
3954 ):
4055 cluster_name = "test-ray-cluster-li"
56+ logger .info (f"Starting run_local_interactives with { number_of_gpus } GPUs" )
4157
58+ logger .info ("Creating cluster configuration..." )
4259 cluster = Cluster (
4360 ClusterConfiguration (
4461 name = cluster_name ,
@@ -57,37 +74,97 @@ def run_local_interactives(
5774 verify_tls = False ,
5875 )
5976 )
77+ logger .info ("Cluster configuration created" )
78+
79+ logger .info ("Starting cluster deployment..." )
6080 cluster .up ()
81+ logger .info ("Cluster deployment initiated" )
82+
83+ logger .info ("Waiting for cluster to be ready..." )
6184 cluster .wait_ready ()
85+ logger .info ("Cluster is ready" )
6286
87+ logger .info ("Generating TLS certificates..." )
6388 generate_cert .generate_tls_cert (cluster_name , self .namespace )
89+ logger .info ("TLS certificates generated" )
90+
91+ logger .info ("Exporting environment variables..." )
6492 generate_cert .export_env (cluster_name , self .namespace )
93+ logger .info ("Environment variables exported" )
94+
95+ client_url = cluster .local_client_url ()
96+ logger .info (f"Ray client URL: { client_url } " )
6597
66- print (cluster .local_client_url ())
98+ logger .info ("Checking cluster status..." )
99+ status = cluster .status ()
100+ logger .info (f"Cluster status: { status } " )
67101
102+ logger .info ("Checking cluster dashboard URI..." )
103+ dashboard_uri = cluster .cluster_dashboard_uri ()
104+ logger .info (f"Dashboard URI: { dashboard_uri } " )
105+
106+ logger .info ("Checking cluster URI..." )
107+ cluster_uri = cluster .cluster_uri ()
108+ logger .info (f"Cluster URI: { cluster_uri } " )
109+
110+ logger .info ("Shutting down any existing Ray connections..." )
68111 ray .shutdown ()
69- ray .init (address = cluster .local_client_url (), logging_level = "DEBUG" )
112+ logger .info ("Ray shutdown completed" )
113+
114+ logger .info ("Initializing Ray connection..." )
115+ try :
116+ ray .init (address = client_url , logging_level = "DEBUG" )
117+ logger .info ("Ray initialization successful" )
118+ except Exception as e :
119+ logger .error (f"Ray initialization failed: { str (e )} " )
120+ logger .error (f"Error type: { type (e )} " )
121+ raise
122+
123+ logger .info ("Defining Ray remote functions..." )
70124
71125 @ray .remote (num_gpus = number_of_gpus / 2 )
72126 def heavy_calculation_part (num_iterations ):
127+ logger .info (
128+ f"Starting heavy_calculation_part with { num_iterations } iterations"
129+ )
73130 result = 0.0
74131 for i in range (num_iterations ):
75132 for j in range (num_iterations ):
76133 for k in range (num_iterations ):
77134 result += math .sin (i ) * math .cos (j ) * math .tan (k )
135+ logger .info ("heavy_calculation_part completed" )
78136 return result
79137
80138 @ray .remote (num_gpus = number_of_gpus / 2 )
81139 def heavy_calculation (num_iterations ):
140+ logger .info (f"Starting heavy_calculation with { num_iterations } iterations" )
82141 results = ray .get (
83142 [heavy_calculation_part .remote (num_iterations // 30 ) for _ in range (30 )]
84143 )
144+ logger .info ("heavy_calculation completed" )
85145 return sum (results )
86146
147+ logger .info ("Submitting calculation task..." )
87148 ref = heavy_calculation .remote (3000 )
88- result = ray .get (ref )
89- assert result == 1789.4644387076714
90- ray .cancel (ref )
149+ logger .info ("Task submitted, waiting for result..." )
150+
151+ try :
152+ result = ray .get (ref )
153+ logger .info (f"Calculation completed with result: { result } " )
154+ assert result == 1789.4644387076714
155+ logger .info ("Result assertion passed" )
156+ except Exception as e :
157+ logger .error (f"Error during calculation: { str (e )} " )
158+ raise
159+ finally :
160+ logger .info ("Cancelling task reference..." )
161+ ray .cancel (ref )
162+ logger .info ("Task cancelled" )
163+
164+ logger .info ("Shutting down Ray..." )
91165 ray .shutdown ()
166+ logger .info ("Ray shutdown completed" )
92167
168+ logger .info ("Tearing down cluster..." )
93169 cluster .down ()
170+ logger .info ("Cluster teardown completed" )
0 commit comments