|
76 | 76 | " ClusterConfiguration(\n",
|
77 | 77 | " namespace=namespace,\n",
|
78 | 78 | " name='mnisttest',\n",
|
79 |
| - " head_cpu_requests=1,\n", |
80 |
| - " head_cpu_limits=1,\n", |
| 79 | + " head_cpu_requests=2,\n", |
| 80 | + " head_cpu_limits=2,\n", |
81 | 81 | " head_memory_requests=4,\n",
|
82 | 82 | " head_memory_limits=4,\n",
|
83 | 83 | " head_extended_resource_requests={'nvidia.com/gpu':0},\n",
|
|
189 | 189 | "finished = False\n",
|
190 | 190 | "while not finished:\n",
|
191 | 191 | " sleep(1)\n",
|
192 |
| - " status = client.get_job_status(submission_id)\n", |
| 192 | + " try:\n", |
| 193 | + " status = client.get_job_status(submission_id)\n", |
| 194 | + " except RuntimeError:\n", |
| 195 | + " # At times, the Ray dashboard displays a \"RuntimeError: Request failed with status code 504: <html><body><h1>504 Gateway Time-out</h1>\" \n", |
| 196 | + " # message, leading to a CrashLoopBackOff error in the notebook pod. However, the Ray job continues running and disregards the error. \n", |
| 197 | + " # Consider eliminating the try-except block when using the updated version of Ray 2.38.\n", |
| 198 | + " pass\n", |
193 | 199 | " finished = (status == \"SUCCEEDED\")\n",
|
194 | 200 | "if finished:\n",
|
195 | 201 | " print(\"Job completed Successfully !\")\n",
|
|
0 commit comments