Skip to content

Commit 00801d2

Browse files
committed
Merge remote-tracking branch 'upstream/main'
2 parents 141c5a8 + dce4007 commit 00801d2

File tree

2 files changed

+19
-5
lines changed

2 files changed

+19
-5
lines changed

tests/odh/mnist_ray_test.go

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,14 +44,22 @@ func TestMnistRayROCmGpu(t *testing.T) {
4444
mnistRay(t, 1, "amd.com/gpu", GetRayROCmImage(), "resources/requirements-rocm.txt")
4545
}
4646

47-
func TestMnistCustomRayImageCpu(t *testing.T) {
47+
func TestMnistCustomRayCudaCpu(t *testing.T) {
4848
mnistRay(t, 0, "nvidia.com/gpu", GetRayTorchCudaImage(), "resources/requirements.txt")
4949
}
5050

51-
func TestMnistCustomRayImageGpu(t *testing.T) {
51+
func TestMnistCustomRayCudaGpu(t *testing.T) {
5252
mnistRay(t, 1, "nvidia.com/gpu", GetRayTorchCudaImage(), "resources/requirements.txt")
5353
}
5454

55+
func TestMnistCustomRayRocmCpu(t *testing.T) {
56+
mnistRay(t, 0, "amd.com/gpu", GetRayTorchROCmImage(), "resources/requirements-rocm.txt")
57+
}
58+
59+
func TestMnistCustomRayRocmGpu(t *testing.T) {
60+
mnistRay(t, 1, "amd.com/gpu", GetRayTorchROCmImage(), "resources/requirements-rocm.txt")
61+
}
62+
5563
func mnistRay(t *testing.T, numGpus int, gpuResourceName string, rayImage string, requirementsFileName string) {
5664
test := With(t)
5765

tests/odh/resources/mnist_ray_mini.ipynb

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,8 @@
7676
" ClusterConfiguration(\n",
7777
" namespace=namespace,\n",
7878
" name='mnisttest',\n",
79-
" head_cpu_requests=1,\n",
80-
" head_cpu_limits=1,\n",
79+
" head_cpu_requests=2,\n",
80+
" head_cpu_limits=2,\n",
8181
" head_memory_requests=4,\n",
8282
" head_memory_limits=4,\n",
8383
" head_extended_resource_requests={'nvidia.com/gpu':0},\n",
@@ -189,7 +189,13 @@
189189
"finished = False\n",
190190
"while not finished:\n",
191191
" sleep(1)\n",
192-
" status = client.get_job_status(submission_id)\n",
192+
" try:\n",
193+
" status = client.get_job_status(submission_id)\n",
194+
" except RuntimeError:\n",
195+
" # At times, the ray dashboard displays a \"RuntimeError: Request failed with status code 504: <html><body><h1>504 Gateway Time-out</h1>\" \n",
196+
" # message, leading to a CrashLoopBackOff error in the notebook pod. However, the ray job continues running and disregards the error. \n",
197+
" # Consider removing this try-except block once the updated Ray version (2.38) is in use.\n",
198+
" pass\n",
193199
" finished = (status == \"SUCCEEDED\")\n",
194200
"if finished:\n",
195201
" print(\"Job completed Successfully !\")\n",

0 commit comments

Comments
 (0)