@@ -36,6 +36,9 @@ def test():
3636 test_directory = env .ARTIFACT_DIR
3737 prom_start_ts = None
3838 try :
39+ # Clean up any existing LLM inference services and pods before testing
40+ cleanup_llm_inference_resources ()
41+
3942 # Reset Prometheus before testing
4043 logging .info ("Resetting Prometheus database before testing" )
4144 prom_start_ts = prom .reset_prometheus ()
@@ -283,6 +286,45 @@ def capture_llm_inference_service_state():
283286 logging .error (f"Failed to capture LLM inference service state: { e } " )
284287
285288
def cleanup_llm_inference_resources():
    """
    Clean up all llminferenceservice resources in the namespace before testing.

    Deletes every llminferenceservice in the configured namespace, then polls
    until none remain. Fails the test (raises RuntimeError) if resources are
    still present after all verification attempts.

    Raises:
        RuntimeError: if llminferenceservice resources still exist after the
            final verification attempt.
    """
    # Polling parameters: MAX_CHECKS verification attempts, CHECK_INTERVAL_S
    # seconds apart. Kept as named constants so the retry count, the sleep,
    # and the error message cannot drift out of sync.
    MAX_CHECKS = 6
    CHECK_INTERVAL_S = 10

    namespace = config.project.get_config("tests.llmd.namespace")
    logging.info(f"Cleaning up all llminferenceservice resources in namespace {namespace}")

    # Delete all llminferenceservice resources in the namespace.
    # --wait/--timeout make oc itself block; the poll below is a safety net
    # for finalizers that outlive the delete call.
    logging.info("Deleting all llminferenceservice resources")
    run.run(f"oc delete llminferenceservice --all -n {namespace} --wait=true --timeout=180s")

    # Verify no llminferenceservice resources remain
    logging.info("Verifying no llminferenceservice resources remain")
    for attempt in range(MAX_CHECKS):
        result = run.run(f"oc get llminferenceservice -n {namespace} --no-headers",
                         capture_stdout=True)

        # splitlines() is robust against a trailing newline and \r\n endings;
        # filter out blank lines so the count reflects real resources.
        remaining_services = [s for s in result.stdout.splitlines() if s.strip()]
        if not remaining_services:
            logging.info("No llminferenceservice resources found - cleanup successful")
            break

        remaining_count = len(remaining_services)
        logging.info(f"Still found {remaining_count} llminferenceservice resources, waiting...")

        if attempt == MAX_CHECKS - 1:  # retries exhausted; do not sleep again
            # Report the actual time spent polling (sleeps happen between checks).
            waited_s = (MAX_CHECKS - 1) * CHECK_INTERVAL_S
            logging.error(f"Failed to clean up llminferenceservice resources after {waited_s} seconds. {remaining_count} resources still exist:")
            for service in remaining_services:
                logging.error(f"  - {service}")
            raise RuntimeError(f"Cannot proceed with test - {remaining_count} llminferenceservice resources still exist in namespace {namespace}")

        time.sleep(CHECK_INTERVAL_S)

    logging.info("LLM inference service cleanup completed successfully")
327+
286328def ensure_gpu_nodes_available ():
287329 """
288330 Ensures that there are GPU nodes available in the cluster.
0 commit comments