4 files changed: +25 -7 lines changed
Original file line number | Diff line number | Diff line change
2929 depends_on : image-build
3030 commands :
3131 - pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
32- - pytest -s -v tests/e2e/offline_inference/test_ovis_image.py
3332 agents :
3433 queue : " gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
3534 plugins :
@@ -93,11 +92,28 @@ steps:
9392 volumes :
9493 - " /fsx/hf_cache:/fsx/hf_cache"
9594
96- - label : " Diffusion Parallelism Test"
97- timeout_in_minutes : 25
95+ - label : " Diffusion Sequence Parallelism Test"
96+ timeout_in_minutes : 20
9897 depends_on : image-build
9998 commands :
10099 - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py
100+ agents :
101+ queue : " gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
102+ plugins :
103+ - docker#v5.2.0:
104+ image : public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
105+ always-pull : true
106+ propagate-environment : true
107+ shm-size : " 8gb"
108+ environment :
109+ - " HF_HOME=/fsx/hf_cache"
110+ volumes :
111+ - " /fsx/hf_cache:/fsx/hf_cache"
112+
113+ - label : " Diffusion Tensor Parallelism Test"
114+ timeout_in_minutes : 20
115+ depends_on : image-build
116+ commands :
101117 - pytest -s -v tests/e2e/offline_inference/test_zimage_tensor_parallel.py
102118 agents :
103119 queue : " gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
Original file line number Diff line number Diff line change @@ -38,8 +38,10 @@ def inference(offload: bool = True):
3838 guidance_scale = 0.0 ,
3939 generator = torch .Generator ("cuda" ).manual_seed (42 ),
4040 )
41+ peak = monitor .peak_used_mb
42+ monitor .stop ()
4143
42- return monitor . peak_used_mb
44+ return peak
4345
4446 offload_peak_memory = inference (offload = True )
4547 no_offload_peak_memory = inference (offload = False )
Original file line number Diff line number Diff line change @@ -119,7 +119,7 @@ def _run_zimage_generate(
119119
120120 return _extract_single_image ([last_output ]), median_time_s , peak_memory_mb
121121 finally :
122- m . close ()
122+ monitor . stop ()
123123 cleanup_dist_env_and_memory ()
124124
125125
@@ -160,7 +160,7 @@ def test_zimage_tensor_parallel_tp2(tmp_path: Path):
160160
161161 mean_abs_diff , max_abs_diff = _diff_metrics (tp1_img , tp2_img )
162162 mean_threshold = 3e-2
163- max_threshold = 3.5e-1
163+ max_threshold = 5e-1
164164 print (
165165 "Z-Image TP image diff stats (TP=1 vs TP=2): "
166166 f"mean_abs_diff={ mean_abs_diff :.6e} , max_abs_diff={ max_abs_diff :.6e} ; "
Original file line number Diff line number Diff line change @@ -500,7 +500,7 @@ def monitor_loop() -> None:
500500 pass
501501 time .sleep (self .interval )
502502
503- self ._thread = threading .Thread (target = monitor_loop , daemon = True )
503+ self ._thread = threading .Thread (target = monitor_loop , daemon = False )
504504 self ._thread .start ()
505505
506506 def stop (self ) -> None :
You can’t perform that action at this time.
0 commit comments