File tree Expand file tree Collapse file tree 2 files changed +7
-3
lines changed
inference/huggingface/stable-diffusion Expand file tree Collapse file tree 2 files changed +7
-3
lines changed Original file line number Diff line number Diff line change @@ -11,6 +11,8 @@ pip install -r requirements.txt
1111Examples can be run as follows:
1212<pre >deepspeed --num_gpus [number of GPUs] test-[model].py</pre >
1313
14+ NOTE: Local CUDA graphs for replaced SD modules will only be enabled when `mp_size==1`.
15+
1416# Example Output
1517Command:
1618<pre >
Original file line number Diff line number Diff line change 99model = "prompthero/midjourney-v4-diffusion"
1010local_rank = int(os.getenv("LOCAL_RANK", "0"))
1111device = torch.device(f"cuda:{local_rank}")
12- world_size = int(os.getenv('WORLD_SIZE', '4'))
12+ world_size = int(os.getenv('WORLD_SIZE', '1'))
1313generator = torch.Generator(device=torch.cuda.current_device())
1414
1515pipe = DiffusionPipeline.from_pretrained(model, torch_dtype=torch.half)
1919baseline_image = pipe(prompt, guidance_scale=7.5, generator=generator).images[0]
2020baseline_image.save(f"baseline.png")
2121
22- # NOTE: DeepSpeed inference supports local CUDA graphs for replaced SD modules
22+ # NOTE: DeepSpeed inference supports local CUDA graphs for replaced SD modules.
23+ # Local CUDA graphs for replaced SD modules will only be enabled when `mp_size==1`.
2324pipe = deepspeed.init_inference(
2425    pipe,
26+     mp_size=world_size,
2527    dtype=torch.half,
2628    replace_with_kernel_inject=True,
27-     enable_cuda_graph=True,
29+     enable_cuda_graph=True if world_size == 1 else False,
2830)
2931
3032generator.manual_seed(0xABEDABE7)
You can’t perform that action at this time.
0 commit comments