
Commit 358908d

add parallel generations to flux ptxla code.
1 parent 9ae0379 commit 358908d

2 files changed (+93, -87 lines)
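The commit moves the per-device work into a worker function that is fanned out with torch_xla.distributed.xla_multiprocessing (xmp), so each process generates its own image with its own seed. As a rough, minimal sketch of that pattern (illustrative only; _worker and the seed spacing of 1000 mirror the diff below and are not library API):

import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp

def _worker(index, base_seed):
    # xmp.spawn runs this function once per local XLA device and passes the
    # process index as the first positional argument.
    device = xm.xla_device()
    xm.set_rng_state(seed=base_seed + index * 1000, device=device)
    # ... load the pipeline and run generation for this process here ...

if __name__ == '__main__':
    # one process per visible TPU/XLA device; extra args follow the index
    xmp.spawn(_worker, args=(4096,))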

examples/research_projects/pytorch_xla/inference/flux/generate_flux.py

Lines changed: 53 additions & 47 deletions
@@ -13,46 +13,29 @@
import torch_xla.debug.metrics as met

from diffusers import FluxPipeline
+import torch_xla.distributed.xla_multiprocessing as xmp

logger = structlog.get_logger()
metrics_filepath = '/tmp/metrics_report.txt'

-if __name__ == '__main__':
-    parser = ArgumentParser()
-    parser.add_argument('--schnell', action='store_true', help='run flux schnell instead of dev')
-    parser.add_argument('--width', type=int, default=1024, help='width of the image to generate')
-    parser.add_argument('--height', type=int, default=1024, help='height of the image to generate')
-    parser.add_argument('--guidance', type=float, default=3.5, help='gauidance strentgh for dev')
-    parser.add_argument('--seed', type=int, default=None, help='seed for inference')
-    parser.add_argument('--profile', action='store_true', help='enable profiling')
-    parser.add_argument('--profile-duration', type=int, default=10000, help='duration for profiling in msec.')
-    args = parser.parse_args()
+def _main(index, args, text_pipe, ckpt_id):

-    cache_path = Path('/tmp/data/compiler_cache')
+    cache_path = Path('/tmp/data/compiler_cache_tRiLlium_eXp')
    cache_path.mkdir(parents=True, exist_ok=True)
    xr.initialize_cache(str(cache_path), readonly=False)

-    profile_path = Path('/tmp/data/profiler_out')
+    profile_path = Path('/tmp/data/profiler_out_tRiLlium_eXp')
    profile_path.mkdir(parents=True, exist_ok=True)
    profiler_port = 9012
    profile_duration = args.profile_duration
    if args.profile:
        logger.info(f'starting profiler on port {profiler_port}')
        _ = xp.start_server(profiler_port)
+    device0 = xm.xla_device()

-    device0 = xm.xla_device(0)
-    device1 = xm.xla_device(1)
-    logger.info(f'text encoders: {device0}, flux: {device1}')
-
-    if args.schnell:
-        ckpt_id = "black-forest-labs/FLUX.1-schnell"
-    else:
-        ckpt_id = "black-forest-labs/FLUX.1-dev"
    logger.info(f'loading flux from {ckpt_id}')
-
-    text_pipe = FluxPipeline.from_pretrained(ckpt_id, transformer=None, vae=None, torch_dtype=torch.bfloat16).to(device0)
    flux_pipe = FluxPipeline.from_pretrained(ckpt_id, text_encoder=None, tokenizer=None,
-                                             text_encoder_2=None, tokenizer_2=None, torch_dtype=torch.bfloat16).to(device1)
+                                             text_encoder_2=None, tokenizer_2=None, torch_dtype=torch.bfloat16).to(device0)

    prompt = 'photograph of an electronics chip in the shape of a race car with trillium written on its side'
    width = args.width
@@ -65,35 +48,58 @@
    with torch.no_grad():
        prompt_embeds, pooled_prompt_embeds, text_ids = text_pipe.encode_prompt(
            prompt=prompt, prompt_2=None, max_sequence_length=512)
-        prompt_embeds = prompt_embeds.to(device1)
-        pooled_prompt_embeds = pooled_prompt_embeds.to(device1)
+        prompt_embeds = prompt_embeds.to(device0)
+        pooled_prompt_embeds = pooled_prompt_embeds.to(device0)

    image = flux_pipe(prompt_embeds=prompt_embeds, pooled_prompt_embeds=pooled_prompt_embeds,
                      num_inference_steps=28, guidance_scale=guidance, height=height, width=width).images[0]
    logger.info(f'compilation took {perf_counter() - ts} sec.')
    image.save('/tmp/compile_out.png')

-    seed = 0 if args.seed is None else args.seed
-    xm.set_rng_state(seed=seed, device=device0)
-    xm.set_rng_state(seed=seed, device=device1)
-
+    base_seed = 4096 if args.seed is None else args.seed
+    seed_range = 1000
+    unique_seed = base_seed + index * seed_range
+    xm.set_rng_state(seed=unique_seed, device=device0)
+    times = []
    logger.info('starting inference run...')
-    ts = perf_counter()
-    with torch.no_grad():
-        prompt_embeds, pooled_prompt_embeds, text_ids = text_pipe.encode_prompt(
-            prompt=prompt, prompt_2=None, max_sequence_length=512)
-        prompt_embeds = prompt_embeds.to(device1)
-        pooled_prompt_embeds = pooled_prompt_embeds.to(device1)
-    xm.wait_device_ops()
-
-    if args.profile:
-        xp.trace_detached(f"localhost:{profiler_port}", str(profile_path), duration_ms=profile_duration)
-    image = flux_pipe(prompt_embeds=prompt_embeds, pooled_prompt_embeds=pooled_prompt_embeds,
-                      num_inference_steps=n_steps, guidance_scale=guidance, height=height, width=width).images[0]
-    logger.info(f'inference took {perf_counter() - ts} sec.')
-    image.save('/tmp/inference_out.png')
-    metrics_report = met.metrics_report()
-    with open(metrics_filepath, 'w+') as fout:
-        fout.write(metrics_report)
-    logger.info(f'saved metric information as {metrics_filepath}')
+    for _ in range(args.itters):
+        ts = perf_counter()
+        with torch.no_grad():
+            prompt_embeds, pooled_prompt_embeds, text_ids = text_pipe.encode_prompt(
+                prompt=prompt, prompt_2=None, max_sequence_length=512)
+            prompt_embeds = prompt_embeds.to(device0)
+            pooled_prompt_embeds = pooled_prompt_embeds.to(device0)
+
+        if args.profile:
+            xp.trace_detached(f"localhost:{profiler_port}", str(profile_path), duration_ms=profile_duration)
+        image = flux_pipe(prompt_embeds=prompt_embeds, pooled_prompt_embeds=pooled_prompt_embeds,
+                          num_inference_steps=n_steps, guidance_scale=guidance, height=height, width=width).images[0]
+        inference_time = perf_counter() - ts
+        if index == 0:
+            logger.info(f"inference time: {inference_time}")
+        times.append(inference_time)
+    logger.info(f'avg. inference over {args.itters} iterations took {sum(times)/len(times)} sec.')
+    image.save(f'/home/tmp/inference_out-{index}.png')
+    if index == 0:
+        metrics_report = met.metrics_report()
+        with open(metrics_filepath, 'w+') as fout:
+            fout.write(metrics_report)
+        logger.info(f'saved metric information as {metrics_filepath}')

+if __name__ == '__main__':
+    parser = ArgumentParser()
+    parser.add_argument('--schnell', action='store_true', help='run flux schnell instead of dev')
+    parser.add_argument('--width', type=int, default=1024, help='width of the image to generate')
+    parser.add_argument('--height', type=int, default=1024, help='height of the image to generate')
+    parser.add_argument('--guidance', type=float, default=3.5, help='gauidance strentgh for dev')
+    parser.add_argument('--seed', type=int, default=None, help='seed for inference')
+    parser.add_argument('--profile', action='store_true', help='enable profiling')
+    parser.add_argument('--profile-duration', type=int, default=10000, help='duration for profiling in msec.')
+    parser.add_argument('--itters', type=int, default=15, help='tiems to run inference and get avg time in sec.')
+    args = parser.parse_args()
+    if args.schnell:
+        ckpt_id = "black-forest-labs/FLUX.1-schnell"
+    else:
+        ckpt_id = "black-forest-labs/FLUX.1-dev"
+    text_pipe = FluxPipeline.from_pretrained(ckpt_id, transformer=None, vae=None, torch_dtype=torch.bfloat16).to('cpu')
+    xmp.spawn(_main, args=(args, text_pipe, ckpt_id))
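For reference, one plausible way to invoke the updated script on a TPU host; the flag names come from the argparse definitions above, while the exact command used by the authors is not shown in the commit:

python generate_flux.py --width 1024 --height 1024 --guidance 3.5 --itters 15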

src/diffusers/pipelines/flux/pipeline_flux.py

Lines changed: 40 additions & 40 deletions
@@ -708,48 +708,48 @@ def __call__(
            guidance = None

        # 6. Denoising loop
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i, t in enumerate(timesteps):
-                if self.interrupt:
-                    continue
-
-                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-                timestep = t.expand(latents.shape[0]).to(latents.dtype)
-
-                noise_pred = self.transformer(
-                    hidden_states=latents,
-                    timestep=timestep / 1000,
-                    guidance=guidance,
-                    pooled_projections=pooled_prompt_embeds,
-                    encoder_hidden_states=prompt_embeds,
-                    txt_ids=text_ids,
-                    img_ids=latent_image_ids,
-                    joint_attention_kwargs=self.joint_attention_kwargs,
-                    return_dict=False,
-                )[0]
-
-                # compute the previous noisy sample x_t -> x_t-1
-                latents_dtype = latents.dtype
-
-                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
-
-                if latents.dtype != latents_dtype:
-                    if torch.backends.mps.is_available():
-                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
-                        latents = latents.to(latents_dtype)
-
-                if callback_on_step_end is not None:
-                    callback_kwargs = {}
-                    for k in callback_on_step_end_tensor_inputs:
-                        callback_kwargs[k] = locals()[k]
-                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
-                    latents = callback_outputs.pop("latents", latents)
-                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+        #with self.progress_bar(total=num_inference_steps) as progress_bar:
+        for i, t in enumerate(timesteps):
+            if self.interrupt:
+                continue
+
+            # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+            timestep = t.expand(latents.shape[0]).to(latents.dtype)
+
+            noise_pred = self.transformer(
+                hidden_states=latents,
+                timestep=timestep / 1000,
+                guidance=guidance,
+                pooled_projections=pooled_prompt_embeds,
+                encoder_hidden_states=prompt_embeds,
+                txt_ids=text_ids,
+                img_ids=latent_image_ids,
+                joint_attention_kwargs=self.joint_attention_kwargs,
+                return_dict=False,
+            )[0]
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents_dtype = latents.dtype
+
+            latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+            if latents.dtype != latents_dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    latents = latents.to(latents_dtype)
+
+            if callback_on_step_end is not None:
+                callback_kwargs = {}
+                for k in callback_on_step_end_tensor_inputs:
+                    callback_kwargs[k] = locals()[k]
+                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                latents = callback_outputs.pop("latents", latents)
+                prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)

                # call the callback, if provided
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    progress_bar.update()
+                # if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                #     progress_bar.update()

                if XLA_AVAILABLE:
                    xm.mark_step()
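In the modified denoising loop the pipeline's progress-bar wrapper is commented out, while the per-step xm.mark_step() call in the unchanged XLA_AVAILABLE block above is kept, so each scheduler step is still cut into its own lazy XLA graph. A minimal sketch of that per-step pattern (names other than xm.mark_step are hypothetical):

import torch_xla.core.xla_model as xm

for i, t in enumerate(timesteps):
    latents = denoise_step(latents, t)  # hypothetical per-step update
    xm.mark_step()                      # materialize this step's lazy graph on the device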
