@@ -39,7 +39,6 @@
 t2i = T2I(model       = <path>        // models/ldm/stable-diffusion-v1/model.ckpt
           config      = <path>        // configs/stable-diffusion/v1-inference.yaml
           iterations  = <integer>     // how many times to run the sampling (1)
-          batch_size  = <integer>     // how many images to generate per sampling (1)
           steps       = <integer>     // 50
           seed        = <integer>     // current system time
           sampler_name= ['ddim', 'k_dpm_2_a', 'k_dpm_2', 'k_euler_a', 'k_euler', 'k_heun', 'k_lms', 'plms']   // k_lms
@@ -98,7 +97,6 @@ class T2I:
         model
         config
         iterations
-        batch_size
         steps
         seed
         sampler_name
@@ -116,7 +114,6 @@ class T2I:
 
     def __init__(
         self,
-        batch_size=1,
         iterations=1,
         steps=50,
         seed=None,
@@ -138,7 +135,6 @@ def __init__(
         latent_diffusion_weights=False,
         device='cuda',
     ):
-        self.batch_size = batch_size
         self.iterations = iterations
         self.width = width
         self.height = height
@@ -174,9 +170,7 @@ def prompt2png(self, prompt, outdir, **kwargs):
         Optional named arguments are the same as those passed to T2I and prompt2image()
         """
         results = self.prompt2image(prompt, **kwargs)
-        pngwriter = PngWriter(
-            outdir, prompt, kwargs.get('batch_size', self.batch_size)
-        )
+        pngwriter = PngWriter(outdir, prompt)
         for r in results:
             pngwriter.write_image(r[0], r[1])
         return pngwriter.files_written
@@ -196,7 +190,6 @@ def prompt2image(
         self,
         # these are common
         prompt,
-        batch_size=None,
         iterations=None,
         steps=None,
         seed=None,
@@ -222,8 +215,7 @@ def prompt2image(
         ldm.prompt2image() is the common entry point for txt2img() and img2img()
         It takes the following arguments:
            prompt                          // prompt string (no default)
-           iterations                      // iterations (1); image count=iterations x batch_size
-           batch_size                      // images per iteration (1)
+           iterations                      // iterations (1); image count=iterations
            steps                           // refinement steps per iteration
            seed                            // seed for random number generator
            width                           // width of image, in multiples of 64 (512)
@@ -258,7 +250,6 @@ def process_image(image,seed):
         height = height or self.height
         cfg_scale = cfg_scale or self.cfg_scale
         ddim_eta = ddim_eta or self.ddim_eta
-        batch_size = batch_size or self.batch_size
         iterations = iterations or self.iterations
         strength = strength or self.strength
         self.log_tokenization = log_tokenization
@@ -297,7 +288,6 @@ def process_image(image,seed):
                 images_iterator = self._img2img(
                     prompt,
                     precision_scope=scope,
-                    batch_size=batch_size,
                     steps=steps,
                     cfg_scale=cfg_scale,
                     ddim_eta=ddim_eta,
@@ -312,7 +302,6 @@ def process_image(image,seed):
                 images_iterator = self._txt2img(
                     prompt,
                     precision_scope=scope,
-                    batch_size=batch_size,
                     steps=steps,
                     cfg_scale=cfg_scale,
                     ddim_eta=ddim_eta,
@@ -325,11 +314,10 @@ def process_image(image,seed):
             with scope(self.device.type), self.model.ema_scope():
                 for n in trange(iterations, desc='Generating'):
                     seed_everything(seed)
-                    iter_images = next(images_iterator)
-                    for image in iter_images:
-                        results.append([image, seed])
-                        if image_callback is not None:
-                            image_callback(image, seed)
+                    image = next(images_iterator)
+                    results.append([image, seed])
+                    if image_callback is not None:
+                        image_callback(image, seed)
                     seed = self._new_seed()
 
         if upscale is not None or gfpgan_strength > 0:
@@ -399,7 +387,6 @@ def _txt2img(
         self,
         prompt,
         precision_scope,
-        batch_size,
         steps,
         cfg_scale,
         ddim_eta,
@@ -415,31 +402,30 @@ def _txt2img(
             sampler = self.sampler
 
         while True:
-            uc, c = self._get_uc_and_c(prompt, batch_size, skip_normalize)
+            uc, c = self._get_uc_and_c(prompt, skip_normalize)
             shape = [
                 self.latent_channels,
                 height // self.downsampling_factor,
                 width // self.downsampling_factor,
             ]
             samples, _ = sampler.sample(
+                batch_size=1,
                 S=steps,
                 conditioning=c,
-                batch_size=batch_size,
                 shape=shape,
                 verbose=False,
                 unconditional_guidance_scale=cfg_scale,
                 unconditional_conditioning=uc,
                 eta=ddim_eta,
                 img_callback=callback
             )
-            yield self._samples_to_images(samples)
+            yield self._sample_to_image(samples)
 
     @torch.no_grad()
     def _img2img(
         self,
         prompt,
         precision_scope,
-        batch_size,
         steps,
         cfg_scale,
         ddim_eta,
@@ -464,7 +450,6 @@ def _img2img(
             sampler = self.sampler
 
         init_image = self._load_img(init_img, width, height).to(self.device)
-        init_image = repeat(init_image, '1 ... -> b ...', b=batch_size)
         with precision_scope(self.device.type):
             init_latent = self.model.get_first_stage_encoding(
                 self.model.encode_first_stage(init_image)
@@ -478,11 +463,11 @@ def _img2img(
         # print(f"target t_enc is {t_enc} steps")
 
         while True:
-            uc, c = self._get_uc_and_c(prompt, batch_size, skip_normalize)
+            uc, c = self._get_uc_and_c(prompt, skip_normalize)
 
             # encode (scaled latent)
             z_enc = sampler.stochastic_encode(
-                init_latent, torch.tensor([t_enc] * batch_size).to(self.device)
+                init_latent, torch.tensor([t_enc]).to(self.device)
             )
             # decode it
             samples = sampler.decode(
@@ -493,12 +478,12 @@ def _img2img(
                 unconditional_guidance_scale=cfg_scale,
                 unconditional_conditioning=uc,
             )
-            yield self._samples_to_images(samples)
+            yield self._sample_to_image(samples)
 
     # TODO: does this actually need to run every loop? does anything in it vary by random seed?
-    def _get_uc_and_c(self, prompt, batch_size, skip_normalize):
+    def _get_uc_and_c(self, prompt, skip_normalize):
 
-        uc = self.model.get_learned_conditioning(batch_size * [''])
+        uc = self.model.get_learned_conditioning([''])
 
         # weighted sub-prompts
         subprompts, weights = T2I._split_weighted_subprompts(prompt)
@@ -515,27 +500,23 @@ def _get_uc_and_c(self, prompt, batch_size, skip_normalize):
                 self._log_tokenization(subprompts[i])
                 c = torch.add(
                     c,
-                    self.model.get_learned_conditioning(
-                        batch_size * [subprompts[i]]
-                    ),
+                    self.model.get_learned_conditioning([subprompts[i]]),
                     alpha=weight,
                 )
         else:   # just standard 1 prompt
             self._log_tokenization(prompt)
-            c = self.model.get_learned_conditioning(batch_size * [prompt])
+            c = self.model.get_learned_conditioning([prompt])
         return (uc, c)
 
-    def _samples_to_images(self, samples):
+    def _sample_to_image(self, samples):
         x_samples = self.model.decode_first_stage(samples)
         x_samples = torch.clamp((x_samples + 1.0) / 2.0, min=0.0, max=1.0)
-        images = list()
-        for x_sample in x_samples:
-            x_sample = 255.0 * rearrange(
-                x_sample.cpu().numpy(), 'c h w -> h w c'
-            )
-            image = Image.fromarray(x_sample.astype(np.uint8))
-            images.append(image)
-        return images
+        if len(x_samples) != 1:
+            raise Exception(f'expected to get a single image, but got {len(x_samples)}')
+        x_sample = 255.0 * rearrange(
+            x_samples[0].cpu().numpy(), 'c h w -> h w c'
+        )
+        return Image.fromarray(x_sample.astype(np.uint8))
 
     def _new_seed(self):
         self.seed = random.randrange(0, np.iinfo(np.uint32).max)
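
For reference, a minimal usage sketch (not part of the commit) of how the batch-free API reads after this change. The class, method names, argument names, and the [image, seed] result shape come from the hunks above; the module path, prompt text, and output filenames are illustrative assumptions.

    # Sketch only: module path assumed from the repository layout.
    from ldm.simplet2i import T2I

    # Total image count is now just `iterations`; each iteration produces
    # one image and then reseeds via _new_seed().
    t2i = T2I(iterations=3, steps=50, sampler_name='k_lms')

    # prompt2image() returns a list of [image, seed] pairs, as consumed
    # by the loop in prompt2png() above.
    results = t2i.prompt2image(prompt='a watercolor lighthouse at dusk')
    for image, seed in results:
        image.save(f'out-{seed}.png')   # hypothetical output naming

Callers that previously relied on batch_size > 1 get the same total count by raising iterations, with the difference that every image now gets its own seed.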