Update 1105 sst test code: decode the first stage in temporal chunks with fake cp

zRzRzRzRzRzRzR · zRzRzRzRzRzRzR · commit 4a3035d64e58 · 2024-11-05T12:55:54.000+08:00
diff --git a/sat/diffusion_video.py b/sat/diffusion_video.py
@@ -179,19 +179,31 @@ def decode_first_stage(self, z):
         n_samples = default(self.en_and_decode_n_samples_a_time, z.shape[0])
         n_rounds = math.ceil(z.shape[0] / n_samples)
         all_out = []
-        with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast):
-            for n in range(n_rounds):
-                if isinstance(self.first_stage_model.decoder, VideoDecoder):
-                    kwargs = {"timesteps": len(z[n * n_samples : (n + 1) * n_samples])}
-                else:
-                    kwargs = {}
-                frame = z.shape[2] * 4 - 3
-                if frame <= 9:
-                    use_cp = False
-                else:
-                    use_cp = True
-                out = self.first_stage_model.decode(z[n * n_samples : (n + 1) * n_samples], use_cp=use_cp, **kwargs)
-                all_out.append(out)
+        for n in range(n_rounds):
+            z_now = z[n * n_samples : (n + 1) * n_samples, :, 1:]
+            latent_time = z_now.shape[2]  # check the time latent
+            temporal_compress_times = 4
+
+            fake_cp_size = min(10, latent_time // 2)
+            start_frame = 0
+
+            recons = []
+            start_frame = 0
+            for i in range(fake_cp_size):
+                end_frame = start_frame + latent_time // fake_cp_size + (1 if i < latent_time % fake_cp_size else 0)
+
+                fake_cp_rank0 = True if i == 0 else False
+                clear_fake_cp_cache = True if i == fake_cp_size - 1 else False
+                with torch.no_grad():
+                    recon = self.first_stage_model.decode(
+                        z_now[:, :, start_frame:end_frame].contiguous(),
+                        clear_fake_cp_cache=clear_fake_cp_cache,
+                        fake_cp_rank0=fake_cp_rank0,
+                    )
+                recons.append(recon)
+                start_frame = end_frame
+            recons = torch.cat(recons, dim=2)
+            all_out.append(recons)
         out = torch.cat(all_out, dim=0)
         return out
 
diff --git a/sat/dit_video_concat.py b/sat/dit_video_concat.py
@@ -654,7 +654,6 @@ def __init__(
         time_interpolation=1.0,
         use_SwiGLU=False,
         use_RMSNorm=False,
-        cfg_embed_dim=None,
         ofs_embed_dim=None,
         **kwargs,
     ):
@@ -669,7 +668,6 @@ def __init__(
         self.hidden_size = hidden_size
         self.model_channels = hidden_size
         self.time_embed_dim = time_embed_dim if time_embed_dim is not None else hidden_size
-        self.cfg_embed_dim = cfg_embed_dim
         self.ofs_embed_dim = ofs_embed_dim
         self.num_classes = num_classes
         self.adm_in_channels = adm_in_channels
@@ -728,13 +726,6 @@ def _build_modules(self, module_configs):
                 linear(self.ofs_embed_dim, self.ofs_embed_dim),
             )
 
-        if self.cfg_embed_dim is not None:
-            self.cfg_embed = nn.Sequential(
-                linear(self.cfg_embed_dim, self.cfg_embed_dim),
-                nn.SiLU(),
-                linear(self.cfg_embed_dim, self.cfg_embed_dim),
-            )
-
         if self.num_classes is not None:
             if isinstance(self.num_classes, int):
                 self.label_emb = nn.Embedding(self.num_classes, time_embed_dim)
@@ -848,14 +839,6 @@ def forward(self, x, timesteps=None, context=None, y=None, **kwargs):
             ofs_emb = timestep_embedding(kwargs["ofs"], self.ofs_embed_dim, repeat_only=False, dtype=self.dtype)
             ofs_emb = self.ofs_embed(ofs_emb)
             emb = emb + ofs_emb
-        if self.cfg_embed_dim is not None:
-            cfg_emb = kwargs["scale_emb"]
-            cfg_emb = self.cfg_embed(cfg_emb)
-            emb = emb + cfg_emb
-
-        if "ofs" in kwargs.keys():
-            ofs_emb = timestep_embedding(kwargs["ofs"], self.ofs_embed_dim, repeat_only=False, dtype=self.dtype)
-            ofs_emb = self.ofs_embed(ofs_emb)
 
         kwargs["seq_length"] = t * h * w // reduce(mul, self.patch_size)
         kwargs["images"] = x
diff --git a/sat/inference.sh b/sat/inference.sh
@@ -4,7 +4,7 @@ echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
 
 environs="WORLD_SIZE=1 RANK=0 LOCAL_RANK=0 LOCAL_WORLD_SIZE=1"
 
-run_cmd="$environs python sample_video.py --base configs/cogvideox_5b.yaml configs/inference.yaml --seed $RANDOM"
+run_cmd="$environs python sample_video.py --base configs/test_cogvideox_5b.yaml configs/test_inference.yaml --seed $RANDOM"
 
 echo ${run_cmd}
 eval ${run_cmd}
diff --git a/sat/sample_video.py b/sat/sample_video.py
@@ -135,14 +135,14 @@ def sampling_main(args, model_cls):
     sample_func = model.sample
     num_samples = [1]
     force_uc_zero_embeddings = ["txt"]
-
+    T, C = args.sampling_num_frames, args.latent_channels
     with torch.no_grad():
         for text, cnt in tqdm(data_iter):
             if args.image2video:
                 # use with input image shape
-                text, image_path = text.split('@@')
+                text, image_path = text.split("@@")
                 assert os.path.exists(image_path), image_path
-                image = Image.open(image_path).convert('RGB')
+                image = Image.open(image_path).convert("RGB")
                 (img_W, img_H) = image.size
 
                 def nearest_multiple_of_16(n):
@@ -163,7 +163,7 @@ def nearest_multiple_of_16(n):
                 chained_trainsforms.append(TT.Resize(size=[int(H * 8), int(W * 8)], interpolation=1))
                 chained_trainsforms.append(TT.ToTensor())
                 transform = TT.Compose(chained_trainsforms)
-                image = transform(image).unsqueeze(0).to('cuda')
+                image = transform(image).unsqueeze(0).to("cuda")
                 image = image * 2.0 - 1.0
                 image = image.unsqueeze(2).to(torch.bfloat16)
                 image = model.encode_first_stage(image, None)
@@ -173,7 +173,7 @@ def nearest_multiple_of_16(n):
                 image = torch.concat([image, torch.zeros(pad_shape).to(image.device).to(image.dtype)], dim=1)
             else:
                 image_size = args.sampling_image_size
-                T, H, W, C = args.sampling_num_frames, image_size[0], image_size[1], args.latent_channels
+                H, W = image_size[0], image_size[1]
                 F = 8  # 8x downsampled
                 image = None
 
@@ -183,11 +183,7 @@ def nearest_multiple_of_16(n):
             src = global_rank * mp_size
             torch.distributed.broadcast_object_list(text_cast, src=src, group=mpu.get_model_parallel_group())
             text = text_cast[0]
-            value_dict = {
-                'prompt': text,
-                'negative_prompt': '',
-                'num_frames': torch.tensor(T).unsqueeze(0)
-            }
+            value_dict = {"prompt": text, "negative_prompt": "", "num_frames": torch.tensor(T).unsqueeze(0)}
 
             batch, batch_uc = get_batch(
                 get_unique_embedder_keys_from_conditioner(model.conditioner), value_dict, num_samples
@@ -216,19 +212,15 @@ def nearest_multiple_of_16(n):
             for index in range(args.batch_size):
                 if args.image2video:
                     samples_z = sample_func(
-                        c,
-                        uc=uc,
-                        batch_size=1,
-                        shape=(T, C, H, W),
-                        ofs=torch.tensor([2.0]).to('cuda')
+                        c, uc=uc, batch_size=1, shape=(T, C, H, W), ofs=torch.tensor([2.0]).to("cuda")
                     )
                 else:
                     samples_z = sample_func(
                         c,
                         uc=uc,
                         batch_size=1,
                         shape=(T, C, H // F, W // F),
-                    ).to('cuda')
+                    ).to("cuda")
 
                 samples_z = samples_z.permute(0, 2, 1, 3, 4).contiguous()
                 if args.only_save_latents:
@@ -250,11 +242,12 @@ def nearest_multiple_of_16(n):
                     if mpu.get_model_parallel_rank() == 0:
                         save_video_as_grid_and_mp4(samples, save_path, fps=args.sampling_fps)
 
-if __name__ == '__main__':
-    if 'OMPI_COMM_WORLD_LOCAL_RANK' in os.environ:
-        os.environ['LOCAL_RANK'] = os.environ['OMPI_COMM_WORLD_LOCAL_RANK']
-        os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE']
-        os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK']
+
+if __name__ == "__main__":
+    if "OMPI_COMM_WORLD_LOCAL_RANK" in os.environ:
+        os.environ["LOCAL_RANK"] = os.environ["OMPI_COMM_WORLD_LOCAL_RANK"]
+        os.environ["WORLD_SIZE"] = os.environ["OMPI_COMM_WORLD_SIZE"]
+        os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"]
     py_parser = argparse.ArgumentParser(add_help=False)
     known, args_list = py_parser.parse_known_args()
 
diff --git a/sat/vae_modules/cp_enc_dec.py b/sat/vae_modules/cp_enc_dec.py