
Commit 1264745

[Misc] Small fixes to Torch code (#395)
Co-authored-by: Brayden Zhong <[email protected]>
Parent: 298f74f

5 files changed (+17 additions, -18 deletions)

csrc/sliding_tile_attention/test/test_sta.py

Lines changed: 1 addition & 2 deletions
@@ -2,7 +2,6 @@
 from flex_sta_ref import get_sliding_tile_attention_mask
 from st_attn import sliding_tile_attention
 from torch.nn.attention.flex_attention import flex_attention
-# from flash_attn_interface import flash_attn_func
 from tqdm import tqdm

 flex_attention = torch.compile(flex_attention, dynamic=False)
@@ -23,7 +22,7 @@ def h100_fwd_kernel_test(Q, K, V, kernel_size):
 def generate_tensor(shape, mean, std, dtype, device):
     tensor = torch.randn(shape, dtype=dtype, device=device)

-    magnitude = torch.norm(tensor, dim=-1, keepdim=True)
+    magnitude = torch.linalg.norm(tensor, dim=-1, keepdim=True)
     scaled_tensor = tensor * (torch.randn(magnitude.shape, dtype=dtype, device=device) * std + mean) / magnitude

     return scaled_tensor.contiguous()
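Context for the norm change: torch.norm is deprecated in favor of the torch.linalg namespace, and for this call pattern the two return identical values. A minimal sketch of the equivalence (shapes are arbitrary, chosen only for illustration):

    import torch

    x = torch.randn(4, 8)
    # Deprecated API used before the change:
    old = torch.norm(x, dim=-1, keepdim=True)
    # Preferred replacement used after the change:
    new = torch.linalg.norm(x, dim=-1, keepdim=True)
    assert torch.allclose(old, new)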

fastvideo/models/hunyuan/idle_config.py

Lines changed: 2 additions & 2 deletions
@@ -237,7 +237,7 @@ def add_inference_args(parser: argparse.ArgumentParser):
         type=str,
         default="540p",
         choices=["540p", "720p"],
-        help="Root path of all the models, including t2v models and extra models.",
+        help="The resolution of the model.",
     )
     group.add_argument(
         "--load-key",
@@ -361,7 +361,7 @@ def add_parallel_args(parser: argparse.ArgumentParser):
         "--ring-degree",
         type=int,
         default=1,
-        help="Ulysses degree.",
+        help="Ring degree.",
     )

     return parser
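Both help strings appear to have been copy-pasted from other flags; the diff only corrects the descriptions. A self-contained argparse sketch of the corrected options (the --resolution flag name is hypothetical, since the hunk begins below the argument name; only --ring-degree is visible in the diff):

    import argparse

    parser = argparse.ArgumentParser()
    # Hypothetical flag name; the diff shows only type/default/choices/help.
    parser.add_argument("--resolution", type=str, default="540p",
                        choices=["540p", "720p"],
                        help="The resolution of the model.")
    # Ring-attention degree, distinct from the Ulysses degree the old help text described.
    parser.add_argument("--ring-degree", type=int, default=1,
                        help="Ring degree.")

    args = parser.parse_args(["--resolution", "720p", "--ring-degree", "2"])
    print(args.resolution, args.ring_degree)  # -> 720p 2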

fastvideo/models/hunyuan/inference.py

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@
 from fastvideo.utils.parallel_states import nccl_info


-class Inference(object):
+class Inference:

     def __init__(
         self,
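The explicit object base is redundant: in Python 3 every class inherits from object implicitly, so the two spellings define identical classes. A quick check:

    class A:
        pass

    class B(object):
        pass

    # Both MROs end at object; the explicit base adds nothing in Python 3.
    assert A.__mro__ == (A, object) and B.__mro__ == (B, object)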

fastvideo/models/hunyuan/prompt_rewrite.py

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ def get_rewrite_prompt(ori_prompt, mode="Normal"):
     elif mode == "Master":
         prompt = master_mode_prompt.format(input=ori_prompt)
     else:
-        raise Exception("Only supports Normal and Normal", mode)
+        raise Exception("Only supports Normal and Master mode, but got {}".format(mode))
     return prompt

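The old message ("Only supports Normal and Normal") was a copy-paste typo, and passing mode as a second argument placed it in the exception's args tuple rather than in the message itself. A hypothetical further tightening, not part of this commit, would raise ValueError with an f-string:

    mode = "Fancy"  # any unsupported mode, for illustration
    try:
        if mode not in ("Normal", "Master"):
            raise ValueError(f"Only supports Normal and Master mode, but got {mode}")
    except ValueError as e:
        print(e)  # Only supports Normal and Master mode, but got Fancy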

fastvideo/models/stepvideo/text_encoder/stepllm.py

Lines changed: 12 additions & 12 deletions
@@ -267,25 +267,25 @@ def forward(
 class STEP1TextEncoder(torch.nn.Module):

     def __init__(self, model_dir, max_length=320):
-        super(STEP1TextEncoder, self).__init__()
+        super().__init__()
         self.max_length = max_length
         self.text_tokenizer = Wrapped_StepChatTokenizer(os.path.join(model_dir, 'step1_chat_tokenizer.model'))
         text_encoder = Step1Model.from_pretrained(model_dir)
         self.text_encoder = text_encoder.eval().to(torch.bfloat16)

     @torch.no_grad
+    @torch.autocast(device_type='cuda', dtype=torch.bfloat16)
     def forward(self, prompts, with_mask=True, max_length=None):
         self.device = next(self.text_encoder.parameters()).device
-        with torch.no_grad(), torch.cuda.amp.autocast(dtype=torch.bfloat16):
-            if type(prompts) is str:
-                prompts = [prompts]
-
-            txt_tokens = self.text_tokenizer(prompts,
-                                             max_length=max_length or self.max_length,
-                                             padding="max_length",
-                                             truncation=True,
-                                             return_tensors="pt")
-            y = self.text_encoder(txt_tokens.input_ids.to(self.device),
+        if type(prompts) is str:
+            prompts = [prompts]
+
+        txt_tokens = self.text_tokenizer(prompts,
+                                         max_length=max_length or self.max_length,
+                                         padding="max_length",
+                                         truncation=True,
+                                         return_tensors="pt")
+        y = self.text_encoder(txt_tokens.input_ids.to(self.device),
                               attention_mask=txt_tokens.attention_mask.to(self.device) if with_mask else None)
-            y_mask = txt_tokens.attention_mask
+        y_mask = txt_tokens.attention_mask
         return y.transpose(0, 1), y_mask
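The refactor moves grad-disabling and mixed precision from an in-body with block to decorators, and drops the deprecated torch.cuda.amp.autocast spelling in favor of torch.autocast(device_type='cuda', ...). Stacking the decorators is what allows the large dedent in the hunk above. A minimal sketch of the decorator form (toy function; assumes a CUDA device is available):

    import torch

    @torch.no_grad()
    @torch.autocast(device_type='cuda', dtype=torch.bfloat16)
    def matmul_bf16(x):
        # Matmul runs under bf16 autocast with gradient tracking disabled.
        return x @ x

    x = torch.randn(4, 4, device='cuda', requires_grad=True)
    y = matmul_bf16(x)
    assert y.dtype == torch.bfloat16 and not y.requires_grad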
