@@ -46,7 +46,7 @@ def build_grid_gather_index_and_bucket_fast(H, W, pool_h, pool_w, seqlen):
         Gh = (H + pool_h - 1) // pool_h
         Gw = (W + pool_w - 1) // pool_w

-        # 单帧
+        # Single frame
         gather_single = []
         bucket_sizes_single = []

@@ -90,23 +90,22 @@ def build_grid_gather_index_and_bucket_fast(H, W, pool_h, pool_w, seqlen):

     @classmethod
     @torch.compiler.disable
-    def prepare_reorg_idx_and_bucket_offset(cls, seqlen, frame_h, frame_w):
+    def prepare_reorg_idx_and_bucket_offset(cls, seqlen, frame_h, frame_w, pool_h, pool_w, device):
         if (seqlen, frame_h, frame_w) in cls.reorg_idx_dict:
             return
-        pool_h, pool_w = (8, 16) if frame_h < frame_w else (16, 8)
         reorg_idx, bucket_sizes, bucket_offsets = cls.build_grid_gather_index_and_bucket_fast(
             H=frame_h,
             W=frame_w,
             pool_h=pool_h,
             pool_w=pool_w,
             seqlen=seqlen,
         )
-        reorg_idx = torch.tensor(reorg_idx, dtype=torch.long, device="cuda")
+        reorg_idx = torch.tensor(reorg_idx, dtype=torch.long, device=device)
         restore_idx = torch.empty_like(reorg_idx)
-        restore_idx[reorg_idx] = torch.arange(reorg_idx.numel(), device=reorg_idx.device)
+        restore_idx[reorg_idx] = torch.arange(reorg_idx.numel(), device=device)
         cls.reorg_idx_dict[(seqlen, frame_h, frame_w)] = reorg_idx
         cls.restore_idx_dict[(seqlen, frame_h, frame_w)] = restore_idx
-        cls.bucket_offsets_dict[(seqlen, frame_h, frame_w)] = torch.tensor(bucket_offsets, dtype=torch.int32, device="cuda")
+        cls.bucket_offsets_dict[(seqlen, frame_h, frame_w)] = torch.tensor(bucket_offsets, dtype=torch.int32, device=device)
         logger.info(f"DraftAttnWeight: reorg_idx len: {len(reorg_idx)}")
         logger.info(f"DraftAttnWeight: bucket_sizes: {bucket_sizes}")
         logger.info(f"DraftAttnWeight: bucket_offsets: {bucket_offsets}")
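For context (not part of the patch): the restore_idx line above builds the inverse permutation of reorg_idx, so gathering with reorg_idx and then with restore_idx returns tokens to their original order. A minimal standalone sketch of that trick, using a made-up 4-element permutation:

import torch

# Made-up toy permutation standing in for reorg_idx.
reorg_idx = torch.tensor([2, 0, 3, 1])
x = torch.arange(4) * 10                       # original order: [0, 10, 20, 30]

reordered = x[reorg_idx]                       # tokens gathered into bucket order

# Inverse permutation: restore_idx[reorg_idx[i]] = i for every position i.
restore_idx = torch.empty_like(reorg_idx)
restore_idx[reorg_idx] = torch.arange(reorg_idx.numel())

assert torch.equal(reordered[restore_idx], x)  # round-trips to the original order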
@@ -136,7 +135,7 @@ def sample_qk_attention_2d(
         q_vid = q_vid.permute(0, 3, 4, 1, 2).reshape(num_frames, H * D, frame_h, frame_w)
         k_vid = k_vid.permute(0, 3, 4, 1, 2).reshape(num_frames, H * D, frame_h, frame_w)

-        # 3) 2D max-pool each frame (ceil_mode ensures we cover the edges):
+        # 3) 2D avg-pool each frame (ceil_mode ensures we cover the edges):
         #    → [num_frames, H*D, S_h, S_w]
         q_pooled = F.avg_pool2d(q_vid, kernel_size=(pool_h, pool_w), stride=(pool_h, pool_w), ceil_mode=True)
         k_pooled = F.avg_pool2d(k_vid, kernel_size=(pool_h, pool_w), stride=(pool_h, pool_w), ceil_mode=True)
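For context (not part of the patch): the avg-pooling above reduces each frame's q/k to a coarse S_h × S_w grid before the draft attention map is computed. A minimal sketch of just that step, with invented sizes and q already reshaped to [num_frames, H*D, frame_h, frame_w]:

import torch
import torch.nn.functional as F

# Invented sizes for illustration only.
num_frames, H, D, frame_h, frame_w = 2, 4, 8, 30, 45
pool_h, pool_w = (8, 16) if frame_h < frame_w else (16, 8)

q_vid = torch.randn(num_frames, H * D, frame_h, frame_w)

# ceil_mode=True keeps the partial cells at the right/bottom edges.
q_pooled = F.avg_pool2d(q_vid, kernel_size=(pool_h, pool_w),
                        stride=(pool_h, pool_w), ceil_mode=True)

S_h = (frame_h + pool_h - 1) // pool_h         # 4 for the sizes above
S_w = (frame_w + pool_w - 1) // pool_w         # 3 for the sizes above
assert q_pooled.shape == (num_frames, H * D, S_h, S_w)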
@@ -186,7 +185,7 @@ def attention_percentile_mask_headwise(self, attn_map: torch.Tensor, r: float) -
         if k >= n:
             return torch.zeros_like(attn_map, dtype=torch.bool)

-        # 每个 head 独立计算阈值
+        # Calculate threshold for each head independently
         thresholds = torch.kthvalue(flat, k, dim=1).values  # [H]
         mask = attn_map >= thresholds[:, None, None]  # broadcasting

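For context (not part of the patch): torch.kthvalue gives every head its own threshold, the k-th smallest entry of that head's flattened map, and the comparison keeps entries at or above it. A toy sketch, assuming attn_map is [H, S_q, S_k] and assuming k is derived from the ratio r as below (the actual formula for k sits outside this hunk):

import torch

# Invented shapes: H heads, an S_q x S_k pooled attention map per head.
H, S_q, S_k = 2, 4, 4
attn_map = torch.rand(H, S_q, S_k)

r = 0.75                                       # assumed ratio used to pick k
flat = attn_map.reshape(H, -1)                 # [H, n]
n = flat.shape[1]
k = max(int(n * r), 1)                         # assumption; not the patch's formula

# k-th smallest value per head becomes that head's threshold.
thresholds = torch.kthvalue(flat, k, dim=1).values       # [H]
mask = attn_map >= thresholds[:, None, None]             # [H, S_q, S_k]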
@@ -221,14 +220,17 @@ def apply(
         block_size = frame_h * frame_w
         num_frames = seqlen // block_size

+        pool_h, pool_w = (8, 16) if frame_h < frame_w else (16, 8)
+
         self.prepare_reorg_idx_and_bucket_offset(
             seqlen=seqlen,
             frame_h=frame_h,
             frame_w=frame_w,
+            pool_h=pool_h,
+            pool_w=pool_w,
+            device=q.device,
         )

-        pool_h, pool_w = (8, 16) if frame_h < frame_w else (16, 8)
-
         attn = self.sample_qk_attention_2d(
             q,
             k,
@@ -258,7 +260,7 @@ def apply(

         q_ranges = torch.stack([q_start, q_end], dim=1).to(dtype=torch.int32)
         k_ranges = torch.stack([k_start, k_end], dim=1).to(dtype=torch.int32)
-        attn_type_map = torch.zeros(len(q_ranges), dtype=torch.int32, device="cuda")
+        attn_type_map = torch.zeros(len(q_ranges), dtype=torch.int32, device=q.device)

         reorg_idx = self.reorg_idx_dict[(seqlen, frame_h, frame_w)]
         q = q[reorg_idx]
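For context (not part of the patch): after gathering with reorg_idx, each bucket's tokens occupy one contiguous slice of the sequence, so a (start, end) pair per bucket is enough to describe the q/k ranges handed to the ranged attention kernel, and attn_type_map simply tags every pair with type 0. A toy sketch under that assumption; the bucket layout below is invented, and how q_start/q_end are actually derived is not shown in this hunk:

import torch

# Invented layout: 6 tokens split into buckets {0, 3, 5} and {1, 2, 4}.
reorg_idx = torch.tensor([0, 3, 5, 1, 2, 4])
bucket_offsets = torch.tensor([0, 3, 6], dtype=torch.int32)

q = torch.randn(6, 2, 4)                       # [seqlen, heads, head_dim]
q_reorg = q[reorg_idx]                         # bucket members are now contiguous

q_start, q_end = bucket_offsets[:-1], bucket_offsets[1:]
q_ranges = torch.stack([q_start, q_end], dim=1)           # [[0, 3], [3, 6]]

# One attention type (0) per range pair, allocated on q's device.
attn_type_map = torch.zeros(len(q_ranges), dtype=torch.int32, device=q.device)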