|
1 | 1 | import torch |
2 | 2 |
|
3 | 3 | from lightx2v.common.offload.manager import WeightAsyncStreamManager |
4 | | -from lightx2v.models.networks.qwen_image.infer.transformer_infer import QwenImageTransformerInfer |
| 4 | +from lightx2v.models.networks.qwen_image.infer.transformer_infer import ( |
| 5 | + QwenImageTransformerInfer, |
| 6 | +) |
5 | 7 | from lightx2v_platform.base.global_var import AI_DEVICE |
6 | 8 |
|
7 | 9 | torch_device_module = getattr(torch, AI_DEVICE) |
|
10 | 12 | class QwenImageOffloadTransformerInfer(QwenImageTransformerInfer): |
11 | 13 | def __init__(self, config): |
12 | 14 | super().__init__(config) |
13 | | - self.phases_num = 3 |
14 | 15 | self.num_blocks = config["num_layers"] |
| 16 | + self.phases_num = 4 |
15 | 17 | if self.config.get("cpu_offload", False): |
16 | 18 | if "offload_ratio" in self.config: |
17 | 19 | self.offload_ratio = self.config["offload_ratio"] |
18 | 20 | else: |
19 | 21 | self.offload_ratio = 1 |
20 | 22 | offload_granularity = self.config.get("offload_granularity", "block") |
21 | 23 | if offload_granularity == "block": |
22 | | - if not self.config.get("lazy_load", False): |
23 | | - self.infer_func = self.infer_with_blocks_offload |
24 | | - else: |
25 | | - assert NotImplementedError |
26 | | - |
27 | | - if offload_granularity != "model": |
| 24 | + self.infer_func = self.infer_with_blocks_offload |
28 | 25 | self.offload_manager = WeightAsyncStreamManager(offload_granularity=offload_granularity) |
29 | | - else: |
30 | | - assert NotImplementedError |
| 26 | + elif offload_granularity == "phase": |
| 27 | + self.infer_func = self.infer_with_phases_offload |
| 28 | + self.offload_manager = WeightAsyncStreamManager(offload_granularity=offload_granularity) |
| 29 | + |
| 30 | + self.lazy_load = self.config.get("lazy_load", False) |
| 31 | + if self.lazy_load: |
| 32 | + self.offload_manager.init_lazy_load(num_workers=self.config.get("num_disk_workers", 4)) |
| 33 | + |
| 34 | + def infer_with_phases_offload( |
| 35 | + self, |
| 36 | + blocks, |
| 37 | + hidden_states, |
| 38 | + encoder_hidden_states, |
| 39 | + temb_img_silu, |
| 40 | + temb_txt_silu, |
| 41 | + image_rotary_emb, |
| 42 | + modulate_index, |
| 43 | + ): |
| 44 | + for block_idx in range(len(blocks)): |
| 45 | + self.block_idx = block_idx |
| 46 | + if self.lazy_load: |
| 47 | + next_prefetch = (block_idx + 1) % len(blocks) |
| 48 | + self.offload_manager.start_prefetch_block(next_prefetch) |
| 49 | + |
| 50 | + for phase_idx in range(self.phases_num): |
| 51 | + # if self.offload_manager.need_init_first_buffer: |
| 52 | + if block_idx == 0 and phase_idx == 0: |
| 53 | + self.offload_manager.init_first_buffer(blocks) |
| 54 | + |
| 55 | + next_block_idx = (block_idx + 1) % len(blocks) if phase_idx == self.phases_num - 1 else block_idx |
| 56 | + next_phase_idx = (phase_idx + 1) % self.phases_num |
| 57 | + if self.lazy_load: |
| 58 | + if phase_idx == self.phases_num - 1: |
| 59 | + self.offload_manager.swap_cpu_buffers() |
31 | 60 |
|
32 | | - def infer_with_blocks_offload(self, block_weights, hidden_states, encoder_hidden_states, temb_img_silu, temb_txt_silu, image_rotary_emb, modulate_index): |
| 61 | + self.offload_manager.prefetch_phase(next_block_idx, next_phase_idx, blocks) |
| 62 | + with torch_device_module.stream(self.offload_manager.compute_stream): |
| 63 | + if phase_idx == 0: |
| 64 | + img_query, img_key, img_value, img_gate1, img_mod2 = self.infer_img_qkv( |
| 65 | + img_attn_phase=self.offload_manager.cuda_buffers[phase_idx], |
| 66 | + hidden_states=hidden_states, |
| 67 | + temb_img_silu=temb_img_silu, |
| 68 | + img_freqs=image_rotary_emb[0], |
| 69 | + modulate_index=modulate_index, |
| 70 | + ) |
| 71 | + elif phase_idx == 1: |
| 72 | + txt_query, txt_key, txt_value, seq_txt, txt_gate1, txt_mod2 = self.infer_txt_qkv( |
| 73 | + txt_attn_phase=self.offload_manager.cuda_buffers[phase_idx], |
| 74 | + encoder_hidden_states=encoder_hidden_states, |
| 75 | + temb_txt_silu=temb_txt_silu, |
| 76 | + txt_freqs=image_rotary_emb[1], |
| 77 | + ) |
| 78 | + elif phase_idx == 2: |
| 79 | + hidden_states, encoder_hidden_states = self.infer_cross_attn( |
| 80 | + cross_attn_phase=self.offload_manager.cuda_buffers[phase_idx], |
| 81 | + seq_txt=seq_txt, |
| 82 | + img_query=img_query, |
| 83 | + img_key=img_key, |
| 84 | + img_value=img_value, |
| 85 | + txt_query=txt_query, |
| 86 | + txt_key=txt_key, |
| 87 | + txt_value=txt_value, |
| 88 | + img_gate1=img_gate1, |
| 89 | + txt_gate1=txt_gate1, |
| 90 | + hidden_states=hidden_states, |
| 91 | + encoder_hidden_states=encoder_hidden_states, |
| 92 | + ) |
| 93 | + |
| 94 | + elif phase_idx == 3: |
| 95 | + encoder_hidden_states, hidden_states = self.infer_ffn( |
| 96 | + ffn_phase=self.offload_manager.cuda_buffers[phase_idx], |
| 97 | + hidden_states=hidden_states, |
| 98 | + encoder_hidden_states=encoder_hidden_states, |
| 99 | + img_mod2=img_mod2, |
| 100 | + txt_mod2=txt_mod2, |
| 101 | + modulate_index=modulate_index, |
| 102 | + ) |
| 103 | + self.offload_manager.swap_phases() |
| 104 | + |
| 105 | + return hidden_states |
| 106 | + |
| 107 | + def infer_with_blocks_offload( |
| 108 | + self, |
| 109 | + blocks, |
| 110 | + hidden_states, |
| 111 | + encoder_hidden_states, |
| 112 | + temb_img_silu, |
| 113 | + temb_txt_silu, |
| 114 | + image_rotary_emb, |
| 115 | + modulate_index, |
| 116 | + ): |
33 | 117 | for block_idx in range(self.num_blocks): |
34 | 118 | self.block_idx = block_idx |
| 119 | + |
| 120 | + if self.lazy_load: |
| 121 | + next_prefetch = (block_idx + 1) % self.num_blocks |
| 122 | + self.offload_manager.start_prefetch_block(next_prefetch) |
| 123 | + |
35 | 124 | if block_idx == 0: |
36 | | - self.offload_manager.init_first_buffer(block_weights.blocks) |
37 | | - if block_idx + 1 < self.num_blocks: |
38 | | - self.offload_manager.prefetch_weights(block_idx + 1, block_weights.blocks) |
| 125 | + self.offload_manager.init_first_buffer(blocks) |
| 126 | + |
| 127 | + if self.lazy_load: |
| 128 | + self.offload_manager.swap_cpu_buffers() |
| 129 | + self.offload_manager.prefetch_weights((block_idx + 1) % self.num_blocks, blocks) |
| 130 | + |
39 | 131 | with torch_device_module.stream(self.offload_manager.compute_stream): |
40 | 132 | encoder_hidden_states, hidden_states = self.infer_block( |
41 | | - block_weight=self.offload_manager.cuda_buffers[0], |
| 133 | + block=self.offload_manager.cuda_buffers[0], |
42 | 134 | hidden_states=hidden_states, |
43 | 135 | encoder_hidden_states=encoder_hidden_states, |
44 | 136 | temb_img_silu=temb_img_silu, |
|
0 commit comments