
Commit b67bf3c

hanjian.thu123 committed
[update] add fine-tuning instructions

1 parent e693260 commit b67bf3c

File tree

3 files changed: +7 -4 lines changed

README.md

Lines changed: 4 additions & 0 deletions

@@ -155,6 +155,10 @@ We provide [eval.sh](scripts/eval.sh) for evaluation on various benchmarks with
 bash scripts/eval.sh
 ```
 
+## Fine-tuning
+Fine-tuning Infinity is quite simple: you only need to append ```--rush_resume=[infinity_vae_d32reg.pth]``` to [train.sh](scripts/train.sh).
+
+
 ## One More Thing: Infinity-20B is coming soon 📆
 Infinity shows strong scaling capabilities as illustrated before. Thus we are encouraged to continue to scale up the model size to 20B. Here we present the side-by-side comparison results between Infinity-2B and Infinity-20B.
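
As a concrete illustration of the new README instruction, here is a sketch of how the tail of the train.py argument list in [train.sh](scripts/train.sh) could look after the edit. Only the last flag is new; the two flags above it already appear in the script, and the checkpoint path is an assumed local download location, not a path shipped with the repository.

```bash
# Sketch only: end of the train.py invocation in scripts/train.sh after appending
# --rush_resume as the README describes. The checkpoint path below is an assumed
# location for infinity_vae_d32reg.pth and should point at your downloaded weights.
  --use_flex_attn=True \
  --pad=128 \
  --rush_resume=weights/infinity_vae_d32reg.pth
```

Note that `--pad=128`, previously the final flag, needs a trailing backslash once another flag follows it.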

scripts/train.sh

Lines changed: 2 additions & 2 deletions

@@ -103,7 +103,7 @@ train.py \
 --wpe=1 \
 --dynamic_resolution_across_gpus 1 \
 --enable_dynamic_length_prompt 1 \
---reweight_loss_by_scale 0 \
+--reweight_loss_by_scale 1 \
 --add_lvl_embeding_only_first_block 1 \
 --rope2d_each_sa_layer 1 \
 --rope2d_normalized_by_hw 2 \
@@ -117,6 +117,6 @@ train.py \
 --prefetch_factor=16 \
 --noise_apply_strength 0.3 \
 --noise_apply_layers 13 \
---apply_spatial_patchify 1 \
+--apply_spatial_patchify 0 \
 --use_flex_attn=True \
 --pad=128

trainer.py

Lines changed: 1 addition & 2 deletions

@@ -51,7 +51,6 @@ def __init__(
 
         self.gpt: Union[DDP, FSDP, nn.Module]
         self.gpt, self.vae_local, self.quantize_local = gpt, vae_local, vae_local.quantize
-        self.quantize_local: VectorQuantizer2
         self.gpt_opt: AmpOptimizer = gpt_opt
         self.gpt_wo_ddp: Union[Infinity, torch._dynamo.eval_frame.OptimizedModule] = gpt_wo_ddp # after torch.compile
         self.gpt_wo_ddp_ema = gpt_wo_ddp_ema
@@ -208,7 +207,7 @@ def train_step(
             last_scale_area = np.sqrt(scale_schedule[-1].prod())
             for (pt, ph, pw) in scale_schedule[:training_scales]:
                 this_scale_area = np.sqrt(pt * ph * pw)
-                lw.extend([last_scale_area / this_scale_area for _ in range(ph * pw)])
+                lw.extend([last_scale_area / this_scale_area for _ in range(pt * ph * pw)])
             lw = torch.tensor(lw, device=loss.device)[None, ...]
             lw = lw / lw.sum()
         else:
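
For readers checking the second hunk, a small self-contained sketch of the reweighting logic it fixes may help (the scale_schedule values below are illustrative, not taken from any repo config): each scale (pt, ph, pw) contributes pt * ph * pw token positions, so the per-scale weight has to be repeated pt * ph * pw times for the weight vector to line up with the flattened token sequence; repeating it only ph * pw times, as before this commit, under-counts whenever pt > 1.

```python
import numpy as np
import torch

# Toy reconstruction of the loss-reweighting block touched by this commit.
# scale_schedule here is a made-up example; each entry is a (t, h, w) grid of
# token counts, matching the (pt, ph, pw) unpacking in the loop above.
scale_schedule = [(1, 1, 1), (1, 2, 2), (1, 4, 4), (1, 8, 8)]
training_scales = len(scale_schedule)

last_scale_area = np.sqrt(np.prod(scale_schedule[-1]))
lw = []
for (pt, ph, pw) in scale_schedule[:training_scales]:
    this_scale_area = np.sqrt(pt * ph * pw)
    # one weight per token of this scale: smaller (earlier) scales get larger weights
    lw.extend([last_scale_area / this_scale_area for _ in range(pt * ph * pw)])

lw = torch.tensor(lw)[None, ...]  # shape (1, total_tokens), matching the loss layout
lw = lw / lw.sum()                # normalize so the weights sum to 1

# with the fix, the weight vector length matches the token count even when pt > 1
assert lw.shape[-1] == sum(pt * ph * pw for (pt, ph, pw) in scale_schedule[:training_scales])
```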
