Commit 24c7a0e

add
1 parent df172cd commit 24c7a0e

3 files changed: +39 −15 lines changed


scripts/train.sh

Lines changed: 22 additions & 12 deletions
@@ -2,15 +2,19 @@
 
 set -x
 
+# ARNOLD_WORKER_GPU=8
+# ARNOLD_WORKER_NUM=1
+# ARNOLD_ID=0
+
 # set dist args
-# SINGLE=1
+SINGLE=1
 nproc_per_node=${ARNOLD_WORKER_GPU}
 
 if [ ! -z "$SINGLE" ] && [ "$SINGLE" != "0" ]; then
     echo "[single node alone] SINGLE=$SINGLE"
     nnodes=1
     node_rank=0
-    nproc_per_node=1
+    nproc_per_node=8
     master_addr=127.0.0.1
     master_port=12345
 else
@@ -33,9 +37,13 @@ echo "[master_port: ${master_port}]"
 
 # set up envs
 export OMP_NUM_THREADS=8
-export NCCL_IB_DISABLE=0
+export NCCL_IB_DISABLE=1
 export NCCL_IB_GID_INDEX=3
-export NCCL_SOCKET_IFNAME=eth0
+# export NCCL_SOCKET_IFNAME=xgbe0
+
+# export NCCL_DEBUG=info
+# export NCCL_IB_DISABLE=1
+# export NCCL_P2P_DISABLE=1
 
 
 BED=checkpoints
@@ -60,12 +68,14 @@ local_out_path=$LOCAL_OUT/${exp_name}
 rm -rf ${bed_path}
 rm -rf ${local_out_path}
 
-torchrun \
---nproc_per_node=${nproc_per_node} \
---nnodes=${nnodes} \
---node_rank=${node_rank} \
---master_addr=${master_addr} \
---master_port=${master_port} \
+# torchrun \
+--nproc_per_node=${nproc_per_node} \
+--nnodes=${nnodes} \
+--node_rank=${node_rank} \
+--master_addr=${master_addr} \
+--master_port=${master_port} \
+python -m torch.distributed.launch --nproc-per-node=8 \
+--local_ranks_filter 0 \
 train.py \
 --ep=100 \
 --opt=adamw \
@@ -96,14 +106,14 @@ train.py \
 --use_streaming_dataset 1 \
 --iterable_data_buffersize 30000 \
 --Ct5=2048 \
---t5_path=weights/flan-t5-xl \
+--t5_path=google/flan-t5-xl \
 --vae_type 32 \
 --vae_ckpt=weights/infinity_vae_d32_rdn_short.pth \
 --wp 0.00000001 \
 --wpe=1 \
 --dynamic_resolution_across_gpus 1 \
 --enable_dynamic_length_prompt 1 \
---reweight_loss_by_scale 1 \
+--reweight_loss_by_scale 0 \
 --add_lvl_embeding_only_first_block 1 \
 --rope2d_each_sa_layer 1 \
 --rope2d_normalized_by_hw 2 \
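
Two things stand out in the launcher change above: the torchrun invocation is commented out in favour of python -m torch.distributed.launch --nproc-per-node=8, and the old --nproc_per_node=… through --master_port=… continuation lines are left uncommented beneath the commented # torchrun \, so as committed they would be parsed as a stray command; presumably they were meant to be commented out as well. Either launcher spawns one process per GPU and hands each worker its rank through environment variables. The following is an illustrative sketch (not code from this repo; the helper name is made up) of how a spawned worker typically consumes those variables:

# Illustrative sketch, not part of train.py: how a worker started by torchrun or
# (recent) torch.distributed.launch reads the env vars the launcher exports.
import os
import torch
import torch.distributed as dist

def init_distributed():
    rank = int(os.environ.get("RANK", 0))               # global rank of this process
    local_rank = int(os.environ.get("LOCAL_RANK", 0))   # GPU index on this node
    world_size = int(os.environ.get("WORLD_SIZE", 1))   # nnodes * nproc_per_node
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
    return rank, local_rank, world_size

On current PyTorch, torch.distributed.launch is a deprecated wrapper around torchrun, which is presumably why a torchrun-style flag such as --local_ranks_filter (restrict launcher log output to the listed local ranks) is passed here; treat that reading of the flag as an assumption rather than documentation.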

train.py

Lines changed: 9 additions & 2 deletions
@@ -327,6 +327,7 @@ def main_train(args: arg_util.Args):
     # build wandb logger
     if dist.is_master():
         wandb_utils.wandb.init(project=args.project_name, name=args.exp_name, config={})
+
     for ep in range(start_ep, args.ep):
         if ep % ep_lg == 0 or ep == start_ep:
             print(f'[PT info] from ep{start_ep} it{start_it}, acc_str: {acc_str}, diffs: {args.diffs}, =======> bed: {args.bed} <=======\n')
@@ -483,10 +484,15 @@ def train_one_ep(
         with maybe_record_function('before_train'):
             # [get data]
             inp, captions = data
-            tokens = text_tokenizer(text=captions, max_length=text_tokenizer.model_max_length, padding='max_length', truncation=True, return_tensors='pt') # todo: put this into dataset
+            tokens = text_tokenizer(text=captions, max_length=text_tokenizer.model_max_length,
+                                    padding='max_length', truncation=True, return_tensors='pt') # todo: put this into dataset
+            print("gongwb tokens:", tokens)
+
             input_ids = tokens.input_ids.cuda(non_blocking=True)
             mask = tokens.attention_mask.cuda(non_blocking=True)
+
             text_features = text_encoder(input_ids=input_ids, attention_mask=mask)['last_hidden_state'].float()
+            print("gongwb text_features:", text_features)
 
             lens: List[int] = mask.sum(dim=-1).tolist()
             cu_seqlens_k = F.pad(mask.sum(dim=-1).to(dtype=torch.int32).cumsum_(0), (1, 0))
@@ -521,7 +527,8 @@ def train_one_ep(
             step_cnt += int(stepping)
 
             with maybe_record_function('in_training'):
-                grad_norm_t, scale_log2_t = trainer.train_step(
+                #grad_norm_t, scale_log2_t =
+                trainer.train_step(
                     ep=ep, it=it, g_it=g_it, stepping=stepping, clip_decay_ratio=clip_decay_ratio,
                     metric_lg=me,
                     logging_params=stepping and step_cnt == 1 and (ep < 4 or ep in logging_params_milestone),
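
Besides the debug prints, the only functional changes touching this code path are the re-wrapped text_tokenizer call and, in the run script, pointing --t5_path at the Hugging Face Hub id google/flan-t5-xl instead of the local weights/flan-t5-xl directory. For reference, here is a self-contained sketch of the same tokenize-then-encode step, assuming the text encoder is a Flan-T5 encoder loaded via the transformers library (illustrative only; this is not the repo's own model-loading code):

# Sketch under the assumption that --t5_path=google/flan-t5-xl resolves to a T5 encoder.
import torch
from transformers import AutoTokenizer, T5EncoderModel

text_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl")
text_encoder = T5EncoderModel.from_pretrained("google/flan-t5-xl").cuda().eval()

captions = ["a photo of a corgi", "an oil painting of a lighthouse at dusk"]
tokens = text_tokenizer(text=captions, max_length=text_tokenizer.model_max_length,
                        padding='max_length', truncation=True, return_tensors='pt')
input_ids = tokens.input_ids.cuda(non_blocking=True)
mask = tokens.attention_mask.cuda(non_blocking=True)

with torch.no_grad():
    # [batch, max_length, hidden]; padded positions are 0 in the attention mask
    text_features = text_encoder(input_ids=input_ids, attention_mask=mask)['last_hidden_state'].float()

lens = mask.sum(dim=-1).tolist()  # true caption lengths, as computed in train_one_ep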

trainer.py

Lines changed: 8 additions & 1 deletion
@@ -159,6 +159,9 @@ def train_step(
         V = self.vae_local.vocab_size
         device = inp_B3HW.device
 
+        print(f"gongwb B: {B}, T: {T}, V:{V}")
+        print("gongwb inp_B3HW:", inp_B3HW)
+
         h_div_w = inp_B3HW.shape[-2] / inp_B3HW.shape[-1]
         h_div_w_templates = np.array(list(dynamic_resolution_h_w.keys()))
         h_div_w_template = h_div_w_templates[np.argmin(np.abs(h_div_w-h_div_w_templates))]
@@ -184,7 +187,10 @@ def train_step(
         x_BLC_wo_prefix = x_BLC_wo_prefix[:, :(training_seq_len-np.array(scale_schedule[0]).prod()), :]
 
         self.gpt_wo_ddp.forward
-        logits_BLV = self.gpt(text_cond_tuple, x_BLC_wo_prefix, scale_schedule=scale_schedule[:training_scales]) # [bs, 1*1+...+64*64, vocab_size or log2(vocab_size)*2]
+        logits_BLV = self.gpt(text_cond_tuple,
+                              x_BLC_wo_prefix, scale_schedule=scale_schedule[:training_scales]) # [bs, 1*1+...+64*64, vocab_size or log2(vocab_size)*2]
+        print("gongwb self.gpt:", self.gpt)
+        print(f"gongwb logits_BLV:{logits_BLV.shape}")
         self.batch_size, self.seq_len = logits_BLV.shape[:2]
 
         self.seq_len_each = [idx_Bl.shape[1] for idx_Bl in gt_ms_idx_Bl]
@@ -214,6 +220,7 @@ def train_step(
         lw = 1. / self.seq_len
         loss = loss.mul(lw).sum(dim=-1).mean()
 
+        return
         # [backward]
         grad_norm_t, scale_log2_t = self.gpt_opt.backward_clip_step(ep=ep, it=it, g_it=g_it, stepping=stepping, logging_params=logging_params, loss=loss, clip_decay_ratio=clip_decay_ratio, stable=args.stable)
 
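
In trainer.py the commit adds shape/debug prints, re-wraps the self.gpt call, and inserts an early return immediately before the backward/clip step, so train_step now stops after the loss is computed; this matches train_one_ep above no longer unpacking grad_norm_t, scale_log2_t. The reduction it stops at weights each position by lw = 1. / self.seq_len before summing, which for a fixed sequence length is a per-sample mean over positions followed by a batch mean. A toy illustration (made-up tensors, not repo code):

# Toy check of the loss reduction used in train_step, with invented shapes.
import torch

batch, seq_len = 2, 5
loss = torch.rand(batch, seq_len)          # stand-in for per-token loss with reduction='none'
lw = 1. / seq_len
reduced = loss.mul(lw).sum(dim=-1).mean()  # weighted sum over positions, then mean over batch
assert torch.allclose(reduced, loss.mean())  # equals a plain global mean at fixed seq_len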
