 
 import torch
 import torch.distributed as dist
+import time
+import os
+import json
 
 from internlm.core.context import ParallelMode
 from internlm.core.context import global_context as gpc
@@ -202,22 +205,39 @@ def _call_engine(engine, data): # pylint: disable=W0237
     def load_batch(self, engine, data_iter):
         # Pipeline schedule just puts data in memory,
         batch_data, actual_batch_size = engine.load_batch(data_iter, to_gpu=False)
-
+        batch_seqlist = []
         # Even if 'use_flash_attn' is False, the data seen when 'load_batch' is called is still packed,
         # because internlm's current train dataset is packed, even when using dummy data.
         # The unpack operation is performed in load_micro_batch().
         if check_data_is_packed(batch_data):
             micro_num = actual_batch_size
         else:
             micro_num = actual_batch_size // gpc.config.data["micro_bsz"]
-
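+        # Per-micro-batch sequence lengths, recovered from the packed cu_seqlens offsets.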
+        for micro_batch_cu in batch_data[0]['cu_seqlens']:
+            micro_batch_seqlist = [int(micro_batch_cu[j]) - int(micro_batch_cu[j - 1]) for j in range(1, len(micro_batch_cu))]
+            batch_seqlist.append(micro_batch_seqlist)
+
         self.microbatch_offset = 0
         self.batch_size = actual_batch_size
         self.batch_data, self.batch_label = batch_data
         self.bsz_stride = self.batch_size // micro_num
         # 'num_microbatches' is no longer an initialization parameter,
         # but is determined on the fly by the Scheduler.
         self.num_microbatches = micro_num  # Rampup or variable bsz size.
+
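+        # Dump the per-micro-batch sequence lengths for this step, one file per pipeline rank
+        # (only on the first data/tensor parallel rank, and only when profiling is enabled).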
+        if (gpc.config.profile_fwd_bwd and os.environ.get("CUDA_LAUNCH_BLOCKING") == "1"
+                and gpc.get_local_rank(ParallelMode.DATA) == 0 and gpc.get_local_rank(ParallelMode.TENSOR) == 0):
+            output_dir = os.path.join("./micro_record", gpc.config.data.data_name,
+                                      f"B{gpc.config.data.bucket_size}_seq{gpc.config.SEQ_LEN}_mb{gpc.config.data.micro_num}",
+                                      f"S{gpc.batch_count}")
+            os.makedirs(output_dir, exist_ok=True)
+            output_file = os.path.join(output_dir, f"PP_rank_{gpc.get_local_rank(ParallelMode.PIPELINE)}_seq.json")
+
+            # One JSON list of sequence lengths per line, one line per micro batch.
+            with open(output_file, "w") as f:
+                for micro_batch_seqlist in batch_seqlist:
+                    json.dump(micro_batch_seqlist, f)
+                    f.write("\n")
 
     def load_micro_batch(self):
         micro_batch_data, micro_batch_label = self._load_micro_batch(
@@ -592,8 +612,12 @@ def _forward_backward_step(self, engine, return_loss=True, return_output_label=T
         input_obj = None
 
         # Run 1F1B in steady state.
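+        # Wall-clock per-micro-step timings; meaningful because CUDA_LAUNCH_BLOCKING=1 makes kernel launches synchronous.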
+        fwd_times = []
+        bwd_times = []
+
         for i in range(num_1f1b_micropairs):
             # Perform forward computation
+            start_time = time.time()
             output_obj, moe_loss = self._forward_step(
                 engine,
                 input_obj,
@@ -602,6 +626,7 @@ def _forward_backward_step(self, engine, return_loss=True, return_output_label=T
                 accum_loss=accum_loss,
                 accum_moe_loss=accum_moe_loss,
             )
+            fwd_times.append(time.time() - start_time)
 
             if gpc.is_last_rank(ParallelMode.PIPELINE):
                 output_obj_grad = None
@@ -625,7 +650,9 @@ def _forward_backward_step(self, engine, return_loss=True, return_output_label=T
             output_obj = output_objs.pop(0)
             moe_loss = moe_losses.pop(0)
 
+            start_bwd_time = time.time()
             input_obj_grad = self._backward_step(engine, i, input_obj, output_obj, output_obj_grad, moe_loss)
+            bwd_times.append(time.time() - start_bwd_time)
 
             if i == (num_1f1b_micropairs - 1):
                 input_obj = None
@@ -644,6 +671,44 @@ def _forward_backward_step(self, engine, return_loss=True, return_output_label=T
                     dtype=self.dtype,
                     scatter_gather_tensors=self.scatter_gather_tensors,
                 )
+        if (gpc.config.profile_fwd_bwd and os.environ.get("CUDA_LAUNCH_BLOCKING") == "1"
+                and gpc.get_local_rank(ParallelMode.DATA) == 0 and gpc.get_local_rank(ParallelMode.TENSOR) == 0):
+            output_dir = os.path.join("./micro_record", gpc.config.data.data_name,
+                                      f"B{gpc.config.data.bucket_size}_seq{gpc.config.SEQ_LEN}_mb{gpc.config.data.micro_num}",
+                                      f"S{gpc.batch_count}")
+            os.makedirs(output_dir, exist_ok=True)
+            output_file = os.path.join(output_dir, f"PP_rank_{gpc.get_local_rank(ParallelMode.PIPELINE)}.json")
+            gpc.batch_count += 1
+
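+            # 1. Default timing history if no previous record exists for this rank.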
+            history = {
+                "fwd_times": [],
+                "bwd_times": [],
+            }
+
+            # 2. If the file already exists, load the previous records.
+            if os.path.exists(output_file):
+                with open(output_file, "r") as f:
+                    try:
+                        history = json.load(f)
+                    except json.JSONDecodeError:
+                        pass  # Skip if the file is empty or corrupted.
+
+            # 3. Append the new measurements.
+            history["fwd_times"].extend(fwd_times)
+            history["bwd_times"].extend(bwd_times)
+
+            from collections import OrderedDict
+
+            data = OrderedDict()
+            # 4. Update the running averages and the forward:backward ratio (normalized to forward).
+            data["avg_fwd"] = sum(history["fwd_times"]) / len(history["fwd_times"])
+            data["avg_bwd"] = sum(history["bwd_times"]) / len(history["bwd_times"])
+            f_f = round(data["avg_fwd"] / data["avg_fwd"], 3)
+            b_f = round(data["avg_bwd"] / data["avg_fwd"], 3)
+            data["f_b_w"] = (f_f, b_f)
+            data["fwd_times"] = history["fwd_times"]
+            data["bwd_times"] = history["bwd_times"]
+
+            # 5. Write the updated record back to the file.
+            with open(output_file, "w") as f:
+                json.dump(data, f, indent=4)
+
 
         # Run cooldown backward passes.
         for i in range(num_warmup_microsteps):
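
To check the files this change produces, here is a minimal read-back sketch (not part of the commit). The directory layout and JSON keys are taken from the code above; the dataset name, bucket/sequence/micro-batch settings, the step directory S0, and the rank are hypothetical, and it assumes the timing file and the *_seq.json file for a step land in the same S<step> directory.

import json
import os

record_dir = "./micro_record/my_dataset/B16_seq4096_mb4/S0"  # hypothetical run settings
rank = 0  # pipeline-parallel rank to inspect

# Timing record written by _forward_backward_step().
with open(os.path.join(record_dir, f"PP_rank_{rank}.json")) as f:
    record = json.load(f)
print("avg fwd (s):", record["avg_fwd"])
print("avg bwd (s):", record["avg_bwd"])
print("fwd:bwd ratio:", record["f_b_w"])

# Sequence-length record written by load_batch(): JSON Lines, one list per micro batch.
with open(os.path.join(record_dir, f"PP_rank_{rank}_seq.json")) as f:
    seq_lists = [json.loads(line) for line in f if line.strip()]
for i, seqs in enumerate(seq_lists):
    print(f"micro batch {i}: {len(seqs)} sequences, {sum(seqs)} tokens")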