Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
178 commits
Select commit Hold shift + click to select a range
b2e6ae0
Add muP fields, auto-update model cfg
daviswer Jul 17, 2024
4a02c82
Add mup scaling to fsdp init params
daviswer Jul 18, 2024
22c54a6
Only set mup cfg if >0
daviswer Jul 18, 2024
af52614
1d init mup
daviswer Jul 19, 2024
57ed6f9
Attempt mup lrs
daviswer Jul 19, 2024
372e1d2
cleanup, typofix
daviswer Jul 19, 2024
c0d1d1f
diag print
daviswer Jul 19, 2024
2017a98
Non double list comp
daviswer Jul 19, 2024
9a77a2b
diag print
daviswer Jul 19, 2024
6c01a0b
Stop named params
daviswer Jul 19, 2024
101652b
List sum
daviswer Jul 19, 2024
49341e1
diag print
daviswer Jul 19, 2024
58c1662
diag print
daviswer Jul 19, 2024
a14f57e
diag print
daviswer Jul 19, 2024
5c8d8c4
diag print
daviswer Jul 19, 2024
d0e4888
diag print
daviswer Jul 19, 2024
e9701a1
Iterate over submodules explicitly
daviswer Jul 19, 2024
0c46c3a
linear submods only
daviswer Jul 19, 2024
58ce680
diag print
daviswer Jul 19, 2024
39c5832
diag print
daviswer Jul 19, 2024
476dca5
Use orig params
daviswer Jul 19, 2024
a11abf7
Remove default lr arg
daviswer Jul 19, 2024
f2c5590
Enlist param groups
daviswer Jul 19, 2024
63a834a
divide by mup scales
daviswer Jul 19, 2024
5887896
Remove tele configs
daviswer Jul 22, 2024
4dd3998
Don't change Llama2 small configs
daviswer Jul 22, 2024
1491706
linting
daviswer Jul 22, 2024
34424a5
Merge branch 'main' into mup-beta
daviswer Oct 9, 2024
37f0fa3
mup param consolidation
daviswer Oct 9, 2024
2b2927a
Policy arg rectify
daviswer Oct 9, 2024
cc146f8
LR reporting correction
daviswer Oct 9, 2024
1eb7e47
LR checks
daviswer Oct 9, 2024
df14343
Checks passed
daviswer Oct 9, 2024
135e911
Enable unfused glu
daviswer Oct 9, 2024
db1d4f2
Fix init fn
daviswer Oct 9, 2024
1226260
orig params off
daviswer Oct 9, 2024
e538862
Set lr div in both places
daviswer Oct 9, 2024
c8fe19d
Re on mup lr
daviswer Oct 9, 2024
c44db52
Orig param back on
daviswer Oct 9, 2024
42827a2
Remove prints, implement search
daviswer Oct 9, 2024
59f5e0b
Add mup model cfgs
daviswer Oct 9, 2024
6e803b8
Fix syntax and report spacing
daviswer Oct 9, 2024
733fb91
Fix val reporting
daviswer Oct 9, 2024
f184b3f
Fix reporting again
daviswer Oct 9, 2024
a34204c
Fix earlystop check
daviswer Oct 9, 2024
d49c0c6
Fix early stop check
daviswer Oct 10, 2024
915a6ea
Fix early stop check
daviswer Oct 10, 2024
e7d392b
Flip up/down order
daviswer Oct 10, 2024
cf95c16
Remove various warnings and prints
daviswer Oct 10, 2024
7ea2ade
More suppression
daviswer Oct 10, 2024
cacde5f
Remove ckpdataset since we're not ckping anyways
daviswer Oct 10, 2024
76fc955
Reset dynamo cache between runs
daviswer Oct 10, 2024
c18c878
Clear cache between runs, reorder params
daviswer Oct 10, 2024
9473afd
skew based attn/ffn init
daviswer Oct 11, 2024
4642aac
shorten skew name
daviswer Oct 11, 2024
1a00cd5
Nelder Mead
daviswer Oct 14, 2024
ee9ca22
Nelder Mead pt2
daviswer Oct 14, 2024
6323df1
Nelder Mead pt3
daviswer Oct 14, 2024
896c37b
Nelder Mead pt4
daviswer Oct 14, 2024
471142b
Nelder Mead pt5
daviswer Oct 14, 2024
eba5453
Nelder Mead pt6
daviswer Oct 14, 2024
d2498dd
Nelder Mead pt7
daviswer Oct 14, 2024
e376f88
Nelder Mead pt8
daviswer Oct 14, 2024
488d956
Nelder Mead pt9
daviswer Oct 14, 2024
ca8350d
Final reporting fix, memory clear
daviswer Oct 14, 2024
b6fe56a
Reported sorted simplex
daviswer Oct 14, 2024
c081f99
Delta reporting
daviswer Oct 14, 2024
c09a163
Report correct prior for inside contraction
daviswer Oct 14, 2024
d6290cd
Cleanup reporting, memory
daviswer Oct 14, 2024
2de6aff
Mem cleanup attempts
daviswer Oct 14, 2024
64a599b
Snapshotting after 1
daviswer Oct 14, 2024
62ca17d
Snapshotting after 1 pt2
daviswer Oct 14, 2024
b02e9cb
Create/destroy process group between runs
daviswer Oct 14, 2024
286bc93
diag print
daviswer Oct 14, 2024
0b1fd52
Slow it down, give time for construction
daviswer Oct 14, 2024
41645ba
Forget process grouping, just explicit gc
daviswer Oct 14, 2024
e25c5bf
Forget process grouping, just explicit gc pt2
daviswer Oct 14, 2024
d4ba5f2
Forget deletion, try skipping stuff
daviswer Oct 14, 2024
1ade500
Decrease train steps, vanilla otherwise
daviswer Oct 14, 2024
5cc41b3
Early return single step
daviswer Oct 14, 2024
8f2280d
Add reporting
daviswer Oct 14, 2024
f4961ac
After forward
daviswer Oct 14, 2024
fefe5a6
a whole lotta single steps (make a journey)
daviswer Oct 14, 2024
72345dd
Single stepping, full
daviswer Oct 14, 2024
74142e3
Delete stuff at source
daviswer Oct 14, 2024
f006cfa
Orig params off
daviswer Oct 14, 2024
45483c2
Orig params off pt2
daviswer Oct 14, 2024
49d39d9
Orig params off pt3
daviswer Oct 14, 2024
b11f66a
Restore proper param grouping
daviswer Oct 14, 2024
46b0921
Remove rope init call
daviswer Oct 14, 2024
ab2303c
Revert rope off
daviswer Oct 14, 2024
6012e8b
Move to cpu before del
daviswer Oct 15, 2024
b3141aa
Does destroy pg even help
daviswer Oct 15, 2024
961ac0e
create/destroy each pg, with verbosity
daviswer Oct 15, 2024
5cf312d
Add a delay
daviswer Oct 15, 2024
4e96f93
Barrier
daviswer Oct 15, 2024
8963e4e
Add a delay
daviswer Oct 15, 2024
519990f
Just play with pg stuff for now
daviswer Oct 15, 2024
4f45c71
longer wait, quit flooding
daviswer Oct 15, 2024
2b3e58e
Shorter wait, diag print
daviswer Oct 15, 2024
d329779
Staggered calls, collective tests
daviswer Oct 15, 2024
d67f824
new group, keep global
daviswer Oct 15, 2024
03a4e44
Fix inside contraction test
daviswer Oct 15, 2024
fdd4d44
Reverse init for test
daviswer Oct 15, 2024
a5ab9cc
Reduced report only at end
daviswer Oct 15, 2024
af85d06
Centering offset
daviswer Oct 15, 2024
00e8825
Fix logging/reporting
daviswer Oct 15, 2024
0453796
Fix initial simplexing and reporting
daviswer Oct 15, 2024
0fab0a9
Wider search radius
daviswer Oct 15, 2024
90a3cc2
Diag print
daviswer Oct 15, 2024
fe99e04
Diag print 2
daviswer Oct 15, 2024
36db119
pull initial vals from llama cfg
daviswer Oct 15, 2024
edd5108
pull initial vals from llama cfg
daviswer Oct 15, 2024
d2ad704
pull initial vals from llama cfg
daviswer Oct 15, 2024
6fbcd49
diag print off
daviswer Oct 15, 2024
a8b81ca
Flip simplex test
daviswer Oct 16, 2024
27ce2e5
Adaptive Nelder-Mead
daviswer Oct 16, 2024
c0f3434
More verbosity, sign flip to orig
daviswer Oct 16, 2024
cd3c714
Set up hyperparam sweep for mup impl checking
daviswer Oct 17, 2024
0bfb970
Resweep
daviswer Oct 17, 2024
ad2697e
Regular simplex hedron
daviswer Oct 17, 2024
ec819e0
2d cat (meow)
daviswer Oct 17, 2024
dae9aa8
Fix early reporting
daviswer Oct 17, 2024
1022070
Flip initial simplex
daviswer Oct 17, 2024
9790891
Flip back
daviswer Oct 17, 2024
d6711f1
nan handling
daviswer Oct 17, 2024
5b644fc
Flip back back
daviswer Oct 17, 2024
7618d63
Flip back back back (now you're just lazy)
daviswer Oct 17, 2024
a586790
Early early stop
daviswer Oct 17, 2024
1fb181d
diag print
daviswer Oct 18, 2024
0329ddd
Basic test
daviswer Oct 18, 2024
a4d3e64
Basic test
daviswer Oct 18, 2024
99b1d80
Tweaking set mups
daviswer Oct 18, 2024
27716da
Tweaking set mups
daviswer Oct 18, 2024
f6388c8
Tweaking set mups
daviswer Oct 18, 2024
07d9e8e
Tweaking set mups
daviswer Oct 18, 2024
1b66f56
Tweaking set mups
daviswer Oct 18, 2024
d28163c
Tweaking set mups
daviswer Oct 18, 2024
b0bc6bf
Diag print off, fix sweep also
daviswer Oct 18, 2024
6943b21
Diag print off, fix sweep also
daviswer Oct 18, 2024
d210661
Quit temp test
daviswer Oct 18, 2024
7bb86f9
Simplex flip
daviswer Oct 18, 2024
e70e05c
Read final values from cfg (llama cfg has no lr)
daviswer Oct 21, 2024
4b2c731
SINE FLEP!!!
daviswer Oct 21, 2024
89b7343
Prep for 70b run
daviswer Oct 26, 2024
a6578a9
Merge branch 'mup-search' of github.com:daviswer/fms-fsdp into mup-se…
daviswer Oct 26, 2024
d0c688a
Random sign flip for init simplex
daviswer Oct 27, 2024
9c7209b
Custom init
daviswer Oct 27, 2024
d7181be
Reset compile cache each run
daviswer Oct 28, 2024
5822a33
No flips, just rely on diff seeds
daviswer Oct 28, 2024
8f7dc8a
upper lim 128k on doc len for parquet
daviswer Oct 29, 2024
1387529
Multiple hardcoded col fields
daviswer Oct 29, 2024
671d970
Patch 1 (#8)
lchu6 Oct 30, 2024
b338f98
Blacking and file size splitting (rather than assume equal)
daviswer Oct 30, 2024
927ac1b
Fix parquet edge cases by reading cols/rows from metadata
daviswer Nov 1, 2024
ebbd77e
Verbose detect countfile
daviswer Nov 5, 2024
336ceeb
Verbose no detect countfile
daviswer Nov 5, 2024
08251b5
Fix quotes
daviswer Nov 5, 2024
e93f883
Print all ranks
daviswer Nov 5, 2024
10336dd
Mega verbose
daviswer Nov 5, 2024
117af56
Attempt to read filesizes from metadata
daviswer Nov 5, 2024
740005f
Full shard path metadata lookup
daviswer Nov 5, 2024
b1cc8e2
Partial path lookup
daviswer Nov 5, 2024
21f7ccd
Fix metadata path reading w/o prefix
daviswer Nov 5, 2024
a2a1df7
Dif prints, fix empty prefix error for doc count reader too
daviswer Nov 5, 2024
62b631c
Cumulative stat print, remove diag prints
daviswer Nov 6, 2024
d6c7817
Fix string formatting
daviswer Nov 6, 2024
2d8170f
Further fix formatting, print once
daviswer Nov 6, 2024
64b75ef
Add checkpointing
daviswer Nov 8, 2024
e989197
Temp disable countfilesizes for testing
daviswer Nov 8, 2024
df4715a
Fix off by one reload error, renable countfilesize
daviswer Nov 8, 2024
742e509
Fix off by one reload error, renable countfilesize
daviswer Nov 8, 2024
170555e
Simple train script
daviswer Nov 15, 2024
49b4ff9
Diag print
daviswer Nov 15, 2024
660de0a
Verbose loop
daviswer Nov 15, 2024
73f91fa
Use vals as vals, not multipliers
daviswer Nov 15, 2024
012c0ed
Use vals as vals, not multipliers
daviswer Nov 15, 2024
0e53783
singleton drop token handling
daviswer Nov 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions fms_fsdp/config/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,15 @@ class train_config:
# compile
use_torch_compile: bool = True

# muP scale params
mup_emb_scale: float = 0
mup_head_scale: float = 0
mup_a_f_skew: float = 0
mup_attn_temp: float = 0
mup_lr_dscale: float = 0
mup_explore_range: float = 5.0
mup_search_steps: int = 10

# speculator training
tp_size: int = 8
model_arch: str = "embedllama"
Expand Down
25 changes: 16 additions & 9 deletions fms_fsdp/policies/param_init.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,25 @@
import torch
from fms.modules.attention import MultiHeadAttention
from fms.modules.attention import QKV, MultiHeadAttention
from fms.modules.embedding import WordEmbedding
from fms.modules.feedforward import GatedLinearUnit
from fms.modules.layernorm import LayerNormParameterized


# for details, read https://github.com/foundation-model-stack/fms-fsdp/issues/64
# for details, read https://github.com/foundation-model-stack/fms-fsdp/issues/64
def param_init_function(module, cfg):
    """Materialize and reset a (possibly meta-device) module with muP scaling.

    Args:
        module: candidate submodule from the FSDP param-init sweep. Only the
            module types listed in ``scales`` below are reset; anything else
            is left untouched.
        cfg: job/training config carrying the muP hyperparameters
            (``mup_a_f_skew``, ``mup_emb_scale``, ``mup_head_scale``).
    """
    # Per-type scale passed to reset_parameters. Order matters: the first
    # matching type wins, mirroring the original list.index(True) dispatch.
    scales = {
        MultiHeadAttention: cfg.mup_a_f_skew**0.5,
        QKV: cfg.mup_a_f_skew**0.5,
        GatedLinearUnit: cfg.mup_a_f_skew**-0.5,
        WordEmbedding: (cfg.mup_emb_scale, cfg.mup_head_scale),
        LayerNormParameterized: 1,
    }
    for module_type, scale in scales.items():
        if isinstance(module, module_type):
            # Allocate real storage on the current GPU before initializing.
            module.to_empty(device=torch.cuda.current_device())
            with torch.no_grad():
                module.reset_parameters(scale=scale)
            break
76 changes: 74 additions & 2 deletions fms_fsdp/utils/config_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,6 @@ def get_model_config(model_variant):
emb_dim=2048,
nheads=16,
nlayers=24,
hidden_grow_factor=3,
kvheads=4,
)
elif model_variant == "llama3_8b":
llama_config = LLaMAConfig(
Expand Down Expand Up @@ -128,6 +126,39 @@ def get_model_config(model_variant):
max_expected_seq_len=4096,
rope_theta=500000.0,
)
elif model_variant == "llama3_3.2b_4k_mup_tiny":
llama_config = LLaMAConfig(
src_vocab_size=128256,
emb_dim=384,
nheads=3,
kvheads=1,
nlayers=24,
hidden_grow_factor=8 / 3,
max_expected_seq_len=4096,
rope_theta=500000.0,
)
elif model_variant == "llama3_3.2b_4k_mup_small":
llama_config = LLaMAConfig(
src_vocab_size=128256,
emb_dim=768,
nheads=6,
kvheads=2,
nlayers=24,
hidden_grow_factor=8 / 3,
max_expected_seq_len=4096,
rope_theta=500000.0,
)
elif model_variant == "llama3_3.2b_4k_mup_medium":
llama_config = LLaMAConfig(
src_vocab_size=128256,
emb_dim=1536,
nheads=12,
kvheads=4,
nlayers=24,
hidden_grow_factor=8 / 3,
max_expected_seq_len=4096,
rope_theta=500000.0,
)
elif model_variant == "llama3_70b":
llama_config = LLaMAConfig(
src_vocab_size=128256,
Expand All @@ -150,6 +181,39 @@ def get_model_config(model_variant):
max_expected_seq_len=4096,
rope_theta=500000.0,
)
elif model_variant == "llama3_70b_4k_medium":
llama_config = LLaMAConfig(
src_vocab_size=128256,
emb_dim=3172,
nheads=24,
kvheads=3,
nlayers=80,
hidden_grow_factor=3.5,
max_expected_seq_len=4096,
rope_theta=500000.0,
)
elif model_variant == "llama3_70b_4k_small":
llama_config = LLaMAConfig(
src_vocab_size=128256,
emb_dim=2048,
nheads=16,
kvheads=2,
nlayers=80,
hidden_grow_factor=3.5,
max_expected_seq_len=4096,
rope_theta=500000.0,
)
elif model_variant == "llama3_70b_4k_tiny":
llama_config = LLaMAConfig(
src_vocab_size=128256,
emb_dim=1024,
nheads=8,
kvheads=1,
nlayers=80,
hidden_grow_factor=3.5,
max_expected_seq_len=4096,
rope_theta=500000.0,
)
elif model_variant == "llama3_194m_4k":
llama_config = LLaMAConfig(
src_vocab_size=128256,
Expand All @@ -163,3 +227,11 @@ def get_model_config(model_variant):
raise ValueError(f"model variant {model_variant} not supported.")

return llama_config


def set_mup_from_cfg(job_cfg, model_cfg):
    """Copy positive muP hyperparameters from the job config to the model config.

    Only attributes whose name contains "mup" and whose value is strictly
    positive are considered, and only those the model config already defines
    are overwritten. Returns the (mutated) model config.
    """
    for name, value in vars(job_cfg).items():
        if "mup" not in name or not value > 0:
            continue
        if hasattr(model_cfg, name):
            setattr(model_cfg, name, value)
    return model_cfg
16 changes: 8 additions & 8 deletions fms_fsdp/utils/dataloader_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def causal_lm(data_seq, prompt_len=1):
Perform causal language modeling by right-shifting the input sequence.
Sets first prompt_len tokens to be ignored by the loss.
"""
data_seq = torch.tensor(data_seq, dtype=torch.int)
data_seq = data_seq.int()
t = data_seq.clone()[1:]
data_seq = data_seq[:-1]
t[:prompt_len] = -100
Expand Down Expand Up @@ -132,13 +132,13 @@ def get_data_loader(cfg, rank, world_size, postprocess=[causal_lm]):
data = PreprocessDataset(data, p)

# Enable auto-saving
data = CheckpointDataset(
data,
cfg.ckpt_load_path if cfg.resuming_dataset else cfg.ckpt_save_path,
cfg.checkpoint_interval,
cfg.batch_size,
cfg.ckpt_save_path,
)
# data = CheckpointDataset(
# data,
# cfg.ckpt_load_path if cfg.resuming_dataset else cfg.ckpt_save_path,
# cfg.checkpoint_interval,
# cfg.batch_size,
# cfg.ckpt_save_path,
# )
return torch.utils.data.DataLoader(
data, num_workers=cfg.num_workers, batch_size=cfg.batch_size
)
Expand Down
122 changes: 70 additions & 52 deletions fms_fsdp/utils/dataset_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,11 +357,10 @@ def length(self, path: str):

def get(self, reader: pa.RecordBatchFileReader, index: int, drop_tokens: Set):
    """Return document `index` from an open arrow reader, with boundary
    drop-tokens (e.g. bos/eos) stripped.

    Length is rechecked before the trailing strip so that a document reduced
    to a single token by the leading strip is handled safely.
    """
    tokens = reader.get_batch(index)[self.col_name]
    if len(tokens) > 0 and tokens[0].as_py() in drop_tokens:
        tokens = tokens.slice(1, len(tokens) - 1)
    if len(tokens) > 0 and tokens[-1].as_py() in drop_tokens:
        tokens = tokens.slice(0, len(tokens) - 1)
    return tokens

def slice(self, doc: pa.UInt32Array, index: int, n_pull: int) -> List:
def is_legal(self, filepath: str):
    """A file is a legal shard iff "parquet" appears in its extension."""
    _, extension = os.path.splitext(filepath)
    return "parquet" in extension

def open(self, path: str):
    """Open a parquet shard and return its single recognized text column.

    The column is discovered by intersecting the file schema (read from
    metadata only) with the known text-field names; exactly one match
    must exist.
    """
    schema_names = pq.read_metadata(path).schema.names
    candidates = ["text", "content", "contents"]
    overlap = set(candidates).intersection(set(schema_names))
    assert (
        len(overlap) == 1
    ), f"{len(overlap)} shared column names detected, need 1 ({overlap})"
    colname = overlap.pop()
    return pq.read_pandas(path, columns=[colname], partitioning=None)[colname]

def length(self, path: str):
    """Document (row) count of the parquet file, taken from metadata only."""
    metadata = pq.read_metadata(path)
    return metadata.num_rows

def get(self, reader, index: int, drop_tokens: Set):
    """Tokenize row `index` of the opened column, stripping boundary drop-tokens.

    The raw text is truncated to 128k characters before tokenization to bound
    tokenizer cost on pathological documents. Length is rechecked before the
    trailing strip to cover the edge case where the document is a single
    drop-token (e.g. doc == [eos]).
    """
    ids = self.tokenizer(str(reader[index])[:128_000])["input_ids"]
    if len(ids) > 0 and ids[0] in drop_tokens:
        ids = ids[1:]
    if len(ids) > 0 and ids[-1] in drop_tokens:
        ids = ids[:-1]
    return ids

def slice(self, doc: List, index: int, n_pull: int) -> List:
Expand Down Expand Up @@ -872,73 +877,86 @@ def setup(self):
if self.filehandler.is_legal(os.path.join(root, name))
]
shards.sort() # Ensure consistent sharding across machines
start_frag = (self.rank * self.worldsize * len(shards)) // self.worldsize
end_frag = (
(self.rank + 1) * self.worldsize * len(shards)
) // self.worldsize
shardfrags = [
(shards[i // self.worldsize], i % self.worldsize)
for i in range(start_frag, end_frag)
]

# Assemble length of each owned shard file

# Find metadata file
countfiles = []
if os.path.exists(os.path.join(pardir, "meta")):
countfiles = [
x
for x in os.listdir(os.path.join(pardir, "meta"))
if "counts" in x and "csv" in x
]
doc_counts = {}
if len(countfiles) > 0:
# Count file exists, use it
countpath = os.path.join(pardir, "meta", countfiles[0])
else:
countpath = ""

# Use shard file sizes to perform partitioning
# Create shardlist of form shardid -> [start%, end%]
if len(countfiles) > 0:
sizes = {}
with open(countpath, "r") as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
fullpath = row["dataset/filename"]
prefix = fullpath.find(dataset + "/")
if prefix >= 0:
key = fullpath[prefix + len(dataset) + 1 :]
sizes[key] = int(row["size"])
shard_sizes = [sizes[shard] for shard in shards]
else:
shard_sizes = [
os.path.getsize(os.path.join(datapath, shard)) for shard in shards
]
shard_sizes = [s / sum(shard_sizes) for s in shard_sizes]
start = self.rank / self.worldsize
end = (self.rank + 1) / self.worldsize
shardset = {}
tally = 0
for i in range(len(shards)):
if tally <= end and tally + shard_sizes[i] >= start:
shardset[shards[i]] = [
min(max((start - tally) / shard_sizes[i], 0), 1),
min(max((end - tally) / shard_sizes[i], 0), 1),
]
tally += shard_sizes[i]

# Assemble length of each owned shard file
doc_counts = {}
if len(countfiles) > 0:
# Count file exists, use it
with open(countpath, "r") as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
fullpath = row["dataset/filename"]
prefix = fullpath.find("/" + dataset) + 1
if prefix > 0:
prefix = fullpath.find(dataset + "/")
if prefix >= 0:
key = fullpath[prefix + len(dataset) + 1 :]
doc_counts[key] = int(row["documents"])
else:
# Count file does not exist, touch every owned file for length
unique_shardfiles = set(shard for shard, frag in shardfrags)
# unique_shardfiles = set(shard for shard, frag in shardfrags)
doc_counts = {
shard: self.filehandler.length(os.path.join(datapath, shard))
for shard in unique_shardfiles
for shard in shardset
}

# Read shardfrags, assemble doc list for each file shard (aggregating over fragments):
ndocs = -1
docset = {} # shardid -> (min docid, max docid)
for i, (shard, frag) in enumerate(shardfrags):
ndocs = doc_counts[shard]
doc_start = (ndocs * frag) // self.worldsize
doc_end = (
ndocs * frag + ndocs
) // self.worldsize - 1 # Inclusive upper bound
if shard not in docset:
docset[shard] = [doc_start, doc_end]
min_d, max_d = docset[shard]
if doc_start < min_d:
docset[shard][0] = doc_start
if doc_end > max_d:
docset[shard][1] = doc_end

# Add shard entries to self.docset
# Assemble doc list for each file shard
# Create docset of form [shardid, min docid, max docid]
doccount = 0
for shardid in docset:
min_d = docset[shardid][0]
max_d = docset[shardid][1]
self.docset.append((shardid, min_d, max_d))
doccount += max_d - min_d + 1
for shard in shardset:
ndocs = doc_counts[shard]
doc_start = round(ndocs * shardset[shard][0])
doc_end = round(ndocs * shardset[shard][1]) - 1 # inclusive upper bound
if doc_end >= doc_start:
self.docset.append([shard, doc_start, doc_end])
doccount += doc_end - doc_start + 1
self._len = doccount

if self.verbose:
logging.info(
f" Worker {self.rank} ingested {len(shardfrags)} shard fragments from {dataset}"
f" Worker {self.rank} ingested {len(self.docset)} shards from {dataset}"
)

# Shuffle shard files - guaranteed inconsistent across workers
Expand Down
Loading