
Commit 2d95e4a
Author: Felipe Mello
first commit
1 parent 801a454 commit 2d95e4a

File tree: 11 files changed, +838 −0 lines changed

apps/sft/eval_utils.py

Lines changed: 6 additions & 0 deletions

@@ -1,3 +1,9 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
 """Utility functions for evaluation to make main.py more concise."""

 import logging
brainstorming/configs/base.py

Lines changed: 41 additions & 0 deletions

@@ -0,0 +1,41 @@

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
Base config class for all approaches.

In a real project, this would live in forge/config/base.py and be imported as:
    from forge.config.base import FullFinetuneConfig
"""

from dataclasses import dataclass
from typing import Any


@dataclass
class FullFinetuneConfig:
    """
    Base config for all approaches.

    Uses `Any` type hints for components that differ across approaches:
    - Plain dicts: str/dict
    - Fiddle: fdl.Config/fdl.Partial
    - Partial: functools.partial
    - Hydra: Spec dataclasses
    - Dataclasses: Inner Config classes
    """

    output_dir: str

    # Components (type varies by approach)
    tokenizer: Any
    model: Any
    optimizer: Any
    data_args: Any

    # Plain hyperparameters (same across all approaches)
    epochs: int = 1
    gradient_accumulation_steps: int = 8
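
To make the docstring concrete: a minimal usage sketch, not part of the commit, showing how the plain-dict approach later in this commit might fill the `Any` fields. The relative import of `FullFinetuneConfig` and the placeholder strings for the mock components are assumptions made purely for illustration.

```python
# Hypothetical usage sketch (not in the commit): filling FullFinetuneConfig
# with the {"cls": ..., "kwargs": ...} dict convention used later in this commit.
import torch

from base import FullFinetuneConfig  # i.e. brainstorming/configs/base.py above

cfg = FullFinetuneConfig(
    output_dir="/tmp/torchtune/llama3_2_1B/full",
    # "mock.llama3_tokenizer" is a placeholder string standing in for the real callable.
    tokenizer={"cls": "mock.llama3_tokenizer",
               "kwargs": {"path": "/tmp/Llama-3.2-1B-Instruct/original/tokenizer.model"}},
    model={"cls": "mock.llama3_2_1b", "kwargs": {}},
    optimizer={"cls": torch.optim.AdamW, "kwargs": {"lr": 2e-5}},
    data_args={"batch_size": 4, "shuffle": True},
)

print(cfg.epochs, cfg.gradient_accumulation_steps)  # defaults: 1 8
```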
Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@

# baseline.yaml shows 5 key config patterns

output_dir: /tmp/torchtune/llama3_2_1B/full

# PATTERN 1: Simple Component Instantiation
tokenizer:
  _target_: mock.llama3_tokenizer
  path: /tmp/Llama-3.2-1B-Instruct/original/tokenizer.model

# PATTERN 2: Component with Nested Instantiation
model:
  _target_: mock.llama3_2_1b
  # Nested component: attention config
  attn_config:
    _target_: mock.MultiHeadAttention
    num_heads: 32

# PATTERN 3: Component Needing Runtime Args (Partial)
optimizer:
  _target_: torch.optim.AdamW
  lr: 2e-5
  _partial_: true
  # params: None  # will be passed at instantiation time (not known now)

# PATTERN 4: Non-Instantiated Config Block (Plain Data)
data_args:
  batch_size: 4
  shuffle: True

# PATTERN 5: Plain Top-Level Hyperparameters
# Training params
epochs: 1
gradient_accumulation_steps: 8
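
The `_target_`/`_partial_` keys follow the Hydra `instantiate` convention. The consuming code is not part of this excerpt; below is a minimal sketch, assuming Hydra/OmegaConf is the intended backend and that the file above is saved as `baseline.yaml`, of how each pattern would be resolved.

```python
# Hypothetical consumer sketch (not in the commit), assuming Hydra's instantiate API
# is what resolves the _target_ / _partial_ keys in baseline.yaml.
from hydra.utils import instantiate
from omegaconf import OmegaConf

cfg = OmegaConf.load("baseline.yaml")

tokenizer = instantiate(cfg.tokenizer)         # PATTERN 1: builds mock.llama3_tokenizer(path=...)
model = instantiate(cfg.model)                 # PATTERN 2: nested attn_config is built recursively
optim_partial = instantiate(cfg.optimizer)     # PATTERN 3: _partial_: true -> functools.partial(AdamW, lr=2e-5)
optimizer = optim_partial(model.parameters())  # runtime args supplied now

data_args = cfg.data_args                      # PATTERN 4: plain data, no _target_, stays as config
epochs = cfg.epochs                            # PATTERN 5: plain hyperparameter
```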
Lines changed: 131 additions & 0 deletions

@@ -0,0 +1,131 @@

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
Dataclass config with inner Config classes.

Pros:
- Type safety for instantiated configs

Cons:
- Requires modifying target classes (not feasible for external libraries; need to use a wrapper)
- Boilerplate (every class needs Config + __init__)

e.g.:

```
class TokenizerWithConfig:
    @dataclass
    class Config:
        path: str

        def build(self) -> "TokenizerWithConfig":
            return TokenizerWithConfig(self)

    def __init__(self, config: Config):
        self.config = config
        self.path = config.path
```
"""

from dataclasses import dataclass

import torch

from mock_with_config import (
    ComponentConfig,
    LlamaModelWithConfig,
    MultiHeadAttentionWithConfig,
    TokenizerWithConfig,
)


@dataclass
class DataArgs:
    """Plain dataclass for non-instantiated config block (PATTERN 4)."""

    batch_size: int = 4
    shuffle: bool = True


def llama3_2_1b_full():
    output_dir = "/tmp/torchtune/llama3_2_1B/full"

    return {
        "output_dir": output_dir,
        # PATTERN 1: Simple Component Instantiation
        "tokenizer": TokenizerWithConfig.Config(
            path="/tmp/Llama-3.2-1B-Instruct/original/tokenizer.model",
        ),
        # PATTERN 2: Component with Nested Instantiation
        "model": LlamaModelWithConfig.Config(
            attn_config=MultiHeadAttentionWithConfig.Config(
                num_heads=32,
            )
        ),
        # PATTERN 3: Component Needing Runtime Args (Partial)
        "optimizer": ComponentConfig(
            component_cls=torch.optim.AdamW,
            kwargs={"lr": 2e-5},
        ),
        # PATTERN 4: Non-Instantiated Config Block (Plain Data)
        "data_args": DataArgs(
            batch_size=4,
            shuffle=True,
        ),
        # PATTERN 5: Plain Top-Level Hyperparameters
        "epochs": 1,
        "gradient_accumulation_steps": 8,
    }


if __name__ == "__main__":
    # =========================================================================
    # Scenario 1: Basic Instantiation
    # =========================================================================
    cfg = llama3_2_1b_full()

    # PATTERN 1: Simple Component Instantiation
    tokenizer = cfg["tokenizer"].build()

    # PATTERN 2: Component with Nested Instantiation
    model = cfg["model"].build()

    # PATTERN 3: Component Needing Runtime Args (Partial)
    optimizer = cfg["optimizer"].build(model.parameters())

    # =========================================================================
    # Scenario 2: Override Config Values
    # =========================================================================
    cfg2 = llama3_2_1b_full()

    # PATTERN 1: Simple Component Instantiation
    cfg2["tokenizer"].path = "/new/path"

    # PATTERN 2: Component with Nested Instantiation
    cfg2["model"].attn_config.num_heads = 64

    # PATTERN 3: Component Needing Runtime Args (Partial)
    cfg2["optimizer"].kwargs["lr"] = 1e-4

    model2 = cfg2["model"].build()
    optimizer2 = cfg2["optimizer"].build(model2.parameters())

    # =========================================================================
    # Scenario 3: Config Composition
    # =========================================================================
    def llama3_2_1b_large_lr():
        """Variant with larger learning rate and different model config."""
        base = llama3_2_1b_full()
        # Overrides
        base["optimizer"].kwargs["lr"] = 1e-3
        base["model"].attn_config.num_heads = 64
        return base

    cfg_variant = llama3_2_1b_large_lr()
    model_variant = cfg_variant["model"].build()
    optimizer_variant = cfg_variant["optimizer"].build(model_variant.parameters())
    assert optimizer_variant.param_groups[0]["lr"] == 1e-3
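
`mock_with_config` is not included in this excerpt. Based only on how `ComponentConfig` is used above (constructed with `component_cls` and `kwargs`, mutated via `.kwargs["lr"]`, and built with `.build(model.parameters())`), a minimal sketch of what it might look like:

```python
# Hypothetical sketch of ComponentConfig (mock_with_config is not shown in this excerpt);
# inferred from the usage above, not from the actual module.
from dataclasses import dataclass, field
from typing import Any, Callable


@dataclass
class ComponentConfig:
    """Defers instantiation until runtime args (e.g. model parameters) are available."""

    component_cls: Callable[..., Any]
    kwargs: dict = field(default_factory=dict)

    def build(self, *runtime_args: Any) -> Any:
        # Positional runtime args first (e.g. params for torch.optim.AdamW),
        # then the statically configured keyword arguments.
        return self.component_cls(*runtime_args, **self.kwargs)
```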
Lines changed: 130 additions & 0 deletions

@@ -0,0 +1,130 @@

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
Config using plain Python dicts.

Pros:
- Extremely simple
- No dependencies
- Easy to understand
- Flexible

Cons:
- No type hints (cfg["batch_szie"] typo won't be caught)
- No validation (cfg["batch_size"] = "invalid" won't error)
- Very loose, users can pass anything
"""

import torch.optim

from mock import llama3_2_1b, llama3_tokenizer, MultiHeadAttention


def llama3_2_1b_full():
    output_dir = "/tmp/torchtune/llama3_2_1B/full"
    batch_size = 4

    return {
        "output_dir": output_dir,
        # PATTERN 1: Simple Component Instantiation
        "tokenizer": {
            "cls": llama3_tokenizer,
            "kwargs": {
                "path": "/tmp/Llama-3.2-1B-Instruct/original/tokenizer.model",
            },
        },
        # PATTERN 2: Component with Nested Instantiation
        "model": {
            "cls": llama3_2_1b,
            "kwargs": {
                "attn_config": {
                    "cls": MultiHeadAttention,
                    "kwargs": {
                        "num_heads": 32,
                    },
                }
            },
        },
        # PATTERN 3: Component Needing Runtime Args (Partial)
        "optimizer": {
            "cls": torch.optim.AdamW,
            "kwargs": {
                "lr": 2e-5,
            },
        },
        # PATTERN 4: Non-Instantiated Config Block (Plain Data)
        "data_args": {
            "batch_size": batch_size,
            "shuffle": True,
        },
        # PATTERN 5: Plain Top-Level Hyperparameters
        "epochs": 1,
        "gradient_accumulation_steps": 8,
    }


if __name__ == "__main__":
    # =========================================================================
    # Scenario 1: Basic Instantiation
    # =========================================================================
    cfg = llama3_2_1b_full()

    # PATTERN 1: Simple Component Instantiation
    tokenizer = cfg["tokenizer"]["cls"](**cfg["tokenizer"]["kwargs"])

    # PATTERN 2: Component with Nested Instantiation
    attn_config = cfg["model"]["kwargs"]["attn_config"]["cls"](
        **cfg["model"]["kwargs"]["attn_config"]["kwargs"]
    )
    model = cfg["model"]["cls"](attn_config=attn_config)

    # PATTERN 3: Component Needing Runtime Args (Partial)
    optimizer = cfg["optimizer"]["cls"](
        model.parameters(), **cfg["optimizer"]["kwargs"]
    )

    # =========================================================================
    # Scenario 2: Override Config Values
    # =========================================================================
    cfg2 = llama3_2_1b_full()

    # PATTERN 1: Simple Component Instantiation
    cfg2["tokenizer"]["kwargs"]["path"] = "/new/tokenizer"

    # PATTERN 2: Component with Nested Instantiation
    cfg2["model"]["kwargs"]["attn_config"]["kwargs"]["num_heads"] = 64

    # PATTERN 3: Component Needing Runtime Args (Partial)
    cfg2["optimizer"]["kwargs"]["lr"] = 1e-4

    model2 = cfg2["model"]["cls"](
        attn_config=cfg2["model"]["kwargs"]["attn_config"]["cls"](
            **cfg2["model"]["kwargs"]["attn_config"]["kwargs"]
        )
    )
    optimizer2 = cfg2["optimizer"]["cls"](
        model2.parameters(), **cfg2["optimizer"]["kwargs"]
    )

    # =========================================================================
    # Scenario 3: Config Composition
    # =========================================================================
    def llama3_2_1b_large_lr():
        """Variant with larger learning rate."""
        base = llama3_2_1b_full()
        base["optimizer"]["kwargs"]["lr"] = 1e-3
        base["model"]["kwargs"]["attn_config"]["kwargs"]["num_heads"] = 64
        return base

    cfg_variant = llama3_2_1b_large_lr()
    attn_config_variant = cfg_variant["model"]["kwargs"]["attn_config"]["cls"](
        **cfg_variant["model"]["kwargs"]["attn_config"]["kwargs"]
    )
    model_variant = cfg_variant["model"]["cls"](attn_config=attn_config_variant)
    optimizer_variant = cfg_variant["optimizer"]["cls"](
        model_variant.parameters(), **cfg_variant["optimizer"]["kwargs"]
    )
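
The manual instantiation in the `__main__` block above shows how verbose the nested-dict pattern gets. A small recursive helper, not part of the commit (the name `build_component` is made up here), could walk the `{"cls", "kwargs"}` specs instead:

```python
# Hypothetical helper (not in the commit): a recursive builder for the
# {"cls": ..., "kwargs": ...} convention, so nested components such as attn_config
# don't have to be instantiated by hand.
from typing import Any


def build_component(spec: Any, *runtime_args: Any) -> Any:
    """Instantiate a {"cls", "kwargs"} spec, recursing into nested specs."""
    if isinstance(spec, dict) and "cls" in spec:
        kwargs = {
            k: build_component(v) if isinstance(v, dict) and "cls" in v else v
            for k, v in spec.get("kwargs", {}).items()
        }
        return spec["cls"](*runtime_args, **kwargs)
    return spec  # plain data (e.g. data_args) is returned untouched


# Usage sketch against the config above:
#   cfg = llama3_2_1b_full()
#   model = build_component(cfg["model"])                        # nested attn_config built recursively
#   optimizer = build_component(cfg["optimizer"], model.parameters())
```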
