Commit e28f84b

Undo other random changes
1 parent 09d81b7 commit e28f84b

6 files changed (+18, -205 lines)

apps/on_policy_distillation/README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -68,7 +68,7 @@ python -m apps.on-policy-distillation.main --config apps/on-policy-distillation/
 1. **Ensure proper initialization**: Load the SFT checkpoint before starting OPD
 2. **Use prompts only**: During OPD, sample completions from student, don't use dataset solutions
 3. **Teacher quality matters**: Better teachers provide better supervision
-4. **Monitor reverse KL**: Should decrease to near-zero as training progresses
+4. **Monitor reverse KL**: Should go to near-zero as training progresses
 
 ## References
```
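
The reverse-KL bullet edited above is the core training signal in on-policy distillation: completions are sampled from the student, and the objective drives KL(student || teacher) toward zero. A minimal monitoring sketch, assuming per-token log-probabilities of the sampled tokens are available from both models (the names below are illustrative, not from this repository):

```python
import torch

def reverse_kl_estimate(
    student_logprobs: torch.Tensor,  # [batch, seq] log p_student(sampled token)
    teacher_logprobs: torch.Tensor,  # [batch, seq] log p_teacher(sampled token)
) -> torch.Tensor:
    # Because the tokens were sampled from the student, the per-token difference
    # log p_student - log p_teacher is an unbiased estimate of KL(student || teacher).
    return (student_logprobs - teacher_logprobs).mean()

# During OPD this estimate should trend toward zero as the student matches the teacher.
```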

apps/sft/main.py

Lines changed: 6 additions & 54 deletions

```diff
@@ -21,14 +21,11 @@
 
 import torch
 
+import torchtitan.experiments.forge.train_spec as forge_train_spec
 from forge.controller import ForgeActor
 from forge.data.collate import collate_packed
 from forge.data.datasets.packed import PackedDataset, TextPacker
-from forge.data.datasets.sft_dataset import (
-    AlpacaToMessages,
-    OpenThoughtsToMessages,
-    sft_iterable_dataset,
-)
+from forge.data.datasets.sft_dataset import AlpacaToMessages, sft_iterable_dataset
 from forge.data.tokenizer import HuggingFaceModelTokenizer
 from forge.observability import get_or_create_metric_logger, record_metric, Reduce
 from forge.util.config import parse
```

```diff
@@ -84,34 +81,8 @@ def __init__(self, config: DictConfig):
         self.gradient_accumulation_steps = 1  # Example value, adjust as needed
         self._rank = current_rank().rank
         self._size = math.prod(current_size().values())
-        self._init_dist()
         super().__init__(job_config)
 
-    def _init_dist(self):
-        """Initializes torch distributed.
-
-        torchrun normally hands this, but we need to do it ourselves
-        in monarch for now.
-
-        We should consider putting this into ForgeActor, but having this
-        be explicit for now.
-
-        """
-        env = {
-            "RANK": str(self._rank),
-            "LOCAL_RANK": str(self._rank),
-            "LOCAL_WORLD_SIZE": str(self._size),
-            "GROUP_RANK": str(self._size),
-            "GROUP_WORLD_SIZE": str(self._size),
-            "ROLE_RANK": str(self._rank),
-            "ROLE_WORLD_SIZE": str(self._size),
-            "ROLE_NAME": "rank",
-            "WORLD_SIZE": str(self._size),
-            "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
-        }
-        os.environ.update(env)
-        logger.info("env: {}".format(env))
-
     async def setup_metric_logger(self):
         """Initialization happens in the main process. Here we just retrieve it"""
         mlogger = await get_or_create_metric_logger()
```

```diff
@@ -168,32 +139,13 @@ def setup_data(self):
             ),
         )
 
-        # Get dataset configuration from job_config
-        dataset_config = self.job_config["dataset"]
-        dataset_path = dataset_config["path"]
-        dataset_split = dataset_config["split"]
-        message_transform_type = dataset_config.get("message_transform", "alpaca")
-        masking_strategy = dataset_config.get("masking_strategy", "train_on_assistant")
-
-        # Select the appropriate message transform
-        if message_transform_type == "openthoughts":
-            message_transform = OpenThoughtsToMessages(
-                masking_strategy=masking_strategy
-            )
-        elif message_transform_type == "alpaca":
-            message_transform = AlpacaToMessages(masking_strategy=masking_strategy)
-        else:
-            raise ValueError(
-                f"Unknown message_transform type: {message_transform_type}"
-            )
-
         dataset = sft_iterable_dataset(
             model_transform=tokenizer,
-            message_transform=message_transform,
-            path=dataset_path,
-            split=dataset_split,
+            message_transform=AlpacaToMessages(),
+            path="yahma/alpaca-cleaned",
+            split="train",
         )
-        packer = TextPacker(padding_idx=151643)
+        packer = TextPacker(padding_idx=0)
         dataset = PackedDataset(
             dataset=dataset,
             packer=packer,
```
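
With the config-driven transform selection reverted in setup_data above, the data path is hard-wired to Alpaca again. A rough sketch of how the restored pieces compose, using only the arguments visible in this diff (the tokenizer is whatever setup_data constructed earlier; the trailing PackedDataset arguments are elided, and the helper name is illustrative):

```python
from forge.data.datasets.packed import PackedDataset, TextPacker
from forge.data.datasets.sft_dataset import AlpacaToMessages, sft_iterable_dataset

def build_alpaca_dataset(tokenizer):
    # Raw Alpaca samples -> chat messages -> tokenized iterable dataset.
    dataset = sft_iterable_dataset(
        model_transform=tokenizer,
        message_transform=AlpacaToMessages(),
        path="yahma/alpaca-cleaned",
        split="train",
    )
    # Pack variable-length samples into fixed-length sequences; padding id 0
    # replaces the previously hard-coded 151643 (a tokenizer-specific pad/eos id).
    packer = TextPacker(padding_idx=0)
    return PackedDataset(
        dataset=dataset,
        packer=packer,
        # ... remaining PackedDataset arguments as in setup_data
    )
```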

src/forge/actors/generator.py

Lines changed: 6 additions & 5 deletions

```diff
@@ -239,11 +239,12 @@ def _spawn_fetchers(self):
         # TODO: this assumes the generator is on the same host as the worker
         # and only works for single host generators. Figure out how to support
         # generators with workers spanned across multiple hosts.
-        fetcher_procs = this_host().spawn_procs(
-            per_host={"procs": self.n_fetcher_procs}
-        )
-        self._fetcher_procs = fetcher_procs
-        self.weight_fetchers = fetcher_procs.spawn("weight_fetcher", _WeightFetcher)
+        pass
+        # fetcher_procs = this_host().spawn_procs(
+        #     per_host={"procs": self.n_fetcher_procs}
+        # )
+        # self._fetcher_procs = fetcher_procs
+        # self.weight_fetchers = fetcher_procs.spawn("weight_fetcher", _WeightFetcher)
 
     def _start_processing(self):
         if self._run_task is None or self._run_task.done():
```

src/forge/actors/reference_model.py

Lines changed: 4 additions & 68 deletions

```diff
@@ -13,9 +13,6 @@
 from dataclasses import dataclass, field, fields
 
 import torch
-import torch.nn.functional as F
-
-# from forge.util.ops import compute_logprobs
 from monarch.actor import current_rank, current_size, endpoint
 from torch.distributed.tensor import DTensor
 
```

```diff
@@ -33,6 +30,7 @@
 from forge.controller import ForgeActor
 from forge.observability.metrics import record_metric, Reduce
 from forge.observability.perf_tracker import Tracer
+from forge.util.ops import compute_logprobs
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
```

```diff
@@ -182,77 +180,15 @@ async def forward(
         with torch.inference_mode():
             logits = self.model(input_ids)
         self.step += 1
-        # if isinstance(logits, DTensor):
-        #     logits = logits.full_tensor()
+        if isinstance(logits, DTensor):
+            logits = logits.full_tensor()
         t.step("forward")
 
         if not return_logprobs:
             t.stop()
-            if isinstance(logits, DTensor):
-                return logits.full_tensor()
             return logits
         else:
-            logprobs = compute_logprobs_chunked(logits, input_ids[:, max_req_tokens:])
+            logprobs = compute_logprobs(logits, input_ids[:, max_req_tokens:])
             t.step("compute_logprobs")
             t.stop()
             return logprobs
-
-
-def compute_logprobs_chunked(
-    logits: torch.Tensor | DTensor,
-    input_ids: torch.Tensor,
-    temperature: float = 1.0,
-    align: bool = True,
-    chunk_size: int = 512,
-) -> torch.Tensor:
-    """
-    Memory-efficient version that processes logits in chunks along the sequence dimension.
-    Useful for very long sequences where even the DTensor operations might cause memory issues.
-
-    Args:
-        chunk_size: Number of tokens to process at once. Lower values use less memory.
-    """
-    is_dtensor = isinstance(logits, DTensor)
-
-    # Align logits with input_ids if requested
-    if align:
-        target_len = input_ids.size(1)
-        logits = logits[:, -target_len - 1 : -1, :]
-        if not is_dtensor:
-            logits = logits.to(input_ids.device)
-
-    batch_size, seq_len, vocab_size = logits.shape
-
-    # Initialize output tensor
-    logprobs = torch.zeros(
-        batch_size, seq_len, dtype=torch.float32, device=logits.device
-    )
-
-    # Process in chunks
-    for start_idx in range(0, seq_len, chunk_size):
-        end_idx = min(start_idx + chunk_size, seq_len)
-
-        # Get chunk of logits and input_ids
-        logits_chunk = logits[:, start_idx:end_idx, :]
-        input_chunk = input_ids[:, start_idx:end_idx]
-
-        # Scale and convert to fp32
-        scaled_chunk = (logits_chunk / temperature).float()
-
-        # Compute log probabilities for this chunk
-        chunk_size_actual = end_idx - start_idx
-        flat_logits = scaled_chunk.reshape(-1, vocab_size)
-        flat_targets = input_chunk.reshape(-1).long()
-
-        chunk_logprobs = -F.cross_entropy(
-            flat_logits,
-            flat_targets,
-            reduction="none",
-        )
-
-        # Store in output tensor
-        logprobs[:, start_idx:end_idx] = chunk_logprobs.reshape(
-            batch_size, chunk_size_actual
-        )
-
-    return logprobs
```
src/forge/data/datasets/__init__.py

Lines changed: 1 addition & 8 deletions

```diff
@@ -7,20 +7,13 @@
 from .dataset import DatasetInfo, InfiniteTuneIterableDataset, InterleavedDataset
 from .hf_dataset import HfIterableDataset
 from .packed import PackedDataset
-from .sft_dataset import (
-    AlpacaToMessages,
-    OpenThoughtsToMessages,
-    sft_iterable_dataset,
-    SFTOutputTransform,
-)
+from .sft_dataset import sft_iterable_dataset, SFTOutputTransform
 
 __all__ = [
-    "AlpacaToMessages",
     "DatasetInfo",
     "HfIterableDataset",
    "InterleavedDataset",
     "InfiniteTuneIterableDataset",
-    "OpenThoughtsToMessages",
     "PackedDataset",
     "SFTOutputTransform",
     "sft_iterable_dataset",
```

src/forge/data/datasets/sft_dataset.py

Lines changed: 0 additions & 69 deletions

```diff
@@ -105,75 +105,6 @@ def __call__(self, sample: dict[str, Any]) -> dict[str, Any]:
         return {"messages": messages}
 
 
-class OpenThoughtsToMessages:
-    """
-    Message transform class for OpenThoughts-style datasets with a "conversations" column
-    containing a list of dictionaries with "from" and "value" fields.
-
-    Args:
-        column_map (dict[str, str] | None): a mapping to change the expected "conversations"
-            column name to the actual column name in the dataset. Default is None,
-            keeping the default column name.
-        masking_strategy (str): masking strategy to use for model training.
-            Must be one of: `train_on_all`, `train_on_assistant`, `train_on_last`.
-            Default is "train_on_assistant".
-
-            - ``train_on_all``: both user and assistant messages are unmasked
-            - ``train_on_assistant``: user messages are masked, only assistant messages are unmasked
-            - ``train_on_last``: only the last assistant message is unmasked
-    """
-
-    def __init__(
-        self,
-        column_map: dict[str, str] | None = None,
-        masking_strategy: str = "train_on_assistant",
-    ):
-        self.masking_strategy = masking_strategy
-        if column_map:
-            if "conversations" not in column_map:
-                raise ValueError(
-                    f"Expected a key of 'conversations' in column_map but found {column_map.keys()}."
-                )
-            self._column_map = column_map
-        else:
-            self._column_map = {
-                "conversations": "conversations",
-            }
-
-    def __call__(self, sample: dict[str, Any]) -> dict[str, Any]:
-        conversations = sample[self._column_map["conversations"]]
-
-        if not isinstance(conversations, list):
-            raise ValueError(
-                f"Expected 'conversations' to be a list, got {type(conversations)}"
-            )
-
-        messages = []
-        for message_dict in conversations:
-            role = message_dict.get("from", "")
-            content = message_dict.get("value", "")
-
-            # Map OpenThoughts roles to standard roles
-            if role in ["human", "user"]:
-                role = "user"
-            elif role in ["gpt", "assistant", "model"]:
-                role = "assistant"
-            else:
-                # Skip unknown roles
-                continue
-
-            messages.append(
-                TuneMessage(
-                    role=role,
-                    content=content,
-                    eot=True,
-                )
-            )
-
-        mask_messages(messages, self.masking_strategy)
-        return {"messages": messages}
-
-
 class SFTOutputTransform:
     """Applied to each dataset sample to build the `"labels"` tensor for causal-LM SFT training.
```
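
For reference, the removed OpenThoughtsToMessages transform consumed samples with a "conversations" list of {"from", "value"} dicts. A hypothetical input and the mapping the removed code performed:

```python
# Hypothetical sample in the format the removed transform expected.
sample = {
    "conversations": [
        {"from": "human", "value": "What is 2 + 2?"},
        {"from": "gpt", "value": "2 + 2 = 4."},
    ]
}
# __call__ mapped "human"/"user" -> "user" and "gpt"/"assistant"/"model" -> "assistant",
# skipped unknown roles, wrapped each turn in TuneMessage(..., eot=True), then applied
# mask_messages(messages, masking_strategy) before returning {"messages": messages}.
```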
