 # LICENSE file in the root directory of this source tree.

 import torch
+import torch.distributed as dist
 import torch.nn.functional as F

+from torch.distributed.tensor import DTensor
+

 def compute_logprobs(
     logits: torch.Tensor,
@@ -95,3 +98,124 @@ def compute_logprobs(
     )

     return logprobs.reshape(batch_size, seq_len)
+
+
+def compute_logprobs_parallel(
+    logits: DTensor,
+    target_ids: torch.Tensor,
+    temperature: float = 1.0,
+    align: bool = True,
+) -> torch.Tensor:
+    """
+    Compute log probabilities for target tokens from vocab-sharded DTensor logits.
+
+    This function computes log_softmax(logits)[target_ids] in a distributed
+    fashion, without ever gathering the full vocabulary dimension.
+
+    IMPORTANT: Only use this when logits is a DTensor sharded on the vocab
+    dimension. For regular tensors or non-vocab-sharded DTensors, use
+    compute_logprobs instead.
+
+    Args:
+        logits: DTensor of shape [batch_size, seq_len, vocab_size], sharded on dim=-1.
+        target_ids: Tensor of shape [batch_size, target_len] with target token IDs.
+        temperature: Temperature for scaling logits (default 1.0).
+        align: If True, slice logits to align with target_ids (default True).
+
+    Returns:
+        Tensor of shape [batch_size, target_len] with log probabilities.
+    """
+    # Get sharding info using helper
+    tp_group, tp_rank, tp_size, vocab_start, vocab_end = get_vocab_shard_info(logits)
+
+    if tp_group is None:
+        # DTensor but not sharded on vocab (Replicate or other dim sharding)
+        return compute_logprobs(logits.full_tensor(), target_ids, temperature, align)
+
+    # Get the local shard
+    local_logits = logits._local_tensor  # [batch, seq_len, vocab_size / tp_size]
+
+    # Align logits with target if needed
+    if align:
+        # Slice to match target length: logits[:, -target_len-1:-1, :]
+        target_len = target_ids.size(1)
+        local_logits = local_logits[:, -target_len - 1 : -1, :]
+
+    # Scale by temperature
+    local_logits = local_logits / temperature
+
+    batch_size, seq_len, local_vocab_size = local_logits.shape
+
+    # Move target_ids to the same device as local_logits
+    target_ids = target_ids.to(local_logits.device)
+
+    # Cast to float32 for numerical stability
+    local_logits_fp32 = local_logits.float()
+
+    # Compute global max across all shards for numerical stability
+    local_max = local_logits_fp32.max(dim=-1, keepdim=True).values
+    global_max = local_max.clone()
+    dist.all_reduce(global_max, op=dist.ReduceOp.MAX, group=tp_group)
+
+    # Compute global sum(exp(x - max)) for the log-sum-exp trick
+    local_exp = torch.exp(local_logits_fp32 - global_max)
+    local_sum_exp = local_exp.sum(dim=-1, keepdim=True)
+    global_sum_exp = local_sum_exp.clone()
+    dist.all_reduce(global_sum_exp, op=dist.ReduceOp.SUM, group=tp_group)
+
+    # log_normalizer = global_max + log(global_sum_exp)
+    log_normalizer = global_max + torch.log(global_sum_exp)  # [batch, seq, 1]
+    log_normalizer = log_normalizer.squeeze(-1)  # [batch, seq]
+
+    # Extract logits at target positions - each rank only has part of the vocab
+    is_local = (target_ids >= vocab_start) & (target_ids < vocab_end)
+
+    # Convert global indices to local indices (only valid where is_local=True)
+    local_indices = target_ids - vocab_start
+    local_indices = local_indices.clamp(0, local_vocab_size - 1)  # Clamp for safety
+
+    target_logits = torch.gather(
+        local_logits_fp32,
+        dim=-1,
+        index=local_indices.unsqueeze(-1).long(),
+    ).squeeze(-1)
+
+    # Zero out where this rank doesn't own the token, then reduce
+    target_logits = target_logits * is_local.float()
+    dist.all_reduce(target_logits, op=dist.ReduceOp.SUM, group=tp_group)
+
+    logprobs = target_logits - log_normalizer
+
+    return logprobs
+
+
+def get_vocab_shard_info(
+    logits: DTensor,
+) -> tuple[dist.ProcessGroup | None, int, int, int, int]:
+    """
+    Get vocabulary sharding information from a DTensor.
+
+    Args:
+        logits: DTensor with shape [..., vocab_size], potentially sharded on vocab dim.
+
+    Returns:
+        Tuple of (tp_group, tp_rank, tp_size, vocab_start, vocab_end).
+        If not sharded, returns (None, 0, 1, 0, vocab_size).
+    """
+    from torch.distributed.tensor.placement_types import Shard
+
+    local_logits = logits._local_tensor
+    placements = logits.placements
+    device_mesh = logits.device_mesh
+
+    for i, p in enumerate(placements):
+        if isinstance(p, Shard) and p.dim == logits.ndim - 1:  # last dim is the vocab dimension
+            tp_group = device_mesh.get_group(mesh_dim=i)
+            tp_size = dist.get_world_size(tp_group)
+            tp_rank = dist.get_rank(tp_group)
+            local_vocab_size = local_logits.shape[-1]
+            vocab_start = tp_rank * local_vocab_size
+            vocab_end = vocab_start + local_vocab_size
+            return tp_group, tp_rank, tp_size, vocab_start, vocab_end
+
+    # Not sharded
+    return None, 0, 1, 0, local_logits.shape[-1]
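
A minimal sanity-check sketch (not part of the commit) of how the new helpers might be exercised, launched via torchrun so each rank holds one vocab shard. It assumes compute_logprobs, compute_logprobs_parallel, and get_vocab_shard_info are importable from the patched module, and that compute_logprobs accepts (logits, target_ids, temperature, align) with the same semantics as the fallback call above; the gloo backend, module path, and tensor sizes are illustrative only.

# check_logprobs_parallel.py -- run with e.g.: torchrun --nproc_per_node=2 check_logprobs_parallel.py
import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Shard, distribute_tensor

# Assumed import location (hypothetical module path):
# from <module> import compute_logprobs, compute_logprobs_parallel, get_vocab_shard_info


def main() -> None:
    dist.init_process_group("gloo")  # CPU backend for the sketch; use "nccl" on GPUs
    world_size = dist.get_world_size()
    mesh = init_device_mesh("cpu", (world_size,), mesh_dim_names=("tp",))

    torch.manual_seed(0)  # identical full logits on every rank
    batch, seq, vocab = 2, 5, 16 * world_size
    full_logits = torch.randn(batch, seq, vocab)
    target_ids = torch.randint(0, vocab, (batch, seq - 1))

    # Shard the vocab dimension (dim=2) across the "tp" mesh dimension
    sharded_logits = distribute_tensor(full_logits, mesh, placements=[Shard(2)])
    tp_group, tp_rank, tp_size, v_start, v_end = get_vocab_shard_info(sharded_logits)
    assert tp_group is not None and v_end - v_start == vocab // tp_size

    # The sharded path should match the dense reference computed on the full logits
    parallel = compute_logprobs_parallel(sharded_logits, target_ids)
    reference = compute_logprobs(full_logits, target_ids, 1.0, True)
    torch.testing.assert_close(parallel, reference, rtol=1e-5, atol=1e-5)

    if dist.get_rank() == 0:
        print("sharded and dense logprobs match, shape:", tuple(parallel.shape))
    dist.destroy_process_group()


if __name__ == "__main__":
    main()

The payoff of the sharded path is that no rank ever materializes the full [batch, seq, vocab] logits: the normalizer costs two all_reduces of [batch, seq, 1] tensors (MAX and SUM) plus one more for the gathered target logits, instead of an all_gather over the vocabulary dimension.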