
Commit 64c0052

Ulysses HF Accelerate integration (#7638)
Ulysses/ALST integration with HF Accelerate:

- Allow `UlyssesSPAttentionHF.register_with_transformers` to accept a `model` object as an argument, to match HF Accelerate's workflow
- Fix existing Ulysses tests to test z2 instead of z1
- Improve documentation
- Add a defensive check

The HF Accelerate PR that depends on this PR: huggingface/accelerate#3817

Signed-off-by: Stas Bekman <[email protected]>
1 parent 9c86cd9

File tree

3 files changed, +37 -11 lines changed

deepspeed/runtime/sequence_parallel/ulysses_sp.py

Lines changed: 34 additions & 8 deletions
@@ -345,8 +345,16 @@ def register_with_transformers(
         seq_length_is_variable=True,
     ):
         """
-        Register "ulysses" attn_implementation with HF transformers and return mpu (Megatron-LM-style parallel state object).
-        If sequence_parallel_size==1 do nothng and return None.
+        Register "ulysses" attn_implementation with HF transformers and return mpu (Megatron-LM-style parallel state groups object).
+        If sequence_parallel_size==1 do nothing and return None.
+
+        Args:
+        - model_name_or_path (object or str): model object, or HF hub model name, or model's local path
+        - core_attn_implementation (str): which attention to use: flash_attention_2 or flash_attention_3 or sdpa
+        - sequence_parallel_size (int): sequence parallelism dimension (if 1 it's disabled)
+        - max_length (int): actual global sequence length
+        - micro_batch_size (int): micro batch size
+        - seq_length_is_variable (bool): whether global seqlen may change between batches an optimization flag - the default is `True`

         """
         if sequence_parallel_size == 1:
@@ -359,8 +367,14 @@ def register_with_transformers(

         mpu.initialize_sequence_parallel(sequence_parallel_size=sequence_parallel_size)

-        # we don't have the model yet at this stage
-        hf_model_config = AutoConfig.from_pretrained(model_name_or_path)
+        from transformers import PreTrainedModel
+        if isinstance(model_name_or_path, PreTrainedModel):
+            # we already have the model
+            hf_model_config = model_name_or_path.config
+        else:
+            # if we don't have the model yet at this stage
+            hf_model_config = AutoConfig.from_pretrained(model_name_or_path)
+
         supported_attn_implementation = ["flash_attention_2", "flash_attention_3", "sdpa"]
         if core_attn_implementation not in supported_attn_implementation:
             # notes on the excluded ones:
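With this change `register_with_transformers` accepts either a model name/path or an already-instantiated `PreTrainedModel`, which matches HF Accelerate's workflow where the model exists before DeepSpeed is wired up. A minimal sketch of the new call path, assuming the method is invoked on `UlyssesSPAttentionHF` as named in the commit message; the checkpoint name, sizes, and handling of the returned `mpu` are illustrative, not taken from this diff:

```python
# Hedged sketch: pass an already-created model object instead of a name/path.
from transformers import AutoModelForCausalLM
from deepspeed.runtime.sequence_parallel.ulysses_sp import UlyssesSPAttentionHF

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")  # hypothetical checkpoint

mpu = UlyssesSPAttentionHF.register_with_transformers(
    model_name_or_path=model,           # a PreTrainedModel object now works here
    core_attn_implementation="sdpa",    # or flash_attention_2 / flash_attention_3
    sequence_parallel_size=4,           # SP degree; 1 disables Ulysses and returns None
    max_length=8192,                    # global sequence length
    micro_batch_size=1,
    seq_length_is_variable=True,
)
```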
@@ -460,6 +474,19 @@ def __init__(

         If more tokens need to be consumed per step use the gradient accumulation feature.

+        Ulysses expects the following dict keys in each DL batch (`dl->iter->next`):
+        - `input_ids`
+        - `position_ids`
+        - `labels`
+
+        Additional entries can be present.
+
+        The tensors are expected to be of shape: `[batch_size, seqlen, ...]`
+
+        The sharding happens on the seqlen (1st) dimension for all tensors in the batch, any non-tensor entries get copied to all ranks.
+
+        `attention_mask` isn't used by Ulysses, because it's typically too large when it's 4D, and position_ids is just 1D, therefore it's much much smaller and consumes little GPU memory.
+
         Arguments:
         - `dl`: an existing DataLoader object to wrap
         - `sp_rank`: SP rank
@@ -469,10 +496,6 @@ def __init__(

         Returns:
         Another DataLoader object
-
-        Here are the current assumptions on the inputs fetched by dl->iter->next
-        - the batch is a dict with at least the keys: `input_ids`, `labels`, `position_ids` - but can have any additional keys necessary.
-        - the tensor values get sharded, the non-tensor values are passed along as is
         """

         self.dl = dl
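The new docstring pins down the batch contract for the DataLoader adapter. A minimal sketch of a conforming micro-batch, with toy shapes and values chosen purely for illustration:

```python
import torch

# Hypothetical micro-batch satisfying the documented contract: every tensor is
# [batch_size, seqlen]; extra keys are allowed and non-tensor values are simply
# replicated to every SP rank.
batch_size, seqlen = 1, 8
batch = {
    "input_ids": torch.randint(0, 32000, (batch_size, seqlen)),
    "position_ids": torch.arange(seqlen).unsqueeze(0),            # [1, seqlen]
    "labels": torch.randint(0, 32000, (batch_size, seqlen)),
    "sample_id": "doc-0042",  # non-tensor extra entry, passed along as-is
}
# No `attention_mask`: Ulysses relies on position_ids, which is far smaller than a 4D mask.
```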
@@ -515,6 +538,9 @@ def refill(self):
         for k in batch.keys():
             if torch.is_tensor(batch[k]):
                 batch[k] = batch[k].to(self.device)
+                if seqlen != batch[k].shape[1]:
+                    raise ValueError(
+                        f"{k}'s shape {batch[k].shape} must match input_ids's shape {batch['input_ids'].shape}")
                 with torch.no_grad():
                     tensor_list = [
                         torch.zeros((batch[k].shape[0], seqlens[i]), dtype=batch[k].dtype, device=batch[k].device)
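This is the defensive check from the commit message: tensors whose sequence length disagrees with `input_ids` are rejected before sharding. A standalone sketch of the same validation logic, assuming `seqlen` is taken from `input_ids` as in the surrounding code; the helper name is made up:

```python
import torch

def validate_batch_seqlen(batch: dict) -> None:
    """Standalone mirror of the new defensive check in refill()."""
    seqlen = batch["input_ids"].shape[1]
    for k, v in batch.items():
        if torch.is_tensor(v) and v.shape[1] != seqlen:
            raise ValueError(f"{k}'s shape {v.shape} must match input_ids's shape {batch['input_ids'].shape}")

# A labels tensor one token short now fails fast, before the gather/shard step:
bad = {
    "input_ids": torch.zeros(1, 8, dtype=torch.long),
    "labels": torch.zeros(1, 7, dtype=torch.long),
}
# validate_batch_seqlen(bad)  # raises ValueError
```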

tests/unit/ulysses_alst/test_tiled_compute.py

Lines changed: 2 additions & 2 deletions
@@ -98,7 +98,7 @@ def mlp_forward_sequence_tiled_compute(self, x):


 @pytest.mark.parametrize("batch_size", [1, 2])
-@pytest.mark.parametrize("zero_stage", [1, 3])
+@pytest.mark.parametrize("zero_stage", [2, 3])
 class TestTiledCompute(DistributedTest):
     world_size = 1

@@ -232,7 +232,7 @@ def test_tiled_mlp(self, zero_stage, batch_size):


 @pytest.mark.parametrize("batch_size", [1, 2])
-@pytest.mark.parametrize("zero_stage", [1, 3])
+@pytest.mark.parametrize("zero_stage", [2, 3])
 class TestTiledFusedLogitsLoss(DistributedTest):
     world_size = 1

tests/unit/ulysses_alst/test_ulysses_sp_hf.py

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ def get_grad(param, zero_stage):
     # return safe_get_full_grad(param)


-@pytest.mark.parametrize("zero_stage", [1, 3])
+@pytest.mark.parametrize("zero_stage", [2, 3])
 class TestUlyssesSPHF(DistributedTest):
     world_size = 2
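All three test files now parametrize over ZeRO stages 2 and 3 instead of 1 and 3. The parametrized stage ultimately lands in a DeepSpeed config; a hedged sketch of how such a config is typically assembled (keys follow the standard DeepSpeed config schema, the helper name is hypothetical):

```python
# Hypothetical helper; keys follow the standard DeepSpeed config schema.
def make_ds_config(zero_stage: int, micro_batch_size: int = 1) -> dict:
    return {
        "train_micro_batch_size_per_gpu": micro_batch_size,
        "zero_optimization": {"stage": zero_stage},  # 2 or 3 after this commit
        "bf16": {"enabled": True},
    }

ds_config = make_ds_config(zero_stage=2)
```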
