pytorch
diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/integration_test_2gpu_rl_numerics.yaml b/.github/workflows/integration_test_2gpu_rl_numerics.yaml
Lines changed: 118 additions & 0 deletions
diff --git a/torchtitan/experiments/graph_trainer/common_utils.py b/torchtitan/experiments/graph_trainer/common_utils.py
Lines changed: 16 additions & 11 deletions
diff --git a/torchtitan/experiments/graph_trainer/configs.py b/torchtitan/experiments/graph_trainer/configs.py
Lines changed: 7 additions & 0 deletions
diff --git a/torchtitan/experiments/graph_trainer/deepseek_v3/config_registry.py b/torchtitan/experiments/graph_trainer/deepseek_v3/config_registry.py
Lines changed: 0 additions & 5 deletions
diff --git a/torchtitan/experiments/graph_trainer/deepseek_v3/parallelize.py b/torchtitan/experiments/graph_trainer/deepseek_v3/parallelize.py
Lines changed: 2 additions & 5 deletions
diff --git a/torchtitan/experiments/graph_trainer/llama3/parallelize.py b/torchtitan/experiments/graph_trainer/llama3/parallelize.py
Lines changed: 2 additions & 34 deletions
@@ -1,5 +1,5 @@
 torchdata >= 0.8.0
-datasets >= 3.6.0
+datasets >= 3.6.0, < 4.8.0
 tensorboard
 wandb
 fsspec
 
@@ -0,0 +1,118 @@
+name: RL Numerics 2 GPU Integration Tests
+
+on:
+  push:
+    branches: [ main ]
+    tags:
+      - ciflow/8gpu/*
+    paths:
+      - 'torchtitan/experiments/rl/**'
+      - '.github/workflows/integration_test_2gpu_rl_numerics.yaml'
+  pull_request:
+    paths:
+      - 'torchtitan/experiments/rl/**'
+      - '.github/workflows/integration_test_2gpu_rl_numerics.yaml'
+
+concurrency:
+  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+defaults:
+  run:
+    shell: bash -l -eo pipefail {0}
+
+permissions:
+      id-token: write
+      contents: read
+
+# Steps should be kept in sync with torchtitan/experiments/rl/README.md
+jobs:
+  # Step 1: Dynamically compute the matrix based on conditions
+  set-matrix:
+    uses: ./.github/workflows/set-matrix.yaml
+    with:
+      runner-cuda: linux.aws.h100.8
+
+  # Step 2: Use the dynamic matrix in the build-test job
+  build-test:
+    needs: set-matrix
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    strategy:
+      fail-fast: false
+      matrix: ${{ fromJSON(needs.set-matrix.outputs.matrix) }}
+    with:
+      runner: ${{ matrix.runner }}
+      gpu-arch-type: ${{ matrix.gpu-arch-type }}
+      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      docker-image: ${{ matrix.docker-image }}
+      repository: pytorch/torchtitan
+      upload-artifact: outputs
+      timeout: 90
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use the base env, not the one set up by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        conda install -y -c conda-forge libstdcxx-ng
+
+        pip install -e .
+
+        # Install CUDA 12.8 toolkit via conda (needed to build vLLM from source)
+        # Temporarily disable -u because conda activation scripts have unbound variables
+        set +u
+        conda install -y -c nvidia cuda-toolkit=12.8
+        set -u
+        # Conda CUDA toolkit puts headers/libs under targets/x86_64-linux/
+        export CUDA_HOME="${CONDA_PREFIX}/targets/x86_64-linux"
+        # Pass as CMake args since CUDA_TOOLKIT_ROOT_DIR is a CMake variable, not an env var
+        export CMAKE_ARGS="-DCUDA_TOOLKIT_ROOT_DIR=${CUDA_HOME} -DCMAKE_CUDA_COMPILER=${CONDA_PREFIX}/bin/nvcc"
+
+        # Log CUDA driver version for debugging.
+        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
+        echo "CUDA driver version: ${DRIVER_VERSION}"
+
+        pip config --user set global.progress_bar off
+
+        # Install torch nightly first (the TORCH_SPEC version pin is only used on ROCm)
+        TORCH_SPEC="torch"
+        if [ -n "${{ matrix.torch-version }}" ]; then
+          TORCH_SPEC="torch==${{ matrix.torch-version }}"
+        fi
+        if [ "${{ matrix.gpu-arch-type }}" = "rocm" ]; then
+          python -m pip install --force-reinstall --pre \
+            "${TORCH_SPEC}" --index-url ${{ matrix.index-url }}
+        else
+          python -m pip install --force-reinstall --pre \
+            torch --index-url ${{ matrix.index-url }}
+        fi
+
+        # Install RL dependencies: xformers, monarch, pygtrie, portpicker, torchstore, flash-attn-3
+        pip install xformers --extra-index-url ${{ matrix.index-url }}
+        pip install torchmonarch==0.3.0
+        pip install pygtrie portpicker
+        pip install --no-deps "git+https://github.com/meta-pytorch/torchstore.git@main"
+        pip install flash-attn-3 --extra-index=https://download.pytorch.org/whl/test/cu128
+
+        # Build and install vLLM from source using existing torch nightly
+        git clone https://github.com/vllm-project/vllm.git /tmp/vllm
+        cd /tmp/vllm
+        python use_existing_torch.py
+        pip install -r requirements/build.txt
+        # Constrain torch to the installed nightly version so pip doesn't try to downgrade it
+        TORCH_VERSION=$(python -c "import torch; print(torch.__version__)")
+        echo "Installed torch version: ${TORCH_VERSION}"
+        echo "torch==${TORCH_VERSION}" > /tmp/torch-constraint.txt
+        PIP_CONSTRAINT=/tmp/torch-constraint.txt pip install --no-build-isolation -v -e .
+        cd -
+
+        # Download Qwen3-0.6B checkpoint
+        python scripts/download_hf_assets.py \
+          --repo_id Qwen/Qwen3-0.6B \
+          --local_dir torchtitan/experiments/rl/example_checkpoint \
+          --all
+
+        # Run the attention numerics test (2 GPU, TP=2)
+        torchrun --nproc-per-node=2 \
+          torchtitan/experiments/rl/tests/test_attn_numerics.py
@@ -145,20 +145,25 @@ def convert_modules_to_fqns(modules, module_to_fqn_mapping):
     return module_fqns
 
 
-def maybe_disable_eager_ac(
+def apply_graph_ac(
     compile_config: CompileConfig,
     ac_config: "ActivationCheckpointConfig",
 ) -> None:
-    """Disable eager AC when apply_sac graph pass is enabled.
+    """Add apply_sac to compile joint passes for graph-based selective AC.
 
-    When apply_sac is used as a joint graph pass, eager activation checkpointing
-    must be disabled to avoid double-checkpointing. This must be called before
-    the model parallelization step that applies eager AC.
+    Must be called only when ac_config.mode != "none". Only "selective" mode
+    is supported; other modes raise ValueError.
     """
+    if ac_config.mode != "selective":
+        raise ValueError(
+            f"graph_trainer only supports activation_checkpoint.mode 'selective' or "
+            f"'none', got '{ac_config.mode}'. Use 'selective' for graph-based SAC."
+        )
+
     joint_pass_names = getattr(compile_config, "joint_passes", [])
-    if "apply_sac" in joint_pass_names:
-        if ac_config.mode != "none":
-            logger.info(
-                "apply_sac graph pass is enabled, overriding eager AC mode to none"
-            )
-            ac_config.mode = "none"
+    if "apply_sac" not in joint_pass_names:
+        compile_config.joint_passes = list(joint_pass_names) + ["apply_sac"]
+        logger.info(
+            "activation_checkpoint.mode is 'selective', added apply_sac to "
+            "compile.joint_passes"
+        )
@@ -8,6 +8,7 @@
 from dataclasses import dataclass, field, fields
 from typing import Literal
 
+from torchtitan.config import ActivationCheckpointConfig
 from torchtitan.config.configs import CompileConfig
 from torchtitan.protocols.model_spec import ModelSpec
 from torchtitan.trainer import Trainer
@@ -59,4 +60,10 @@ def to_graph_trainer_config(
     d["model_spec"] = model_registry(base_config.model_spec.flavor)
     d.pop("compile")
 
+    # graph_trainer uses graph-based SAC instead of eager AC. Override any
+    # non-"none" AC mode to "selective" so callers don't need per-config fixups.
+    ac = d.get("activation_checkpoint")
+    if ac is not None and ac.mode != "none":
+        d["activation_checkpoint"] = ActivationCheckpointConfig(mode="selective")
+
     return GraphTrainer.Config(**d)
@@ -4,7 +4,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from torchtitan.config import ActivationCheckpointConfig
 from torchtitan.experiments.graph_trainer.configs import (
     GraphTrainerCompileConfig,
     to_graph_trainer_config,
@@ -22,27 +21,23 @@
 
 def graph_trainer_deepseek_v3_debugmodel() -> GraphTrainer.Config:
     config = to_graph_trainer_config(deepseek_v3_debugmodel(), model_registry)
-    config.activation_checkpoint = ActivationCheckpointConfig(mode="none")
     config.compile = GraphTrainerCompileConfig(enable=True)
     return config
 
 
 def graph_trainer_deepseek_v3_debugmodel_flex_attn() -> (GraphTrainer.Config):
     config = to_graph_trainer_config(deepseek_v3_debugmodel_flex_attn(), model_registry)
-    config.activation_checkpoint = ActivationCheckpointConfig(mode="none")
     config.compile = GraphTrainerCompileConfig(enable=True)
     return config
 
 
 def graph_trainer_deepseek_v3_16b() -> GraphTrainer.Config:
     config = to_graph_trainer_config(deepseek_v3_16b(), model_registry)
-    config.activation_checkpoint = ActivationCheckpointConfig(mode="none")
     config.compile = GraphTrainerCompileConfig(enable=True)
     return config
 
 
 def graph_trainer_deepseek_v3_671b() -> GraphTrainer.Config:
     config = to_graph_trainer_config(deepseek_v3_671b(), model_registry)
-    config.activation_checkpoint = ActivationCheckpointConfig(mode="none")
     config.compile = GraphTrainerCompileConfig(enable=True)
     return config
@@ -17,11 +17,10 @@
 )
 from torchtitan.distributed import ParallelDims
 
-from torchtitan.distributed.activation_checkpoint import apply_ac
 from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp
 from torchtitan.experiments.graph_trainer.common_utils import (
     annotate_ac_regions,
-    maybe_disable_eager_ac,
+    apply_graph_ac,
 )
 from torchtitan.experiments.graph_trainer.compile import apply_compile
 from torchtitan.experiments.graph_trainer.deepseek_v3.model import (
@@ -99,8 +98,6 @@ def parallelize_deepseekv3(
 
     annotate_deepseekv3(model)
 
-    maybe_disable_eager_ac(compile_config, ac_config)
-
     if parallel_dims.tp_enabled:
         float8_config = find_float8_linear_config(model_converters.converters)
         enable_float8_linear = float8_config is not None
@@ -135,7 +132,7 @@ def parallelize_deepseekv3(
         )
 
     if ac_config.mode != "none":
-        apply_ac(model, ac_config)
+        apply_graph_ac(compile_config, ac_config)
 
     mp_policy = MixedPrecisionPolicy(
         param_dtype=TORCH_DTYPE_MAP[training.mixed_precision_param],
 
@@ -4,7 +4,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import torch
 from torch.fx.traceback import annotate_fn
 
 from torchtitan.components.quantization.float8 import find_float8_linear_config
@@ -16,11 +15,10 @@
     TrainingConfig,
 )
 from torchtitan.distributed import ParallelDims
-from torchtitan.distributed.activation_checkpoint import apply_ac
 from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp
 from torchtitan.experiments.graph_trainer.common_utils import (
     annotate_ac_regions,
-    maybe_disable_eager_ac,
+    apply_graph_ac,
 )
 from torchtitan.experiments.graph_trainer.compile import apply_compile
 from torchtitan.experiments.graph_trainer.llama3.model import GraphTrainerLlama3Model
@@ -32,25 +30,6 @@
 from torchtitan.protocols.model_converter import ModelConvertersContainer
 from torchtitan.tools.logging import logger
 
-# for selective op activation checkpointing
-_op_sac_save_list = {
-    torch.ops.aten.mm.default,
-    torch.ops.aten.linear.default,
-    torch.ops.aten._scaled_dot_product_efficient_attention.default,
-    torch.ops.aten._scaled_dot_product_flash_attention.default,
-    torch.ops.aten._scaled_dot_product_cudnn_attention.default,
-    torch.ops.aten._scaled_dot_product_attention_math.default,
-    torch.ops.aten._scaled_dot_product_fused_attention_overrideable.default,
-    torch.ops._c10d_functional.reduce_scatter_tensor.default,
-    # for low precision training, it's useful to always save
-    # the result of max, since the absolute maximum is
-    # used to compute the scaling factor for quantization.
-    torch.ops.aten.max.default,
-    torch._higher_order_ops.flex_attention,
-    torch.ops.torch_attn._varlen_attn.default,
-    torch._higher_order_ops.inductor_compiled_code,
-}
-
 
 def annotate_llama(model: GraphTrainerLlama3Model) -> None:
     """Attach annotations to FX graph nodes with ``torch.fx.traceback.annotate_fn``
@@ -103,8 +82,6 @@ def parallelize_llama(
 
     annotate_llama(model)
 
-    maybe_disable_eager_ac(compile_config, ac_config)
-
     if parallel_dims.tp_enabled:
         float8_config = find_float8_linear_config(model_converters.converters)
         enable_float8_linear = float8_config is not None
@@ -128,16 +105,7 @@ def parallelize_llama(
         maybe_enable_async_tp(parallelism, compile_config, tp_mesh)
 
     if ac_config.mode != "none":
-        model_compile_enabled = (
-            compile_config.enable and "model" in compile_config.components
-        )
-        apply_ac(
-            model,
-            ac_config,
-            model_compile_enabled=model_compile_enabled,
-            op_sac_save_list=_op_sac_save_list,
-            base_folder=dump_folder,
-        )
+        apply_graph_ac(compile_config, ac_config)
 
     # apply data parallel
     if (