Commit b1644a4

[TorchComms] integration CI tests (#1927)
Parent: 7b96efc

2 files changed: +153, -0 lines
New file (+53, -0): .github/workflows/integration_test_8gpu_torchcomms.yaml
name: TorchComms 8 GPU Integration Tests

on:
  push:
    branches: [ main ]
    paths:
      - 'torchtitan/experiments/torchcomms/**'
      - '.github/workflows/integration_test_8gpu_torchcomms.yaml'
  pull_request:
    paths:
      - 'torchtitan/experiments/torchcomms/**'
      - '.github/workflows/integration_test_8gpu_torchcomms.yaml'
  schedule:
    # Runs every 12 hours
    - cron: '0 */12 * * *'

concurrency:
  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

defaults:
  run:
    shell: bash -l -eo pipefail {0}

jobs:
  build-test:
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    with:
      runner: linux.g5.48xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.8"
      # This image is faster to clone than the default, but it lacks CC needed by triton
      # (1m25s vs 2m37s).
      docker-image: torchtitan-ubuntu-20.04-clang12
      repository: pytorch/torchtitan
      upload-artifact: outputs
      script: |
        set -eux

        # The generic Linux job chooses to use base env, not the one setup by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"

        # Log CUDA driver version for debugging.
        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
        echo "CUDA driver version: ${DRIVER_VERSION}"

        pip config --user set global.progress_bar off

        python -m pip install --force-reinstall --pre torch torchcomms --index-url https://download.pytorch.org/whl/nightly/cu128

        mkdir artifacts-to-be-uploaded
        TEST_BACKEND=ncclx TRAIN_FILE=torchtitan.experiments.torchcomms.train python -m torchtitan.experiments.torchcomms.integration_tests artifacts-to-be-uploaded --ngpu 8
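For local debugging outside CI, roughly the same flow can be reproduced by hand. The sketch below is illustrative and not part of the commit: it assumes a machine with 8 NVIDIA GPUs on a CUDA 12.8-compatible driver, an activated environment containing a pytorch/torchtitan checkout, and an output directory name (./torchcomms-test-outputs) chosen purely for the example.

    # Illustrative local reproduction of the CI script above (not part of the commit).
    # Assumes 8 GPUs, a CUDA 12.8-compatible driver, and a torchtitan checkout.
    python -m pip install --force-reinstall --pre torch torchcomms \
        --index-url https://download.pytorch.org/whl/nightly/cu128

    # The test runner creates the output directory if missing, but requires it to be empty.
    TEST_BACKEND=ncclx \
    TRAIN_FILE=torchtitan.experiments.torchcomms.train \
    python -m torchtitan.experiments.torchcomms.integration_tests \
        ./torchcomms-test-outputs --ngpu 8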
New file (+100, -0): the torchtitan.experiments.torchcomms.integration_tests module
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import os

from tests.integration_tests import OverrideDefinitions
from tests.integration_tests.run_tests import run_tests


def build_torchcomms_test_list() -> list[OverrideDefinitions]:
    """
    Returns a list of OverrideDefinitions that is used to generate
    variations of integration tests based on the same root config file.
    """
    integration_tests_flavors = [
        OverrideDefinitions(
            [
                [],
            ],
            "1D FSDP",
            "1d",
        ),
        OverrideDefinitions(
            [
                [
                    "--parallelism.tensor_parallel_degree 2",
                    "--parallelism.pipeline_parallel_degree 2",
                ],
            ],
            "FSDP+TP+PP",
            "3d_dp+tp+pp",
            ngpu=8,
        ),
        # TODO: Enable CP and async TP tests once fixes are available in
        # torchcomms nightly builds.
        # OverrideDefinitions(
        #     [
        #         [
        #             "--parallelism.context_parallel_degree 2",
        #             "--parallelism.pipeline_parallel_degree 2",
        #         ],
        #     ],
        #     "DP+CP+PP",
        #     "3d_dp+cp+pp",
        #     ngpu=8,
        # ),
        # OverrideDefinitions(
        #     [
        #         [
        #             "--compile.enable",
        #             "--parallelism.context_parallel_degree 2",
        #             "--parallelism.tensor_parallel_degree 2",
        #             "--parallelism.enable_async_tensor_parallel",
        #         ],
        #     ],
        #     "3D CP+async TP compile",
        #     "3d_cp+asynctp_compile",
        #     ngpu=8,
        # ),
    ]
    return integration_tests_flavors


_TEST_SUITES_FUNCTION = {
    "torchcomms": build_torchcomms_test_list,
}


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("output_dir")
    parser.add_argument(
        "--config_path",
        default="./tests/integration_tests/base_config.toml",
        help="Base config path for integration tests. This is the config that will be used as a base for all tests.",
    )
    parser.add_argument(
        "--test_name",
        default="all",
        help="Test to run; acceptable values: any `test_name` in `build_torchcomms_test_list` (default: all).",
    )
    parser.add_argument("--ngpu", default=8, type=int)
    args = parser.parse_args()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if os.listdir(args.output_dir):
        raise RuntimeError("Please provide an empty output directory.")

    test_list = _TEST_SUITES_FUNCTION["torchcomms"]()
    run_tests(args, test_list)


if __name__ == "__main__":
    main()
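A usage note, not part of the commit: because --test_name defaults to "all", a single flavor can be selected by the short name registered in build_torchcomms_test_list. The command below is a sketch that assumes an empty (or not yet existing) ./outputs directory and otherwise relies on the defaults shown above.

    # Sketch: run only the "1D FSDP" flavor registered as "1d" above.
    # ./outputs is an arbitrary example path and must be empty.
    python -m torchtitan.experiments.torchcomms.integration_tests ./outputs \
        --config_path ./tests/integration_tests/base_config.toml \
        --test_name 1d \
        --ngpu 8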
