Skip to content

Commit a18bae2

Browse files
committed
add periodic integration test
1 parent 7400448 commit a18bae2

File tree

9 files changed

+277
-3
lines changed

9 files changed

+277
-3
lines changed
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
# Periodic 8-GPU integration test for the flux experiment.
name: 8 GPU Integration Test

on:
  push:
    branches: [ main ]
    paths:
      - 'torchtitan/experiments/flux/**'
  pull_request:
    paths:
      - 'torchtitan/experiments/flux/**'
  schedule:
    # Runs every 12 hours
    - cron: '0 */12 * * *'

concurrency:
  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

defaults:
  run:
    shell: bash -l -eo pipefail {0}

jobs:
  build-test:
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    with:
      runner: linux.g5.48xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.6"
      # This image is faster to clone than the default, but it lacks CC needed by triton
      # (1m25s vs 2m37s).
      docker-image: torchtitan-ubuntu-20.04-clang12
      repository: pytorch/torchtitan
      upload-artifact: outputs
      script: |
        set -eux

        # The generic Linux job chooses to use base env, not the one setup by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"

        pip config --user set global.progress_bar off

        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126

        mkdir artifacts-to-be-uploaded
        python ./torchtitan/experiments/flux/tests/flux_integration_tests.py artifacts-to-be-uploaded --ngpu 8
-1.1 MB
Binary file not shown.

tests/integration_tests.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -558,7 +558,7 @@ def main():
558558
parser = argparse.ArgumentParser()
559559
parser.add_argument("output_dir")
560560
parser.add_argument(
561-
"--config_dir", default="./torchtitan/models/llama3/train_configs"
561+
"--config_dir", default="./torchtitan/experiments/flux/train_configs"
562562
)
563563
parser.add_argument(
564564
"--test",

torchtitan/experiments/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,6 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7+
import torchtitan.experiments.flux # noqa: F401
78
import torchtitan.experiments.llama4 # noqa: F401
89
import torchtitan.experiments.simple_fsdp # noqa: F401

torchtitan/experiments/flux/dataset/flux_dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ class TextToImageDatasetConfig:
116116
data_processor=_cc12m_wds_data_processor,
117117
),
118118
"cc12m-test": TextToImageDatasetConfig(
119-
path="tests/assets/cc12m_test",
119+
path="torchtitan/experiments/flux/tests/assets/cc12m_test",
120120
loader=lambda path: load_dataset(
121121
path, split="train", data_files={"train": "*.tar"}, streaming=True
122122
),

torchtitan/experiments/flux/tests/__init__.py

Whitespace-only changes.
Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import logging
import os
import subprocess
from collections import defaultdict
from dataclasses import dataclass
from typing import Sequence


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# tomllib is in the stdlib from Python 3.11; fall back to the tomli
# backport (same API) on older interpreters.
try:
    import tomllib
except ModuleNotFoundError:
    import tomli as tomllib
@dataclass
class OverrideDefinitions:
    """
    This class is used to define the override definitions for the integration tests.

    Each element of ``override_args`` is one set of extra CLI flags; a flavor
    with k elements launches k sequential training runs.
    """

    # NOTE(review): ``tuple(tuple(" "))`` evaluates to ``(' ',)`` — a single
    # blank override — kept as-is for parity with tests/integration_tests.py.
    override_args: Sequence[Sequence[str]] = tuple(tuple(" "))
    test_descr: str = "default"
    test_name: str = "default"
    ngpu: int = 4
    model_flavor: str = "flux-debug"

    def __repr__(self):
        # Log-friendly identity: the human-readable description.
        return self.test_descr
def build_test_list():
    """
    key is the config file name and value is a list of OverrideDefinitions
    that is used to generate variations of integration tests based on the
    same root config file.
    """
    integration_tests_flavors = defaultdict(list)
    integration_tests_flavors["debug_model.toml"] = [
        # basic tests
        OverrideDefinitions(
            [
                [
                    "--profiling.enable_profiling",
                    "--metrics.enable_tensorboard",
                ],
            ],
            "default",
            "default",
        ),
        # Compile tests
        OverrideDefinitions(
            [
                [
                    "--training.compile",
                ],
            ],
            "1D compile",
            "1d_compile",
        ),
        # Checkpointing tests
        OverrideDefinitions(
            [
                [
                    "--checkpoint.enable_checkpoint",
                    "--training.steps 20",
                ],
            ],
            "Checkpoint Integration Test - Save Load Full Checkpoint",
            "full_checkpoint",
        ),
        OverrideDefinitions(
            [
                [
                    "--checkpoint.enable_checkpoint",
                    "--checkpoint.model_weights_only",
                ],
            ],
            "Checkpoint Integration Test - Save Model Weights Only fp32",
            "model_weights_only_fp32",
        ),
        OverrideDefinitions(
            [
                [
                    "--checkpoint.enable_checkpoint",
                    "--checkpoint.model_weights_only",
                    "--checkpoint.export_dtype bfloat16",
                ],
            ],
            "Checkpoint Integration Test - Save Model Weights Only bf16",
            "model_weights_only_bf16",
        ),
        OverrideDefinitions(
            [
                [
                    "--parallelism.data_parallel_shard_degree=1",
                    "--parallelism.data_parallel_replicate_degree=4",
                ]
            ],
            "DDP",
            "ddp",
            ngpu=4,
        ),
        OverrideDefinitions(
            [
                [
                    "--parallelism.data_parallel_shard_degree=2",
                    "--parallelism.data_parallel_replicate_degree=2",
                ]
            ],
            "HSDP",
            "hsdp",
            ngpu=4,
        ),
        # TODO: re-enable once the image-generation script is migrated to flux.
        # OverrideDefinitions(
        #     [
        #         [
        #             "--checkpoint.enable_checkpoint",
        #         ],
        #         [
        #             # placeholder for the generation script's generate step
        #         ],
        #     ],
        #     "Generation script test",
        #     "test_generate",
        #     ngpu=2,
        # ),
    ]
    return integration_tests_flavors
141+
def _run_cmd(cmd):
142+
return subprocess.run([cmd], text=True, shell=True)
143+
144+
def run_test(test_flavor: OverrideDefinitions, full_path: str, output_dir: str):
    """Launch one training run per override-arg set of *test_flavor*.

    Raises:
        Exception: if any launched run exits with a non-zero status.
    """
    # run_test supports sequence of tests.
    test_name = test_flavor.test_name
    dump_folder_arg = f"--job.dump_folder {output_dir}/{test_name}"
    model_flavor_arg = f"--model.flavor {test_flavor.model_flavor}"
    all_ranks = ",".join(map(str, range(test_flavor.ngpu)))

    for idx, override_arg in enumerate(test_flavor.override_args):
        cmd = f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} ./torchtitan/experiments/flux/run_train.sh"
        # dump compile trace for debugging purpose
        cmd = f'TORCH_TRACE="{output_dir}/{test_name}/compile_trace" ' + cmd
        cmd += " " + dump_folder_arg
        cmd += " " + model_flavor_arg
        if override_arg:
            cmd += " " + " ".join(override_arg)
        logger.info(
            f"=====Flux Integration test, flavor : {test_flavor.test_descr}, command : {cmd}====="
        )

        # save checkpoint (idx == 0) and load it for generation (idx == 1)
        if test_name == "test_generate_image" and idx == 1:
            # TODO: migrate the generate-image script, then run it here, e.g.:
            # cmd = (
            #     f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} "
            #     f"CHECKPOINT_DIR={output_dir}/{test_name}/checkpoint/step-10 "
            #     "PROMPT='What is the meaning of life?' "
            #     f"./scripts/generate/run_llama_generate.sh --out > {output_dir}/{test_name}/generated_output.json"
            # )
            # Skip this step entirely: the original set cmd = None and fell
            # through to _run_cmd(None), which would raise a TypeError.
            continue

        result = _run_cmd(cmd)
        # NOTE(review): stdout is not captured by _run_cmd, so result.stdout is
        # None here; the child's output already streamed to the console.
        logger.info(result.stdout)
        if result.returncode != 0:
            raise Exception(
                f"Flux Integration test failed, flavor : {test_flavor.test_descr}, command : {cmd}"
            )
def run_tests(args):
    """Run every matching test flavor for each integration-test config.

    Scans ``args.config_dir`` for ``.toml`` files whose ``[job]`` table sets
    ``use_for_integration_test = true``, then runs the flavors selected by
    ``args.test`` that fit within ``args.ngpu`` GPUs.
    """
    integration_tests_flavors = build_test_list()
    for config_file in os.listdir(args.config_dir):
        # Guard clauses keep the loop flat instead of nesting five levels deep.
        if not config_file.endswith(".toml"):
            continue

        full_path = os.path.join(args.config_dir, config_file)
        with open(full_path, "rb") as f:
            config = tomllib.load(f)
            is_integration_test = config["job"].get("use_for_integration_test", False)
        if not is_integration_test:
            continue

        for test_flavor in integration_tests_flavors[config_file]:
            if args.test != "all" and test_flavor.test_name != args.test:
                continue
            if args.ngpu < test_flavor.ngpu:
                logger.info(
                    f"Skipping test {test_flavor.test_name} that requires {test_flavor.ngpu} gpus,"
                    f" because --ngpu arg is {args.ngpu}"
                )
            else:
                run_test(test_flavor, full_path, args.output_dir)
def main():
    """Parse CLI arguments and launch the flux integration-test suite."""
    parser = argparse.ArgumentParser()
    parser.add_argument("output_dir")
    parser.add_argument(
        "--config_dir", default="./torchtitan/experiments/flux/train_configs"
    )
    parser.add_argument(
        "--test",
        default="all",
        help="test to run, acceptable values: `test_name` in `build_test_list` (default: all)",
    )
    parser.add_argument("--ngpu", default=8, type=int)
    args = parser.parse_args()

    # exist_ok avoids the exists()-then-makedirs() race of the original;
    # an already-populated directory is still rejected so results never mix.
    os.makedirs(args.output_dir, exist_ok=True)
    if os.listdir(args.output_dir):
        raise RuntimeError("Please provide an empty output directory.")
    run_tests(args)
if __name__ == "__main__":
    main()

torchtitan/experiments/flux/tests/unit_tests/__init__.py

Whitespace-only changes.

torchtitan/experiments/flux/train_configs/debug_model.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ seq_len = 512
3838
max_norm = 2.0 # grad norm clipping
3939
steps = 10
4040
compile = false
41-
dataset = "cc12m-wds"
41+
dataset = "cc12m-test"
4242
classifer_free_guidance_prob = 0.1
4343
img_size = 256
4444

0 commit comments

Comments
 (0)