
Failing cluster tests in CI #174

@suzhoum

Description


The test_cluster tests on the "source" build have been failing on CI (example run).

=================================== FAILURES ===================================
__________________________ test_distributed_training ___________________________
test_helper = <class 'conftest.CloudTestHelper'>, framework_version = 'source'

    def test_distributed_training(test_helper, framework_version):
        with tempfile.TemporaryDirectory() as temp_dir:
            os.chdir(temp_dir)
            timestamp = test_helper.get_utc_timestamp_now()
            cp = TabularCloudPredictor(
                cloud_output_path=f"s3://autogluon-cloud-ci/test-tabular-distributed/{framework_version}/{timestamp}",
                local_output_path="test_tabular_distributed",
                backend="ray_aws",
            )
    
            train_data = pd.read_csv("https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv")
            subsample_size = 1000
            if subsample_size is not None and subsample_size < len(train_data):
                train_data = train_data.sample(n=subsample_size, random_state=0)
            predictor_init_args = {"label": "class"}
            predictor_fit_args = {
                "train_data": train_data,
                "hyperparameters": {
                    "GBM": {"num_leaves": space.Int(lower=26, upper=66, default=36)},
                },
                "num_bag_folds": 2,
                "num_bag_sets": 1,
                "hyperparameter_tune_kwargs": {  # HPO is not performed unless hyperparameter_tune_kwargs is specified
                    "num_trials": 2,
                    "scheduler": "local",
                    "searcher": "auto",
                },
            }
    
            image_uri = test_helper.get_custom_image_uri(framework_version, type="training", gpu=False)
    
>           cp.fit(
                predictor_init_args=predictor_init_args,
                predictor_fit_args=predictor_fit_args,
                custom_image_uri=image_uri,
                framework_version=framework_version,
                backend_kwargs={
                    "initialization_commands": [
                        "aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com",
                        "aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 369[469](https://github.com/autogluon/autogluon-cloud/actions/runs/13054240952/job/36421326686#step:7:470)875935.dkr.ecr.us-east-1.amazonaws.com",
                    ]
                },
            )

/home/runner/work/autogluon-cloud/autogluon-cloud/tests/unittests/cluster/test_distributed_training.py:43: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
/home/runner/work/autogluon-cloud/autogluon-cloud/src/autogluon/cloud/predictor/cloud_predictor.py:273: in fit
    self.backend.fit(
/home/runner/work/autogluon-cloud/autogluon-cloud/src/autogluon/cloud/backend/ray_backend.py:296: in fit
    raise e
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <autogluon.cloud.backend.ray_aws_backend.TabularRayAWSBackend object at 0x7f5c98c3f7c0>

            logger.log(20, "Waiting for 60s to give the cluster some buffer time")
            time.sleep(60)
            cluster_manager.setup_connection()
            time.sleep(10)  # waiting for connection to setup
            if job_name is None:
                job_name = CLOUD_RESOURCE_PREFIX + "-" + get_utc_timestamp_now()
            job = RayFitJob(output_path=self.cloud_output_path + "/model")
            self._fit_job = job
    
            entry_point_command = f"python3 {os.path.basename(train_script)} --ag_args_path {os.path.basename(ag_args_path)} --train_data {train_data} --model_output_path {self.get_fit_job_output_path()} --ray_job_id {job_name}"  # noqa: E501
            if tune_data is not None:
                entry_point_command += f" --tune_data {tune_data}"
            if leaderboard:
                entry_point_command += " --leaderboard"
            if not wait and ephemeral_cluster:
                entry_point_command += f" --cluster_config_file {os.path.basename(config)}"
            job.run(
                entry_point=entry_point_command,
                runtime_env={
                    "working_dir": job_path,
                    "env_vars": {
                        "AG_DISTRIBUTED_MODE": "1",
                        "AG_MODEL_SYNC_PATH": f"{self.cloud_output_path}/model_sync/",
                        "AG_UTIL_PATH": f"{self.cloud_output_path}/utils/",
                        "AG_NUM_NODES": str(instance_count),
                        # TODO: update syncing logic in tabular https://github.com/ray-project/ray/pull/37142
                        "RAY_AIR_REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE": "1",
                    },
                },
                job_name=job_name,
                timeout=timeout,
                wait=wait,
            )
            job_submitted = True
            if wait and job.get_job_status() != "SUCCEEDED":
>               raise ValueError("Training job failed. Please check the log for reason.")
E               ValueError: Training job failed. Please check the log for reason.

/home/runner/work/autogluon-cloud/autogluon-cloud/src/autogluon/cloud/backend/ray_backend.py:293: ValueError

The tests on the "1.1.0" build have been running fine.

Expected behavior: test_distributed_training should pass on the "source" build.
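
For debugging: the ValueError above is generic, and the actual error is only visible in the Ray job's driver log. A minimal sketch of pulling that log with Ray's job submission API, assuming the cluster (or its dashboard) is still reachable at the default address; the address and submission id below are placeholders, not values from the CI run:

```python
# Sketch only (not part of the test): fetch the driver log of the failed Ray job,
# since the traceback just says "Please check the log for reason".
# Assumes the ephemeral cluster is still up and its dashboard is port-forwarded
# to the default address; the real submission id is printed when the backend
# submits the job.
from ray.job_submission import JobSubmissionClient

client = JobSubmissionClient("http://127.0.0.1:8265")  # assumed dashboard address

# List recent jobs to find the failed submission id and its status.
for job in client.list_jobs():
    print(job.submission_id, job.status)

# Print the full driver log for the failed job.
print(client.get_job_logs("<submission_id>"))  # replace with the id printed above
```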
