`test_cluster` on the "source" build has been failing on CI ([example run](https://github.com/autogluon/autogluon-cloud/actions/runs/13054240952/job/36421326686)):
```
=================================== FAILURES ===================================
__________________________ test_distributed_training ___________________________

test_helper = <class 'conftest.CloudTestHelper'>, framework_version = 'source'

    def test_distributed_training(test_helper, framework_version):
        with tempfile.TemporaryDirectory() as temp_dir:
            os.chdir(temp_dir)
            timestamp = test_helper.get_utc_timestamp_now()
            cp = TabularCloudPredictor(
                cloud_output_path=f"s3://autogluon-cloud-ci/test-tabular-distributed/{framework_version}/{timestamp}",
                local_output_path="test_tabular_distributed",
                backend="ray_aws",
            )
            train_data = pd.read_csv("https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv")
            subsample_size = 1000
            if subsample_size is not None and subsample_size < len(train_data):
                train_data = train_data.sample(n=subsample_size, random_state=0)
            predictor_init_args = {"label": "class"}
            predictor_fit_args = {
                "train_data": train_data,
                "hyperparameters": {
                    "GBM": {"num_leaves": space.Int(lower=26, upper=66, default=36)},
                },
                "num_bag_folds": 2,
                "num_bag_sets": 1,
                "hyperparameter_tune_kwargs": {  # HPO is not performed unless hyperparameter_tune_kwargs is specified
                    "num_trials": 2,
                    "scheduler": "local",
                    "searcher": "auto",
                },
            }
            image_uri = test_helper.get_custom_image_uri(framework_version, type="training", gpu=False)
>           cp.fit(
                predictor_init_args=predictor_init_args,
                predictor_fit_args=predictor_fit_args,
                custom_image_uri=image_uri,
                framework_version=framework_version,
                backend_kwargs={
                    "initialization_commands": [
                        "aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com",
                        "aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 369469875935.dkr.ecr.us-east-1.amazonaws.com",
                    ]
                },
            )

/home/runner/work/autogluon-cloud/autogluon-cloud/tests/unittests/cluster/test_distributed_training.py:43:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/home/runner/work/autogluon-cloud/autogluon-cloud/src/autogluon/cloud/predictor/cloud_predictor.py:273: in fit
    self.backend.fit(
/home/runner/work/autogluon-cloud/autogluon-cloud/src/autogluon/cloud/backend/ray_backend.py:296: in fit
    raise e
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <autogluon.cloud.backend.ray_aws_backend.TabularRayAWSBackend object at 0x7f5c98c3f7c0>

        logger.log(20, "Waiting for 60s to give the cluster some buffer time")
        time.sleep(60)
        cluster_manager.setup_connection()
        time.sleep(10)  # waiting for connection to setup
        if job_name is None:
            job_name = CLOUD_RESOURCE_PREFIX + "-" + get_utc_timestamp_now()
        job = RayFitJob(output_path=self.cloud_output_path + "/model")
        self._fit_job = job

        entry_point_command = f"python3 {os.path.basename(train_script)} --ag_args_path {os.path.basename(ag_args_path)} --train_data {train_data} --model_output_path {self.get_fit_job_output_path()} --ray_job_id {job_name}"  # noqa: E501
        if tune_data is not None:
            entry_point_command += f" --tune_data {tune_data}"
        if leaderboard:
            entry_point_command += " --leaderboard"
        if not wait and ephemeral_cluster:
            entry_point_command += f" --cluster_config_file {os.path.basename(config)}"
        job.run(
            entry_point=entry_point_command,
            runtime_env={
                "working_dir": job_path,
                "env_vars": {
                    "AG_DISTRIBUTED_MODE": "1",
                    "AG_MODEL_SYNC_PATH": f"{self.cloud_output_path}/model_sync/",
                    "AG_UTIL_PATH": f"{self.cloud_output_path}/utils/",
                    "AG_NUM_NODES": str(instance_count),
                    # TODO: update syncing logic in tabular https://github.com/ray-project/ray/pull/37142
                    "RAY_AIR_REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE": "1",
                },
            },
            job_name=job_name,
            timeout=timeout,
            wait=wait,
        )
        job_submitted = True
        if wait and job.get_job_status() != "SUCCEEDED":
>           raise ValueError("Training job failed. Please check the log for reason.")
E           ValueError: Training job failed. Please check the log for reason.

/home/runner/work/autogluon-cloud/autogluon-cloud/src/autogluon/cloud/backend/ray_backend.py:293: ValueError
```
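Note that the `ValueError` only reports that the submitted Ray job did not reach `SUCCEEDED`; the underlying failure reason is in the Ray job's driver log on the cluster, not in this traceback. A minimal sketch for pulling that log with the Ray Jobs CLI, assuming the cluster is still up and its dashboard is reachable on Ray's default port 8265 (the head node address and job id below are placeholders, and the cluster may already be torn down if it was ephemeral):

```bash
# Substitute the real head node IP and the job name printed during the run
# (CLOUD_RESOURCE_PREFIX + timestamp in the backend code above).
ray job logs <ray-job-id> --address http://<head-node-ip>:8265
```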
The tests on the "1.1.0" build have been running fine.

Expected behavior: `test_distributed_training` should pass on the "source" build.
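For local debugging, a rough reproduction sketch, assuming AWS credentials with access to the `autogluon-cloud-ci` bucket and the ECR images above; how the `framework_version` fixture selects "source" vs "1.1.0" is defined in the test suite's conftest and is not spelled out here:

```bash
# Run only the failing cluster test (path taken from the traceback above).
python -m pytest -s tests/unittests/cluster/test_distributed_training.py::test_distributed_training
```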