Skip to content

Commit 1993aab

Browse files
authored
Add DDP component support in GCP Batch
Differential Revision: D42080776. Pull Request resolved: #669.
1 parent cdc9a76 commit 1993aab

File tree

5 files changed

+58
-20
lines changed

5 files changed

+58
-20
lines changed

dev-requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@ boto3==1.20.24
55
captum>=0.4.0
66
flake8==3.9.0
77
fsspec[s3]==2022.1.0
8-
google-api-core>=2.0.1
9-
google-cloud-batch>=0.3.1
8+
google-api-core
9+
google-cloud-batch>=0.5.0
1010
google-cloud-logging>=3.0.0
1111
google-cloud-runtimeconfig>=0.33.2
1212
hydra-core

scripts/component_integration_tests.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def main() -> None:
5151
torchx_image = "dummy_image"
5252
dryrun = False
5353

54-
if scheduler in ("kubernetes", "local_docker", "aws_batch", "lsf"):
54+
if scheduler in ("kubernetes", "local_docker", "aws_batch", "lsf", "gcp_batch"):
5555
try:
5656
build = build_and_push_image()
5757
torchx_image = build.torchx_image
@@ -95,6 +95,13 @@ def main() -> None:
9595
"queue": "torchx",
9696
},
9797
},
98+
"gcp_batch": {
99+
"providers": [
100+
component_provider,
101+
],
102+
"image": torchx_image,
103+
"cfg": {},
104+
},
98105
"ray": {
99106
"providers": [
100107
component_provider,

scripts/gcpbatchint.sh

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ set -ex
99

1010
torchx runopts gcp_batch
1111

12-
APP_ID="$(torchx run --wait --scheduler gcp_batch utils.echo --msg hello)"
12+
APP_ID="$(torchx run --wait --scheduler gcp_batch dist.ddp -j 2x2 --max_retries 3 --script torchx/components/integration_tests/test/dummy_app.py)"
1313
torchx status "$APP_ID"
1414

1515
torchx list -s gcp_batch
@@ -19,3 +19,12 @@ then
1919
echo "expected $APP_ID to be listed"
2020
exit 1
2121
fi
22+
23+
torchx log "$APP_ID"
24+
EXPECTED_MSG="hi from main"
25+
LINES="$(torchx log "$APP_ID" | grep -c "$EXPECTED_MSG")"
26+
if [ "$LINES" -ne 4 ]
27+
then
28+
echo "expected 4 log lines with msg $EXPECTED_MSG"
29+
exit 1
30+
fi

torchx/schedulers/gcp_batch_scheduler.py

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -182,8 +182,7 @@ def _app_to_job(self, app: AppDef) -> "batch_v1.Job":
182182
img_root="",
183183
app_id=name,
184184
replica_id=str(0),
185-
# TODO set value for rank0_env: TORCHX_RANK0_HOST is a place holder for now
186-
rank0_env=("TORCHX_RANK0_HOST"),
185+
rank0_env=("BATCH_MAIN_NODE_HOSTNAME"),
187186
)
188187
role_dict = values.apply(role)
189188
role_dict.env["TORCHX_ROLE_IDX"] = str(role_idx)
@@ -195,14 +194,12 @@ def _app_to_job(self, app: AppDef) -> "batch_v1.Job":
195194
if cpu <= 0:
196195
cpu = 1
197196
MILLI = 1000
198-
# pyre-ignore [8] : pyre gets confused even when types on both sides of = are int
199197
res.cpu_milli = cpu * MILLI
200198
memMB = resource.memMB
201199
if memMB < 0:
202200
raise ValueError(
203201
f"memMB should to be set to a positive value, got {memMB}"
204202
)
205-
# pyre-ignore [8] : pyre gets confused even when types on both sides of = are int
206203
res.memory_mib = memMB
207204

208205
# TODO support named resources
@@ -226,24 +223,40 @@ def _app_to_job(self, app: AppDef) -> "batch_v1.Job":
226223
)
227224
print(f"Using GPUs of type: {machineType}")
228225

226+
# Configure host firewall rules to accept ingress communication
227+
config_network_runnable = batch_v1.Runnable(
228+
script=batch_v1.Runnable.Script(
229+
text="/sbin/iptables -A INPUT -j ACCEPT"
230+
)
231+
)
232+
229233
runnable = batch_v1.Runnable(
230234
container=batch_v1.Runnable.Container(
231235
image_uri=role_dict.image,
232236
commands=[role_dict.entrypoint] + role_dict.args,
233237
entrypoint="",
238+
# Configure docker to use the host network stack to communicate with containers/other hosts in the same network
239+
options="--net host",
234240
)
235241
)
236242

237243
ts = batch_v1.TaskSpec(
238-
runnables=[runnable],
244+
runnables=[config_network_runnable, runnable],
239245
environment=batch_v1.Environment(variables=role_dict.env),
240246
max_retry_count=role_dict.max_retries,
241247
compute_resource=res,
242248
)
243249

250+
task_env = [
251+
batch_v1.Environment(variables={"TORCHX_REPLICA_IDX": str(i)})
252+
for i in range(role_dict.num_replicas)
253+
]
254+
244255
tg = batch_v1.TaskGroup(
245256
task_spec=ts,
246257
task_count=role_dict.num_replicas,
258+
task_count_per_node=1,
259+
task_environments=task_env,
247260
require_hosts_file=True,
248261
)
249262
taskGroups.append(tg)
@@ -338,37 +351,34 @@ def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
338351
return None
339352

340353
gpu = 0
341-
# pyre-fixme [16]: Pyre doesn't properly infer job field types
342354
if len(job.allocation_policy.instances) != 0:
343355
gpu_type = job.allocation_policy.instances[0].policy.machine_type
344356
gpu = GPU_TYPE_TO_COUNT[gpu_type]
345357

346358
roles = {}
347-
# pyre-fixme [16]: Pyre doesn't properly infer job field types
348359
for tg in job.task_groups:
349360
env = tg.task_spec.environment.variables
350361
role = env["TORCHX_ROLE_NAME"]
351-
container = tg.task_spec.runnables[0].container
362+
container = tg.task_spec.runnables[1].container
352363
roles[role] = Role(
353364
name=role,
354365
num_replicas=tg.task_count,
355366
image=container.image_uri,
356367
entrypoint=container.commands[0],
357-
args=container.commands[1:],
368+
args=list(container.commands[1:]),
358369
resource=Resource(
359370
cpu=int(tg.task_spec.compute_resource.cpu_milli / 1000),
360371
memMB=tg.task_spec.compute_resource.memory_mib,
361372
gpu=gpu,
362373
),
363-
env=env,
374+
env=dict(env),
364375
max_retries=tg.task_spec.max_retry_count,
365376
)
366377

367378
# Map job -> DescribeAppResponse
368379
# TODO map role/replica status
369380
desc = DescribeAppResponse(
370381
app_id=app_id,
371-
# pyre-fixme [16]: Pyre doesn't properly infer job field types
372382
state=JOB_STATE[job.status.state.name],
373383
roles=list(roles.values()),
374384
)

torchx/schedulers/test/gcp_batch_scheduler_test.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -81,9 +81,7 @@ def test_submit_dryrun(self) -> None:
8181
env["TORCHX_ROLE_NAME"] = "trainer"
8282
env["FOO"] = "bar"
8383
res = batch_v1.ComputeResource()
84-
# pyre-ignore [8] : pyre gets confused even when types on both sides of = are int
8584
res.cpu_milli = 2000
86-
# pyre-ignore [8] : pyre gets confused even when types on both sides of = are int
8785
res.memory_mib = 3000
8886
allocationPolicy = batch_v1.AllocationPolicy(
8987
instances=[
@@ -95,6 +93,9 @@ def test_submit_dryrun(self) -> None:
9593
)
9694
],
9795
)
96+
preRunnable = batch_v1.Runnable(
97+
script=batch_v1.Runnable.Script(text="/sbin/iptables -A INPUT -j ACCEPT")
98+
)
9899
runnable = batch_v1.Runnable(
99100
container=batch_v1.Runnable.Container(
100101
image_uri="pytorch/torchx:latest",
@@ -105,12 +106,13 @@ def test_submit_dryrun(self) -> None:
105106
"--app-id",
106107
"app-name-42",
107108
"--rank0_env",
108-
"TORCHX_RANK0_HOST",
109+
"BATCH_MAIN_NODE_HOSTNAME",
109110
],
111+
options="--net host",
110112
)
111113
)
112114
ts = batch_v1.TaskSpec(
113-
runnables=[runnable],
115+
runnables=[preRunnable, runnable],
114116
environment=batch_v1.Environment(variables=env),
115117
max_retry_count=3,
116118
compute_resource=res,
@@ -119,6 +121,10 @@ def test_submit_dryrun(self) -> None:
119121
tg = batch_v1.TaskGroup(
120122
task_spec=ts,
121123
task_count=1,
124+
task_count_per_node=1,
125+
task_environments=[
126+
batch_v1.Environment(variables={"TORCHX_REPLICA_IDX": "0"})
127+
],
122128
require_hosts_file=True,
123129
)
124130
taskGroups.append(tg)
@@ -261,13 +267,19 @@ def _mock_scheduler(self) -> GCPBatchScheduler:
261267
batch_v1.TaskGroup(
262268
task_spec=batch_v1.TaskSpec(
263269
runnables=[
270+
batch_v1.Runnable(
271+
script=batch_v1.Runnable.Script(
272+
text="/sbin/iptables -A INPUT -j ACCEPT"
273+
)
274+
),
264275
batch_v1.Runnable(
265276
container=batch_v1.Runnable.Container(
266277
image_uri="ghcr.io/pytorch/torchx:0.3.0dev0",
267278
commands=["python"] + ["-c", 'print("hello ")'],
268279
entrypoint="",
280+
options="--net host",
269281
)
270-
)
282+
),
271283
],
272284
compute_resource=batch_v1.ComputeResource(
273285
cpu_milli=8000,

0 commit comments

Comments
 (0)