aws
diff --git a/test/dlc_tests/container_tests/bin/efa/testEFA b/test/dlc_tests/container_tests/bin/efa/testEFA
Lines changed: 1 addition & 1 deletion
diff --git a/test/dlc_tests/ec2/test_efa.py b/test/dlc_tests/ec2/test_efa.py
Lines changed: 10 additions & 4 deletions
diff --git a/test/test_utils/ec2.py b/test/test_utils/ec2.py
Lines changed: 21 additions & 0 deletions
diff --git a/test/testrunner.py b/test/testrunner.py
Lines changed: 4 additions & 4 deletions
@@ -36,7 +36,7 @@ validate_all_reduce_performance_logs(){
     # EFA 1.37.0 using "Using network Libfabric" instead of "Using network AWS Libfabric"
     grep -E "Using network (AWS )?Libfabric" ${TRAINING_LOG} || { echo "efa is not working, please check if it is installed correctly"; exit 1; }
     if [[ ${INSTANCE_TYPE} == p4d* || ${INSTANCE_TYPE} == p5* ]]; then
-        grep "Setting NCCL_TOPO_FILE environment variable to" ${TRAINING_LOG}
+        grep "NCCL_TOPO_FILE set by environment to" ${TRAINING_LOG}
         # EFA 1.37.0 change from NET/AWS Libfabric/0/GDRDMA to NET/Libfabric/0/GDRDMA
         grep -E "NET/(AWS )?Libfabric/0/GDRDMA" ${TRAINING_LOG}
     fi
 
@@ -294,10 +294,16 @@ def _setup_container(connection, docker_image, container_name):
     # using SSH on a pre-defined port (as decided by sshd_config on server-side).
     # Allow instance to share all memory with container using memlock=-1:-1.
     # Share all EFA devices with container using --device <device_location> for all EFA devices.
-    connection.run(
-        f"docker run --runtime=nvidia --gpus all -id --name {container_name} --network host --ulimit memlock=-1:-1 "
-        f"{docker_all_devices_arg} -v $HOME/container_tests:/test -v /dev/shm:/dev/shm {docker_image} bash"
-    )
+    if "vllm" in docker_image:
+        connection.run(
+            f"docker run --entrypoint=/bin/bash -e CUDA_HOME=/usr/local/cuda --runtime=nvidia --gpus all -id --name {container_name} --network host --ulimit memlock=-1:-1 "
+            f"{docker_all_devices_arg} -v $HOME/container_tests:/test -v /dev/shm:/dev/shm {docker_image}"
+        )
+    else:
+        connection.run(
+            f"docker run --runtime=nvidia --gpus all -id --name {container_name} --network host --ulimit memlock=-1:-1 "
+            f"{docker_all_devices_arg} -v $HOME/container_tests:/test -v /dev/shm:/dev/shm {docker_image} bash"
+        )
 
 
 def _setup_master_efa_ssh_config(connection):
 
@@ -1817,6 +1817,27 @@ def get_default_subnet_for_az(ec2_client, availability_zone):
     return az_subnet_id
 
 
+def get_subnet_id_by_vpc(ec2_client, vpc_id):
+    """Return a list of the SubnetId values that describe_subnets reports for the VPC vpc_id."""
+    response = ec2_client.describe_subnets(
+        Filters=[
+            {
+                "Name": "vpc-id",
+                "Values": [
+                    vpc_id,
+                ],
+            },
+        ],
+    )
+    # NOTE(review): only this one response is read (no NextToken handling) - confirm one page is enough.
+    subnet_ids = []
+    for subnet in response["Subnets"]:
+        if subnet["SubnetId"] is not None:
+            subnet_ids.append(subnet["SubnetId"])
+    # Despite the singular function name this is a list; it is empty when "Subnets" is empty.
+    return subnet_ids
+
+
 def get_vpc_id_by_name(ec2_client, vpc_name):
     """
     Get VPC ID by VPC name tag
 
@@ -410,7 +410,7 @@ def main():
             pull_dlc_images(all_image_list)
         if specific_test_type == "bai":
             build_bai_docker_container()
-        if specific_test_type == "eks" and not is_all_images_list_eia:
+        if specific_test_type in ["eks", "ec2"] and not is_all_images_list_eia:
             frameworks_in_images = [
                 framework
                 for framework in ("mxnet", "pytorch", "tensorflow", "vllm")
@@ -425,13 +425,13 @@ def main():
 
             if framework == "vllm":
                 try:
-                    LOGGER.info(f"Running vLLM EKS tests with image: {all_image_list[0]}")
+                    LOGGER.info(f"Running vLLM EKS EC2 tests with image: {all_image_list[0]}")
                     test()
-                    LOGGER.info("vLLM EKS tests completed successfully")
+                    LOGGER.info("vLLM EKS EC2 tests completed successfully")
                     # Exit function after vLLM tests
                     return
                 except Exception as e:
-                    LOGGER.error(f"vLLM EKS tests failed: {str(e)}")
+                    LOGGER.error(f"vLLM EKS EC2 tests failed: {str(e)}")
                     raise
 
             eks_cluster_name = f"dlc-{framework}-{build_context}"