Skip to content

Commit 362173a

Browse files
Adding EC2 tests on vLLM DLC (#4986)
* add ec2 * testing vllm route * fixed error in trigger_test * added new dir for vllm test and infra * commented out test runner * trigger ec2 * create ec2 * change region * adding fsx * adding fsx * create func for subnet id * print statements * print statements * add more delete functionalities * fix ingress rules * make dg is list * modify egress and ingress * add ingress and egress rules * add setup_script * add setup_script * fixed path * fix sg re ordering * fix sg and fsx * fix sg and fsx * fix sg and fsx * fixed hf token error * fix error with sg and fsx mount * commented out cleanup code * refactor setup() and fsx_utils sg * fix sg * modify sg creation * add self * setup instances failure * fix deletion of sg * remove version * remove command * adding test-runner path and actual single node test * added secret key for hf * fixed import * rename fn' * testspec use trigger_test: * fix import * fix errors * add cleanup logic * increase time out * change region * changed it back to us-west-2 * modified test for single node * modified test for single node * changes to test to use script * retrigger tst * remove nvjpeg * remove nvjpeg * add logs * fix script to pass arguments * fix script to pass arguments * fix string * fix string * test ec2 * remove unused code * add multinode * fixed connection * fix connection * fix path * fix comand * retrigger ec2 * increase wait time and add fsx version * fix fsx command * fix names * fix dir * fix vllm dir and add log * fix git clone * fix git url * fix path * increase max attempts * fixed paths * added more fixes * sleep * setup instance one at a time * create diff fsx and sg for another instance * add conda installer * create conda env * conda accept tps * fix sg and multinode * add packages * add venv vllm_env * add venv vllm_env * fixed vllm venv * fixed transformrs isntallation * fix cleanup logic * add timer * increase timer * run single node * multinode test * add packages * test ec2 * activate venv * add vllm 
serve * increase cleanup timer * test multinode * test mutlinode * test multinode * test multinode * test multinode * retest * retest * retest * retest * retest * retest single node * test * test single node * test efa and nccl * test efa and nccl multinode * test efa and nccl * add sleep timer * test efa * test efa * test efa * test efa * added print statements * debug efa * debug efa * test vllm openai server * test vllm openai server * test vllm openai server * add cleanup for address allocation exception * add ingress rule * retest sg * add sleep timer for debugging * test efa * test efa * test efa * revert efa * test single node * test single node * test single node * make ipv6 true * run efa ipv6 * run efa ipv6 * modify instance setup * test multinode and efa * test multinode and efa * modify instance setup * modify instance setup * test efa * revamp ec2 * revamp ec2 * add test tunner * add ec2 in test runner * add ec2 elasticip cleanup * test efa * fix error in efa_ec2 print statement * fix error in efa_ec2 print statement * fix path of setup_fsx * retest ec2 * retest ec2 * add condition to skip chdir * change dir * test ec2 * test ec2 * test efa' * test multinode * test multinode * test multinode * test multinode * test multinode * test multinode * test multinode * test multinode * test multinode tmux * test multinode tmux * remove timer * remove tmux from worker * test multinode * test multinode * test efa and multinode * test multinode * test efa * test efa * fix key pair logic * test efa * test efa * test efa and test multinode * test efa and test multinode * test efa and test multinode * test efa and test multinode * test multinode * test multinode * test multinode * increased max attempts * run efa and multinode * test efa and multinode * test efa and multinode * test efa and multinode * add single script * test multinode * test multinode * test multinode * test multinode * add delay * add delay and model ready waiter * add async * test enforce eager * 
add timer to test: * test multinode * test multinode * Test all methods * test methods * test methods * test methods * test methods * test single node and multinode * test single node and multinode * test single node and multinode * test single node and multinode * test single node and multinode * test single node and multinode * test single node and multinode * test single node and multinode * test multinode * test multinode * add more storage capacity * test multinode * test multinode * test multinode * retest * retest * retest * retest * retest * retest * retest * retest * test multinode * test multinode * test multinode * add sleep * add sleep * revert toml * final cleanup * final cleanup * final cleanup
1 parent 1ae3d79 commit 362173a

File tree

13 files changed

+1530
-12
lines changed

13 files changed

+1530
-12
lines changed

test/dlc_tests/container_tests/bin/efa/testEFA

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ validate_all_reduce_performance_logs(){
3636
# EFA 1.37.0 using "Using network Libfabric" instead of "Using network AWS Libfabric"
3737
grep -E "Using network (AWS )?Libfabric" ${TRAINING_LOG} || { echo "efa is not working, please check if it is installed correctly"; exit 1; }
3838
if [[ ${INSTANCE_TYPE} == p4d* || ${INSTANCE_TYPE} == p5* ]]; then
39-
grep "Setting NCCL_TOPO_FILE environment variable to" ${TRAINING_LOG}
39+
grep "NCCL_TOPO_FILE set by environment to" ${TRAINING_LOG}
4040
# EFA 1.37.0 change from NET/AWS Libfabric/0/GDRDMA to NET/Libfabric/0/GDRDMA
4141
grep -E "NET/(AWS )?Libfabric/0/GDRDMA" ${TRAINING_LOG}
4242
fi

test/dlc_tests/ec2/test_efa.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -294,10 +294,16 @@ def _setup_container(connection, docker_image, container_name):
294294
# using SSH on a pre-defined port (as decided by sshd_config on server-side).
295295
# Allow instance to share all memory with container using memlock=-1:-1.
296296
# Share all EFA devices with container using --device <device_location> for all EFA devices.
297-
connection.run(
298-
f"docker run --runtime=nvidia --gpus all -id --name {container_name} --network host --ulimit memlock=-1:-1 "
299-
f"{docker_all_devices_arg} -v $HOME/container_tests:/test -v /dev/shm:/dev/shm {docker_image} bash"
300-
)
297+
if "vllm" in docker_image:
298+
connection.run(
299+
f"docker run --entrypoint=/bin/bash -e CUDA_HOME=/usr/local/cuda --runtime=nvidia --gpus all -id --name {container_name} --network host --ulimit memlock=-1:-1 "
300+
f"{docker_all_devices_arg} -v $HOME/container_tests:/test -v /dev/shm:/dev/shm {docker_image}"
301+
)
302+
else:
303+
connection.run(
304+
f"docker run --runtime=nvidia --gpus all -id --name {container_name} --network host --ulimit memlock=-1:-1 "
305+
f"{docker_all_devices_arg} -v $HOME/container_tests:/test -v /dev/shm:/dev/shm {docker_image} bash"
306+
)
301307

302308

303309
def _setup_master_efa_ssh_config(connection):

test/test_utils/ec2.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1817,6 +1817,27 @@ def get_default_subnet_for_az(ec2_client, availability_zone):
18171817
return az_subnet_id
18181818

18191819

1820+
def get_subnet_id_by_vpc(ec2_client, vpc_id):
    """
    Get the IDs of all subnets that belong to a VPC

    :param ec2_client: EC2 client object exposing describe_subnets (e.g. a boto3 EC2 client)
    :param vpc_id: str ID of the VPC used for the "vpc-id" filter
    :return: list of subnet ID strings; empty list when no subnet matches
    """
    request_kwargs = {
        "Filters": [
            {
                "Name": "vpc-id",
                "Values": [
                    vpc_id,
                ],
            },
        ],
    }

    subnet_ids = []
    while True:
        response = ec2_client.describe_subnets(**request_kwargs)

        # Use .get so a missing key is skipped instead of raising KeyError;
        # entries without a usable SubnetId are ignored, as before.
        subnet_ids.extend(
            subnet["SubnetId"]
            for subnet in response.get("Subnets", [])
            if subnet.get("SubnetId") is not None
        )

        # Results may be split across pages; keep requesting until the
        # service stops returning a continuation token.
        next_token = response.get("NextToken")
        if not next_token:
            return subnet_ids
        request_kwargs["NextToken"] = next_token
1839+
1840+
18201841
def get_vpc_id_by_name(ec2_client, vpc_name):
18211842
"""
18221843
Get VPC ID by VPC name tag

test/testrunner.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,7 @@ def main():
410410
pull_dlc_images(all_image_list)
411411
if specific_test_type == "bai":
412412
build_bai_docker_container()
413-
if specific_test_type == "eks" and not is_all_images_list_eia:
413+
if specific_test_type in ["eks", "ec2"] and not is_all_images_list_eia:
414414
frameworks_in_images = [
415415
framework
416416
for framework in ("mxnet", "pytorch", "tensorflow", "vllm")
@@ -425,13 +425,13 @@ def main():
425425

426426
if framework == "vllm":
427427
try:
428-
LOGGER.info(f"Running vLLM EKS tests with image: {all_image_list[0]}")
428+
LOGGER.info(f"Running vLLM EKS EC2 tests with image: {all_image_list[0]}")
429429
test()
430-
LOGGER.info("vLLM EKS tests completed successfully")
430+
LOGGER.info("vLLM EKS EC2 tests completed successfully")
431431
# Exit function after vLLM tests
432432
return
433433
except Exception as e:
434-
LOGGER.error(f"vLLM EKS tests failed: {str(e)}")
434+
LOGGER.error(f"vLLM EKS EC2 tests failed: {str(e)}")
435435
raise
436436

437437
eks_cluster_name = f"dlc-{framework}-{build_context}"

0 commit comments

Comments
 (0)