From 39f0e2f8c94948f32435632af5fc581e9a05b546 Mon Sep 17 00:00:00 2001 From: Elliot Saba Date: Tue, 28 Nov 2023 20:34:17 +0000 Subject: [PATCH] Add ability to assign GPUs to workers This is intended to be used as a separate buildkite runner group where `num_agents` is equal to `num_gpus`. --- common/buildkite_config.jl | 6 ++++++ common/common.jl | 1 + common/nvidia.jl | 6 ++++++ linux-sandbox.jl/common.jl | 18 ++++++++++++++++++ 4 files changed, 31 insertions(+) create mode 100644 common/nvidia.jl diff --git a/common/buildkite_config.jl b/common/buildkite_config.jl index aafae47..bfa136b 100644 --- a/common/buildkite_config.jl +++ b/common/buildkite_config.jl @@ -21,6 +21,10 @@ struct BuildkiteRunnerGroup # NOTE: This only works with `linux-sandbox.jl` runners! num_cpus::Int + # Whether to assign GPUs to each worker (if this is true, then + # `num_agents` must equal `get_num_gpus()`). + assign_gpus::Bool + # The platform that this will run as platform::Platform @@ -46,6 +50,7 @@ function BuildkiteRunnerGroup(name::String, config::Dict; extra_tags::Dict{Strin tags = get(config, "tags", Dict{String,String}()) start_rootless_docker = get(config, "start_rootless_docker", false) num_cpus = get(config, "num_cpus", 0) + assign_gpus = get(config, "assign_gpus", false) platform = parse(Platform, get(config, "platform", triplet(HostPlatform()))) source_image = get(config, "source_image", "") tempdir_path = get(config, "tempdir", nothing) @@ -87,6 +92,7 @@ function BuildkiteRunnerGroup(name::String, config::Dict; extra_tags::Dict{Strin tags, start_rootless_docker, num_cpus, + assign_gpus, platform, source_image, tempdir_path, diff --git a/common/common.jl b/common/common.jl index 2e811ed..8f31a19 100644 --- a/common/common.jl +++ b/common/common.jl @@ -11,6 +11,7 @@ end if Sys.islinux() include("linux_systemd_config.jl") include("linux_sysctl.jl") + include("nvidia.jl") end if Sys.isapple() include("mac_launchctl_config.jl") diff --git a/common/nvidia.jl b/common/nvidia.jl 
new file mode 100644
index 0000000..db68002
--- /dev/null
+++ b/common/nvidia.jl
@@ -0,0 +1,6 @@
+# Count the NVIDIA GPUs listed by `nvidia-smi --list-gpus`; 0 if the tool is not on PATH.
+# Only lines starting with "GPU " count, and a failing exit is ignored rather than thrown.
+function get_num_gpus()
+    Sys.which("nvidia-smi") === nothing && return 0
+    return count(startswith("GPU "), eachline(ignorestatus(`nvidia-smi --list-gpus`)))
+end
diff --git a/linux-sandbox.jl/common.jl b/linux-sandbox.jl/common.jl index 8d2930a..9ad030c 100644 --- a/linux-sandbox.jl/common.jl +++ b/linux-sandbox.jl/common.jl @@ -23,6 +23,11 @@ function check_configs(brgs::Vector{BuildkiteRunnerGroup}) # Check that the subuid stuff for rootless docker is setup properly check_rootless_subuid() end + + num_gpus = get_num_gpus() + if brg.assign_gpus && brg.num_agents != num_gpus + throw(ArgumentError("num_agents ($(brg.num_agents)) must equal num_gpus ($(num_gpus))!")) + end end # Check that we aren't trying to pin too many cores @@ -304,6 +309,19 @@ function Sandbox.SandboxConfig(brg::BuildkiteRunnerGroup; entrypoint = "/usr/lib/entrypoint" end + if brg.assign_gpus + # Here we have to do a bit of nastiness; we need the agent index, but it's + # usually wrapped up in `agent_name`. We try our best to unbundle it here, + # making use of the detail that systemd will replace `%i` with the index. + if endswith(agent_name, "-%i") + env_maps["CUDA_VISIBLE_DEVICES"] = "%i" + elseif endswith(agent_name, r"\.\d+") + env_maps["CUDA_VISIBLE_DEVICES"] = split(agent_name, ".")[end] + else + throw(ArgumentError("Cannot auto-determine CUDA_VISIBLE_DEVICES for agent_name $(agent_name)")) + end + end + return SandboxConfig( ro_maps, rw_maps,