Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions common/buildkite_config.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ struct BuildkiteRunnerGroup
# NOTE: This only works with `linux-sandbox.jl` runners!
num_cpus::Int

# Whether to assign GPUs to each worker (if this is true, then
# `num_agents` must equal `get_num_gpus()`).
assign_gpus::Bool

# The platform that this will run as
platform::Platform

Expand All @@ -46,6 +50,7 @@ function BuildkiteRunnerGroup(name::String, config::Dict; extra_tags::Dict{Strin
tags = get(config, "tags", Dict{String,String}())
start_rootless_docker = get(config, "start_rootless_docker", false)
num_cpus = get(config, "num_cpus", 0)
assign_gpus = get(config, "assign_gpus", false)
platform = parse(Platform, get(config, "platform", triplet(HostPlatform())))
source_image = get(config, "source_image", "")
tempdir_path = get(config, "tempdir", nothing)
Expand Down Expand Up @@ -87,6 +92,7 @@ function BuildkiteRunnerGroup(name::String, config::Dict; extra_tags::Dict{Strin
tags,
start_rootless_docker,
num_cpus,
assign_gpus,
platform,
source_image,
tempdir_path,
Expand Down
1 change: 1 addition & 0 deletions common/common.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ end
if Sys.islinux()
include("linux_systemd_config.jl")
include("linux_sysctl.jl")
include("nvidia.jl")
end
if Sys.isapple()
include("mac_launchctl_config.jl")
Expand Down
6 changes: 6 additions & 0 deletions common/nvidia.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""
    get_num_gpus()

Return the number of NVIDIA GPUs reported by `nvidia-smi --list-gpus`.

Returns `0` when `nvidia-smi` is not found on `PATH`, when it exits with a
non-zero status (a warning is logged in that case), or when it lists no devices.

Only lines of the form `GPU <index>: ...` are counted, so empty output and any
other non-device lines are never mistaken for a GPU.
"""
function get_num_gpus()
    nvidia_smi = Sys.which("nvidia-smi")
    if nvidia_smi === nothing
        return 0
    end

    output = try
        readchomp(`$(nvidia_smi) --list-gpus`)
    catch err
        # Only a failing `nvidia-smi` is treated as "no usable GPUs"; anything
        # else is unexpected and must propagate.
        err isa ProcessFailedException || rethrow()
        @warn "`nvidia-smi --list-gpus` failed; assuming no GPUs are available" exception = err
        return 0
    end

    # `split("", "\n")` yields `[""]`, so counting raw lines would report one
    # GPU for empty output; count only real device entries instead.
    return count(line -> occursin(r"^GPU \d+:", line), split(output, '\n'))
end
18 changes: 18 additions & 0 deletions linux-sandbox.jl/common.jl
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@ function check_configs(brgs::Vector{BuildkiteRunnerGroup})
# Check that the subuid stuff for rootless docker is setup properly
check_rootless_subuid()
end

num_gpus = get_num_gpus()
if brg.assign_gpus && brg.num_agents != num_gpus
throw(ArgumentError("num_agents ($(brg.num_agents)) must equal num_gpus ($(num_gpus))!"))
end
end

# Check that we aren't trying to pin too many cores
Expand Down Expand Up @@ -304,6 +309,19 @@ function Sandbox.SandboxConfig(brg::BuildkiteRunnerGroup;
entrypoint = "/usr/lib/entrypoint"
end

if brg.assign_gpus
# Here we have to do a bit of nastiness; we need the agent index, but it's
# usually wrapped up in `agent_name`. We try our best to unbundle it here,
# making use of the detail that systemd will replace `%i` with the index.
if endswith(agent_name, "-%i")
env_maps["CUDA_VISIBLE_DEVICES"] = "%i"
elseif endswith(agent_name, r"\.\d+")
env_maps["CUDA_VISIBLE_DEVICES"] = split(agent_name, ".")[end]
else
throw(ArgumentError("Cannot auto-determine CUDA_VISIBLE_DEVICES for agent_name $(agent_name)"))
end
end

return SandboxConfig(
ro_maps,
rw_maps,
Expand Down