Skip to content

Commit 6c835fd

Browse files
yongbinfeng authored and kpedro88 committed
add podman support
1 parent 8a10d7c commit 6c835fd

File tree

5 files changed

+84
-17
lines changed

5 files changed

+84
-17
lines changed

HeterogeneousCore/SonicTriton/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ The script has three operations (`start`, `stop`, `check`) and the following opt
132132
* `-c`: don't cleanup temporary dir (for debugging)
133133
* `-C [dir]`: directory containing Nvidia compatibility drivers (checks CMSSW_BASE by default if available)
134134
* `-D`: dry run: print container commands rather than executing them
135-
* `-d`: use Docker instead of Apptainer
135+
* `-d [exe]`: container choice: Apptainer, Docker, Podman (default: apptainer)
136136
* `-E [path]`: include extra path(s) for executables (default: /cvmfs/oasis.opensciencegrid.org/mis/apptainer/current/bin)
137137
* `-f`: force reuse of (possibly) existing container instance
138138
* `-g [device]`: device choice: auto (try to detect GPU), CPU, GPU (default: auto)
@@ -200,8 +200,8 @@ The fallback server has a separate set of options, mostly related to the invocat
200200
* `enable`: enable the fallback server
201201
* `debug`: enable debugging (equivalent to `-c` in `cmsTriton`)
202202
* `verbose`: enable verbose output in logs (equivalent to `-v` in `cmsTriton`)
203-
* `useDocker`: use Docker instead of Apptainer (equivalent to `-d` in `cmsTriton`)
204-
* `useGPU`: run on local GPU (equivalent to `-g` in `cmsTriton`)
203+
* `container`: container choice (equivalent to `-d` in `cmsTriton`)
204+
* `device`: device choice (equivalent to `-g` in `cmsTriton`)
205205
* `retries`: number of retries when starting container (passed to `-r [num]` in `cmsTriton` if >= 0; default: -1)
206206
* `wait`: maximum time to wait for server to start (passed to `-w time` in `cmsTriton` if >= 0; default: -1)
207207
* `instanceBaseName`: base name for server instance if random names are enabled (default: triton_server_instance)

HeterogeneousCore/SonicTriton/interface/TritonService.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ class TritonService {
3636
: enable(pset.getUntrackedParameter<bool>("enable")),
3737
debug(pset.getUntrackedParameter<bool>("debug")),
3838
verbose(pset.getUntrackedParameter<bool>("verbose")),
39-
useDocker(pset.getUntrackedParameter<bool>("useDocker")),
39+
container(pset.getUntrackedParameter<std::string>("container")),
4040
device(pset.getUntrackedParameter<std::string>("device")),
4141
retries(pset.getUntrackedParameter<int>("retries")),
4242
wait(pset.getUntrackedParameter<int>("wait")),
@@ -54,7 +54,7 @@ class TritonService {
5454
bool enable;
5555
bool debug;
5656
bool verbose;
57-
bool useDocker;
57+
std::string container;
5858
std::string device;
5959
int retries;
6060
int wait;

HeterogeneousCore/SonicTriton/scripts/cmsTriton

Lines changed: 73 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/bin/bash
22

33
# defaults
4-
USEDOCKER=""
4+
CONTAINER=apptainer
55
VERBOSE=""
66
VERBOSE_ARGS="--log-verbose=1 --log-error=1 --log-warning=1 --log-info=1"
77
WTIME=600
@@ -42,7 +42,7 @@ usage() {
4242
$ECHO "-c \t don't cleanup temporary dir (for debugging)"
4343
$ECHO "-C [dir] \t directory containing Nvidia compatibility drivers (checks CMSSW_BASE by default if available)"
4444
$ECHO "-D \t dry run: print container commands rather than executing them"
45-
$ECHO "-d \t use Docker instead of Apptainer"
45+
$ECHO "-d [exe] \t container choice: Apptainer, Docker, Podman (default: ${CONTAINER})"
4646
$ECHO "-E [path] \t include extra path(s) for executables (default: ${EXTRAPATH})"
4747
$ECHO "-f \t force reuse of (possibly) existing container instance"
4848
$ECHO "-g [device] \t device choice: auto (try to detect GPU), CPU, GPU (default: ${DEVICE})"
@@ -73,15 +73,15 @@ if [ -e /run/shm ]; then
7373
SHM=/run/shm
7474
fi
7575

76-
while getopts "cC:Ddfg:i:I:M:m:n:P:p:r:s:t:vw:h" opt; do
76+
while getopts "cC:Dd:fg:i:I:M:m:n:P:p:r:s:t:vw:h" opt; do
7777
case "$opt" in
7878
c) CLEANUP=""
7979
;;
8080
C) COMPAT_USR="$OPTARG"
8181
;;
8282
D) DRYRUN=echo
8383
;;
84-
d) USEDOCKER=true
84+
d) CONTAINER="$OPTARG"
8585
;;
8686
f) FORCE=true
8787
;;
@@ -130,6 +130,13 @@ if [[ ! " auto cpu gpu " =~ " $DEVICE " ]]; then
130130
exit 1
131131
fi
132132

133+
# check acceptable values for container choice
134+
CONTAINER="${CONTAINER,,}"
135+
if [[ ! " apptainer docker podman " =~ " $CONTAINER " ]]; then
136+
echo "Unsupported container value: $CONTAINER"
137+
exit 1
138+
fi
139+
133140
if [ "$RETRIES" -le 0 ]; then
134141
RETRIES=1
135142
fi
@@ -147,9 +154,13 @@ if [ -n "$EXTRAPATH" ]; then
147154
fi
148155

149156
# find executables
150-
if [ -n "$USEDOCKER" ]; then
157+
if [ "$CONTAINER" == "docker" ]; then
151158
if [ -z "$DOCKER" ]; then
152-
DOCKER="sudo docker"
159+
DOCKER="docker"
160+
fi
161+
elif [ "$CONTAINER" == "podman" ]; then
162+
if [ -z "$PODMAN" ]; then
163+
PODMAN="podman"
153164
fi
154165
else
155166
if [ -z "$APPTAINER" ]; then
@@ -246,6 +257,29 @@ start_docker(){
246257
${IMAGE} tritonserver $PORTARGS $REPOARGS $VERBOSE
247258
}
248259

260+
start_podman(){
261+
# mount all model repositories
262+
MOUNTARGS=""
263+
REPOARGS=""
264+
for REPO in ${REPOS[@]}; do
265+
MOUNTARGS="$MOUNTARGS --volume $REPO:$REPO"
266+
REPOARGS="$REPOARGS --model-repository=${REPO}"
267+
done
268+
269+
# compatibility driver environment
270+
if [ -n "$COMPAT" ]; then
271+
MOUNTARGS="$MOUNTARGS --volume $COMPAT"
272+
if [ -n "$COMPAT_SCRIPT_MOUNT" ]; then
273+
MOUNTARGS="$MOUNTARGS --volume $COMPAT_SCRIPT_MOUNT"
274+
fi
275+
fi
276+
277+
$DRYRUN $PODMAN run -d --name ${SERVER} \
278+
--shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \
279+
-p${HTTPPORT}:${HTTPPORT} -p${GRPCPORT}:${GRPCPORT} -p${METRPORT}:${METRPORT} $EXTRA $MOUNTARGS \
280+
${IMAGE} tritonserver $PORTARGS $REPOARGS $VERBOSE
281+
}
282+
249283
start_apptainer(){
250284
# triton server image may need to modify contents of opt/tritonserver/lib/
251285
# but cvmfs is read-only
@@ -305,6 +339,16 @@ stop_docker(){
305339
$DRYRUN $DOCKER rm ${SERVER}
306340
}
307341

342+
stop_podman(){
343+
# keep log
344+
if [ -z "$DRYRUN" ]; then
345+
if [ -n "$VERBOSE" ]; then $PODMAN logs ${SERVER} >& "$LOG"; fi
346+
fi
347+
348+
$DRYRUN $PODMAN stop ${SERVER}
349+
$DRYRUN $PODMAN rm ${SERVER}
350+
}
351+
308352
stop_apptainer(){
309353
$DRYRUN $APPTAINER instance stop ${SERVER}
310354
}
@@ -314,6 +358,11 @@ test_docker(){
314358
${DOCKER} logs ${SERVER} |& grep "$1"
315359
}
316360

361+
test_podman(){
362+
# podman logs print to stdout
363+
${PODMAN} logs ${SERVER} | grep "$1"
364+
}
365+
317366
test_apptainer(){
318367
grep "$1" $LOG
319368
}
@@ -462,6 +511,10 @@ driver_docker(){
462511
$DOCKER run --rm --entrypoint env ${IMAGE} | grep "CUDA_DRIVER_VERSION="
463512
}
464513

514+
driver_podman(){
515+
$PODMAN run --rm --entrypoint env ${IMAGE} | grep "CUDA_DRIVER_VERSION="
516+
}
517+
465518
driver_apptainer(){
466519
D2S=${SANDBOX}/.singularity.d/env/10-docker2singularity.sh
467520
if [ -f "$D2S" ]; then
@@ -536,20 +589,33 @@ extra_docker(){
536589
EXTRA="--gpus all"
537590
fi
538591
}
592+
extra_podman(){
593+
if [ "$DEVICE" == gpu ]; then
594+
EXTRA="--device nvidia.com/gpu=all"
595+
fi
596+
}
539597
extra_apptainer(){
540598
if [ "$DEVICE" == gpu ]; then
541599
EXTRA="--nv"
542600
fi
543601
}
544602

545-
if [ -n "$USEDOCKER" ]; then
603+
if [ "$CONTAINER" == "docker" ]; then
546604
START_FN=start_docker
547605
EXTRA_FN=extra_docker
548606
TEST_FN=test_docker
549607
STOP_FN=stop_docker
550608
DRIVER_FN=driver_docker
551609
COMPAT_FN=compat_docker
552610
PROG_NAME=Docker
611+
elif [ "$CONTAINER" == "podman" ]; then
612+
START_FN=start_podman
613+
EXTRA_FN=extra_podman
614+
TEST_FN=test_podman
615+
STOP_FN=stop_podman
616+
DRIVER_FN=driver_podman
617+
COMPAT_FN=compat_podman
618+
PROG_NAME=Podman
553619
else
554620
START_FN=start_apptainer
555621
EXTRA_FN=extra_apptainer

HeterogeneousCore/SonicTriton/src/TritonService.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -273,12 +273,11 @@ void TritonService::preBeginJob(edm::PathsAndConsumesOfModulesBase const&, edm::
273273
//assemble server start command
274274
fallbackOpts_.command = "cmsTriton -P -1 -p " + pid_;
275275
fallbackOpts_.command += " -g " + fallbackOpts_.device;
276+
fallbackOpts_.command += " -d " + fallbackOpts_.container;
276277
if (fallbackOpts_.debug)
277278
fallbackOpts_.command += " -c";
278279
if (fallbackOpts_.verbose)
279280
fallbackOpts_.command += " -v";
280-
if (fallbackOpts_.useDocker)
281-
fallbackOpts_.command += " -d";
282281
if (!fallbackOpts_.instanceName.empty())
283282
fallbackOpts_.command += " -n " + fallbackOpts_.instanceName;
284283
if (fallbackOpts_.retries >= 0)
@@ -434,7 +433,8 @@ void TritonService::fillDescriptions(edm::ConfigurationDescriptions& description
434433
fallbackDesc.addUntracked<bool>("enable", false);
435434
fallbackDesc.addUntracked<bool>("debug", false);
436435
fallbackDesc.addUntracked<bool>("verbose", false);
437-
fallbackDesc.addUntracked<bool>("useDocker", false);
436+
fallbackDesc.ifValue(edm::ParameterDescription<std::string>("container", "apptainer", false),
437+
edm::allowedValues<std::string>("apptainer", "docker", "podman"));
438438
fallbackDesc.ifValue(edm::ParameterDescription<std::string>("device", "auto", false),
439439
edm::allowedValues<std::string>("auto", "cpu", "gpu"));
440440
fallbackDesc.addUntracked<int>("retries", -1);

HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
allowed_modes = ["Async","PseudoAsync","Sync"]
1616
allowed_compression = ["none","deflate","gzip"]
1717
allowed_devices = ["auto","cpu","gpu"]
18+
allowed_containers = ["apptainer","docker","podman"]
1819

1920
parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
2021
parser.add_argument("--maxEvents", default=-1, type=int, help="Number of events to process (-1 for all)")
@@ -42,7 +43,7 @@
4243
parser.add_argument("--compression", default="", type=str, choices=allowed_compression, help="enable I/O compression")
4344
parser.add_argument("--ssl", default=False, action="store_true", help="enable SSL authentication for server communication")
4445
parser.add_argument("--device", default="auto", type=str.lower, choices=allowed_devices, help="specify device for fallback server")
45-
parser.add_argument("--docker", default=False, action="store_true", help="use Docker for fallback server")
46+
parser.add_argument("--container", default="apptainer", type=str.lower, choices=allowed_containers, help="specify container for fallback server")
4647
parser.add_argument("--tries", default=0, type=int, help="number of retries for failed request")
4748
options = parser.parse_args()
4849

@@ -74,7 +75,7 @@
7475

7576
process.TritonService.verbose = options.verbose or options.verboseService or options.verboseDiscovery
7677
process.TritonService.fallback.verbose = options.verbose or options.verboseServer
77-
process.TritonService.fallback.useDocker = options.docker
78+
process.TritonService.fallback.container = options.container
7879
process.TritonService.fallback.device = options.device
7980
if len(options.fallbackName)>0:
8081
process.TritonService.fallback.instanceBaseName = options.fallbackName

0 commit comments

Comments
 (0)