Skip to content

Commit 5e1e380

Browse files
authored
Merge pull request #45182 from fastmachinelearning/cmsTritonUpdates
SONIC updates for site support
2 parents be1c0ac + e24f362 commit 5e1e380

File tree

9 files changed

+294
-125
lines changed

9 files changed

+294
-125
lines changed

HeterogeneousCore/SonicTriton/README.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -132,19 +132,19 @@ The script has three operations (`start`, `stop`, `check`) and the following opt
132132
* `-c`: don't cleanup temporary dir (for debugging)
133133
* `-C [dir]`: directory containing Nvidia compatibility drivers (checks CMSSW_BASE by default if available)
134134
* `-D`: dry run: print container commands rather than executing them
135-
* `-d`: use Docker instead of Apptainer
135+
* `-d [exe]`: container choice: apptainer, docker, podman, podman-hpc (default: apptainer)
136136
* `-E [path]`: include extra path(s) for executables (default: /cvmfs/oasis.opensciencegrid.org/mis/apptainer/current/bin)
137137
* `-f`: force reuse of (possibly) existing container instance
138-
* `-g`: use GPU instead of CPU
139-
* `-i` [name]`: server image name (default: fastml/triton-torchgeo:22.07-py3-geometric)
138+
* `-g [device]`: device choice: auto (try to detect GPU), CPU, GPU (default: auto)
139+
* `-i [name]`: server image name (default: fastml/triton-torchgeo:22.07-py3-geometric)
140140
* `-I [num]`: number of model instances (default: 0 -> means no local editing of config files)
141141
* `-M [dir]`: model repository (can be given more than once)
142142
* `-m [dir]`: specific model directory (can be given more than once)
143143
* `-n [name]`: name of container instance, also used for hidden temporary dir (default: triton_server_instance)
144144
* `-P [port]`: base port number for services (-1: automatically find an unused port range) (default: 8000)
145145
* `-p [pid]`: automatically shut down server when process w/ specified PID ends (-1: use parent process PID)
146146
* `-r [num]`: number of retries when starting container (default: 3)
147-
* `-s [dir]`: Apptainer sandbox directory (default: /cvmfs/unpacked.cern.ch/registry.hub.docker.com/fastml/triton-torchgeo:22.07-py3-geometric)
147+
* `-s [dir]`: apptainer sandbox directory (default: /cvmfs/unpacked.cern.ch/registry.hub.docker.com/fastml/triton-torchgeo:22.07-py3-geometric)
148148
* `-t [dir]`: non-default hidden temporary dir
149149
* `-v`: (verbose) start: activate server debugging info; stop: keep server logs
150150
* `-w [time]`: maximum time to wait for server to start (default: 300 seconds)
@@ -200,8 +200,8 @@ The fallback server has a separate set of options, mostly related to the invocat
200200
* `enable`: enable the fallback server
201201
* `debug`: enable debugging (equivalent to `-c` in `cmsTriton`)
202202
* `verbose`: enable verbose output in logs (equivalent to `-v` in `cmsTriton`)
203-
* `useDocker`: use Docker instead of Apptainer (equivalent to `-d` in `cmsTriton`)
204-
* `useGPU`: run on local GPU (equivalent to `-g` in `cmsTriton`)
203+
* `container`: container choice (equivalent to `-d` in `cmsTriton`)
204+
* `device`: device choice (equivalent to `-g` in `cmsTriton`)
205205
* `retries`: number of retries when starting container (passed to `-r [num]` in `cmsTriton` if >= 0; default: -1)
206206
* `wait`: maximum time to wait for server to start (passed to `-w [time]` in `cmsTriton` if >= 0; default: -1)
207207
* `instanceBaseName`: base name for server instance if random names are enabled (default: triton_server_instance)

HeterogeneousCore/SonicTriton/interface/TritonService.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@ class TritonService {
3636
: enable(pset.getUntrackedParameter<bool>("enable")),
3737
debug(pset.getUntrackedParameter<bool>("debug")),
3838
verbose(pset.getUntrackedParameter<bool>("verbose")),
39-
useDocker(pset.getUntrackedParameter<bool>("useDocker")),
40-
useGPU(pset.getUntrackedParameter<bool>("useGPU")),
39+
container(pset.getUntrackedParameter<std::string>("container")),
40+
device(pset.getUntrackedParameter<std::string>("device")),
4141
retries(pset.getUntrackedParameter<int>("retries")),
4242
wait(pset.getUntrackedParameter<int>("wait")),
4343
instanceName(pset.getUntrackedParameter<std::string>("instanceName")),
@@ -54,8 +54,8 @@ class TritonService {
5454
bool enable;
5555
bool debug;
5656
bool verbose;
57-
bool useDocker;
58-
bool useGPU;
57+
std::string container;
58+
std::string device;
5959
int retries;
6060
int wait;
6161
std::string instanceName;
@@ -89,6 +89,7 @@ class TritonService {
8989
std::unordered_set<std::string> models;
9090
static const std::string fallbackName;
9191
static const std::string fallbackAddress;
92+
static const std::string siteconfName;
9293
};
9394
struct Model {
9495
Model(const std::string& path_ = "") : path(path_) {}

HeterogeneousCore/SonicTriton/python/TritonService_cff.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,8 @@
22

33
from Configuration.ProcessModifiers.enableSonicTriton_cff import enableSonicTriton
44

5-
_gpu_available_cached = None
6-
7-
def _gpu_available():
8-
global _gpu_available_cached
9-
if _gpu_available_cached is None:
10-
import os
11-
_gpu_available_cached = (os.system("nvidia-smi -L") == 0)
12-
return _gpu_available_cached
13-
145
enableSonicTriton.toModify(TritonService,
156
fallback = dict(
167
enable = True,
17-
useGPU = _gpu_available(),
188
),
199
)

0 commit comments

Comments
 (0)