|
92 | 92 | ) |
93 | 93 |
|
94 | 94 |
|
95 | | -class CreateRunnerWorkloadSubCommand(SubCommand): |
96 | | - """ |
97 | | - Command to create a runner workload deployment. |
98 | | - """ |
99 | | - |
100 | | - backend: str |
101 | | - device: str |
102 | | - command_script: str | None |
103 | | - port: int |
104 | | - host_network: bool |
105 | | - check: bool |
106 | | - namespace: str |
107 | | - service: str |
108 | | - version: str |
109 | | - name: str |
110 | | - volume: str |
111 | | - extra_args: list[str] |
112 | | - |
113 | | - @staticmethod |
114 | | - def register(parser: _SubParsersAction): |
115 | | - deploy_parser = parser.add_parser( |
116 | | - "create-runner", |
117 | | - help="Create a runner workload deployment", |
118 | | - ) |
119 | | - |
120 | | - deploy_parser.add_argument( |
121 | | - "--backend", |
122 | | - type=str, |
123 | | - help="Backend to use (default: detect from current environment)", |
124 | | - choices=supported_backends(), |
125 | | - ) |
126 | | - |
127 | | - deploy_parser.add_argument( |
128 | | - "--device", |
129 | | - type=str, |
130 | | - help="Device to use, multiple devices join by comma (default: all devices)", |
131 | | - default="all", |
132 | | - ) |
133 | | - |
134 | | - deploy_parser.add_argument( |
135 | | - "--command-script-file", |
136 | | - type=str, |
137 | | - help="Path of command script for the workload", |
138 | | - ) |
139 | | - |
140 | | - deploy_parser.add_argument( |
141 | | - "--port", |
142 | | - type=int, |
143 | | - help="Port to expose", |
144 | | - ) |
145 | | - |
146 | | - deploy_parser.add_argument( |
147 | | - "--host-network", |
148 | | - action="store_true", |
149 | | - help="Use host network (default: False)", |
150 | | - default=False, |
151 | | - ) |
152 | | - |
153 | | - deploy_parser.add_argument( |
154 | | - "--check", |
155 | | - action="store_true", |
156 | | - help="Enable health check, needs --port (default: False)", |
157 | | - default=False, |
158 | | - ) |
159 | | - |
160 | | - deploy_parser.add_argument( |
161 | | - "--namespace", |
162 | | - type=str, |
163 | | - help="Namespace of the runner", |
164 | | - ) |
165 | | - |
166 | | - deploy_parser.add_argument( |
167 | | - "service", |
168 | | - type=str, |
169 | | - help="Service of the runner", |
170 | | - ) |
171 | | - |
172 | | - deploy_parser.add_argument( |
173 | | - "version", |
174 | | - type=str, |
175 | | - help="Version of the runner", |
176 | | - ) |
177 | | - |
178 | | - deploy_parser.add_argument( |
179 | | - "volume", |
180 | | - type=str, |
181 | | - help="Volume to mount", |
182 | | - ) |
183 | | - |
184 | | - deploy_parser.add_argument( |
185 | | - "extra_args", |
186 | | - nargs=REMAINDER, |
187 | | - help="Extra arguments for the runner", |
188 | | - ) |
189 | | - |
190 | | - deploy_parser.set_defaults(func=CreateRunnerWorkloadSubCommand) |
191 | | - |
192 | | - def __init__(self, args: Namespace): |
193 | | - self.backend = args.backend |
194 | | - self.device = args.device |
195 | | - self.command_script = None |
196 | | - self.port = args.port |
197 | | - self.host_network = args.host_network |
198 | | - self.check = args.check |
199 | | - self.namespace = args.namespace |
200 | | - self.service = args.service |
201 | | - self.version = args.version |
202 | | - self.name = f"{args.service}-{args.version}".lower().replace(".", "-") |
203 | | - self.volume = args.volume |
204 | | - self.extra_args = args.extra_args |
205 | | - |
206 | | - if not self.name or not self.volume: |
207 | | - msg = "The name and volume arguments are required." |
208 | | - raise ValueError(msg) |
209 | | - |
210 | | - if args.command_script_file: |
211 | | - command_script_file = Path(args.command_script_file) |
212 | | - if not command_script_file.is_file(): |
213 | | - msg = f"The command script file '{command_script_file}' does not exist." |
214 | | - raise ValueError(msg) |
215 | | - self.command_script = command_script_file.read_text( |
216 | | - encoding="utf-8", |
217 | | - ).strip() |
218 | | - |
219 | | - def run(self): |
220 | | - env = [ |
221 | | - ContainerEnv( |
222 | | - name=name, |
223 | | - value=value, |
224 | | - ) |
225 | | - for name, value in os.environ.items() |
226 | | - if not name.startswith(_IGNORE_ENVS_PREFIX) |
227 | | - and not name.endswith(_IGNORE_ENVS_SUFFIX) |
228 | | - ] |
229 | | - if self.backend: |
230 | | - resources = ContainerResources( |
231 | | - **{ |
232 | | - v: self.device |
233 | | - for k, v in envs.GPUSTACK_RUNTIME_DETECT_BACKEND_MAP_RESOURCE_KEY.items() |
234 | | - if k == self.backend |
235 | | - }, |
236 | | - ) |
237 | | - else: |
238 | | - resources = ContainerResources( |
239 | | - **{ |
240 | | - envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY: self.device, |
241 | | - }, |
242 | | - ) |
243 | | - mounts = [ |
244 | | - ContainerMount( |
245 | | - path=self.volume, |
246 | | - ), |
247 | | - ] |
248 | | - execution = ContainerExecution( |
249 | | - command_script=self.command_script, |
250 | | - args=self.extra_args, |
251 | | - ) |
252 | | - ports = ( |
253 | | - [ |
254 | | - ContainerPort( |
255 | | - internal=self.port, |
256 | | - ), |
257 | | - ] |
258 | | - if self.port |
259 | | - else None |
260 | | - ) |
261 | | - checks = ( |
262 | | - [ |
263 | | - ContainerCheck( |
264 | | - delay=60, |
265 | | - interval=10, |
266 | | - timeout=5, |
267 | | - retries=6, |
268 | | - tcp=ContainerCheckTCP(port=self.port), |
269 | | - teardown=True, |
270 | | - ), |
271 | | - ] |
272 | | - if self.check and self.port |
273 | | - else None |
274 | | - ) |
275 | | - plan = WorkloadPlan( |
276 | | - name=self.name, |
277 | | - namespace=self.namespace, |
278 | | - host_network=self.host_network, |
279 | | - containers=[ |
280 | | - Container( |
281 | | - restart_policy=( |
282 | | - ContainerRestartPolicyEnum.NEVER |
283 | | - if not self.check |
284 | | - else ContainerRestartPolicyEnum.ALWAYS |
285 | | - ), |
286 | | - image=f"gpustack/runner:{self.backend if self.backend else 'Host'}X.Y-{self.service}{self.version}", |
287 | | - name=self.name, |
288 | | - envs=env, |
289 | | - resources=resources, |
290 | | - mounts=mounts, |
291 | | - execution=execution, |
292 | | - ports=ports, |
293 | | - checks=checks, |
294 | | - ), |
295 | | - ], |
296 | | - ) |
297 | | - create_workload(plan) |
298 | | - print(f"Created workload '{self.name}'.") |
299 | | - |
300 | | - while True: |
301 | | - st = get_workload( |
302 | | - name=self.name, |
303 | | - namespace=self.namespace, |
304 | | - ) |
305 | | - if st and st.state not in ( |
306 | | - WorkloadStatusStateEnum.PENDING, |
307 | | - WorkloadStatusStateEnum.INITIALIZING, |
308 | | - ): |
309 | | - break |
310 | | - time.sleep(1) |
311 | | - |
312 | | - print("\033[2J\033[H", end="") |
313 | | - |
314 | | - async def stream_logs(): |
315 | | - logs_result = await async_logs_workload( |
316 | | - name=self.name, |
317 | | - namespace=self.namespace, |
318 | | - tail=-1, |
319 | | - follow=True, |
320 | | - ) |
321 | | - async for line in logs_result: |
322 | | - print(line.decode("utf-8").rstrip()) |
323 | | - |
324 | | - asyncio.run(stream_logs()) |
325 | | - |
326 | | - |
327 | 95 | class CreateWorkloadSubCommand(SubCommand): |
328 | 96 | """ |
329 | 97 | Command to create a workload deployment. |
@@ -358,8 +126,7 @@ def register(parser: _SubParsersAction): |
358 | 126 | deploy_parser.add_argument( |
359 | 127 | "--device", |
360 | 128 | type=str, |
361 | | - help="Device to use, multiple devices join by comma (default: all devices)", |
362 | | - default="all", |
| 129 | + help="Device to use, multiple devices join by comma, all for all devices", |
363 | 130 | ) |
364 | 131 |
|
365 | 132 | deploy_parser.add_argument( |
@@ -456,20 +223,22 @@ def run(self): |
456 | 223 | if not name.startswith(_IGNORE_ENVS_PREFIX) |
457 | 224 | and not name.endswith(_IGNORE_ENVS_SUFFIX) |
458 | 225 | ] |
459 | | - if self.backend: |
460 | | - resources = ContainerResources( |
461 | | - **{ |
462 | | - v: self.device |
463 | | - for k, v in envs.GPUSTACK_RUNTIME_DETECT_BACKEND_MAP_RESOURCE_KEY.items() |
464 | | - if k == self.backend |
465 | | - }, |
466 | | - ) |
467 | | - else: |
468 | | - resources = ContainerResources( |
469 | | - **{ |
470 | | - envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY: self.device, |
471 | | - }, |
472 | | - ) |
| 226 | + resources = None |
| 227 | + if self.device: |
| 228 | + if self.backend: |
| 229 | + resources = ContainerResources( |
| 230 | + **{ |
| 231 | + v: self.device |
| 232 | + for k, v in envs.GPUSTACK_RUNTIME_DETECT_BACKEND_MAP_RESOURCE_KEY.items() |
| 233 | + if k == self.backend |
| 234 | + }, |
| 235 | + ) |
| 236 | + else: |
| 237 | + resources = ContainerResources( |
| 238 | + **{ |
| 239 | + envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY: self.device, |
| 240 | + }, |
| 241 | + ) |
473 | 242 | mounts = [ |
474 | 243 | ContainerMount( |
475 | 244 | path=self.volume, |
|
0 commit comments