|
258 | 258 | operator: "All" |
259 | 259 | targetReplicatedJobs: |
260 | 260 | - {args.targetReplicatedJob} |
| 261 | + startupPolicy: |
| 262 | + startupPolicyOrder: InOrder |
261 | 263 | replicatedJobs: |
262 | | - - name: worker |
263 | | - replicas: {args.num_slices} |
264 | | - template: |
265 | | - metadata: |
266 | | - annotations: |
267 | | - alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool |
268 | | - labels: |
269 | | - xpk.google.com/workload: {args.workload} |
270 | | - spec: |
271 | | - backoffLimit: {backoff_limit} |
272 | | - completions: {system.vms_per_slice} |
273 | | - parallelism: {system.vms_per_slice} |
274 | | - template: |
275 | | - metadata: |
276 | | - annotations: |
277 | | - {storage_annotations} |
278 | | - spec: |
279 | | - terminationGracePeriodSeconds: {args.termination_grace_period_seconds} |
280 | | - serviceAccountName: {service_account} |
281 | | - containers: |
282 | | - - args: |
283 | | - {pathways_worker_args} |
284 | | - image: {args.server_image} |
285 | | - imagePullPolicy: Always |
286 | | - name: pathways-worker |
287 | | - ports: |
288 | | - - containerPort: 29001 |
289 | | - - containerPort: 8471 |
290 | | - - containerPort: 8080 |
291 | | - resources: |
292 | | - limits: |
293 | | - {resource_type}: {system.chips_per_vm} |
294 | | - securityContext: |
295 | | - privileged: true |
296 | | - volumeMounts: |
297 | | - - mountPath: /tmp |
298 | | - name: shared-tmp |
299 | | - {storage_volume_mounts} |
300 | | - env: |
301 | | - - name: PROJECT_ID |
302 | | - value: {args.project} |
303 | | - - name: LOCATION |
304 | | - value: {args.zone} |
305 | | - - name: CLUSTER_NAME |
306 | | - value: {args.cluster} |
307 | | - - name: POD_NAME |
308 | | - valueFrom: |
309 | | - fieldRef: |
310 | | - fieldPath: metadata.name |
311 | | - - name: CONTAINER_NAME |
312 | | - value: "pathways-worker" |
313 | | - - name: NAMESPACE |
314 | | - valueFrom: |
315 | | - fieldRef: |
316 | | - fieldPath: metadata.namespace |
317 | | - # Workaround for v6e |
318 | | - - name: MEGASCALE_GRPC_ENABLE_XOR_TRACER |
319 | | - value: "false" |
320 | | - - name: MEGASCALE_NUM_SLICES |
321 | | - valueFrom: |
322 | | - fieldRef: |
323 | | - fieldPath: "metadata.labels['jobset.sigs.k8s.io/replicatedjob-replicas']" |
324 | | - - name: JOBSET_NAME |
325 | | - valueFrom: |
326 | | - fieldRef: |
327 | | - fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name'] |
328 | | - - name: REPLICATED_JOB_NAME |
329 | | - valueFrom: |
330 | | - fieldRef: |
331 | | - fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name'] |
332 | | - - name: MEGASCALE_SLICE_ID |
333 | | - valueFrom: |
334 | | - fieldRef: |
335 | | - fieldPath: "metadata.labels['jobset.sigs.k8s.io/job-index']" |
336 | | - - name: MEGASCALE_COORDINATOR_ADDRESS |
337 | | - value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-$(MEGASCALE_SLICE_ID)-0.$(JOBSET_NAME)" |
338 | | - {pathways_sidecar_container} |
339 | | - nodeSelector: |
340 | | - {accelerator_label} |
341 | | - {machine_label} |
342 | | - {autoprovisioning_args} |
343 | | - priorityClassName: {args.priority} |
344 | | - hostNetwork: true |
345 | | - dnsPolicy: ClusterFirstWithHostNet |
346 | | - volumes: |
347 | | - - hostPath: |
348 | | - path: /tmp |
349 | | - type: DirectoryOrCreate |
350 | | - name: shared-tmp |
351 | | - {storage_volumes} |
352 | 264 | - name: rm |
353 | 265 | replicas: 1 |
354 | 266 | template: |
|
365 | 277 | - args: |
366 | 278 | {pathways_rm_args} |
367 | 279 | env: |
| 280 | + - name: REPLICATED_JOB_NAME |
| 281 | + valueFrom: |
| 282 | + fieldRef: |
| 283 | + fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name'] |
| 284 | + - name: JOBSET_NAME |
| 285 | + valueFrom: |
| 286 | + fieldRef: |
| 287 | + fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name'] |
| 288 | + - name: HOST_ADDRESS |
| 289 | + value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME) |
| 290 | + - name: TPU_SKIP_MDS_QUERY |
| 291 | + value: "true" |
368 | 292 | - name: PROJECT_ID |
369 | 293 | value: {args.project} |
370 | 294 | - name: LOCATION |
|
381 | 305 | valueFrom: |
382 | 306 | fieldRef: |
383 | 307 | fieldPath: metadata.namespace |
384 | | - - name: REPLICATED_JOB_NAME |
385 | | - valueFrom: |
386 | | - fieldRef: |
387 | | - fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name'] |
388 | | - - name: JOBSET_NAME |
389 | | - valueFrom: |
390 | | - fieldRef: |
391 | | - fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name'] |
392 | | - - name: HOST_ADDRESS |
393 | | - value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME) |
394 | | - - name: TPU_SKIP_MDS_QUERY |
395 | | - value: "true" |
396 | | - image: {args.server_image} |
397 | 308 | imagePullPolicy: Always |
398 | 309 | name: pathways-rm |
399 | 310 | ports: |
|
454 | 365 | nodeSelector: |
455 | 366 | cloud.google.com/gke-nodepool: cpu-proxy-np |
456 | 367 | {user_workload} |
| 368 | + - name: worker |
| 369 | + replicas: {args.num_slices} |
| 370 | + template: |
| 371 | + metadata: |
| 372 | + annotations: |
| 373 | + alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool |
| 374 | + labels: |
| 375 | + xpk.google.com/workload: {args.workload} |
| 376 | + spec: |
| 377 | + backoffLimit: {backoff_limit} |
| 378 | + completions: {system.vms_per_slice} |
| 379 | + parallelism: {system.vms_per_slice} |
| 380 | + template: |
| 381 | + metadata: |
| 382 | + annotations: |
| 383 | + {storage_annotations} |
| 384 | + spec: |
| 385 | + terminationGracePeriodSeconds: {args.termination_grace_period_seconds} |
| 386 | + serviceAccountName: {service_account} |
| 387 | + containers: |
| 388 | + - args: |
| 389 | + {pathways_worker_args} |
| 390 | + image: {args.server_image} |
| 391 | + imagePullPolicy: Always |
| 392 | + name: pathways-worker |
| 393 | + ports: |
| 394 | + - containerPort: 29001 |
| 395 | + - containerPort: 8471 |
| 396 | + - containerPort: 8080 |
| 397 | + resources: |
| 398 | + limits: |
| 399 | + {resource_type}: {system.chips_per_vm} |
| 400 | + securityContext: |
| 401 | + privileged: true |
| 402 | + volumeMounts: |
| 403 | + - mountPath: /tmp |
| 404 | + name: shared-tmp |
| 405 | + {storage_volume_mounts} |
| 406 | + env: |
| 407 | + - name: PROJECT_ID |
| 408 | + value: {args.project} |
| 409 | + - name: LOCATION |
| 410 | + value: {args.zone} |
| 411 | + - name: CLUSTER_NAME |
| 412 | + value: {args.cluster} |
| 413 | + - name: POD_NAME |
| 414 | + valueFrom: |
| 415 | + fieldRef: |
| 416 | + fieldPath: metadata.name |
| 417 | + - name: CONTAINER_NAME |
| 418 | + value: "pathways-worker" |
| 419 | + - name: NAMESPACE |
| 420 | + valueFrom: |
| 421 | + fieldRef: |
| 422 | + fieldPath: metadata.namespace |
| 423 | + # Workaround for v6e |
| 424 | + - name: MEGASCALE_GRPC_ENABLE_XOR_TRACER |
| 425 | + value: "false" |
| 426 | + - name: MEGASCALE_NUM_SLICES |
| 427 | + valueFrom: |
| 428 | + fieldRef: |
| 429 | + fieldPath: "metadata.labels['jobset.sigs.k8s.io/replicatedjob-replicas']" |
| 430 | + - name: JOBSET_NAME |
| 431 | + valueFrom: |
| 432 | + fieldRef: |
| 433 | + fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name'] |
| 434 | + - name: REPLICATED_JOB_NAME |
| 435 | + valueFrom: |
| 436 | + fieldRef: |
| 437 | + fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name'] |
| 438 | + - name: MEGASCALE_SLICE_ID |
| 439 | + valueFrom: |
| 440 | + fieldRef: |
| 441 | + fieldPath: "metadata.labels['jobset.sigs.k8s.io/job-index']" |
| 442 | + - name: MEGASCALE_COORDINATOR_ADDRESS |
| 443 | + value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-$(MEGASCALE_SLICE_ID)-0.$(JOBSET_NAME)" |
| 444 | + {pathways_sidecar_container} |
| 445 | + nodeSelector: |
| 446 | + {accelerator_label} |
| 447 | + {machine_label} |
| 448 | + {autoprovisioning_args} |
| 449 | + priorityClassName: {args.priority} |
| 450 | + hostNetwork: true |
| 451 | + dnsPolicy: ClusterFirstWithHostNet |
| 452 | + volumes: |
| 453 | + - hostPath: |
| 454 | + path: /tmp |
| 455 | + type: DirectoryOrCreate |
| 456 | + name: shared-tmp |
| 457 | + {storage_volumes} |
457 | 458 | """ |
458 | 459 |
|
459 | 460 |
|
@@ -742,8 +743,7 @@ def workload_create(args) -> None: |
742 | 743 | ' done! ******* ' |
743 | 744 | ) |
744 | 745 | xpk_print( |
745 | | - 'Steps to connect to the proxy: kubectl get pods | grep proxy ;' |
746 | | - ' kubectl port-forward <proxy-pod-name> 29000:29000; ' |
| 746 | + f'Steps to connect to the proxy: kubectl get pods | grep {args.workload}-proxy-0 | awk \'{{print $1}}\' | xargs -I {{}} kubectl port-forward {{}} 29000:29000 &'
747 | 747 | ' JAX_PLATFORMS=proxy; JAX_BACKEND_TARGET=grpc://127.0.0.1:29000;' |
748 | 748 | " python -c 'import pathwaysutils; import jax; print(jax.devices())'" |
749 | 749 | ) |
|
0 commit comments