diff --git a/README.md b/README.md index 1ed2e41bf..53c092268 100644 --- a/README.md +++ b/README.md @@ -180,6 +180,14 @@ all zones. --num-slices=4 --on-demand \ --tpu-type=v5litepod-16 ``` + Please specify `--enable-clouddns` if you would like CloudDNS to be the + DNS provider for the Pathways cluster. For example, + ```shell + python3 xpk.py cluster create-pathways \ + --cluster xpk-pw-test-clouddns --enable-clouddns \ + --num-slices=4 --on-demand \ + --tpu-type=v5litepod-16 + ``` * Cluster Create can be called again with the same `--cluster name` to modify the number of slices or retry failed steps. @@ -370,8 +378,8 @@ will fail the cluster creation process because Vertex AI Tensorboard is not supp --tpu-type=v5litepod-16 \ --cluster xpk-pw-test ``` - Executing the command above would provide the address of the proxy that the user job should connect to. - Specify `JAX_PLATFORMS=proxy` and `JAX_BACKEND_TARGET=` and `import previewutilies` to establish this connection between the user's JAX code and the Pathways proxy. Execute Pathways workloads interactively on Vertex AI notebooks! + Executing the command above would provide the address of the proxy that the user job should connect to. Users would need to use kubectl port-forwarding to establish a connection from the notebook/VM to the proxy. + Specify `JAX_PLATFORMS=proxy` and `JAX_BACKEND_TARGET=` and `import pathwaysutils` to establish this connection between the user's JAX code and the Pathways proxy. Execute Pathways workloads interactively on Vertex AI notebooks! ### Set `max-restarts` for production jobs diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index 0440b7419..9948abfff 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -88,7 +88,7 @@ def cluster_create(args) -> None: xpk_exit(create_cluster_command_code) # Update Pathways clusters with CloudDNS if not enabled already. 
- if args.enable_pathways: + if args.enable_pathways and args.enable_clouddns: update_cluster_command_code = update_cluster_with_clouddns_if_necessary( args ) @@ -468,10 +468,15 @@ def run_gke_cluster_create_command( command += ( ' --enable-ip-alias' f' --create-subnetwork name={args.cluster}-subnetwork' - ' --cluster-dns=clouddns' - ' --cluster-dns-scope=vpc' - f' --cluster-dns-domain={args.cluster}-domain' ) + if args.enable_clouddns: + # Enables CloudDNS as the default provider of the Pathways cluster, + # useful for Pathways headless mode workloads. + command += ( + ' --cluster-dns=clouddns' + ' --cluster-dns-scope=vpc' + f' --cluster-dns-domain={args.cluster}-domain' + ) return_code = run_command_with_updates(command, 'GKE Cluster Create', args) if return_code != 0: diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py index fc8a6e26f..e3bd65188 100644 --- a/src/xpk/commands/workload.py +++ b/src/xpk/commands/workload.py @@ -326,10 +326,9 @@ def workload_create(args) -> None: if args.headless and not is_cluster_using_clouddns(args): xpk_print( - 'Please run xpk cluster create-pathways first, to upgrade and enable' - ' CloudDNS on your cluster.' + 'Cluster is not using CloudDNS, connect to the proxy server' + ' using kubectl port forwarding. 
' ) - xpk_exit(1) set_cluster_command_code = set_cluster_command(args) if set_cluster_command_code != 0: diff --git a/src/xpk/core/commands.py b/src/xpk/core/commands.py index f8e9a04fe..6400f302d 100644 --- a/src/xpk/core/commands.py +++ b/src/xpk/core/commands.py @@ -283,8 +283,9 @@ def run_command_for_value( ) except subprocess.CalledProcessError as e: xpk_print(f'Task {task} failed with {e.returncode}') - xpk_print('*' * 80) - xpk_print(e.output) - xpk_print('*' * 80) + if e.output: + xpk_print('*' * 80) + xpk_print(e.output) + xpk_print('*' * 80) return e.returncode, str(e.output, 'UTF-8') return 0, str(output, 'UTF-8') diff --git a/src/xpk/parser/cluster.py b/src/xpk/parser/cluster.py index c10161f55..8002defb3 100644 --- a/src/xpk/parser/cluster.py +++ b/src/xpk/parser/cluster.py @@ -168,6 +168,11 @@ def set_cluster_parser(cluster_parser): default=None, help='The tpu type to use, v5litepod-16, etc.', ) + cluster_create_pathways_optional_arguments.add_argument( + '--enable-clouddns', + action='store_true', + help='Enables CloudDNS on the Pathways cluster.', + ) add_shared_cluster_create_required_arguments([ cluster_create_required_arguments, diff --git a/src/xpk/parser/workload.py b/src/xpk/parser/workload.py index 3ea38f33b..8ab7c6156 100644 --- a/src/xpk/parser/workload.py +++ b/src/xpk/parser/workload.py @@ -521,6 +521,8 @@ def add_shared_workload_create_optional_arguments(args_parsers): ' headless mode. This arg can only be used in `xpk workload' ' create-pathways`(preferred) or `xpk workload create' ' --use-pathways.` (--use-pathways will be deprecated soon).' + ' Headless workloads may be created on clusters with/without ' + ' CloudDNS.' ), ) custom_parser.add_argument(