Skip to content

Commit 98bcf4e

Browse files
authored
Merge branch 'develop' into lidanny/feature/update-to-CoreDNS
2 parents 0945e87 + f2340d0 commit 98bcf4e

File tree

9 files changed

+177
-1089
lines changed

9 files changed

+177
-1089
lines changed

.github/workflows/build_tests.yaml

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@ jobs:
40 40
group-name: ${{ steps.set-group-name.outputs.group-name }}
41 41
zone: ${{ steps.set-zone.outputs.zone }}
42 42
tpu-type: ${{ steps.set-tpu-type.outputs.tpu-type }}
43+
tpu-type-topology: ${{ steps.set-tpu-type-topology.outputs.tpu-type-topology }}
43 44
location: ${{steps.set-location.outputs.location}}
44 45
run-id: ${{steps.set-run-id.outputs.run-id}}
45 46
steps:
steps:
@@ -76,6 +77,10 @@ jobs:
76 77
id: set-tpu-type
77 78
run: |
78 79
echo tpu-type=v4-8 >> $GITHUB_OUTPUT
80+
- name: set tpu-type-topology
81+
id: set-tpu-type-topology
82+
run: |
83+
echo tpu-type-topology=v4-2x2x1 >> $GITHUB_OUTPUT
79 84
- name: set location
80 85
id: set-location
81 86
run: |
@@ -152,7 +157,7 @@ jobs:
152 157
with:
153 158
run-id: '${{needs.set-variables.outputs.run-id}}'
154 159
cluster-name: '${{needs.set-variables.outputs.cluster-name}}'
155-
tpu-type: '${{needs.set-variables.outputs.tpu-type || inputs.tpu-type}}'
160+
tpu-type: '${{needs.set-variables.outputs.tpu-type-topology || inputs.tpu-type}}'
156 161
zone: '${{needs.set-variables.outputs.zone}}'
157 162
location: '${{needs.set-variables.outputs.location}}'
158 163
secrets: inherit
@@ -165,7 +170,7 @@ jobs:
165 170
with:
166 171
cluster-name-dws: '${{needs.set-variables.outputs.cluster-name-dws}}'
167 172
cluster-name: '${{needs.set-variables.outputs.cluster-name}}'
168-
tpu-type: '${{needs.set-variables.outputs.tpu-type || inputs.tpu-type}}'
173+
tpu-type: '${{needs.set-variables.outputs.tpu-type-topology || inputs.tpu-type}}'
169 174
zone: '${{needs.set-variables.outputs.zone}}'
170 175
location: '${{needs.set-variables.outputs.location}}'
171 176
run-id: '${{needs.set-variables.outputs.run-id}}'
@@ -180,6 +185,7 @@ jobs:
180 185
cluster-name: ${{needs.set-variables.outputs.cluster-name}}
181 186
cluster-name-dws: '${{needs.set-variables.outputs.cluster-name-dws}}'
182 187
tpu-type: ${{needs.set-variables.outputs.tpu-type}}
188+
tpu-type-topology: ${{needs.set-variables.outputs.tpu-type-topology}}
183 189
zone: ${{needs.set-variables.outputs.zone}}
184 190
run-id: '${{needs.set-variables.outputs.run-id}}'
185 191
secrets: inherit

.github/workflows/reusable_workload_tests.yaml

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,9 @@ on:
24 24
tpu-type:
25 25
required: true
26 26
type: string
27+
tpu-type-topology:
28+
required: true
29+
type: string
27 30
tpu-type-dws:
28 31
required: false
29 32
type: string
@@ -108,7 +111,7 @@ jobs:
108 111
--docker-password='${{secrets.GCP_SA_KEY}}' \
109 112
--docker-email='${{secrets.GCP_SA_EMAIL}}'
110 113
- name: Run workload with private image
111-
run: python xpk.py workload create --cluster ${{inputs.cluster-name}} --workload $PRIVATE_IMAGE_WORKLOAD_NAME --command "echo foo" --tpu-type=${{inputs.tpu-type}} --num-slices=1 --zone=${{inputs.zone}} --docker-image=${{secrets.DOCKER_REPO_SERVER}}ubuntu2004 --docker-image-pull-secret=gcr-key
114+
run: python xpk.py workload create --cluster ${{inputs.cluster-name}} --workload $PRIVATE_IMAGE_WORKLOAD_NAME --command "echo foo" --tpu-type=${{inputs.tpu-type-topology}} --num-slices=1 --zone=${{inputs.zone}} --docker-image=${{secrets.DOCKER_REPO_SERVER}}ubuntu2004 --docker-image-pull-secret=gcr-key
112 115
- name: Wait for private image workload completion and confirm it succeeded
113 116
run: python3 xpk.py workload list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} --wait-for-job-completion $PRIVATE_IMAGE_WORKLOAD_NAME --timeout 300
114 117
- name: Delete kubectl secret

src/xpk/commands/cluster.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1101,6 +1101,7 @@ def run_gke_cluster_create_command(
1101 1101
f' {args.custom_cluster_arguments}'
1102 1102
f' {rapid_release_cmd}'
1103 1103
' --enable-dns-access'
1104+
' --autoscaling-profile=optimize-utilization'
1104 1105
)
1105 1106

1106 1107
enable_ip_alias = False

src/xpk/core/capacity.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -232,9 +232,9 @@ def get_capacity_node_selectors_from_capacity_type(
232 232
case CapacityType.ON_DEMAND.name:
233 233
node_selector = ''
234 234
case CapacityType.FLEX_START.name:
235-
node_selector = 'cloud.google.com/gke-queued="true"'
235+
node_selector = 'cloud.google.com/gke-queued: "true"'
236 236
case CapacityType.SPOT.name:
237-
node_selector = 'cloud.google.com/gke-spot="true"'
237+
node_selector = 'cloud.google.com/gke-spot: "true"'
238 238
case CapacityType.RESERVATION.name:
239 239
node_selector = f'cloud.google.com/reservation-name: {args.reservation}'
240 240
case _:

src/xpk/core/jobset.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -81,7 +81,7 @@
81 81
limits:
82 82
memory: {memory_limit_size}
83 83
requests:
84-
cpu: 500m
84+
cpu: 1000m
85 85
memory: 128Mi
86 86
securityContext:
87 87
allowPrivilegeEscalation: false

src/xpk/core/kueue.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -244,10 +244,10 @@
244 244
periodSeconds: 10
245 245
resources:
246 246
limits:
247-
cpu: 750m
247+
cpu: 1000m
248 248
memory: {memory_limit_size}
249 249
requests:
250-
cpu: 750m
250+
cpu: 1000m
251 251
memory: 512Mi
252 252
securityContext:
253 253
allowPrivilegeEscalation: false

src/xpk/core/nap.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -99,6 +99,7 @@ def enable_autoprovisioning_on_cluster(
99 99
f' --region={zone_to_region(args.zone)} --enable-autoprovisioning'
100 100
' --autoprovisioning-config-file'
101 101
f' {autoprovisioning_config.config_filename}'
102+
' --autoscaling-profile=optimize-utilization'
102 103
)
103 104
task = 'Update cluster with autoprovisioning enabled'
104 105
return_code = run_command_with_updates(command, task, args)

src/xpk/core/scheduling.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,7 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
49 49
missing_gke_accelerator_type = False
50 50
if not cluster_config_map.get(system.gke_accelerator):
51 51
xpk_print(
52-
f'Gke Accelerator Type Check: {args.workload} is requesting'
52+
f'GKE Accelerator Type Check: {args.workload} is requesting'
53 53
f' {system.gke_accelerator} but cluster only contains'
54 54
f' {cluster_config_map.keys()}. '
55 55
)

0 commit comments

Comments
 (0)