Skip to content

Commit 4657912

Browse files
pditommasojonmarti
andauthored
Report actual GCP zone in Google Batch trace records (#6855)
Co-authored-by: Jon Martí <jmartifraiz@gmail.com>
1 parent 5b96758 commit 4657912

File tree

2 files changed

+163
-0
lines changed

2 files changed

+163
-0
lines changed

plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,11 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask {
118118

119119
private volatile long timestamp
120120

121+
/**
122+
* Flag to indicate that the zone has been resolved from the status events
123+
*/
124+
private volatile boolean zoneUpdated
125+
121126
/**
122127
* A flag to indicate that the job has failed without launching any tasks
123128
*/
@@ -769,10 +774,57 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask {
769774
}
770775
}
771776

777+
/**
778+
* Regex pattern to extract zone from StatusEvent description.
779+
* Matches the pattern: zones/<zone-name>/instances/<instance-id>
780+
*/
781+
private static final Pattern ZONE_PATTERN = ~/zones\/([^\/]+)\/instances\//
782+
772783
protected CloudMachineInfo getMachineInfo() {
784+
if( machineInfo!=null && !zoneUpdated && isCompleted() )
785+
updateZoneFromEvents()
773786
return machineInfo
774787
}
775788

789+
/**
790+
* Parse the actual execution zone from StatusEvent descriptions and update
791+
* the machineInfo zone field accordingly.
792+
*
793+
* Google Batch status events include zone info in the description field with
794+
* the format: {@code zones/<zone-name>/instances/<instance-id>}
795+
*/
796+
private void updateZoneFromEvents() {
797+
try {
798+
final events = client.getTaskStatus(jobId, taskId)?.statusEventsList
799+
final zone = resolveZoneFromEvents(events)
800+
if( zone ) {
801+
machineInfo = new CloudMachineInfo(
802+
type: machineInfo.type,
803+
zone: zone,
804+
priceModel: machineInfo.priceModel
805+
)
806+
}
807+
}
808+
catch( Exception e ) {
809+
log.debug "[GOOGLE BATCH] Unable to resolve zone from events for task: `${task.lazyName()}` - ${e.message}"
810+
}
811+
finally {
812+
zoneUpdated = true
813+
}
814+
}
815+
816+
@PackageScope
817+
static String resolveZoneFromEvents(List<StatusEvent> events) {
818+
if( !events )
819+
return null
820+
for( def event : events ) {
821+
final matcher = ZONE_PATTERN.matcher(event.description ?: '')
822+
if( matcher.find() )
823+
return matcher.group(1)
824+
}
825+
return null
826+
}
827+
776828
/**
777829
* Count the number of spot instance reclamations for this task by examining
778830
* the task status events and checking for preemption exit codes

plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchTaskHandlerTest.groovy

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ import nextflow.Session
3939
import nextflow.SysEnv
4040
import nextflow.cloud.google.batch.client.BatchClient
4141
import nextflow.cloud.google.batch.client.BatchConfig
42+
import nextflow.cloud.types.CloudMachineInfo
4243
import nextflow.cloud.types.PriceModel
4344
import nextflow.executor.Executor
4445
import nextflow.executor.ExecutorConfig
@@ -1113,6 +1114,116 @@ class GoogleBatchTaskHandlerTest extends Specification {
11131114
result.getEnvironment().getVariablesMap() == [VAR1: 'value1']
11141115
}
11151116

1117+
def 'should resolve zone from status events: #DESCRIPTION' () {
1118+
given:
1119+
def handler = Spy(GoogleBatchTaskHandler)
1120+
handler.@jobId = 'job-123'
1121+
handler.@taskId = '0'
1122+
handler.@machineInfo = new CloudMachineInfo(type: 'n2-standard-4', zone: ORIGINAL, priceModel: PriceModel.spot)
1123+
and:
1124+
def builder = TaskStatus.newBuilder()
1125+
if( EVENT_DESC )
1126+
builder.addStatusEvents(StatusEvent.newBuilder().setDescription(EVENT_DESC).build())
1127+
def status = builder.build()
1128+
1129+
when:
1130+
def info = handler.getMachineInfo()
1131+
then:
1132+
handler.isCompleted() >> true
1133+
handler.@client = Mock(BatchClient) {
1134+
getTaskStatus('job-123', '0') >> status
1135+
}
1136+
and:
1137+
info.zone == EXPECTED
1138+
1139+
where:
1140+
DESCRIPTION | ORIGINAL | EVENT_DESC | EXPECTED
1141+
'succeeded event' | 'europe-west2' | 'Task state is updated from RUNNING to SUCCEEDED on zones/europe-west2-a/instances/i-abc123' | 'europe-west2-a'
1142+
'preemption event' | 'us-central1' | 'Task state is updated from RUNNING to FAILED on zones/us-central1-f/instances/i-xyz789 due to Spot VM preemption with exit code 50001.' | 'us-central1-f'
1143+
'no zone in event' | 'europe-west2' | 'Task state is updated from PENDING to RUNNING' | 'europe-west2'
1144+
'empty events' | 'europe-west2' | null | 'europe-west2'
1145+
}
1146+
1147+
def 'should not resolve zone when task is not completed or machineInfo is null' () {
1148+
given:
1149+
def handler = Spy(GoogleBatchTaskHandler)
1150+
handler.@machineInfo = MACHINE_INFO
1151+
1152+
when:
1153+
def info = handler.getMachineInfo()
1154+
then:
1155+
handler.isCompleted() >> COMPLETED
1156+
and:
1157+
info?.zone == EXPECTED
1158+
1159+
where:
1160+
MACHINE_INFO | COMPLETED | EXPECTED
1161+
new CloudMachineInfo(type: 'n2-standard-4', zone: 'europe-west2', priceModel: PriceModel.spot) | false | 'europe-west2'
1162+
null | false | null
1163+
}
1164+
1165+
def 'should only resolve zone once across multiple getMachineInfo calls' () {
1166+
given:
1167+
def mockClient = Mock(BatchClient)
1168+
def handler = Spy(GoogleBatchTaskHandler)
1169+
handler.@jobId = 'job-123'
1170+
handler.@taskId = '0'
1171+
handler.@client = mockClient
1172+
handler.@machineInfo = new CloudMachineInfo(type: 'n2-standard-4', zone: 'europe-west2', priceModel: PriceModel.spot)
1173+
and:
1174+
def status = TaskStatus.newBuilder()
1175+
.addStatusEvents(StatusEvent.newBuilder()
1176+
.setDescription('Task state is updated from RUNNING to SUCCEEDED on zones/europe-west2-a/instances/i-abc123')
1177+
.build())
1178+
.build()
1179+
1180+
when:
1181+
handler.isCompleted() >> true
1182+
def info1 = handler.getMachineInfo()
1183+
def info2 = handler.getMachineInfo()
1184+
1185+
then:
1186+
1 * mockClient.getTaskStatus('job-123', '0') >> status
1187+
and:
1188+
info1.zone == 'europe-west2-a'
1189+
info2.zone == 'europe-west2-a'
1190+
}
1191+
1192+
def 'should handle API error gracefully when resolving zone' () {
1193+
given:
1194+
def handler = Spy(GoogleBatchTaskHandler)
1195+
handler.@jobId = 'job-123'
1196+
handler.@taskId = '0'
1197+
handler.@machineInfo = new CloudMachineInfo(type: 'n2-standard-4', zone: 'europe-west2', priceModel: PriceModel.spot)
1198+
handler.task = Mock(TaskRun) { lazyName() >> 'foo (1)' }
1199+
1200+
when:
1201+
def info = handler.getMachineInfo()
1202+
then:
1203+
handler.isCompleted() >> true
1204+
handler.@client = Mock(BatchClient) {
1205+
getTaskStatus('job-123', '0') >> { throw new IOException('API error') }
1206+
}
1207+
and:
1208+
info.zone == 'europe-west2'
1209+
}
1210+
1211+
def 'should resolve zone from events: #DESCRIPTION' () {
1212+
expect:
1213+
GoogleBatchTaskHandler.resolveZoneFromEvents(EVENTS) == EXPECTED
1214+
1215+
where:
1216+
DESCRIPTION | EVENTS | EXPECTED
1217+
'succeeded event' | [StatusEvent.newBuilder().setDescription('Task state is updated from RUNNING to SUCCEEDED on zones/europe-west2-a/instances/i-abc').build()] | 'europe-west2-a'
1218+
'preemption event' | [StatusEvent.newBuilder().setDescription('Task state is updated from RUNNING to FAILED on zones/us-central1-f/instances/i-xyz due to Spot VM preemption').build()] | 'us-central1-f'
1219+
'no zone pattern' | [StatusEvent.newBuilder().setDescription('Task state is updated from PENDING to RUNNING').build()] | null
1220+
'empty description' | [StatusEvent.newBuilder().setDescription('').build()] | null
1221+
'null description' | [StatusEvent.newBuilder().build()] | null
1222+
'null events' | null | null
1223+
'empty events' | [] | null
1224+
'picks first zone match' | [StatusEvent.newBuilder().setDescription('no zone here').build(), StatusEvent.newBuilder().setDescription('on zones/us-east1-b/instances/i-1').build()] | 'us-east1-b'
1225+
}
1226+
11161227
def 'should build container runnable with fusion privileged' () {
11171228
given:
11181229
def WORK_DIR = CloudStorageFileSystem.forBucket('foo').getPath('/scratch')

0 commit comments

Comments
 (0)