88 type CompleteRunAttemptResult ,
99 HeartbeatService ,
1010 type RunExecutionData ,
11+ type TaskRunExecutionMetrics ,
1112 type TaskRunExecutionResult ,
1213 type TaskRunFailedExecutionResult ,
1314 WorkerManifest ,
@@ -25,6 +26,11 @@ import { assertExhaustive } from "../utilities/assertExhaustive.js";
2526import { setTimeout as sleep } from "timers/promises" ;
2627import { io , type Socket } from "socket.io-client" ;
2728
29+ const DateEnv = z
30+ . string ( )
31+ . transform ( ( val ) => new Date ( parseInt ( val , 10 ) ) )
32+ . pipe ( z . date ( ) ) ;
33+
2834// All IDs are friendly IDs
2935const Env = z . object ( {
3036 // Set at build time
@@ -50,6 +56,10 @@ const Env = z.object({
5056 TRIGGER_RUNNER_ID : z . string ( ) ,
5157 TRIGGER_METADATA_URL : z . string ( ) . optional ( ) ,
5258
59+ // Timeline metrics
60+ TRIGGER_POD_SCHEDULED_AT_MS : DateEnv ,
61+ TRIGGER_DEQUEUED_AT_MS : DateEnv ,
62+
5363 // May be overridden
5464 TRIGGER_SUPERVISOR_API_PROTOCOL : z . enum ( [ "http" , "https" ] ) ,
5565 TRIGGER_SUPERVISOR_API_DOMAIN : z . string ( ) ,
@@ -238,6 +248,14 @@ class ManagedRunController {
238248
239249 if ( ! response . success ) {
240250 console . error ( "[ManagedRunController] Heartbeat failed" , { error : response . error } ) ;
251+
252+ this . sendDebugLog ( {
253+ runId : this . runFriendlyId ,
254+ message : "heartbeat: failed" ,
255+ properties : {
256+ error : response . error ,
257+ } ,
258+ } ) ;
241259 }
242260 } ,
243261 intervalMs : this . heartbeatIntervalSeconds * 1000 ,
@@ -620,6 +638,14 @@ class ManagedRunController {
620638 if ( ! continuationResult . success ) {
621639 console . error ( "Failed to continue execution" , { error : continuationResult . error } ) ;
622640
641+ this . sendDebugLog ( {
642+ runId : run . friendlyId ,
643+ message : "failed to continue execution" ,
644+ properties : {
645+ error : continuationResult . error ,
646+ } ,
647+ } ) ;
648+
623649 this . waitForNextRun ( ) ;
624650 return ;
625651 }
@@ -734,10 +760,14 @@ class ManagedRunController {
734760 private async startAndExecuteRunAttempt ( {
735761 runFriendlyId,
736762 snapshotFriendlyId,
763+ dequeuedAt,
764+ podScheduledAt,
737765 isWarmStart = false ,
738766 } : {
739767 runFriendlyId : string ;
740768 snapshotFriendlyId : string ;
769+ dequeuedAt ?: Date ;
770+ podScheduledAt ?: Date ;
741771 isWarmStart ?: boolean ;
742772 } ) {
743773 if ( ! this . socket ) {
@@ -749,39 +779,79 @@ class ManagedRunController {
749779 snapshot : { friendlyId : snapshotFriendlyId } ,
750780 } ) ;
751781
782+ const attemptStartedAt = Date . now ( ) ;
783+
752784 const start = await this . httpClient . startRunAttempt ( runFriendlyId , snapshotFriendlyId , {
753785 isWarmStart,
754786 } ) ;
755787
756788 if ( ! start . success ) {
757789 console . error ( "[ManagedRunController] Failed to start run" , { error : start . error } ) ;
758790
791+ this . sendDebugLog ( {
792+ runId : runFriendlyId ,
793+ message : "failed to start run attempt" ,
794+ properties : {
795+ error : start . error ,
796+ } ,
797+ } ) ;
798+
759799 this . waitForNextRun ( ) ;
760800 return ;
761801 }
762802
803+ const attemptDuration = Date . now ( ) - attemptStartedAt ;
804+
763805 const { run, snapshot, execution, envVars } = start . data ;
764806
765807 logger . debug ( "[ManagedRunController] Started run" , {
766808 runId : run . friendlyId ,
767809 snapshot : snapshot . friendlyId ,
768810 } ) ;
769811
770- // TODO: We may already be executing this run, this may be a new attempt
771- // This is the only case where incrementing the attempt number is allowed
772812 this . enterRunPhase ( run , snapshot ) ;
773813
814+ const metrics = [
815+ {
816+ name : "start" ,
817+ event : "create_attempt" ,
818+ timestamp : attemptStartedAt ,
819+ duration : attemptDuration ,
820+ } ,
821+ ]
822+ . concat (
823+ dequeuedAt
824+ ? [
825+ {
826+ name : "start" ,
827+ event : "dequeue" ,
828+ timestamp : dequeuedAt . getTime ( ) ,
829+ duration : 0 ,
830+ } ,
831+ ]
832+ : [ ]
833+ )
834+ . concat (
835+ podScheduledAt
836+ ? [
837+ {
838+ name : "start" ,
839+ event : "pod_scheduled" ,
840+ timestamp : podScheduledAt . getTime ( ) ,
841+ duration : 0 ,
842+ } ,
843+ ]
844+ : [ ]
845+ ) satisfies TaskRunExecutionMetrics ;
846+
774847 const taskRunEnv = {
775848 ...gatherProcessEnv ( ) ,
776849 ...envVars ,
777850 } ;
778851
779852 try {
780- return await this . executeRun ( { run, snapshot, envVars : taskRunEnv , execution } ) ;
853+ return await this . executeRun ( { run, snapshot, envVars : taskRunEnv , execution, metrics } ) ;
781854 } catch ( error ) {
782- // TODO: Handle the case where we're in the warm start phase or executing a new run
783- // This can happen if we kill the run while it's still executing, e.g. after receiving an attempt number mismatch
784-
785855 console . error ( "Error while executing attempt" , {
786856 error,
787857 } ) ;
@@ -810,7 +880,13 @@ class ManagedRunController {
810880 error : completionResult . error ,
811881 } ) ;
812882
813- // TODO: Maybe we should keep retrying for a while longer
883+ this . sendDebugLog ( {
884+ runId : run . friendlyId ,
885+ message : "completion: failed to submit after error" ,
886+ properties : {
887+ error : completionResult . error ,
888+ } ,
889+ } ) ;
814890
815891 this . waitForNextRun ( ) ;
816892 return ;
@@ -923,6 +999,7 @@ class ManagedRunController {
923999 this . startAndExecuteRunAttempt ( {
9241000 runFriendlyId : nextRun . run . friendlyId ,
9251001 snapshotFriendlyId : nextRun . snapshot . friendlyId ,
1002+ dequeuedAt : nextRun . dequeuedAt ,
9261003 isWarmStart : true ,
9271004 } ) . finally ( ( ) => { } ) ;
9281005 return ;
@@ -1032,7 +1109,10 @@ class ManagedRunController {
10321109 snapshot,
10331110 envVars,
10341111 execution,
1035- } : WorkloadRunAttemptStartResponseBody ) {
1112+ metrics,
1113+ } : WorkloadRunAttemptStartResponseBody & {
1114+ metrics ?: TaskRunExecutionMetrics ;
1115+ } ) {
10361116 this . snapshotPoller . start ( ) ;
10371117
10381118 if ( ! this . taskRunProcess || ! this . taskRunProcess . isPreparedForNextRun ) {
@@ -1058,6 +1138,7 @@ class ManagedRunController {
10581138 payload : {
10591139 execution,
10601140 traceContext : execution . run . traceContext ?? { } ,
1141+ metrics,
10611142 } ,
10621143 messageId : run . friendlyId ,
10631144 env : envVars ,
@@ -1096,6 +1177,14 @@ class ManagedRunController {
10961177 error : completionResult . error ,
10971178 } ) ;
10981179
1180+ this . sendDebugLog ( {
1181+ runId : run . friendlyId ,
1182+ message : "completion: failed to submit" ,
1183+ properties : {
1184+ error : completionResult . error ,
1185+ } ,
1186+ } ) ;
1187+
10991188 this . waitForNextRun ( ) ;
11001189 return ;
11011190 }
@@ -1212,6 +1301,8 @@ class ManagedRunController {
12121301 this . startAndExecuteRunAttempt ( {
12131302 runFriendlyId : env . TRIGGER_RUN_ID ,
12141303 snapshotFriendlyId : env . TRIGGER_SNAPSHOT_ID ,
1304+ dequeuedAt : env . TRIGGER_DEQUEUED_AT_MS ,
1305+ podScheduledAt : env . TRIGGER_POD_SCHEDULED_AT_MS ,
12151306 } ) . finally ( ( ) => { } ) ;
12161307 return ;
12171308 }
0 commit comments