@@ -11,6 +11,7 @@ import {
1111 exceptionEventEnhancer ,
1212 flattenAttributes ,
1313 internalErrorFromUnexpectedExit ,
14+ isManualOutOfMemoryError ,
1415 sanitizeError ,
1516 shouldRetryError ,
1617 taskRunErrorEnhancer ,
@@ -691,20 +692,38 @@ async function findAttempt(prismaClient: PrismaClientOrTransaction, friendlyId:
691692}
692693
693694function isOOMError ( error : TaskRunError ) {
694- if ( error . type !== "INTERNAL_ERROR" ) return false ;
695- if ( error . code === "TASK_PROCESS_OOM_KILLED" || error . code === "TASK_PROCESS_MAYBE_OOM_KILLED" ) {
696- return true ;
695+ if ( error . type === "INTERNAL_ERROR" ) {
696+ if (
697+ error . code === "TASK_PROCESS_OOM_KILLED" ||
698+ error . code === "TASK_PROCESS_MAYBE_OOM_KILLED"
699+ ) {
700+ return true ;
701+ }
702+
703+ // For the purposes of retrying on a larger machine, we're going to treat this as an OOM error.
704+ // This is what they look like if we're executing using k8s. They then get corrected later, but it's too late.
705+ // {"code": "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE", "type": "INTERNAL_ERROR", "message": "Process exited with code -1 after signal SIGKILL."}
706+ if (
707+ error . code === "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE" &&
708+ error . message &&
709+ error . message . includes ( "SIGKILL" ) &&
710+ error . message . includes ( "-1" )
711+ ) {
712+ return true ;
713+ }
714+ }
715+
716+ if ( error . type === "BUILT_IN_ERROR" ) {
717+ // ffmpeg also does weird stuff
718+ // { "name": "Error", "type": "BUILT_IN_ERROR", "message": "ffmpeg was killed with signal SIGKILL" }
719+ if ( error . message && error . message . includes ( "ffmpeg was killed with signal SIGKILL" ) ) {
720+ return true ;
721+ }
697722 }
698723
699- // For the purposes of retrying on a larger machine, we're going to treat this is an OOM error.
700- // This is what they look like if we're executing using k8s. They then get corrected later, but it's too late.
701- // {"code": "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE", "type": "INTERNAL_ERROR", "message": "Process exited with code -1 after signal SIGKILL."}
702- if (
703- error . code === "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE" &&
704- error . message &&
705- error . message . includes ( "SIGKILL" ) &&
706- error . message . includes ( "-1" )
707- ) {
724+ // Special `OutOfMemoryError` for doing a manual OOM kill.
725+ // Useful if a native library does an OOM but doesn't actually crash the run and you want to manually treat it as an OOM.
726+ if ( isManualOutOfMemoryError ( error ) ) {
708727 return true ;
709728 }
710729
0 commit comments