2323from models_library .users import UserID
2424from pydantic import PositiveInt
2525from servicelib .common_headers import UNDEFINED_DEFAULT_SIMCORE_USER_AGENT_VALUE
26+ from servicelib .logging_errors import create_troubleshootting_log_kwargs
2627from servicelib .logging_utils import log_catch , log_context
2728from servicelib .utils import limited_as_completed
2829
2930from ...core .errors import (
3031 ComputationalBackendNotConnectedError ,
3132 ComputationalBackendOnDemandNotReadyError ,
3233 ComputationalBackendTaskResultsNotReadyError ,
33- TaskSchedulingError ,
34+ PortsValidationError ,
3435)
3536from ...models .comp_runs import CompRunsAtDB , Iteration , RunMetadataDict
3637from ...models .comp_tasks import CompTaskAtDB
5960_logger = logging .getLogger (__name__ )
6061
# Format template with `user_id`, `project_id` and `run_id` placeholders.
# NOTE(review): presumably used as the reference string identifying a run towards the dask client -- confirm at the call site.
6162_DASK_CLIENT_RUN_REF : Final [str ] = "{user_id}:{project_id}:{run_id}"
# Value of the `type` field of the ErrorDict stored on a task when retrieving its result
# was not ready; `_process_task_result` also matches it against the task's existing errors
# so that an already recorded retrieval error is kept instead of being re-created.
63+ _TASK_RETRIEVAL_ERROR_TYPE : Final [str ] = "task-result-retrieval-timeout"
# Human-readable message for the retrieval error.
# NOTE(review): the visible part of `_process_task_result` fills `msg` with f"{result}" and does not reference this constant -- confirm where it is used.
64+ _TASK_RETRIEVAL_ERROR_MSG : Final [str ] = "Retrieval of task result timed-out"
# Key inside the retrieval ErrorDict `ctx` under which the UTC time of the first
# recorded retrieval error is stored (as a string).
65+ _TASK_RETRIEVAL_ERROR_CONTEXT_TIME_KEY : Final [str ] = "check_time"
6266
6367
6468@asynccontextmanager
@@ -329,66 +333,121 @@ async def _process_task_result(
329333 _logger .debug ("received %s result: %s" , f"{ task = } " , f"{ result = } " )
330334 task_final_state = RunningState .FAILED
331335 simcore_platform_status = SimcorePlatformStatus .OK
332- errors : list [ErrorDict ] = []
336+ task_errors : list [ErrorDict ] = []
337+ task_completed = True
333338
334- if task .job_id is not None :
335- (
336- _service_key ,
337- _service_version ,
338- user_id ,
339- project_id ,
340- node_id ,
341- ) = parse_dask_job_id (task .job_id )
339+ assert task .job_id # nosec
340+ (
341+ _service_key ,
342+ _service_version ,
343+ user_id ,
344+ project_id ,
345+ node_id ,
346+ ) = parse_dask_job_id (task .job_id )
342347
343- assert task .project_id == project_id # nosec
344- assert task .node_id == node_id # nosec
348+ assert task .project_id == project_id # nosec
349+ assert task .node_id == node_id # nosec
350+ assert task .job_id # nosec
351+ log_error_context = {
352+ "user_id" : user_id ,
353+ "project_id" : project_id ,
354+ "node_id" : node_id ,
355+ "job_id" : task .job_id ,
356+ }
345357
358+ if isinstance (result , TaskOutputData ):
359+ # That means the task successfully completed
346360 try :
347- if isinstance (result , TaskOutputData ):
348- # success!
349- await parse_output_data (
350- self .db_engine ,
351- task .job_id ,
352- result ,
361+ await parse_output_data (
362+ self .db_engine ,
363+ task .job_id ,
364+ result ,
365+ )
366+ task_final_state = RunningState .SUCCESS
367+ except PortsValidationError as err :
368+ _logger .exception (
369+ ** create_troubleshootting_log_kwargs (
370+ "Unexpected error while parsing output data, comp_tasks/comp_pipeline is not in sync with what was started" ,
371+ error = err ,
372+ error_context = log_error_context ,
373+ )
374+ )
375+ task_errors .extend (err .get_errors ())
376+ task_final_state = RunningState .FAILED
377+ # NOTE: simcore platform state is still OK as the task ran fine, the issue is likely due to the service labels
378+ elif isinstance (result , ComputationalBackendTaskResultsNotReadyError ):
379+ # Task result retrieval failed due to communication error, task will be retried
380+ # so we keep it as is
381+ _logger .warning (
382+ ** create_troubleshootting_log_kwargs (
383+ f"Retrieval of task { task .job_id } result timed-out" ,
384+ error = result ,
385+ error_context = log_error_context ,
386+ tip = "This can happen if the computational backend is overloaded with requests. It will be automatically retried again." ,
387+ )
388+ )
389+
390+ if task .errors :
391+ for error in task .errors :
392+ if error ["type" ] == _TASK_RETRIEVAL_ERROR_TYPE :
393+ # already had a timeout error, let's keep it
394+ task_errors .append (error )
395+ break
396+ if not task_errors :
397+ # first time we have this error
398+ task_errors .append (
399+ ErrorDict (
400+ loc = (f"{ task .project_id } " , f"{ task .node_id } " ),
401+ msg = f"{ result } " ,
402+ type = _TASK_RETRIEVAL_ERROR_TYPE ,
403+ ctx = {
404+ _TASK_RETRIEVAL_ERROR_CONTEXT_TIME_KEY : f"{ arrow .utcnow ()} " ,
405+ "user_id" : user_id ,
406+ "project_id" : project_id ,
407+ "node_id" : node_id ,
408+ "job_id" : task .job_id ,
409+ },
410+ )
411+ )
412+
413+ task_completed = False
414+ else :
415+ # the task itself failed, check why
416+ if isinstance (result , TaskCancelledError ):
417+ _logger .info (
418+ ** create_troubleshootting_log_kwargs (
419+ f"Task { task .job_id } was cancelled" ,
420+ error = result ,
421+ error_context = log_error_context ,
353422 )
354- task_final_state = RunningState .SUCCESS
355- elif isinstance (result , ComputationalBackendTaskResultsNotReadyError ):
356- # Task result retrieval failed due to communication error, task will be retried
357- # so we keep it as is
358- assert task .job_id # nosec
359- return False , task .job_id
360- else :
361- if isinstance (result , TaskCancelledError ):
362- task_final_state = RunningState .ABORTED
423+ )
424+ task_final_state = RunningState .ABORTED
363425
364- else :
365- task_final_state = RunningState .FAILED
366- errors .append (
367- {
368- "loc" : (
369- f"{ task .project_id } " ,
370- f"{ task .node_id } " ,
371- ),
372- "msg" : f"{ result } " ,
373- "type" : "runtime" ,
374- }
375- )
376- if isinstance (result , ComputationalBackendNotConnectedError ):
377- simcore_platform_status = SimcorePlatformStatus .BAD
378- # we need to remove any invalid files in the storage
379- await clean_task_output_and_log_files_if_invalid (
380- self .db_engine , user_id , project_id , node_id
426+ else :
427+ _logger .info (
428+ ** create_troubleshootting_log_kwargs (
429+ f"Task { task .job_id } completed with errors" ,
430+ error = result ,
431+ error_context = log_error_context ,
381432 )
382- except TaskSchedulingError as err :
433+ )
383434 task_final_state = RunningState .FAILED
384- simcore_platform_status = SimcorePlatformStatus . BAD
385- errors = err . get_errors ()
386- _logger . debug (
387- "Unexpected failure while processing results of %s: %s " ,
388- f" { task = } " ,
389- f" { errors = } " ,
435+ task_errors . append (
436+ ErrorDict (
437+ loc = ( f" { task . project_id } " , f" { task . node_id } " ),
438+ msg = f" { result } " ,
439+ type = "runtime " ,
440+ )
390441 )
391442
443+ if isinstance (result , ComputationalBackendNotConnectedError ):
444+ simcore_platform_status = SimcorePlatformStatus .BAD
445+ # we need to remove any invalid files in the storage
446+ await clean_task_output_and_log_files_if_invalid (
447+ self .db_engine , user_id , project_id , node_id
448+ )
449+
450+ if task_completed :
392451 # resource tracking
393452 await publish_service_resource_tracking_stopped (
394453 self .rabbitmq_client ,
@@ -408,17 +467,25 @@ async def _process_task_result(
408467 task_final_state = task_final_state ,
409468 )
410469
411- await CompTasksRepository (self .db_engine ).update_project_tasks_state (
412- task .project_id ,
413- run_id ,
414- [task .node_id ],
415- task_final_state ,
416- errors = errors ,
417- optional_progress = 1 ,
418- optional_stopped = arrow .utcnow ().datetime ,
419- )
420- assert task .job_id # nosec
421- return True , task .job_id
470+ await CompTasksRepository (self .db_engine ).update_project_tasks_state (
471+ task .project_id ,
472+ run_id ,
473+ [task .node_id ],
474+ task_final_state ,
475+ errors = task_errors ,
476+ optional_progress = 1 ,
477+ optional_stopped = arrow .utcnow ().datetime ,
478+ )
479+ else :
480+ await CompTasksRepository (self .db_engine ).update_project_tasks_state (
481+ task .project_id ,
482+ run_id ,
483+ [task .node_id ],
484+ RunningState .STARTED , # keep the same state as before
485+ errors = task_errors ,
486+ )
487+
488+ return task_completed , task .job_id
422489
423490 async def _task_progress_change_handler (
424491 self , event : tuple [UnixTimestamp , Any ]
0 commit comments