@@ -179,7 +179,7 @@ def __init__(
179179 self ._non_breaking_exceptions : List [Exception ] = []
180180
181181 def _replace_failed_jobs (self , partition : AsyncPartition ) -> None :
182- failed_status_jobs = (AsyncJobStatus .FAILED , AsyncJobStatus . TIMED_OUT )
182+ failed_status_jobs = (AsyncJobStatus .FAILED ,)
183183 jobs_to_replace = [job for job in partition .jobs if job .status () in failed_status_jobs ]
184184 for job in jobs_to_replace :
185185 new_job = self ._start_job (job .job_parameters (), job .api_job_id ())
@@ -359,14 +359,11 @@ def _process_running_partitions_and_yield_completed_ones(
359359 self ._process_partitions_with_errors (partition )
360360 case _:
361361 self ._stop_timed_out_jobs (partition )
362+ # re-allocate the partition so its FAILED jobs can be retried; TIMED_OUT jobs are not re-allocated
363+ self ._reallocate_partition (current_running_partitions , partition )
362364
363- # job will be restarted in `_start_job`
364- current_running_partitions .insert (0 , partition )
365-
366- for job in partition .jobs :
367- # We only remove completed jobs as we want failed/timed out jobs to be re-allocated in priority
368- if job .status () == AsyncJobStatus .COMPLETED :
369- self ._job_tracker .remove_job (job .api_job_id ())
365+ # We only remove completed / timed-out jobs, as we want failed jobs to be re-allocated in priority
366+ self ._remove_completed_or_timed_out_jobs (partition )
370367
371368 # update the referenced list with running partitions
372369 self ._running_partitions = current_running_partitions
@@ -381,8 +378,11 @@ def _stop_partition(self, partition: AsyncPartition) -> None:
def _stop_timed_out_jobs(self, partition: AsyncPartition) -> None:
    """
    Abort the first timed-out job found in the partition and fail.

    Jobs are inspected in order; inspection stops at the first job whose
    status is TIMED_OUT. That job is aborted with its allocation freed,
    and a config error is raised. Nothing happens if no job timed out.

    Args:
        partition (AsyncPartition): The partition whose jobs are inspected.

    Raises:
        AirbyteTracedException: When a job of the partition has timed out.
    """
    # Lazily search so that jobs after the first timed-out one are not inspected.
    timed_out_job = next(
        (candidate for candidate in partition.jobs if candidate.status() == AsyncJobStatus.TIMED_OUT),
        None,
    )
    if timed_out_job is None:
        return

    # The job is not retried, so its allocation is released.
    self._abort_job(timed_out_job, free_job_allocation=True)
    raise AirbyteTracedException(
        internal_message=f"Job {timed_out_job.api_job_id()} has timed out. Try increasing the `polling job timeout`.",
        failure_type=FailureType.config_error,
    )
386386
387387 def _abort_job (self , job : AsyncJob , free_job_allocation : bool = True ) -> None :
388388 try :
@@ -392,6 +392,34 @@ def _abort_job(self, job: AsyncJob, free_job_allocation: bool = True) -> None:
392392 except Exception as exception :
393393 LOGGER .warning (f"Could not free budget for job { job .api_job_id ()} : { exception } " )
394394
def _remove_completed_or_timed_out_jobs(self, partition: AsyncPartition) -> None:
    """
    Drop the partition's finished jobs from the job tracker.

    A job is removed from the tracker when its status is COMPLETED or
    TIMED_OUT; jobs in any other status are left untouched.

    Args:
        partition (AsyncPartition): The partition whose jobs are inspected.
    """
    removable_statuses = (AsyncJobStatus.COMPLETED, AsyncJobStatus.TIMED_OUT)
    for tracked_job in partition.jobs:
        if tracked_job.status() not in removable_statuses:
            continue
        self._job_tracker.remove_job(tracked_job.api_job_id())
405+
def _reallocate_partition(
    self,
    current_running_partitions: List[AsyncPartition],
    partition: AsyncPartition,
) -> None:
    """
    Put the partition back at the front of the running partitions.

    The partition is re-queued when at least one of its jobs is not in
    the TIMED_OUT status. A partition whose jobs have all timed out, or
    that has no jobs, is not re-queued. This method does not start any
    job itself; it only mutates ``current_running_partitions``.

    Args:
        current_running_partitions (list): The list of currently running
            partitions; modified in place.
        partition (AsyncPartition): The partition to reallocate.
    """
    # Insert at most once: inserting once per job would queue the same
    # partition several times when it holds more than one such job.
    if any(job.status() != AsyncJobStatus.TIMED_OUT for job in partition.jobs):
        # front insertion keeps the re-allocated partition first in the list
        current_running_partitions.insert(0, partition)
422+
395423 def _process_partitions_with_errors (self , partition : AsyncPartition ) -> None :
396424 """
397425 Process a partition with status errors (FAILED and TIMEOUT).
0 commit comments