1212"""
1313
1414import asyncio
15+ import contextlib
1516import datetime
1617import logging
1718from abc import ABC , abstractmethod
3132from pydantic import PositiveInt
3233from servicelib .common_headers import UNDEFINED_DEFAULT_SIMCORE_USER_AGENT_VALUE
3334from servicelib .rabbitmq import RabbitMQClient , RabbitMQRPCClient
34- from servicelib .redis import RedisClientSDK
35+ from servicelib .redis import CouldNotAcquireLockError , RedisClientSDK
3536from servicelib .redis_utils import exclusive
3637from servicelib .utils import limited_gather
3738
@@ -231,10 +232,27 @@ async def stop_pipeline(
231232
232233 async def schedule_all_pipelines (self ) -> None :
233234 self .wake_up_event .clear ()
234- # if one of the task throws, the other are NOT cancelled which is what we want
235+        # this task might be distributed among multiple replicas of director-v2;
236+        # we do not care if CouldNotAcquireLockError is raised, as that means another
237+        # director-v2 replica is already taking care of this pipeline
238+
239+ async def _distributed_schedule_pipeline (
240+ user_id : UserID ,
241+ project_id : ProjectID ,
242+ iteration : Iteration ,
243+ pipeline_params : ScheduledPipelineParams ,
244+ ) -> None :
245+ with contextlib .suppress (CouldNotAcquireLockError ):
246+ return await self ._schedule_pipeline (
247+ user_id = user_id ,
248+ project_id = project_id ,
249+ iteration = iteration ,
250+ pipeline_params = pipeline_params ,
251+ )
252+
235253 await limited_gather (
236254 * (
237- self . _schedule_pipeline (
255+ _distributed_schedule_pipeline (
238256 user_id = user_id ,
239257 project_id = project_id ,
240258 iteration = iteration ,
@@ -246,7 +264,6 @@ async def schedule_all_pipelines(self) -> None:
246264 iteration ,
247265 ), pipeline_params in self .scheduled_pipelines .items ()
248266 ),
249- reraise = False ,
250267 log = _logger ,
251268 limit = 40 ,
252269 tasks_group_prefix = "computational-scheduled-pipeline" ,
0 commit comments