@@ -48,15 +48,12 @@
     publish_service_stopped_metrics,
 )
 from ..clusters_keeper import get_or_create_on_demand_cluster
-from ..dask_client import DaskClient, PublishedComputationTask
+from ..dask_client import DaskClient
 from ..dask_clients_pool import DaskClientsPool
 from ..db.repositories.comp_runs import (
     CompRunsRepository,
 )
 from ..db.repositories.comp_tasks import CompTasksRepository
-from ._constants import (
-    MAX_CONCURRENT_PIPELINE_SCHEDULING,
-)
 from ._models import TaskStateTracker
 from ._scheduler_base import BaseCompScheduler
 from ._utils import (
@@ -68,6 +65,7 @@
 _DASK_CLIENT_RUN_REF: Final[str] = "{user_id}:{project_id}:{run_id}"
 _TASK_RETRIEVAL_ERROR_TYPE: Final[str] = "task-result-retrieval-timeout"
 _TASK_RETRIEVAL_ERROR_CONTEXT_TIME_KEY: Final[str] = "check_time"
+_PUBLICATION_CONCURRENCY_LIMIT: Final[int] = 10
 
 
 @asynccontextmanager
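
Note: the `_DASK_CLIENT_RUN_REF` template above keys the pooled Dask client per (user, project, run). A minimal sketch of how such a key is presumably built; the IDs below are made up for the example:

```python
# Hypothetical illustration only: formatting the per-run client reference.
_DASK_CLIENT_RUN_REF = "{user_id}:{project_id}:{run_id}"

run_ref = _DASK_CLIENT_RUN_REF.format(user_id=3, project_id="some-project-uuid", run_id=12)
assert run_ref == "3:some-project-uuid:12"
```
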
@@ -149,37 +147,31 @@ async def _start_tasks(
                 RunningState.PENDING,
             )
             # each task is started independently
-            results: list[list[PublishedComputationTask]] = await limited_gather(
-                *(
-                    client.send_computation_tasks(
-                        user_id=user_id,
-                        project_id=project_id,
-                        tasks={node_id: task.image},
-                        hardware_info=task.hardware_info,
-                        callback=wake_up_callback,
-                        metadata=comp_run.metadata,
-                        resource_tracking_run_id=ServiceRunID.get_resource_tracking_run_id_for_computational(
-                            user_id, project_id, node_id, comp_run.iteration
-                        ),
-                    )
-                    for node_id, task in scheduled_tasks.items()
-                ),
-                log=_logger,
-                limit=MAX_CONCURRENT_PIPELINE_SCHEDULING,
-            )
 
-            # update the database so we do have the correct job_ids there
-            await limited_gather(
-                *(
-                    comp_tasks_repo.update_project_task_job_id(
-                        project_id, task.node_id, comp_run.run_id, task.job_id
-                    )
-                    for task_sents in results
-                    for task in task_sents
-                ),
-                log=_logger,
-                limit=MAX_CONCURRENT_PIPELINE_SCHEDULING,
-            )
+            for node_id, task in scheduled_tasks.items():
+                published_tasks = await client.send_computation_tasks(
+                    user_id=user_id,
+                    project_id=project_id,
+                    tasks={node_id: task.image},
+                    hardware_info=task.hardware_info,
+                    callback=wake_up_callback,
+                    metadata=comp_run.metadata,
+                    resource_tracking_run_id=ServiceRunID.get_resource_tracking_run_id_for_computational(
+                        user_id, project_id, node_id, comp_run.iteration
+                    ),
+                )
+
+                # update the database so we do have the correct job_ids there
+                await limited_gather(
+                    *(
+                        comp_tasks_repo.update_project_task_job_id(
+                            project_id, task.node_id, comp_run.run_id, task.job_id
+                        )
+                        for task in published_tasks
+                    ),
+                    log=_logger,
+                    limit=1,
+                )
 
     async def _get_tasks_status(
         self,
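
Note: task submission now runs sequentially, and the follow-up DB update goes through `limited_gather` with `limit=1`. A stand-in sketch of what a `limited_gather`-style helper presumably does; this is not the servicelib implementation, and the name is illustrative:

```python
import asyncio
from collections.abc import Awaitable
from typing import TypeVar

T = TypeVar("T")


async def bounded_gather(*aws: Awaitable[T], limit: int) -> list[T]:
    # Like asyncio.gather, but never runs more than `limit` awaitables at once.
    semaphore = asyncio.Semaphore(limit)

    async def _guarded(aw: Awaitable[T]) -> T:
        async with semaphore:
            return await aw

    return await asyncio.gather(*(_guarded(aw) for aw in aws))
```

With `limit=1` this degenerates to strictly serial execution, which matches the intent of the hunk above: one submission, then its DB updates, before the next node is touched.
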
@@ -208,7 +200,7 @@ async def _process_executing_tasks(
         tasks: list[CompTaskAtDB],
         comp_run: CompRunsAtDB,
     ) -> None:
-        task_progresses = []
+        task_progress_events = []
         try:
             async with _cluster_dask_client(
                 user_id,
@@ -218,42 +210,26 @@ async def _process_executing_tasks(
                 run_id=comp_run.run_id,
                 run_metadata=comp_run.metadata,
             ) as client:
-                task_progresses = [
+                task_progress_events = [
                     t
                     for t in await client.get_tasks_progress(
                         [f"{t.job_id}" for t in tasks],
                     )
                     if t is not None
                 ]
-            await limited_gather(
-                *(
-                    CompTasksRepository(self.db_engine).update_project_task_progress(
-                        t.task_owner.project_id,
-                        t.task_owner.node_id,
-                        comp_run.run_id,
-                        t.progress,
-                    )
-                    for t in task_progresses
-                ),
-                log=_logger,
-                limit=MAX_CONCURRENT_PIPELINE_SCHEDULING,
-            )
 
         except ComputationalBackendOnDemandNotReadyError:
             _logger.info("The on demand computational backend is not ready yet...")
 
         comp_tasks_repo = CompTasksRepository(self.db_engine)
+        for task in task_progress_events:
+            await comp_tasks_repo.update_project_task_progress(
+                task.task_owner.project_id,
+                task.task_owner.node_id,
+                comp_run.run_id,
+                task.progress,
+            )
         await limited_gather(
-            *(
-                comp_tasks_repo.update_project_task_progress(
-                    t.task_owner.project_id,
-                    t.task_owner.node_id,
-                    comp_run.run_id,
-                    t.progress,
-                )
-                for t in task_progresses
-                if t
-            ),
             *(
                 publish_service_progress(
                     self.rabbitmq_client,
@@ -262,11 +238,10 @@ async def _process_executing_tasks(
                     node_id=t.task_owner.node_id,
                     progress=t.progress,
                 )
-                for t in task_progresses
-                if t
+                for t in task_progress_events
             ),
             log=_logger,
-            limit=MAX_CONCURRENT_PIPELINE_SCHEDULING,
+            limit=_PUBLICATION_CONCURRENCY_LIMIT,
         )
 
     async def _release_resources(self, comp_run: CompRunsAtDB) -> None:
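
Note: DB progress updates are now written one by one, while RabbitMQ publication stays concurrent but capped at `_PUBLICATION_CONCURRENCY_LIMIT`. A minimal sketch of that split; `update_db` and `publish_one` are hypothetical stand-ins for the repository call and `publish_service_progress`:

```python
import asyncio


async def flush_progress(events, update_db, publish_one, *, limit: int = 10) -> None:
    # 1) sequential DB writes: no stampede on the connection pool
    for event in events:
        await update_db(event)

    # 2) bounded concurrent publication to the message broker
    semaphore = asyncio.Semaphore(limit)

    async def _publish(event) -> None:
        async with semaphore:
            await publish_one(event)

    await asyncio.gather(*(_publish(e) for e in events))
```
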
@@ -300,25 +275,14 @@ async def _stop_tasks(
                 run_id=comp_run.run_id,
                 run_metadata=comp_run.metadata,
             ) as client:
-                await limited_gather(
-                    *(
-                        client.abort_computation_task(t.job_id)
-                        for t in tasks
-                        if t.job_id
-                    ),
-                    log=_logger,
-                    limit=MAX_CONCURRENT_PIPELINE_SCHEDULING,
-                )
-                # tasks that have no-worker must be unpublished as these are blocking forever
-                await limited_gather(
-                    *(
-                        client.release_task_result(t.job_id)
-                        for t in tasks
-                        if t.state is RunningState.WAITING_FOR_RESOURCES and t.job_id
-                    ),
-                    log=_logger,
-                    limit=MAX_CONCURRENT_PIPELINE_SCHEDULING,
-                )
+                for t in tasks:
+                    if not t.job_id:
+                        _logger.warning("%s has no job_id, cannot be stopped", t)
+                        continue
+                    await client.abort_computation_task(t.job_id)
+                    # tasks that have no worker must be unpublished as these are blocking forever
+                    if t.state is RunningState.WAITING_FOR_RESOURCES:
+                        await client.release_task_result(t.job_id)
 
     async def _process_completed_tasks(
         self,
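
Note: the abort/release pair above now runs one task at a time instead of two concurrent `limited_gather` fan-outs. A reduced, self-contained sketch of the control flow; the `client` and the task shape below are stand-ins, not the real `CompTaskAtDB`:

```python
import enum
from dataclasses import dataclass


class RunningState(enum.Enum):  # reduced stand-in for the real enum
    WAITING_FOR_RESOURCES = "WAITING_FOR_RESOURCES"
    STARTED = "STARTED"


@dataclass
class _Task:  # hypothetical minimal task shape for this sketch
    job_id: str | None
    state: RunningState


async def stop_sequentially(client, tasks: list[_Task]) -> None:
    # one dask RPC at a time: aborts no longer fan out concurrently
    for t in tasks:
        if not t.job_id:
            continue  # never submitted to dask, nothing to abort
        await client.abort_computation_task(t.job_id)
        # a task still waiting for resources holds no worker, so its
        # published result must be released or it blocks forever
        if t.state is RunningState.WAITING_FOR_RESOURCES:
            await client.release_task_result(t.job_id)
```
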
@@ -342,7 +306,7 @@ async def _process_completed_tasks(
                 ),
                 reraise=False,
                 log=_logger,
-                limit=MAX_CONCURRENT_PIPELINE_SCHEDULING,
+                limit=1,  # to avoid overloading the dask scheduler
             )
             async for future in limited_as_completed(
                 (
@@ -354,7 +318,7 @@ async def _process_completed_tasks(
                     )
                     for task, result in zip(tasks, tasks_results, strict=True)
                 ),
-                limit=MAX_CONCURRENT_PIPELINE_SCHEDULING,
+                limit=10,  # this is not accessing the dask-scheduler (only the db)
             ):
                 with log_catch(_logger, reraise=False):
                     task_can_be_cleaned, job_id = await future
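
Note: the two limits differ on purpose: result retrieval talks to the dask scheduler (`limit=1`), while processing the completions only touches the DB (`limit=10`). A stand-in sketch of what a `limited_as_completed`-style helper presumably provides; this is not the servicelib implementation:

```python
import asyncio
from collections.abc import AsyncIterator, Awaitable, Iterable
from typing import TypeVar

T = TypeVar("T")


async def bounded_as_completed(
    aws: Iterable[Awaitable[T]], *, limit: int
) -> AsyncIterator[asyncio.Task[T]]:
    # Keep at most `limit` awaitables in flight; yield tasks as they finish.
    iterator = iter(aws)
    in_flight: set[asyncio.Task[T]] = set()
    while True:
        while len(in_flight) < limit:
            try:
                in_flight.add(asyncio.ensure_future(next(iterator)))
            except StopIteration:
                break
        if not in_flight:
            return
        done, in_flight = await asyncio.wait(
            in_flight, return_when=asyncio.FIRST_COMPLETED
        )
        for finished in done:
            yield finished
```

Awaiting each yielded task mirrors the `task_can_be_cleaned, job_id = await future` pattern in the hunk above.
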