1616
1717"""planner.py."""
1818
19+ from dataclasses import dataclass
1920import json
2021from pathlib import Path
2122
@@ -80,6 +81,14 @@ def enumerate(self) -> ExecPlan:
8081 yield plan
8182
8283
@dataclass
class PipelineData:
    """One recorded planning result for a job.

    Instances are created by ``Planner._set_pipeline_data`` and stored, in
    creation order, in the per-job lists of ``Planner.pipeline_data``.

    Attributes:
        worker_ids: pipeline identifiers that this entry introduced, i.e.
            those not already listed by an earlier entry of the same job.
        total_throughput: the ``total_throughput`` value reported by the
            placement solution the entry was built from.
    """

    worker_ids: set[str]
    total_throughput: float
90+
91+
8392class Planner :
8493 """Planner class."""
8594
@@ -90,6 +99,8 @@ def __init__(self, path: str, autoscale: bool) -> None:
9099 self ._autoscale = autoscale
91100
92101 self ._colls : dict [str , PlanCollection ] = {}
102+
103+ self .pipeline_data : dict [str , list [PipelineData ]] = {}
93104
94105 def build_config (
95106 self ,
@@ -115,8 +126,13 @@ def build_config(
115126 if solution is None :
116127 raise InsufficientResources ("No placement solution found" )
117128
118- gen2 = CfgGen2 (solution [0 ], solution [1 ], source , "cuda" , base_cfg )
129+ placement , agent_ctxts_list , total_throughput = solution
130+
131+ gen2 = CfgGen2 (placement , agent_ctxts_list , source , "cuda" , base_cfg )
119132 cfg = gen2 .generate ()
133+
134+ self ._set_pipeline_data (cfg , total_throughput )
135+
120136 return cfg
121137
122138 #####
@@ -129,6 +145,19 @@ def build_config(
129145 # plan_list = self._colls[source.model].pick_plans(demand)
130146 # gen = CfgGen(agent_ctxts, source, plan_list, "cuda", base_cfg)
131147 # return gen.generate()
148+
def _set_pipeline_data(self, cfg: JobConfig, total_throughput: float) -> None:
    """Record the pipeline identifiers that ``cfg`` adds to its job.

    Appends one ``PipelineData`` entry to ``self.pipeline_data[cfg.job_id]``.
    The entry contains only the identifiers returned by
    ``JobConfig.get_pipeline_identifiers(cfg)`` that no earlier entry of the
    same job already lists, together with ``total_throughput``.

    Args:
        cfg: job configuration whose pipeline identifiers are recorded.
        total_throughput: throughput value stored with the new identifiers.
    """
    # Fetch the job's history, creating the empty list on first use.
    history = self.pipeline_data.setdefault(cfg.job_id, [])

    pipeline_identifiers = JobConfig.get_pipeline_identifiers(cfg)

    # Everything any previous entry of this job has already claimed.
    already_recorded = {wid for entry in history for wid in entry.worker_ids}

    # NOTE(review): an entry is appended even when no identifier is new
    # (worker_ids is then empty) -- confirm that this is intended.
    history.append(
        PipelineData(pipeline_identifiers - already_recorded, total_throughput)
    )
132161
133162 def _search_feasible_placement (
134163 self ,
@@ -138,15 +167,15 @@ def _search_feasible_placement(
138167 gpu_count : int ,
139168 ctx_list : list [AgentContext ],
140169 dispatcher_on_gpu : bool = True ,
141- ) -> tuple [dict , list [AgentContext ]] | None :
170+ ) -> tuple [dict , list [AgentContext ], float ] | None :
142171 # we'd like to search a feasible solution by increasing the number of nodes
143172 for num_nodes in range (1 , len (ctx_list ) + 1 ):
144173 res = placement .calculate_placement (
145174 gpu_count , len (ctx_list [:num_nodes ]), nfaults , dispatcher_on_gpu
146175 )
147176 meta = res ["meta" ]
148177 if meta ["total_throughput" ] > demand :
149- return (res , ctx_list [:num_nodes ])
178+ return (res , ctx_list [:num_nodes ], meta [ "total_throughput" ] )
150179
151180 return None
152181
@@ -156,7 +185,7 @@ def _calculate_placement(
156185 agent_ctxts : dict [str , AgentContext ],
157186 demand : float ,
158187 dispatcher_on_gpu : bool = True ,
159- ) -> tuple [dict , list [AgentContext ]] | None :
188+ ) -> tuple [dict , list [AgentContext ], float ] | None :
160189 gpu_count_and_nodes : dict [int , list [AgentContext ]] = {}
161190 for ctx in agent_ctxts .values ():
162191 count = ctx .avail_gpu_count ()
0 commit comments