Skip to content

Commit d7f9bfb

Browse files
authored
refactor: rename recover count attribute (#301)
Since we will expand the use of the world duplication count in a follow-up PR, we need to rename recover_count to duplicate_count. Besides the worker-failure case, where this count is already incremented, we also need to update the count for repeated scale_out / scale_in / scale_out scenarios. The config generated by a second scale_out will have the same world names as the one generated by the first scale_out; therefore, there would be a world name conflict in multiworld.
1 parent 5baeadb commit d7f9bfb

File tree

4 files changed

+8
-8
lines changed

4 files changed

+8
-8
lines changed

infscale/configs/job.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ class WorldInfo:
7070
addr: str = "127.0.0.1"
7171
backend: Optional[str] = ""
7272
recover: bool = False
73-
recover_count: int = 0
73+
duplicate_count: int = 0
7474

7575

7676
@dataclass

infscale/controller/job_context.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -880,13 +880,13 @@ def _update_recovery_flow_graph(
880880
for world_info in recover_flow_graph:
881881
world_info.addr = ip
882882
world_info.recover = True
883-
world_info.recover_count += 1
883+
world_info.duplicate_count += 1
884884

885885
for world_list in cfg.flow_graph.values():
886886
for world_info in world_list:
887887
if recover_wid in world_info.peers:
888888
world_info.recover = True
889-
world_info.recover_count += 1
889+
world_info.duplicate_count += 1
890890

891891
def _update_recovery_worker_data(
892892
self, cfg: JobConfig, wrk_id: str, gpu_id: int

infscale/execution/pipeline.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -459,14 +459,14 @@ def _build_world_infos(self) -> dict[str, WorldInfo]:
459459
else:
460460
continue
461461

462-
name, backend, addr, data_port, ctrl_port, recover, recover_count = (
462+
name, backend, addr, data_port, ctrl_port, recover, duplicate_count = (
463463
cfg_world_info.name,
464464
cfg_world_info.backend,
465465
cfg_world_info.addr,
466466
cfg_world_info.data_port,
467467
cfg_world_info.ctrl_port,
468468
cfg_world_info.recover,
469-
cfg_world_info.recover_count,
469+
cfg_world_info.duplicate_count,
470470
)
471471

472472
world_size = len(cfg_world_info.peers) + 1
@@ -484,8 +484,8 @@ def _build_world_infos(self) -> dict[str, WorldInfo]:
484484
"other_id": other_id,
485485
"other": other_rank,
486486
"recover": recover,
487-
"recover_count": recover_count,
488-
"multiworld_name": f"{name}-{recover_count}",
487+
"duplicate_count": duplicate_count,
488+
"multiworld_name": f"{name}-{duplicate_count}",
489489
}
490490
world_info = WorldInfo(**data)
491491
world_infos[name] = world_info

infscale/execution/world.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,5 +42,5 @@ class WorldInfo:
4242
other: int # other peer's rank
4343

4444
recover: bool
45-
recover_count: int
45+
duplicate_count: int
4646
multiworld_name: str

0 commit comments

Comments
 (0)