Skip to content

Commit 194a2d1

Browse files
committed
Add resume metadata, tests, and bump version
1 parent e931dd4 commit 194a2d1

File tree

19 files changed

+1309
-18
lines changed

19 files changed

+1309
-18
lines changed

docs/cli.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ Order used by commands that need a config:
2929
Dev-only: provision containers and run via Ansible (requires `.lb_dev_cli` or `LB_ENABLE_TEST_CLI=1`).
3030
- `lb run ... --multipass [--nodes N]`
3131
Dev-only: provision Multipass VMs and run via Ansible (requires `.lb_dev_cli` or `LB_ENABLE_TEST_CLI=1`).
32+
- `lb resume [RUN_ID] [-c FILE] [--root PATH] [--remote/--no-remote] [--docker|--multipass]`
33+
Resume a previous run; without RUN_ID, pick interactively from `benchmark_results/`.
3234
- `lb runs list [--root PATH] [-c FILE]` / `lb runs show RUN_ID [--root PATH] [-c FILE]`
3335
Inspect stored runs under `benchmark_results/`.
3436
- `lb analyze [RUN_ID] [--kind aggregate] [--root PATH] [--workload NAME] [--host NAME]`

lb_app/client.py

Lines changed: 70 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,30 @@ def list_runs(self, config: BenchmarkConfig) -> Iterable[RunJournal]:
4545
def get_run_plan(self, config: BenchmarkConfig, tests: Sequence[str], execution_mode: str = "remote"):
4646
return self._run_service.get_run_plan(config, list(tests), execution_mode=execution_mode)
4747

48-
def _provision(self, config: BenchmarkConfig, execution_mode: str, node_count: int, docker_engine: str | None = None):
48+
def _provision(
49+
self,
50+
config: BenchmarkConfig,
51+
execution_mode: str,
52+
node_count: int,
53+
*,
54+
docker_engine: str | None = None,
55+
resume: str | None = None,
56+
):
4957
"""Provision nodes according to execution mode; returns updated config and provisioner result."""
5058
mode = ProvisioningMode(execution_mode)
59+
node_names = None
60+
if resume and mode in (ProvisioningMode.DOCKER, ProvisioningMode.MULTIPASS):
61+
node_names = self._resume_node_names(config, resume)
62+
if not node_names:
63+
raise ProvisioningError(
64+
"Unable to determine previous container/VM names for resume; "
65+
"ensure the run journal or host directories are available."
66+
)
67+
if node_count != len(node_names):
68+
raise ProvisioningError(
69+
"Resume node count does not match original run; "
70+
"use --nodes to match the previous run."
71+
)
5172
if mode is ProvisioningMode.REMOTE:
5273
request = ProvisioningRequest(
5374
mode=ProvisioningMode.REMOTE,
@@ -58,13 +79,15 @@ def _provision(self, config: BenchmarkConfig, execution_mode: str, node_count: i
5879
request = ProvisioningRequest(
5980
mode=ProvisioningMode.DOCKER,
6081
count=node_count,
82+
node_names=node_names,
6183
docker_engine=docker_engine or "docker",
6284
)
6385
else:
6486
temp_dir = config.output_dir.parent / "temp_keys"
6587
request = ProvisioningRequest(
6688
mode=ProvisioningMode.MULTIPASS,
6789
count=node_count,
90+
node_names=node_names,
6891
state_dir=temp_dir,
6992
)
7093
result = self._provisioner.provision(request)
@@ -83,6 +106,49 @@ def _provision(self, config: BenchmarkConfig, execution_mode: str, node_count: i
83106
config.remote_execution.enabled = True
84107
return config, result
85108

109+
@staticmethod
def _resume_node_names(config: BenchmarkConfig, resume: str) -> list[str] | None:
    """Recover the node names (containers/VMs) used by a previous run.

    Two sources are tried, in order:

    1. The run journal: the distinct, non-empty ``task.host`` values of
       its tasks, sorted.
    2. The run directory: the sorted names of its sub-directories whose
       name does not start with ``"_"``.

    Args:
        config: Benchmark configuration; ``config.output_dir`` is the root
            under which an explicit run id is looked up.
        resume: Either the literal ``"latest"`` or a run identifier that
            names a directory under ``config.output_dir``.

    Returns:
        A sorted list of node names, or ``None`` when neither the journal
        nor the run directory yields any name.
    """
    # Imported lazily, as in the original block, to keep the dependency
    # local to this helper.
    from lb_app.services.run_journal import (
        find_latest_journal,
        find_latest_results_run,
    )

    run_root = None
    journal_path = None
    if resume == "latest":
        journal_path = find_latest_journal(config)
        if journal_path is None:
            # NOTE(review): the second element of the result is used as the
            # journal path; confirm against find_latest_results_run.
            latest = find_latest_results_run(config)
            if latest:
                journal_path = latest[1]
        if journal_path is not None:
            run_root = journal_path.parent
    else:
        run_root = config.output_dir / resume
        journal_path = run_root / "run_journal.json"

    if journal_path is not None and journal_path.exists():
        try:
            journal = RunJournal.load(journal_path)
        except Exception:
            # Deliberate best-effort: an unreadable journal must not abort
            # the resume; fall through to the directory-based lookup.
            journal = None
        if journal is not None:
            hosts = sorted(
                {task.host for task in journal.tasks.values() if task.host}
            )
            if hosts:
                return hosts

    # is_dir() rather than exists(): iterdir() raises NotADirectoryError if
    # the run root path happens to be a regular file.
    if run_root is not None and run_root.is_dir():
        dir_names = sorted(
            entry.name
            for entry in run_root.iterdir()
            if entry.is_dir() and not entry.name.startswith("_")
        )
        return dir_names or None
    return None
151+
86152
def start_run(self, request: RunRequest, hooks: UIHooks) -> RunResult | None:
87153
cfg = request.config
88154
target_tests = list(
@@ -105,6 +171,7 @@ def start_run(self, request: RunRequest, hooks: UIHooks) -> RunResult | None:
105171
setup=request.setup,
106172
stop_file=request.stop_file,
107173
execution_mode=request.execution_mode,
174+
node_count=request.node_count,
108175
preloaded_config=cfg,
109176
)
110177

@@ -115,7 +182,8 @@ def start_run(self, request: RunRequest, hooks: UIHooks) -> RunResult | None:
115182
cfg,
116183
request.execution_mode,
117184
request.node_count,
118-
request.docker_engine,
185+
docker_engine=request.docker_engine,
186+
resume=request.resume,
119187
)
120188
except ProvisioningError as exc:
121189
hooks.on_warning(f"Provisioning failed: {exc}", ttl=5)

lb_app/services/run_service.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ def build_context(
135135
resume: Optional[str] = None,
136136
stop_file: Optional[Path] = None,
137137
execution_mode: str = "remote",
138+
node_count: int | None = None,
138139
) -> RunContext:
139140
"""Compute the run context and registry."""
140141
registry = self._registry_factory()
@@ -151,6 +152,7 @@ def build_context(
151152
resume_latest=resume == "latest",
152153
stop_file=stop_file,
153154
execution_mode=execution_mode,
155+
node_count=node_count,
154156
)
155157

156158
def create_session(
@@ -167,6 +169,7 @@ def create_session(
167169
setup: bool = True,
168170
stop_file: Optional[Path] = None,
169171
execution_mode: str = "remote",
172+
node_count: int | None = None,
170173
preloaded_config: BenchmarkConfig | None = None,
171174
) -> RunContext:
172175
"""
@@ -190,6 +193,7 @@ def create_session(
190193
resume=resume,
191194
stop_file=stop_file,
192195
execution_mode=execution_mode,
196+
node_count=node_count,
193197
)
194198
return context
195199

@@ -751,6 +755,16 @@ def _prepare_journal_and_dashboard(
751755
context, run_id
752756
)
753757

758+
if journal.metadata is not None:
759+
journal.metadata.setdefault("execution_mode", context.execution_mode)
760+
node_count = context.node_count
761+
if node_count is None:
762+
if context.execution_mode in ("docker", "multipass"):
763+
node_count = max(1, len(context.config.remote_hosts or []))
764+
else:
765+
node_count = len(context.config.remote_hosts or []) or 1
766+
journal.metadata.setdefault("node_count", node_count)
767+
754768
# Persist the initial state so resume is possible even if execution aborts early
755769
journal_path.parent.mkdir(parents=True, exist_ok=True)
756770
journal.save(journal_path)

lb_app/services/run_types.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ class RunContext:
3636
resume_latest: bool = False
3737
stop_file: Path | None = None
3838
execution_mode: str = "remote"
39+
node_count: int | None = None
3940

4041

4142
@dataclass

lb_provisioner/engine/service.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,8 @@ def __init__(
4040

4141
def provision(self, request: ProvisioningRequest) -> ProvisioningResult:
4242
"""Provision resources according to the request."""
43-
self._enforce_limits(request.count)
43+
effective_count = len(request.node_names) if request.node_names else request.count
44+
self._enforce_limits(effective_count)
4445
self._assert_ui_caller()
4546

4647
if request.mode is ProvisioningMode.REMOTE:

lb_provisioner/models/types.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ class ProvisioningRequest:
2727
mode: ProvisioningMode
2828
count: int = 1
2929
remote_hosts: Optional[List[RemoteHostSpec]] = None
30+
node_names: Optional[List[str]] = None
3031
docker_engine: str = "docker"
3132
docker_image: str = "ubuntu:24.04"
3233
multipass_image: str = "24.04"

lb_provisioner/providers/docker.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,20 @@ def provision(self, request: ProvisioningRequest) -> List[ProvisionedNode]:
3232
if not shutil.which(engine):
3333
raise ProvisioningError(f"{engine} not found in PATH")
3434

35-
count = max(1, min(request.count, MAX_NODES))
35+
if request.node_names:
36+
names = list(request.node_names)
37+
count = len(names)
38+
else:
39+
names = []
40+
count = max(1, min(request.count, MAX_NODES))
3641
state_root = request.state_dir or Path("/tmp/lb_docker_keys")
3742
state_root.mkdir(parents=True, exist_ok=True)
3843
nodes: List[ProvisionedNode] = []
3944
for idx in range(count):
40-
name = f"lb-docker-{uuid.uuid4().hex[:8]}-{idx}"
45+
if names:
46+
name = names[idx]
47+
else:
48+
name = f"lb-docker-{uuid.uuid4().hex[:8]}-{idx}"
4149
key_path = state_root / f"{name}_id_rsa"
4250
pub_path = state_root / f"{name}_id_rsa.pub"
4351
self._generate_ssh_keypair(key_path)

lb_provisioner/providers/multipass.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,18 @@ def provision(self, request: ProvisioningRequest) -> List[ProvisionedNode]:
3838
if not shutil.which("multipass"):
3939
raise ProvisioningError("Multipass CLI not found in PATH")
4040

41-
count = max(1, min(request.count, MAX_NODES))
41+
if request.node_names:
42+
names = list(request.node_names)
43+
count = len(names)
44+
else:
45+
names = []
46+
count = max(1, min(request.count, MAX_NODES))
4247
nodes: List[ProvisionedNode] = []
4348
state_root = request.state_dir or self.base_state_dir
4449
state_root.mkdir(parents=True, exist_ok=True)
4550

46-
for _ in range(count):
47-
vm_name = f"lb-worker-{uuid.uuid4().hex[:8]}"
51+
for idx in range(count):
52+
vm_name = names[idx] if names else f"lb-worker-{uuid.uuid4().hex[:8]}"
4853
key_path = state_root / f"{vm_name}_id_rsa"
4954
pub_path = state_root / f"{vm_name}_id_rsa.pub"
5055
self._generate_ephemeral_keys(key_path)

0 commit comments

Comments
 (0)