Skip to content

Commit 82662c9

Browse files
authored
Merge pull request #748 from NVIDIA/am/genai-k8s
2 parents 21290b1 + d45c22e commit 82662c9

File tree

4 files changed, with 61 additions and 91 deletions.

src/cloudai/systems/kubernetes/kubernetes_system.py

Lines changed: 48 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,6 @@
2626
if TYPE_CHECKING:
2727
import kubernetes as k8s
2828

29-
3029
from cloudai.core import BaseJob, System
3130
from cloudai.util.lazy_imports import lazy
3231

@@ -325,54 +324,63 @@ def _check_model_server(self) -> bool:
325324
logging.warning("Invalid JSON response from model server")
326325
return False
327326

327+
def _get_frontend_pod_name(self) -> str:
328+
for pod in self.core_v1.list_namespaced_pod(namespace=self.default_namespace).items:
329+
labels = pod.metadata.labels
330+
logging.debug(f"Found pod: {pod.metadata.name} with labels: {labels}")
331+
if labels and str(labels.get("nvidia.com/dynamo-component", "")).lower() == "frontend":
332+
return pod.metadata.name
333+
raise RuntimeError("No frontend pod found for the job")
334+
328335
def _run_genai_perf(self, job: KubernetesJob) -> None:
329336
from cloudai.workloads.ai_dynamo.ai_dynamo import AIDynamoTestDefinition
330337

331-
test_definition = job.test_run.test
332-
if not isinstance(test_definition, AIDynamoTestDefinition):
338+
tdef = job.test_run.test
339+
if not isinstance(tdef, AIDynamoTestDefinition):
333340
raise TypeError("Test definition must be an instance of AIDynamoTestDefinition")
334341

335-
python_exec = test_definition.python_executable
336-
if not python_exec or not python_exec.venv_path:
337-
raise ValueError("Python executable path not set - executable may not be installed")
338-
339-
genai_perf_args_obj = test_definition.cmd_args.genai_perf
340-
if not genai_perf_args_obj:
341-
raise ValueError("GenAI perf args not set")
342+
genai_perf_results_path = "/tmp/cloudai/genai-perf"
342343

343-
output_path = job.test_run.output_path
344-
if not output_path:
345-
raise ValueError("Output path not set")
346-
347-
genai_perf_args = genai_perf_args_obj.model_dump()
348-
args = [f"--artifact-dir={output_path.absolute()}"]
349-
extra_args = None
350-
351-
for k, v in genai_perf_args.items():
352-
if k == "extra-args":
353-
extra_args = str(v)
354-
else:
355-
args.append(f"--{k}={v}")
344+
genai_perf_cmd = ["genai-perf", "profile", f"--artifact-dir={genai_perf_results_path}"]
345+
for k, v in tdef.cmd_args.genai_perf.model_dump(
346+
exclude={"extra_args", "extra-args"}, exclude_none=True
347+
).items():
348+
genai_perf_cmd.append(f"--{k}={v}")
349+
if extra_args := tdef.cmd_args.genai_perf.extra_args:
350+
genai_perf_cmd.extend(extra_args.split())
351+
logging.debug(f"GenAI perf arguments: {genai_perf_cmd=}")
356352

357-
if extra_args:
358-
args.append(extra_args)
359-
args_str = " ".join(args)
353+
frontend_pod = self._get_frontend_pod_name()
360354

361-
venv_path = python_exec.venv_path.absolute()
362-
cmd = f"{venv_path}/bin/genai-perf profile {args_str}"
363-
logging.debug(f"Running GenAI performance test: {cmd}")
364-
result: subprocess.CompletedProcess | None = None
355+
logging.debug(f"Executing genai-perf in pod={frontend_pod} cmd={genai_perf_cmd}")
365356
try:
366-
result = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True)
367-
logging.debug("GenAI performance test completed successfully")
368-
except subprocess.CalledProcessError as e:
369-
logging.error(f"GenAI performance test failed: {e.stderr}")
370-
371-
if result:
372-
with (job.test_run.output_path / "stdout.txt").open("w") as f:
373-
f.write(result.stdout)
374-
with (job.test_run.output_path / "stderr.txt").open("w") as f:
375-
f.write(result.stderr)
357+
genai_results = lazy.k8s.stream.stream(
358+
self.core_v1.connect_get_namespaced_pod_exec,
359+
name=frontend_pod,
360+
namespace=self.default_namespace,
361+
command=genai_perf_cmd,
362+
stderr=True,
363+
stdin=False,
364+
stdout=True,
365+
tty=False,
366+
_request_timeout=60 * 10,
367+
)
368+
with (job.test_run.output_path / "genai_perf.log").open("w") as f:
369+
f.write(genai_results)
370+
except lazy.k8s.client.ApiException as e:
371+
logging.error(f"Error executing genai-perf command in pod '{frontend_pod}': {e}")
372+
373+
cp_logs_cmd = " ".join(
374+
[
375+
"kubectl",
376+
"cp",
377+
f"{self.default_namespace}/{frontend_pod}:{genai_perf_results_path}",
378+
str(job.test_run.output_path / "genai-perf"),
379+
]
380+
)
381+
logging.debug(f"Copying genai-perf results with command: {cp_logs_cmd}")
382+
p = subprocess.run(cp_logs_cmd, shell=True, capture_output=True, text=True)
383+
logging.debug(f"Returned code {p.returncode}, stdout: {p.stdout}, stderr: {p.stderr}")
376384

377385
def _check_deployment_conditions(self, conditions: list) -> bool:
378386
logging.debug(f"Checking deployment conditions: {conditions}")

src/cloudai/workloads/ai_dynamo/ai_dynamo.py

Lines changed: 8 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@
2020

2121
from pydantic import AliasChoices, BaseModel, ConfigDict, Field
2222

23-
from cloudai.core import DockerImage, File, GitRepo, HFModel, Installable, JobStatusResult, PythonExecutable, TestRun
23+
from cloudai.core import DockerImage, File, GitRepo, HFModel, Installable, JobStatusResult, TestRun
2424
from cloudai.models.workload import CmdArgs, TestDefinition
2525

2626
from .report_generation_strategy import CSV_FILES_PATTERN, JSON_FILES_PATTERN
@@ -106,6 +106,12 @@ class GenAIPerfArgs(BaseModel):
106106

107107
model_config = ConfigDict(extra="allow")
108108

109+
extra_args: str | None = Field(
110+
default=None,
111+
serialization_alias="extra-args",
112+
validation_alias=AliasChoices("extra-args", "extra_args"),
113+
)
114+
109115

110116
class AIDynamoCmdArgs(CmdArgs):
111117
"""Arguments for AI Dynamo."""
@@ -126,11 +132,6 @@ class AIDynamoTestDefinition(TestDefinition):
126132
dynamo_repo: GitRepo = GitRepo(
127133
url="https://github.com/ai-dynamo/dynamo.git", commit="f7e468c7e8ff0d1426db987564e60572167e8464"
128134
)
129-
genai_perf_repo: GitRepo = GitRepo(
130-
url="https://github.com/triton-inference-server/perf_analyzer.git",
131-
commit="3c0bc9efa1844a82dfcc911f094f5026e6dd9214",
132-
)
133-
_python_executable: Optional[PythonExecutable] = None
134135
_hf_model: HFModel | None = None
135136

136137
@property
@@ -147,15 +148,7 @@ def hf_model(self) -> HFModel:
147148

148149
@property
149150
def installables(self) -> list[Installable]:
150-
return [self.docker_image, self.script, self.dynamo_repo, self.python_executable, self.hf_model]
151-
152-
@property
153-
def python_executable(self) -> PythonExecutable:
154-
if not self._python_executable:
155-
self._python_executable = PythonExecutable(
156-
GitRepo(url=self.genai_perf_repo.url, commit=self.genai_perf_repo.commit),
157-
)
158-
return self._python_executable
151+
return [self.docker_image, self.script, self.dynamo_repo, self.hf_model]
159152

160153
def was_run_successful(self, tr: TestRun) -> JobStatusResult:
161154
output_path = tr.output_path

src/cloudai/workloads/ai_dynamo/kubernetes_json_gen_strategy.py

Lines changed: 3 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -14,9 +14,6 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17-
import logging
18-
import subprocess
19-
from pathlib import Path
2017
from typing import Any, Dict, cast
2118

2219
import yaml
@@ -32,31 +29,6 @@ class AIDynamoKubernetesJsonGenStrategy(JsonGenStrategy):
3229

3330
DEPLOYMENT_FILE_NAME = "deployment.yaml"
3431

35-
def _install_python_packages(self, repo_root: Path, venv_pip: Path) -> None:
36-
installs = [
37-
("perf_analyzer", repo_root),
38-
("genai-perf", repo_root / "genai-perf"),
39-
]
40-
41-
for package, path in installs:
42-
install_cmd = f"cd {path} && {venv_pip} install ."
43-
logging.info(f"Installing {package} with command: {install_cmd}")
44-
subprocess.run(install_cmd, shell=True, capture_output=True, text=True, check=True)
45-
46-
def _setup_genai(self, td: AIDynamoTestDefinition) -> None:
47-
python_exec = td.python_executable
48-
if not python_exec.venv_path:
49-
raise ValueError(
50-
f"The virtual environment for git repo {python_exec.git_repo} does not exist. "
51-
"Please ensure to run installation before running the test."
52-
)
53-
54-
venv_pip = python_exec.venv_path.absolute() / "bin" / "pip"
55-
assert python_exec.git_repo.installed_path
56-
repo_root = python_exec.git_repo.installed_path.absolute()
57-
58-
self._install_python_packages(repo_root, venv_pip)
59-
6032
def gen_frontend_dict(self) -> dict[str, Any]:
6133
system = cast(KubernetesSystem, self.system)
6234
tdef = cast(AIDynamoTestDefinition, self.test_run.test)
@@ -113,21 +85,19 @@ def gen_json(self) -> Dict[Any, Any]:
11385
td = cast(AIDynamoTestDefinition, self.test_run.test)
11486
k8s_system = cast(KubernetesSystem, self.system)
11587

116-
self._setup_genai(td)
117-
11888
deployment = {
11989
"apiVersion": "nvidia.com/v1alpha1",
12090
"kind": "DynamoGraphDeployment",
12191
"metadata": {"name": k8s_system.default_namespace},
12292
"spec": {
12393
"services": {
124-
"Frontend": self.gen_frontend_dict(),
125-
"VllmDecodeWorker": self.gen_decode_dict(),
94+
"frontend": self.gen_frontend_dict(),
95+
"decode": self.gen_decode_dict(),
12696
},
12797
},
12898
}
12999
if td.cmd_args.dynamo.prefill_worker:
130-
deployment["spec"]["services"]["VllmPrefillWorker"] = self.gen_prefill_dict()
100+
deployment["spec"]["services"]["prefill"] = self.gen_prefill_dict()
131101

132102
with (self.test_run.output_path / self.DEPLOYMENT_FILE_NAME).open("w") as f:
133103
yaml.safe_dump(deployment, f)

tests/json_gen_strategy/test_ai_dynamo.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -187,7 +187,6 @@ def test_gen_json(json_gen: AIDynamoKubernetesJsonGenStrategy) -> None:
187187
k8s_system = cast(KubernetesSystem, json_gen.system)
188188
tdef = cast(AIDynamoTestDefinition, json_gen.test_run.test)
189189
json_gen.test_run.output_path.mkdir(parents=True, exist_ok=True)
190-
json_gen._setup_genai = lambda td: None
191190

192191
deployment = json_gen.gen_json()
193192

@@ -196,11 +195,11 @@ def test_gen_json(json_gen: AIDynamoKubernetesJsonGenStrategy) -> None:
196195
assert deployment.get("metadata", {}).get("name") == k8s_system.default_namespace
197196

198197
if tdef.cmd_args.dynamo.prefill_worker:
199-
assert "VllmPrefillWorker" in deployment.get("spec", {}).get("services", {})
198+
assert "prefill" in deployment.get("spec", {}).get("services", {})
200199
else:
201200
assert "spec" in deployment
202201
assert "services" in deployment["spec"]
203-
assert "VllmPrefillWorker" not in deployment["spec"]["services"]
202+
assert "prefill" not in deployment["spec"]["services"]
204203

205204
with open(json_gen.test_run.output_path / json_gen.DEPLOYMENT_FILE_NAME, "r") as f:
206205
content = yaml.safe_load(f)

0 commit comments

Comments
 (0)