
Commit 568e5f9

Merge pull request #6 from boeschf/eval/env_var
Environment variables
2 parents ca851a9 + 6adb031 commit 568e5f9

7 files changed: +222 −6 lines changed


examples/alps-ml-vetting/config.yaml

Lines changed: 19 additions & 1 deletion
@@ -3,6 +3,24 @@ scheduler: slurm
 pip:
   index_url: "https://jfrog.svc.cscs.ch/artifactory/api/pypi/pypi-remote/simple"
 evals:
+  - name: Environment Variables
+    type: vetnode.evaluations.env_var_eval.EnvVarEval
+    expected:
+      CUDA_CACHE_DISABLE: "1"
+      NCCL_NET: "AWS Libfabric"
+      NCCL_CROSS_NIC: "1"
+      NCCL_NET_GDR_LEVEL: "PHB"
+      NCCL_PROTO: "^LL128"
+      FI_PROVIDER: "cxi"
+      FI_CXI_DEFAULT_CQ_SIZE: "131072"
+      FI_CXI_DEFAULT_TX_SIZE: "16384"
+      FI_CXI_DISABLE_HOST_REGISTER: "1"
+      FI_CXI_RDZV_PROTO: "alt_read"
+      FI_CXI_RDZV_EAGER_SIZE: "0"
+      FI_CXI_RDZV_GET_MIN: "0"
+      FI_CXI_RDZV_THRESHOLD: "0"
+      FI_CXI_RX_MATCH_MODE: "hybrid"
+      FI_MR_CACHE_MONITOR: "userfaultfd"
   - name: Check GPU
     type: vetnode.evaluations.gpu_eval.GPUEval
     max_temp: 30
@@ -56,4 +74,4 @@ evals:
     runs: 3
     requirements:
       - ['torch', '--index-url', 'https://download.pytorch.org/whl/cu130']
-      - numpy
+      - numpy
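
Note that EnvVarEval (added by this commit, source in src/vetnode/evaluations/env_var_eval.py below) compares these values verbatim as strings against os.environ, so numeric settings such as FI_CXI_DEFAULT_CQ_SIZE must match the exported text exactly. A minimal standalone sketch of that comparison, reusing two variable names from this config (the sketch itself is not part of the commit):

import os

# Two entries copied from the "expected" block above; the eval treats every
# value as a plain string and never parses numbers.
expected = {"FI_CXI_DEFAULT_CQ_SIZE": "131072", "NCCL_NET": "AWS Libfabric"}

for key, want in expected.items():
    have = os.environ.get(key)
    if have is None:
        print(f"{key}: missing")
    elif have != want:
        print(f"{key}: mismatched (expected {want!r}, actual {have!r})")
    else:
        print(f"{key}: ok")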

examples/image-vet/config.yaml

Lines changed: 19 additions & 1 deletion
@@ -3,6 +3,24 @@ scheduler: slurm
 pip:
   index_url: "https://jfrog.svc.cscs.ch/artifactory/api/pypi/pypi-remote/simple"
 evals:
+  - name: Environment Variables
+    type: vetnode.evaluations.env_var_eval.EnvVarEval
+    expected:
+      CUDA_CACHE_DISABLE: "1"
+      NCCL_NET: "AWS Libfabric"
+      NCCL_CROSS_NIC: "1"
+      NCCL_NET_GDR_LEVEL: "PHB"
+      NCCL_PROTO: "^LL128"
+      FI_PROVIDER: "cxi"
+      FI_CXI_DEFAULT_CQ_SIZE: "131072"
+      FI_CXI_DEFAULT_TX_SIZE: "16384"
+      FI_CXI_DISABLE_HOST_REGISTER: "1"
+      FI_CXI_RDZV_PROTO: "alt_read"
+      FI_CXI_RDZV_EAGER_SIZE: "0"
+      FI_CXI_RDZV_GET_MIN: "0"
+      FI_CXI_RDZV_THRESHOLD: "0"
+      FI_CXI_RX_MATCH_MODE: "hybrid"
+      FI_MR_CACHE_MONITOR: "userfaultfd"
   - name: CudaKernel
     type: vetnode.evaluations.cuda_eval.CUDAEval
     cuda_home: /usr/local/cuda
@@ -52,4 +70,4 @@ evals:
     runs: 3
     requirements:
      - ['torch', '--index-url', 'https://download.pytorch.org/whl/cu130']
-      - numpy
+      - numpy

requirements-testing.txt

Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
 pytest
+pytest-asyncio
 build
-twine
+twine

src/vetnode/cli.py

Lines changed: 2 additions & 2 deletions
@@ -87,7 +87,7 @@ def diagnose(config,skip_install,verbose) -> None:
                 click.secho(f"[{hostname}-{main_context.rank}: Vetted] ", fg='green',nl=False)
         else:
             if verbose:
-                click.secho(f"[{hostname}-{main_context.rank}:Cordon] ", fg='red',nl=False)
+                click.secho(f"[{hostname}-{main_context.rank}: Cordon] ", fg='red',nl=False)
             sys.exit(1)

 @click.command()
@@ -310,4 +310,4 @@ def load_requirements(requirements: List[str], index_url: str = None):
             cmd.append(package)
         else:
             cmd += package
-    subprocess.check_call(cmd)
+    subprocess.check_call(cmd)
src/vetnode/evaluations/env_var_eval.py

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+from __future__ import annotations
+
+import os
+from typing import Literal
+from pydantic import Field
+
+from vetnode.evaluations.base_eval import BaseEval
+from vetnode.evaluations.models import EvalResultStatus
+
+
+class EnvVarEval(BaseEval):
+    """
+    Vetnode evaluation that verifies expected environment variables are present
+    and (optionally) match expected values.
+
+    Example config concept:
+        expected:
+          VAR: "value"     # exact string
+          SOME_VAR: null   # must exist, any value
+    """
+
+    name: str
+    type: Literal["vetnode.evaluations.env_var_eval.EnvVarEval"]
+
+    # Map of env var -> expected value (str) OR None to mean "must exist"
+    expected: dict[str, str | None] = Field(default_factory=dict)
+
+    async def check(self, executor) -> tuple[EvalResultStatus, dict]:
+        found: dict[str, str | None] = {}
+        missing: list[str] = []
+        mismatched: dict[str, dict[str, str | None]] = {}
+
+        for key, value in (self.expected or {}).items():
+            actual = os.environ.get(key)
+            found[key] = actual
+
+            if actual is None:
+                missing.append(key)
+                continue
+
+            if value is not None and actual != value:
+                mismatched[key] = {"expected": value, "actual": actual}
+
+        ok = (not missing) and (not mismatched)
+        status = EvalResultStatus.SUCCESS if ok else EvalResultStatus.FAILED
+
+        metrics = {
+            "expected": self.expected,
+            "found": found,
+            "missing": missing,
+            "mismatched": mismatched,
+        }
+        return status, metrics
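
For a quick manual run outside pytest, the evaluation can be driven the same way the new tests below drive it. The EvalContext field values and the None executor argument are copied from those tests and are assumptions about a standalone setup, not the only valid combination:

import asyncio
import os

from vetnode.evaluations.env_var_eval import EnvVarEval
from vetnode.evaluations.models import EvalContext

# Context built exactly as in tests/test_env_var_eval.py
ctx = EvalContext(scheduler="standalone", rank=0, local_rank=0, eval_id=0,
                  world_size=1, master_addr="localhost", master_port=29500)

os.environ["FI_CXI_RX_MATCH_MODE"] = "hybrid"

ev = EnvVarEval(ctx, name="env", type="vetnode.evaluations.env_var_eval.EnvVarEval",
                expected={"FI_CXI_RX_MATCH_MODE": "hybrid", "SOME_FLAG": None})

# check() is a coroutine; the tests pass None as the executor
status, metrics = asyncio.run(ev.check(None))
print(status, metrics["missing"], metrics["mismatched"])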

src/vetnode/evaluations/models.py

Lines changed: 1 addition & 1 deletion
@@ -94,4 +94,4 @@ class BandwidthSize(ByteSize):
         'eb/s': 2**60,
     }
     byte_string_pattern = r'^\s*(\d*\.?\d+)\s*([\w\/]+)?'
-    byte_string_re = re.compile(byte_string_pattern, re.IGNORECASE)
+    byte_string_re = re.compile(byte_string_pattern, re.IGNORECASE)
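
Aside from the trailing-newline fix, the pattern shown above captures a number followed by an optional unit token. A small standalone check of that behaviour, reusing the pattern verbatim outside the BandwidthSize class:

import re

byte_string_pattern = r'^\s*(\d*\.?\d+)\s*([\w\/]+)?'
byte_string_re = re.compile(byte_string_pattern, re.IGNORECASE)

m = byte_string_re.match("12.5 GB/s")
print(m.group(1), m.group(2))  # prints: 12.5 GB/s (number and unit captured separately)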

tests/test_env_var_eval.py

Lines changed: 126 additions & 0 deletions
@@ -0,0 +1,126 @@
+import pytest
+import textwrap
+from click.testing import CliRunner
+
+from vetnode.evaluations.models import EvalContext, EvalResultStatus
+from vetnode.evaluations.env_var_eval import EnvVarEval
+from vetnode.configuration import Configuration
+from vetnode.cli import build_context, load_evals, diagnose
+
+
+def mk_ctx() -> EvalContext:
+    return EvalContext(
+        scheduler="standalone",
+        rank=0,
+        local_rank=0,
+        eval_id=0,
+        world_size=1,
+        master_addr="localhost",
+        master_port=29500,
+    )
+
+
+@pytest.mark.asyncio
+async def test_must_exist_passes_when_set(monkeypatch):
+    monkeypatch.setenv("SOME_FLAG", "")
+    ev = EnvVarEval(mk_ctx(), name="env", type="vetnode.evaluations.env_var_eval.EnvVarEval",
+                    expected={"SOME_FLAG": None})
+
+    status, metrics = await ev.check(None)
+    assert status == EvalResultStatus.SUCCESS
+    assert metrics["missing"] == []
+    assert metrics["mismatched"] == {}
+
+
+@pytest.mark.asyncio
+async def test_must_exist_fails_when_missing(monkeypatch):
+    monkeypatch.delenv("SOME_FLAG", raising=False)
+    ev = EnvVarEval(mk_ctx(), name="env", type="vetnode.evaluations.env_var_eval.EnvVarEval",
+                    expected={"SOME_FLAG": None})
+
+    status, metrics = await ev.check(None)
+    assert status == EvalResultStatus.FAILED
+    assert metrics["missing"] == ["SOME_FLAG"]
+
+
+@pytest.mark.asyncio
+async def test_exact_match(monkeypatch):
+    monkeypatch.setenv("FI_CXI_RX_MATCH_MODE", "hybrid")
+    ev = EnvVarEval(mk_ctx(), name="env", type="vetnode.evaluations.env_var_eval.EnvVarEval",
+                    expected={"FI_CXI_RX_MATCH_MODE": "hybrid"})
+
+    status, _ = await ev.check(None)
+    assert status == EvalResultStatus.SUCCESS
+
+
+@pytest.mark.asyncio
+async def test_mismatch(monkeypatch):
+    monkeypatch.setenv("FI_CXI_RX_MATCH_MODE", "wrong")
+    ev = EnvVarEval(mk_ctx(), name="env", type="vetnode.evaluations.env_var_eval.EnvVarEval",
+                    expected={"FI_CXI_RX_MATCH_MODE": "hybrid"})
+
+    status, metrics = await ev.check(None)
+    assert status == EvalResultStatus.FAILED
+    assert "FI_CXI_RX_MATCH_MODE" in metrics["mismatched"]
+
+
+def test_yaml_populates_expected_env(tmp_path):
+    cfg = tmp_path / "cfg.yaml"
+    cfg.write_text(textwrap.dedent("""
+        name: test-config
+        scheduler: standalone
+        evals:
+          - name: env-check
+            type: vetnode.evaluations.env_var_eval.EnvVarEval
+            expected:
+              SOME_FLAG: null
+              FI_CXI_RX_MATCH_MODE: hybrid
+        """).lstrip())
+
+    # Make Configuration read this YAML
+    Configuration._yaml_file = str(cfg)
+    conf = Configuration()
+
+    ctx = build_context(conf)
+    evals = load_evals(ctx, conf.evals)
+
+    assert len(evals) == 1
+    ev = evals[0]
+
+    assert ev.expected["SOME_FLAG"] is None
+    assert ev.expected["FI_CXI_RX_MATCH_MODE"] == "hybrid"
+
+
+@pytest.mark.parametrize(
+    "env, expected_exit, expected_token",
+    [
+        ({"FI_CXI_RX_MATCH_MODE": "hybrid", "FI_MR_CACHE_MONITOR": "userfaultfd"}, 0, ": Vetted"),
+        ({"FI_CXI_RX_MATCH_MODE": "nope", "FI_MR_CACHE_MONITOR": "userfaultfd"}, 1, ": Cordon"),
+        ({}, 1, ": Cordon"),
+    ],
+)
+def test_diagnose_env_eval(tmp_path, monkeypatch, env, expected_exit, expected_token):
+    # Set/clear relevant env vars
+    for k in ["FI_CXI_RX_MATCH_MODE", "FI_MR_CACHE_MONITOR"]:
+        monkeypatch.delenv(k, raising=False)
+    for k, v in env.items():
+        monkeypatch.setenv(k, v)
+
+    cfg = tmp_path / "cfg.yaml"
+    cfg.write_text(textwrap.dedent("""
+        name: test-config
+        scheduler: standalone
+        evals:
+          - name: env-check
+            type: vetnode.evaluations.env_var_eval.EnvVarEval
+            expected:
+              FI_CXI_RX_MATCH_MODE: hybrid
+              FI_MR_CACHE_MONITOR: userfaultfd
+        """).lstrip())
+
+    runner = CliRunner()
+    result = runner.invoke(diagnose, ["--skip-install", "--verbose", str(cfg)])
+
+    print(result.output)
+    assert result.exit_code == expected_exit
+    assert expected_token in result.output
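
These tests are coroutines marked with @pytest.mark.asyncio, which is why pytest-asyncio was added to requirements-testing.txt above. One way to run just this module programmatically, assuming it is executed from the repository root with pytest and pytest-asyncio installed:

import pytest

# Run only the new test module and exit with pytest's return code
raise SystemExit(pytest.main(["-q", "tests/test_env_var_eval.py"]))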
