Skip to content

Commit 84f5280

Browse files
HyeockJinKimclaude
andauthored
fix(BA-4550): use persistent directory for Prometheus multiprocess metrics (#9114)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 8ed8080 commit 84f5280

File tree

3 files changed

+164
-13
lines changed

3 files changed

+164
-13
lines changed

changes/9114.fix.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix a Prometheus metrics scrape crash that occurred when the multiprocess directory under `/tmp` was cleaned up by the OS, by using a persistent directory (`/var/run/backendai/prometheus/`) outside `/tmp`.

src/ai/backend/common/metrics/multiprocess.py

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,18 @@
2929

3030
_multiprocess_dir: Path | None = None
3131

32+
_DEFAULT_BASE_DIR = Path(tempfile.gettempdir()) / "backendai" / "prometheus"
3233

33-
def setup_prometheus_multiprocess_dir(component: str = "manager") -> Path:
34+
35+
def setup_prometheus_multiprocess_dir(
36+
component: str = "manager",
37+
) -> Path:
3438
"""
3539
Set up the prometheus multiprocess directory and environment variable.
3640
3741
MUST be called before any prometheus_client import.
3842
39-
Creates a temporary directory for prometheus multiprocess files and sets
43+
Creates a directory for prometheus multiprocess files and sets
4044
the PROMETHEUS_MULTIPROC_DIR environment variable.
4145
4246
Args:
@@ -50,15 +54,15 @@ def setup_prometheus_multiprocess_dir(component: str = "manager") -> Path:
5054
if _multiprocess_dir is not None:
5155
return _multiprocess_dir
5256

53-
base_dir = Path(tempfile.gettempdir()) / "backendai-prometheus"
54-
base_dir.mkdir(parents=True, exist_ok=True)
57+
multiprocess_dir = _DEFAULT_BASE_DIR / component
58+
multiprocess_dir.mkdir(parents=True, exist_ok=True)
5559

56-
multiprocess_dir = Path(
57-
tempfile.mkdtemp(
58-
prefix=f"{component}-",
59-
dir=base_dir,
60-
)
61-
)
60+
# Clean stale .db files from previous runs
61+
for db_file in multiprocess_dir.glob("*.db"):
62+
try:
63+
db_file.unlink()
64+
except OSError:
65+
pass
6266

6367
os.environ["PROMETHEUS_MULTIPROC_DIR"] = str(multiprocess_dir)
6468
_multiprocess_dir = multiprocess_dir
@@ -75,9 +79,31 @@ def generate_latest_multiprocess() -> bytes:
7579
7680
This should be used by multi-worker components (manager, agent, storage, etc.).
7781
"""
78-
registry = CollectorRegistry()
79-
MultiProcessCollector(registry) # type: ignore[no-untyped-call]
80-
return generate_latest(registry)
82+
try:
83+
registry = CollectorRegistry()
84+
MultiProcessCollector(registry) # type: ignore[no-untyped-call]
85+
return generate_latest(registry)
86+
except ValueError:
87+
# Directory may have been deleted (e.g., by systemd-tmpfiles-clean).
88+
# Attempt to recreate it and retry once.
89+
if _multiprocess_dir is not None:
90+
try:
91+
_multiprocess_dir.mkdir(parents=True, exist_ok=True)
92+
os.environ["PROMETHEUS_MULTIPROC_DIR"] = str(_multiprocess_dir)
93+
registry = CollectorRegistry()
94+
MultiProcessCollector(registry) # type: ignore[no-untyped-call]
95+
log.warning(
96+
"Prometheus multiprocess dir was missing and has been recreated: %s",
97+
_multiprocess_dir,
98+
)
99+
return generate_latest(registry)
100+
except Exception:
101+
log.error(
102+
"Failed to recover prometheus multiprocess dir: %s",
103+
_multiprocess_dir,
104+
exc_info=True,
105+
)
106+
return b""
81107

82108

83109
def generate_latest_singleprocess() -> bytes:
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
from __future__ import annotations
2+
3+
import os
4+
from collections.abc import Iterator
5+
from pathlib import Path
6+
from unittest.mock import patch
7+
8+
import pytest
9+
10+
import ai.backend.common.metrics.multiprocess as mp_mod
11+
from ai.backend.common.metrics.multiprocess import (
12+
cleanup_prometheus_multiprocess_dir,
13+
generate_latest_multiprocess,
14+
setup_prometheus_multiprocess_dir,
15+
)
16+
17+
18+
@pytest.fixture(autouse=True)
def _reset_multiprocess_state() -> Iterator[None]:
    """Isolate every test from the module-level prometheus multiprocess state.

    Before the test runs, the current ``mp_mod._multiprocess_dir`` and the
    ``PROMETHEUS_MULTIPROC_DIR`` environment variable are saved and then
    cleared, so each test starts from the "not set up yet" state.  After the
    test, both are put back to the saved values.
    """
    saved_dir = mp_mod._multiprocess_dir
    saved_env = os.environ.get("PROMETHEUS_MULTIPROC_DIR")

    # setup_prometheus_multiprocess_dir() returns early when the module global
    # is already set, so a leftover value would make the setup tests pass or
    # fail without exercising the code under test.
    mp_mod._multiprocess_dir = None
    os.environ.pop("PROMETHEUS_MULTIPROC_DIR", None)

    yield

    mp_mod._multiprocess_dir = saved_dir
    if saved_env is None:
        os.environ.pop("PROMETHEUS_MULTIPROC_DIR", None)
    else:
        os.environ["PROMETHEUS_MULTIPROC_DIR"] = saved_env
29+
30+
31+
class TestSetupPrometheusMultiprocDir:
    """Tests for ``setup_prometheus_multiprocess_dir()``.

    Every test redirects ``mp_mod._DEFAULT_BASE_DIR`` to pytest's ``tmp_path``
    so nothing is written outside the per-test temporary directory.
    """

    def test_creates_directory_with_default_base(self, tmp_path: Path) -> None:
        """The component directory is created under the base dir and exported."""
        with patch.object(mp_mod, "_DEFAULT_BASE_DIR", tmp_path):
            created = setup_prometheus_multiprocess_dir("manager")

        expected = tmp_path / "manager"
        assert created == expected
        assert created.is_dir()
        assert os.environ["PROMETHEUS_MULTIPROC_DIR"] == str(created)

    def test_cleans_stale_db_files(self, tmp_path: Path) -> None:
        """Pre-existing ``*.db`` files are removed; other files are left alone."""
        component_dir = tmp_path / "manager"
        component_dir.mkdir(parents=True)
        stale_gauge = component_dir / "gauge_liveall_123.db"
        stale_counter = component_dir / "counter_456.db"
        unrelated = component_dir / "keep_this.txt"
        for leftover in (stale_gauge, stale_counter, unrelated):
            leftover.touch()

        with patch.object(mp_mod, "_DEFAULT_BASE_DIR", tmp_path):
            setup_prometheus_multiprocess_dir("manager")

        assert not stale_gauge.exists()
        assert not stale_counter.exists()
        assert unrelated.exists()

    def test_idempotent_returns_same_path(self, tmp_path: Path) -> None:
        """Two calls with the same component yield the same path."""
        with patch.object(mp_mod, "_DEFAULT_BASE_DIR", tmp_path):
            initial = setup_prometheus_multiprocess_dir("manager")
            repeated = setup_prometheus_multiprocess_dir("manager")

        assert initial == repeated

    def test_idempotent_ignores_different_component(self, tmp_path: Path) -> None:
        """Once setup, calling again with different component still returns the first path."""
        with patch.object(mp_mod, "_DEFAULT_BASE_DIR", tmp_path):
            initial = setup_prometheus_multiprocess_dir("manager")
            other = setup_prometheus_multiprocess_dir("agent")

        # The second call must not switch directories; the first result wins.
        assert initial == other
68+
69+
70+
class TestGenerateLatestMultiprocess:
    """Tests for ``generate_latest_multiprocess()`` and its recovery path."""

    def test_returns_bytes_normally(self, tmp_path: Path) -> None:
        """With an existing multiprocess directory the result is ``bytes``."""
        prom_dir = tmp_path / "test-component"
        prom_dir.mkdir(parents=True)
        os.environ["PROMETHEUS_MULTIPROC_DIR"] = str(prom_dir)
        mp_mod._multiprocess_dir = prom_dir

        result = generate_latest_multiprocess()
        assert isinstance(result, bytes)

    def test_recovers_from_missing_directory(self, tmp_path: Path) -> None:
        """A directory deleted after setup is recreated on the next scrape."""
        prom_dir = tmp_path / "test-component"
        prom_dir.mkdir(parents=True)
        os.environ["PROMETHEUS_MULTIPROC_DIR"] = str(prom_dir)
        mp_mod._multiprocess_dir = prom_dir

        # Simulate systemd-tmpfiles-clean deleting the directory
        prom_dir.rmdir()
        assert not prom_dir.exists()

        result = generate_latest_multiprocess()
        # Should recover by recreating the directory
        assert isinstance(result, bytes)
        assert prom_dir.exists()

    def test_returns_empty_bytes_on_unrecoverable_failure(self, tmp_path: Path) -> None:
        """When the directory cannot be recreated the result is ``b""``."""
        # Nest the target under a regular file: creating a directory there
        # fails for every user, including root.  An absolute path such as
        # "/nonexistent/..." can be created when the suite runs as root,
        # which would make this test fail and leave directories in "/".
        blocker = tmp_path / "not-a-directory"
        blocker.touch()
        unusable_dir = blocker / "prometheus"
        mp_mod._multiprocess_dir = unusable_dir
        os.environ["PROMETHEUS_MULTIPROC_DIR"] = str(unusable_dir)

        result = generate_latest_multiprocess()
        assert result == b""
102+
103+
104+
class TestCleanupPrometheusMultiprocDir:
    """Tests for ``cleanup_prometheus_multiprocess_dir()``."""

    def test_removes_directory_and_env(self, tmp_path: Path) -> None:
        """After cleanup the directory, the env var and the module global are gone."""
        target = tmp_path / "manager"
        target.mkdir(parents=True)
        target.joinpath("test.db").touch()
        mp_mod._multiprocess_dir = target
        os.environ["PROMETHEUS_MULTIPROC_DIR"] = str(target)

        cleanup_prometheus_multiprocess_dir()

        assert not target.exists()
        assert "PROMETHEUS_MULTIPROC_DIR" not in os.environ
        assert mp_mod._multiprocess_dir is None

    def test_noop_when_not_initialized(self) -> None:
        """Cleanup without prior setup must complete without raising."""
        mp_mod._multiprocess_dir = None
        os.environ.pop("PROMETHEUS_MULTIPROC_DIR", None)

        # Should not raise
        cleanup_prometheus_multiprocess_dir()

0 commit comments

Comments
 (0)