Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
7231f00
DAOS-17916 test: Verify page eviction on MD on SSD
phender Nov 25, 2025
d126fe1
Updates.
phender Nov 25, 2025
3184187
Merge branch 'master' into hendersp/DAOS-17916
phender Dec 3, 2025
b385403
Update metrics.
phender Dec 3, 2025
fe66b75
Fix expected metric values after pool create.
phender Dec 3, 2025
254b1e8
Updates for mdtest_utils.py
phender Dec 4, 2025
8715379
Merge branch 'master' into hendersp/DAOS-17916
phender Dec 8, 2025
96a55d9
Fix mdtest env.
phender Dec 8, 2025
1e29d5d
Merge branch 'master' into hendersp/DAOS-17916
phender Dec 9, 2025
7d7112d
Fix mdtest env again.
phender Dec 9, 2025
2444880
Use POSIX container.
phender Dec 9, 2025
a463680
Update mdtest command
phender Dec 9, 2025
3837dda
Fix
phender Dec 9, 2025
a17c79f
Set mdtest log file name.
phender Dec 10, 2025
9aeca07
Fix call.
phender Dec 10, 2025
1a4fa3f
Update mdtest params.
phender Dec 10, 2025
295c098
Cleanup.
phender Dec 11, 2025
fefd811
Merge branch 'master' into hendersp/DAOS-17916
phender Dec 11, 2025
a9ca47c
Increase mdtest timeout
phender Dec 11, 2025
f829519
Further increase timeout.
phender Dec 11, 2025
1a5c295
Increase pool size.
phender Dec 11, 2025
15e5b32
Further increase timeout
phender Dec 11, 2025
1438e09
Adding more clients.
phender Dec 12, 2025
4a37c1d
Merge branch 'master' into hendersp/DAOS-17916
phender Dec 16, 2025
553dc6b
Adding more clients; moving to HW Large
phender Dec 16, 2025
ecbf012
Debug.
phender Dec 17, 2025
aac104c
Updates.
phender Dec 19, 2025
e07888b
Merge branch 'master' into hendersp/DAOS-17916
phender Dec 19, 2025
b2df760
Add debug.
phender Dec 19, 2025
7922669
Merge branch 'master' into hendersp/DAOS-17916
phender Jan 6, 2026
76a4fa3
Test fixes.
phender Jan 6, 2026
d661c2d
Move to HW Medium
phender Jan 7, 2026
db2cada
Remove mdtest depth arg
phender Jan 8, 2026
ba39b80
Merge branch 'master' into hendersp/DAOS-17916
phender Jan 8, 2026
f2b1552
Remove debug.
phender Jan 8, 2026
382d817
Merge branch 'master' into hendersp/DAOS-17916
phender Jan 14, 2026
06a8c40
Debug
phender Jan 14, 2026
2f08c8a
Smaller write size
phender Jan 15, 2026
2f54bd5
Reducing pool size.
phender Jan 16, 2026
ddafeca
Merge branch 'master' into hendersp/DAOS-17916
phender Jan 16, 2026
8f1e99c
Updated mdtest -n calculation
phender Jan 16, 2026
92fc178
Adjusting num_of_files_dirs.
phender Jan 17, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 124 additions & 0 deletions src/tests/ftest/pool/eviction_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
"""
(C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP

SPDX-License-Identifier: BSD-2-Clause-Patent
"""
import json
import math

from job_manager_utils import get_job_manager
from mdtest_utils import MDTEST_NAMESPACE, run_mdtest
from telemetry_test_base import TestWithTelemetry


class EvictionMetrics(TestWithTelemetry):
    """
    Tests DAOS client eviction from a pool that the client is using.

    :avocado: recursive
    """

    def test_eviction_metrics(self):
        """Verify page eviction on the pool.

        1. Create a pool with a mem ratio of 100% (for pmem or phase 1) or 25% (for phase 2)
        2. Collect a baseline for the pool eviction metrics
        3. Run mdtest -a DFS to generate many small files larger than mem size
        4. Collect new page eviction metrics
        5. Verify page eviction

        :avocado: tags=all,daily_regression
        :avocado: tags=hw,medium
        :avocado: tags=pool
        :avocado: tags=EvictionMetrics,test_eviction_metrics
        """
        write_bytes = self.params.get('write_bytes', MDTEST_NAMESPACE, None)
        processes = self.params.get('processes', MDTEST_NAMESPACE, None)
        ppn = self.params.get('ppn', MDTEST_NAMESPACE, None)

        # Fail early with a clear message instead of a TypeError in the arithmetic below
        if write_bytes is None or (processes is None and ppn is None):
            self.fail('The mdtest write_bytes and either processes or ppn must be set in the yaml')

        evict_metrics = list(self.telemetry.ENGINE_POOL_VOS_CACHE_METRICS)

        self.log_step('Creating a pool (dmg pool create)')
        pool = self.get_pool(connect=False)
        try:
            # dmg pool create json output provides the per-tier sizes and the memory file size
            _result = json.loads(pool.dmg.result.stdout)
            tier_bytes_scm = int(_result["response"]["tier_bytes"][0])
            mem_file_bytes = int(_result["response"]["mem_file_bytes"])
        except (KeyError, IndexError, TypeError, ValueError) as error:
            self.fail(f"Error extracting data from dmg pool create output: {error}")

        # Calculate the mdtest files_per_process based upon the scm size and other mdtest params.
        # When ppn is defined the total number of write processes is ppn * client node count.
        _write_processes = processes
        if ppn is not None:
            _write_processes = ppn * len(self.host_info.clients.hosts)
        files_per_process = math.floor(mem_file_bytes / (write_bytes * _write_processes))
        if tier_bytes_scm > mem_file_bytes:
            # Write more (125%) files to exceed mem_file_bytes and cause eviction
            mdtest_params = {"num_of_files_dirs": math.ceil(files_per_process * 1.25)}
        else:
            # Write less (75%) files to avoid out of space errors
            mdtest_params = {"num_of_files_dirs": math.floor(files_per_process * 0.75)}

        self.log.debug("-" * 60)
        self.log.debug("Pool %s create data:", pool)
        self.log.debug("  tier_bytes_scm:  %s", tier_bytes_scm)
        self.log.debug("  mem_file_bytes:  %s", mem_file_bytes)
        self.log.debug("  mem_ratio.value: %s", pool.mem_ratio.value)
        self.log.debug("Mdtest write parameters:")
        self.log.debug("  write_bytes:       %s", write_bytes)
        if ppn is not None:
            self.log.debug("  ppn / nodes:       %s / %s", ppn, len(self.host_info.clients.hosts))
        else:
            self.log.debug("  processes:         %s", processes)
        self.log.debug("  files_per_process: %s", files_per_process)
        self.log.debug("  num_of_files_dirs: %s", mdtest_params["num_of_files_dirs"])
        self.log.debug("  expected to write: %s",
                       _write_processes * write_bytes * mdtest_params["num_of_files_dirs"])
        self.log.debug("-" * 60)

        self.log_step('Creating a container (daos container create)')
        container = self.get_container(pool)

        # A defined mem_ratio indicates a MD on SSD phase 2 pool.  Phase 1 / pmem pools do
        # not use a page cache, so all of their cache metrics are expected to remain zero.
        phase2 = pool.mem_ratio.value is not None

        self.log_step(
            'Collect pool eviction metrics after creating a pool (dmg telemetry metrics query)')
        expected_ranges = self.telemetry.collect_data(evict_metrics)
        for metric in sorted(expected_ranges):
            for label in expected_ranges[metric]:
                if phase2 and metric.endswith('_hit'):
                    expected_ranges[metric][label] = [0, 100]    # 0-100 (phase 2)
                elif phase2 and metric.endswith(('_miss', '_ne')):
                    expected_ranges[metric][label] = [0, 5]      # 0-5 (phase 2)
                else:
                    expected_ranges[metric][label] = [0, 0]      # 0 only
        self.log.debug("%s expected_ranges: %s", pool, expected_ranges)

        self.log_step('Verify pool eviction metrics after pool creation')
        if not self.telemetry.verify_data(expected_ranges):
            self.fail('Pool eviction metrics verification failed after pool creation')

        self.log_step('Writing data to the pool (mdtest -a DFS)')
        manager = get_job_manager(self, subprocess=False, timeout=None)
        run_mdtest(
            self, self.hostlist_clients, self.workdir, None, container, processes, ppn, manager,
            mdtest_params=mdtest_params)

        self.log_step(
            'Collect pool eviction metrics after writing data (dmg telemetry metrics query)')
        expected_ranges = self.telemetry.collect_data(evict_metrics)
        for metric in sorted(expected_ranges):
            for label in expected_ranges[metric]:
                if not phase2:
                    expected_ranges[metric][label] = [0, 0]           # 0 only (phase 1)
                elif metric.endswith('_page_flush'):
                    expected_ranges[metric][label] = [0]              # 0 or greater (phase 2)
                else:
                    expected_ranges[metric][label] = [1, 10000000]    # 1-10,000,000 (phase 2)
        self.log.debug("%s expected_ranges: %s", pool, expected_ranges)

        self.log_step('Verify pool eviction metrics after writing data')
        if not self.telemetry.verify_data(expected_ranges):
            self.fail('Pool eviction metrics verification failed after writing data')

        self.log_step('Test passed')
42 changes: 42 additions & 0 deletions src/tests/ftest/pool/eviction_metrics.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Test yaml for pool/eviction_metrics.py (EvictionMetrics.test_eviction_metrics).
launch:
  # By default only run the default pool variant; other mux variants run when selected.
  !filter-only : /run/pool/default # yamllint disable-line rule:colons

hosts:
  test_servers: 1
  test_clients: 3

# Long timeout: mdtest writes enough small files to exceed the pool memory file size.
timeout: 16000

server_config:
  name: daos_server
  engines_per_host: 1
  engines:
    0:
      targets: 4
      nr_xs_helpers: 0
      storage: auto

# Two pool variants: 'default' (no mem_ratio; phase 1 / pmem behavior expected) and
# 'md_on_ssd_p2' (25% mem_ratio so page cache eviction can occur — phase 2 behavior).
pool: !mux
  default:
    size: 10%
  md_on_ssd_p2:
    size: 10%
    mem_ratio: 25

container:
  type: POSIX
  oclass: S1
  dir_oclass: SX

# mdtest write parameters; num_of_files_dirs is computed by the test at runtime from
# the pool's mem_file_bytes, write_bytes, and the total write process count.
mdtest:
  dfs_oclass: S1
  dfs_dir_oclass: SX
  dfs_destroy: False
  manager: "MPICH"
  ppn: 32
  test_dir: "/"
  api: DFS
  flags: "-C -F -G 27 -N 1 -Y -u -L"
  branching_factor: 1
  write_bytes: 1024
  read_bytes: 1024
Loading
Loading