From 4932c9ab1cd6aeb67e6001d1b5fb9817b224af8d Mon Sep 17 00:00:00 2001 From: Srikanth Muppandam Date: Fri, 29 Aug 2025 19:09:16 +0530 Subject: [PATCH 1/2] stressapptest: add validation run.sh and README - Support for common CLI arguments and safe/dry-run modes - Dynamic resource detection (CPUs, memory, disk, network) - Strict mode checking for OOM, panics, I/O errors - Default localhost fallback if no IP is detected - Added README.md with install instructions, examples, and usage Signed-off-by: Srikanth Muppandam --- .../Kernel/Stress/Stressapptest/README.md | 227 +++++++ .../suites/Kernel/Stress/Stressapptest/run.sh | 569 ++++++++++++++++++ 2 files changed, 796 insertions(+) create mode 100644 Runner/suites/Kernel/Stress/Stressapptest/README.md create mode 100755 Runner/suites/Kernel/Stress/Stressapptest/run.sh diff --git a/Runner/suites/Kernel/Stress/Stressapptest/README.md b/Runner/suites/Kernel/Stress/Stressapptest/README.md new file mode 100644 index 00000000..ebd96755 --- /dev/null +++ b/Runner/suites/Kernel/Stress/Stressapptest/README.md @@ -0,0 +1,227 @@ +# Stressapptest Validation + +This test validates system stability using [stressapptest](https://github.com/stressapptest/stressapptest). + +## Overview + +`stressapptest` is a stress-testing tool for CPU, memory, disk and networking, widely used in reliability testing for servers and embedded systems. + +This wrapper script adds: + +- **Cgroup-aware** memory sizing with safety guards +- **Safe** and **Strict** modes +- Post-run **dmesg** scanning (toggleable) +- Auto detection for CPUs / memory / mounts / IP +- Optional **auto** setup for disk and network tests +- Looping with per-loop and aggregate **JSON** summaries +- CPU pinning via **taskset** (if available) or **cpuset cgroups** (root, when supported) + +## Prerequisites + +- `stressapptest` must be installed and available in `PATH`. 
+ +Optional tools (the wrapper works without them, but features degrade gracefully): + +- `taskset` (if present, used for CPU pinning) +- Writable **cpuset** cgroups (kernel feature; used for pinning when `taskset` is absent) +- `df` (for auto disk selection), `ip`/`hostname` (for auto network) +- `getconf`/**`nproc`** (or `/proc/cpuinfo`) for CPU counting + +Build from source (typical host): +```bash +git clone https://github.com/stressapptest/stressapptest.git +cd stressapptest +./configure +make +sudo make install +``` + +Yocto image: + +IMAGE_INSTALL:append = " stressapptest" + +Side-load: + +scp stressapptest user@target:/usr/local/bin/ + +Usage + +./run.sh [options] + +Options forwarded to stressapptest + +(These map 1:1 to stressapptest flags.) + +-M <MB> : Memory to test (default: auto; see memory sizing below) + +-s <sec> : Duration (default: 300; safe mode: 120) + +-m <threads> : Memory copy threads (default: online CPUs; safe: ~half, up to 4) + +-W : More CPU-stressful memory copy + +-n <ip> : Network client thread to <ip> + +--listen : Listen thread (for networking) + +-f <file> : Disk thread using <file> + +-F : Use libc memcpy + +-l <file> : Log file (default: ./Stressapptest.log) + +-v <level> : Verbosity 0–20 (default: 8) + + +Wrapper-specific options + +--safe : Conservative sizing and CPU subset + +--dry-run : Print the command that would run and exit + +--strict : Fail run if severe dmesg issues are detected + +--auto-net[=primary|loopback] : Start local listener and set -n automatically +(default mode: primary; falls back to loopback if no primary IP) + +--auto-disk : Pick a writable mount and create a tempfile for -f + +--auto : Shorthand for --auto-net --auto-disk + + +Memory sizing knobs (cgroup-aware) + +--mem-pct=<P> : Percent of available RAM to use (default 60; safe: 35) + +--mem-headroom-mb=<MB> : Keep this many MB free (default 256; safe: 512) + +--mem-cap-mb=<MB> : Hard upper cap on -M + +--require-mem-mb=<MB> : Refuse to run if computed target < <MB> MB + + +Control & reporting + +--loops=<N> : Repeat test N times (default 1) + +--loop-delay=<S> : Sleep S seconds between loops (default 0) + +--json=<file> : Write line-delimited JSON per loop + final aggregate + + +> You can also supply most of these via environment variables (e.g. SAFE=1, MEM_CAP_MB=256, JSON_OUT=summary.json, LOOPS=3). + + + +Examples + +Run for 60s using auto sizing: + +./run.sh -s 60 + +Safer profile (shorter, fewer threads, more headroom): + +./run.sh --safe + +Low-memory guard (refuse to run < 512 MB): + +./run.sh --require-mem-mb=512 + +Cap memory and add extra headroom: + +./run.sh --mem-cap-mb=256 --mem-headroom-mb=512 + +Multiple loops with JSON summary (and strict dmesg checks): + +./run.sh --loops=5 --loop-delay=10 --json=summary.json --strict + +Auto network + auto disk: + +./run.sh --auto + +Dry run (show exact command that would execute): + +./run.sh --dry-run + +CPU usage & pinning + +The wrapper starts one stressapptest process with -m <threads> (defaults to online CPUs). +Those workers are threads—not separate processes—so ps typically shows a single process. + +Pinning behavior: + +1. If taskset exists → the process is pinned to the CPU list (logged as CPU pinning method: taskset (...)). + + +2. Else, if cpuset cgroups are available (and writable) → the wrapper confines the process to that CPU list +(logged as CPU pinning method: cgroup cpuset (...)). + + +3. Else → runs unpinned (logged as CPU pinning method: none). 
+ + + + +How to verify + +Count threads: + +PID=$(pidof stressapptest) +grep '^Threads:' /proc/$PID/status +# or +ls -1 /proc/$PID/task | wc -l + +See allowed CPUs: + +PID=$(pidof stressapptest) +awk '/Cpus_allowed_list/ {print $2}' /proc/$PID/status + +Check cpuset cgroup (if used): + +cat /proc/$(pidof stressapptest)/cgroup +# then inspect matching cpuset.cpus file under /sys/fs/cgroup/... + +Memory sizing (how it’s computed) + +1. Determine available memory (prefer cgroup limit/usage if present; otherwise MemAvailable). + + +2. Take available * mem_pct (default 60%; --safe uses 35%). + + +3. Reserve headroom (--mem-headroom-mb; default 256 MB; safe: 512 MB). + + +4. Apply hard cap (--mem-cap-mb) if set. + + +5. Clamp to sane floor (≥ 16 MB) and not above “available minus headroom”. + + +6. If --require-mem-mb=N and computed < N, the run aborts. + + +The final value is passed to stressapptest as -M. + +Output + +Result: ./Stressapptest.res → PASS or FAIL + +Log file: ./Stressapptest.log + +If --json is used: line-delimited JSON entries per loop and a final aggregate. + + +Notes + +By default you’ll see one stressapptest process; workers are threads (use /proc/$PID/task to list them). + +Auto disk selection avoids RO/system mounts and picks the largest free writable mount for -f. + +Auto network starts a local listen thread and chooses a primary IP (falling back to loopback). + +--- + +License + +The test runner: BSD-3-Clause-Clear (Qualcomm Technologies, Inc. and/or its subsidiaries). +stressapptest is licensed by its upstream author; see its repository for details. diff --git a/Runner/suites/Kernel/Stress/Stressapptest/run.sh b/Runner/suites/Kernel/Stress/Stressapptest/run.sh new file mode 100755 index 00000000..8d080893 --- /dev/null +++ b/Runner/suites/Kernel/Stress/Stressapptest/run.sh @@ -0,0 +1,569 @@ +#!/bin/sh +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause-Clear
+#
+# stressapptest/run.sh
+# Safe wrapper around stressapptest for embedded/CI use.
+# Memory-safety features: --mem-pct, --mem-cap-mb, --mem-headroom-mb,
+# --require-mem-mb, cgroup-aware sizing, JSON summaries, loops, NUMA, etc.
+
+###############################################################################
+# Boilerplate: locate and source init_env + functestlib.sh
+###############################################################################
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+INIT_ENV=""
+SEARCH="$SCRIPT_DIR"
+# Walk up from the script directory until an init_env file is found.
+while [ "$SEARCH" != "/" ]; do
+    if [ -f "$SEARCH/init_env" ]; then
+        INIT_ENV="$SEARCH/init_env"
+        break
+    fi
+    SEARCH=$(dirname "$SEARCH")
+done
+
+if [ -z "$INIT_ENV" ]; then
+    echo "[ERROR] Could not find init_env (starting at $SCRIPT_DIR)" >&2
+    exit 1
+fi
+
+if [ -z "$__INIT_ENV_LOADED" ]; then
+    # shellcheck disable=SC1090
+    . "$INIT_ENV"
+fi
+# shellcheck disable=SC1090,SC1091
+. "$TOOLS/functestlib.sh"
+
+TESTNAME="Stressapptest"
+RES_FILE="./${TESTNAME}.res"
+LOG_FILE="./${TESTNAME}.log"
+test_path="$(find_test_case_by_name "$TESTNAME" 2>/dev/null || echo "$SCRIPT_DIR")"
+cd "$test_path" || exit 1
+
+log_info "----------------------------------------------------------------------"
+log_info "------------------- Starting $TESTNAME Testcase ----------------------"
+log_info "=== Test Initialization ==="
+
+###############################################################################
+# Usage
+###############################################################################
+# Print the command-line help text to stdout.
+print_usage() {
+cat <<EOF
+Usage: $0 [options]
+
+Options forwarded to stressapptest:
+  -M <MB>              Memory to test (default: ~60% MemAvailable or --mem-pct)
+  -s <sec>             Duration seconds (default: 300; safe: 120)
+  -m <threads>         Memory copy threads (default: online CPUs; safe: ~half)
+  -W                   More CPU-stressful memory copy
+  -n <ip>              Network client thread to <ip>
+  --listen             Run a listen thread for network tests
+  -f <file>            Add a disk thread using tempfile <file>
+  -F                   Use libc memcpy (skip per-transaction result check)
+  -l <file>            Log file (default: ./Stressapptest.log)
+  -v <level>           Verbosity 0-20 (default: 8)
+
+Wrapper-specific (CLI flags and/or ENV equivalents):
+  --safe               (SAFE=1) Conservative limits
+  --dry-run            (DRYRUN=1) Print command and exit
+  --strict             (STRICT=1) Fail on critical dmesg issues
+  --auto-net[=mode]    (AUTO_NET=1, AUTO_NET_MODE=primary|loopback)
+  --auto-disk          (AUTO_DISK=1)
+  --auto               shorthand for --auto-net --auto-disk
+
+Memory sizing (cgroup-aware):
+  --mem-pct=<P>          (MEM_PCT) Percent of MemAvailable (def 60; safe 35)
+  --mem-cap-mb=<MB>      (MEM_CAP_MB) Hard upper cap in MB
+  --mem-headroom-mb=<MB> (MEM_HEADROOM_MB) Reserve MB from available
+  --require-mem-mb=<MB>  (REQUIRE_MEM_MB) Require at least <MB> MB or FAIL
+
+Control / reporting:
+  --loops=<N>          (LOOPS) Repeat N times (def 1)
+  --loop-delay=<S>     (LOOP_DELAY) Sleep S sec between loops (def 0)
+  --json=<file>        (JSON_OUT) Write JSON per loop + aggregate
+EOF
+}
+
+###############################################################################
+# Parse CLI (+ allow ENV defaults)
+###############################################################################
+# Record FAIL in the result file and exit; used for invalid invocations.
+fail_usage() {
+    log_fail "$1"
+    echo "$TESTNAME FAIL" >"$RES_FILE"
+    exit 1
+}
+
+# Abort when a value-taking option is the last argument. Without this check the
+# trailing "shift" of the parse loop runs with nothing left to shift.
+#   $1 = option name, $2 = number of arguments remaining (option included)
+require_value() {
+    [ "$2" -ge 2 ] || fail_usage "Option $1 requires a value"
+}
+
+# Abort when a setting is neither empty (meaning "use the default") nor a
+# non-negative integer. These values are later used in [ -eq/-gt ] tests and
+# \$(( )) arithmetic, where a non-numeric value would make the test error out.
+#   $1 = label for the message, $2 = value
+require_uint() {
+    case "$2" in
+        '') return 0 ;;
+        *[!0-9]*) fail_usage "$1 must be a non-negative integer (got '$2')" ;;
+    esac
+}
+
+# ENV defaults first (so CLI can override)
+SAFE="${SAFE:-0}"; DRYRUN="${DRYRUN:-0}"; STRICT="${STRICT:-0}"
+AUTO_NET="${AUTO_NET:-0}"; AUTO_NET_MODE="${AUTO_NET_MODE:-primary}"
+AUTO_DISK="${AUTO_DISK:-0}"
+MEM_PCT="${MEM_PCT:-}"; MEM_CAP_MB="${MEM_CAP_MB:-}"; MEM_HEADROOM_MB="${MEM_HEADROOM_MB:-}"
+REQUIRE_MEM_MB="${REQUIRE_MEM_MB:-}"
+LOOPS="${LOOPS:-1}"; LOOP_DELAY="${LOOP_DELAY:-0}"
+JSON_OUT="${JSON_OUT:-}"
+
+USER_M="" USER_S="" USER_m="" USER_W=0 USER_n="" USER_listen=0 USER_f="" USER_F=0 USER_l="" USER_v=""
+
+while [ $# -gt 0 ]; do
+    case "$1" in
+        -M) require_value "$1" "$#"; shift; USER_M="$1" ;;
+        -s) require_value "$1" "$#"; shift; USER_S="$1" ;;
+        -m) require_value "$1" "$#"; shift; USER_m="$1" ;;
+        -W) USER_W=1 ;;
+        -n) require_value "$1" "$#"; shift; USER_n="$1" ;;
+        --listen) USER_listen=1 ;;
+        -f) require_value "$1" "$#"; shift; USER_f="$1" ;;
+        -F) USER_F=1 ;;
+        -l) require_value "$1" "$#"; shift; USER_l="$1" ;;
+        -v) require_value "$1" "$#"; shift; USER_v="$1" ;;
+        --safe) SAFE=1 ;;
+        --dry-run) DRYRUN=1 ;;
+        --strict) STRICT=1 ;;
+        --auto-net) AUTO_NET=1 ;;
+        --auto-net=*) AUTO_NET=1; AUTO_NET_MODE="${1#--auto-net=}" ;;
+        --auto-disk) AUTO_DISK=1 ;;
+        --auto) AUTO_NET=1; AUTO_DISK=1 ;;
+        --mem-pct=*) MEM_PCT="${1#--mem-pct=}" ;;
+        --mem-cap-mb=*) MEM_CAP_MB="${1#--mem-cap-mb=}" ;;
+        --mem-headroom-mb=*) MEM_HEADROOM_MB="${1#--mem-headroom-mb=}" ;;
+        --require-mem-mb=*) REQUIRE_MEM_MB="${1#--require-mem-mb=}" ;;
+        --loops=*) LOOPS="${1#--loops=}" ;;
+        --loop-delay=*) LOOP_DELAY="${1#--loop-delay=}" ;;
+        --json=*) JSON_OUT="${1#--json=}" ;;
+        -h|--help) print_usage; exit 0 ;;
+        *) log_warn "Ignoring unknown option: $1" ;;
+    esac
+    shift
+done
+
+# Validate simple enums
+case "$AUTO_NET_MODE" in
+    primary|loopback) : ;;
+    *) log_warn "Unknown --auto-net mode '$AUTO_NET_MODE', using 'primary'"; AUTO_NET_MODE="primary" ;;
+esac
+
+# Validate everything that is later treated as a number.
+require_uint "SAFE" "$SAFE"
+require_uint "DRYRUN" "$DRYRUN"
+require_uint "STRICT" "$STRICT"
+require_uint "AUTO_NET" "$AUTO_NET"
+require_uint "AUTO_DISK" "$AUTO_DISK"
+require_uint "--mem-pct" "$MEM_PCT"
+require_uint "--mem-cap-mb" "$MEM_CAP_MB"
+require_uint "--mem-headroom-mb" "$MEM_HEADROOM_MB"
+require_uint "--require-mem-mb" "$REQUIRE_MEM_MB"
+require_uint "--loops" "$LOOPS"
+require_uint "--loop-delay" "$LOOP_DELAY"
+require_uint "-M" "$USER_M"
+require_uint "-s" "$USER_S"
+require_uint "-m" "$USER_m"
+require_uint "-v" "$USER_v"
+# Zero loops would run nothing and still report PASS.
+[ "$LOOPS" -ge 1 ] || fail_usage "--loops must be at least 1 (got '$LOOPS')"
+
+###############################################################################
+# Dependencies (getconf optional; fallback if missing)
+###############################################################################
+check_dependencies stressapptest awk grep sed || {
+    log_skip "$TESTNAME SKIP - required tools missing"
+    echo "$TESTNAME SKIP" >"$RES_FILE"
+    exit 0
+}
+# Optional tools never block the run; report the missing ones once so a
+# degraded feature (pinning, auto-disk, auto-net) is explainable from the log.
+missing_opt=""
+for t in df cut head tail sort tr stat ip hostname nproc getconf taskset; do
+    command -v "$t" >/dev/null 2>&1 || missing_opt="$missing_opt $t"
+done
+if [ -n "$missing_opt" ]; then
+    log_info "Optional tools not found:$missing_opt"
+fi
+
+###############################################################################
+# Helpers
+###############################################################################
+# Expand "0-3,5,7" into "0 1 2 3 5 7"
+expand_list() {
+    in="$1"; oldIFS=$IFS; IFS=,; out=""
+    for part in $in; do
+        part=$(printf "%s" "$part" | tr -d ' ')
+        case "$part" in
+            *-*)
+                a=${part%-*}; b=${part#*-}; i="$a"
+                # A malformed range makes the test error out, which ends the loop.
+                while [ "$i" -le "$b" ] 2>/dev/null; do
+                    out="$out $i"
+                    i=$((i+1))
+                done
+                ;;
+            '') ;;
+            *) out="$out $part" ;;
+        esac
+    done
+    IFS=$oldIFS; printf "%s\n" "$out"
+}
+
+# Detect online CPUs (prefer /sys; else getconf; else nproc; else /proc/cpuinfo)
+# Sets ONLINE_STR (list string), ONLINE_CPUS (space-separated ids), CPU_COUNT.
+detect_online_cpus() {
+    ONLINE_STR=""
+    if [ -r /sys/devices/system/cpu/online ]; then
+        ONLINE_STR=$(cat /sys/devices/system/cpu/online 2>/dev/null)
+    fi
+    if [ -z "$ONLINE_STR" ]; then
+        n=""
+        if command -v getconf >/dev/null 2>&1; then n=$(getconf _NPROCESSORS_ONLN 2>/dev/null); fi
+        if [ -z "$n" ] || ! [ "$n" -gt 0 ] 2>/dev/null; then
+            if command -v nproc >/dev/null 2>&1; then n=$(nproc 2>/dev/null); fi
+        fi
+        if [ -z "$n" ] || ! [ "$n" -gt 0 ] 2>/dev/null; then
+            n=$(awk -F: '/^processor[ \t]*:/{c++} END{print c+0}' /proc/cpuinfo 2>/dev/null)
+        fi
+        [ -z "$n" ] && n=1
+        # Only a count is known here, so synthesize the list "0,1,...,n-1".
+        i=0; out=""
+        while [ "$i" -lt "$n" ]; do out="$out,$i"; i=$((i+1)); done
+        ONLINE_STR="${out#,}"
+    fi
+    ONLINE_CPUS=$(expand_list "$ONLINE_STR")
+    [ -n "$ONLINE_CPUS" ] || ONLINE_CPUS="0"
+    # shellcheck disable=SC2086  # word splitting is intended: one id per line
+    CPU_COUNT=$(printf "%s\n" $ONLINE_CPUS | wc -l | tr -d ' ')
+}
+detect_online_cpus
+
+# Return the online NUMA nodes in the kernel's list format (e.g. "0-1" or
+# "0,2"). The string is passed through unchanged: it is written to cpuset.mems,
+# and rewriting "-" as "," would turn the range 0-3 into just nodes 0 and 3.
+nodes_online() {
+    if [ -r /sys/devices/system/node/online ]; then
+        cat /sys/devices/system/node/online
+    else
+        echo 0
+    fi
+}
+
+# Detect cpuset controller availability (cgroup v2 or v1)
+cpuset_supported() {
+    if [ -f /sys/fs/cgroup/cgroup.controllers ]; then
+        grep -qw cpuset /sys/fs/cgroup/cgroup.controllers
+        return $?
+    fi
+    [ -d /sys/fs/cgroup/cpuset ]
+}
+
+# Run a command confined to a CPU list via cpuset cgroups (v2 or v1).
+# Usage: run_with_cpuset "0-3,6" 'sh -c "your cmd"'
+# Returns the command's exit status. If the cgroup cannot be created or
+# configured (e.g. not root), the command still runs, unpinned, with a warning.
+run_with_cpuset() {
+    cpus="$1"; shift
+    cmd="$*"
+    cg=""; attach=""; mems=""
+
+    if [ -f /sys/fs/cgroup/cgroup.controllers ]; then
+        # cgroup v2
+        echo "+cpuset" > /sys/fs/cgroup/cgroup.subtree_control 2>/dev/null || true
+        cg="/sys/fs/cgroup/sat.$$"
+        attach="$cg/cgroup.procs"
+        mems="$(nodes_online)"
+    elif [ -d /sys/fs/cgroup/cpuset ]; then
+        # cgroup v1
+        cg="/sys/fs/cgroup/cpuset/sat.$$"
+        attach="$cg/tasks"
+        if [ -r /sys/fs/cgroup/cpuset/cpuset.mems ]; then
+            mems=$(cat /sys/fs/cgroup/cpuset/cpuset.mems)
+        else
+            mems="0"
+        fi
+    fi
+
+    # Fallback: cpuset not supported
+    if [ -z "$cg" ]; then
+        sh -c "$cmd"
+        return $?
+    fi
+
+    if mkdir "$cg" 2>/dev/null; then
+        if { echo "$cpus" > "$cg/cpuset.cpus" && echo "$mems" > "$cg/cpuset.mems"; } 2>/dev/null; then
+            # The child joins the cgroup itself and only then execs the
+            # workload, so no thread or child process starts outside the cpuset.
+            sh -c '{ echo "$$" > "$1"; } 2>/dev/null || echo "[WARN] could not join cpuset cgroup; running unpinned" >&2; exec sh -c "$2"' sat-cpuset "$attach" "$cmd" &
+            pid=$!
+            wait "$pid"; ret=$?
+            rmdir "$cg" 2>/dev/null || true
+            return "$ret"
+        fi
+        rmdir "$cg" 2>/dev/null || true
+    fi
+
+    log_warn "cpuset cgroup setup failed; running unpinned"
+    sh -c "$cmd"
+}
+
+###############################################################################
+# Core configuration (duration, memory, threads)
+###############################################################################
+if [ -n "$USER_S" ]; then
+    DURATION="$USER_S"
+elif [ "$SAFE" -eq 1 ]; then
+    DURATION=120
+else
+    DURATION=300
+fi
+
+# cgroup-aware available memory (kB), prefer v2; fallback to meminfo
+# Prints limit minus current usage in kB, or an empty line when no limit is set.
+cgroup_available_kb() {
+    # v2
+    if [ -r /sys/fs/cgroup/memory.max ] && [ -r /sys/fs/cgroup/memory.current ]; then
+        max=$(cat /sys/fs/cgroup/memory.max 2>/dev/null)
+        cur=$(cat /sys/fs/cgroup/memory.current 2>/dev/null)
+        if [ "$max" != "max" ] && [ -n "$max" ] && [ -n "$cur" ]; then
+            awk -v max="$max" -v cur="$cur" 'BEGIN{d=max-cur; if(d<0)d=0; print int(d/1024)}'
+            return
+        fi
+    fi
+    # v1
+    if [ -r /sys/fs/cgroup/memory/memory.limit_in_bytes ] && [ -r /sys/fs/cgroup/memory/memory.usage_in_bytes ]; then
+        max=$(cat /sys/fs/cgroup/memory/memory.limit_in_bytes 2>/dev/null)
+        cur=$(cat /sys/fs/cgroup/memory/memory.usage_in_bytes 2>/dev/null)
+        if [ -n "$max" ] && [ -n "$cur" ] && [ "$max" -gt 0 ] 2>/dev/null; then
+            awk -v max="$max" -v cur="$cur" 'BEGIN{d=max-cur; if(d<0)d=0; print int(d/1024)}'
+            return
+        fi
+    fi
+    echo ""
+}
+
+MEM_DEBUG=""
+AUTO_MEM_MB=""
+# Compute the automatic -M value. Sets AUTO_MEM_MB (MB) and MEM_DEBUG.
+# Must be called directly, not inside \$( ): a command substitution runs in a
+# subshell, and the MEM_DEBUG assignment would not reach the caller.
+calc_mem_mb() {
+    avail_kb=$(awk '/MemAvailable:/ {print $2; exit}' /proc/meminfo 2>/dev/null)
+    [ -z "$avail_kb" ] && avail_kb=$(awk '/MemFree:/ {print $2; exit}' /proc/meminfo 2>/dev/null)
+    [ -z "$avail_kb" ] && avail_kb=262144
+
+    # Use the cgroup figure only when it is the tighter bound. An "unlimited"
+    # cgroup v1 limit is a huge number and must not replace MemAvailable.
+    cg_note=""
+    cg_kb="$(cgroup_available_kb)"
+    if [ -n "$cg_kb" ] && [ "$cg_kb" -gt 0 ] 2>/dev/null && [ "$cg_kb" -lt "$avail_kb" ] 2>/dev/null; then
+        avail_kb="$cg_kb"; cg_note="(cgroup)"
+    fi
+
+    pct="$MEM_PCT"
+    if [ -z "$pct" ]; then
+        if [ "$SAFE" -eq 1 ]; then pct=35; else pct=60; fi
+    fi
+    # default headroom
+    if [ -n "$MEM_HEADROOM_MB" ]; then
+        head_mb="$MEM_HEADROOM_MB"
+    elif [ "$SAFE" -eq 1 ]; then
+        head_mb=512
+    else
+        head_mb=256
+    fi
+
+    use_kb=$(( avail_kb * pct / 100 ))
+    max_usable_kb=$(( avail_kb - head_mb * 1024 )); [ "$max_usable_kb" -lt 0 ] && max_usable_kb=0
+    [ "$use_kb" -gt "$max_usable_kb" ] && use_kb="$max_usable_kb"
+
+    if [ -n "$MEM_CAP_MB" ]; then
+        cap_kb=$(( MEM_CAP_MB * 1024 ))
+        [ "$use_kb" -gt "$cap_kb" ] && use_kb="$cap_kb"
+        cap_note="$MEM_CAP_MB MB"
+    else
+        cap_note="none"
+    fi
+
+    # Final sanity: floor 16MB, and never above max_usable
+    [ "$use_kb" -lt 16384 ] && use_kb=16384
+    [ "$use_kb" -gt "$max_usable_kb" ] && use_kb="$max_usable_kb"
+
+    MEM_DEBUG="MemAvailable=${avail_kb}kB${cg_note} pct=${pct}% headroom=${head_mb}MB cap=${cap_note}"
+    AUTO_MEM_MB=$(( use_kb / 1024 ))
+}
+
+if [ -n "$USER_M" ]; then
+    MEM_MB="$USER_M"
+    MEM_DEBUG="(user override) $MEM_MB MB"
+else
+    calc_mem_mb
+    MEM_MB="$AUTO_MEM_MB"
+fi
+
+# The headroom clamp can leave nothing to test; refuse instead of passing -M 0.
+if ! [ "$MEM_MB" -gt 0 ] 2>/dev/null; then
+    log_fail "Memory target is ${MEM_MB:-0} MB ($MEM_DEBUG); nothing to test"
+    echo "$TESTNAME FAIL" >"$RES_FILE"
+    exit 1
+fi
+
+# Hard requirement: require at least N MB or FAIL
+if [ -n "$REQUIRE_MEM_MB" ]; then
+    if [ "$MEM_MB" -lt "$REQUIRE_MEM_MB" ] 2>/dev/null; then
+        log_fail "Memory target $MEM_MB MB < required $REQUIRE_MEM_MB MB; refusing to run"
+        echo "$TESTNAME FAIL" >"$RES_FILE"
+        exit 1
+    fi
+fi
+
+# Threads
+if [ -n "$USER_m" ]; then
+    MEM_THREADS="$USER_m"
+else
+    if [ "$CPU_COUNT" -gt 1 ] && [ "$SAFE" -eq 1 ]; then
+        # Safe mode: about half of the online CPUs, between 1 and 4 threads.
+        half=$(( (CPU_COUNT + 1) / 2 )); [ "$half" -lt 1 ] && half=1; [ "$half" -gt 4 ] && half=4
+        MEM_THREADS="$half"
+    else
+        MEM_THREADS="$CPU_COUNT"
+    fi
+fi
+
+# CPU subset for SAFE mode 
+SUBSET_STR="$ONLINE_STR"
+if [ "$SAFE" -eq 1 ]; then
+    # Take the first MEM_THREADS online CPUs as the pinning set.
+    i=0; subset=""
+    for c in $ONLINE_CPUS; do
+        subset="$subset,$c"
+        i=$((i+1))
+        [ "$i" -ge "$MEM_THREADS" ] && break
+    done
+    SUBSET_STR="${subset#,}"
+fi
+
+VERBOSITY="${USER_v:-8}"
+LOG_FILE="${USER_l:-$LOG_FILE}"
+
+###############################################################################
+# Small string helpers
+###############################################################################
+# Quote a value for safe reuse inside an "sh -c" command string. Values made
+# only of harmless characters are returned unchanged so logged commands stay
+# readable; anything else is wrapped in single quotes.
+shell_quote() {
+    case "$1" in
+        ''|*[!A-Za-z0-9_./:=,@%+-]*)
+            printf "'%s'" "$(printf '%s' "$1" | sed "s/'/'\\\\''/g")"
+            ;;
+        *)
+            printf '%s' "$1"
+            ;;
+    esac
+}
+
+# Escape backslashes and double quotes so a value can sit in a JSON string.
+json_escape() {
+    printf '%s' "$1" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g'
+}
+
+###############################################################################
+# Auto NET + listener
+###############################################################################
+# Print the source address used for the default route, or nothing if unknown.
+pick_primary_ip() {
+    if command -v ip >/dev/null 2>&1; then
+        ip route get 1.1.1.1 2>/dev/null | awk '/src/ {for(i=1;i<=NF;i++) if ($i=="src"){print $(i+1); exit}}'
+    else
+        hostname -I 2>/dev/null | awk '{print $1}'
+    fi
+}
+LISTENER_PID=""; LISTEN_LOG=""
+if [ "$AUTO_NET" -eq 1 ]; then
+    if [ "$USER_listen" -eq 0 ]; then
+        LISTEN_LOG="./stressapptest-listener.log"
+        # A dry run only prints the command; it must not leave a listener behind.
+        if [ "$DRYRUN" -eq 1 ]; then
+            log_info "Auto-net: dry-run, not starting local listener"
+        else
+            log_info "Auto-net: starting local listener for ${DURATION}s"
+            stressapptest --listen -s $((DURATION + 15)) -l "$LISTEN_LOG" >/dev/null 2>&1 &
+            LISTENER_PID=$!
+        fi
+        USER_listen=1
+    fi
+    if [ -z "$USER_n" ]; then
+        if [ "$AUTO_NET_MODE" = "primary" ]; then
+            ip_addr="$(pick_primary_ip)"
+            if [ -n "$ip_addr" ]; then
+                USER_n="$ip_addr"; log_info "Auto-net: primary IP detected: $USER_n"
+            else
+                USER_n="127.0.0.1"; log_warn "Auto-net: primary IP unavailable; using loopback"
+            fi
+        else
+            USER_n="127.0.0.1"; log_info "Auto-net: using loopback (127.0.0.1)"
+        fi
+    fi
+fi
+
+###############################################################################
+# Auto DISK: pick writable mount
+###############################################################################
+TMPF=""
+if [ "$AUTO_DISK" -eq 1 ] && [ -z "$USER_f" ]; then
+    if command -v df >/dev/null 2>&1; then
+        best_mp=""; best_free=0
+        while read -r _dev mp fstype opts _; do
+            case "$fstype" in proc|sysfs|devtmpfs|devpts|cgroup*|pstore|debugfs|tracefs|configfs|securityfs|overlay) continue ;; esac
+            # Match "ro" only as a whole mount option. A word match on "ro"
+            # also hits rw mounts that carry "errors=remount-ro".
+            case ",$opts," in *,ro,*) continue ;; esac
+            # A read-write mount can still be unwritable for the current user.
+            [ -d "$mp" ] && [ -w "$mp" ] || continue
+            free=$(df -Pm "$mp" 2>/dev/null | awk 'NR==2 {print $4+0}')
+            [ -z "$free" ] && free=0
+            if [ "$free" -gt "$best_free" ]; then best_free="$free"; best_mp="$mp"; fi
+        done < /proc/mounts
+        if [ -n "$best_mp" ] && [ "$best_free" -gt 128 ]; then
+            TMPF="$best_mp/stressapptest.$$.tmp"; USER_f="$TMPF"
+            log_info "Auto-disk: '${best_mp}' (free=${best_free}M), file=$TMPF"
+        else
+            log_warn "Auto-disk: no suitable writable mount"
+        fi
+    else
+        log_warn "Auto-disk requested but 'df' not available; skipping"
+    fi
+fi
+
+# Stop the auto-started listener and remove the auto-created disk tempfile.
+cleanup_auto_bits() {
+    if [ -n "$LISTENER_PID" ]; then kill "$LISTENER_PID" 2>/dev/null; fi
+    if [ -n "$TMPF" ]; then rm -f "$TMPF" 2>/dev/null; fi
+}
+# Cleanup runs once, from the EXIT trap. The signal traps only convert the
+# signal into an exit; a handler that returns would let the run loop continue.
+trap cleanup_auto_bits EXIT
+trap 'exit 130' INT
+trap 'exit 143' TERM
+
+###############################################################################
+# dmesg patterns (always scan; STRICT decides fatal vs warn)
+###############################################################################
+DMESG_MODULES='oom|memory|BUG|hung task|soft lockup|hard lockup|rcu|page allocation failure|I/O error|AER|EDAC|Machine check'
+DMESG_EXCLUDE='using dummy regulator|not found|No NUMA|EEXIST|AER: Corrected error'
+DMESG_MODULES_STRICT='(Out of memory|oom-killer|invoked oom-killer|Kernel panic|panic|BUG:|Oops|general protection fault|Unable to handle kernel NULL pointer|Call Trace:|hung task|soft lockup|hard lockup|rcu_sched self-detected stall|page allocation failure|I/O error|EXT4-fs error|BTRFS: error|XFS .* Internal error|EDAC|Machine check|AER: Uncorrected)'
+DMESG_EXCLUDE_STRICT='thermal throttle|probe deferred|Bluetooth: hci0: advertising data|irq .* affinity broken|AER: Corrected'
+
+###############################################################################
+# Build SAT command
+###############################################################################
+# Print the stressapptest command line as one shell-safe string. User-supplied
+# values (address, file names) are quoted so spaces or shell metacharacters in
+# them cannot split or alter the command when it is run through "sh -c".
+build_sat_cmd() {
+    c="stressapptest -s $DURATION -M $MEM_MB -m $MEM_THREADS -v $VERBOSITY"
+    if [ "$USER_W" -eq 1 ]; then c="$c -W"; fi
+    if [ -n "$USER_n" ]; then c="$c -n $(shell_quote "$USER_n")"; fi
+    if [ "$USER_listen" -eq 1 ]; then c="$c --listen"; fi
+    if [ -n "$USER_f" ]; then c="$c -f $(shell_quote "$USER_f")"; fi
+    if [ "$USER_F" -eq 1 ]; then c="$c -F"; fi
+    if [ -n "$LOG_FILE" ]; then c="$c -l $(shell_quote "$LOG_FILE")"; fi
+    printf '%s\n' "$c"
+}
+
+# Keep wrap_affinity minimal: only taskset if present; cpuset fallback.
+# $1 = command string; prints the command wrapped for execution via "sh -c".
+wrap_affinity() {
+    c="$1"
+    if command -v taskset >/dev/null 2>&1; then
+        if [ "$SAFE" -eq 1 ]; then masklist="$SUBSET_STR"; else masklist="$ONLINE_STR"; fi
+        printf 'taskset -a -c %s sh -c %s\n' "$(shell_quote "$masklist")" "$(shell_quote "$c")"
+    else
+        printf 'sh -c %s\n' "$(shell_quote "$c")"
+    fi
+}
+
+###############################################################################
+# Looped execution
+###############################################################################
+PASS_COUNT=0 FAIL_COUNT=0 STRICT_FAIL=0
+if [ "$SAFE" -eq 1 ]; then MODE_NAME="SAFE"; else MODE_NAME="NORMAL"; fi
+RUN=1
+while [ "$RUN" -le "$LOOPS" ]; do
+    if [ "$LOOPS" -gt 1 ]; then log_info "===== Loop $RUN/$LOOPS ====="; fi
+
+    log_info "Mode: $MODE_NAME"
+    log_info "Online CPUs: $ONLINE_STR (count=$CPU_COUNT)"
+    if [ "$SAFE" -eq 1 ]; then log_info "CPU subset: $SUBSET_STR"; fi
+    log_info "Config: duration=${DURATION}s mem=${MEM_MB}MB threads=${MEM_THREADS} verbosity=$VERBOSITY"
+    log_info "Memory sizing: $MEM_DEBUG"
+    if [ -n "$USER_n" ]; then
+        log_info "Network: client to $USER_n (listener=$( [ "$USER_listen" -eq 1 ] && echo yes || echo no ))"
+    fi
+    if [ -n "$USER_f" ]; then log_info "Disk: tempfile $USER_f"; fi
+
+    CMD="$(build_sat_cmd)"
+    WRAPPED_CMD="$(wrap_affinity "$CMD")"
+    log_info "Command: $CMD"
+
+    if [ "$DRYRUN" -eq 1 ]; then
+        log_info "[Dry-run] Command that would execute (wrapped):"
+        echo "$WRAPPED_CMD"
+        echo "$TESTNAME DRY-RUN" >"$RES_FILE"
+        exit 0
+    fi
+
+    START_TS=$(date +%s)
+
+    if [ "$SAFE" -eq 1 ]; then masklist="$SUBSET_STR"; else masklist="$ONLINE_STR"; fi
+    if command -v taskset >/dev/null 2>&1; then
+        log_info "CPU pinning method: taskset ($masklist)"
+        sh -c "$WRAPPED_CMD" >>"$LOG_FILE" 2>&1
+        RET=$?
+    else
+        if cpuset_supported; then
+            log_info "CPU pinning method: cgroup cpuset ($masklist)"
+            # Redirect the function call itself: the workload inherits the
+            # redirection, so the log path needs no quoting inside a string.
+            run_with_cpuset "$masklist" "$CMD" >>"$LOG_FILE" 2>&1
+            RET=$?
+        else
+            log_info "CPU pinning method: none (no taskset/cpuset)"
+            sh -c "$CMD" >>"$LOG_FILE" 2>&1
+            RET=$?
+        fi
+    fi
+
+    END_TS=$(date +%s)
+    ELAPSED=$((END_TS - START_TS))
+
+    if [ "$RET" -eq 0 ]; then
+        log_pass "$TESTNAME: completed OK in ${ELAPSED}s"
+        PASS_COUNT=$((PASS_COUNT+1))
+    else
+        log_fail "$TESTNAME: returned $RET (see $LOG_FILE)"
+        FAIL_COUNT=$((FAIL_COUNT+1))
+    fi
+
+    # Always scan dmesg; STRICT decides whether to fail or warn.
+    if [ "$STRICT" -eq 1 ]; then
+        if scan_dmesg_errors "$SCRIPT_DIR" "$DMESG_MODULES_STRICT" "$DMESG_EXCLUDE_STRICT"; then
+            log_fail "Strict mode: critical kernel issues detected (see dmesg_errors*.log)"
+            STRICT_FAIL=1
+        else
+            log_info "Strict mode: no critical kernel issues detected"
+        fi
+    else
+        if scan_dmesg_errors "$SCRIPT_DIR" "$DMESG_MODULES" "$DMESG_EXCLUDE"; then
+            log_warn "Potential kernel messages detected (see dmesg_errors*.log)"
+        else
+            log_info "No concerning kernel messages in dmesg (non-strict)"
+        fi
+    fi
+
+    if [ -n "$JSON_OUT" ]; then
+        # One JSON object per line; string fields are escaped.
+        {
+            printf '{'
+            printf '"loop":%s,' "$RUN"
+            printf '"start_ts":%s,' "$START_TS"
+            printf '"end_ts":%s,' "$END_TS"
+            printf '"elapsed":%s,' "$ELAPSED"
+            printf '"ret":%s,' "$RET"
+            printf '"mem_mb":%s,' "$MEM_MB"
+            printf '"threads":%s,' "$MEM_THREADS"
+            printf '"duration_s":%s,' "$DURATION"
+            printf '"mode":"%s",' "$MODE_NAME"
+            printf '"mem_debug":"%s",' "$(json_escape "$MEM_DEBUG")"
+            printf '"log":"%s"' "$(json_escape "$LOG_FILE")"
+            printf '}\n'
+        } >> "$JSON_OUT"
+    fi
+
+    if [ "$RUN" -lt "$LOOPS" ] && [ "$LOOP_DELAY" -gt 0 ]; then sleep "$LOOP_DELAY"; fi
+    RUN=$((RUN+1))
+done
+
+if [ -n "$JSON_OUT" ]; then
+    # Keep the aggregate on a single line so the file stays line-delimited JSON.
+    printf '{"aggregate":{"loops":%s,"pass":%s,"fail":%s,"strict_fail":%s}}\n' \
+        "$LOOPS" "$PASS_COUNT" "$FAIL_COUNT" "$STRICT_FAIL" >> "$JSON_OUT"
+fi
+
+###############################################################################
+# Final result
+###############################################################################
+# A run that executed no loop at all (e.g. a non-numeric or zero loop count)
+# has validated nothing and must not be reported as PASS.
+TOTAL_RUNS=$((PASS_COUNT + FAIL_COUNT))
+if [ "$TOTAL_RUNS" -eq 0 ]; then
+    log_fail "$TESTNAME: no test loop was executed (loops=$LOOPS)"
+    FINAL_FAIL=1
+elif [ "$FAIL_COUNT" -gt 0 ] || [ "$STRICT_FAIL" -ne 0 ]; then
+    FINAL_FAIL=1
+else
+    FINAL_FAIL=0
+fi
+
+if [ "$FINAL_FAIL" -eq 0 ]; then
+    echo "$TESTNAME PASS" >"$RES_FILE"
+    exit 0
+else
+    echo "$TESTNAME FAIL" >"$RES_FILE"
+    exit 1
+fi
From e417ab6eef1b49ec9c8d46107804f7d5d2aa457d Mon Sep 17 00:00:00 2001 From: Srikanth Muppandam Date: Fri, 29 Aug 2025 19:09:50 +0530 Subject: [PATCH 2/2] stress-ng: add validation run.sh and README - Adaptive CPU/memory/disk sizing - Multi-phase workloads (CPU/math, VM, HDD) - Strict pass/fail based on throughput and dmesg scan - Configurable CLI parameters (p1, p2, mem-frac, cpu-list, temp-limit) - Added README.md with examples and troubleshooting Signed-off-by: Srikanth Muppandam --- .../suites/Kernel/Stress/Stress-ng/README.md | 308 ++++++++++++ Runner/suites/Kernel/Stress/Stress-ng/run.sh | 471 ++++++++++++++++++ 2 files changed, 779 insertions(+) create mode 100644 Runner/suites/Kernel/Stress/Stress-ng/README.md create mode 100755 Runner/suites/Kernel/Stress/Stress-ng/run.sh diff --git a/Runner/suites/Kernel/Stress/Stress-ng/README.md b/Runner/suites/Kernel/Stress/Stress-ng/README.md new file mode 100644 index 00000000..bfa7d671 --- /dev/null +++ b/Runner/suites/Kernel/Stress/Stress-ng/README.md @@ -0,0 +1,308 @@ +Scheduler/Stress Validation — stress-ng Runner + +This README explains how to use the stress-ng–based validation script (run.sh) we wrote to exercise CPU, memory, I/O, and scheduler paths on embedded Linux systems (Yocto, Debian/Ubuntu, RT & non-RT kernels, NUMA/non-NUMA). It also covers how to get stress-ng onto your target (cross-compile or sideload). + +--- + +What this test does + +Launches stress-ng stressors sized to the current machine (online CPUs, RAM, and free disk) so we don’t overcommit tiny embedded boards. + +Affines worker threads to every online CPU to make scheduler regressions obvious. 
+ +Applies fail criteria (max latency, OOM, I/O errors, stressor non-zero exits); returns non-zero exit code on failure for CI. + +Saves a short summary and optional detailed logs; runs a dmesg scan via your functestlib.sh. + +--- + +Requirements + +stress-ng binary on the target + +Standard tools: awk, grep, sed, cut, tr, sleep, date, head, getconf + +(Optional) taskset, numactl for CPU pinning/NUMA; dd for I/O prechecks + +Your test framework’s init_env and functestlib.sh (already handled by run.sh) + +The runner reuses helpers from your existing functestlib.sh: + +check_dependencies + +find_test_case_by_name + +log_info, log_warn, log_pass, log_fail, log_skip, log_error + +scan_dmesg_errors + +--- + +Getting stress-ng + +Project: https://github.com/ColinIanKing/stress-ng + +A) Native install (Debian/Ubuntu) + +sudo apt-get update +sudo apt-get install -y stress-ng + +B) Cross-compile (Yocto) + +Add to your image or build it as an SDK tool: + +In your layer, ensure stress-ng is available (meta-openembedded has a recipe in meta-oe on many branches). + +Add to image: + +IMAGE_INSTALL:append = " stress-ng" + +Rebuild image / SDK: + +bitbake core-image-minimal + +C) Cross-compile (generic cmake/make) + +On your host: + +git clone https://github.com/ColinIanKing/stress-ng.git +cd stress-ng +make CROSS_COMPILE=aarch64-linux-gnu- # or your triplet +# artifact is src/stress-ng + +Copy the binary to your target (see “Sideload” below). + +D) Android / BusyBox targets (sideload) + +Push a statically linked stress-ng: + +adb push stress-ng /usr/local/bin/ +adb shell chmod 755 /usr/local/bin/stress-ng + +Or with SSH: + +scp stress-ng root@TARGET:/usr/local/bin/ +ssh root@TARGET chmod 755 /usr/local/bin/stress-ng + +--- + +run.sh quick start + +From the test case directory (the script finds its own path via find_test_case_by_name): + +./run.sh + +By default, it: + +Detects online CPUs, total RAM, and free disk. 
+ +Picks safe defaults: worker threads == online CPUs, memory workers sized to a small percentage of RAM, I/O workers sized to free space. + +Runs for a sane duration (e.g., 5–10 minutes configurable). + +Fails on stressor non-zero exit, OOM, major I/O error, or dmesg anomalies. + +## Usage + +``` +Usage: ./run.sh [--p1 <sec>] [--p2 <sec>] [--mem-frac <pct>] [--disk-frac <pct>] + [--cpu-list <list>] [--temp-limit <temp>] [--stressng "<args>"] + [--repeat <N>] [--help] +``` + +### Options + +| Option | Description | +|---------------------|-------------| +| `--p1 <sec>` | Phase 1 duration in seconds (default: 60) | +| `--p2 <sec>` | Phase 2 duration in seconds (default: 60) | +| `--mem-frac <pct>` | Percentage of total memory per worker (default: 15) | +| `--disk-frac <pct>` | Percentage of free disk space per worker (default: 5) | +| `--cpu-list <list>` | Comma-separated list or range of CPUs to stress | +| `--temp-limit <temp>` | Maximum temperature threshold | +| `--stressng "<args>"` | Additional arguments passed to stress-ng | +| `--repeat <N>` | Repeat the entire test sequence N times (default: 1) | +| `--help` | Show this help message and exit | + +> Exact flags may differ slightly depending on your final script; the examples below assume the version we discussed (auto-sizing, affinity, fail criteria, reuse of functestlib.sh). 
+ +--- + +Example invocations + +1) Quick CPU & memory smoke (auto sizing, 5 min) + +./run.sh --duration 300 --stressors cpu,vm + +2) Full platform shake (CPU+VM+I/O; pinned per-CPU) + +./run.sh --duration 600 --stressors cpu,vm,io --logs + +3) Limit footprint on small RAM systems + +./run.sh --sets vmhdd --p2 180 --mem-frac 5 + +4) Pin workers to a subset of CPUs + +./run.sh --cpu-list 0-3 --sets cpu --p1 240 + +5) Exercise only I/O with conservative disk usage + +./run.sh --sets io --disk-frac 3 --p2 120 + +6) Mixed with latency guardrail (if cyclic path is enabled) + +./run.sh --stressors cpu,vm --max-latency-us 500 --duration 300 + +7) Run with default phases, repeated 3 times + +./run.sh --repeat 3 + +8) Run on specific CPUs with temperature limit + +./run.sh --cpu-list 0-3 --temp-limit 80 + +9) Run memory-intensive workload for 90 seconds per phase + +./run.sh --mem-frac 30 --p1 90 --p2 90 + +10) Run stress-ng with a custom workload twice + +./run.sh --repeat 2 --stressng-args "--cpu 4 --timeout 30 --verify" + +--- + +What the script checks/fails on + +stressor exit codes (any non-zero → FAIL) + +Killed by OOM or ENOMEM patterns in stress-ng output → FAIL + +I/O failures (EIO, read/write errors) → FAIL + +dmesg anomalies via scan_dmesg_errors → WARN/FAIL as configured + +(Optional) latency threshold if you also run a small cyclic step + +Exit code: + +0 = PASS (no failures, at least one stressor ran) + +1 = FAIL (functional failure or threshold exceeded) + +2 = SKIP (dependencies missing) + +Artifacts: + +stress-ng-summary.log (always) + +stress-ng-*.log files (with --logs) + +*.res result file for your harness + +--- + +Sizing & affinity logic (how it stays safe) + +CPU workers: ≤ online CPUs (default: one worker per online CPU) + +Memory workers: uses a small percentage of total RAM (cap per worker), adjustable via --mem-frac + +I/O workers: uses a small percentage of free disk (cap per worker), adjustable via --disk-frac + +Affinity: 
default is on (each worker pinned to a specific online CPU); disable with --no-affine + +NUMA: if numactl exists, the script prefers local node binding where appropriate; otherwise it simply CPU-affines. + +--- + +Building stress-ng into your products + +Yocto (image integration) + +Add to your image recipe or local.conf: + +IMAGE_INSTALL:append = " stress-ng" + +Rebuild and flash your image. + +Debian/Ubuntu rootfs + +Bake into your rootfs recipe or install at first boot with a provisioning script: + +apt-get update && apt-get install -y stress-ng + +Sideload in CI + +For CI smoke on development hardware: + +scp stress-ng root@TARGET:/usr/local/bin/ +ssh root@TARGET chmod 755 /usr/local/bin/stress-ng + +--- + +Cross-compiling notes & tips + +On ARM64 build hosts with Linaro/GCC toolchains: + +make CROSS_COMPILE=aarch64-linux-gnu- +file src/stress-ng # confirm aarch64 ELF + +Prefer static if your target is minimal: + +make static + +Validate dependencies: run src/stress-ng --version on the host and then on the target after copy. + +--- + +Troubleshooting + +“stress-ng: command not found” +Not on PATH. Install natively, or place it in /usr/local/bin and chmod +x. + +Out-of-memory or system lockups +Lower --mem-pct, shorten --duration, drop io on small/flash media. + +I/O errors / read-only filesystems +Switch to a writable mount (e.g., /tmp) or adjust --disk-pct down to 1–2%. + +High kernel latency on PREEMPT_RT +Start with CPU-only tests, then introduce memory/I/O slowly; use --max-latency-us to gate. + +BusyBox environments +Ensure the script’s dependencies exist (the runner checks and SKIPs otherwise). You can pre-install missed tools or adjust the stress mix. + +--- + +Security & safety + +This script is destructive only in its I/O scratch area (e.g., under /tmp/stress-ng-io); it won’t touch other files. + +It will refuse to over-allocate RAM/disk beyond configured caps. + +Still, run on development hardware or staging boards when possible. 
# [patch preamble carried over verbatim from the collapsed source line; not part of run.sh]
# + +--- + +License + +The test runner: BSD-3-Clause-Clear (Qualcomm Technologies, Inc. and/or its subsidiaries). + +stress-ng is licensed upstream by its author; see its repository for details. + +--- + +Appendix: Useful stress-ng commands (manual) + +See available stressors: + +stress-ng --class cpu --sequential 1 --metrics-brief --timeout 10 + +Run with maximum stress on all classes (dangerous on small boards): + +stress-ng --aggressive --all 1 --timeout 60 + +Only memory: + +stress-ng --vm 4 --vm-bytes 5% --vm-keep --timeout 120 diff --git a/Runner/suites/Kernel/Stress/Stress-ng/run.sh b/Runner/suites/Kernel/Stress/Stress-ng/run.sh new file mode 100755 index 00000000..cd643848 --- /dev/null +++ b/Runner/suites/Kernel/Stress/Stress-ng/run.sh @@ -0,0 +1,471 @@
#!/bin/sh
# Copyright (c) Qualcomm Technologies, Inc.
# SPDX-License-Identifier: BSD-3-Clause-Clear
#
# stress-ng validation runner:
#   - Default: auto-sized CPU + VM/HDD sets with strict FAIL criteria
#   - Custom: pass any stress-ng CLI via --stressng-args "…" or after "--"
#   - --autosize: when user references vm/hdd/cpu without sizes, add safe values
#   - --append-defaults: add --times --metrics-brief --verify if not present
#   - --repeat N: run the chosen workload(s) N iterations
#   - --stability H: repeat chosen workload(s) for H hours (takes precedence)
#   - --dryrun / --dry-run: validate via stress-ng --dry-run (no load), PASS on 0 exit
#
# Uses helpers from functestlib.sh:
#   cpu_get_online_list_str, cpu_expand_list, cpu_snapshot_stat, cpu_get_active_ticks,
#   mem_bytes_from_percent, disk_bytes_from_percent_free, file_has_pattern,
#   test_finalize_result, scan_dmesg_errors, find_test_case_by_name, log_*

###############################################################################
# Source init_env + functestlib.sh
###############################################################################
# Walk upwards from the script's own directory until a file named init_env is
# found.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
INIT_ENV=""
SEARCH="$SCRIPT_DIR"
# "/" terminates the walk as before. "" and "." are also terminal: if the cd
# above failed, SCRIPT_DIR is empty and dirname would keep returning ".",
# which would otherwise spin forever.
while [ -n "$SEARCH" ] && [ "$SEARCH" != "/" ] && [ "$SEARCH" != "." ]; do
    if [ -f "$SEARCH/init_env" ]; then
        INIT_ENV="$SEARCH/init_env"
        break
    fi
    SEARCH=$(dirname "$SEARCH")
done

if [ -z "$INIT_ENV" ]; then
    echo "[ERROR] Could not find init_env (starting at $SCRIPT_DIR)" >&2
    exit 1
fi

# Skip re-sourcing when __INIT_ENV_LOADED is already set.
# NOTE(review): presumably init_env sets this guard and $TOOLS — confirm there.
if [ -z "$__INIT_ENV_LOADED" ]; then
    # shellcheck disable=SC1090
    . "$INIT_ENV"
fi
# shellcheck disable=SC1090,SC1091
. "$TOOLS/functestlib.sh"

TESTNAME="Stress-ng"
RES_FILE="./${TESTNAME}.res"
# If the test case directory cannot be located, record SKIP and leave quietly.
test_path=$(find_test_case_by_name "$TESTNAME") || { echo "$TESTNAME SKIP" >"$RES_FILE"; exit 0; }
cd "$test_path" || exit 1

log_info "----------------------------------------------------------------------"
log_info "------------------- Starting $TESTNAME Testcase ----------------------"

###############################################################################
# Defaults / CLI
###############################################################################
P1_SECS=60          # duration of the "cpu" set, seconds
P2_SECS=60          # duration of the "vmhdd" / "io" sets, seconds
MEM_FRAC=15         # percent handed to mem_bytes_from_percent
DISK_FRAC=5         # percent handed to disk_bytes_from_percent_free
CPU_LIST=""         # optional CPU list; empty means all online CPUs
TEMP_LIMIT=""       # optional; forwarded as stress-ng --temp-limit
SETS="cpu,vmhdd"    # comma-separated built-in sets: cpu, vmhdd, io
STABILITY_HOURS=""
REPEAT=1
STRESSNG_ARGS=""
BY_CLASS=""
AUTOSIZE=0
APPEND_DEFAULTS=0
DRYRUN=0
SHOW_HELP=0

# Print the command-line synopsis on stdout.
print_usage() {
    cat <<EOF
Usage: $0 [--p1 SEC] [--p2 SEC] [--mem-frac PCT] [--disk-frac PCT]
          [--cpu-list LIST] [--temp-limit DEGC] [--sets CSV]
          [--repeat N] [--stability HOURS] [--by-class CSV]
          [--autosize] [--append-defaults] [--dryrun|--dry-run]
          [--stressng-args "ARGS"] [--help]
          [-- STRESS-NG-ARGS...]
EOF
}

# Called with the remaining argv ("$@") when $1 is an option that takes a
# value. Exits 1 with usage on stderr if no value follows. Without this check
# the unconditional shift after the case statement runs on an empty argument
# list, which aborts dash and silently yields an empty value in bash.
require_value() {
    if [ "$#" -lt 2 ]; then
        log_error "Option $1 requires a value"
        print_usage >&2
        exit 1
    fi
}

# Succeeds only for a non-empty string made solely of decimal digits.
is_uint() {
    case "$1" in
        ''|*[!0-9]*) return 1 ;;
        *) return 0 ;;
    esac
}

# Echo $1 with redundant leading zeros removed ("007" -> "7", "0" -> "0"),
# so that shell arithmetic does not interpret the value as octal.
normalize_uint() {
    nu_val="$1"
    while :; do
        case "$nu_val" in
            0?*) nu_val=${nu_val#0} ;;
            *) break ;;
        esac
    done
    printf '%s' "$nu_val"
}

# Parse CLI (support pass-through after "--")
PASSTHRU=""
while [ $# -gt 0 ]; do
    case "$1" in
        --p1)              require_value "$@"; shift; P1_SECS="$1" ;;
        --p2)              require_value "$@"; shift; P2_SECS="$1" ;;
        --mem-frac)        require_value "$@"; shift; MEM_FRAC="$1" ;;
        --disk-frac)       require_value "$@"; shift; DISK_FRAC="$1" ;;
        --cpu-list)        require_value "$@"; shift; CPU_LIST="$1" ;;
        --temp-limit)      require_value "$@"; shift; TEMP_LIMIT="$1" ;;
        --sets)            require_value "$@"; shift; SETS="$1" ;;
        --repeat)          require_value "$@"; shift; REPEAT="$1" ;;
        --stability)       require_value "$@"; shift; STABILITY_HOURS="$1" ;;
        --by-class)        require_value "$@"; shift; BY_CLASS="$1" ;;
        --stressng-args)   require_value "$@"; shift; STRESSNG_ARGS="$1" ;;
        --autosize)        AUTOSIZE=1 ;;
        --append-defaults) APPEND_DEFAULTS=1 ;;
        --dryrun|--dry-run) DRYRUN=1 ;;
        --help)            SHOW_HELP=1 ;;
        --)                shift; PASSTHRU="$*"; break ;;
        *)
            # An invalid invocation must not look like success to the caller.
            log_error "Unknown argument: $1"
            print_usage >&2
            exit 1
            ;;
    esac
    shift
done
[ "$SHOW_HELP" -eq 1 ] && { print_usage; exit 0; }
# Anything after "--" takes precedence over --stressng-args.
[ -n "$PASSTHRU" ] && STRESSNG_ARGS="$PASSTHRU"

# Validate numeric flags. P1/P2 are checked too because they feed $(( ))
# arithmetic in run_iteration.
is_uint "$P1_SECS" || { log_error "Invalid --p1: $P1_SECS"; exit 1; }
is_uint "$P2_SECS" || { log_error "Invalid --p2: $P2_SECS"; exit 1; }
if [ -n "$STABILITY_HOURS" ]; then
    is_uint "$STABILITY_HOURS" || { log_error "Invalid --stability: $STABILITY_HOURS"; exit 1; }
    STABILITY_HOURS=$(normalize_uint "$STABILITY_HOURS")
fi
is_uint "$REPEAT" || { log_error "Invalid --repeat: $REPEAT"; exit 1; }
P1_SECS=$(normalize_uint "$P1_SECS")
P2_SECS=$(normalize_uint "$P2_SECS")
REPEAT=$(normalize_uint "$REPEAT")

###############################################################################
# Dependencies
###############################################################################
# NOTE(review): this relies on test_finalize_result ending the script after
# recording SKIP — confirm in functestlib.sh.
check_dependencies stress-ng awk grep sed cut df stat sleep date getconf || {
    test_finalize_result SKIP "$TESTNAME" "$RES_FILE" ""
}

###############################################################################
# Helpers
###############################################################################
# Run stress-ng (argv style).
#   $1    log file receiving stress-ng's stderr (stdout is discarded)
#   $2..  stress-ng arguments
# In dry-run mode "--dry-run" is put in front of the arguments.
# Returns the exit status of stress-ng.
run_stress() {
    logf="$1"; shift
    if [ "$DRYRUN" -eq 1 ]; then
        set -- --dry-run "$@"
    fi
    stress-ng "$@" 2> "$logf" 1>/dev/null
}

# Run stress-ng given a single string of args (keeps user quoting).
#   $1  log file
#   $2  argument string, re-parsed by the shell
# SECURITY: eval is deliberate so quoting inside the string is honoured, but
# it also executes any command substitution embedded in the string. Only pass
# operator-supplied text here, never untrusted input.
run_stress_str() {
    logf="$1"
    args_str="$2"
    # shellcheck disable=SC2086
    eval "set -- $args_str" || {
        log_error "Could not parse stress-ng arguments: $args_str"
        return 1
    }
    run_stress "$logf" "$@"
}

# Succeeds when option regex $2 (ERE, written without the leading boundary)
# occurs in argument string $1 at the start or after whitespace.
args_match() {
    printf "%s" "$1" | grep -Eq -- "(^|[[:space:]])$2"
}

# Autosize helper for pass-through / by-class argument strings.
#   $1 argument string   $2 scratch directory   $3 memory percent   $4 disk percent
# Appends sizing options the user left out and prints the resulting string:
#   --vm/-m present, no --vm-bytes            -> --vm-bytes from mem percent
#   --hdd/-d/--io/-i/--iomix present          -> --hdd-bytes and --temp-path
#   --cpu/-c present without a worker count   -> --cpu <online CPU count>
#   TEMP_LIMIT set, no --temp-limit in string -> --temp-limit $TEMP_LIMIT
autosize_args() {
    args="$1"; logtmpdir="$2"; mem_pct="$3"; disk_pct="$4"

    ONLINE_STR_LOCAL=$(cpu_get_online_list_str)
    ONLINE_CPUS_LOCAL=$(cpu_expand_list "$ONLINE_STR_LOCAL")
    # Count CPUs by word-splitting; avoids wc/tr, which are not in the
    # dependency check above.
    # shellcheck disable=SC2086
    set -- $ONLINE_CPUS_LOCAL
    num_cpus=$#
    [ "$num_cpus" -lt 1 ] && num_cpus=1

    case " $args " in
        *" --vm "*|*" -m "*)
            if ! args_match "$args" '--vm-bytes([[:space:]]|=)'; then
                vm_bytes=$(mem_bytes_from_percent "$mem_pct")
                # Never emit a dangling option when the helper yields nothing.
                [ -n "$vm_bytes" ] && args="$args --vm-bytes $vm_bytes"
            fi
            ;;
    esac

    case " $args " in
        *" --hdd "*|*" -d "*|*" --io "*|*" -i "*|*" --iomix "*)
            if ! args_match "$args" '--hdd-bytes([[:space:]]|=)'; then
                hdd_bytes=$(disk_bytes_from_percent_free "$disk_pct" "$logtmpdir")
                [ -n "$hdd_bytes" ] && args="$args --hdd-bytes $hdd_bytes"
            fi
            if ! args_match "$args" '--temp-path([[:space:]]|=)'; then
                args="$args --temp-path $logtmpdir"
            fi
            ;;
    esac

    case " $args " in
        *" --cpu "*|*" -c "*)
            # Accept the short form "-c N" as an explicit count as well, so
            # it does not gain a second, conflicting --cpu option.
            if ! args_match "$args" '(--cpu|-c)([[:space:]]+[0-9]+|=[0-9]+)'; then
                args="$args --cpu $num_cpus"
            fi
            ;;
    esac

    if [ -n "$TEMP_LIMIT" ]; then
        # Recognise both "--temp-limit N" and "--temp-limit=N".
        if ! args_match "$args" '--temp-limit([[:space:]]|=|$)'; then
            args="$args --temp-limit $TEMP_LIMIT"
        fi
    fi

    printf "%s" "$args"
}

# Print argument string $1 with --times, --metrics-brief and --verify
# appended, each only if it is not already present as a whole word.
append_defaults() {
    args="$1"
    for opt in --times --metrics-brief --verify; do
        args_match "$args" "${opt}([[:space:]]|\$)" || args="$args $opt"
    done
    printf "%s" "$args"
}

# Scan the given log files (missing or empty ones are ignored) for
# zero-throughput metrics and resource-error messages.
# Returns 1 if any file matched, 0 otherwise.
check_logs() {
    bad=0
    for f in "$@"; do
        [ -s "$f" ] || continue
        if file_has_pattern "$f" '0\.00[[:space:]]+ops/sec|bogo-ops:[[:space:]]*0\b'; then
            log_fail "[metrics] Zero throughput in $(basename "$f")"; bad=1
        fi
        if file_has_pattern "$f" 'out of memory|cannot allocate|disk full|no space left|I/O error|filesystem error|timer slack'; then
            log_fail "[metrics] Resource error in $(basename "$f")"; bad=1
        fi
    done
    return $bad
}

# Print the most recently modified directory in "." whose name starts with
# prefix $1, or "." when there is none.
# The candidate list goes through a temp file so the read loop runs in the
# current shell and LAST_LOG survives it.
find_newest_log_dir() {
    prefix="$1"
    LAST_LOG=""
    last_mtime=0
    list_file="./.logscan_${TESTNAME}_$$.lst"
    find . -maxdepth 1 -type d -name "${prefix}*" -print 2>/dev/null > "$list_file"
    while IFS= read -r d; do
        [ -d "$d" ] || continue
        # GNU/BusyBox stat uses -c %Y, BSD stat uses -f %m.
        if stat -c %Y "$d" >/dev/null 2>&1; then
            mtime=$(stat -c %Y "$d")
        elif stat -f %m "$d" >/dev/null 2>&1; then
            mtime=$(stat -f %m "$d")
        else
            mtime=0
        fi
        case "$mtime" in *[!0-9]*|"") mtime=0 ;; esac
        if [ "$mtime" -ge "$last_mtime" ] 2>/dev/null; then
            last_mtime="$mtime"; LAST_LOG="$d"
        fi
    done < "$list_file"
    rm -f "$list_file"
    if [ -n "$LAST_LOG" ]; then
        printf "%s\n" "$LAST_LOG"
    else
        printf ".\n"
    fi
}

###############################################################################
# Built-in “auto sets”
###############################################################################
# Run one built-in set.
#   $1 set name (cpu | vmhdd | io)   $2 log directory   $3 CPU worker count
#   $4 VM worker count   $5 HDD worker count   $6 VM bytes   $7 HDD bytes
# stderr of stress-ng goes to <log directory>/<set name>.log.
# Returns the stress-ng exit status; an unknown set name is skipped (0).
run_auto_set() {
    name="$1"; LOG_DIR="$2"; NUM_CPUS="$3"; VM_WORKERS="$4"
    HDD_WORKERS="$5"; VM_BYTES="$6"; HDD_BYTES="$7"

    # All positional parameters are saved above, so "$@" is reused to build
    # the stress-ng argument vector.
    case "$name" in
        cpu)
            log_info "[cpu] --cpu $NUM_CPUS --matrix $NUM_CPUS --timeout ${P1_SECS}s"
            set -- --cpu "$NUM_CPUS" --cpu-method all \
                   --matrix "$NUM_CPUS" \
                   --timeout "${P1_SECS}s"
            ;;
        vmhdd)
            log_info "[vmhdd] --vm $VM_WORKERS --vm-bytes $VM_BYTES --hdd $HDD_WORKERS --hdd-bytes $HDD_BYTES --timeout ${P2_SECS}s"
            set -- --vm "$VM_WORKERS" --vm-bytes "$VM_BYTES" --vm-keep \
                   --hdd "$HDD_WORKERS" --hdd-bytes "$HDD_BYTES" \
                   --timeout "${P2_SECS}s" \
                   --temp-path "$LOG_DIR/tmp"
            ;;
        io)
            log_info "[io] --hdd $HDD_WORKERS --hdd-bytes $HDD_BYTES --timeout ${P2_SECS}s"
            set -- --hdd "$HDD_WORKERS" --hdd-bytes "$HDD_BYTES" \
                   --timeout "${P2_SECS}s" \
                   --temp-path "$LOG_DIR/tmp"
            ;;
        *)
            log_warn "Unknown set '$name' (skipping)"
            return 0
            ;;
    esac

    # Options common to every set; the temperature limit only when requested.
    set -- "$@" --times --metrics-brief --verify
    if [ -n "$TEMP_LIMIT" ]; then
        set -- "$@" --temp-limit "$TEMP_LIMIT"
    fi

    run_stress "$LOG_DIR/$name.log" "$@"
}

###############################################################################
# One iteration (auto / pass-through / by-class)
###############################################################################
# Run a single iteration identified by tag $1.
# Creates ./logs_<TESTNAME>_<tag>/ (with a tmp/ scratch directory), runs the
# selected workload (by-class, custom argument string, or built-in sets), then
# — unless in dry-run mode — checks logs, per-CPU activity and dmesg.
# Returns 0 when nothing failed, non-zero otherwise.
run_iteration() {
    ITER_TAG="$1"
    FAIL=0

    LOG_ROOT="./logs_${TESTNAME}_${ITER_TAG}"
    mkdir -p "$LOG_ROOT/tmp" || { log_error "Cannot create $LOG_ROOT/tmp"; return 1; }

    ONLINE_STR=$(cpu_get_online_list_str)
    ONLINE_CPUS=$(cpu_expand_list "$ONLINE_STR")
    [ -n "$ONLINE_CPUS" ] || { log_fail "Cannot determine online CPUs"; return 1; }

    if [ -n "$CPU_LIST" ]; then
        REQ=$(cpu_expand_list "$CPU_LIST")
    else
        REQ="$ONLINE_CPUS"
    fi

    # Keep only requested CPUs that are actually online.
    # NOTE(review): this set only determines the worker count and which CPUs
    # the activity check samples; nothing in this script applies affinity.
    # shellcheck disable=SC2086
    set -- $ONLINE_CPUS
    online_flat=" $* "
    USE_SET=""
    for c in $REQ; do
        case "$online_flat" in
            *" $c "*) USE_SET="${USE_SET:+$USE_SET }$c" ;;
        esac
    done
    [ -n "$USE_SET" ] || { log_skip "$TESTNAME SKIP – no valid CPUs from requested set"; return 1; }
    # shellcheck disable=SC2086
    set -- $USE_SET
    NUM_CPUS=$#
    [ "$NUM_CPUS" -lt 1 ] && NUM_CPUS=1

    VM_WORKERS=$(( (NUM_CPUS + 1) / 2 ))    # half the CPUs, rounded up
    HDD_WORKERS=1
    VM_BYTES=$(mem_bytes_from_percent "$MEM_FRAC")
    HDD_BYTES=$(disk_bytes_from_percent_free "$DISK_FRAC" "$LOG_ROOT/tmp")
    CLK_TCK=$(getconf CLK_TCK 2>/dev/null); [ -z "$CLK_TCK" ] && CLK_TCK=100

    log_info "Iteration: $ITER_TAG"
    log_info "CPUs online: $ONLINE_STR"
    log_info "Using CPUs: $(printf "%s " "$USE_SET")"
    log_info "VM per-worker: $VM_BYTES | HDD per-worker: $HDD_BYTES"
    [ -n "$TEMP_LIMIT" ] && log_info "Temp limit: ${TEMP_LIMIT}°C"
    [ "$DRYRUN" -eq 1 ] && log_info "Mode: DRY-RUN (validation only; no load)"
    log_info "Logs: $LOG_ROOT"

    if [ "$DRYRUN" -eq 0 ]; then
        cpu_snapshot_stat "$LOG_ROOT/stat_before"
    fi

    if [ -n "$BY_CLASS" ]; then
        # One stress-ng run per class; stop at the first failing class.
        classes=$(printf "%s" "$BY_CLASS" | tr ',' ' ')
        for cls in $classes; do
            args="$STRESSNG_ARGS --class $cls"
            [ "$APPEND_DEFAULTS" -eq 1 ] && args=$(append_defaults "$args")
            [ "$AUTOSIZE" -eq 1 ] && args=$(autosize_args "$args" "$LOG_ROOT/tmp" "$MEM_FRAC" "$DISK_FRAC")
            log_info "[class:$cls] stress-ng $args"
            run_stress_str "$LOG_ROOT/class_$cls.log" "$args"
            rc=$?
            if [ "$rc" -ne 0 ]; then
                log_fail "[class:$cls] exited $rc"; FAIL=1; break
            fi
        done
        if [ "$DRYRUN" -eq 0 ]; then check_logs "$LOG_ROOT"/class_*.log || FAIL=1; fi

    elif [ -n "$STRESSNG_ARGS" ]; then
        args="$STRESSNG_ARGS"
        [ "$APPEND_DEFAULTS" -eq 1 ] && args=$(append_defaults "$args")
        [ "$AUTOSIZE" -eq 1 ] && args=$(autosize_args "$args" "$LOG_ROOT/tmp" "$MEM_FRAC" "$DISK_FRAC")
        log_info "[custom] stress-ng $args"
        run_stress_str "$LOG_ROOT/custom.log" "$args"
        rc=$?
        if [ "$rc" -ne 0 ]; then
            log_fail "[custom] exited $rc"; FAIL=1
        fi
        if [ "$DRYRUN" -eq 0 ]; then check_logs "$LOG_ROOT/custom.log" || FAIL=1; fi

    else
        # Built-in sets, in the order given by --sets; stop at the first failure.
        for s in $(printf "%s" "$SETS" | tr ',' ' '); do
            if run_auto_set "$s" "$LOG_ROOT" "$NUM_CPUS" "$VM_WORKERS" "$HDD_WORKERS" "$VM_BYTES" "$HDD_BYTES"; then
                log_pass "Set '$s' PASS"
            else
                log_fail "Set '$s' FAIL"; FAIL=1; break
            fi
        done
        if [ "$DRYRUN" -eq 0 ]; then
            check_logs "$LOG_ROOT"/cpu.log "$LOG_ROOT"/vmhdd.log "$LOG_ROOT"/io.log || FAIL=1
        fi
    fi

    if [ "$DRYRUN" -eq 0 ]; then
        cpu_snapshot_stat "$LOG_ROOT/stat_after"

        # Heuristic CPU activity check only for auto sets
        if [ -z "$STRESSNG_ARGS" ] && [ -z "$BY_CLASS" ]; then
            # Compute TOTAL_SECS based on sets actually configured
            # (cpu adds P1, vmhdd/io add P2)
            TOTAL_SECS=0
            case ",$SETS," in *,cpu,*)   TOTAL_SECS=$((TOTAL_SECS + P1_SECS)) ;; esac
            case ",$SETS," in *,vmhdd,*) TOTAL_SECS=$((TOTAL_SECS + P2_SECS)) ;; esac
            case ",$SETS," in *,io,*)    TOTAL_SECS=$((TOTAL_SECS + P2_SECS)) ;; esac
            [ "$TOTAL_SECS" -le 0 ] && TOTAL_SECS=$((P1_SECS + P2_SECS))

            # A CPU counts as loaded when it accumulated at least 1/8 of the
            # ticks available during the run (minimum 5).
            THRESH=$(( (TOTAL_SECS * CLK_TCK) / 8 )); [ "$THRESH" -lt 5 ] && THRESH=5
            ok=0; total=0
            for c in $USE_SET; do
                b=$(cpu_get_active_ticks "$c" "$LOG_ROOT/stat_before")
                a=$(cpu_get_active_ticks "$c" "$LOG_ROOT/stat_after")
                if [ -z "$b" ] || [ -z "$a" ]; then
                    continue
                fi
                d=$((a-b)); total=$((total+1))
                log_info "[load] cpu$c delta=$d (thr=$THRESH)"
                [ "$d" -ge "$THRESH" ] && ok=$((ok+1))
            done
            need=$(( (total*50 + 99) / 100 ))    # >= 50%
            if [ "$total" -eq 0 ]; then
                # No samples means nothing was verified; do not claim a PASS.
                log_warn "[load] No per-CPU tick samples available; CPU activity check skipped"
            elif [ "$ok" -lt "$need" ]; then
                log_fail "[load] Insufficient CPU activity ($ok/$total)"; FAIL=1
            else
                log_pass "[load] CPU activity sufficient ($ok/$total)"
            fi
        fi

        DMESG_MODULES='BUG:|WARNING:|rcu|lockdep|hung task|soft lockup|hard lockup|oops|stack trace|call trace'
        DMESG_EXCLUDE='dummy regulator|not found|-EEXIST|thermal throttle'
        # NOTE(review): a zero status from scan_dmesg_errors is treated as
        # "matches found" here — confirm against functestlib.sh.
        if scan_dmesg_errors "$SCRIPT_DIR" "$DMESG_MODULES" "$DMESG_EXCLUDE"; then
            log_fail "Concerning kernel messages during stress (logs in $LOG_ROOT)"; FAIL=1
        else
            log_pass "No concerning kernel messages during stress"
        fi
    else
        log_info "Dry-run: skipped CPU activity and dmesg checks"
    fi

    [ "$FAIL" -eq 0 ]
}

###############################################################################
# Single run / repeat loop / stability loop
###############################################################################
NEWEST_LOG="."
# Drive the iterations and record the final verdict.
#   --stability H : repeat run_iteration until H hours have elapsed
#   otherwise     : run exactly $REPEAT iterations
# Either loop stops at the first failing iteration.
ANY_FAIL=0
RAN=0    # iterations actually started
if [ -n "$STABILITY_HOURS" ]; then
    # Drop leading zeros so a value such as "08" is not read as octal by $(( )).
    hours="$STABILITY_HOURS"
    while :; do
        case "$hours" in
            0?*) hours=${hours#0} ;;
            *) break ;;
        esac
    done
    END_TS=$(( $(date +%s) + hours * 3600 ))
    ITER=1
    while [ "$(date +%s)" -lt "$END_TS" ]; do
        TAG="iter${ITER}_$(date +%Y%m%d-%H%M%S)"
        log_info "===== Stability: $TESTNAME iteration $ITER ====="
        RAN=$((RAN + 1))
        if run_iteration "$TAG"; then
            log_pass "Iteration $ITER PASS"
        else
            log_fail "Iteration $ITER FAIL"; ANY_FAIL=1; break
        fi
        ITER=$((ITER + 1))
    done
else
    i=1
    while [ "$i" -le "$REPEAT" ]; do
        TAG="iter${i}_$(date +%Y%m%d-%H%M%S)"
        log_info "===== Repeat: $TESTNAME iteration $i/$REPEAT ====="
        RAN=$((RAN + 1))
        if run_iteration "$TAG"; then
            log_pass "Iteration $i PASS"
        else
            log_fail "Iteration $i FAIL"; ANY_FAIL=1; break
        fi
        i=$((i + 1))
    done
fi

NEWEST_LOG=$(find_newest_log_dir "logs_${TESTNAME}_")
if [ "$ANY_FAIL" -ne 0 ]; then
    test_finalize_result FAIL "$TESTNAME" "$RES_FILE" "$NEWEST_LOG"
elif [ "$RAN" -eq 0 ]; then
    # --repeat 0 / --stability 0: nothing was exercised, so reporting PASS
    # would be a false positive. Record SKIP instead.
    log_warn "No iteration was executed (--repeat/--stability is 0); nothing validated"
    test_finalize_result SKIP "$TESTNAME" "$RES_FILE" ""
else
    test_finalize_result PASS "$TESTNAME" "$RES_FILE" "$NEWEST_LOG"
fi