Skip to content

Commit 5e374a8

Browse files
authored
[rust/rqd] Add option to measure proc memory using PSS (#2089)
## Overview

Add a configuration option to RQD to choose between RSS (Resident Set Size) and PSS (Proportional Set Size) for measuring process memory usage on Linux systems.

**Current State:**

- RQD measures memory using RSS from `/proc/[pid]/statm`
- RSS counts shared libraries fully for each process, leading to inflated memory reports
- PSS divides shared memory proportionally among processes, providing more accurate measurements

**Key Differences:**

- **RSS**: Total physical memory including full count of shared libraries (can be misleading)
- **PSS**: Proportional share - shared pages divided by the number of processes sharing them (more accurate)
- **Source**: `/proc/[pid]/smaps_rollup` (Linux kernel 4.14+) provides pre-summed PSS values
1 parent 89ddcaf commit 5e374a8

File tree

4 files changed

+86
-8
lines changed

4 files changed

+86
-8
lines changed

rust/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ resolver = "3"
55
[workspace.package]
66
authors = ["Diego Tavares <[email protected]>"]
77
edition = "2024"
8-
version = "0.1.5"
8+
version = "0.1.6"
99

1010
[workspace.dependencies]
1111
async-trait = "0.1"

rust/config/rqd.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,24 @@ machine:
140140
# # Override the operating system reported to Cuebot
141141
# os: "Rocky Linux 9"
142142

143+
# Memory metric to use for process memory measurement
144+
# Options: "rss" (default) or "pss"
145+
#
146+
# RSS (Resident Set Size): Total physical memory used by a process, including
147+
# shared libraries counted fully. This can overreport memory usage when
148+
# multiple processes share the same libraries.
149+
#
150+
# PSS (Proportional Set Size): Divides shared memory proportionally among
151+
# processes using it, providing more accurate memory accounting. For example,
152+
# if three processes share a 30MB library, each process reports 10MB instead
153+
# of 30MB.
154+
#
155+
# Note: PSS requires Linux kernel 4.14+ and reads from /proc/[pid]/smaps_rollup.
156+
# If PSS is unavailable, RQD will automatically fall back to RSS.
157+
#
158+
# Default: rss
159+
# memory_metric: "rss"
160+
143161
# =============================================================================
144162
# RUNNER CONFIGURATION
145163
# =============================================================================

rust/crates/rqd/src/config/mod.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,14 @@ lazy_static! {
1414
}
1515
//===Config Types===
1616

17+
#[derive(Default, Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
18+
#[serde(rename_all = "lowercase")]
19+
pub enum MemoryMetric {
20+
#[default]
21+
Rss,
22+
Pss,
23+
}
24+
1725
#[derive(Debug, Deserialize, Clone)]
1826
#[serde(default)]
1927
pub struct LoggingConfig {
@@ -90,6 +98,7 @@ pub struct MachineConfig {
9098
pub nimby_start_retry_interval: Duration,
9199
pub nimby_display_xauthority_path: String,
92100
pub memory_oom_margin_percentage: u32,
101+
pub memory_metric: MemoryMetric,
93102
}
94103

95104
impl Default for MachineConfig {
@@ -113,6 +122,7 @@ impl Default for MachineConfig {
113122
nimby_start_retry_interval: Duration::from_secs(60 * 5), // 5 min
114123
nimby_display_xauthority_path: "/home/{username}/Xauthority".to_string(),
115124
memory_oom_margin_percentage: 96,
125+
memory_metric: MemoryMetric::Rss,
116126
}
117127
}
118128
}

rust/crates/rqd/src/system/linux.rs

Lines changed: 57 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,13 @@ use opencue_proto::{
2121
report::{ChildrenProcStats, ProcStats, Stat},
2222
};
2323
use sysinfo::{DiskRefreshKind, Disks, MemoryRefreshKind, RefreshKind};
24-
use tracing::debug;
24+
use tracing::{debug, info};
2525
use uuid::Uuid;
2626

27-
use crate::{config::MachineConfig, system::reservation::ProcessorStructure};
27+
use crate::{
28+
config::{MachineConfig, MemoryMetric},
29+
system::reservation::ProcessorStructure,
30+
};
2831

2932
use super::manager::{MachineGpuStats, MachineStat, ProcessStats, SystemManager};
3033

@@ -144,6 +147,8 @@ impl LinuxSystem {
144147
.into_diagnostic()
145148
.wrap_err("SC_CLK_TCK not available")?;
146149

150+
info!("Memory metric configured: {:?}", config.memory_metric);
151+
147152
Ok(Self {
148153
config: config.clone(),
149154
static_info: MachineStaticInfo {
@@ -562,6 +567,35 @@ impl LinuxSystem {
562567
Ok(())
563568
}
564569

570+
/// Reads PSS (Proportional Set Size) from /proc/[pid]/smaps_rollup
571+
///
572+
/// PSS divides shared memory proportionally among processes using it,
573+
/// providing more accurate memory accounting than RSS.
574+
///
575+
/// Requires Linux kernel 4.14+. Returns an error if the file cannot be read or has no parsable `Pss:` line.
576+
fn read_pss(&self, pid: u32) -> Result<u64> {
577+
let smaps_rollup_path = format!("/proc/{}/smaps_rollup", pid);
578+
579+
let content = std::fs::read_to_string(&smaps_rollup_path).into_diagnostic()?;
580+
581+
for line in content.lines() {
582+
if line.starts_with("Pss:") {
583+
let parts: Vec<&str> = line.split_whitespace().collect();
584+
if parts.len() >= 2 {
585+
if let Ok(pss_kb) = parts[1].parse::<u64>() {
586+
// PSS is in kB, convert to bytes
587+
return Ok(pss_kb * 1024);
588+
}
589+
}
590+
}
591+
}
592+
593+
Err(miette!(
594+
"Could not parse PSS from /proc/{}/smaps_rollup",
595+
pid
596+
))
597+
}
598+
565599
/// Reads proc data from stat and statm files:
566600
///
567601
/// # Used fields:
@@ -603,14 +637,30 @@ impl LinuxSystem {
603637
fields_statm[0].parse::<u64>(), // size
604638
fields_statm[1].parse::<u64>(), // rss
605639
) {
606-
(Ok(vsize), Ok(rss)) => (vsize, rss),
640+
(Ok(vsize), Ok(rss)) => (
641+
vsize.saturating_mul(self.static_info.page_size),
642+
rss.saturating_mul(self.static_info.page_size),
643+
),
607644
_ => Err(miette!("Invalid /proc/{pid}/statm file"))?,
608645
};
646+
let virtual_memory = vsize.saturating_mul(self.static_info.page_size);
647+
648+
// Read memory based on configured metric
649+
let memory = match self.config.memory_metric {
650+
MemoryMetric::Pss => {
651+
// Try PSS, fallback to RSS if unavailable
652+
match self.read_pss(pid) {
653+
Ok(pss) => pss,
654+
Err(_) => rss,
655+
}
656+
}
657+
MemoryMetric::Rss => {
658+
// Original RSS logic
659+
rss
660+
}
661+
};
609662

610663
let (start_time, run_time) = self.calculate_process_time(start_time);
611-
// Rss is stored in number of pages
612-
let memory = rss.saturating_mul(self.static_info.page_size);
613-
let virtual_memory = vsize.saturating_mul(self.static_info.page_size);
614664

615665
// Remove ()
616666
let name = if name.len() > 2 {
@@ -904,7 +954,7 @@ impl SystemManager for LinuxSystem {
904954

905955
#[cfg(test)]
906956
mod tests {
907-
use crate::config::MachineConfig;
957+
use crate::config::{MachineConfig, MemoryMetric};
908958
use std::fs;
909959
use std::{collections::HashMap, sync::Mutex};
910960

0 commit comments

Comments
 (0)