Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 145 additions & 2 deletions orion/src/antares.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,19 @@
//! This module provides a singleton wrapper around `scorpiofs::AntaresManager`
//! for managing overlay filesystem mounts used during build operations.

use std::{error::Error, io, path::PathBuf, sync::Arc, time::Duration};
use std::{
error::Error,
io,
path::{Path, PathBuf},
sync::Arc,
time::Duration,
};

use scorpiofs::{AntaresConfig, AntaresManager, AntaresPaths};
use tokio::sync::OnceCell;

static MANAGER: OnceCell<Arc<AntaresManager>> = OnceCell::const_new();
const TEST_BROWSE_JOB_ID: &str = "antares_test";

type DynError = Box<dyn Error + Send + Sync>;

Expand Down Expand Up @@ -128,6 +135,7 @@ pub async fn mount_job(job_id: &str, cl: Option<&str>) -> Result<AntaresConfig,
pub(crate) async fn warmup_dicfuse() -> Result<(), DynError> {
tracing::info!("Initializing Antares Dicfuse during Orion startup");
let manager = get_manager().await?;
let manager_for_test_mount = Arc::clone(manager);
let dicfuse = manager.dicfuse();

// Idempotent: safe even if the manager already started import internally.
Expand All @@ -149,7 +157,17 @@ pub(crate) async fn warmup_dicfuse() -> Result<(), DynError> {
)
.await
{
Ok(_) => tracing::info!("Antares Dicfuse warmup completed"),
Ok(_) => {
tracing::info!("Antares Dicfuse warmup completed");
log_dicfuse_root_tree();
if is_test_mount_enabled() {
ensure_test_mount(manager_for_test_mount.as_ref()).await;
} else {
tracing::info!(
"Antares test mount disabled by ORION_ENABLE_ANTARES_TEST_MOUNT"
);
}
}
Err(_) => tracing::warn!(
"Antares Dicfuse warmup timed out after {}s",
warmup_timeout_secs
Expand All @@ -160,6 +178,131 @@ pub(crate) async fn warmup_dicfuse() -> Result<(), DynError> {
Ok(())
}

/// Check whether the synthetic Antares test mount should be created at startup.
///
/// Controlled by the `ORION_ENABLE_ANTARES_TEST_MOUNT` environment variable.
/// The probe is opt-in: it runs only when the variable is explicitly set to a
/// truthy value (`1`, `true`, `yes`, `on` — trimmed, case-insensitive).
/// When the variable is unset or holds any other value, the test mount is
/// skipped, so a default startup does not create persistent diagnostic mount
/// state that the operator never asked for.
fn is_test_mount_enabled() -> bool {
    match std::env::var("ORION_ENABLE_ANTARES_TEST_MOUNT") {
        Ok(v) => {
            let v = v.trim().to_ascii_lowercase();
            matches!(v.as_str(), "1" | "true" | "yes" | "on")
        }
        // Unset means disabled: the diagnostic mount must be requested explicitly.
        Err(_) => false,
    }
}

/// Create the diagnostic browse mount for `TEST_BROWSE_JOB_ID` and log the
/// outcome. Failures are logged as warnings and never propagated: this probe
/// must not abort startup.
async fn ensure_test_mount(manager: &AntaresManager) {
    let outcome = manager.mount_job(TEST_BROWSE_JOB_ID, None).await;
    match outcome {
        Ok(config) => tracing::info!(
            "Antares test mount ready: job_id={}, mountpoint={}",
            TEST_BROWSE_JOB_ID,
            config.mountpoint.display()
        ),
        Err(err) => tracing::warn!(
            "Failed to create Antares test mount job_id={}: {}",
            TEST_BROWSE_JOB_ID,
            err
        ),
    }
}

/// Log a bounded tree view of the Dicfuse workspace root at startup.
///
/// Depth and entry limits are read from `ORION_DICFUSE_ROOT_TREE_DEPTH` and
/// `ORION_DICFUSE_ROOT_TREE_MAX_ENTRIES` (defaults: 2 and 200). A missing
/// workspace path is reported as a warning and aborts the listing.
fn log_dicfuse_root_tree() {
    // Parse a usize from the environment, falling back on absence or bad input.
    let env_usize = |key: &str, default: usize| -> usize {
        std::env::var(key)
            .ok()
            .and_then(|raw| raw.parse::<usize>().ok())
            .unwrap_or(default)
    };
    let max_depth = env_usize("ORION_DICFUSE_ROOT_TREE_DEPTH", 2);
    let max_entries = env_usize("ORION_DICFUSE_ROOT_TREE_MAX_ENTRIES", 200);
    let root = PathBuf::from(scorpiofs::util::config::workspace());

    tracing::info!(
        root = %root.display(),
        max_depth,
        max_entries,
        "Dicfuse init: printing workspace root tree"
    );

    if !root.exists() {
        tracing::warn!("Dicfuse workspace path does not exist: {}", root.display());
        return;
    }

    tracing::info!("[dicfuse-root] /");
    let mut printed = 0usize;
    log_tree_recursive(&root, &root, 0, max_depth, max_entries, &mut printed);

    // `printed` saturates at the cap inside the recursion, so >= means truncated.
    if printed >= max_entries {
        tracing::info!(
            "Dicfuse root tree output truncated at {} entries (set ORION_DICFUSE_ROOT_TREE_MAX_ENTRIES to increase)",
            max_entries
        );
    }
}

/// Recursively log the entries under `current`, shown relative to `root`.
///
/// Entries are sorted by file name before printing; directories get a
/// trailing `/` and are descended into. Output stops once `max_depth`
/// levels or `max_entries` printed lines are reached. Unreadable
/// directories/entries are logged as warnings and skipped.
fn log_tree_recursive(
    root: &Path,
    current: &Path,
    depth: usize,
    max_depth: usize,
    max_entries: usize,
    printed: &mut usize,
) {
    if depth >= max_depth || *printed >= max_entries {
        return;
    }

    let reader = match std::fs::read_dir(current) {
        Ok(reader) => reader,
        Err(err) => {
            tracing::warn!("Failed to read {}: {}", current.display(), err);
            return;
        }
    };

    // Gather (name, path, is_dir) triples; a failed file_type() counts as a file.
    let mut children: Vec<(String, PathBuf, bool)> = Vec::new();
    for result in reader {
        match result {
            Ok(entry) => {
                let is_dir = entry.file_type().map(|t| t.is_dir()).unwrap_or(false);
                children.push((
                    entry.file_name().to_string_lossy().to_string(),
                    entry.path(),
                    is_dir,
                ));
            }
            Err(err) => {
                tracing::warn!("read_dir entry error under {}: {}", current.display(), err);
            }
        }
    }

    children.sort_by(|left, right| left.0.cmp(&right.0));

    // Depth is constant within this call, so the indent can be built once.
    let indent = " ".repeat(depth + 1);
    for (_name, path, is_dir) in children {
        if *printed >= max_entries {
            return;
        }

        // Show paths relative to the workspace root; fall back to absolute
        // display if stripping the prefix fails.
        let rel = match path.strip_prefix(root) {
            Ok(stripped) => stripped.display().to_string(),
            Err(_) => path.display().to_string(),
        };
        if is_dir {
            tracing::info!("[dicfuse-root] {}{}/", indent, rel);
        } else {
            tracing::info!("[dicfuse-root] {}{}", indent, rel);
        }
        *printed += 1;

        if is_dir {
            log_tree_recursive(root, &path, depth + 1, max_depth, max_entries, printed);
        }
    }
}

/// Unmount and cleanup a job overlay filesystem.
///
/// # Arguments
Expand Down
135 changes: 12 additions & 123 deletions orion/src/buck_controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ use std::{
io::BufReader,
path::{Path, PathBuf},
process::{ExitStatus, Stdio},
sync::atomic::{AtomicBool, Ordering},
};

use anyhow::anyhow;
Expand Down Expand Up @@ -46,14 +45,6 @@ static PROJECT_ROOT: Lazy<String> =
const DEFAULT_PREHEAT_SHALLOW_DEPTH: usize = 3;
static BUILD_CONFIG: Lazy<Option<BuildConfig>> = Lazy::new(load_build_config);

/// Check whether failed-build mounts should be kept alive for debugging.
/// Controlled by the `ORION_KEEP_FAILED_MOUNTS` environment variable: only
/// the exact value `"1"` enables it; unset or any other value disables it.
fn keep_failed_mounts() -> bool {
    matches!(
        std::env::var("ORION_KEEP_FAILED_MOUNTS").as_deref(),
        Ok("1")
    )
}

/// Mount an Antares overlay filesystem for a build job.
///
/// Creates a new Antares overlay mount using scorpiofs. The underlying Dicfuse
Expand Down Expand Up @@ -512,63 +503,6 @@ async fn flush_buffer(
}
}

/// RAII guard for automatically unmounting Antares filesystem when dropped.
struct MountGuard {
    // Identifier passed to `unmount_antares_fs` to release the mount.
    mount_id: String,
    // Task identifier; used only to tag log messages.
    task_id: String,
    // Flipped to true once unmount has run (or been deliberately skipped)
    // so Drop does not attempt a second unmount.
    unmounted: AtomicBool,
}

impl MountGuard {
    /// Build a guard owning `mount_id` for task `task_id`; not yet unmounted.
    fn new(mount_id: String, task_id: String) -> Self {
        Self {
            unmounted: AtomicBool::new(false),
            mount_id,
            task_id,
        }
    }

    /// Unmount exactly once; subsequent calls (and Drop) become no-ops.
    async fn unmount(&self) {
        // swap returns the previous flag: true means another path already
        // performed (or claimed) the unmount.
        let already_done = self.unmounted.swap(true, Ordering::AcqRel);
        if already_done {
            return;
        }
        if let Err(e) = unmount_antares_fs(&self.mount_id).await {
            tracing::error!(
                "[Task {}] Failed to unmount filesystem: {}",
                self.task_id,
                e
            )
        } else {
            tracing::info!("[Task {}] Filesystem unmounted successfully.", self.task_id)
        }
    }
}

impl Drop for MountGuard {
    // Fallback cleanup: if the guard is dropped without an explicit
    // `unmount()` call, schedule the unmount asynchronously (Drop cannot
    // await), unless the operator asked to keep mounts for debugging.
    fn drop(&mut self) {
        // Explicit unmount already happened (or was recorded) — nothing to do.
        if self.unmounted.load(Ordering::Acquire) {
            return;
        }
        // Operator opted to retain mounts for post-mortem inspection.
        if keep_failed_mounts() {
            tracing::info!(
                "[Task {}] MountGuard dropped — unmount skipped (ORION_KEEP_FAILED_MOUNTS=1, mount_id={}).",
                self.task_id,
                self.mount_id,
            );
            return;
        }
        // Clone owned copies so the spawned task is 'static.
        // NOTE(review): tokio::spawn in Drop assumes a live runtime — if the
        // guard is dropped outside a runtime context this will panic; confirm
        // callers only drop it from async tasks.
        let mount_id = self.mount_id.clone();
        let task_id: String = self.task_id.clone();
        tokio::spawn(async move {
            match unmount_antares_fs(&mount_id).await {
                Ok(_) => tracing::info!("[Task {}] Filesystem unmounted successfully.", task_id),
                Err(e) => tracing::error!("[Task {}] Failed to unmount filesystem: {}", task_id, e),
            }
        });
    }
}

/// Executes buck build with filesystem mounting and output streaming.
///
/// Process flow:
Expand Down Expand Up @@ -612,8 +546,6 @@ pub async fn build(
const MAX_TARGETS_ATTEMPTS: usize = 2;
let mut mount_point = None;
let mut old_repo_mount_point_saved = None;
let mut mount_guard = None;
let mut mount_guard_old_repo = None;
let mut targets: Vec<TargetLabel> = Vec::new();
let mut last_targets_error: Option<anyhow::Error> = None;

Expand All @@ -629,13 +561,11 @@ pub async fn build(
// Buck2 isolates daemons by project root, so distinct mount paths
// naturally get separate daemons without needing `--isolation-dir`.
let id_for_old_repo = format!("{id}-old-{attempt}");
let (old_repo_mount_point, mount_id_old_repo) =
let (old_repo_mount_point, _mount_id_old_repo) =
mount_antares_fs(&id_for_old_repo, None).await?;
let guard_old_repo = MountGuard::new(mount_id_old_repo.clone(), id_for_old_repo);

let id_for_repo = format!("{id}-{attempt}");
let (repo_mount_point, mount_id) = mount_antares_fs(&id_for_repo, cl_arg).await?;
let guard = MountGuard::new(mount_id.clone(), id_for_repo);
let (repo_mount_point, _mount_id) = mount_antares_fs(&id_for_repo, cl_arg).await?;
Comment on lines 563 to +568

tracing::info!(
"[Task {}] Filesystem mounted successfully (attempt {}/{}).",
Expand All @@ -658,23 +588,12 @@ pub async fn build(
Ok(found_targets) => {
mount_point = Some(repo_mount_point);
old_repo_mount_point_saved = Some(old_repo_mount_point.clone());
mount_guard = Some(guard);
mount_guard_old_repo = Some(guard_old_repo);
targets = found_targets;
break;
}
Err(e) => {
if keep_failed_mounts() {
tracing::info!(
"[Task {}] Keeping failed mounts alive for debugging (ORION_KEEP_FAILED_MOUNTS=1)",
id,
);
} else {
guard.unmount().await;
guard_old_repo.unmount().await;
}
tracing::warn!(
"[Task {}] Failed to get build targets (attempt {}/{}): {}. Mounts kept alive for debugging (old={}, new={}).",
"[Task {}] Failed to get build targets (attempt {}/{}): {}. Mounts retained for debugging (old={}, new={}).",
id,
attempt,
MAX_TARGETS_ATTEMPTS,
Expand Down Expand Up @@ -712,9 +631,6 @@ pub async fn build(
return Err(err.into());
}
};
let mount_guard = mount_guard.ok_or("Mount guard missing after target discovery")?;
let mount_guard_old_repo =
mount_guard_old_repo.ok_or("Old repo mount guard missing after target discovery")?;

let build_result = async {
// Run buck2 build from the sub-project directory, not the monorepo root.
Expand Down Expand Up @@ -830,41 +746,14 @@ pub async fn build(
}
.await;

if keep_failed_mounts() {
tracing::info!(
"[Task {}] Skipping unmount (ORION_KEEP_FAILED_MOUNTS=1) — mount directories retained: \
new_repo mountpoint={}, mount_id={}; \
old_repo mountpoint={}, mount_id={}",
id,
mount_point,
mount_guard.mount_id,
old_repo_mount_point_saved.as_deref().unwrap_or("<unknown>"),
mount_guard_old_repo.mount_id,
);
// Prevent the Drop impl from unmounting.
mount_guard.unmounted.store(true, Ordering::Release);
mount_guard_old_repo
.unmounted
.store(true, Ordering::Release);
} else {
mount_guard.unmount().await;
mount_guard_old_repo.unmount().await;
}
tracing::info!(
"[Task {}] Build completed — mount directories retained for debugging: \
new_repo mountpoint={}; \
old_repo mountpoint={}",
id,
Comment on lines +749 to +753
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Restore mount cleanup for normal build completion

In build(), the post-build path now only logs that mount directories are retained and never calls unmount_antares_fs for either the new or old repo mount. In a long-lived runner this leaks at least two Antares/FUSE mounts per successful build (and additional mounts from failed target-discovery attempts), which will eventually exhaust mount/disk resources and cause later builds to fail. The previous behavior cleaned up by default and only retained mounts behind an explicit flag.

Useful? React with 👍 / 👎.

mount_point,
old_repo_mount_point_saved.as_deref().unwrap_or("<unknown>"),
);
Comment on lines +749 to +756

build_result
}

#[cfg(test)]
mod tests {
use super::*;

#[tokio::test]
async fn test_mount_guard_creation() {
let mount_guard = MountGuard::new("test_mount_id".to_string(), "test_task_id".to_string());
assert_eq!(mount_guard.mount_id, "test_mount_id");
assert_eq!(mount_guard.task_id, "test_task_id");
}

// Note: mount/unmount tests removed - they now use scorpiofs direct calls
// which require actual filesystem setup. See integration tests instead.
}
}
3 changes: 2 additions & 1 deletion orion/systemd/orion-runner.service
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ CapabilityBoundingSet=CAP_SYS_ADMIN CAP_DAC_OVERRIDE CAP_DAC_READ_SEARCH

# Basic hardening (relaxed for FUSE operations)
NoNewPrivileges=false
PrivateTmp=true
PrivateTmp=false
PrivateMounts=no

[Install]
WantedBy=multi-user.target
Loading