Skip to content

Commit f6d8cf5

Browse files
fix(orion): retain Antares mounts and improve dicfuse startup diagnostics (#2026)
Signed-off-by: Luxian <lux1an@qq.com>
1 parent 4d34cfa commit f6d8cf5

File tree

3 files changed

+159
-126
lines changed

3 files changed

+159
-126
lines changed

orion/src/antares.rs

Lines changed: 145 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,19 @@
33
//! This module provides a singleton wrapper around `scorpiofs::AntaresManager`
44
//! for managing overlay filesystem mounts used during build operations.
55
6-
use std::{error::Error, io, path::PathBuf, sync::Arc, time::Duration};
6+
use std::{
7+
error::Error,
8+
io,
9+
path::{Path, PathBuf},
10+
sync::Arc,
11+
time::Duration,
12+
};
713

814
use scorpiofs::{AntaresConfig, AntaresManager, AntaresPaths};
915
use tokio::sync::OnceCell;
1016

1117
static MANAGER: OnceCell<Arc<AntaresManager>> = OnceCell::const_new();
18+
const TEST_BROWSE_JOB_ID: &str = "antares_test";
1219

1320
type DynError = Box<dyn Error + Send + Sync>;
1421

@@ -128,6 +135,7 @@ pub async fn mount_job(job_id: &str, cl: Option<&str>) -> Result<AntaresConfig,
128135
pub(crate) async fn warmup_dicfuse() -> Result<(), DynError> {
129136
tracing::info!("Initializing Antares Dicfuse during Orion startup");
130137
let manager = get_manager().await?;
138+
let manager_for_test_mount = Arc::clone(manager);
131139
let dicfuse = manager.dicfuse();
132140

133141
// Idempotent: safe even if the manager already started import internally.
@@ -149,7 +157,17 @@ pub(crate) async fn warmup_dicfuse() -> Result<(), DynError> {
149157
)
150158
.await
151159
{
152-
Ok(_) => tracing::info!("Antares Dicfuse warmup completed"),
160+
Ok(_) => {
161+
tracing::info!("Antares Dicfuse warmup completed");
162+
log_dicfuse_root_tree();
163+
if is_test_mount_enabled() {
164+
ensure_test_mount(manager_for_test_mount.as_ref()).await;
165+
} else {
166+
tracing::info!(
167+
"Antares test mount disabled by ORION_ENABLE_ANTARES_TEST_MOUNT"
168+
);
169+
}
170+
}
153171
Err(_) => tracing::warn!(
154172
"Antares Dicfuse warmup timed out after {}s",
155173
warmup_timeout_secs
@@ -160,6 +178,131 @@ pub(crate) async fn warmup_dicfuse() -> Result<(), DynError> {
160178
Ok(())
161179
}
162180

181+
/// Reports whether the startup test mount should be created.
///
/// Reads `ORION_ENABLE_ANTARES_TEST_MOUNT`. The feature is opt-out: it is
/// considered enabled when the variable is unset (or cannot be read as
/// Unicode) and for every value except `0`, `false`, `no` and `off`, which
/// are matched after trimming surrounding whitespace and ignoring ASCII case.
fn is_test_mount_enabled() -> bool {
    // The only spellings that switch the test mount off.
    const DISABLING_VALUES: [&str; 4] = ["0", "false", "no", "off"];

    std::env::var("ORION_ENABLE_ANTARES_TEST_MOUNT")
        .map(|raw| {
            let flag = raw.trim();
            !DISABLING_VALUES
                .iter()
                .any(|off| flag.eq_ignore_ascii_case(off))
        })
        // Unset / unreadable variable keeps the default of "enabled".
        .unwrap_or(true)
}
190+
191+
async fn ensure_test_mount(manager: &AntaresManager) {
192+
match manager.mount_job(TEST_BROWSE_JOB_ID, None).await {
193+
Ok(config) => {
194+
tracing::info!(
195+
"Antares test mount ready: job_id={}, mountpoint={}",
196+
TEST_BROWSE_JOB_ID,
197+
config.mountpoint.display()
198+
);
199+
}
200+
Err(err) => {
201+
tracing::warn!(
202+
"Failed to create Antares test mount job_id={}: {}",
203+
TEST_BROWSE_JOB_ID,
204+
err
205+
);
206+
}
207+
}
208+
}
209+
210+
fn log_dicfuse_root_tree() {
211+
let root = PathBuf::from(scorpiofs::util::config::workspace());
212+
let max_depth = std::env::var("ORION_DICFUSE_ROOT_TREE_DEPTH")
213+
.ok()
214+
.and_then(|v| v.parse::<usize>().ok())
215+
.unwrap_or(2);
216+
let max_entries = std::env::var("ORION_DICFUSE_ROOT_TREE_MAX_ENTRIES")
217+
.ok()
218+
.and_then(|v| v.parse::<usize>().ok())
219+
.unwrap_or(200);
220+
221+
tracing::info!(
222+
root = %root.display(),
223+
max_depth,
224+
max_entries,
225+
"Dicfuse init: printing workspace root tree"
226+
);
227+
228+
if !root.exists() {
229+
tracing::warn!("Dicfuse workspace path does not exist: {}", root.display());
230+
return;
231+
}
232+
233+
let mut printed = 0usize;
234+
tracing::info!("[dicfuse-root] /");
235+
log_tree_recursive(&root, &root, 0, max_depth, max_entries, &mut printed);
236+
237+
if printed >= max_entries {
238+
tracing::info!(
239+
"Dicfuse root tree output truncated at {} entries (set ORION_DICFUSE_ROOT_TREE_MAX_ENTRIES to increase)",
240+
max_entries
241+
);
242+
}
243+
}
244+
245+
fn log_tree_recursive(
246+
root: &Path,
247+
current: &Path,
248+
depth: usize,
249+
max_depth: usize,
250+
max_entries: usize,
251+
printed: &mut usize,
252+
) {
253+
if depth >= max_depth || *printed >= max_entries {
254+
return;
255+
}
256+
257+
let entries = match std::fs::read_dir(current) {
258+
Ok(entries) => entries,
259+
Err(err) => {
260+
tracing::warn!("Failed to read {}: {}", current.display(), err);
261+
return;
262+
}
263+
};
264+
265+
let mut children: Vec<(String, PathBuf, bool)> = Vec::new();
266+
for entry in entries {
267+
let entry = match entry {
268+
Ok(entry) => entry,
269+
Err(err) => {
270+
tracing::warn!("read_dir entry error under {}: {}", current.display(), err);
271+
continue;
272+
}
273+
};
274+
275+
let path = entry.path();
276+
let name = entry.file_name().to_string_lossy().to_string();
277+
let is_dir = entry.file_type().map(|t| t.is_dir()).unwrap_or(false);
278+
children.push((name, path, is_dir));
279+
}
280+
281+
children.sort_by(|a, b| a.0.cmp(&b.0));
282+
283+
for (_name, path, is_dir) in children {
284+
if *printed >= max_entries {
285+
return;
286+
}
287+
288+
let rel = path
289+
.strip_prefix(root)
290+
.map(|p| p.display().to_string())
291+
.unwrap_or_else(|_| path.display().to_string());
292+
let indent = " ".repeat(depth + 1);
293+
if is_dir {
294+
tracing::info!("[dicfuse-root] {}{}/", indent, rel);
295+
} else {
296+
tracing::info!("[dicfuse-root] {}{}", indent, rel);
297+
}
298+
*printed += 1;
299+
300+
if is_dir {
301+
log_tree_recursive(root, &path, depth + 1, max_depth, max_entries, printed);
302+
}
303+
}
304+
}
305+
163306
/// Unmount and cleanup a job overlay filesystem.
164307
///
165308
/// # Arguments

orion/src/buck_controller.rs

Lines changed: 12 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ use std::{
44
io::BufReader,
55
path::{Path, PathBuf},
66
process::{ExitStatus, Stdio},
7-
sync::atomic::{AtomicBool, Ordering},
87
};
98

109
use anyhow::anyhow;
@@ -46,14 +45,6 @@ static PROJECT_ROOT: Lazy<String> =
4645
const DEFAULT_PREHEAT_SHALLOW_DEPTH: usize = 3;
4746
static BUILD_CONFIG: Lazy<Option<BuildConfig>> = Lazy::new(load_build_config);
4847

49-
/// Check whether failed-build mounts should be kept alive for debugging.
50-
/// Controlled by the `ORION_KEEP_FAILED_MOUNTS` environment variable (set to "1" to enable).
51-
fn keep_failed_mounts() -> bool {
52-
std::env::var("ORION_KEEP_FAILED_MOUNTS")
53-
.map(|v| v == "1")
54-
.unwrap_or(false)
55-
}
56-
5748
/// Mount an Antares overlay filesystem for a build job.
5849
///
5950
/// Creates a new Antares overlay mount using scorpiofs. The underlying Dicfuse
@@ -512,63 +503,6 @@ async fn flush_buffer(
512503
}
513504
}
514505

515-
/// RAII guard for automatically unmounting Antares filesystem when dropped
516-
struct MountGuard {
517-
mount_id: String,
518-
task_id: String,
519-
unmounted: AtomicBool,
520-
}
521-
522-
impl MountGuard {
523-
fn new(mount_id: String, task_id: String) -> Self {
524-
Self {
525-
mount_id,
526-
task_id,
527-
unmounted: AtomicBool::new(false),
528-
}
529-
}
530-
531-
async fn unmount(&self) {
532-
if self.unmounted.swap(true, Ordering::AcqRel) {
533-
return;
534-
}
535-
match unmount_antares_fs(&self.mount_id).await {
536-
Ok(_) => tracing::info!("[Task {}] Filesystem unmounted successfully.", self.task_id),
537-
Err(e) => {
538-
tracing::error!(
539-
"[Task {}] Failed to unmount filesystem: {}",
540-
self.task_id,
541-
e
542-
)
543-
}
544-
}
545-
}
546-
}
547-
548-
impl Drop for MountGuard {
549-
fn drop(&mut self) {
550-
if self.unmounted.load(Ordering::Acquire) {
551-
return;
552-
}
553-
if keep_failed_mounts() {
554-
tracing::info!(
555-
"[Task {}] MountGuard dropped — unmount skipped (ORION_KEEP_FAILED_MOUNTS=1, mount_id={}).",
556-
self.task_id,
557-
self.mount_id,
558-
);
559-
return;
560-
}
561-
let mount_id = self.mount_id.clone();
562-
let task_id: String = self.task_id.clone();
563-
tokio::spawn(async move {
564-
match unmount_antares_fs(&mount_id).await {
565-
Ok(_) => tracing::info!("[Task {}] Filesystem unmounted successfully.", task_id),
566-
Err(e) => tracing::error!("[Task {}] Failed to unmount filesystem: {}", task_id, e),
567-
}
568-
});
569-
}
570-
}
571-
572506
/// Executes buck build with filesystem mounting and output streaming.
573507
///
574508
/// Process flow:
@@ -612,8 +546,6 @@ pub async fn build(
612546
const MAX_TARGETS_ATTEMPTS: usize = 2;
613547
let mut mount_point = None;
614548
let mut old_repo_mount_point_saved = None;
615-
let mut mount_guard = None;
616-
let mut mount_guard_old_repo = None;
617549
let mut targets: Vec<TargetLabel> = Vec::new();
618550
let mut last_targets_error: Option<anyhow::Error> = None;
619551

@@ -629,13 +561,11 @@ pub async fn build(
629561
// Buck2 isolates daemons by project root, so distinct mount paths
630562
// naturally get separate daemons without needing `--isolation-dir`.
631563
let id_for_old_repo = format!("{id}-old-{attempt}");
632-
let (old_repo_mount_point, mount_id_old_repo) =
564+
let (old_repo_mount_point, _mount_id_old_repo) =
633565
mount_antares_fs(&id_for_old_repo, None).await?;
634-
let guard_old_repo = MountGuard::new(mount_id_old_repo.clone(), id_for_old_repo);
635566

636567
let id_for_repo = format!("{id}-{attempt}");
637-
let (repo_mount_point, mount_id) = mount_antares_fs(&id_for_repo, cl_arg).await?;
638-
let guard = MountGuard::new(mount_id.clone(), id_for_repo);
568+
let (repo_mount_point, _mount_id) = mount_antares_fs(&id_for_repo, cl_arg).await?;
639569

640570
tracing::info!(
641571
"[Task {}] Filesystem mounted successfully (attempt {}/{}).",
@@ -658,23 +588,12 @@ pub async fn build(
658588
Ok(found_targets) => {
659589
mount_point = Some(repo_mount_point);
660590
old_repo_mount_point_saved = Some(old_repo_mount_point.clone());
661-
mount_guard = Some(guard);
662-
mount_guard_old_repo = Some(guard_old_repo);
663591
targets = found_targets;
664592
break;
665593
}
666594
Err(e) => {
667-
if keep_failed_mounts() {
668-
tracing::info!(
669-
"[Task {}] Keeping failed mounts alive for debugging (ORION_KEEP_FAILED_MOUNTS=1)",
670-
id,
671-
);
672-
} else {
673-
guard.unmount().await;
674-
guard_old_repo.unmount().await;
675-
}
676595
tracing::warn!(
677-
"[Task {}] Failed to get build targets (attempt {}/{}): {}. Mounts kept alive for debugging (old={}, new={}).",
596+
"[Task {}] Failed to get build targets (attempt {}/{}): {}. Mounts retained for debugging (old={}, new={}).",
678597
id,
679598
attempt,
680599
MAX_TARGETS_ATTEMPTS,
@@ -712,9 +631,6 @@ pub async fn build(
712631
return Err(err.into());
713632
}
714633
};
715-
let mount_guard = mount_guard.ok_or("Mount guard missing after target discovery")?;
716-
let mount_guard_old_repo =
717-
mount_guard_old_repo.ok_or("Old repo mount guard missing after target discovery")?;
718634

719635
let build_result = async {
720636
// Run buck2 build from the sub-project directory, not the monorepo root.
@@ -830,41 +746,14 @@ pub async fn build(
830746
}
831747
.await;
832748

833-
if keep_failed_mounts() {
834-
tracing::info!(
835-
"[Task {}] Skipping unmount (ORION_KEEP_FAILED_MOUNTS=1) — mount directories retained: \
836-
new_repo mountpoint={}, mount_id={}; \
837-
old_repo mountpoint={}, mount_id={}",
838-
id,
839-
mount_point,
840-
mount_guard.mount_id,
841-
old_repo_mount_point_saved.as_deref().unwrap_or("<unknown>"),
842-
mount_guard_old_repo.mount_id,
843-
);
844-
// Prevent the Drop impl from unmounting.
845-
mount_guard.unmounted.store(true, Ordering::Release);
846-
mount_guard_old_repo
847-
.unmounted
848-
.store(true, Ordering::Release);
849-
} else {
850-
mount_guard.unmount().await;
851-
mount_guard_old_repo.unmount().await;
852-
}
749+
tracing::info!(
750+
"[Task {}] Build completed — mount directories retained for debugging: \
751+
new_repo mountpoint={}; \
752+
old_repo mountpoint={}",
753+
id,
754+
mount_point,
755+
old_repo_mount_point_saved.as_deref().unwrap_or("<unknown>"),
756+
);
853757

854758
build_result
855-
}
856-
857-
#[cfg(test)]
858-
mod tests {
859-
use super::*;
860-
861-
#[tokio::test]
862-
async fn test_mount_guard_creation() {
863-
let mount_guard = MountGuard::new("test_mount_id".to_string(), "test_task_id".to_string());
864-
assert_eq!(mount_guard.mount_id, "test_mount_id");
865-
assert_eq!(mount_guard.task_id, "test_task_id");
866-
}
867-
868-
// Note: mount/unmount tests removed - they now use scorpiofs direct calls
869-
// which require actual filesystem setup. See integration tests instead.
870-
}
759+
}

orion/systemd/orion-runner.service

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ CapabilityBoundingSet=CAP_SYS_ADMIN CAP_DAC_OVERRIDE CAP_DAC_READ_SEARCH
3030

3131
# Basic hardening (relaxed for FUSE operations)
3232
NoNewPrivileges=false
33-
PrivateTmp=true
33+
PrivateTmp=false
34+
PrivateMounts=no
3435

3536
[Install]
3637
WantedBy=multi-user.target

0 commit comments

Comments
 (0)