Skip to content

Commit 9c07db3

Browse files
committed
perf: Decouple filesystem walk from git index construction
Split find_untracked_files into two phases so that the filesystem walk can run in parallel with git index construction:

1. walk_candidate_files (I/O-bound): Enumerates all non-gitignored files within scope using the ignore crate's native gitignore support. Only needs the git root path and package prefixes — no tracked index.

2. filter_untracked_from_candidates (CPU-bound): Binary-searches each candidate against ls_tree_hashes and status_entries to identify truly untracked files. Runs after the tracked index is ready.

The git root is sent via a oneshot channel as soon as SCM::new() resolves it (~5ms), while new_from_gix_index continues (~267ms). The walk starts immediately and runs in parallel with index construction.

Benchmark (110-package monorepo, 30 runs, sandbox):
  baseline: 853ms ± 19ms
  improved: 619ms ± 6ms (1.38x faster)
1 parent 24bd765 commit 9c07db3

File tree

4 files changed

+398
-27
lines changed

4 files changed

+398
-27
lines changed

crates/turborepo-lib/src/run/builder.rs

Lines changed: 29 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,9 @@ impl RunBuilder {
262262
let start_at = Local::now();
263263

264264
let (tracked_index_tx, tracked_index_rx) =
265-
tokio::sync::oneshot::channel::<(SCM, Option<turborepo_scm::RepoGitIndex>)>();
265+
tokio::sync::oneshot::channel::<Option<turborepo_scm::RepoGitIndex>>();
266+
let (git_root_tx, git_root_rx) =
267+
tokio::sync::oneshot::channel::<Option<turbopath::AbsoluteSystemPathBuf>>();
266268
let scm_task = {
267269
let repo_root = self.repo_root.clone();
268270
let git_root = self.opts.git_root.clone();
@@ -271,8 +273,11 @@ impl RunBuilder {
271273
Some(root) => SCM::new_with_git_root(&repo_root, root),
272274
None => SCM::new(&repo_root),
273275
};
276+
// Send git root immediately so the filesystem walk can start
277+
// while index construction continues.
278+
let _ = git_root_tx.send(scm.git_root().map(|r| r.to_owned()));
274279
let repo_index = scm.build_tracked_repo_index_eager();
275-
let _ = tracked_index_tx.send((scm.clone(), repo_index));
280+
let _ = tracked_index_tx.send(repo_index);
276281
scm
277282
})
278283
};
@@ -349,40 +354,39 @@ impl RunBuilder {
349354
repo_telemetry.track_size(pkg_dep_graph.len());
350355
run_telemetry.track_run_type(self.opts.run_opts.dry_run.is_some());
351356

352-
// Spawn the untracked-file walk as soon as the package graph is ready.
353-
// We use all-package prefixes (superset of any filtered selection) so
354-
// the walk can start before filter resolution. Per-package hash queries
355-
// use binary-search range scoping, so extra untracked files outside a
356-
// queried package are never returned.
357+
// Spawn the filesystem walk as soon as the git root is resolved.
358+
// It only needs the git root and package prefixes, not the tracked
359+
// index. The walk runs in parallel with new_from_gix_index (~267ms).
357360
let all_prefixes = Self::all_package_prefixes(&pkg_dep_graph);
358-
let repo_index_task = if all_prefixes.is_empty() {
361+
let walk_task = if all_prefixes.is_empty() {
359362
None
360363
} else {
361364
Some(tokio::task::spawn(async move {
362-
let (scm, tracked_index) = match tracked_index_rx.await {
363-
Ok(pair) => pair,
364-
Err(_) => return None,
365+
let git_root = match git_root_rx.await {
366+
Ok(Some(root)) => root,
367+
_ => return None,
365368
};
366-
let tracked_index = tracked_index?;
367369
tokio::task::spawn_blocking(move || {
368-
let _span = tracing::info_span!("repo_index_scope_untracked").entered();
369-
let mut repo_index = tracked_index;
370-
match scm.populate_repo_index_untracked(&mut repo_index, &all_prefixes) {
371-
Ok(()) => Some(repo_index),
372-
Err(err) => {
373-
tracing::debug!(
374-
"failed to scope repo git index with untracked files: {}. Will \
375-
hash per-package.",
376-
err,
377-
);
378-
None
379-
}
380-
}
370+
let _span = tracing::info_span!("walk_candidate_files").entered();
371+
turborepo_scm::walk_candidate_files(git_root.as_std_path(), Some(&all_prefixes))
372+
.ok()
381373
})
382374
.await
383375
.ok()?
384376
}))
385377
};
378+
379+
// Combine the walk results with the tracked index once both are ready.
380+
let repo_index_task = walk_task.map(|walk_task| {
381+
tokio::task::spawn(async move {
382+
let (candidates, tracked_index) = tokio::join!(walk_task, tracked_index_rx);
383+
let candidates = candidates.ok()??;
384+
let tracked_index = tracked_index.ok()??;
385+
let mut repo_index = tracked_index;
386+
repo_index.populate_untracked_from_candidates(candidates);
387+
Some(repo_index)
388+
})
389+
});
386390
let micro_frontend_configs = {
387391
let _span = tracing::info_span!("micro_frontends_from_disk").entered();
388392
match MicrofrontendsConfigs::from_disk(&self.repo_root, &pkg_dep_graph) {

crates/turborepo-scm/src/git_index_regression_tests.rs

Lines changed: 169 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
1313
use turbopath::{AnchoredSystemPathBuf, RelativeUnixPathBuf};
1414

15-
use crate::{GitHashes, RepoGitIndex, SCM, test_utils};
15+
use crate::{GitHashes, RepoGitIndex, SCM, test_utils, walk_candidate_files};
1616

1717
fn path(s: &str) -> RelativeUnixPathBuf {
1818
RelativeUnixPathBuf::new(s).unwrap()
@@ -61,6 +61,23 @@ impl TestRepo {
6161
.expect("failed to build repo index")
6262
}
6363

64+
fn build_split_repo_index(&self, prefixes: &[&str]) -> RepoGitIndex {
65+
let scm = self.scm();
66+
let prefix_paths = prefixes
67+
.iter()
68+
.map(|prefix| path(prefix))
69+
.collect::<Vec<_>>();
70+
71+
let candidates = walk_candidate_files(self.root.as_std_path(), Some(&prefix_paths))
72+
.expect("walk candidates failed");
73+
74+
let mut index = scm
75+
.build_tracked_repo_index_eager()
76+
.expect("failed to build tracked repo index");
77+
index.populate_untracked_from_candidates(candidates);
78+
index
79+
}
80+
6481
fn build_scoped_repo_index(&self, prefixes: &[&str]) -> RepoGitIndex {
6582
let scm = self.scm();
6683
let mut index = scm
@@ -1421,6 +1438,157 @@ fn test_superset_walk_untracked_in_other_pkg_does_not_affect_queried_pkg() {
14211438
assert!(pkg_b_hashes.contains_key(&path("extra5.ts")));
14221439
}
14231440

1441+
// Category 6: Split walk/filter regression tests
1442+
//
1443+
// These tests validate that the two-phase approach (walk_candidate_files
1444+
// followed by populate_untracked_from_candidates) produces identical
1445+
// results to the original single-pass find_untracked_files.
1446+
1447+
#[test]
1448+
fn test_split_walk_matches_original_path() {
1449+
let repo = TestRepo::new();
1450+
1451+
repo.create_file("pkg-a/src/index.ts", "a code");
1452+
repo.create_file("pkg-a/package.json", "{}");
1453+
repo.create_file("pkg-b/src/index.ts", "b code");
1454+
repo.create_file("pkg-b/package.json", "{}");
1455+
repo.create_file("package.json", "{}");
1456+
repo.commit_all();
1457+
1458+
repo.create_file("pkg-a/untracked.ts", "new a");
1459+
repo.create_file("pkg-b/untracked.ts", "new b");
1460+
1461+
let original = repo.build_scoped_repo_index(&["pkg-a", "pkg-b"]);
1462+
let split = repo.build_split_repo_index(&["pkg-a", "pkg-b"]);
1463+
1464+
let orig_a = repo.get_hashes_with_index("pkg-a", &original);
1465+
let split_a = repo.get_hashes_with_index("pkg-a", &split);
1466+
assert_eq!(orig_a, split_a, "pkg-a: split walk must match original");
1467+
1468+
let orig_b = repo.get_hashes_with_index("pkg-b", &original);
1469+
let split_b = repo.get_hashes_with_index("pkg-b", &split);
1470+
assert_eq!(orig_b, split_b, "pkg-b: split walk must match original");
1471+
1472+
let no_idx_a = repo.get_hashes_no_index("pkg-a");
1473+
let no_idx_b = repo.get_hashes_no_index("pkg-b");
1474+
assert_eq!(split_a, no_idx_a, "pkg-a: split vs no-index");
1475+
assert_eq!(split_b, no_idx_b, "pkg-b: split vs no-index");
1476+
}
1477+
1478+
#[test]
1479+
fn test_split_walk_respects_gitignore() {
1480+
let repo = TestRepo::new();
1481+
1482+
repo.create_gitignore(".gitignore", "*.log\nbuild/\nnode_modules/\n");
1483+
repo.create_gitignore("pkg-b/.gitignore", "tmp/\n");
1484+
repo.create_file("pkg-a/src/index.ts", "a");
1485+
repo.create_file("pkg-a/package.json", "{}");
1486+
repo.create_file("pkg-b/src/index.ts", "b");
1487+
repo.create_file("pkg-b/package.json", "{}");
1488+
repo.create_file("package.json", "{}");
1489+
repo.commit_all();
1490+
1491+
repo.create_file("pkg-a/debug.log", "log");
1492+
repo.create_file("pkg-a/build/out.js", "out");
1493+
repo.create_file("pkg-a/node_modules/dep/index.js", "dep");
1494+
repo.create_file("pkg-b/tmp/cache.dat", "cache");
1495+
repo.create_file("pkg-b/build/out.js", "out");
1496+
repo.create_file("pkg-a/new.ts", "new");
1497+
repo.create_file("pkg-b/new.ts", "new");
1498+
1499+
let original = repo.build_scoped_repo_index(&["pkg-a", "pkg-b"]);
1500+
let split = repo.build_split_repo_index(&["pkg-a", "pkg-b"]);
1501+
1502+
let orig_a = repo.get_hashes_with_index("pkg-a", &original);
1503+
let split_a = repo.get_hashes_with_index("pkg-a", &split);
1504+
assert_eq!(
1505+
orig_a, split_a,
1506+
"pkg-a: split must match original with gitignore"
1507+
);
1508+
assert!(split_a.contains_key(&path("new.ts")));
1509+
assert!(!split_a.contains_key(&path("debug.log")));
1510+
assert!(!split_a.contains_key(&path("build/out.js")));
1511+
assert!(!split_a.contains_key(&path("node_modules/dep/index.js")));
1512+
1513+
let orig_b = repo.get_hashes_with_index("pkg-b", &original);
1514+
let split_b = repo.get_hashes_with_index("pkg-b", &split);
1515+
assert_eq!(
1516+
orig_b, split_b,
1517+
"pkg-b: split must match original with gitignore"
1518+
);
1519+
assert!(split_b.contains_key(&path("new.ts")));
1520+
assert!(!split_b.contains_key(&path("tmp/cache.dat")));
1521+
}
1522+
1523+
#[test]
1524+
fn test_split_walk_with_untracked_gitignore() {
1525+
let repo = TestRepo::new();
1526+
1527+
repo.create_file("pkg-a/src/index.ts", "a");
1528+
repo.create_file("pkg-a/package.json", "{}");
1529+
repo.create_file("package.json", "{}");
1530+
repo.commit_all();
1531+
1532+
repo.create_gitignore("pkg-a/.gitignore", "generated/\n");
1533+
repo.create_file("pkg-a/generated/output.js", "out");
1534+
repo.create_file("pkg-a/new.ts", "new");
1535+
1536+
let original = repo.build_scoped_repo_index(&["pkg-a"]);
1537+
let split = repo.build_split_repo_index(&["pkg-a"]);
1538+
1539+
let orig_a = repo.get_hashes_with_index("pkg-a", &original);
1540+
let split_a = repo.get_hashes_with_index("pkg-a", &split);
1541+
1542+
assert!(split_a.contains_key(&path("new.ts")));
1543+
assert!(orig_a.contains_key(&path("new.ts")));
1544+
assert!(
1545+
!split_a.contains_key(&path("generated/output.js")),
1546+
"split walk should respect untracked .gitignore"
1547+
);
1548+
assert!(
1549+
!orig_a.contains_key(&path("generated/output.js")),
1550+
"original walk should respect untracked .gitignore"
1551+
);
1552+
assert!(split_a.contains_key(&path(".gitignore")));
1553+
assert!(orig_a.contains_key(&path(".gitignore")));
1554+
}
1555+
1556+
#[test]
1557+
fn test_split_walk_nested_gitignore_scoping() {
1558+
let repo = TestRepo::new();
1559+
1560+
repo.create_gitignore(".gitignore", "*.log\n");
1561+
repo.create_gitignore("pkg-a/.gitignore", "output/\n");
1562+
repo.create_file("pkg-a/src/index.ts", "a");
1563+
repo.create_file("pkg-a/package.json", "{}");
1564+
repo.create_file("pkg-b/src/index.ts", "b");
1565+
repo.create_file("pkg-b/package.json", "{}");
1566+
repo.create_file("package.json", "{}");
1567+
repo.commit_all();
1568+
1569+
repo.create_file("pkg-a/output/bundle.js", "a bundle");
1570+
repo.create_file("pkg-b/output/bundle.js", "b bundle");
1571+
1572+
let split = repo.build_split_repo_index(&["pkg-a", "pkg-b"]);
1573+
1574+
let split_a = repo.get_hashes_with_index("pkg-a", &split);
1575+
assert!(
1576+
!split_a.contains_key(&path("output/bundle.js")),
1577+
"pkg-a output/ should be ignored by pkg-a/.gitignore"
1578+
);
1579+
1580+
let split_b = repo.get_hashes_with_index("pkg-b", &split);
1581+
assert!(
1582+
split_b.contains_key(&path("output/bundle.js")),
1583+
"pkg-b output/ should NOT be ignored — the output/ rule is scoped to pkg-a"
1584+
);
1585+
1586+
let no_idx_a = repo.get_hashes_no_index("pkg-a");
1587+
let no_idx_b = repo.get_hashes_no_index("pkg-b");
1588+
assert_eq!(split_a, no_idx_a, "pkg-a split vs no-index");
1589+
assert_eq!(split_b, no_idx_b, "pkg-b split vs no-index");
1590+
}
1591+
14241592
#[test]
14251593
fn test_nested_gitignore_scoping() {
14261594
// Gitignore rules in a nested .gitignore should only apply to that

crates/turborepo-scm/src/lib.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ mod git_index_regression_tests;
3434
#[cfg(test)]
3535
mod test_utils;
3636

37-
pub use repo_index::RepoGitIndex;
37+
pub use repo_index::{RepoGitIndex, walk_candidate_files};
3838
pub use turborepo_hash::OidHash;
3939
pub use worktree::WorktreeInfo;
4040

@@ -311,6 +311,13 @@ impl SCM {
311311
matches!(self, SCM::Manual)
312312
}
313313

314+
pub fn git_root(&self) -> Option<&AbsoluteSystemPath> {
315+
match self {
316+
SCM::Git(git) => Some(&git.root),
317+
SCM::Manual => None,
318+
}
319+
}
320+
314321
/// Build a repo-wide git index that caches `git ls-tree` and `git status`
315322
/// results. Returns `None` for manual SCM mode or when the package count
316323
/// is too small to benefit. Callers should build this once before parallel

0 commit comments

Comments
 (0)