From 18f55cf299c30d1a67193c5d3bbf0a1b9bf870e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Ber=C3=A1nek?= Date: Wed, 27 Aug 2025 12:01:26 +0200 Subject: [PATCH 1/4] Change directory structure of toolchain cache directory To make it easier to figure out how many toolchains we have. --- collector/src/toolchain.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collector/src/toolchain.rs b/collector/src/toolchain.rs index 2f2caed3d..457de6f6a 100644 --- a/collector/src/toolchain.rs +++ b/collector/src/toolchain.rs @@ -43,7 +43,7 @@ impl Sysroot { triple: &str, backends: &[CodegenBackend], ) -> Result { - let cache_directory = cache_directory.join(triple).join(&sha); + let cache_directory = cache_directory.join(&sha).join(triple); fs::create_dir_all(&cache_directory).map_err(|e| SysrootDownloadError::IO(e.into()))?; let download = SysrootDownload { From 027ecfbf02ddd66b367f8b27394093ab8bdd5b49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Ber=C3=A1nek?= Date: Wed, 27 Aug 2025 12:00:24 +0200 Subject: [PATCH 2/4] Implement tidying of collector cache directory --- collector/src/bin/collector.rs | 41 ++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/collector/src/bin/collector.rs b/collector/src/bin/collector.rs index e91720dcd..38d4691e4 100644 --- a/collector/src/bin/collector.rs +++ b/collector/src/bin/collector.rs @@ -65,8 +65,13 @@ use database::{ CommitType, Connection, Pool, }; +/// Directory used to cache downloaded Rust toolchains on disk. const TOOLCHAIN_CACHE_DIRECTORY: &str = "cache"; +/// Maximum allowed number of toolchains in the toolchain cache directory. +/// If the directory contains more toolchains than this, it is purged. 
+const TOOLCHAIN_CACHE_MAX_TOOLCHAINS: usize = 30; + fn n_normal_benchmarks_remaining(n: usize) -> String { let suffix = if n == 1 { "" } else { "s" }; format!("{n} normal benchmark{suffix} remaining") @@ -1434,6 +1439,8 @@ async fn run_job_queue_benchmarks( all_compile_benchmarks: Vec, check_git_sha: bool, ) -> anyhow::Result<()> { + let _ = tidy_toolchain_cache_dir(); + let mut last_request_tag = None; while let Some((benchmark_job, artifact_id)) = conn @@ -1444,20 +1451,25 @@ async fn run_job_queue_benchmarks( ) .await? { + // Are we benchmarking a different benchmark request than in the previous iteration of the + // loop? + let is_new_request = last_request_tag.is_some() + && last_request_tag.as_deref() != Some(benchmark_job.request_tag()); + if is_new_request { + let _ = tidy_toolchain_cache_dir(); + } + // Here we check if we should update our commit SHA, if rustc-perf has been updated. // We only check for updates when we switch *benchmark requests*, not *benchmark jobs*, // to avoid changing code in the middle of benchmarking the same request. // Note that if an update happens, the job that we have just dequeued will have its deque // counter increased. But since updates are relatively rare, that shouldn't be a big deal, // it will be dequeued again when the collector starts again. - if check_git_sha - && last_request_tag.is_some() - && last_request_tag.as_deref() != Some(benchmark_job.request_tag()) - && needs_git_update(collector) - { + if check_git_sha && is_new_request && needs_git_update(collector) { log::warn!("Exiting collector to update itself from git."); return Ok(()); } + last_request_tag = Some(benchmark_job.request_tag().to_string()); log::info!("Dequeued job {benchmark_job:?}, artifact_id {artifact_id:?}"); @@ -1523,6 +1535,23 @@ async fn run_job_queue_benchmarks( Ok(()) } +/// Check the toolchain cache directory and delete it if it grows too large. +/// Currently, we just assume that "too large" means "has more than N toolchains". 
+fn tidy_toolchain_cache_dir() -> std::io::Result<()> { + let dir_count = Path::new(TOOLCHAIN_CACHE_DIRECTORY) + .read_dir()? + .filter_map(|e| e.ok()) + .filter_map(|d| d.file_type().ok()) + .filter(|t| t.is_dir()) + .count(); + if dir_count > TOOLCHAIN_CACHE_MAX_TOOLCHAINS { + log::warn!("Purging toolchain cache directory at {TOOLCHAIN_CACHE_DIRECTORY}"); + // Just remove the whole directory, to avoid having to figure out which toolchains are old + std::fs::remove_dir_all(TOOLCHAIN_CACHE_DIRECTORY)?; + } + Ok(()) +} + /// Returns true if the commit SHA of collector does not match the latest commit SHA of the master /// branch of https://github.com/rust-lang/rustc-perf. fn needs_git_update(collector: &CollectorConfig) -> bool { @@ -1606,8 +1635,6 @@ async fn run_benchmark_job( }; // Avoid redownloading the same sysroot multiple times for different jobs, even // across collector restarts. - - // TODO: Periodically clear the cache directory to avoid running out of disk space. sysroot.preserve(); Toolchain::from_sysroot(&sysroot, commit.sha.clone()) } From b45389282cc0eb7e88748af9db368268267b4c71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Ber=C3=A1nek?= Date: Wed, 27 Aug 2025 15:51:31 +0200 Subject: [PATCH 3/4] Add shared function for getting the latest SHA of a git repository --- collector/src/bin/collector.rs | 45 ++++++++++++++++------------------ 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/collector/src/bin/collector.rs b/collector/src/bin/collector.rs index 38d4691e4..93ba85cac 100644 --- a/collector/src/bin/collector.rs +++ b/collector/src/bin/collector.rs @@ -1274,15 +1274,8 @@ fn main_result() -> anyhow::Result { } Commands::InstallNext { codegen_backends } => { - let last_sha = Command::new("git") - .arg("ls-remote") - .arg("https://github.com/rust-lang/rust.git") - .arg("master") - .output() - .unwrap(); - let last_sha = String::from_utf8(last_sha.stdout).expect("utf8"); - let last_sha = 
last_sha.split_whitespace().next().expect(&last_sha); - let commit = get_commit_or_fake_it(last_sha).expect("success"); + let last_sha = get_latest_sha("https://github.com/rust-lang/rust").unwrap(); + let commit = get_commit_or_fake_it(&last_sha).expect("success"); let rt = build_async_runtime(); let mut sysroot = rt @@ -1559,21 +1552,8 @@ fn needs_git_update(collector: &CollectorConfig) -> bool { return false; }; - let mut cmd = Command::new("git"); - cmd.arg("ls-remote") - .arg("https://github.com/rust-lang/rustc-perf") - .arg("HEAD"); - let upstream_sha = match command_output(&mut cmd) { - Ok(output) => String::from_utf8(output.stdout) - .unwrap() - .split_whitespace() - .next() - .unwrap() - .to_string(), - Err(error) => { - log::error!("Cannot determine latest SHA of rustc-perf: {error:?}"); - return false; - } + let Ok(upstream_sha) = get_latest_sha("https://github.com/rust-lang/rustc-perf") else { + return false; }; if commit_sha != upstream_sha { log::warn!( @@ -1585,6 +1565,23 @@ fn needs_git_update(collector: &CollectorConfig) -> bool { } } +/// Returns the latest known sha of the default branch of the specified `repo`. +fn get_latest_sha(repo: &str) -> anyhow::Result<String> { + let mut cmd = Command::new("git"); + cmd.arg("ls-remote").arg(repo).arg("HEAD"); + match command_output(&mut cmd) { + Ok(output) => Ok(String::from_utf8(output.stdout)? + .split_whitespace() + .next() + .unwrap() + .to_string()), + Err(error) => { + log::error!("Cannot determine latest SHA of {repo}: {error:?}"); + Err(error) + } + } +} + /// Error that happened during benchmarking of a job. enum BenchmarkJobError { /// The error is non-recoverable. 
From 2c12d1499b121d98aad51077fcf1edc6c9213dcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Ber=C3=A1nek?= Date: Wed, 27 Aug 2025 16:07:45 +0200 Subject: [PATCH 4/4] Change toolchain cache directory layout --- collector/src/toolchain.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/collector/src/toolchain.rs b/collector/src/toolchain.rs index 457de6f6a..946312487 100644 --- a/collector/src/toolchain.rs +++ b/collector/src/toolchain.rs @@ -43,7 +43,13 @@ impl Sysroot { triple: &str, backends: &[CodegenBackend], ) -> Result { - let cache_directory = cache_directory.join(&sha).join(triple); + // The structure of this directory is load-bearing. + // We use the commit SHA as the top-level key, to have a quick way of estimating how many + // toolchains have been installed in the cache directory. + // We also use a nested directory below the target tuple, because rustc outputs weird things + // when we query it with `--print sysroot` and its sysroot is located in a directory that + // corresponds to a valid target name. + let cache_directory = cache_directory.join(&sha).join(triple).join("toolchain"); fs::create_dir_all(&cache_directory).map_err(|e| SysrootDownloadError::IO(e.into()))?; let download = SysrootDownload {