diff --git a/.github/workflows/hf-xet-tests.yml b/.github/workflows/hf-xet-tests.yml index 0b15d3ea..977b07c6 100644 --- a/.github/workflows/hf-xet-tests.yml +++ b/.github/workflows/hf-xet-tests.yml @@ -45,4 +45,9 @@ jobs: - name: Check Cargo.lock has no uncommitted changes run: | # the Build wheel step would update hf_xet/Cargo.lock if it is out of date - test -z "$(git status --porcelain hf_xet/Cargo.lock)" || (echo "hf_xet/Cargo.lock has uncommitted changes!" && exit 1) \ No newline at end of file + if [ -n "$(git status --porcelain hf_xet/Cargo.lock)" ]; then + echo "::error::hf_xet/Cargo.lock has uncommitted changes!" + echo "Diff:" + git diff hf_xet/Cargo.lock + exit 1 + fi diff --git a/Cargo.lock b/Cargo.lock index 3b6fc9aa..041ff488 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -939,6 +939,7 @@ dependencies = [ "dirs", "error_printer", "hub_client", + "humansize", "jsonwebtoken", "lazy_static", "mdb_shard", @@ -1730,6 +1731,15 @@ dependencies = [ "urlencoding", ] +[[package]] +name = "humansize" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6cb51c9a029ddc91b07a787f1d86b53ccfa49b0e86688c946ebe8d3555685dd7" +dependencies = [ + "libm", +] + [[package]] name = "humantime" version = "2.3.0" @@ -2182,6 +2192,12 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "libm" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" + [[package]] name = "libredox" version = "0.1.3" diff --git a/Cargo.toml b/Cargo.toml index a6fef7f3..1ed9dfcb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,6 +57,7 @@ getrandom = "0.3" git-url-parse = "0.4" git2 = "0.20" half = "2.4" +humansize = "2.1" heapify = "0.2" heed = "0.11" http = "1" diff --git a/data/Cargo.toml b/data/Cargo.toml index f7d7b91d..50015d18 100644 --- a/data/Cargo.toml +++ b/data/Cargo.toml @@ -42,6 +42,7 @@ xet_runtime = { path = "../xet_runtime" } anyhow = { workspace = true } async-trait = { workspace = true } bytes = { workspace = true } +humansize = { workspace = true } chrono = { workspace = true } clap = { workspace = true } jsonwebtoken = { workspace = true } diff --git a/data/src/bin/xtool.rs b/data/src/bin/xtool.rs index 801c9020..bf8b7f6f 100644 --- a/data/src/bin/xtool.rs +++ b/data/src/bin/xtool.rs @@ -12,6 +12,7 @@ use data::data_client::default_config; use data::migration_tool::hub_client_token_refresher::HubClientTokenRefresher; use data::migration_tool::migrate::migrate_files_impl; use hub_client::{BearerCredentialHelper, HubClient, Operation, RepoInfo}; +use humansize::{BINARY, DECIMAL, format_size}; use merklehash::MerkleHash; use utils::auth::TokenRefresher; use walkdir::WalkDir; @@ -76,6 +77,8 @@ enum Command { Dedup(DedupArg), /// Queries reconstruction information about a file. Query(QueryArg), + /// Calculates the compressed size of a xet-file by summing url_range sizes. + CompressedSize(CompressedSizeArg), } #[derive(Args)] @@ -116,6 +119,12 @@ struct QueryArg { bytes_range: Option, } +#[derive(Args)] +struct CompressedSizeArg { + /// Xet-hash of a file. + hash: String, +} + impl Command { async fn run(self, hub_client: HubClient) -> Result<()> { match self { @@ -161,6 +170,44 @@ impl Command { Ok(()) }, + Command::CompressedSize(arg) => { + let file_hash = MerkleHash::from_hex(&arg.hash)?; + // Query reconstruction for full file (no Range header) + let ret = query_reconstruction(file_hash, None, hub_client).await?; + + match ret { + Some(response) => { + let mut total_compressed_size = 0u64; + + for fetch_infos in response.fetch_info.values() { + for fetch_info in fetch_infos { + let range_size = fetch_info.url_range.end - fetch_info.url_range.start; + total_compressed_size += range_size; + } + } + + let total_uncompressed_size: u64 = + response.terms.iter().map(|term| term.unpacked_length as u64).sum(); + + // Count unique XORBs + let unique_xorbs: std::collections::HashSet<_> = + response.terms.iter().map(|term| &term.hash).collect(); + + println!("Compressed Size: {}", format_bytes_with_units(total_compressed_size)); + println!("Uncompressed Size: {}", format_bytes_with_units(total_uncompressed_size)); + println!( + "Compression Ratio: {:.2}%", + (total_compressed_size as f64 / total_uncompressed_size as f64) * 100.0 + ); + println!("XORBs: {} unique", unique_xorbs.len()); + Ok(()) + }, + None => { + eprintln!("No reconstruction information found for hash {}", arg.hash); + Ok(()) + }, + } + }, } } } @@ -193,6 +240,13 @@ fn is_git_special_files(path: &str) -> bool { matches!(path, ".git" | ".gitignore" | ".gitattributes") } +/// Format bytes with binary and decimal units on one line +fn format_bytes_with_units(bytes: u64) -> String { + let binary = format_size(bytes, BINARY); + let decimal = format_size(bytes, DECIMAL); + format!("{} bytes {} {}", bytes, binary, decimal) +} + async fn query_reconstruction( file_hash: MerkleHash, bytes_range: Option, diff --git a/hf_xet/Cargo.lock b/hf_xet/Cargo.lock index b89ced9a..a6125fd0 100644 --- a/hf_xet/Cargo.lock +++ b/hf_xet/Cargo.lock @@ -726,6 +726,7 @@ dependencies = [ "deduplication", "error_printer", "hub_client", + "humansize", "jsonwebtoken", "lazy_static", "mdb_shard", @@ -1375,6 +1376,15 @@ dependencies = [ "urlencoding", ] +[[package]] +name = "humansize" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6cb51c9a029ddc91b07a787f1d86b53ccfa49b0e86688c946ebe8d3555685dd7" +dependencies = [ + "libm", +] + [[package]] name = "humantime" version = "2.2.0" @@ -1769,6 +1779,12 @@ version = "0.2.175" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" +[[package]] +name = "libm" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" + [[package]] name = "libredox" version = "0.1.9"