Skip to content

Commit 544b551

Browse files
committed
feat(xtool): add compressed-size
Example:
```
xtool --repo-type dataset --repo-id $REPO compressed-size $HASH
Compressed Size: 6181022260 bytes 5.757 GiB 6.181 GB
Uncompressed Size: 9710109696 bytes 9.043 GiB 9.710 GB
Compression Ratio: 63.66%
```
Signed-off-by: Adrien Delorme <[email protected]>
1 parent 499d9a1 commit 544b551

File tree

4 files changed

+71
-0
lines changed

4 files changed

+71
-0
lines changed

Cargo.lock

Lines changed: 16 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ getrandom = "0.3"
5757
git-url-parse = "0.4"
5858
git2 = "0.20"
5959
half = "2.4"
60+
humansize = "2.1"
6061
heapify = "0.2"
6162
heed = "0.11"
6263
http = "1"

data/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ xet_runtime = { path = "../xet_runtime" }
4242
anyhow = { workspace = true }
4343
async-trait = { workspace = true }
4444
bytes = { workspace = true }
45+
humansize = { workspace = true }
4546
chrono = { workspace = true }
4647
clap = { workspace = true }
4748
jsonwebtoken = { workspace = true }

data/src/bin/xtool.rs

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ use data::data_client::default_config;
1212
use data::migration_tool::hub_client_token_refresher::HubClientTokenRefresher;
1313
use data::migration_tool::migrate::migrate_files_impl;
1414
use hub_client::{BearerCredentialHelper, HubClient, Operation, RepoInfo};
15+
use humansize::{BINARY, DECIMAL, format_size};
1516
use merklehash::MerkleHash;
1617
use utils::auth::TokenRefresher;
1718
use walkdir::WalkDir;
@@ -76,6 +77,8 @@ enum Command {
7677
Dedup(DedupArg),
7778
/// Queries reconstruction information about a file.
7879
Query(QueryArg),
80+
/// Calculates the compressed size of a xet-file by summing url_range sizes.
81+
CompressedSize(CompressedSizeArg),
7982
}
8083

8184
#[derive(Args)]
@@ -116,6 +119,12 @@ struct QueryArg {
116119
bytes_range: Option<FileRange>,
117120
}
118121

122+
// Arguments for the `CompressedSize` subcommand.
// `hash` is accepted as a plain string; the subcommand handler converts it
// with `MerkleHash::from_hex`, so a hex-encoded xet file hash is expected.
// NOTE(review): the `///` line on the field is presumably picked up by clap
// as the argument's help text, so it is left untouched here — confirm before
// rewording it.
#[derive(Args)]
123+
struct CompressedSizeArg {
124+
/// Xet-hash of a file.
125+
hash: String,
126+
}
127+
119128
impl Command {
120129
/// Dispatches on the subcommand variant held in `self` and executes it,
/// consuming both `self` and `hub_client`.
async fn run(self, hub_client: HubClient) -> Result<()> {
121130
match self {
@@ -161,6 +170,43 @@ impl Command {
161170

162171
Ok(())
163172
},
173+
// Reports the compressed size, the uncompressed size and their ratio for
// the file identified by `arg.hash`.
Command::CompressedSize(arg) => {
174+
// `?` propagates any error from parsing the hash string.
let file_hash = MerkleHash::from_hex(&arg.hash)?;
175+
// Query reconstruction for the full file: passing `None` as the byte range
// means no Range restriction is requested.
176+
let ret = query_reconstruction(file_hash, None, hub_client).await?;
177+
178+
match ret {
179+
Some(response) => {
180+
let mut total_compressed_size = 0u64;
181+
182+
// `fetch_info` is iterated as a map whose values are collections of
// fetch-info entries; every entry of every value contributes to the total.
183+
for fetch_infos in response.fetch_info.values() {
184+
for fetch_info in fetch_infos {
// Size of one URL byte range, computed as `end - start`.
// NOTE(review): this assumes `end >= start` and an exclusive `end`.
// If `url_range.end` is inclusive (as in an HTTP Range header), every
// range is undercounted by one byte; if `end < start`, the unsigned
// subtraction overflows (panic in debug builds, wraparound in release).
// Confirm against the definition of the `url_range` type.
185+
let range_size = fetch_info.url_range.end - fetch_info.url_range.start;
186+
total_compressed_size += range_size;
187+
}
188+
}
189+
190+
// Uncompressed size is the sum of `unpacked_length` over all terms.
// NOTE(review): `as u64` is a silent cast — presumably a widening from a
// smaller unsigned type; verify `unpacked_length` is not signed.
191+
let total_uncompressed_size: u64 =
192+
response.terms.iter().map(|term| term.unpacked_length as u64).sum();
193+
194+
// Each size is printed on its own line, in raw bytes plus binary and
// decimal units (see `format_bytes_with_units`).
195+
println!("Compressed Size: {}", format_bytes_with_units(total_compressed_size));
196+
println!("Uncompressed Size: {}", format_bytes_with_units(total_uncompressed_size));
197+
// The ratio is compressed / uncompressed as a percentage, so smaller
// values mean better compression.
// NOTE(review): when `total_uncompressed_size` is 0 (e.g. `terms` is
// empty) this f64 division yields NaN (0/0) or inf, which is printed
// as "NaN%" / "inf%". Consider guarding the zero case.
198+
println!(
199+
"Compression Ratio: {:.2}%",
200+
(total_compressed_size as f64 / total_uncompressed_size as f64) * 100.0
201+
);
202+
Ok(())
203+
},
204+
// No reconstruction info: the message goes to stderr and the command
// still returns `Ok(())`.
// NOTE(review): confirm that reporting success for an unknown hash is
// intended.
None => {
205+
eprintln!("No reconstruction information found for hash {}", arg.hash);
206+
Ok(())
207+
},
208+
}
209+
},
164210
}
165211
}
166212
}
@@ -193,6 +239,13 @@ fn is_git_special_files(path: &str) -> bool {
193239
matches!(path, ".git" | ".gitignore" | ".gitattributes")
194240
}
195241

242+
/// Formats a byte count three ways on a single line.
///
/// The result is the raw number followed by the word `bytes`, then the
/// `humansize::format_size` rendering with the `BINARY` options, then the
/// rendering with the `DECIMAL` options, each separated by a single space
/// (of the form `<n> bytes <binary> <decimal>`). Unit choice and decimal
/// precision are decided entirely by `humansize`.
243+
fn format_bytes_with_units(bytes: u64) -> String {
244+
let binary = format_size(bytes, BINARY);
245+
let decimal = format_size(bytes, DECIMAL);
246+
format!("{} bytes {} {}", bytes, binary, decimal)
247+
}
248+
196249
async fn query_reconstruction(
197250
file_hash: MerkleHash,
198251
bytes_range: Option<FileRange>,

0 commit comments

Comments
 (0)