Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions chunk_cache/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,4 @@ required-features = ["analysis"]

[features]
analysis = ["dep:clap"]
no-default-cache = []
15 changes: 10 additions & 5 deletions chunk_cache/src/disk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,10 @@ pub mod test_utils;

// consistently use URL_SAFE (also file path safe) base64 codec
pub(crate) const BASE64_ENGINE: GeneralPurpose = URL_SAFE;
// Default chunk cache capacity, in bytes, when the `no-default-cache`
// feature is NOT enabled.
#[cfg(not(feature = "no-default-cache"))]
pub const DEFAULT_CHUNK_CACHE_CAPACITY: u64 = 10_000_000_000; // 10 GB
// Upper bound, in bytes, on the size of any one file found in the cache directory.
const MAX_CACHE_FILE_SIZE: u64 = 10_000_000_000; // 10 GB - max size for a single cache file
// Default chunk cache capacity when the `no-default-cache` feature IS enabled:
// zero bytes.
// NOTE(review): any per-file size check written as `len > DEFAULT_CHUNK_CACHE_CAPACITY`
// becomes `len > 0` under this feature and would reject every non-empty file;
// confirm such checks compare against a non-zero limit instead.
#[cfg(feature = "no-default-cache")]
pub const DEFAULT_CHUNK_CACHE_CAPACITY: u64 = 0;
// presumably the number of characters used for prefix directory names — TODO confirm
const PREFIX_DIR_NAME_LEN: usize = 2;

type OptionResult<T, E> = Result<Option<T>, E>;
Expand Down Expand Up @@ -688,10 +690,10 @@ fn try_parse_cache_file(file_result: io::Result<DirEntry>, capacity: u64) -> Opt
if !md.is_file() {
return Ok(None);
}
if md.len() > MAX_CACHE_FILE_SIZE {
if md.len() > DEFAULT_CHUNK_CACHE_CAPACITY {
return Err(ChunkCacheError::general(format!(
"Cache directory contains a file larger than {} GB, cache directory state is invalid",
(MAX_CACHE_FILE_SIZE as f64 / (1 << 30) as f64)
(DEFAULT_CHUNK_CACHE_CAPACITY as f64 / (1 << 30) as f64)
)));
}

Expand Down Expand Up @@ -821,9 +823,10 @@ mod tests {
use tempdir::TempDir;
use utils::output_bytes;

use super::{DEFAULT_CHUNK_CACHE_CAPACITY, DiskCache};
use crate::disk::test_utils::*;
use crate::disk::try_parse_key;
use crate::{CacheConfig, ChunkCache, DEFAULT_CHUNK_CACHE_CAPACITY, DiskCache};
use crate::{CacheConfig, ChunkCache};

const RANDOM_SEED: u64 = 9089 << 20 | 120043;

Expand Down Expand Up @@ -1259,7 +1262,9 @@ mod tests {
mod concurrency_tests {
use tempdir::TempDir;

use crate::{CacheConfig, ChunkCache, DEFAULT_CHUNK_CACHE_CAPACITY, DiskCache, RANGE_LEN, RandomEntryIterator};
use super::DiskCache;
use crate::disk::DEFAULT_CHUNK_CACHE_CAPACITY;
use crate::{CacheConfig, ChunkCache, RANGE_LEN, RandomEntryIterator};

const NUM_ITEMS_PER_TASK: usize = 20;
const RANDOM_SEED: u64 = 878987298749287;
Expand Down
13 changes: 0 additions & 13 deletions data/src/configurations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -123,19 +123,6 @@ impl TranslatorConfig {
}
}

pub fn with_cache_size(self, cache_size: u64) -> Self {
Self {
data_config: DataConfig {
cache_config: CacheConfig {
cache_size,
..self.data_config.cache_config
},
..self.data_config
},
..self
}
}

pub fn with_session_id(self, session_id: &str) -> Self {
if session_id.is_empty() {
return self;
Expand Down
19 changes: 4 additions & 15 deletions data/src/data_client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,19 +106,16 @@ pub async fn upload_bytes_async(
token_info: Option<(String, u64)>,
token_refresher: Option<Arc<dyn TokenRefresher>>,
progress_updater: Option<Arc<dyn TrackingProgressUpdater>>,
cache_size: Option<u64>,
user_agent: String,
) -> errors::Result<Vec<XetFileInfo>> {
let mut config = default_config(
let config = default_config(
endpoint.unwrap_or(DEFAULT_CAS_ENDPOINT.clone()),
None,
token_info,
token_refresher,
user_agent,
)?;
if let Some(size) = cache_size {
config = config.with_cache_size(size);
}

Span::current().record("session_id", &config.session_id);

let semaphore = XetRuntime::current().global_semaphore(*CONCURRENT_FILE_INGESTION_LIMITER);
Expand Down Expand Up @@ -152,23 +149,19 @@ pub async fn upload_async(
token_info: Option<(String, u64)>,
token_refresher: Option<Arc<dyn TokenRefresher>>,
progress_updater: Option<Arc<dyn TrackingProgressUpdater>>,
cache_size: Option<u64>,
user_agent: String,
) -> errors::Result<Vec<XetFileInfo>> {
// chunk files
// produce Xorbs + Shards
// upload shards and xorbs
// for each file, return the filehash
let mut config = default_config(
let config = default_config(
endpoint.unwrap_or(DEFAULT_CAS_ENDPOINT.clone()),
None,
token_info,
token_refresher,
user_agent,
)?;
if let Some(size) = cache_size {
config = config.with_cache_size(size);
}

let span = Span::current();

Expand Down Expand Up @@ -199,7 +192,6 @@ pub async fn download_async(
token_info: Option<(String, u64)>,
token_refresher: Option<Arc<dyn TokenRefresher>>,
progress_updaters: Option<Vec<Arc<dyn TrackingProgressUpdater>>>,
cache_size: Option<u64>,
user_agent: String,
) -> errors::Result<Vec<String>> {
lazy_static! {
Expand All @@ -212,16 +204,13 @@ pub async fn download_async(
{
return Err(DataProcessingError::ParameterError("updaters are not same length as pointer_files".to_string()));
}
let mut config = default_config(
let config = default_config(
endpoint.unwrap_or(DEFAULT_CAS_ENDPOINT.to_string()),
None,
token_info,
token_refresher,
user_agent,
)?;
if let Some(size) = cache_size {
config = config.with_cache_size(size);
}
Span::current().record("session_id", &config.session_id);

let processor = Arc::new(FileDownloader::new(config.into()).await?);
Expand Down
1 change: 1 addition & 0 deletions hf_xet/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions hf_xet/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ crate-type = ["cdylib"]

[dependencies]
cas_client = { path = "../cas_client" }
chunk_cache = { path = "../chunk_cache" }
data = { path = "../data" }
error_printer = { path = "../error_printer" }
progress_tracking = { path = "../progress_tracking" }
Expand Down Expand Up @@ -47,8 +48,10 @@ signal-hook = "0.3"
ctrlc = "3.4"

[features]
default = ["no-default-cache"] # By default, hf_xet disables the disk cache.
native-tls = ["cas_client/native-tls-vendored"]
native-tls-vendored = ["cas_client/native-tls-vendored"]
no-default-cache = ["chunk_cache/no-default-cache"]
profiling = ["pprof"]
tokio-console = ["xet_logging/tokio-console"]

Expand Down
3 changes: 0 additions & 3 deletions hf_xet/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ pub fn upload_bytes(
token_info,
refresher.map(|v| v as Arc<_>),
updater.map(|v| v as Arc<_>),
Some(0), // Disable DiskCache for hf_xet
USER_AGENT.to_string(),
)
.await
Expand Down Expand Up @@ -112,7 +111,6 @@ pub fn upload_files(
token_info,
refresher.map(|v| v as Arc<_>),
updater.map(|v| v as Arc<_>),
Some(0), // Disable DiskCache for hf_xet
USER_AGENT.to_string(),
)
.await
Expand Down Expand Up @@ -157,7 +155,6 @@ pub fn download_files(
token_info,
refresher.map(|v| v as Arc<_>),
updaters,
Some(0), // Disable DiskCache for hf_xet
USER_AGENT.to_string(),
)
.await
Expand Down
Loading