diff --git a/lib/dictBuilder/cover.rs b/lib/dictBuilder/cover.rs index 44d4e36e..2f4e21b6 100644 --- a/lib/dictBuilder/cover.rs +++ b/lib/dictBuilder/cover.rs @@ -555,9 +555,7 @@ fn COVER_ctx_init<'a>( totalSamplesSize }; let testSamplesSize = if splitPoint < 1.0f64 { - samplesSizes[nbTrainSamples..][..nbTestSamples] - .iter() - .sum() + samplesSizes[nbTrainSamples..][..nbTestSamples].iter().sum() } else { totalSamplesSize }; @@ -803,6 +801,39 @@ pub(super) const unsafe fn assume_init_ref(slice: &[MaybeUninit]) -> &[T] unsafe { &*(slice as *const [MaybeUninit] as *const [T]) } } +/// Train a dictionary from an array of samples using the COVER algorithm. +/// +/// Samples must be stored concatenated in a single flat buffer `samplesBuffer`, supplied with an +/// array of sizes `samplesSizes`, providing the size of each sample, in order. +/// +/// The resulting dictionary will be saved into `dictBuffer`. +/// +/// In general, a reasonable dictionary has a size of ~100 KB. It's possible to select smaller or +/// larger size, just by specifying `dictBufferCapacity`. In general, it's recommended to provide a +/// few thousands samples, though this can vary a lot. It's recommended that total size of all +/// samples be about ~x100 times the target size of dictionary. +/// +/// # Returns +/// +/// - the size of the dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) +/// - an error code, which can be tested with [`ZDICT_isError`] +/// +/// Dictionary training will fail if there are not enough samples to construct a dictionary, or if +/// most of the samples are too small (< 8 bytes being the lower limit). If dictionary training +/// fails, you should use zstd without a dictionary, as the dictionary would've been ineffective +/// anyways. If you believe your samples would benefit from a dictionary please open an issue with +/// details, and we can look into it. 
+/// +/// # Safety +/// +/// Behavior is undefined if any of the following conditions are violated: +/// +/// - `dictBufferCapacity` is 0 or `dictBuffer` and `dictBufferCapacity` satisfy the requirements +/// of [`core::slice::from_raw_parts_mut`]. +/// - `nbSamples` is 0 or `samplesSizes` and `nbSamples` satisfy the requirements +/// of [`core::slice::from_raw_parts`]. +/// - `sum(samplesSizes)` is 0 or `samplesBuffer` and `sum(samplesSizes)` satisfy the requirements +/// of [`core::slice::from_raw_parts`]. #[cfg_attr(feature = "export-symbols", export_name = crate::prefix!(ZDICT_trainFromBuffer_cover))] pub unsafe extern "C" fn ZDICT_trainFromBuffer_cover( dictBuffer: *mut core::ffi::c_void, @@ -810,9 +841,33 @@ pub unsafe extern "C" fn ZDICT_trainFromBuffer_cover( samplesBuffer: *const core::ffi::c_void, samplesSizes: *const size_t, nbSamples: core::ffi::c_uint, + parameters: ZDICT_cover_params_t, +) -> size_t { + let dict = unsafe { core::slice::from_raw_parts_mut(dictBuffer.cast(), dictBufferCapacity) }; + + let samplesSizes = if samplesSizes.is_null() || nbSamples == 0 { + &[] + } else { + core::slice::from_raw_parts(samplesSizes, nbSamples as usize) + }; + let totalSamplesSize = samplesSizes.iter().sum::(); + let samples = if samplesBuffer.is_null() || totalSamplesSize == 0 { + &[] + } else { + core::slice::from_raw_parts(samplesBuffer.cast::(), totalSamplesSize) + }; + + train_from_buffer_cover(dict, samples, samplesSizes, parameters) +} + +fn train_from_buffer_cover( + dict: &mut [MaybeUninit], + samples: &[u8], + samplesSizes: &[usize], mut parameters: ZDICT_cover_params_t, ) -> size_t { - let dict = dictBuffer as *mut u8; + let dictBufferCapacity = dict.len(); + let mut ctx = COVER_ctx_t::default(); let displayLevel = parameters.zParams.notificationLevel as core::ffi::c_int; parameters.splitPoint = 1.0f64; @@ -822,7 +877,7 @@ pub unsafe extern "C" fn ZDICT_trainFromBuffer_cover( } return Error::parameter_outOfBound.to_error_code(); } - if nbSamples == 0 
{ + if samplesSizes.is_empty() { if displayLevel >= 1 { eprintln!("Cover must have at least one input file"); } @@ -835,18 +890,6 @@ pub unsafe extern "C" fn ZDICT_trainFromBuffer_cover( return Error::dstSize_tooSmall.to_error_code(); } - let samplesSizes = if samplesSizes.is_null() || nbSamples == 0 { - &[] - } else { - core::slice::from_raw_parts(samplesSizes, nbSamples as usize) - }; - let totalSamplesSize = samplesSizes.iter().sum::(); - let samples = if samplesBuffer.is_null() || totalSamplesSize == 0 { - &[] - } else { - core::slice::from_raw_parts(samplesBuffer.cast::(), totalSamplesSize) - }; - let initVal = COVER_ctx_init( &mut ctx, samples, @@ -866,25 +909,24 @@ pub unsafe extern "C" fn ZDICT_trainFromBuffer_cover( } let mut freqs = core::mem::take(&mut ctx.freqs); - let dict_tail = COVER_buildDictionary( - &ctx, - &mut freqs, - &mut activeDmers, - unsafe { core::slice::from_raw_parts_mut(dictBuffer.cast(), dictBufferCapacity) }, - parameters, - ); + let dict_tail = COVER_buildDictionary(&ctx, &mut freqs, &mut activeDmers, dict, parameters); ctx.freqs = freqs; - let dictionarySize = ZDICT_finalizeDictionary( - dict as *mut core::ffi::c_void, - dictBufferCapacity, - dict_tail.as_ptr() as *const core::ffi::c_void, - dict_tail.len(), - samplesBuffer, - samplesSizes.as_ptr(), - nbSamples, - parameters.zParams, - ); + let customDictContentSize = dict_tail.len(); + let dictBuffer = dict.as_mut_ptr() as *mut core::ffi::c_void; + let customDictContent = dictBuffer.wrapping_add(dictBufferCapacity - customDictContentSize); + let dictionarySize = unsafe { + ZDICT_finalizeDictionary( + dictBuffer, + dictBufferCapacity, + customDictContent, + customDictContentSize, + samples.as_ptr() as *const core::ffi::c_void, + samplesSizes.as_ptr(), + samplesSizes.len() as core::ffi::c_uint, + parameters.zParams, + ) + }; if !ERR_isError(dictionarySize) && displayLevel >= 2 { eprintln!("Constructed dictionary of size {}", dictionarySize,); } @@ -1181,11 +1223,41 @@ fn 
COVER_tryParameters(data: Box) { drop(freqs); } +/// This function tries many parameter combinations (specifically, `k` and `d` combinations) and +/// picks the best parameters. +/// +/// `*parameters` is filled with the best parameters found, and the dictionary constructed with +/// those parameters is stored in `dictBuffer`. +/// +/// The parameters `d`, `k`, and `steps` are optional: +/// - If `d` is zero, we check `d` in 6..8. +/// - If `k` is zero, we check `k` in 50..2000. +/// - If `steps` is zero it defaults to its default value (40). +/// +/// # Returns +/// +/// - the size of the dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) +/// - an error code, which can be tested with [`ZDICT_isError`] +/// +/// Dictionary training will fail if there are not enough samples to construct a dictionary, or if +/// most of the samples are too small (< 8 bytes being the lower limit). If dictionary training +/// fails, you should use zstd without a dictionary, as the dictionary would've been ineffective +/// anyways. If you believe your samples would benefit from a dictionary please open an issue with +/// details, and we can look into it. +/// +/// On success `*parameters` contains the parameters selected. +/// /// # Safety /// /// Behavior is undefined if any of the following conditions are violated: /// -/// - `parameters` satisfies the conditions of [`pointer::as_mut`] +/// - `dictBufferCapacity` is 0 or `dictBuffer` and `dictBufferCapacity` satisfy the requirements +/// of [`core::slice::from_raw_parts_mut`]. +/// - `nbSamples` is 0 or `samplesSizes` and `nbSamples` satisfy the requirements +/// of [`core::slice::from_raw_parts`]. +/// - `sum(samplesSizes)` is 0 or `samplesBuffer` and `sum(samplesSizes)` satisfy the requirements +/// of [`core::slice::from_raw_parts`]. 
+/// - `parameters` satisfies the requirements of `pointer::as_mut` #[cfg_attr(feature = "export-symbols", export_name = crate::prefix!(ZDICT_optimizeTrainFromBuffer_cover))] pub unsafe extern "C" fn ZDICT_optimizeTrainFromBuffer_cover( dictBuffer: *mut core::ffi::c_void, diff --git a/lib/dictBuilder/fastcover.rs b/lib/dictBuilder/fastcover.rs index 882b667d..b5322964 100644 --- a/lib/dictBuilder/fastcover.rs +++ b/lib/dictBuilder/fastcover.rs @@ -468,6 +468,32 @@ fn FASTCOVER_convertToFastCoverParams( fastCoverParams.shrinkDict = coverParams.shrinkDict; } +/// Train a dictionary from an array of samples using a modified version of COVER algorithm. +/// +/// Samples must be stored concatenated in a single flat buffer `samplesBuffer`, supplied with an +/// array of sizes `samplesSizes`, providing the size of each sample, in order. +/// +/// Only parameters `d` and `k` are required. All other parameters will use default values if not +/// provided. +/// +/// The resulting dictionary will be saved into `dictBuffer`. +/// +/// In general, a reasonable dictionary has a size of ~100 KB. It's possible to select smaller or +/// larger size, just by specifying `dictBufferCapacity`. In general, it's recommended to provide a +/// few thousands samples, though this can vary a lot. It's recommended that total size of all +/// samples be about ~x100 times the target size of dictionary. +/// +/// # Returns +/// +/// - the size of the dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) +/// - an error code, which can be tested with [`crate::ZDICT_isError`] +/// +/// Dictionary training will fail if there are not enough samples to construct a dictionary, or if +/// most of the samples are too small (< 8 bytes being the lower limit). If dictionary training +/// fails, you should use zstd without a dictionary, as the dictionary would've been ineffective +/// anyways. 
If you believe your samples would benefit from a dictionary please open an issue with +/// details, and we can look into it. +/// /// # Safety /// /// Behavior is undefined if any of the following conditions are violated: @@ -604,6 +630,31 @@ fn train_from_buffer_fastcover( dictionarySize } +/// This function tries many parameter combinations (specifically, `k` and `d` combinations) and +/// picks the best parameters. +/// +/// `*parameters` is filled with the best parameters found, and the dictionary constructed with +/// those parameters is stored in `dictBuffer`. +/// +/// The parameters `d`, `k`, `steps`, and `accel` are optional: +/// - If `d` is zero, we check `d` in 6..8. +/// - If `k` is zero, we check `k` in 50..2000. +/// - If `steps` is zero it defaults to its default value (40). +/// - If `accel` is zero, the default value of 1 is used. +/// +/// # Returns +/// +/// - the size of the dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) +/// - an error code, which can be tested with [`crate::ZDICT_isError`] +/// +/// Dictionary training will fail if there are not enough samples to construct a dictionary, or if +/// most of the samples are too small (< 8 bytes being the lower limit). If dictionary training +/// fails, you should use zstd without a dictionary, as the dictionary would've been ineffective +/// anyways. If you believe your samples would benefit from a dictionary please open an issue with +/// details, and we can look into it. +/// +/// On success `*parameters` contains the parameters selected. +/// /// # Safety /// /// Behavior is undefined if any of the following conditions are violated: @@ -614,7 +665,7 @@ fn train_from_buffer_fastcover( /// of [`core::slice::from_raw_parts`]. /// - `sum(samplesSizes)` is 0 or `samplesBuffer` and `sum(samplesSizes)` satisfy the requirements /// of [`core::slice::from_raw_parts`]. 
-/// - `parameters` satisfies the requirements of [`pointer::as_mut`] +/// - `parameters` satisfies the requirements of `pointer::as_mut` #[cfg_attr(feature = "export-symbols", export_name = crate::prefix!(ZDICT_optimizeTrainFromBuffer_fastCover))] pub unsafe extern "C" fn ZDICT_optimizeTrainFromBuffer_fastCover( dictBuffer: *mut core::ffi::c_void, diff --git a/lib/dictBuilder/zdict.rs b/lib/dictBuilder/zdict.rs index e38db8be..abc3e083 100644 --- a/lib/dictBuilder/zdict.rs +++ b/lib/dictBuilder/zdict.rs @@ -1587,6 +1587,41 @@ pub unsafe extern "C" fn ZDICT_trainFromBuffer_legacy( ) } +/// Train a dictionary from an array of samples. +/// +/// Calls single-threaded [`ZDICT_optimizeTrainFromBuffer_fastCover`], with `d=8`, `steps=4`, +/// `f=20`, and `accel=1`. +/// +/// Samples must be stored concatenated in a single flat buffer `samplesBuffer`, supplied with an +/// array of sizes `samplesSizes`, providing the size of each sample, in order. The resulting +/// dictionary will be saved into `dictBuffer`. +/// +/// In general, a reasonable dictionary has a size of ~100 KB. It's possible to select smaller or +/// larger size, just by specifying `dictBufferCapacity`. In general, it's recommended to provide a +/// few thousands samples, though this can vary a lot. It's recommended that total size of all +/// samples be about ~x100 times the target size of dictionary. +/// +/// # Returns +/// +/// - the size of the dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) +/// - an error code, which can be tested with [`ZDICT_isError`] +/// +/// Dictionary training will fail if there are not enough samples to construct a dictionary, or if +/// most of the samples are too small (< 8 bytes being the lower limit). If dictionary training +/// fails, you should use zstd without a dictionary, as the dictionary would've been ineffective +/// anyways. If you believe your samples would benefit from a dictionary please open an issue with +/// details, and we can look into it. 
+/// +/// # Safety +/// +/// Behavior is undefined if any of the following conditions are violated: +/// +/// - `dictBufferCapacity` is 0 or `dictBuffer` and `dictBufferCapacity` satisfy the requirements +/// of [`core::slice::from_raw_parts_mut`]. +/// - `nbSamples` is 0 or `samplesSizes` and `nbSamples` satisfy the requirements +/// of [`core::slice::from_raw_parts`]. +/// - `sum(samplesSizes)` is 0 or `samplesBuffer` and `sum(samplesSizes)` satisfy the requirements +/// of [`core::slice::from_raw_parts`]. #[cfg_attr(feature = "export-symbols", export_name = crate::prefix!(ZDICT_trainFromBuffer))] pub unsafe extern "C" fn ZDICT_trainFromBuffer( dictBuffer: *mut core::ffi::c_void,