trifectatechfoundation · folkertdev · Nov 11, 2025 · Nov 10, 2025 · Nov 10, 2025 · Nov 10, 2025
diff --git a/lib/dictBuilder/cover.rs b/lib/dictBuilder/cover.rs
@@ -555,9 +555,7 @@ fn COVER_ctx_init<'a>(
         totalSamplesSize
     };
     let testSamplesSize = if splitPoint < 1.0f64 {
-        samplesSizes[nbTrainSamples..][..nbTestSamples]
-            .iter()
-            .sum()
+        samplesSizes[nbTrainSamples..][..nbTestSamples].iter().sum()
     } else {
         totalSamplesSize
     };
@@ -803,16 +801,73 @@ pub(super) const unsafe fn assume_init_ref<T>(slice: &[MaybeUninit<T>]) -> &[T]
     unsafe { &*(slice as *const [MaybeUninit<T>] as *const [T]) }
 }
 
+/// Train a dictionary from an array of samples using the COVER algorithm.
+///
+/// Samples must be stored concatenated in a single flat buffer `samplesBuffer`, supplied with an
+/// array of sizes `samplesSizes`, providing the size of each sample, in order.
+///
+/// The resulting dictionary will be saved into `dictBuffer`.
+///
+/// In general, a reasonable dictionary has a size of ~100 KB. It's possible to select a smaller or
+/// larger size just by specifying `dictBufferCapacity`. In general, it's recommended to provide a
+/// few thousand samples, though this can vary a lot. It's recommended that the total size of all
+/// samples be about 100 times the target size of the dictionary.
+///
+/// # Returns
+///
+/// - the size of the dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+/// - an error code, which can be tested with [`ZDICT_isError`]
+///
+/// Dictionary training will fail if there are not enough samples to construct a dictionary, or if
+/// most of the samples are too small (< 8 bytes being the lower limit). If dictionary training
+/// fails, you should use zstd without a dictionary, as the dictionary would've been ineffective
+/// anyways. If you believe your samples would benefit from a dictionary please open an issue with
+/// details, and we can look into it.
+///
+/// # Safety
+///
+/// Behavior is undefined if any of the following conditions are violated:
+///
+/// - `dictBufferCapacity` is 0 or `dictBuffer` and `dictBufferCapacity` satisfy the requirements
+///   of [`core::slice::from_raw_parts_mut`].
+/// - `nbSamples` is 0 or `samplesSizes` and `nbSamples` satisfy the requirements
+///   of [`core::slice::from_raw_parts`].
+/// - `sum(samplesSizes)` is 0 or `samplesBuffer` and `sum(samplesSizes)` satisfy the requirements
+///   of [`core::slice::from_raw_parts`].
 #[cfg_attr(feature = "export-symbols", export_name = crate::prefix!(ZDICT_trainFromBuffer_cover))]
 pub unsafe extern "C" fn ZDICT_trainFromBuffer_cover(
     dictBuffer: *mut core::ffi::c_void,
     dictBufferCapacity: size_t,
     samplesBuffer: *const core::ffi::c_void,
     samplesSizes: *const size_t,
     nbSamples: core::ffi::c_uint,
+    parameters: ZDICT_cover_params_t,
+) -> size_t {
+    let dict = unsafe { core::slice::from_raw_parts_mut(dictBuffer.cast(), dictBufferCapacity) };
+
+    let samplesSizes = if samplesSizes.is_null() || nbSamples == 0 {
+        &[]
+    } else {
+        core::slice::from_raw_parts(samplesSizes, nbSamples as usize)
+    };
+    let totalSamplesSize = samplesSizes.iter().sum::<usize>();
+    let samples = if samplesBuffer.is_null() || totalSamplesSize == 0 {
+        &[]
+    } else {
+        core::slice::from_raw_parts(samplesBuffer.cast::<u8>(), totalSamplesSize)
+    };
+
+    train_from_buffer_cover(dict, samples, samplesSizes, parameters)
+}
+
+fn train_from_buffer_cover(
+    dict: &mut [MaybeUninit<u8>],
+    samples: &[u8],
+    samplesSizes: &[usize],
     mut parameters: ZDICT_cover_params_t,
 ) -> size_t {
-    let dict = dictBuffer as *mut u8;
+    let dictBufferCapacity = dict.len();
+
     let mut ctx = COVER_ctx_t::default();
     let displayLevel = parameters.zParams.notificationLevel as core::ffi::c_int;
     parameters.splitPoint = 1.0f64;
@@ -822,7 +877,7 @@ pub unsafe extern "C" fn ZDICT_trainFromBuffer_cover(
         }
         return Error::parameter_outOfBound.to_error_code();
     }
-    if nbSamples == 0 {
+    if samplesSizes.is_empty() {
         if displayLevel >= 1 {
             eprintln!("Cover must have at least one input file");
         }
@@ -835,18 +890,6 @@ pub unsafe extern "C" fn ZDICT_trainFromBuffer_cover(
         return Error::dstSize_tooSmall.to_error_code();
     }
 
-    let samplesSizes = if samplesSizes.is_null() || nbSamples == 0 {
-        &[]
-    } else {
-        core::slice::from_raw_parts(samplesSizes, nbSamples as usize)
-    };
-    let totalSamplesSize = samplesSizes.iter().sum::<usize>();
-    let samples = if samplesBuffer.is_null() || totalSamplesSize == 0 {
-        &[]
-    } else {
-        core::slice::from_raw_parts(samplesBuffer.cast::<u8>(), totalSamplesSize)
-    };
-
     let initVal = COVER_ctx_init(
         &mut ctx,
         samples,
@@ -866,25 +909,24 @@ pub unsafe extern "C" fn ZDICT_trainFromBuffer_cover(
     }
 
     let mut freqs = core::mem::take(&mut ctx.freqs);
-    let dict_tail = COVER_buildDictionary(
-        &ctx,
-        &mut freqs,
-        &mut activeDmers,
-        unsafe { core::slice::from_raw_parts_mut(dictBuffer.cast(), dictBufferCapacity) },
-        parameters,
-    );
+    let dict_tail = COVER_buildDictionary(&ctx, &mut freqs, &mut activeDmers, dict, parameters);
     ctx.freqs = freqs;
 
-    let dictionarySize = ZDICT_finalizeDictionary(
-        dict as *mut core::ffi::c_void,
-        dictBufferCapacity,
-        dict_tail.as_ptr() as *const core::ffi::c_void,
-        dict_tail.len(),
-        samplesBuffer,
-        samplesSizes.as_ptr(),
-        nbSamples,
-        parameters.zParams,
-    );
+    let customDictContentSize = dict_tail.len();
+    let dictBuffer = dict.as_mut_ptr() as *mut core::ffi::c_void;
+    let customDictContent = dictBuffer.wrapping_add(dictBufferCapacity - customDictContentSize);
+    let dictionarySize = unsafe {
+        ZDICT_finalizeDictionary(
+            dictBuffer,
+            dictBufferCapacity,
+            customDictContent,
+            customDictContentSize,
+            samples.as_ptr() as *const core::ffi::c_void,
+            samplesSizes.as_ptr(),
+            samplesSizes.len() as core::ffi::c_uint,
+            parameters.zParams,
+        )
+    };
     if !ERR_isError(dictionarySize) && displayLevel >= 2 {
         eprintln!("Constructed dictionary of size {}", dictionarySize,);
     }
@@ -1181,11 +1223,41 @@ fn COVER_tryParameters(data: Box<COVER_tryParameters_data_t>) {
     drop(freqs);
 }
 
+/// This function tries many parameter combinations (specifically, `k` and `d` combinations) and
+/// picks the best parameters.
+///
+/// `*parameters` is filled with the best parameters found, and the dictionary constructed with
+/// those parameters is stored in `dictBuffer`.
+///
+/// The parameters `d`, `k`, and `steps` are optional:
+/// - If `d` is zero, we check `d = 6` and `d = 8`.
+/// - If `k` is zero, we check `k` in 50..2000.
+/// - If `steps` is zero it defaults to its default value (40).
+///
+/// # Returns
+///
+/// - the size of the dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+/// - an error code, which can be tested with [`ZDICT_isError`]
+///
+/// Dictionary training will fail if there are not enough samples to construct a dictionary, or if
+/// most of the samples are too small (< 8 bytes being the lower limit). If dictionary training
+/// fails, you should use zstd without a dictionary, as the dictionary would've been ineffective
+/// anyways. If you believe your samples would benefit from a dictionary please open an issue with
+/// details, and we can look into it.
+///
+/// On success `*parameters` contains the parameters selected.
+///
 /// # Safety
 ///
 /// Behavior is undefined if any of the following conditions are violated:
 ///
-/// - `parameters` satisfies the conditions of [`pointer::as_mut`]
+/// - `dictBufferCapacity` is 0 or `dictBuffer` and `dictBufferCapacity` satisfy the requirements
+///   of [`core::slice::from_raw_parts_mut`].
+/// - `nbSamples` is 0 or `samplesSizes` and `nbSamples` satisfy the requirements
+///   of [`core::slice::from_raw_parts`].
+/// - `sum(samplesSizes)` is 0 or `samplesBuffer` and `sum(samplesSizes)` satisfy the requirements
+///   of [`core::slice::from_raw_parts`].
+/// - `parameters` satisfies the requirements of `pointer::as_mut`
 #[cfg_attr(feature = "export-symbols", export_name = crate::prefix!(ZDICT_optimizeTrainFromBuffer_cover))]
 pub unsafe extern "C" fn ZDICT_optimizeTrainFromBuffer_cover(
     dictBuffer: *mut core::ffi::c_void,

diff --git a/lib/dictBuilder/fastcover.rs b/lib/dictBuilder/fastcover.rs
@@ -468,6 +468,32 @@ fn FASTCOVER_convertToFastCoverParams(
     fastCoverParams.shrinkDict = coverParams.shrinkDict;
 }
 
+/// Train a dictionary from an array of samples using a modified version of the COVER algorithm.
+///
+/// Samples must be stored concatenated in a single flat buffer `samplesBuffer`, supplied with an
+/// array of sizes `samplesSizes`, providing the size of each sample, in order.
+///
+/// Only parameters `d` and `k` are required. All other parameters will use default values if not
+/// provided.
+///
+/// The resulting dictionary will be saved into `dictBuffer`.
+///
+/// In general, a reasonable dictionary has a size of ~100 KB. It's possible to select a smaller or
+/// larger size just by specifying `dictBufferCapacity`. In general, it's recommended to provide a
+/// few thousand samples, though this can vary a lot. It's recommended that the total size of all
+/// samples be about 100 times the target size of the dictionary.
+///
+/// # Returns
+///
+/// - the size of the dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+/// - an error code, which can be tested with [`crate::ZDICT_isError`]
+///
+/// Dictionary training will fail if there are not enough samples to construct a dictionary, or if
+/// most of the samples are too small (< 8 bytes being the lower limit). If dictionary training
+/// fails, you should use zstd without a dictionary, as the dictionary would've been ineffective
+/// anyways. If you believe your samples would benefit from a dictionary please open an issue with
+/// details, and we can look into it.
+///
 /// # Safety
 ///
 /// Behavior is undefined if any of the following conditions are violated:
@@ -604,6 +630,31 @@ fn train_from_buffer_fastcover(
     dictionarySize
 }
 
+/// This function tries many parameter combinations (specifically, `k` and `d` combinations) and
+/// picks the best parameters.
+///
+/// `*parameters` is filled with the best parameters found, and the dictionary constructed with
+/// those parameters is stored in `dictBuffer`.
+///
+/// The parameters `d`, `k`, `steps`, and `accel` are optional:
+/// - If `d` is zero, we check `d = 6` and `d = 8`.
+/// - If `k` is zero, we check `k` in 50..2000.
+/// - If `steps` is zero it defaults to its default value (40).
+/// - If `accel` is zero, the default value of 1 is used.
+///
+/// # Returns
+///
+/// - the size of the dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+/// - an error code, which can be tested with [`crate::ZDICT_isError`]
+///
+/// Dictionary training will fail if there are not enough samples to construct a dictionary, or if
+/// most of the samples are too small (< 8 bytes being the lower limit). If dictionary training
+/// fails, you should use zstd without a dictionary, as the dictionary would've been ineffective
+/// anyways. If you believe your samples would benefit from a dictionary please open an issue with
+/// details, and we can look into it.
+///
+/// On success `*parameters` contains the parameters selected.
+///
 /// # Safety
 ///
 /// Behavior is undefined if any of the following conditions are violated:
@@ -614,7 +665,7 @@ fn train_from_buffer_fastcover(
 ///   of [`core::slice::from_raw_parts`].
 /// - `sum(samplesSizes)` is 0 or `samplesBuffer` and `sum(samplesSizes)` satisfy the requirements
 ///   of [`core::slice::from_raw_parts`].
-/// - `parameters` satisfies the requirements of [`pointer::as_mut`]
+/// - `parameters` satisfies the requirements of `pointer::as_mut`
 #[cfg_attr(feature = "export-symbols", export_name = crate::prefix!(ZDICT_optimizeTrainFromBuffer_fastCover))]
 pub unsafe extern "C" fn ZDICT_optimizeTrainFromBuffer_fastCover(
     dictBuffer: *mut core::ffi::c_void,

diff --git a/lib/dictBuilder/zdict.rs b/lib/dictBuilder/zdict.rs
@@ -1587,6 +1587,41 @@ pub unsafe extern "C" fn ZDICT_trainFromBuffer_legacy(
     )
 }
 
+/// Train a dictionary from an array of samples.
+///
+/// Calls single-threaded [`ZDICT_optimizeTrainFromBuffer_fastCover`], with `d=8`, `steps=4`,
+/// `f=20`, and `accel=1`.
+///
+/// Samples must be stored concatenated in a single flat buffer `samplesBuffer`, supplied with an
+/// array of sizes `samplesSizes`, providing the size of each sample, in order. The resulting
+/// dictionary will be saved into `dictBuffer`.
+///
+/// In general, a reasonable dictionary has a size of ~100 KB. It's possible to select a smaller or
+/// larger size just by specifying `dictBufferCapacity`. In general, it's recommended to provide a
+/// few thousand samples, though this can vary a lot. It's recommended that the total size of all
+/// samples be about 100 times the target size of the dictionary.
+///
+/// # Returns
+///
+/// - the size of the dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+/// - an error code, which can be tested with [`ZDICT_isError`]
+///
+/// Dictionary training will fail if there are not enough samples to construct a dictionary, or if
+/// most of the samples are too small (< 8 bytes being the lower limit). If dictionary training
+/// fails, you should use zstd without a dictionary, as the dictionary would've been ineffective
+/// anyways. If you believe your samples would benefit from a dictionary please open an issue with
+/// details, and we can look into it.
+///
+/// # Safety
+///
+/// Behavior is undefined if any of the following conditions are violated:
+///
+/// - `dictBufferCapacity` is 0 or `dictBuffer` and `dictBufferCapacity` satisfy the requirements
+///   of [`core::slice::from_raw_parts_mut`].
+/// - `nbSamples` is 0 or `samplesSizes` and `nbSamples` satisfy the requirements
+///   of [`core::slice::from_raw_parts`].
+/// - `sum(samplesSizes)` is 0 or `samplesBuffer` and `sum(samplesSizes)` satisfy the requirements
+///   of [`core::slice::from_raw_parts`].
 #[cfg_attr(feature = "export-symbols", export_name = crate::prefix!(ZDICT_trainFromBuffer))]
 pub unsafe extern "C" fn ZDICT_trainFromBuffer(
     dictBuffer: *mut core::ffi::c_void,