Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 107 additions & 35 deletions lib/dictBuilder/cover.rs
Original file line number Diff line number Diff line change
Expand Up @@ -555,9 +555,7 @@ fn COVER_ctx_init<'a>(
totalSamplesSize
};
let testSamplesSize = if splitPoint < 1.0f64 {
samplesSizes[nbTrainSamples..][..nbTestSamples]
.iter()
.sum()
samplesSizes[nbTrainSamples..][..nbTestSamples].iter().sum()
} else {
totalSamplesSize
};
Expand Down Expand Up @@ -803,16 +801,73 @@ pub(super) const unsafe fn assume_init_ref<T>(slice: &[MaybeUninit<T>]) -> &[T]
unsafe { &*(slice as *const [MaybeUninit<T>] as *const [T]) }
}

/// Train a dictionary from an array of samples using the COVER algorithm.
///
/// Samples must be stored concatenated in a single flat buffer `samplesBuffer`, supplied with an
/// array of sizes `samplesSizes`, providing the size of each sample, in order.
///
/// The resulting dictionary will be saved into `dictBuffer`.
///
/// In general, a reasonable dictionary has a size of ~100 KB. It's possible to select a smaller or
/// larger size, just by specifying `dictBufferCapacity`. In general, it's recommended to provide a
/// few thousand samples, though this can vary a lot. It's recommended that the total size of all
/// samples be about 100 times the target size of the dictionary.
///
/// # Returns
///
/// - the size of the dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
/// - an error code, which can be tested with [`ZDICT_isError`]
///
/// Dictionary training will fail if there are not enough samples to construct a dictionary, or if
/// most of the samples are too small (< 8 bytes being the lower limit). If dictionary training
/// fails, you should use zstd without a dictionary, as the dictionary would've been ineffective
/// anyways. If you believe your samples would benefit from a dictionary please open an issue with
/// details, and we can look into it.
///
/// # Safety
///
/// Behavior is undefined if any of the following conditions are violated:
///
/// - `dictBufferCapacity` is 0 or `dictBuffer` and `dictBufferCapacity` satisfy the requirements
///   of [`core::slice::from_raw_parts_mut`].
/// - `nbSamples` is 0 or `samplesSizes` and `nbSamples` satisfy the requirements
///   of [`core::slice::from_raw_parts`].
/// - `sum(samplesSizes)` is 0 or `samplesBuffer` and `sum(samplesSizes)` satisfy the requirements
///   of [`core::slice::from_raw_parts`].
#[cfg_attr(feature = "export-symbols", export_name = crate::prefix!(ZDICT_trainFromBuffer_cover))]
pub unsafe extern "C" fn ZDICT_trainFromBuffer_cover(
    dictBuffer: *mut core::ffi::c_void,
    dictBufferCapacity: size_t,
    samplesBuffer: *const core::ffi::c_void,
    samplesSizes: *const size_t,
    nbSamples: core::ffi::c_uint,
    parameters: ZDICT_cover_params_t,
) -> size_t {
    // `from_raw_parts_mut` requires a non-null, aligned pointer even for a zero-length slice, so a
    // null or zero-capacity destination is mapped to an empty slice instead of being passed on.
    // This is what makes the "`dictBufferCapacity` is 0" escape hatch in `# Safety` sound.
    let dict: &mut [core::mem::MaybeUninit<u8>] =
        if dictBuffer.is_null() || dictBufferCapacity == 0 {
            &mut []
        } else {
            // SAFETY: the pointer is non-null and the capacity is non-zero, so the caller
            // guarantees the pair satisfies the requirements of `from_raw_parts_mut`.
            unsafe { core::slice::from_raw_parts_mut(dictBuffer.cast(), dictBufferCapacity) }
        };

    let samplesSizes: &[size_t] = if samplesSizes.is_null() || nbSamples == 0 {
        &[]
    } else {
        // SAFETY: the pointer is non-null and `nbSamples` is non-zero, so the caller guarantees
        // the pair satisfies the requirements of `from_raw_parts`.
        unsafe { core::slice::from_raw_parts(samplesSizes, nbSamples as usize) }
    };

    // The flat sample buffer is exactly as long as the sum of the individual sample sizes.
    let totalSamplesSize = samplesSizes.iter().sum::<usize>();
    let samples: &[u8] = if samplesBuffer.is_null() || totalSamplesSize == 0 {
        &[]
    } else {
        // SAFETY: the pointer is non-null and the total size is non-zero, so the caller
        // guarantees the pair satisfies the requirements of `from_raw_parts`.
        unsafe { core::slice::from_raw_parts(samplesBuffer.cast::<u8>(), totalSamplesSize) }
    };

    // All raw pointers are now slices; the training itself is done by the slice-based helper.
    train_from_buffer_cover(dict, samples, samplesSizes, parameters)
}

fn train_from_buffer_cover(
dict: &mut [MaybeUninit<u8>],
samples: &[u8],
samplesSizes: &[usize],
mut parameters: ZDICT_cover_params_t,
) -> size_t {
let dict = dictBuffer as *mut u8;
let dictBufferCapacity = dict.len();

let mut ctx = COVER_ctx_t::default();
let displayLevel = parameters.zParams.notificationLevel as core::ffi::c_int;
parameters.splitPoint = 1.0f64;
Expand All @@ -822,7 +877,7 @@ pub unsafe extern "C" fn ZDICT_trainFromBuffer_cover(
}
return Error::parameter_outOfBound.to_error_code();
}
if nbSamples == 0 {
if samplesSizes.is_empty() {
if displayLevel >= 1 {
eprintln!("Cover must have at least one input file");
}
Expand All @@ -835,18 +890,6 @@ pub unsafe extern "C" fn ZDICT_trainFromBuffer_cover(
return Error::dstSize_tooSmall.to_error_code();
}

let samplesSizes = if samplesSizes.is_null() || nbSamples == 0 {
&[]
} else {
core::slice::from_raw_parts(samplesSizes, nbSamples as usize)
};
let totalSamplesSize = samplesSizes.iter().sum::<usize>();
let samples = if samplesBuffer.is_null() || totalSamplesSize == 0 {
&[]
} else {
core::slice::from_raw_parts(samplesBuffer.cast::<u8>(), totalSamplesSize)
};

let initVal = COVER_ctx_init(
&mut ctx,
samples,
Expand All @@ -866,25 +909,24 @@ pub unsafe extern "C" fn ZDICT_trainFromBuffer_cover(
}

let mut freqs = core::mem::take(&mut ctx.freqs);
let dict_tail = COVER_buildDictionary(
&ctx,
&mut freqs,
&mut activeDmers,
unsafe { core::slice::from_raw_parts_mut(dictBuffer.cast(), dictBufferCapacity) },
parameters,
);
let dict_tail = COVER_buildDictionary(&ctx, &mut freqs, &mut activeDmers, dict, parameters);
ctx.freqs = freqs;

let dictionarySize = ZDICT_finalizeDictionary(
dict as *mut core::ffi::c_void,
dictBufferCapacity,
dict_tail.as_ptr() as *const core::ffi::c_void,
dict_tail.len(),
samplesBuffer,
samplesSizes.as_ptr(),
nbSamples,
parameters.zParams,
);
let customDictContentSize = dict_tail.len();
let dictBuffer = dict.as_mut_ptr() as *mut core::ffi::c_void;
let customDictContent = dictBuffer.wrapping_add(dictBufferCapacity - customDictContentSize);
let dictionarySize = unsafe {
ZDICT_finalizeDictionary(
dictBuffer,
dictBufferCapacity,
customDictContent,
customDictContentSize,
samples.as_ptr() as *const core::ffi::c_void,
samplesSizes.as_ptr(),
samplesSizes.len() as core::ffi::c_uint,
parameters.zParams,
)
};
if !ERR_isError(dictionarySize) && displayLevel >= 2 {
eprintln!("Constructed dictionary of size {}", dictionarySize,);
}
Expand Down Expand Up @@ -1181,11 +1223,41 @@ fn COVER_tryParameters(data: Box<COVER_tryParameters_data_t>) {
drop(freqs);
}

/// This function tries many parameter combinations (specifically, `k` and `d` combinations) and
/// picks the best parameters.
///
/// `*parameters` is filled with the best parameters found, and the dictionary constructed with
/// those parameters is stored in `dictBuffer`.
///
/// The parameters `d`, `k`, and `steps` are optional:
/// - If `d` is zero, we check `d` = 6 and `d` = 8.
/// - If `k` is zero, we check `k` in 50..2000.
/// - If `steps` is zero it defaults to its default value (40).
///
/// # Returns
///
/// - the size of the dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
/// - an error code, which can be tested with [`ZDICT_isError`]
///
/// Dictionary training will fail if there are not enough samples to construct a dictionary, or if
/// most of the samples are too small (< 8 bytes being the lower limit). If dictionary training
/// fails, you should use zstd without a dictionary, as the dictionary would've been ineffective
/// anyways. If you believe your samples would benefit from a dictionary please open an issue with
/// details, and we can look into it.
///
/// On success `*parameters` contains the parameters selected.
///
/// # Safety
///
/// Behavior is undefined if any of the following conditions are violated:
///
/// - `parameters` satisfies the conditions of [`pointer::as_mut`]
/// - `dictBufferCapacity` is 0 or `dictBuffer` and `dictBufferCapacity` satisfy the requirements
/// of [`core::slice::from_raw_parts_mut`].
/// - `nbSamples` is 0 or `samplesSizes` and `nbSamples` satisfy the requirements
/// of [`core::slice::from_raw_parts`].
/// - `sum(samplesSizes)` is 0 or `samplesBuffer` and `sum(samplesSizes)` satisfy the requirements
/// of [`core::slice::from_raw_parts`].
/// - `parameters` satisfies the requirements of `pointer::as_mut`
#[cfg_attr(feature = "export-symbols", export_name = crate::prefix!(ZDICT_optimizeTrainFromBuffer_cover))]
pub unsafe extern "C" fn ZDICT_optimizeTrainFromBuffer_cover(
dictBuffer: *mut core::ffi::c_void,
Expand Down
53 changes: 52 additions & 1 deletion lib/dictBuilder/fastcover.rs
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,32 @@ fn FASTCOVER_convertToFastCoverParams(
fastCoverParams.shrinkDict = coverParams.shrinkDict;
}

/// Train a dictionary from an array of samples using a modified version of COVER algorithm.
///
/// Samples must be stored concatenated in a single flat buffer `samplesBuffer`, supplied with an
/// array of sizes `samplesSizes`, providing the size of each sample, in order.
///
/// Only parameters `d` and `k` are required. All other parameters will use default values if not
/// provided.
///
/// The resulting dictionary will be saved into `dictBuffer`.
///
/// In general, a reasonable dictionary has a size of ~100 KB. It's possible to select smaller or
/// larger size, just by specifying `dictBufferCapacity`. In general, it's recommended to provide a
/// few thousand samples, though this can vary a lot. It's recommended that the total size of all
/// samples be about 100 times the target size of the dictionary.
///
/// # Returns
///
/// - the size of the dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
/// - an error code, which can be tested with [`crate::ZDICT_isError`]
///
/// Dictionary training will fail if there are not enough samples to construct a dictionary, or if
/// most of the samples are too small (< 8 bytes being the lower limit). If dictionary training
/// fails, you should use zstd without a dictionary, as the dictionary would've been ineffective
/// anyways. If you believe your samples would benefit from a dictionary please open an issue with
/// details, and we can look into it.
///
/// # Safety
///
/// Behavior is undefined if any of the following conditions are violated:
Expand Down Expand Up @@ -604,6 +630,31 @@ fn train_from_buffer_fastcover(
dictionarySize
}

/// This function tries many parameter combinations (specifically, `k` and `d` combinations) and
/// picks the best parameters.
///
/// `*parameters` is filled with the best parameters found, and the dictionary constructed with
/// those parameters is stored in `dictBuffer`.
///
/// The parameters `d`, `k`, `steps`, and `accel` are optional:
/// - If `d` is zero, we check `d` = 6 and `d` = 8.
/// - If `k` is zero, we check `k` in 50..2000.
/// - If `steps` is zero it defaults to its default value (40).
/// - If `accel` is zero, the default value of 1 is used.
///
/// # Returns
///
/// - the size of the dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
/// - an error code, which can be tested with [`crate::ZDICT_isError`]
///
/// Dictionary training will fail if there are not enough samples to construct a dictionary, or if
/// most of the samples are too small (< 8 bytes being the lower limit). If dictionary training
/// fails, you should use zstd without a dictionary, as the dictionary would've been ineffective
/// anyways. If you believe your samples would benefit from a dictionary please open an issue with
/// details, and we can look into it.
///
/// On success `*parameters` contains the parameters selected.
///
/// # Safety
///
/// Behavior is undefined if any of the following conditions are violated:
Expand All @@ -614,7 +665,7 @@ fn train_from_buffer_fastcover(
/// of [`core::slice::from_raw_parts`].
/// - `sum(samplesSizes)` is 0 or `samplesBuffer` and `sum(samplesSizes)` satisfy the requirements
/// of [`core::slice::from_raw_parts`].
/// - `parameters` satisfies the requirements of [`pointer::as_mut`]
/// - `parameters` satisfies the requirements of `pointer::as_mut`
#[cfg_attr(feature = "export-symbols", export_name = crate::prefix!(ZDICT_optimizeTrainFromBuffer_fastCover))]
pub unsafe extern "C" fn ZDICT_optimizeTrainFromBuffer_fastCover(
dictBuffer: *mut core::ffi::c_void,
Expand Down
35 changes: 35 additions & 0 deletions lib/dictBuilder/zdict.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1587,6 +1587,41 @@ pub unsafe extern "C" fn ZDICT_trainFromBuffer_legacy(
)
}

/// Train a dictionary from an array of samples.
///
/// Calls single-threaded [`ZDICT_optimizeTrainFromBuffer_fastCover`], with `d=8`, `steps=4`,
/// `f=20`, and `accel=1`.
///
/// Samples must be stored concatenated in a single flat buffer `samplesBuffer`, supplied with an
/// array of sizes `samplesSizes`, providing the size of each sample, in order. The resulting
/// dictionary will be saved into `dictBuffer`.
///
/// In general, a reasonable dictionary has a size of ~100 KB. It's possible to select smaller or
/// larger size, just by specifying `dictBufferCapacity`. In general, it's recommended to provide a
/// few thousand samples, though this can vary a lot. It's recommended that the total size of all
/// samples be about 100 times the target size of the dictionary.
///
/// # Returns
///
/// - the size of the dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
/// - an error code, which can be tested with [`ZDICT_isError`]
///
/// Dictionary training will fail if there are not enough samples to construct a dictionary, or if
/// most of the samples are too small (< 8 bytes being the lower limit). If dictionary training
/// fails, you should use zstd without a dictionary, as the dictionary would've been ineffective
/// anyways. If you believe your samples would benefit from a dictionary please open an issue with
/// details, and we can look into it.
///
/// # Safety
///
/// Behavior is undefined if any of the following conditions are violated:
///
/// - `dictBufferCapacity` is 0 or `dictBuffer` and `dictBufferCapacity` satisfy the requirements
/// of [`core::slice::from_raw_parts_mut`].
/// - `nbSamples` is 0 or `samplesSizes` and `nbSamples` satisfy the requirements
/// of [`core::slice::from_raw_parts`].
/// - `sum(samplesSizes)` is 0 or `samplesBuffer` and `sum(samplesSizes)` satisfy the requirements
/// of [`core::slice::from_raw_parts`].
#[cfg_attr(feature = "export-symbols", export_name = crate::prefix!(ZDICT_trainFromBuffer))]
pub unsafe extern "C" fn ZDICT_trainFromBuffer(
dictBuffer: *mut core::ffi::c_void,
Expand Down