diff --git a/lib/dictBuilder/zdict.rs b/lib/dictBuilder/zdict.rs
index 6f9107dd..70708dac 100644
--- a/lib/dictBuilder/zdict.rs
+++ b/lib/dictBuilder/zdict.rs
@@ -1,11 +1,12 @@
+use std::mem::MaybeUninit;
 use std::time::{Duration, Instant};
 
-use libc::{free, malloc, memcpy, size_t};
+use libc::size_t;
 
-use crate::lib::common::bits::{ZSTD_NbCommonBytes, ZSTD_highbit32};
+use crate::lib::common::bits::ZSTD_highbit32;
 use crate::lib::common::error_private::{ERR_getErrorName, ERR_isError, Error};
 use crate::lib::common::huf::{HUF_CElt, HUF_CTABLE_WORKSPACE_SIZE_U32, HUF_WORKSPACE_SIZE};
-use crate::lib::common::mem::{MEM_readLE32, MEM_readST, MEM_writeLE32};
+use crate::lib::common::mem::{MEM_readLE32, MEM_writeLE32};
 use crate::lib::common::xxhash::ZSTD_XXH64;
 use crate::lib::common::zstd_internal::{
     repStartValue, LLFSELog, MLFSELog, MaxLL, MaxML, OffFSELog, ZSTD_REP_NUM,
@@ -27,7 +28,7 @@ use crate::lib::zdict::experimental::{
 use crate::lib::zdict::ZDICT_params_t;
 use crate::lib::zstd::*;
 
-#[derive(Copy, Clone)]
+#[derive(Clone)]
 #[repr(C)]
 struct EStats_ress_t {
     /// dictionary
@@ -35,7 +36,7 @@ struct EStats_ress_t {
     /// working context
     zc: *mut ZSTD_CCtx,
     /// must be ZSTD_BLOCKSIZE_MAX allocated
-    workPlace: *mut core::ffi::c_void,
+    workPlace: Box<[MaybeUninit<u8>]>,
 }
 
 #[derive(Copy, Clone, Default)]
@@ -61,10 +62,10 @@ impl DictItem {
     }
 }
-const MINRATIO: u32 = 4;
+const MINRATIO: usize = 4;
 const ZDICT_MAX_SAMPLES_SIZE: usize = 2000 << 20;
 #[expect(deprecated)]
-const ZDICT_MIN_SAMPLES_SIZE: usize = ZDICT_CONTENTSIZE_MIN as usize * MINRATIO as usize;
+const ZDICT_MIN_SAMPLES_SIZE: usize = ZDICT_CONTENTSIZE_MIN as usize * MINRATIO;
 
 const NOISELENGTH: usize = 32;
 
 static g_selectivity_default: u32 = 9;
@@ -128,31 +129,21 @@ pub unsafe extern "C" fn ZDICT_getDictHeaderSize(
     )
 }
 
-unsafe fn ZDICT_count(
-    mut pIn: *const core::ffi::c_void,
-    mut pMatch: *const core::ffi::c_void,
-) -> size_t {
-    let pStart = pIn as *const core::ffi::c_char;
-    loop {
-        let diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
-        if diff == 0 {
-            pIn = pIn.byte_add(::core::mem::size_of::<size_t>());
-            pMatch = pMatch.byte_add(::core::mem::size_of::<size_t>());
-        } else {
-            pIn = pIn.byte_offset(ZSTD_NbCommonBytes(diff) as isize);
-            return pIn.byte_offset_from(pStart) as core::ffi::c_long as size_t;
-        }
-    }
+fn ZDICT_count(pIn: &[u8], pMatch: &[u8]) -> size_t {
+    pIn.iter()
+        .zip(pMatch)
+        .position(|(a, b)| a != b)
+        .unwrap_or(Ord::min(pIn.len(), pMatch.len()))
 }
 
 const LLIMIT: usize = 64;
 const MINMATCHLENGTH: usize = 7;
-unsafe fn ZDICT_analyzePos(
+fn ZDICT_analyzePos(
     doneMarks: &mut [bool],
     suffix_slice: &[u32],
     mut start: u32,
     buffer: &[u8],
-    minRatio: u32,
+    minRatio: usize,
     notificationLevel: u32,
 ) -> DictItem {
     let mut lengthList = [0u32; LLIMIT];
@@ -195,10 +186,7 @@ unsafe fn ZDICT_analyzePos(
         let mut length: size_t = 0;
         loop {
             end = end.wrapping_add(1);
-            length = ZDICT_count(
-                buffer[pos..].as_ptr() as *const core::ffi::c_void,
-                buffer[suffix(end as usize) as usize..].as_ptr() as *const core::ffi::c_void,
-            );
+            length = ZDICT_count(&buffer[pos..], &buffer[suffix(end as usize) as usize..]);
             if length < MINMATCHLENGTH {
                 break;
             }
@@ -208,9 +196,8 @@
         let mut length_0: size_t = 0;
         loop {
             length_0 = ZDICT_count(
-                buffer[pos..].as_ptr() as *const core::ffi::c_void,
-                buffer[suffix((start as usize).wrapping_sub(1)) as usize..].as_ptr()
-                    as *const core::ffi::c_void,
+                &buffer[pos..],
+                &buffer[suffix((start as usize).wrapping_sub(1)) as usize..],
             );
             if length_0 >= MINMATCHLENGTH {
                 start = start.wrapping_sub(1);
@@ -221,7 +208,7 @@ unsafe fn ZDICT_analyzePos(
     }
 
     // exit if not found a minimum number of repetitions
-    if end.wrapping_sub(start) < minRatio {
+    if end.wrapping_sub(start) < minRatio as u32 {
         for idx in start..end {
             doneMarks[suffix(idx as usize) as usize] = true;
         }
@@ -269,7 +256,7 @@ unsafe fn ZDICT_analyzePos(
             selectedCount = currentCount;
             selectedID = currentID;
         }
-        if selectedCount < minRatio {
+        if selectedCount < minRatio as u32 {
            break;
        }
        refinedStart = selectedID;
@@ -285,10 +272,7 @@ unsafe fn ZDICT_analyzePos(
    // look forward
    loop {
        end = end.wrapping_add(1);
-        let mut length = ZDICT_count(
-            buffer[pos..].as_ptr() as *const core::ffi::c_void,
-            buffer[suffix(end as usize) as usize..].as_ptr() as *const core::ffi::c_void,
-        );
+        let mut length = ZDICT_count(&buffer[pos..], &buffer[suffix(end as usize) as usize..]);
        if length >= LLIMIT {
            length = LLIMIT - 1;
        }
@@ -302,9 +286,8 @@ unsafe fn ZDICT_analyzePos(
    let mut length_2 = MINMATCHLENGTH;
    while (length_2 >= MINMATCHLENGTH) as core::ffi::c_int & (start > 0) as core::ffi::c_int != 0 {
        length_2 = ZDICT_count(
-            buffer[pos..].as_ptr() as *const core::ffi::c_void,
-            buffer[suffix(start.wrapping_sub(1) as usize) as usize..].as_ptr()
-                as *const core::ffi::c_void,
+            &buffer[pos..],
+            &buffer[suffix(start.wrapping_sub(1) as usize) as usize..],
        );
        if length_2 >= LLIMIT {
            length_2 = LLIMIT - 1;
        }
@@ -327,7 +310,7 @@ unsafe fn ZDICT_analyzePos(
    let mut u_0: core::ffi::c_uint = 0;
    u_0 = (LLIMIT - 1) as core::ffi::c_uint;
    while u_0 >= MINMATCHLENGTH as core::ffi::c_uint {
-        if cumulLength[u_0 as usize] >= minRatio {
+        if cumulLength[u_0 as usize] >= minRatio as u32 {
            break;
        }
        u_0 = u_0.wrapping_sub(1);
@@ -376,10 +359,7 @@ unsafe fn ZDICT_analyzePos(
        if testedPos as size_t == pos {
            length_3 = solution.length;
        } else {
-            length_3 = ZDICT_count(
-                buffer[pos..].as_ptr() as *const core::ffi::c_void,
-                buffer[testedPos as usize..].as_ptr() as *const core::ffi::c_void,
-            ) as u32;
+            length_3 = ZDICT_count(&buffer[pos..], &buffer[testedPos as usize..]) as u32;
            if length_3 > solution.length {
                length_3 = solution.length;
            }
@@ -525,7 +505,7 @@ fn ZDICT_insertDictItem(table: &mut [DictItem], elt: DictItem, buffer: &[u8]) {
     table[0].pos = nextElt.wrapping_add(1);
 }
 
-unsafe fn ZDICT_dictSize(dictList: &[DictItem]) -> u32 {
+fn ZDICT_dictSize(dictList: &[DictItem]) -> u32 {
     let mut u: u32 = 0;
     let mut dictSize = 0u32;
     u = 1;
@@ -536,13 +516,13 @@ unsafe fn ZDICT_dictSize(dictList: &[DictItem]) -> u32 {
     dictSize
 }
 
-unsafe fn ZDICT_trainBuffer_legacy(
+fn ZDICT_trainBuffer_legacy(
     dictList: &mut [DictItem],
     buffer: &[u8],
     mut bufferSize: size_t,
-    fileSizes: *const size_t,
-    mut nbFiles: core::ffi::c_uint,
-    mut minRatio: core::ffi::c_uint,
+    fileSizes: &[size_t],
+    mut nbFiles: usize,
+    mut minRatio: usize,
     notificationLevel: u32,
 ) -> size_t {
     let mut displayClock = Instant::now();
@@ -566,7 +546,7 @@ unsafe fn ZDICT_trainBuffer_legacy(
     }
     while bufferSize > ZDICT_MAX_SAMPLES_SIZE {
         nbFiles = nbFiles.wrapping_sub(1);
-        bufferSize = bufferSize.wrapping_sub(*fileSizes.offset(nbFiles as isize));
+        bufferSize = bufferSize.wrapping_sub(fileSizes[nbFiles]);
     }
 
     // sort
@@ -580,7 +560,7 @@ unsafe fn ZDICT_trainBuffer_legacy(
     let mut suffix = vec![0u32; bufferSize];
     let divSuftSortResult = divsufsort(
         &buffer[..bufferSize],
-        std::mem::transmute::<&mut [u32], &mut [i32]>(&mut suffix),
+        unsafe { std::mem::transmute::<&mut [u32], &mut [i32]>(&mut suffix[..]) },
         false,
     );
     if divSuftSortResult != 0 {
@@ -595,11 +575,11 @@ unsafe fn ZDICT_trainBuffer_legacy(
     // Note: filePos tracks borders between samples.
     // It's not used at this stage, but planned to become useful in a later update
-    let mut filePos = vec![0u32; nbFiles as usize];
+    let mut filePos = vec![0u32; nbFiles];
     // filePos[0] is intentionally left 0
     for pos in 1..nbFiles as size_t {
         filePos[pos] =
-            (filePos[pos - 1] as size_t).wrapping_add(*fileSizes.add(pos.wrapping_sub(1))) as u32;
+            (filePos[pos - 1] as size_t).wrapping_add(fileSizes[pos.wrapping_sub(1)]) as u32;
     }
 
     if notificationLevel >= 2 {
@@ -659,22 +639,19 @@ fn fill_noise(buffer: &mut [u8]) {
 const MAXREPOFFSET: u32 = 1024;
 
 unsafe fn ZDICT_countEStats(
-    esr: EStats_ress_t,
+    esr: &mut EStats_ress_t,
     params: &ZSTD_parameters,
     countLit: &mut [u32; 256],
     offsetcodeCount: &mut [u32; 31],
     matchlengthCount: &mut [u32; 53],
     litlengthCount: &mut [u32; 36],
     repOffsets: &mut [u32; 1024],
-    src: *const core::ffi::c_void,
-    mut srcSize: size_t,
+    src: &[u8],
     notificationLevel: u32,
 ) {
     let blockSizeMax = Ord::min(1 << 17, 1 << params.cParams.windowLog);
     let mut cSize: size_t = 0;
-    if srcSize > blockSizeMax {
-        srcSize = blockSizeMax;
-    }
+    let srcSize = Ord::min(src.len(), blockSizeMax);
     let errorCode = ZSTD_compressBegin_usingCDict_deprecated(esr.zc, esr.dict);
     if ERR_isError(errorCode) {
         if notificationLevel >= 1 {
@@ -684,9 +661,9 @@ unsafe fn ZDICT_countEStats(
     }
     cSize = ZSTD_compressBlock_deprecated(
         esr.zc,
-        esr.workPlace,
+        esr.workPlace.as_mut_ptr().cast(),
         ZSTD_BLOCKSIZE_MAX as size_t,
-        src,
+        src.as_ptr().cast::<core::ffi::c_void>(),
         srcSize,
     );
     if ERR_isError(cSize) {
@@ -780,7 +757,7 @@ unsafe fn ZDICT_analyzeEntropy(
     let mut esr = EStats_ress_t {
         dict: core::ptr::null_mut(),
         zc: core::ptr::null_mut(),
-        workPlace: core::ptr::null_mut(),
+        workPlace: Box::default(),
     };
 
     let eSize = analyze_entropy_internal(
@@ -797,7 +774,6 @@ unsafe fn ZDICT_analyzeEntropy(
 
     ZSTD_freeCDict(esr.dict);
     ZSTD_freeCCtx(esr.zc);
-    free(esr.workPlace);
 
     eSize
 }
@@ -859,8 +835,8 @@ unsafe fn analyze_entropy_internal(
         ZSTD_customMem::default(),
     );
     esr.zc = ZSTD_createCCtx();
-    esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX as size_t);
-    if (esr.dict).is_null() || (esr.zc).is_null() || (esr.workPlace).is_null() {
+    esr.workPlace = Box::new_uninit_slice(ZSTD_BLOCKSIZE_MAX as size_t);
+    if (esr.dict).is_null() || (esr.zc).is_null() {
         if notificationLevel >= 1 {
             eprintln!("Not enough memory");
         }
@@ -871,15 +847,14 @@ unsafe fn analyze_entropy_internal(
     let mut pos = 0usize;
     for fileSize in fileSizes {
         ZDICT_countEStats(
-            *esr,
+            esr,
             &params,
             &mut countLit,
             &mut offcodeCount,
             &mut matchLengthCount,
             &mut litLengthCount,
             &mut repOffset,
-            src[pos..].as_ptr() as *const core::ffi::c_void,
-            *fileSize,
+            &src[pos..][..*fileSize],
             notificationLevel,
         );
         pos = pos.wrapping_add(*fileSize);
@@ -1205,13 +1180,15 @@ unsafe fn finalize_dictionary(
 }
 
 unsafe fn ZDICT_addEntropyTablesFromBuffer_advanced(
-    dictBuffer: *mut core::ffi::c_void,
+    dictBuffer: &mut [MaybeUninit<u8>],
     dictContentSize: size_t,
-    dictBufferCapacity: size_t,
     samples: &[u8],
     samplesSizes: &[usize],
     params: ZDICT_params_t,
 ) -> size_t {
+    let dictBufferCapacity = dictBuffer.len();
+    let dictBuffer = dictBuffer.as_mut_ptr().cast::<core::ffi::c_void>();
+
     let compressionLevel = if params.compressionLevel == 0 {
         ZSTD_CLEVEL_DEFAULT
     } else {
@@ -1233,7 +1210,7 @@ unsafe fn ZDICT_addEntropyTablesFromBuffer_advanced(
         samplesSizes,
         dictBuffer
             .byte_add(dictBufferCapacity)
-            .byte_offset(-(dictContentSize as isize)),
+            .byte_sub(dictContentSize),
         dictContentSize,
         notificationLevel,
     );
@@ -1247,7 +1224,7 @@ unsafe fn ZDICT_addEntropyTablesFromBuffer_advanced(
     let randomID = ZSTD_XXH64(
         dictBuffer
             .byte_add(dictBufferCapacity)
-            .byte_offset(-(dictContentSize as isize)),
+            .byte_sub(dictContentSize),
         dictContentSize,
         0,
     );
@@ -1262,10 +1239,10 @@ unsafe fn ZDICT_addEntropyTablesFromBuffer_advanced(
 
     if hSize.wrapping_add(dictContentSize) < dictBufferCapacity {
         core::ptr::copy(
-            (dictBuffer as *mut core::ffi::c_char)
-                .add(dictBufferCapacity)
-                .sub(dictContentSize),
-            (dictBuffer as *mut core::ffi::c_char).add(hSize),
+            dictBuffer
+                .byte_add(dictBufferCapacity)
+                .byte_sub(dictContentSize),
+            dictBuffer.byte_add(hSize),
             dictContentSize,
         )
     }
@@ -1278,15 +1255,15 @@ unsafe fn ZDICT_addEntropyTablesFromBuffer_advanced(
 ///
 /// - the size of the dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
 /// - an error code, which can be tested with [`ZDICT_isError`]
-unsafe fn ZDICT_trainFromBuffer_unsafe_legacy(
+fn ZDICT_trainFromBuffer_unsafe_legacy(
     dictBuffer: *mut core::ffi::c_void,
     maxDictSize: size_t,
     samples: &[u8],
     samplesSizes: &[usize],
     params: ZDICT_legacy_params_t,
 ) -> size_t {
-    let nbSamples = samplesSizes.len() as u32;
-    let dictListSize = Ord::max(Ord::max(10000, nbSamples), (maxDictSize / 16) as u32);
+    let nbSamples = samplesSizes.len();
+    let dictListSize = Ord::max(Ord::max(10000, nbSamples), maxDictSize / 16);
     let mut dictList = vec![DictItem::default(); dictListSize as size_t];
     let selectivity = if params.selectivityLevel == 0 {
         g_selectivity_default
@@ -1322,7 +1299,7 @@ unsafe fn ZDICT_trainFromBuffer_unsafe_legacy(
         &mut dictList,
         samples,
         samplesBuffSize,
-        samplesSizes.as_ptr(),
+        samplesSizes,
         nbSamples,
         minRep,
         notificationLevel,
@@ -1427,32 +1404,54 @@ unsafe fn ZDICT_trainFromBuffer_unsafe_legacy(
     dictList[0].pos = n;
     dictContentSize_0 = currentSize;
 
-    // build dictionary content
-    let mut ptr = (dictBuffer as *mut u8).add(maxDictSize);
-    for u in 1..dictList[0].pos {
-        let l = (dictList[u as usize]).length;
-        ptr = ptr.offset(-(l as isize));
-        debug_assert!(ptr >= dictBuffer as *mut u8);
-        if ptr < dictBuffer as *mut u8 {
-            return Error::GENERIC.to_error_code(); // should not happen
+    let dictBuffer = unsafe {
+        if dictBuffer.is_null() || maxDictSize == 0 {
+            &mut []
+        } else {
+            core::slice::from_raw_parts_mut(dictBuffer.cast::<MaybeUninit<u8>>(), maxDictSize)
         }
-        memcpy(
-            ptr as *mut core::ffi::c_void,
-            samples[(dictList[u as usize]).pos as usize..].as_ptr() as *const core::ffi::c_void,
-            l as size_t,
-        );
+    };
+
+    if let Err(e) = build_dictionary_content(dictBuffer, samples, &dictList) {
+        return e.to_error_code();
     }
-
+    unsafe {
+        ZDICT_addEntropyTablesFromBuffer_advanced(
+            dictBuffer,
+            dictContentSize_0 as size_t,
+            samples,
+            samplesSizes,
+            params.zParams,
+        )
+    }
+}
 
-    ZDICT_addEntropyTablesFromBuffer_advanced(
-        dictBuffer,
-        dictContentSize_0 as size_t,
-        maxDictSize,
-        samples,
-        samplesSizes,
-        params.zParams,
-    )
+fn build_dictionary_content(
+    dictBuffer: &mut [MaybeUninit<u8>],
+    samples: &[u8],
+    dictList: &[DictItem],
+) -> Result<(), Error> {
+    // convention: table[0].pos stores the number of elements
+    let max = dictList[0].pos;
+
+    let mut ptr = dictBuffer.len();
+    for item in &dictList[1..max as usize] {
+        let l = item.length as usize;
+        ptr = match ptr.checked_sub(l) {
+            None => return Err(Error::GENERIC), // should not happen
+            Some(v) => v,
+        };
+
+        let src = uninit_slice(&samples[item.pos as usize..][..l]);
+        dictBuffer[ptr..][..l].copy_from_slice(src);
+    }
+
+    Ok(())
+}
+
+fn uninit_slice<T>(slice: &[T]) -> &[MaybeUninit<T>] {
+    unsafe { &*(slice as *const [T] as *const [MaybeUninit<T>]) }
 }
 
 /// Train a dictionary from an array of samples.
@@ -1596,6 +1595,12 @@ pub unsafe extern "C" fn ZDICT_addEntropyTablesFromBuffer(
     samplesSizes: *const size_t,
     nbSamples: core::ffi::c_uint,
 ) -> size_t {
+    let dictBuffer = if dictBuffer.is_null() || dictBufferCapacity == 0 {
+        &mut []
+    } else {
+        core::slice::from_raw_parts_mut(dictBuffer.cast::<MaybeUninit<u8>>(), dictBufferCapacity)
+    };
+
     let samplesSizes = if samplesSizes.is_null() || nbSamples == 0 {
         &[]
     } else {
@@ -1612,7 +1617,6 @@ pub unsafe extern "C" fn ZDICT_addEntropyTablesFromBuffer(
     ZDICT_addEntropyTablesFromBuffer_advanced(
         dictBuffer,
         dictContentSize,
-        dictBufferCapacity,
         samples,
         samplesSizes,
         params,