diff --git a/lib/dictBuilder/zdict.rs b/lib/dictBuilder/zdict.rs index 2a5dad9b..82492b26 100644 --- a/lib/dictBuilder/zdict.rs +++ b/lib/dictBuilder/zdict.rs @@ -6,7 +6,7 @@ use libc::{free, malloc, memcpy, size_t}; use crate::lib::common::bits::{ZSTD_NbCommonBytes, ZSTD_highbit32}; use crate::lib::common::error_private::{ERR_getErrorName, ERR_isError, Error}; use crate::lib::common::huf::{HUF_CElt, HUF_CTABLE_WORKSPACE_SIZE_U32, HUF_WORKSPACE_SIZE}; -use crate::lib::common::mem::{MEM_read16, MEM_read64, MEM_readLE32, MEM_readST, MEM_writeLE32}; +use crate::lib::common::mem::{MEM_read64, MEM_readLE32, MEM_readST, MEM_writeLE32}; use crate::lib::common::xxhash::ZSTD_XXH64; use crate::lib::common::zstd_internal::{ repStartValue, LLFSELog, MLFSELog, MaxLL, MaxML, OffFSELog, ZSTD_REP_NUM, @@ -149,66 +149,64 @@ unsafe fn ZDICT_count( const LLIMIT: usize = 64; const MINMATCHLENGTH: usize = 7; unsafe fn ZDICT_analyzePos( - doneMarks: *mut u8, - suffix: *const core::ffi::c_uint, + doneMarks: &mut [u8], + suffix: &[u32], mut start: u32, - buffer: *const core::ffi::c_void, + buffer: &[u8], minRatio: u32, notificationLevel: u32, ) -> DictItem { let mut lengthList = [0u32; LLIMIT]; let mut cumulLength = [0u32; LLIMIT]; let mut savings = [0u32; LLIMIT]; - let b = buffer as *const u8; let mut maxLength = LLIMIT; - let mut pos = *suffix.offset(start as isize) as size_t; + let mut pos = suffix[start as usize] as size_t; let mut end = start; let mut solution = DictItem::default(); - *doneMarks.add(pos) = 1; - if MEM_read16(b.add(pos) as *const core::ffi::c_void) as core::ffi::c_int - == MEM_read16(b.add(pos).add(2) as *const core::ffi::c_void) as core::ffi::c_int - || MEM_read16(b.add(pos).add(1) as *const core::ffi::c_void) as core::ffi::c_int - == MEM_read16(b.add(pos).add(3) as *const core::ffi::c_void) as core::ffi::c_int - || MEM_read16(b.add(pos).add(2) as *const core::ffi::c_void) as core::ffi::c_int - == MEM_read16(b.add(pos).add(4) as *const core::ffi::c_void) as core::ffi::c_int + + doneMarks[pos] = 1; + + // trivial repetition cases + if buffer[pos..pos + 2] == buffer[pos + 2..pos + 4] + || buffer[pos + 1..pos + 3] == buffer[pos + 3..pos + 5] + || buffer[pos + 2..pos + 4] == buffer[pos + 4..pos + 6] { - let pattern16 = MEM_read16(b.add(pos).add(4) as *const core::ffi::c_void); - let mut u: u32 = 0; - let mut patternEnd = 6u32; - while MEM_read16(b.add(pos).offset(patternEnd as isize) as *const core::ffi::c_void) - as core::ffi::c_int - == pattern16 as core::ffi::c_int - { - patternEnd = patternEnd.wrapping_add(2); + // skip and mark segment + let pattern16 = &buffer[pos + 4..pos + 6]; + let mut patternEnd = 6usize; + while buffer[pos + patternEnd..pos + patternEnd + 2] == *pattern16 { + patternEnd += 2; } - if *b.add(pos.wrapping_add(patternEnd as size_t)) as core::ffi::c_int - == *b.add(pos.wrapping_add(patternEnd as size_t).wrapping_sub(1)) as core::ffi::c_int - { - patternEnd = patternEnd.wrapping_add(1); + if buffer[pos + patternEnd] == buffer[pos + patternEnd - 1] { + patternEnd += 1; } - u = 1; + let mut u = 1; while u < patternEnd { - *doneMarks.add(pos.wrapping_add(u as size_t)) = 1; + doneMarks[pos.wrapping_add(u as size_t)] = 1; u = u.wrapping_add(1); } return solution; } + + // look forward let mut length: size_t = 0; loop { end = end.wrapping_add(1); length = ZDICT_count( - b.add(pos) as *const core::ffi::c_void, - b.offset(*suffix.offset(end as isize) as isize) as *const core::ffi::c_void, + buffer[pos..].as_ptr() as *const core::ffi::c_void, + buffer[suffix[end as usize] as usize..].as_ptr() as *const core::ffi::c_void, ); if length < MINMATCHLENGTH { break; } } + + // look backward let mut length_0: size_t = 0; loop { length_0 = ZDICT_count( - b.add(pos) as *const core::ffi::c_void, - b.offset(*suffix.offset(start as isize).sub(1) as isize) as *const core::ffi::c_void, + buffer[pos..].as_ptr() as *const core::ffi::c_void, + buffer[suffix[start as usize - 1] as usize..].as_ptr() as *const core::ffi::c_void, ); if length_0 >= MINMATCHLENGTH { start = start.wrapping_sub(1); @@ -217,19 +215,23 @@ unsafe fn ZDICT_analyzePos( break; } } + + // exit if not found a minimum number of repetitions if end.wrapping_sub(start) < minRatio { let mut idx: u32 = 0; idx = start; while idx < end { - *doneMarks.offset(*suffix.offset(idx as isize) as isize) = 1; + doneMarks[suffix[idx as usize] as usize] = 1; idx = idx.wrapping_add(1); } return solution; } + let mut i: core::ffi::c_int = 0; let mut mml: u32 = 0; let mut refinedStart = start; let mut refinedEnd = end; + if notificationLevel >= 4 { eprintln!(); eprint!( @@ -240,6 +242,7 @@ unsafe fn ZDICT_analyzePos( ); eprintln!(); } + mml = MINMATCHLENGTH as u32; loop { let mut currentChar = 0; @@ -250,16 +253,13 @@ unsafe fn ZDICT_analyzePos( let mut selectedID = currentID; id = refinedStart; while id < refinedEnd { - if *b.offset((*suffix.offset(id as isize)).wrapping_add(mml) as isize) - as core::ffi::c_int - != currentChar as core::ffi::c_int - { + if buffer[(suffix[id as usize] + mml) as usize] != currentChar { if currentCount > selectedCount { selectedCount = currentCount; selectedID = currentID; } currentID = id; - currentChar = *b.offset((*suffix.offset(id as isize)).wrapping_add(mml) as isize); + currentChar = buffer[(suffix[id as usize] + mml) as usize]; currentCount = 0; } currentCount = currentCount.wrapping_add(1); @@ -276,20 +276,24 @@ unsafe fn ZDICT_analyzePos( refinedEnd = refinedStart.wrapping_add(selectedCount); mml = mml.wrapping_add(1); } + + // evaluate gain based on new dict start = refinedStart; - pos = *suffix.offset(refinedStart as isize) as size_t; + pos = suffix[refinedStart as usize] as size_t; end = start; ptr::write_bytes( lengthList.as_mut_ptr() as *mut u8, 0, ::core::mem::size_of::<[u32; 64]>(), ); + + // look forward let mut length_1: size_t = 0; loop { end = end.wrapping_add(1); length_1 = ZDICT_count( - b.add(pos) as *const core::ffi::c_void, - b.offset(*suffix.offset(end as isize) as isize) as *const core::ffi::c_void, + buffer[pos..].as_ptr() as *const core::ffi::c_void, + buffer[suffix[end as usize] as usize..].as_ptr() as *const core::ffi::c_void, ); if length_1 >= LLIMIT { length_1 = LLIMIT - 1; @@ -300,11 +304,13 @@ unsafe fn ZDICT_analyzePos( break; } } + + // look backward let mut length_2 = MINMATCHLENGTH; while (length_2 >= MINMATCHLENGTH) as core::ffi::c_int & (start > 0) as core::ffi::c_int != 0 { length_2 = ZDICT_count( - b.add(pos) as *const core::ffi::c_void, - b.offset(*suffix.offset(start.wrapping_sub(1) as isize) as isize) + buffer[pos..].as_ptr() as *const core::ffi::c_void, + buffer[suffix[start.wrapping_sub(1) as usize] as usize..].as_ptr() as *const core::ffi::c_void, ); if length_2 >= LLIMIT { @@ -316,6 +322,8 @@ unsafe fn ZDICT_analyzePos( start = start.wrapping_sub(1); } } + + // largest useful length ptr::write_bytes( cumulLength.as_mut_ptr() as *mut u8, 0, @@ -339,17 +347,19 @@ unsafe fn ZDICT_analyzePos( u_0 = u_0.wrapping_sub(1); } maxLength = u_0 as size_t; + + // reduce maxLength in case of final into repetitive data let mut l = maxLength as u32; - let c = *b.add(pos.wrapping_add(maxLength).wrapping_sub(1)); - while *b.add(pos.wrapping_add(l as size_t).wrapping_sub(2)) as core::ffi::c_int - == c as core::ffi::c_int - { + let c = buffer[pos + maxLength - 1]; + while buffer[pos + l as usize - 2] == c { l = l.wrapping_sub(1); } maxLength = l as size_t; if maxLength < MINMATCHLENGTH { - return solution; + return solution; // skip: no long-enough solution available } + + // calculate savings *savings.as_mut_ptr().add(5) = 0; let mut u_1: core::ffi::c_uint = 0; u_1 = MINMATCHLENGTH as core::ffi::c_uint; @@ -360,6 +370,7 @@ unsafe fn ZDICT_analyzePos( ); u_1 = u_1.wrapping_add(1); } + if notificationLevel >= 4 { eprintln!( "Selected dict at position {}, of length {} : saves {} (ratio: {:.2}) ", @@ -370,22 +381,25 @@ unsafe fn ZDICT_analyzePos( / maxLength as core::ffi::c_double, ); } + solution.pos = pos as u32; solution.length = maxLength as u32; solution.savings = *savings.as_mut_ptr().add(maxLength); + + // mark positions done let mut id_0: u32 = 0; id_0 = start; while id_0 < end { let mut p: u32 = 0; let mut pEnd: u32 = 0; let mut length_3: u32 = 0; - let testedPos = *suffix.offset(id_0 as isize); + let testedPos = suffix[id_0 as usize]; if testedPos as size_t == pos { length_3 = solution.length; } else { length_3 = ZDICT_count( - b.add(pos) as *const core::ffi::c_void, - b.offset(testedPos as isize) as *const core::ffi::c_void, + buffer[pos..].as_ptr() as *const core::ffi::c_void, + buffer[testedPos as usize..].as_ptr() as *const core::ffi::c_void, ) as u32; if length_3 > solution.length { length_3 = solution.length; @@ -394,11 +408,12 @@ unsafe fn ZDICT_analyzePos( pEnd = testedPos.wrapping_add(length_3); p = testedPos; while p < pEnd { - *doneMarks.offset(p as isize) = 1; + doneMarks[p as usize] = 1; p = p.wrapping_add(1); } id_0 = id_0.wrapping_add(1); } + solution } @@ -585,133 +600,108 @@ unsafe fn ZDICT_trainBuffer_legacy( mut minRatio: core::ffi::c_uint, notificationLevel: u32, ) -> size_t { - let suffix0 = malloc( - bufferSize - .wrapping_add(2) - .wrapping_mul(::core::mem::size_of::()), - ) as *mut core::ffi::c_uint; - let suffix = suffix0.add(1); - let reverseSuffix = malloc(bufferSize.wrapping_mul(::core::mem::size_of::())) as *mut u32; - let doneMarks = malloc( - bufferSize - .wrapping_add(16) - .wrapping_mul(::core::mem::size_of::()), - ) as *mut u8; - let filePos = - malloc((nbFiles as size_t).wrapping_mul(::core::mem::size_of::())) as *mut u32; - let mut result = 0; let mut displayClock = Instant::now(); let refresh_rate = Duration::from_millis(300); + let buffer = core::slice::from_raw_parts(buffer as *const u8, bufferSize); + // init if notificationLevel >= 2 { eprintln!("\r{:70 }\r", ""); // clean display line } - if suffix0.is_null() || reverseSuffix.is_null() || doneMarks.is_null() || filePos.is_null() { - result = Error::memory_allocation.to_error_code(); - } else { - if minRatio < MINRATIO { - minRatio = MINRATIO; - } - core::ptr::write_bytes(doneMarks, 0, bufferSize.wrapping_add(16)); - // limit sample set size (divsufsort limitation) - if bufferSize > ZDICT_MAX_SAMPLES_SIZE && notificationLevel >= 3 { - eprintln!( - "sample set too large : reduced to {} MB ...", - (2000) << 20 >> 20, - ); - } - while bufferSize > ZDICT_MAX_SAMPLES_SIZE { - nbFiles = nbFiles.wrapping_sub(1); - bufferSize = bufferSize.wrapping_sub(*fileSizes.offset(nbFiles as isize)); - } + if minRatio < MINRATIO { + minRatio = MINRATIO; + } - // sort - if notificationLevel >= 2 { - eprintln!( - "sorting {} files of total size {} MB ...", - nbFiles, - bufferSize >> 20, - ); + // limit sample set size (divsufsort limitation) + if bufferSize > ZDICT_MAX_SAMPLES_SIZE && notificationLevel >= 3 { + eprintln!( + "sample set too large : reduced to {} MB ...", + (2000) << 20 >> 20, + ); + } + while bufferSize > ZDICT_MAX_SAMPLES_SIZE { + nbFiles = nbFiles.wrapping_sub(1); + bufferSize = bufferSize.wrapping_sub(*fileSizes.offset(nbFiles as isize)); + } + + // sort + if notificationLevel >= 2 { + eprintln!( + "sorting {} files of total size {} MB ...", + nbFiles, + bufferSize >> 20, + ); + } + let mut suffix = vec![0u32; bufferSize + 1]; + let divSuftSortResult = divsufsort( + buffer, + std::mem::transmute::<&mut [u32], &mut [i32]>(&mut suffix[0..bufferSize]), + false, + ); + if divSuftSortResult != 0 { + return Error::GENERIC.to_error_code(); + } + + suffix[bufferSize] = bufferSize as core::ffi::c_uint; + + // build reverse suffix sort + let mut reverseSuffix = vec![0u32; bufferSize]; + for pos in 0..bufferSize { + reverseSuffix[suffix[pos] as usize] = pos as u32; + } + + // Note: filePos tracks borders between samples. + // It's not used at this stage, but planned to become useful in a later update + let mut filePos = vec![0u32; nbFiles as usize]; + // filePos[0] is intentionally left 0 + for pos in 1..nbFiles as size_t { + filePos[pos] = + (filePos[pos - 1] as size_t).wrapping_add(*fileSizes.add(pos.wrapping_sub(1))) as u32; + } + + if notificationLevel >= 2 { + eprintln!("finding patterns ..."); + } + if notificationLevel >= 3 { + eprintln!("minimum ratio : {} ", minRatio); + } + + let mut doneMarks = vec![0u8; bufferSize + 16]; + let mut cursor = 0usize; + while cursor < bufferSize { + if doneMarks[cursor] != 0 { + cursor += 1; + continue; } - let divSuftSortResult = divsufsort( - core::slice::from_raw_parts(buffer as *const u8, bufferSize), - core::slice::from_raw_parts_mut(suffix as *mut i32, bufferSize), - false, + + let solution = ZDICT_analyzePos( + &mut doneMarks, + &suffix, + reverseSuffix[cursor], + buffer, + minRatio, + notificationLevel, ); - if divSuftSortResult != 0 { - result = Error::GENERIC.to_error_code(); - } else { - *suffix.add(bufferSize) = bufferSize as core::ffi::c_uint; - *suffix0 = bufferSize as core::ffi::c_uint; - - // build reverse suffix sort - let mut pos: size_t = 0; - pos = 0; - while pos < bufferSize { - *reverseSuffix.offset(*suffix.add(pos) as isize) = pos as u32; - pos = pos.wrapping_add(1); - } - // Note: filePos tracks borders between samples. - // It's not used at this stage, but planned to become useful in a later update - *filePos = 0; - pos = 1; - while pos < nbFiles as size_t { - *filePos.add(pos) = (*filePos.add(pos.wrapping_sub(1)) as size_t) - .wrapping_add(*fileSizes.add(pos.wrapping_sub(1))) - as u32; - pos = pos.wrapping_add(1); - } + if solution.length == 0 { + cursor += 1; + continue; + } - if notificationLevel >= 2 { - eprintln!("finding patterns ..."); - } - if notificationLevel >= 3 { - eprintln!("minimum ratio : {} ", minRatio); - } + ZDICT_insertDictItem(dictList, dictListSize, solution, buffer.as_ptr().cast()); + cursor += solution.length as usize; - let mut cursor: u32 = 0; - cursor = 0; - while (cursor as size_t) < bufferSize { - let mut solution = DictItem::default(); - if *doneMarks.offset(cursor as isize) != 0 { - cursor = cursor.wrapping_add(1); - } else { - solution = ZDICT_analyzePos( - doneMarks, - suffix, - *reverseSuffix.offset(cursor as isize), - buffer, - minRatio, - notificationLevel, - ); - if solution.length == 0 { - cursor = cursor.wrapping_add(1); - } else { - ZDICT_insertDictItem(dictList, dictListSize, solution, buffer); - cursor = cursor.wrapping_add(solution.length); - if notificationLevel >= 2 { - if displayClock.elapsed() > refresh_rate { - displayClock = Instant::now(); - eprint!( - "\r{:4.2} % \r", - cursor as core::ffi::c_double - / bufferSize as core::ffi::c_double - * 100.0f64, - ); - } - } - } - } - } + if notificationLevel >= 2 && displayClock.elapsed() > refresh_rate { + displayClock = Instant::now(); + eprint!( + "\r{:4.2} % \r", + cursor as core::ffi::c_double / bufferSize as core::ffi::c_double * 100.0f64, + ); } } - free(suffix0 as *mut core::ffi::c_void); - free(reverseSuffix as *mut core::ffi::c_void); - free(doneMarks as *mut core::ffi::c_void); - free(filePos as *mut core::ffi::c_void); - result + + 0 } fn fill_noise(buffer: &mut [u8]) {