From e535cd3072e02e93eb951744c37e2ae602dd7aff Mon Sep 17 00:00:00 2001 From: Michiel Date: Mon, 17 Nov 2025 10:32:17 +0100 Subject: [PATCH 01/10] `ZDICT_trainBuffer_legacy`: make `suffix` a vec --- lib/dictBuilder/zdict.rs | 53 ++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/lib/dictBuilder/zdict.rs b/lib/dictBuilder/zdict.rs index 2a5dad9b..1f60bd5d 100644 --- a/lib/dictBuilder/zdict.rs +++ b/lib/dictBuilder/zdict.rs @@ -150,7 +150,7 @@ const LLIMIT: usize = 64; const MINMATCHLENGTH: usize = 7; unsafe fn ZDICT_analyzePos( doneMarks: *mut u8, - suffix: *const core::ffi::c_uint, + suffix_slice: &[u32], mut start: u32, buffer: *const core::ffi::c_void, minRatio: u32, @@ -161,7 +161,17 @@ unsafe fn ZDICT_analyzePos( let mut savings = [0u32; LLIMIT]; let b = buffer as *const u8; let mut maxLength = LLIMIT; - let mut pos = *suffix.offset(start as isize) as size_t; + + // The C implementation maps index `len` and `-1` to the length of the suffix array. + let suffix = |index| { + if index == usize::MAX || index == suffix_slice.len() { + suffix_slice.len() as u32 + } else { + suffix_slice[index] + } + }; + + let mut pos = suffix(start as usize) as size_t; let mut end = start; let mut solution = DictItem::default(); *doneMarks.add(pos) = 1; @@ -198,7 +208,7 @@ unsafe fn ZDICT_analyzePos( end = end.wrapping_add(1); length = ZDICT_count( b.add(pos) as *const core::ffi::c_void, - b.offset(*suffix.offset(end as isize) as isize) as *const core::ffi::c_void, + b.offset(suffix(end as usize) as isize) as *const core::ffi::c_void, ); if length < MINMATCHLENGTH { break; @@ -208,7 +218,7 @@ unsafe fn ZDICT_analyzePos( loop { length_0 = ZDICT_count( b.add(pos) as *const core::ffi::c_void, - b.offset(*suffix.offset(start as isize).sub(1) as isize) as *const core::ffi::c_void, + b.offset(suffix((start as usize).wrapping_sub(1)) as isize) as *const core::ffi::c_void, ); if length_0 >= MINMATCHLENGTH { start = start.wrapping_sub(1); @@ -221,7 +231,7 @@ unsafe fn ZDICT_analyzePos( let mut idx: u32 = 0; idx = start; while idx < end { - *doneMarks.offset(*suffix.offset(idx as isize) as isize) = 1; + *doneMarks.offset(suffix(idx as usize) as isize) = 1; idx = idx.wrapping_add(1); } return solution; @@ -250,8 +260,7 @@ unsafe fn ZDICT_analyzePos( let mut selectedID = currentID; id = refinedStart; while id < refinedEnd { - if *b.offset((*suffix.offset(id as isize)).wrapping_add(mml) as isize) - as core::ffi::c_int + if *b.offset((suffix(id as usize)).wrapping_add(mml) as isize) as core::ffi::c_int != currentChar as core::ffi::c_int { if currentCount > selectedCount { @@ -259,7 +268,7 @@ unsafe fn ZDICT_analyzePos( selectedID = currentID; } currentID = id; - currentChar = *b.offset((*suffix.offset(id as isize)).wrapping_add(mml) as isize); + currentChar = *b.offset((suffix(id as usize)).wrapping_add(mml) as isize); currentCount = 0; } currentCount = currentCount.wrapping_add(1); @@ -277,7 +286,7 @@ unsafe fn ZDICT_analyzePos( mml = mml.wrapping_add(1); } start = refinedStart; - pos = *suffix.offset(refinedStart as isize) as size_t; + pos = suffix(refinedStart as usize) as size_t; end = start; ptr::write_bytes( lengthList.as_mut_ptr() as *mut u8, @@ -289,7 +298,7 @@ unsafe fn ZDICT_analyzePos( end = end.wrapping_add(1); length_1 = ZDICT_count( b.add(pos) as *const core::ffi::c_void, - b.offset(*suffix.offset(end as isize) as isize) as *const core::ffi::c_void, + b.offset(suffix(end as usize) as isize) as *const core::ffi::c_void, ); if length_1 >= LLIMIT { length_1 = LLIMIT - 1; @@ -304,8 +313,7 @@ unsafe fn ZDICT_analyzePos( while (length_2 >= MINMATCHLENGTH) as core::ffi::c_int & (start > 0) as core::ffi::c_int != 0 { length_2 = ZDICT_count( b.add(pos) as *const core::ffi::c_void, - b.offset(*suffix.offset(start.wrapping_sub(1) as isize) as isize) - as *const core::ffi::c_void, + b.offset(suffix(start.wrapping_sub(1) as usize) as isize) as *const core::ffi::c_void, ); if length_2 >= LLIMIT { length_2 = LLIMIT - 1; @@ -379,7 +387,7 @@ unsafe fn ZDICT_analyzePos( let mut p: u32 = 0; let mut pEnd: u32 = 0; let mut length_3: u32 = 0; - let testedPos = *suffix.offset(id_0 as isize); + let testedPos = suffix(id_0 as usize); if testedPos as size_t == pos { length_3 = solution.length; } else { @@ -585,12 +593,7 @@ unsafe fn ZDICT_trainBuffer_legacy( mut minRatio: core::ffi::c_uint, notificationLevel: u32, ) -> size_t { - let suffix0 = malloc( - bufferSize - .wrapping_add(2) - .wrapping_mul(::core::mem::size_of::()), - ) as *mut core::ffi::c_uint; - let suffix = suffix0.add(1); + let mut suffix = vec![0u32; bufferSize]; let reverseSuffix = malloc(bufferSize.wrapping_mul(::core::mem::size_of::())) as *mut u32; let doneMarks = malloc( bufferSize @@ -607,7 +610,7 @@ unsafe fn ZDICT_trainBuffer_legacy( if notificationLevel >= 2 { eprintln!("\r{:70 }\r", ""); // clean display line } - if suffix0.is_null() || reverseSuffix.is_null() || doneMarks.is_null() || filePos.is_null() { + if reverseSuffix.is_null() || doneMarks.is_null() || filePos.is_null() { result = Error::memory_allocation.to_error_code(); } else { if minRatio < MINRATIO { @@ -637,20 +640,17 @@ unsafe fn ZDICT_trainBuffer_legacy( } let divSuftSortResult = divsufsort( core::slice::from_raw_parts(buffer as *const u8, bufferSize), - core::slice::from_raw_parts_mut(suffix as *mut i32, bufferSize), + std::mem::transmute::<&mut [u32], &mut [i32]>(&mut suffix[..]), false, ); if divSuftSortResult != 0 { result = Error::GENERIC.to_error_code(); } else { - *suffix.add(bufferSize) = bufferSize as core::ffi::c_uint; - *suffix0 = bufferSize as core::ffi::c_uint; - // build reverse suffix sort let mut pos: size_t = 0; pos = 0; while pos < bufferSize { - *reverseSuffix.offset(*suffix.add(pos) as isize) = pos as u32; + *reverseSuffix.offset(suffix[pos] as isize) = pos as u32; pos = pos.wrapping_add(1); } // Note: filePos tracks borders between samples. @@ -680,7 +680,7 @@ unsafe fn ZDICT_trainBuffer_legacy( } else { solution = ZDICT_analyzePos( doneMarks, - suffix, + &suffix, *reverseSuffix.offset(cursor as isize), buffer, minRatio, @@ -707,7 +707,6 @@ unsafe fn ZDICT_trainBuffer_legacy( } } } - free(suffix0 as *mut core::ffi::c_void); free(reverseSuffix as *mut core::ffi::c_void); free(doneMarks as *mut core::ffi::c_void); free(filePos as *mut core::ffi::c_void); From ffd4d2688230b071d9d0a708286324608ff4232a Mon Sep 17 00:00:00 2001 From: Michiel Date: Mon, 17 Nov 2025 10:49:07 +0100 Subject: [PATCH 02/10] `ZDICT_trainBuffer_legacy`: make `reverseSuffix` a vec --- lib/dictBuilder/zdict.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/lib/dictBuilder/zdict.rs b/lib/dictBuilder/zdict.rs index 1f60bd5d..2f8413b5 100644 --- a/lib/dictBuilder/zdict.rs +++ b/lib/dictBuilder/zdict.rs @@ -593,8 +593,6 @@ unsafe fn ZDICT_trainBuffer_legacy( mut minRatio: core::ffi::c_uint, notificationLevel: u32, ) -> size_t { - let mut suffix = vec![0u32; bufferSize]; - let reverseSuffix = malloc(bufferSize.wrapping_mul(::core::mem::size_of::())) as *mut u32; let doneMarks = malloc( bufferSize .wrapping_add(16) @@ -610,7 +608,7 @@ unsafe fn ZDICT_trainBuffer_legacy( if notificationLevel >= 2 { eprintln!("\r{:70 }\r", ""); // clean display line } - if reverseSuffix.is_null() || doneMarks.is_null() || filePos.is_null() { + if doneMarks.is_null() || filePos.is_null() { result = Error::memory_allocation.to_error_code(); } else { if minRatio < MINRATIO { @@ -638,6 +636,7 @@ unsafe fn ZDICT_trainBuffer_legacy( bufferSize >> 20, ); } + let mut suffix = vec![0u32; bufferSize]; let divSuftSortResult = divsufsort( core::slice::from_raw_parts(buffer as *const u8, bufferSize), std::mem::transmute::<&mut [u32], &mut [i32]>(&mut suffix[..]), @@ -647,10 +646,11 @@ unsafe fn ZDICT_trainBuffer_legacy( result = Error::GENERIC.to_error_code(); } else { // build reverse suffix sort + let mut reverseSuffix = vec![0u32; bufferSize]; let mut pos: size_t = 0; pos = 0; while pos < bufferSize { - *reverseSuffix.offset(suffix[pos] as isize) = pos as u32; + reverseSuffix[suffix[pos] as usize] = pos as u32; pos = pos.wrapping_add(1); } // Note: filePos tracks borders between samples. @@ -681,7 +681,7 @@ unsafe fn ZDICT_trainBuffer_legacy( solution = ZDICT_analyzePos( doneMarks, &suffix, - *reverseSuffix.offset(cursor as isize), + reverseSuffix[cursor as usize], buffer, minRatio, notificationLevel, @@ -707,7 +707,6 @@ unsafe fn ZDICT_trainBuffer_legacy( } } } - free(reverseSuffix as *mut core::ffi::c_void); free(doneMarks as *mut core::ffi::c_void); free(filePos as *mut core::ffi::c_void); result From 1210a481419f5e83a6434a47e47a2bdcd1e70eba Mon Sep 17 00:00:00 2001 From: Michiel Date: Mon, 17 Nov 2025 11:00:45 +0100 Subject: [PATCH 03/10] `ZDICT_trainBuffer_legacy`: clean up loops --- lib/dictBuilder/zdict.rs | 70 ++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 38 deletions(-) diff --git a/lib/dictBuilder/zdict.rs b/lib/dictBuilder/zdict.rs index 2f8413b5..53aef0d4 100644 --- a/lib/dictBuilder/zdict.rs +++ b/lib/dictBuilder/zdict.rs @@ -647,21 +647,17 @@ unsafe fn ZDICT_trainBuffer_legacy( } else { // build reverse suffix sort let mut reverseSuffix = vec![0u32; bufferSize]; - let mut pos: size_t = 0; - pos = 0; - while pos < bufferSize { + for pos in 0..bufferSize { reverseSuffix[suffix[pos] as usize] = pos as u32; - pos = pos.wrapping_add(1); } + // Note: filePos tracks borders between samples. // It's not used at this stage, but planned to become useful in a later update *filePos = 0; - pos = 1; - while pos < nbFiles as size_t { + for pos in 1..nbFiles as size_t { *filePos.add(pos) = (*filePos.add(pos.wrapping_sub(1)) as size_t) .wrapping_add(*fileSizes.add(pos.wrapping_sub(1))) as u32; - pos = pos.wrapping_add(1); } if notificationLevel >= 2 { @@ -671,38 +667,36 @@ unsafe fn ZDICT_trainBuffer_legacy( eprintln!("minimum ratio : {} ", minRatio); } - let mut cursor: u32 = 0; - cursor = 0; - while (cursor as size_t) < bufferSize { - let mut solution = DictItem::default(); - if *doneMarks.offset(cursor as isize) != 0 { - cursor = cursor.wrapping_add(1); - } else { - solution = ZDICT_analyzePos( - doneMarks, - &suffix, - reverseSuffix[cursor as usize], - buffer, - minRatio, - notificationLevel, + let mut cursor = 0usize; + while cursor < bufferSize { + if *doneMarks.add(cursor) != 0 { + cursor += 1; + continue; + } + + let solution = ZDICT_analyzePos( + doneMarks, + &suffix, + reverseSuffix[cursor], + buffer, + minRatio, + notificationLevel, + ); + if solution.length == 0 { + cursor += 1; + continue; + } + + ZDICT_insertDictItem(dictList, dictListSize, solution, buffer); + cursor += solution.length as usize; + + if notificationLevel >= 2 && displayClock.elapsed() > refresh_rate { + displayClock = Instant::now(); + eprint!( + "\r{:4.2} % \r", + cursor as core::ffi::c_double / bufferSize as core::ffi::c_double + * 100.0f64, ); - if solution.length == 0 { - cursor = cursor.wrapping_add(1); - } else { - ZDICT_insertDictItem(dictList, dictListSize, solution, buffer); - cursor = cursor.wrapping_add(solution.length); - if notificationLevel >= 2 { - if displayClock.elapsed() > refresh_rate { - displayClock = Instant::now(); - eprint!( - "\r{:4.2} % \r", - cursor as core::ffi::c_double - / bufferSize as core::ffi::c_double - * 100.0f64, - ); - } - } - } } } } From 629581ed2146fe1a8fcda0508fdb824b8cb77ac8 Mon Sep 17 00:00:00 2001 From: Michiel Date: Mon, 17 Nov 2025 11:14:14 +0100 Subject: [PATCH 04/10] `ZDICT_trainBuffer_legacy`: make `doneMarks` a vec --- lib/dictBuilder/zdict.rs | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/lib/dictBuilder/zdict.rs b/lib/dictBuilder/zdict.rs index 53aef0d4..f6b4684e 100644 --- a/lib/dictBuilder/zdict.rs +++ b/lib/dictBuilder/zdict.rs @@ -149,7 +149,7 @@ unsafe fn ZDICT_count( const LLIMIT: usize = 64; const MINMATCHLENGTH: usize = 7; unsafe fn ZDICT_analyzePos( - doneMarks: *mut u8, + doneMarks: &mut [u8], suffix_slice: &[u32], mut start: u32, buffer: *const core::ffi::c_void, @@ -174,7 +174,7 @@ unsafe fn ZDICT_analyzePos( let mut pos = suffix(start as usize) as size_t; let mut end = start; let mut solution = DictItem::default(); - *doneMarks.add(pos) = 1; + doneMarks[pos] = 1; if MEM_read16(b.add(pos) as *const core::ffi::c_void) as core::ffi::c_int == MEM_read16(b.add(pos).add(2) as *const core::ffi::c_void) as core::ffi::c_int || MEM_read16(b.add(pos).add(1) as *const core::ffi::c_void) as core::ffi::c_int @@ -198,7 +198,7 @@ unsafe fn ZDICT_analyzePos( } u = 1; while u < patternEnd { - *doneMarks.add(pos.wrapping_add(u as size_t)) = 1; + doneMarks[pos.wrapping_add(u as size_t)] = 1; u = u.wrapping_add(1); } return solution; @@ -231,7 +231,7 @@ unsafe fn ZDICT_analyzePos( let mut idx: u32 = 0; idx = start; while idx < end { - *doneMarks.offset(suffix(idx as usize) as isize) = 1; + doneMarks[suffix(idx as usize) as usize] = 1; idx = idx.wrapping_add(1); } return solution; @@ -402,7 +402,7 @@ unsafe fn ZDICT_analyzePos( pEnd = testedPos.wrapping_add(length_3); p = testedPos; while p < pEnd { - *doneMarks.offset(p as isize) = 1; + doneMarks[p as usize] = 1; p = p.wrapping_add(1); } id_0 = id_0.wrapping_add(1); @@ -593,11 +593,6 @@ unsafe fn ZDICT_trainBuffer_legacy( mut minRatio: core::ffi::c_uint, notificationLevel: u32, ) -> size_t { - let doneMarks = malloc( - bufferSize - .wrapping_add(16) - .wrapping_mul(::core::mem::size_of::()), - ) as *mut u8; let filePos = malloc((nbFiles as size_t).wrapping_mul(::core::mem::size_of::())) as *mut u32; let mut result = 0; @@ -608,13 +603,12 @@ unsafe fn ZDICT_trainBuffer_legacy( if notificationLevel >= 2 { eprintln!("\r{:70 }\r", ""); // clean display line } - if doneMarks.is_null() || filePos.is_null() { + if filePos.is_null() { result = Error::memory_allocation.to_error_code(); } else { if minRatio < MINRATIO { minRatio = MINRATIO; } - core::ptr::write_bytes(doneMarks, 0, bufferSize.wrapping_add(16)); // limit sample set size (divsufsort limitation) if bufferSize > ZDICT_MAX_SAMPLES_SIZE && notificationLevel >= 3 { @@ -667,15 +661,16 @@ unsafe fn ZDICT_trainBuffer_legacy( eprintln!("minimum ratio : {} ", minRatio); } + let mut doneMarks = vec![0u8; bufferSize + 16]; let mut cursor = 0usize; while cursor < bufferSize { - if *doneMarks.add(cursor) != 0 { + if doneMarks[cursor] != 0 { cursor += 1; continue; } let solution = ZDICT_analyzePos( - doneMarks, + &mut doneMarks, &suffix, reverseSuffix[cursor], buffer, @@ -701,7 +696,6 @@ unsafe fn ZDICT_trainBuffer_legacy( } } } - free(doneMarks as *mut core::ffi::c_void); free(filePos as *mut core::ffi::c_void); result } From 5cb1863196142053e83ee24e9476169706b3c26d Mon Sep 17 00:00:00 2001 From: Michiel Date: Mon, 17 Nov 2025 11:19:46 +0100 Subject: [PATCH 05/10] `ZDICT_trainBuffer_legacy`: make `filePos` a vec --- lib/dictBuilder/zdict.rs | 167 +++++++++++++++++++-------------------- 1 file changed, 80 insertions(+), 87 deletions(-) diff --git a/lib/dictBuilder/zdict.rs b/lib/dictBuilder/zdict.rs index f6b4684e..84e2ccc3 100644 --- a/lib/dictBuilder/zdict.rs +++ b/lib/dictBuilder/zdict.rs @@ -593,9 +593,6 @@ unsafe fn ZDICT_trainBuffer_legacy( mut minRatio: core::ffi::c_uint, notificationLevel: u32, ) -> size_t { - let filePos = - malloc((nbFiles as size_t).wrapping_mul(::core::mem::size_of::())) as *mut u32; - let mut result = 0; let mut displayClock = Instant::now(); let refresh_rate = Duration::from_millis(300); @@ -603,101 +600,97 @@ unsafe fn ZDICT_trainBuffer_legacy( if notificationLevel >= 2 { eprintln!("\r{:70 }\r", ""); // clean display line } - if filePos.is_null() { - result = Error::memory_allocation.to_error_code(); - } else { - if minRatio < MINRATIO { - minRatio = MINRATIO; - } - // limit sample set size (divsufsort limitation) - if bufferSize > ZDICT_MAX_SAMPLES_SIZE && notificationLevel >= 3 { - eprintln!( - "sample set too large : reduced to {} MB ...", - (2000) << 20 >> 20, - ); - } - while bufferSize > ZDICT_MAX_SAMPLES_SIZE { - nbFiles = nbFiles.wrapping_sub(1); - bufferSize = bufferSize.wrapping_sub(*fileSizes.offset(nbFiles as isize)); - } + if minRatio < MINRATIO { + minRatio = MINRATIO; + } - // sort - if notificationLevel >= 2 { - eprintln!( - "sorting {} files of total size {} MB ...", - nbFiles, - bufferSize >> 20, - ); - } - let mut suffix = vec![0u32; bufferSize]; - let divSuftSortResult = divsufsort( - core::slice::from_raw_parts(buffer as *const u8, bufferSize), - std::mem::transmute::<&mut [u32], &mut [i32]>(&mut suffix[..]), - false, + // limit sample set size (divsufsort limitation) + if bufferSize > ZDICT_MAX_SAMPLES_SIZE && notificationLevel >= 3 { + eprintln!( + "sample set too large : reduced to {} MB ...", + (2000) << 20 >> 20, ); - if divSuftSortResult != 0 { - result = Error::GENERIC.to_error_code(); - } else { - // build reverse suffix sort - let mut reverseSuffix = vec![0u32; bufferSize]; - for pos in 0..bufferSize { - reverseSuffix[suffix[pos] as usize] = pos as u32; - } + } + while bufferSize > ZDICT_MAX_SAMPLES_SIZE { + nbFiles = nbFiles.wrapping_sub(1); + bufferSize = bufferSize.wrapping_sub(*fileSizes.offset(nbFiles as isize)); + } - // Note: filePos tracks borders between samples. - // It's not used at this stage, but planned to become useful in a later update - *filePos = 0; - for pos in 1..nbFiles as size_t { - *filePos.add(pos) = (*filePos.add(pos.wrapping_sub(1)) as size_t) - .wrapping_add(*fileSizes.add(pos.wrapping_sub(1))) - as u32; - } + // sort + if notificationLevel >= 2 { + eprintln!( + "sorting {} files of total size {} MB ...", + nbFiles, + bufferSize >> 20, + ); + } + let mut suffix = vec![0u32; bufferSize]; + let divSuftSortResult = divsufsort( + core::slice::from_raw_parts(buffer as *const u8, bufferSize), + std::mem::transmute::<&mut [u32], &mut [i32]>(&mut suffix), + false, + ); + if divSuftSortResult != 0 { + return Error::GENERIC.to_error_code(); + } - if notificationLevel >= 2 { - eprintln!("finding patterns ..."); - } - if notificationLevel >= 3 { - eprintln!("minimum ratio : {} ", minRatio); - } + // build reverse suffix sort + let mut reverseSuffix = vec![0u32; bufferSize]; + for pos in 0..bufferSize { + reverseSuffix[suffix[pos] as usize] = pos as u32; + } - let mut doneMarks = vec![0u8; bufferSize + 16]; - let mut cursor = 0usize; - while cursor < bufferSize { - if doneMarks[cursor] != 0 { - cursor += 1; - continue; - } + // Note: filePos tracks borders between samples. + // It's not used at this stage, but planned to become useful in a later update + let mut filePos = vec![0u32; nbFiles as usize]; + // filePos[0] is intentionally left 0 + for pos in 1..nbFiles as size_t { + filePos[pos] = + (filePos[pos - 1] as size_t).wrapping_add(*fileSizes.add(pos.wrapping_sub(1))) as u32; + } - let solution = ZDICT_analyzePos( - &mut doneMarks, - &suffix, - reverseSuffix[cursor], - buffer, - minRatio, - notificationLevel, - ); - if solution.length == 0 { - cursor += 1; - continue; - } + if notificationLevel >= 2 { + eprintln!("finding patterns ..."); + } + if notificationLevel >= 3 { + eprintln!("minimum ratio : {} ", minRatio); + } - ZDICT_insertDictItem(dictList, dictListSize, solution, buffer); - cursor += solution.length as usize; + let mut doneMarks = vec![0u8; bufferSize + 16]; + let mut cursor = 0usize; + while cursor < bufferSize { + if doneMarks[cursor] != 0 { + cursor += 1; + continue; + } - if notificationLevel >= 2 && displayClock.elapsed() > refresh_rate { - displayClock = Instant::now(); - eprint!( - "\r{:4.2} % \r", - cursor as core::ffi::c_double / bufferSize as core::ffi::c_double - * 100.0f64, - ); - } - } + let solution = ZDICT_analyzePos( + &mut doneMarks, + &suffix, + reverseSuffix[cursor], + buffer, + minRatio, + notificationLevel, + ); + if solution.length == 0 { + cursor += 1; + continue; + } + + ZDICT_insertDictItem(dictList, dictListSize, solution, buffer); + cursor += solution.length as usize; + + if notificationLevel >= 2 && displayClock.elapsed() > refresh_rate { + displayClock = Instant::now(); + eprint!( + "\r{:4.2} % \r", + cursor as core::ffi::c_double / bufferSize as core::ffi::c_double * 100.0f64, + ); } } - free(filePos as *mut core::ffi::c_void); - result + + 0 } fn fill_noise(buffer: &mut [u8]) { From 6d1d5e6d3bb2f05a8b96d2b939a222ef2763ed78 Mon Sep 17 00:00:00 2001 From: Michiel Date: Mon, 17 Nov 2025 11:40:53 +0100 Subject: [PATCH 06/10] `ZDICT_analyzePos`: comments & new lines --- lib/dictBuilder/zdict.rs | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/lib/dictBuilder/zdict.rs b/lib/dictBuilder/zdict.rs index 84e2ccc3..c007b32e 100644 --- a/lib/dictBuilder/zdict.rs +++ b/lib/dictBuilder/zdict.rs @@ -174,7 +174,10 @@ unsafe fn ZDICT_analyzePos( let mut pos = suffix(start as usize) as size_t; let mut end = start; let mut solution = DictItem::default(); + doneMarks[pos] = 1; + + // trivial repetition cases if MEM_read16(b.add(pos) as *const core::ffi::c_void) as core::ffi::c_int == MEM_read16(b.add(pos).add(2) as *const core::ffi::c_void) as core::ffi::c_int || MEM_read16(b.add(pos).add(1) as *const core::ffi::c_void) as core::ffi::c_int @@ -182,6 +185,7 @@ unsafe fn ZDICT_analyzePos( || MEM_read16(b.add(pos).add(2) as *const core::ffi::c_void) as core::ffi::c_int == MEM_read16(b.add(pos).add(4) as *const core::ffi::c_void) as core::ffi::c_int { + // skip and mark segment let pattern16 = MEM_read16(b.add(pos).add(4) as *const core::ffi::c_void); let mut u: u32 = 0; let mut patternEnd = 6u32; @@ -203,6 +207,8 @@ unsafe fn ZDICT_analyzePos( } return solution; } + + // look forward let mut length: size_t = 0; loop { end = end.wrapping_add(1); @@ -214,6 +220,8 @@ unsafe fn ZDICT_analyzePos( break; } } + + // look backward let mut length_0: size_t = 0; loop { length_0 = ZDICT_count( @@ -227,6 +235,8 @@ unsafe fn ZDICT_analyzePos( break; } } + + // exit if not found a minimum number of repetitions if end.wrapping_sub(start) < minRatio { let mut idx: u32 = 0; idx = start; @@ -236,10 +246,12 @@ unsafe fn ZDICT_analyzePos( } return solution; } + let mut i: core::ffi::c_int = 0; let mut mml: u32 = 0; let mut refinedStart = start; let mut refinedEnd = end; + if notificationLevel >= 4 { eprintln!(); eprint!( @@ -250,6 +262,7 @@ unsafe fn ZDICT_analyzePos( ); eprintln!(); } + mml = MINMATCHLENGTH as u32; loop { let mut currentChar = 0; @@ -285,6 +298,8 @@ unsafe fn ZDICT_analyzePos( refinedEnd = refinedStart.wrapping_add(selectedCount); mml = mml.wrapping_add(1); } + + // evaluate gain based on new dict start = refinedStart; pos = suffix(refinedStart as usize) as size_t; end = start; @@ -293,6 +308,8 @@ unsafe fn ZDICT_analyzePos( 0, ::core::mem::size_of::<[u32; 64]>(), ); + + // look forward let mut length_1: size_t = 0; loop { end = end.wrapping_add(1); @@ -309,6 +326,8 @@ unsafe fn ZDICT_analyzePos( break; } } + + // look backward let mut length_2 = MINMATCHLENGTH; while (length_2 >= MINMATCHLENGTH) as core::ffi::c_int & (start > 0) as core::ffi::c_int != 0 { length_2 = ZDICT_count( @@ -324,6 +343,8 @@ unsafe fn ZDICT_analyzePos( start = start.wrapping_sub(1); } } + + // largest useful length ptr::write_bytes( cumulLength.as_mut_ptr() as *mut u8, 0, @@ -347,6 +368,8 @@ unsafe fn ZDICT_analyzePos( u_0 = u_0.wrapping_sub(1); } maxLength = u_0 as size_t; + + // reduce maxLength in case of final into repetitive data let mut l = maxLength as u32; let c = *b.add(pos.wrapping_add(maxLength).wrapping_sub(1)); while *b.add(pos.wrapping_add(l as size_t).wrapping_sub(2)) as core::ffi::c_int @@ -356,8 +379,10 @@ unsafe fn ZDICT_analyzePos( } maxLength = l as size_t; if maxLength < MINMATCHLENGTH { - return solution; + return solution; // skip: no long-enough solution available } + + // calculate savings *savings.as_mut_ptr().add(5) = 0; let mut u_1: core::ffi::c_uint = 0; u_1 = MINMATCHLENGTH as core::ffi::c_uint; @@ -368,6 +393,7 @@ unsafe fn ZDICT_analyzePos( ); u_1 = u_1.wrapping_add(1); } + if notificationLevel >= 4 { eprintln!( "Selected dict at position {}, of length {} : saves {} (ratio: {:.2}) ", @@ -378,9 +404,12 @@ unsafe fn ZDICT_analyzePos( / maxLength as core::ffi::c_double, ); } + solution.pos = pos as u32; solution.length = maxLength as u32; solution.savings = *savings.as_mut_ptr().add(maxLength); + + // mark positions done let mut id_0: u32 = 0; id_0 = start; while id_0 < end { @@ -407,6 +436,7 @@ unsafe fn ZDICT_analyzePos( } id_0 = id_0.wrapping_add(1); } + solution } From d74ffd32862cf987f0f55ace480207daca390864 Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Sun, 23 Nov 2025 21:21:33 +0100 Subject: [PATCH 07/10] `zdict.rs`: make `doneMarks` use `bool` --- lib/dictBuilder/zdict.rs | 14 +++++++------- test-libzstd-rs-sys/src/dict_builder.rs | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/dictBuilder/zdict.rs b/lib/dictBuilder/zdict.rs index c007b32e..ff780622 100644 --- a/lib/dictBuilder/zdict.rs +++ b/lib/dictBuilder/zdict.rs @@ -149,7 +149,7 @@ unsafe fn ZDICT_count( const LLIMIT: usize = 64; const MINMATCHLENGTH: usize = 7; unsafe fn ZDICT_analyzePos( - doneMarks: &mut [u8], + doneMarks: &mut [bool], suffix_slice: &[u32], mut start: u32, buffer: *const core::ffi::c_void, @@ -175,7 +175,7 @@ unsafe fn ZDICT_analyzePos( let mut end = start; let mut solution = DictItem::default(); - doneMarks[pos] = 1; + doneMarks[pos] = true; // trivial repetition cases if MEM_read16(b.add(pos) as *const core::ffi::c_void) as core::ffi::c_int @@ -202,7 +202,7 @@ unsafe fn ZDICT_analyzePos( } u = 1; while u < patternEnd { - doneMarks[pos.wrapping_add(u as size_t)] = 1; + doneMarks[pos.wrapping_add(u as size_t)] = true; u = u.wrapping_add(1); } return solution; @@ -241,7 +241,7 @@ unsafe fn ZDICT_analyzePos( let mut idx: u32 = 0; idx = start; while idx < end { - doneMarks[suffix(idx as usize) as usize] = 1; + doneMarks[suffix(idx as usize) as usize] = true; idx = idx.wrapping_add(1); } return solution; @@ -431,7 +431,7 @@ unsafe fn ZDICT_analyzePos( pEnd = testedPos.wrapping_add(length_3); p = testedPos; while p < pEnd { - doneMarks[p as usize] = 1; + doneMarks[p as usize] = true; p = p.wrapping_add(1); } id_0 = id_0.wrapping_add(1); @@ -687,10 +687,10 @@ unsafe fn ZDICT_trainBuffer_legacy( eprintln!("minimum ratio : {} ", minRatio); } - let mut doneMarks = vec![0u8; bufferSize + 16]; + let mut doneMarks = vec![false; bufferSize + 16]; let mut cursor = 0usize; while cursor < bufferSize { - if doneMarks[cursor] != 0 { + if doneMarks[cursor] { cursor += 1; continue; } diff --git a/test-libzstd-rs-sys/src/dict_builder.rs b/test-libzstd-rs-sys/src/dict_builder.rs index c9c287d7..e9e376c8 100644 --- a/test-libzstd-rs-sys/src/dict_builder.rs +++ b/test-libzstd-rs-sys/src/dict_builder.rs @@ -196,7 +196,7 @@ fn test_train_from_buffer_fastcover() { #[test] #[cfg(not(target_family = "wasm"))] -#[cfg_attr(miri, ignore = "slow")] +// #[cfg_attr(miri, ignore = "slow")] fn test_train_from_buffer_legacy() { let input_data = "The quick brown fox jumps high"; From e0cbe669b914ec8cf8f17a1e81e9f2037aab0268 Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Sun, 23 Nov 2025 21:23:59 +0100 Subject: [PATCH 08/10] `zdict.rs`: remove casts --- lib/dictBuilder/zdict.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/dictBuilder/zdict.rs b/lib/dictBuilder/zdict.rs index ff780622..3ed65e69 100644 --- a/lib/dictBuilder/zdict.rs +++ b/lib/dictBuilder/zdict.rs @@ -190,13 +190,12 @@ unsafe fn ZDICT_analyzePos( let mut u: u32 = 0; let mut patternEnd = 6u32; while MEM_read16(b.add(pos).offset(patternEnd as isize) as *const core::ffi::c_void) - as core::ffi::c_int - == pattern16 as core::ffi::c_int + == pattern16 { patternEnd = patternEnd.wrapping_add(2); } - if *b.add(pos.wrapping_add(patternEnd as size_t)) as core::ffi::c_int - == *b.add(pos.wrapping_add(patternEnd as size_t).wrapping_sub(1)) as core::ffi::c_int + if *b.add(pos.wrapping_add(patternEnd as size_t)) + == *b.add(pos.wrapping_add(patternEnd as size_t).wrapping_sub(1)) { patternEnd = patternEnd.wrapping_add(1); } From 9c5513cf59727daa62b44f69de2f743b02e93e80 Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Sun, 23 Nov 2025 21:27:36 +0100 Subject: [PATCH 09/10] `zdict.rs`: use slice fill on `doneMarks` --- lib/dictBuilder/zdict.rs | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/lib/dictBuilder/zdict.rs b/lib/dictBuilder/zdict.rs index 3ed65e69..01fa0782 100644 --- a/lib/dictBuilder/zdict.rs +++ b/lib/dictBuilder/zdict.rs @@ -187,11 +187,8 @@ unsafe fn ZDICT_analyzePos( { // skip and mark segment let pattern16 = MEM_read16(b.add(pos).add(4) as *const core::ffi::c_void); - let mut u: u32 = 0; - let mut patternEnd = 6u32; - while MEM_read16(b.add(pos).offset(patternEnd as isize) as *const core::ffi::c_void) - == pattern16 - { + let mut patternEnd = 6usize; + while MEM_read16(b.add(pos).add(patternEnd) as *const core::ffi::c_void) == pattern16 { patternEnd = patternEnd.wrapping_add(2); } if *b.add(pos.wrapping_add(patternEnd as size_t)) @@ -199,11 +196,7 @@ unsafe fn ZDICT_analyzePos( { patternEnd = patternEnd.wrapping_add(1); } - u = 1; - while u < patternEnd { - doneMarks[pos.wrapping_add(u as size_t)] = true; - u = u.wrapping_add(1); - } + doneMarks[pos..][1..patternEnd].fill(true); return solution; } From ae18b5707105e8333059d8263a584ce4db063107 Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Sun, 23 Nov 2025 21:30:52 +0100 Subject: [PATCH 10/10] `zdict.rs`: cleanup loops --- lib/dictBuilder/zdict.rs | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/lib/dictBuilder/zdict.rs b/lib/dictBuilder/zdict.rs index 01fa0782..4099040c 100644 --- a/lib/dictBuilder/zdict.rs +++ b/lib/dictBuilder/zdict.rs @@ -230,11 +230,8 @@ unsafe fn ZDICT_analyzePos( // exit if not found a minimum number of repetitions if end.wrapping_sub(start) < minRatio { - let mut idx: u32 = 0; - idx = start; - while idx < end { + for idx in start..end { doneMarks[suffix(idx as usize) as usize] = true; - idx = idx.wrapping_add(1); } return solution; } @@ -260,14 +257,11 @@ unsafe fn ZDICT_analyzePos( let mut currentChar = 0; let mut currentCount = 0u32; let mut currentID = refinedStart; - let mut id: u32 = 0; let mut selectedCount = 0; let mut selectedID = currentID; - id = refinedStart; - while id < refinedEnd { - if *b.offset((suffix(id as usize)).wrapping_add(mml) as isize) as core::ffi::c_int - != currentChar as core::ffi::c_int - { + + for id in refinedStart..refinedEnd { + if *b.offset((suffix(id as usize)).wrapping_add(mml) as isize) != currentChar { if currentCount > selectedCount { selectedCount = currentCount; selectedID = currentID; @@ -277,8 +271,8 @@ unsafe fn ZDICT_analyzePos( currentCount = 0; } currentCount = currentCount.wrapping_add(1); - id = id.wrapping_add(1); } + if currentCount > selectedCount { selectedCount = currentCount; selectedID = currentID;