@@ -4,7 +4,7 @@ use rustc_hash::FxHashMap as HashMap;
4
4
use rustc_hash:: FxHashSet as HashSet ;
5
5
use std:: sync:: Arc ;
6
6
use thiserror:: Error ;
7
- use const_primes :: is_prime ;
7
+ use crate :: rollhash :: { roll_hash , roll_hash_slice } ;
8
8
9
9
/// A struct that represents an encoding scheme based on byte-pair encoding (BPE).
10
10
#[ derive( Debug ) ]
@@ -497,43 +497,6 @@ impl Default for Encoding {
497
497
}
498
498
}
499
499
500
- // Chose a prime number greater than 256 that minimizes hash collisions
501
- // for the prefixes of all mergeable ranks.
502
- // Modulus * prime must be less than 2^63-1 to avoid overflow.
503
- const PRIME : i64 = 997 ;
504
- const PRIME_INVERSE : i64 = 617853560682069 ;
505
- const MODULUS : i64 = 1e15 as i64 + 37 ;
506
-
507
- const _: ( ) = assert ! ( PRIME > 256 , "PRIME must be greater than 256 for byte-wise rolling hash" ) ;
508
- const _: ( ) = assert ! ( PRIME < MODULUS , "PRIME must be less than MODULUS" ) ;
509
- const _: ( ) = assert ! (
510
- MODULUS as i128 * PRIME as i128 <= i64 :: MAX as i128 ,
511
- "MODULUS * PRIME must not exceed i64::MAX to avoid overflow"
512
- ) ;
513
- const _: ( ) = assert ! (
514
- ( PRIME as i128 * PRIME_INVERSE as i128 ) % MODULUS as i128 == 1 ,
515
- "PRIME_INVERSE must be the modular multiplicative inverse of PRIME"
516
- ) ;
517
- const _: ( ) = assert ! ( is_prime( PRIME as u64 ) , "PRIME must be a prime number" ) ;
518
- const _: ( ) = assert ! ( is_prime( MODULUS as u64 ) , "MODULUS must be a prime number" ) ;
519
-
520
-
521
- fn roll_hash ( old : i64 , new : u8 ) -> i64 {
522
- ( ( ( old * PRIME ) % MODULUS ) + ( new as i64 ) ) % MODULUS
523
- }
524
-
525
- fn roll_hash_back ( old : i64 , new : u8 ) -> i64 {
526
- ( ( ( ( old + MODULUS ) - ( new as i64 ) ) % MODULUS ) * PRIME_INVERSE ) % MODULUS
527
- }
528
-
529
-
530
- fn roll_hash_slice ( slice : & [ u8 ] ) -> i64 {
531
- let mut hash = 0 ;
532
- for & byte in slice {
533
- hash = roll_hash ( hash, byte) ;
534
- }
535
- hash
536
- }
537
500
#[ cfg( test) ]
538
501
mod tests {
539
502
use crate :: { EncodingFactory , EncodingFactoryError } ;
@@ -542,13 +505,6 @@ mod tests {
542
505
use test_case:: test_case;
543
506
use memory_stats:: memory_stats;
544
507
545
#[test]
fn test_roll_hash() {
    // Hashing 10 then 17 and rolling the 17 back must give the hash of just 10.
    let expected = roll_hash(0, 10);
    let extended = roll_hash(expected, 17);
    let restored = roll_hash_back(extended, 17);
    assert_eq!(restored, expected);
}
551
-
552
508
#[ test_case( EncodingFactory :: llama3 ; "llama3" ) ]
553
509
#[ test_case( EncodingFactory :: codestral ; "codestral" ) ]
554
510
#[ test_case( EncodingFactory :: cl100k_im ; "cl100k_im" ) ]
0 commit comments