@@ -400,8 +400,11 @@ where
400400
401401 // Use the shared function to handle the last two chunks
402402 let final_xmm7 = get_last_two_xmms :: < T , W > (
403- & data[ CRC_CHUNK_SIZE ..] ,
404- remaining_len,
403+ DataRegion {
404+ full_data : data,
405+ offset : CRC_CHUNK_SIZE ,
406+ remaining : remaining_len,
407+ } ,
405408 xmm7,
406409 keys,
407410 reflector,
@@ -461,8 +464,11 @@ where
461464 if remaining_len > 0 {
462465 // Use the shared get_last_two_xmms function to handle the remaining bytes
463466 xmm7 = get_last_two_xmms :: < T , W > (
464- & data[ current_pos..] ,
465- remaining_len,
467+ DataRegion {
468+ full_data : data,
469+ offset : current_pos,
470+ remaining : remaining_len,
471+ } ,
466472 xmm7,
467473 keys,
468474 reflector,
@@ -475,17 +481,35 @@ where
475481 W :: perform_final_reduction ( xmm7, state. reflected , keys, ops)
476482}
477483
484+ /// Data region descriptor for overlapping SIMD reads in CRC processing
485+ struct DataRegion < ' a > {
486+ full_data : & ' a [ u8 ] ,
487+ offset : usize ,
488+ remaining : usize ,
489+ }
490+
478491/// Handle the last two chunks of data (for small inputs)
479492/// This shared implementation works for both CRC-32 and CRC-64
493+ ///
494+ /// # Safety
495+ ///
496+ /// This function requires:
497+ /// - `region.full_data` must contain at least `region.offset + region.remaining` bytes
498+ /// - `region.offset` must be >= `CRC_CHUNK_SIZE` (16 bytes) to allow reading the overlapping region
499+ /// - `region.remaining` must be in range 1..=15 (less than one chunk)
500+ /// - The caller must ensure appropriate SIMD features are available (checked via target_feature)
501+ ///
502+ /// The function performs an overlapping read: it loads 16 bytes starting from position
503+ /// `region.offset - CRC_CHUNK_SIZE + region.remaining` in the full buffer. This creates a vector
504+ /// containing both the tail of the previously processed chunk and the new remaining data.
480505#[ inline]
481506#[ cfg_attr(
482507 any( target_arch = "x86" , target_arch = "x86_64" ) ,
483508 target_feature( enable = "ssse3,sse4.1,pclmulqdq" )
484509) ]
485510#[ cfg_attr( target_arch = "aarch64" , target_feature( enable = "aes" ) ) ]
486511unsafe fn get_last_two_xmms < T : ArchOps , W : EnhancedCrcWidth > (
487- data : & [ u8 ] ,
488- remaining_len : usize ,
512+ region : DataRegion ,
489513 current_state : T :: Vector ,
490514 keys : [ u64 ; 23 ] ,
491515 reflector : & Reflector < T :: Vector > ,
@@ -495,20 +519,29 @@ unsafe fn get_last_two_xmms<T: ArchOps, W: EnhancedCrcWidth>(
495519where
496520 T :: Vector : Copy ,
497521{
522+ // Safety check: ensure we have enough data before the offset for the overlapping read
523+ debug_assert ! ( region. offset >= CRC_CHUNK_SIZE ,
524+ "offset must be >= CRC_CHUNK_SIZE to allow overlapping read" ) ;
525+ debug_assert ! ( region. remaining > 0 && region. remaining < CRC_CHUNK_SIZE ,
526+ "remaining must be 1..15 bytes" ) ;
527+ debug_assert ! ( region. offset + region. remaining <= region. full_data. len( ) ,
528+ "offset + remaining must not exceed full_data length" ) ;
498529 // Create coefficient for folding operations
499530 let coefficient = W :: create_coefficient ( keys[ 2 ] , keys[ 1 ] , reflected, ops) ;
500531
501532 let const_mask = ops. set_all_bytes ( 0x80 ) ;
502533
503534 // Get table pointer and offset based on CRC width
504- let ( table_ptr, offset) = W :: get_last_bytes_table_ptr ( reflected, remaining_len ) ;
535+ let ( table_ptr, offset) = W :: get_last_bytes_table_ptr ( reflected, region . remaining ) ;
505536
506537 if reflected {
507538 // For reflected mode (CRC-32r, CRC-64r)
508539
509- // Load the remaining data
510- // Special pointer arithmetic to match the original implementation
511- let xmm1 = ops. load_bytes ( data. as_ptr ( ) . sub ( CRC_CHUNK_SIZE ) . add ( remaining_len) ) ; // DON: looks correct
540+ // Load the remaining data using an overlapping read
541+ // This loads 16 bytes starting from (offset - CRC_CHUNK_SIZE + remaining),
542+ // which includes the tail of the previous chunk plus the remaining new data
543+ let read_offset = region. offset - CRC_CHUNK_SIZE + region. remaining ;
544+ let xmm1 = ops. load_bytes ( region. full_data . as_ptr ( ) . add ( read_offset) ) ;
512545
513546 // Load the shuffle mask
514547 let mut xmm0 = ops. load_bytes ( table_ptr. add ( offset) ) ;
@@ -549,9 +582,9 @@ where
549582 } else {
550583 // For non-reflected mode (CRC-32f, CRC-64f)
551584
552- // Load the remaining data and apply reflection if needed
553- let data_ptr = data . as_ptr ( ) . sub ( CRC_CHUNK_SIZE ) . add ( remaining_len ) ;
554- let mut xmm1 = ops. load_bytes ( data_ptr ) ;
585+ // Load the remaining data using an overlapping read (same as reflected mode)
586+ let read_offset = region . offset - CRC_CHUNK_SIZE + region . remaining ;
587+ let mut xmm1 = ops. load_bytes ( region . full_data . as_ptr ( ) . add ( read_offset ) ) ;
555588
556589 // Apply reflection if in forward mode
557590 if let Reflector :: ForwardReflector { smask } = reflector {
0 commit comments