@@ -476,18 +476,20 @@ pub mod common {
476476/// The methods here line up with the `AsBytes` and `FromBytes` traits.
477477pub mod bytes {
478478
479+ use crate :: AsBytes ;
480+
479481 /// A coupled encode/decode pair for byte sequences.
480482 pub trait EncodeDecode {
481483 /// Encoded length in number of `u64` words required.
482- fn length_in_words < ' a , I > ( bytes : I ) -> usize where I : Iterator < Item = ( u64 , & ' a [ u8 ] ) > ;
484+ fn length_in_words < ' a , I > ( bytes : & I ) -> usize where I : AsBytes < ' a > ;
483485 /// Encoded length in number of `u8` bytes required.
484486 ///
485487 /// This method should always be eight times `Self::length_in_words`, and is provided for convenience and clarity.
486- fn length_in_bytes < ' a , I > ( bytes : I ) -> usize where I : Iterator < Item = ( u64 , & ' a [ u8 ] ) > { 8 * Self :: length_in_words ( bytes) }
488+ fn length_in_bytes < ' a , I > ( bytes : & I ) -> usize where I : AsBytes < ' a > { 8 * Self :: length_in_words ( bytes) }
487489 /// Encodes `bytes` into a sequence of `u64`.
488- fn encode < ' a , I > ( store : & mut Vec < u64 > , bytes : I ) where I : Iterator < Item = ( u64 , & ' a [ u8 ] ) > ;
490+ fn encode < ' a , I > ( store : & mut Vec < u64 > , bytes : & I ) where I : AsBytes < ' a > ;
489491 /// Writes `bytes` in the encoded format to an arbitrary writer.
490- fn write < ' a , I , W : std:: io:: Write > ( writer : W , bytes : I ) -> std:: io:: Result < ( ) > where I : Iterator < Item = ( u64 , & ' a [ u8 ] ) > ;
492+ fn write < ' a , I , W : std:: io:: Write > ( writer : W , bytes : & I ) -> std:: io:: Result < ( ) > where I : AsBytes < ' a > ;
491493 /// Decodes bytes from a sequence of `u64`.
492494 fn decode < ' a > ( store : & ' a [ u64 ] ) -> impl Iterator < Item =& ' a [ u8 ] > ;
493495 }
@@ -499,18 +501,20 @@ pub mod bytes {
499501 pub use serialization:: Sequence ;
500502 mod serialization {
501503
504+ use crate :: AsBytes ;
505+
502506 /// Encodes and decodes byte sequences, by prepending the length and appending all the sequences.
503507 pub struct Sequence ;
504508 impl super :: EncodeDecode for Sequence {
505- fn length_in_words < ' a , I > ( bytes : I ) -> usize where I : Iterator < Item = ( u64 , & ' a [ u8 ] ) > {
509+ fn length_in_words < ' a , I > ( bytes : & I ) -> usize where I : AsBytes < ' a > {
506510 // Each byte slice has one `u64` for the length, and then as many `u64`s as needed to hold all bytes.
507- bytes. map ( |( _align, bytes) | 1 + ( bytes. len ( ) + 7 ) /8 ) . sum ( )
511+ bytes. as_bytes ( ) . map ( |( _align, bytes) | 1 + ( bytes. len ( ) + 7 ) /8 ) . sum ( )
508512 }
509- fn encode < ' a , I > ( store : & mut Vec < u64 > , bytes : I ) where I : Iterator < Item = ( u64 , & ' a [ u8 ] ) > {
510- encode ( store, bytes)
513+ fn encode < ' a , I > ( store : & mut Vec < u64 > , bytes : & I ) where I : AsBytes < ' a > {
514+ encode ( store, bytes. as_bytes ( ) )
511515 }
512- fn write < ' a , I , W : std:: io:: Write > ( writer : W , bytes : I ) -> std:: io:: Result < ( ) > where I : Iterator < Item = ( u64 , & ' a [ u8 ] ) > {
513- write ( writer, bytes)
516+ fn write < ' a , I , W : std:: io:: Write > ( writer : W , bytes : & I ) -> std:: io:: Result < ( ) > where I : AsBytes < ' a > {
517+ write ( writer, bytes. as_bytes ( ) )
514518 }
515519 fn decode < ' a > ( store : & ' a [ u64 ] ) -> impl Iterator < Item =& ' a [ u8 ] > {
516520 decode ( store)
@@ -598,6 +602,167 @@ pub mod bytes {
598602 }
599603 }
600604
605+ /// A binary encoding of sequences of byte slices.
606+ ///
607+ /// The encoding starts with a sequence of n+1 offsets describing where to find the n slices in the bytes that follow.
608+ /// Treating the offsets as a byte slice too, each offset indicates the location (in bytes) of the end of its slice.
609+ /// Each byte slice can be found from a pair of adjacent offsets, where the first is rounded up to a multiple of eight.
610+ pub use serialization_neu:: Indexed ;
611+ pub mod serialization_neu {
612+
613+ use crate :: AsBytes ;
614+
615+ /// Encodes and decodes bytes sequences, using an index of offsets.
616+ pub struct Indexed ;
617+ impl super :: EncodeDecode for Indexed {
618+ fn length_in_words < ' a , I > ( bytes : & I ) -> usize where I : AsBytes < ' a > {
619+ 1 + bytes. as_bytes ( ) . map ( |( _align, bytes) | 1 + ( bytes. len ( ) + 7 ) /8 ) . sum :: < usize > ( )
620+ }
621+ fn encode < ' a , I > ( store : & mut Vec < u64 > , bytes : & I ) where I : AsBytes < ' a > {
622+ encode ( store, bytes)
623+ }
624+ fn write < ' a , I , W : std:: io:: Write > ( writer : W , bytes : & I ) -> std:: io:: Result < ( ) > where I : AsBytes < ' a > {
625+ write ( writer, bytes)
626+ }
627+ fn decode < ' a > ( store : & ' a [ u64 ] ) -> impl Iterator < Item =& ' a [ u8 ] > {
628+ decode ( store)
629+ }
630+ }
631+
632+ /// Encodes `item` into `u64` aligned words.
633+ ///
634+ /// The sequence of byte slices are appended, with padding to have each slice start `u64` aligned.
635+ /// The sequence is then pre-pended with as many byte offsets as there are slices in `item`, plus one.
636+ /// The byte offsets indicate where each slice ends, and by rounding up to `u64` alignemnt where the next slice begins.
637+ /// The first offset indicates where the list of offsets itself ends, and where the first slice begins.
638+ ///
639+ /// We will need to visit `as_bytes` three times to extract this information, so the method should be efficient and inlined.
640+ /// The first read writes the first offset, the second writes each other offset, and the third writes the bytes themselves.
641+ ///
642+ /// The offsets are zero-based, rather than based on `store.len()`.
643+ /// If you call the method with a non-empty `store` be careful decoding.
644+ pub fn encode < ' a , I > ( store : & mut Vec < u64 > , iter : & I )
645+ where I : AsBytes < ' a > ,
646+ {
647+ // Read 1: Number of offsets we will record, equal to the number of slices plus one.
648+ // TODO: right-size `store` before first call to `push`.
649+ let offsets = 1 + iter. as_bytes ( ) . count ( ) ;
650+ let offsets_end: u64 = TryInto :: < u64 > :: try_into ( ( offsets) * std:: mem:: size_of :: < u64 > ( ) ) . unwrap ( ) ;
651+ store. push ( offsets_end) ;
652+ // Read 2: Establish each of the offsets based on lengths of byte slices.
653+ let mut position_bytes = offsets_end;
654+ for ( align, bytes) in iter. as_bytes ( ) {
655+ assert ! ( align <= 8 ) ;
656+ // Round up to 8-byte aligned lengths.
657+ let to_push: u64 = position_bytes + TryInto :: < u64 > :: try_into ( bytes. len ( ) ) . unwrap ( ) ;
658+ store. push ( to_push) ;
659+ let round_len: u64 = ( ( bytes. len ( ) + 7 ) & !7 ) . try_into ( ) . unwrap ( ) ;
660+ position_bytes += round_len;
661+ }
662+ // Read 3: Append each byte slice, with padding to align starts to `u64`.
663+ for ( _align, bytes) in iter. as_bytes ( ) {
664+ let whole_words = 8 * ( bytes. len ( ) / 8 ) ;
665+ // We want to extend `store` by `bytes`, but `bytes` may not be `u64` aligned.
666+ // In the latter case, init `store` and cast and copy onto it as a byte slice.
667+ if let Ok ( words) = bytemuck:: try_cast_slice ( & bytes[ .. whole_words] ) {
668+ store. extend_from_slice ( words) ;
669+ }
670+ else {
671+ let store_len = store. len ( ) ;
672+ store. resize ( store_len + whole_words/8 , 0 ) ;
673+ let slice = bytemuck:: try_cast_slice_mut ( & mut store[ store_len..] ) . expect ( "&[u64] should convert to &[u8]" ) ;
674+ slice. copy_from_slice ( & bytes[ .. whole_words] ) ;
675+ }
676+ let remaining_bytes = & bytes[ whole_words..] ;
677+ if !remaining_bytes. is_empty ( ) {
678+ let mut remainder = 0u64 ;
679+ let transmute: & mut [ u8 ] = bytemuck:: try_cast_slice_mut ( std:: slice:: from_mut ( & mut remainder) ) . expect ( "&[u64] should convert to &[u8]" ) ;
680+ for ( i, byte) in remaining_bytes. iter ( ) . enumerate ( ) {
681+ transmute[ i] = * byte;
682+ }
683+ store. push ( remainder) ;
684+ }
685+ }
686+ }
687+
688+ pub fn write < ' a , I , W > ( mut writer : W , iter : & I ) -> std:: io:: Result < ( ) >
689+ where
690+ I : AsBytes < ' a > ,
691+ W : std:: io:: Write ,
692+ {
693+ // Read 1: Number of offsets we will record, equal to the number of slices plus one.
694+ // TODO: right-size `store` before first call to `push`.
695+ let offsets = 1 + iter. as_bytes ( ) . count ( ) ;
696+ let offsets_end: u64 = TryInto :: < u64 > :: try_into ( ( offsets) * std:: mem:: size_of :: < u64 > ( ) ) . unwrap ( ) ;
697+ writer. write_all ( bytemuck:: cast_slice ( std:: slice:: from_ref ( & offsets_end) ) ) ?;
698+ // Read 2: Establish each of the offsets based on lengths of byte slices.
699+ let mut position_bytes = offsets_end;
700+ for ( align, bytes) in iter. as_bytes ( ) {
701+ assert ! ( align <= 8 ) ;
702+ // Round up to 8-byte aligned lengths.
703+ let to_push: u64 = position_bytes + TryInto :: < u64 > :: try_into ( bytes. len ( ) ) . unwrap ( ) ;
704+ writer. write_all ( bytemuck:: cast_slice ( std:: slice:: from_ref ( & to_push) ) ) ?;
705+ let round_len: u64 = ( ( bytes. len ( ) + 7 ) & !7 ) . try_into ( ) . unwrap ( ) ;
706+ position_bytes += round_len;
707+ }
708+ // Read 3: Append each byte slice, with padding to align starts to `u64`.
709+ for ( _align, bytes) in iter. as_bytes ( ) {
710+ writer. write_all ( bytes) ?;
711+ let padding = 8 - ( bytes. len ( ) % 8 ) ;
712+ if padding > 0 {
713+ writer. write_all ( & [ 0u8 ; 8 ] [ ..padding] ) ?;
714+ }
715+ }
716+
717+ Ok ( ( ) )
718+ }
719+
720+ /// Decodes an encoded sequence of byte slices. Each result will be `u64` aligned.
721+ pub fn decode ( store : & [ u64 ] ) -> impl Iterator < Item =& [ u8 ] > {
722+ assert ! ( store[ 0 ] % 8 == 0 ) ;
723+ let slices = ( store[ 0 ] / 8 ) - 1 ;
724+ ( 0 .. slices) . map ( |i| decode_index ( store, i) )
725+ }
726+
727+ /// Decodes a specific byte slice by index. It will be `u64` aligned.
728+ #[ inline( always) ]
729+ pub fn decode_index ( store : & [ u64 ] , index : u64 ) -> & [ u8 ] {
730+ debug_assert ! ( index + 1 < store[ 0 ] /8 ) ;
731+ let index: usize = index. try_into ( ) . unwrap ( ) ;
732+ let lower: usize = ( ( store[ index] + 7 ) & !7 ) . try_into ( ) . unwrap ( ) ;
733+ let upper: usize = ( store[ index + 1 ] ) . try_into ( ) . unwrap ( ) ;
734+ let bytes: & [ u8 ] = bytemuck:: try_cast_slice ( store) . expect ( "&[u64] should convert to &[u8]" ) ;
735+ & bytes[ lower .. upper]
736+ }
737+
#[cfg(test)]
mod test {

    use crate::{Columnar, Container};
    use crate::common::Push;
    use crate::AsBytes;

    use super::{encode, decode};

    /// Encodes `item` into a fresh buffer and checks that decoding yields the same byte slices, in order.
    fn assert_roundtrip<'a, AB: AsBytes<'a>>(item: &AB) {
        let mut store = Vec::new();
        encode(&mut store, item);
        assert!(item.as_bytes().map(|x| x.1).eq(decode(&store)));
    }

    /// Round-trips a container of `Result<u64, String>` holding alternating `Ok` and `Err` values.
    #[test]
    fn round_trip() {

        let mut column: <Result<u64, String> as Columnar>::Container = Default::default();
        for i in 0..10000u64 {
            let success: Result<u64, String> = Ok(i);
            column.push(&success);
            let failure: Result<u64, String> = Err(format!("{:?}", i));
            column.push(&failure);
        }

        let borrowed = column.borrow();
        assert_roundtrip(&borrowed);
    }
}
765+ }
601766
602767 #[ cfg( test) ]
603768 mod test {
@@ -635,7 +800,6 @@ pub mod bytes {
635800 }
636801 }
637802 }
638-
639803}
640804
641805/// Types that prefer to be represented by `Vec<T>`.
0 commit comments