@@ -476,18 +476,20 @@ pub mod common {
476476/// The methods here line up with the `AsBytes` and `FromBytes` traits.
477477pub mod bytes {
478478
479+ use crate :: AsBytes ;
480+
479481 /// A coupled encode/decode pair for sequences of byte slices.
480482 pub trait EncodeDecode {
481483 /// Number of `u64` words required to encode `bytes`.
482- fn length_in_words < ' a , I > ( bytes : I ) -> usize where I : Iterator < Item = ( u64 , & ' a [ u8 ] ) > ;
484+ fn length_in_words < ' a , A > ( bytes : & A ) -> usize where A : AsBytes < ' a > ;
483485 /// Number of `u8` bytes required to encode `bytes`.
484486 ///
485487 /// The provided implementation returns eight times `Self::length_in_words`; it exists for convenience and clarity.
486- fn length_in_bytes < ' a , I > ( bytes : I ) -> usize where I : Iterator < Item = ( u64 , & ' a [ u8 ] ) > { 8 * Self :: length_in_words ( bytes) }
488+ fn length_in_bytes < ' a , A > ( bytes : & A ) -> usize where A : AsBytes < ' a > { 8 * Self :: length_in_words ( bytes) }
487489 /// Encodes `bytes` into a sequence of `u64` written into `store`.
488- fn encode < ' a , I > ( store : & mut Vec < u64 > , bytes : I ) where I : Iterator < Item = ( u64 , & ' a [ u8 ] ) > ;
490+ fn encode < ' a , A > ( store : & mut Vec < u64 > , bytes : & A ) where A : AsBytes < ' a > ;
489491 /// Writes `bytes` in the encoded format to an arbitrary writer, reporting any I/O error.
490- fn write < ' a , I , W : std:: io:: Write > ( writer : W , bytes : I ) -> std:: io:: Result < ( ) > where I : Iterator < Item = ( u64 , & ' a [ u8 ] ) > ;
492+ fn write < ' a , A , W : std:: io:: Write > ( writer : W , bytes : & A ) -> std:: io:: Result < ( ) > where A : AsBytes < ' a > ;
491493 /// Decodes the byte slices from a sequence of `u64`; the returned slices borrow from `store`.
492494 fn decode < ' a > ( store : & ' a [ u64 ] ) -> impl Iterator < Item =& ' a [ u8 ] > ;
493495 }
@@ -499,18 +501,20 @@ pub mod bytes {
499501 pub use serialization:: Sequence ;
500502 mod serialization {
501503
504+ use crate :: AsBytes ;
505+
502506 /// Encodes and decodes byte sequences, by prepending the length and appending all the sequences.
503507 pub struct Sequence ;
504508 impl super :: EncodeDecode for Sequence {
505- fn length_in_words < ' a , I > ( bytes : I ) -> usize where I : Iterator < Item = ( u64 , & ' a [ u8 ] ) > {
509+ fn length_in_words < ' a , A > ( bytes : & A ) -> usize where A : AsBytes < ' a > {
506510 // Each byte slice has one `u64` for the length, and then as many `u64`s as needed to hold all bytes.
507- bytes. map ( |( _align, bytes) | 1 + ( bytes. len ( ) + 7 ) /8 ) . sum ( )
511+ bytes. as_bytes ( ) . map ( |( _align, bytes) | 1 + ( bytes. len ( ) + 7 ) /8 ) . sum ( )
508512 }
509- fn encode < ' a , I > ( store : & mut Vec < u64 > , bytes : I ) where I : Iterator < Item = ( u64 , & ' a [ u8 ] ) > {
510- encode ( store, bytes)
513+ fn encode < ' a , A > ( store : & mut Vec < u64 > , bytes : & A ) where A : AsBytes < ' a > {
514+ encode ( store, bytes. as_bytes ( ) )
511515 }
512- fn write < ' a , I , W : std:: io:: Write > ( writer : W , bytes : I ) -> std:: io:: Result < ( ) > where I : Iterator < Item = ( u64 , & ' a [ u8 ] ) > {
513- write ( writer, bytes)
516+ fn write < ' a , A , W : std:: io:: Write > ( writer : W , bytes : & A ) -> std:: io:: Result < ( ) > where A : AsBytes < ' a > {
517+ write ( writer, bytes. as_bytes ( ) )
514518 }
515519 fn decode < ' a > ( store : & ' a [ u64 ] ) -> impl Iterator < Item =& ' a [ u8 ] > {
516520 decode ( store)
@@ -598,6 +602,166 @@ pub mod bytes {
598602 }
599603 }
600604
605+ /// A binary encoding of sequences of byte slices.
606+ ///
607+ /// The encoding starts with a sequence of n+1 offsets describing where to find the n slices in the bytes that follow.
608+ /// Treating the offsets as a byte slice too, each offset indicates the location (in bytes) of the end of its slice.
609+ /// Each byte slice can be found from a pair of adjacent offsets, where the first is rounded up to a multiple of eight.
610+ pub use serialization_neu:: Indexed ;
611+ pub mod serialization_neu {
612+
613+ use crate :: AsBytes ;
614+
615+ /// Encodes and decodes bytes sequences, using an index of offsets.
616+ pub struct Indexed ;
617+ impl super :: EncodeDecode for Indexed {
618+ fn length_in_words < ' a , A > ( bytes : & A ) -> usize where A : AsBytes < ' a > {
619+ 1 + bytes. as_bytes ( ) . map ( |( _align, bytes) | 1 + ( bytes. len ( ) + 7 ) /8 ) . sum :: < usize > ( )
620+ }
621+ fn encode < ' a , A > ( store : & mut Vec < u64 > , bytes : & A ) where A : AsBytes < ' a > {
622+ encode ( store, bytes)
623+ }
624+ fn write < ' a , A , W : std:: io:: Write > ( writer : W , bytes : & A ) -> std:: io:: Result < ( ) > where A : AsBytes < ' a > {
625+ write ( writer, bytes)
626+ }
627+ fn decode < ' a > ( store : & ' a [ u64 ] ) -> impl Iterator < Item =& ' a [ u8 ] > {
628+ decode ( store)
629+ }
630+ }
631+
632+ /// Encodes `item` into `u64` aligned words.
633+ ///
634+ /// The sequence of byte slices are appended, with padding to have each slice start `u64` aligned.
635+ /// The sequence is then pre-pended with as many byte offsets as there are slices in `item`, plus one.
636+ /// The byte offsets indicate where each slice ends, and by rounding up to `u64` alignemnt where the next slice begins.
637+ /// The first offset indicates where the list of offsets itself ends, and where the first slice begins.
638+ ///
639+ /// We will need to visit `as_bytes` three times to extract this information, so the method should be efficient and inlined.
640+ /// The first read writes the first offset, the second writes each other offset, and the third writes the bytes themselves.
641+ ///
642+ /// The offsets are zero-based, rather than based on `store.len()`.
643+ /// If you call the method with a non-empty `store` be careful decoding.
644+ pub fn encode < ' a , A > ( store : & mut Vec < u64 > , iter : & A )
645+ where A : AsBytes < ' a > ,
646+ {
647+ // Read 1: Number of offsets we will record, equal to the number of slices plus one.
648+ // TODO: right-size `store` before first call to `push`.
649+ let offsets = 1 + iter. as_bytes ( ) . count ( ) ;
650+ let offsets_end: u64 = TryInto :: < u64 > :: try_into ( ( offsets) * std:: mem:: size_of :: < u64 > ( ) ) . unwrap ( ) ;
651+ store. push ( offsets_end) ;
652+ // Read 2: Establish each of the offsets based on lengths of byte slices.
653+ let mut position_bytes = offsets_end;
654+ for ( align, bytes) in iter. as_bytes ( ) {
655+ assert ! ( align <= 8 ) ;
656+ // Write length in bytes, but round up to words before updating `position_bytes`.
657+ let to_push: u64 = position_bytes + TryInto :: < u64 > :: try_into ( bytes. len ( ) ) . unwrap ( ) ;
658+ store. push ( to_push) ;
659+ let round_len: u64 = ( ( bytes. len ( ) + 7 ) & !7 ) . try_into ( ) . unwrap ( ) ;
660+ position_bytes += round_len;
661+ }
662+ // Read 3: Append each byte slice, with padding to align starts to `u64`.
663+ for ( _align, bytes) in iter. as_bytes ( ) {
664+ let whole_words = 8 * ( bytes. len ( ) / 8 ) ;
665+ // We want to extend `store` by `bytes`, but `bytes` may not be `u64` aligned.
666+ // In the latter case, init `store` and cast and copy onto it as a byte slice.
667+ if let Ok ( words) = bytemuck:: try_cast_slice ( & bytes[ .. whole_words] ) {
668+ store. extend_from_slice ( words) ;
669+ }
670+ else {
671+ let store_len = store. len ( ) ;
672+ store. resize ( store_len + whole_words/8 , 0 ) ;
673+ let slice = bytemuck:: try_cast_slice_mut ( & mut store[ store_len..] ) . expect ( "&[u64] should convert to &[u8]" ) ;
674+ slice. copy_from_slice ( & bytes[ .. whole_words] ) ;
675+ }
676+ let remaining_bytes = & bytes[ whole_words..] ;
677+ if !remaining_bytes. is_empty ( ) {
678+ let mut remainder = 0u64 ;
679+ let transmute: & mut [ u8 ] = bytemuck:: try_cast_slice_mut ( std:: slice:: from_mut ( & mut remainder) ) . expect ( "&[u64] should convert to &[u8]" ) ;
680+ for ( i, byte) in remaining_bytes. iter ( ) . enumerate ( ) {
681+ transmute[ i] = * byte;
682+ }
683+ store. push ( remainder) ;
684+ }
685+ }
686+ }
687+
688+ pub fn write < ' a , A , W > ( mut writer : W , iter : & A ) -> std:: io:: Result < ( ) >
689+ where
690+ A : AsBytes < ' a > ,
691+ W : std:: io:: Write ,
692+ {
693+ // Read 1: Number of offsets we will record, equal to the number of slices plus one.
694+ let offsets = 1 + iter. as_bytes ( ) . count ( ) ;
695+ let offsets_end: u64 = TryInto :: < u64 > :: try_into ( ( offsets) * std:: mem:: size_of :: < u64 > ( ) ) . unwrap ( ) ;
696+ writer. write_all ( bytemuck:: cast_slice ( std:: slice:: from_ref ( & offsets_end) ) ) ?;
697+ // Read 2: Establish each of the offsets based on lengths of byte slices.
698+ let mut position_bytes = offsets_end;
699+ for ( align, bytes) in iter. as_bytes ( ) {
700+ assert ! ( align <= 8 ) ;
701+ // Write length in bytes, but round up to words before updating `position_bytes`.
702+ let to_push: u64 = position_bytes + TryInto :: < u64 > :: try_into ( bytes. len ( ) ) . unwrap ( ) ;
703+ writer. write_all ( bytemuck:: cast_slice ( std:: slice:: from_ref ( & to_push) ) ) ?;
704+ let round_len: u64 = ( ( bytes. len ( ) + 7 ) & !7 ) . try_into ( ) . unwrap ( ) ;
705+ position_bytes += round_len;
706+ }
707+ // Read 3: Append each byte slice, with padding to align starts to `u64`.
708+ for ( _align, bytes) in iter. as_bytes ( ) {
709+ writer. write_all ( bytes) ?;
710+ let padding = ( ( bytes. len ( ) + 7 ) & !7 ) - bytes. len ( ) ;
711+ if padding > 0 {
712+ writer. write_all ( & [ 0u8 ; 8 ] [ ..padding] ) ?;
713+ }
714+ }
715+
716+ Ok ( ( ) )
717+ }
718+
719+ /// Decodes an encoded sequence of byte slices. Each result will be `u64` aligned.
720+ pub fn decode ( store : & [ u64 ] ) -> impl Iterator < Item =& [ u8 ] > {
721+ assert ! ( store[ 0 ] % 8 == 0 ) ;
722+ let slices = ( store[ 0 ] / 8 ) - 1 ;
723+ ( 0 .. slices) . map ( |i| decode_index ( store, i) )
724+ }
725+
726+ /// Decodes a specific byte slice by index. It will be `u64` aligned.
727+ #[ inline( always) ]
728+ pub fn decode_index ( store : & [ u64 ] , index : u64 ) -> & [ u8 ] {
729+ debug_assert ! ( index + 1 < store[ 0 ] /8 ) ;
730+ let index: usize = index. try_into ( ) . unwrap ( ) ;
731+ let lower: usize = ( ( store[ index] + 7 ) & !7 ) . try_into ( ) . unwrap ( ) ;
732+ let upper: usize = ( store[ index + 1 ] ) . try_into ( ) . unwrap ( ) ;
733+ let bytes: & [ u8 ] = bytemuck:: try_cast_slice ( store) . expect ( "&[u64] should convert to &[u8]" ) ;
734+ & bytes[ lower .. upper]
735+ }
736+
737+ #[ cfg( test) ]
738+ mod test {
739+
740+ use crate :: { Columnar , Container } ;
741+ use crate :: common:: Push ;
742+ use crate :: AsBytes ;
743+
744+ use super :: { encode, decode} ;
745+
746+ fn assert_roundtrip < ' a , AB : AsBytes < ' a > > ( item : & AB ) {
747+ let mut store = Vec :: new ( ) ;
748+ encode ( & mut store, item) ;
749+ assert ! ( item. as_bytes( ) . map( |x| x. 1 ) . eq( decode( & store) ) ) ;
750+ }
751+
752+ #[ test]
753+ fn round_trip ( ) {
754+
755+ let mut column: <Result < u64 , String > as Columnar >:: Container = Default :: default ( ) ;
756+ for i in 0 ..10000u64 {
757+ column. push ( & Ok :: < u64 , String > ( i) ) ;
758+ column. push ( & Err :: < u64 , String > ( format ! ( "{:?}" , i) ) ) ;
759+ }
760+
761+ assert_roundtrip ( & column. borrow ( ) ) ;
762+ }
763+ }
764+ }
601765
602766 #[ cfg( test) ]
603767 mod test {
@@ -635,7 +799,6 @@ pub mod bytes {
635799 }
636800 }
637801 }
638-
639802}
640803
641804/// Types that prefer to be represented by `Vec<T>`.
0 commit comments