@@ -11,7 +11,6 @@ use arrow_array::{
1111} ;
1212use arrow_select:: concat:: concat_batches;
1313use arrow_select:: take:: take_record_batch;
14- use bytes:: { Bytes , BytesMut } ;
1514use futures:: stream;
1615use itertools:: Itertools ;
1716use log:: info;
@@ -23,22 +22,17 @@ use parquet::file::metadata::RowGroupMetaData;
2322use serde:: { Deserialize , Serialize } ;
2423use stream:: StreamExt ;
2524use vortex:: aliases:: hash_map:: HashMap ;
26- use vortex:: array:: { ChunkedArray , PrimitiveArray } ;
25+ use vortex:: array:: ChunkedArray ;
2726use vortex:: arrow:: FromArrowType ;
28- use vortex:: buffer:: Buffer ;
2927use vortex:: compress:: CompressionStrategy ;
3028use vortex:: dtype:: DType ;
31- use vortex:: error:: { vortex_err, VortexResult } ;
32- use vortex:: sampling_compressor:: SamplingCompressor ;
33- use vortex:: serde:: chunked_reader:: ChunkedArrayReader ;
34- use vortex:: serde:: io:: { ObjectStoreExt , VortexReadAt , VortexWrite } ;
35- use vortex:: serde:: stream_reader:: StreamArrayReader ;
36- use vortex:: serde:: stream_writer:: StreamArrayWriter ;
37- use vortex:: serde:: DTypeReader ;
38- use vortex:: stream:: ArrayStreamExt ;
39- use vortex:: { Array , ArrayDType , IntoArray , IntoCanonical } ;
40-
41- use crate :: { COMPRESSORS , CTX } ;
29+ use vortex:: error:: VortexResult ;
30+ use vortex:: sampling_compressor:: { SamplingCompressor , ALL_ENCODINGS_CONTEXT } ;
31+ use vortex:: serde:: io:: { ObjectStoreReadAt , VortexReadAt , VortexWrite } ;
32+ use vortex:: serde:: layouts:: {
33+ LayoutBatchStreamBuilder , LayoutContext , LayoutDeserializer , LayoutWriter ,
34+ } ;
35+ use vortex:: { Array , IntoArray , IntoCanonical } ;
4236
4337pub const BATCH_SIZE : usize = 65_536 ;
4438
@@ -51,15 +45,18 @@ pub struct VortexFooter {
5145
5246pub async fn open_vortex ( path : & Path ) -> VortexResult < Array > {
5347 let file = tokio:: fs:: File :: open ( path) . await . unwrap ( ) ;
54- let reader = StreamArrayReader :: try_new ( file, CTX . clone ( ) )
55- . await ?
56- . load_dtype ( )
57- . await ?;
58- reader
59- . into_array_stream ( )
60- . collect_chunked ( )
61- . await
62- . map ( IntoArray :: into_array)
48+
49+ LayoutBatchStreamBuilder :: new (
50+ file,
51+ LayoutDeserializer :: new (
52+ ALL_ENCODINGS_CONTEXT . clone ( ) ,
53+ LayoutContext :: default ( ) . into ( ) ,
54+ ) ,
55+ )
56+ . build ( )
57+ . await ?
58+ . read_all ( )
59+ . await
6360}
6461
6562pub async fn rewrite_parquet_as_vortex < W : VortexWrite > (
@@ -68,24 +65,11 @@ pub async fn rewrite_parquet_as_vortex<W: VortexWrite>(
6865) -> VortexResult < ( ) > {
6966 let chunked = compress_parquet_to_vortex ( parquet_path. as_path ( ) ) ?;
7067
71- let written = StreamArrayWriter :: new ( write)
72- . write_array_stream ( chunked. array_stream ( ) )
68+ LayoutWriter :: new ( write)
69+ . write_array_columns ( chunked)
70+ . await ?
71+ . finalize ( )
7372 . await ?;
74-
75- let layout = written. array_layouts ( ) [ 0 ] . clone ( ) ;
76- let mut w = written. into_inner ( ) ;
77- let mut s = flexbuffers:: FlexbufferSerializer :: new ( ) ;
78- VortexFooter {
79- byte_offsets : layout. chunks . byte_offsets ,
80- row_offsets : layout. chunks . row_offsets ,
81- dtype_range : layout. dtype . begin ..layout. dtype . end ,
82- }
83- . serialize ( & mut s) ?;
84- let footer_bytes = Buffer :: from ( Bytes :: from ( s. take_buffer ( ) ) ) ;
85- let footer_len = footer_bytes. len ( ) as u64 ;
86- w. write_all ( footer_bytes) . await ?;
87- w. write_all ( footer_len. to_le_bytes ( ) ) . await ?;
88-
8973 Ok ( ( ) )
9074}
9175
@@ -102,17 +86,9 @@ pub fn read_parquet_to_vortex<P: AsRef<Path>>(parquet_path: P) -> VortexResult<C
10286 ChunkedArray :: try_new ( chunks, dtype)
10387}
10488
105- pub fn compress_parquet_to_vortex ( parquet_path : & Path ) -> VortexResult < ChunkedArray > {
89+ pub fn compress_parquet_to_vortex ( parquet_path : & Path ) -> VortexResult < Array > {
10690 let chunked = read_parquet_to_vortex ( parquet_path) ?;
107- let compressor: & dyn CompressionStrategy = & SamplingCompressor :: new ( COMPRESSORS . clone ( ) ) ;
108- let dtype = chunked. dtype ( ) . clone ( ) ;
109- ChunkedArray :: try_new (
110- chunked
111- . chunks ( )
112- . map ( |x| compressor. compress ( & x) )
113- . collect :: < VortexResult < Vec < _ > > > ( ) ?,
114- dtype,
115- )
91+ CompressionStrategy :: compress ( & SamplingCompressor :: default ( ) , & chunked. into_array ( ) )
11692}
11793
11894pub fn write_csv_as_parquet ( csv_path : PathBuf , output_path : & Path ) -> VortexResult < ( ) > {
@@ -134,64 +110,37 @@ pub fn write_csv_as_parquet(csv_path: PathBuf, output_path: &Path) -> VortexResu
134110 Ok ( ( ) )
135111}
136112
137- pub async fn read_vortex_footer_format < R : VortexReadAt > (
138- reader : R ,
139- len : u64 ,
140- ) -> VortexResult < ChunkedArrayReader < R > > {
141- let mut buf = BytesMut :: with_capacity ( 8 ) ;
142- unsafe { buf. set_len ( 8 ) }
143- buf = reader. read_at_into ( len - 8 , buf) . await ?;
144- let footer_len = u64:: from_le_bytes ( buf. as_ref ( ) . try_into ( ) . unwrap ( ) ) as usize ;
145-
146- buf. reserve ( footer_len - buf. len ( ) ) ;
147- unsafe { buf. set_len ( footer_len) }
148- buf = reader
149- . read_at_into ( len - footer_len as u64 - 8 , buf)
150- . await ?;
151-
152- let footer: VortexFooter = VortexFooter :: deserialize (
153- flexbuffers:: Reader :: get_root ( buf. as_ref ( ) ) . map_err ( |e| vortex_err ! ( "{}" , e) ) ?,
154- ) ?;
155-
156- let header_len = ( footer. dtype_range . end - footer. dtype_range . start ) as usize ;
157- buf. reserve ( header_len - buf. len ( ) ) ;
158- unsafe { buf. set_len ( header_len) }
159- buf = reader. read_at_into ( footer. dtype_range . start , buf) . await ?;
160- let dtype = DTypeReader :: new ( buf) . await ?. read_dtype ( ) . await ?;
161-
162- ChunkedArrayReader :: try_new (
113+ async fn take_vortex < T : VortexReadAt + Unpin + ' static > (
114+ reader : T ,
115+ indices : & [ u64 ] ,
116+ ) -> VortexResult < Array > {
117+ LayoutBatchStreamBuilder :: new (
163118 reader,
164- CTX . clone ( ) ,
165- dtype . into ( ) ,
166- PrimitiveArray :: from ( footer . byte_offsets ) . into_array ( ) ,
167- PrimitiveArray :: from ( footer . row_offsets ) . into_array ( ) ,
119+ LayoutDeserializer :: new (
120+ ALL_ENCODINGS_CONTEXT . clone ( ) ,
121+ LayoutContext :: default ( ) . into ( ) ,
122+ ) ,
168123 )
124+ . with_indices ( Array :: from ( indices. to_vec ( ) ) )
125+ . build ( )
126+ . await ?
127+ . read_all ( )
128+ . await
129+ // For equivalence.... we decompress to make sure we're not cheating too much.
130+ . and_then ( IntoCanonical :: into_canonical)
131+ . map ( Array :: from)
169132}
170133
171134pub async fn take_vortex_object_store (
172- fs : & Arc < dyn ObjectStore > ,
173- path : & object_store:: path:: Path ,
135+ fs : Arc < dyn ObjectStore > ,
136+ path : object_store:: path:: Path ,
174137 indices : & [ u64 ] ,
175138) -> VortexResult < Array > {
176- let head = fs. head ( path) . await ?;
177- let indices_array = indices. to_vec ( ) . into_array ( ) ;
178- let taken = read_vortex_footer_format ( fs. vortex_reader ( path) , head. size as u64 )
179- . await ?
180- . take_rows ( & indices_array)
181- . await ?;
182- // For equivalence.... we flatten to make sure we're not cheating too much.
183- Ok ( taken. into_canonical ( ) ?. into ( ) )
139+ take_vortex ( ObjectStoreReadAt :: new ( fs. clone ( ) , path) , indices) . await
184140}
185141
186142pub async fn take_vortex_tokio ( path : & Path , indices : & [ u64 ] ) -> VortexResult < Array > {
187- let len = File :: open ( path) ?. metadata ( ) ?. len ( ) ;
188- let indices_array = indices. to_vec ( ) . into_array ( ) ;
189- let taken = read_vortex_footer_format ( tokio:: fs:: File :: open ( path) . await ?, len)
190- . await ?
191- . take_rows ( & indices_array)
192- . await ?;
193- // For equivalence.... we flatten to make sure we're not cheating too much.
194- Ok ( taken. into_canonical ( ) ?. into ( ) )
143+ take_vortex ( tokio:: fs:: File :: open ( path) . await ?, indices) . await
195144}
196145
197146pub async fn take_parquet_object_store (
0 commit comments