@@ -340,19 +340,15 @@ def parquet_decoder(
340340 # pyarrow.Buffer when we have a memoryview to avoid creating intermediate
341341 # Python bytes objects.
342342 if isinstance (buffer , memoryview ):
343- # pyarrow.py_buffer accepts buffer-protocol objects and is zero-copy
344- try :
345- pa_buf = pyarrow .py_buffer (buffer )
346- stream = pyarrow .BufferReader (pa_buf )
347- except Exception :
348- # fallback to MemoryViewStream if pyarrow can't handle this memoryview
349- stream = MemoryViewStream (buffer )
343+ pa_buf = pyarrow .py_buffer (buffer )
344+ stream = pyarrow .BufferReader (pa_buf )
350345 elif isinstance (buffer , bytes ):
351346 stream = pyarrow .BufferReader (buffer )
352347 else :
353348 stream = pyarrow .input_stream (buffer )
354349
355- parquet_file = parquet .ParquetFile (stream )
350+ pq_meta = parquet .read_metadata (stream )
351+ stream .seek (0 )
356352
357353 # we need to work out if we have a selection which may force us
358354 # fetching columns just for filtering
@@ -366,7 +362,7 @@ def parquet_decoder(
366362 c .value for c in get_all_nodes_of_type (processed_selection , (NodeType .IDENTIFIER ,))
367363 }
368364 selected_columns = list (
369- projection_set .union (filter_columns ).intersection (parquet_file . schema_arrow .names )
365+ projection_set .union (filter_columns ).intersection (pq_meta . schema .names )
370366 )
371367
372368 # Read all columns if none are selected, unless force_read is set
@@ -376,18 +372,18 @@ def parquet_decoder(
376372 # get the full data size of the file to see how effective projection/selection is
377373 uncompressed_size = sum (
378374 row_group .column (j ).total_uncompressed_size
379- for i in range (parquet_file . metadata .num_row_groups )
380- for row_group in [parquet_file . metadata .row_group (i )]
375+ for i in range (pq_meta .num_row_groups )
376+ for row_group in [pq_meta .row_group (i )]
381377 for j in range (row_group .num_columns )
382378 )
383379
384380 # If it's COUNT(*), we don't need to create a full dataset
385381 # We have a handler later to sum up the $COUNT(*) column
386382 if projection == [] and selection == []:
387- table = pyarrow .Table .from_arrays ([[parquet_file . metadata .num_rows ]], names = ["$COUNT(*)" ])
383+ table = pyarrow .Table .from_arrays ([[pq_meta .num_rows ]], names = ["$COUNT(*)" ])
388384 return (
389- parquet_file . metadata .num_rows ,
390- parquet_file . metadata .num_columns ,
385+ pq_meta .num_rows ,
386+ pq_meta .num_columns ,
391387 uncompressed_size ,
392388 table ,
393389 )
@@ -400,16 +396,15 @@ def parquet_decoder(
400396 filters = dnf_filter ,
401397 use_threads = use_threads ,
402398 use_pandas_metadata = False ,
403- schema = parquet_file .schema_arrow ,
404399 )
405400
406401 # Any filters we couldn't push to PyArrow to read we run here
407402 if processed_selection :
408403 table = filter_records (processed_selection , table )
409404
410405 return (
411- parquet_file . metadata .num_rows ,
412- parquet_file . metadata .num_columns ,
406+ pq_meta .num_rows ,
407+ pq_meta .num_columns ,
413408 uncompressed_size ,
414409 table ,
415410 )
0 commit comments