11import os
2+ import random
23
34import pytest
45
1112except ImportError :
1213 pq = None
1314
14- from fsspec .core import url_to_fs
1515from fsspec .parquet import (
16- _get_parquet_byte_ranges ,
1716 open_parquet_file ,
1817 open_parquet_files ,
1918)
2019
20+ pd = pytest .importorskip ("pandas" )
21+ pd_gt_3 = pd .__version__ > "3"
22+
2123# Define `engine` fixture
22- FASTPARQUET_MARK = pytest .mark .skipif (not fastparquet , reason = "fastparquet not found" )
24+ FASTPARQUET_MARK = pytest .mark .skipif (
25+ pd_gt_3 or not fastparquet , reason = "fastparquet not found"
26+ )
2327PYARROW_MARK = pytest .mark .skipif (not pq , reason = "pyarrow not found" )
2428
2529
@@ -43,7 +47,6 @@ def test_open_parquet_file(
4347 tmpdir , engine , columns , max_gap , max_block , footer_sample_size , range_index
4448):
4549 # Pandas required for this test
46- pd = pytest .importorskip ("pandas" )
4750 if columns == ["z" ] and engine == "fastparquet" :
4851 columns = ["z.a" ] # fastparquet is more specific
4952
@@ -52,9 +55,9 @@ def test_open_parquet_file(
5255 nrows = 40
5356 df = pd .DataFrame (
5457 {
55- "x" : [i * 7 % 5 for i in range (nrows )],
5658 "y" : [[0 , i ] for i in range (nrows )], # list
5759 "z" : [{"a" : i , "b" : "cat" } for i in range (nrows )], # struct
60+ "x" : [i * 7 % 5 for i in range (nrows )],
5861 },
5962 index = pd .Index ([10 * i for i in range (nrows )], name = "myindex" ),
6063 )
@@ -66,40 +69,6 @@ def test_open_parquet_file(
6669 # "Traditional read" (without `open_parquet_file`)
6770 expect = pd .read_parquet (path , columns = columns , engine = engine )
6871
69- # Use `_get_parquet_byte_ranges` to re-write a
70- # place-holder file with all bytes NOT required
71- # to read `columns` set to b"0". The purpose of
72- # this step is to make sure the read will fail
73- # if the correct bytes have not been accurately
74- # selected by `_get_parquet_byte_ranges`. If this
75- # test were reading from remote storage, we would
76- # not need this logic to capture errors.
77- fs = url_to_fs (path )[0 ]
78- data = _get_parquet_byte_ranges (
79- [path ],
80- fs ,
81- columns = columns ,
82- engine = engine ,
83- max_gap = max_gap ,
84- max_block = max_block ,
85- footer_sample_size = footer_sample_size ,
86- )[path ]
87- file_size = fs .size (path )
88- with open (path , "wb" ) as f :
89- f .write (b"0" * file_size )
90-
91- if footer_sample_size == 8 and columns is not None :
92- # We know 8 bytes is too small to include
93- # the footer metadata, so there should NOT
94- # be a key for the last 8 bytes of the file
95- bad_key = (file_size - 8 , file_size )
96- assert bad_key not in data
97-
98- for (start , stop ), byte_data in data .items ():
99- f .seek (start )
100- f .write (byte_data )
101-
102- # Read back the modified file with `open_parquet_file`
10372 with open_parquet_file (
10473 path ,
10574 columns = columns ,
@@ -151,10 +120,9 @@ def test_open_parquet_file(
151120 )
152121
153122
123+ @pytest .mark .filterwarnings ("ignore:.*Not enough data.*" )
154124@FASTPARQUET_MARK
155125def test_with_filter (tmpdir ):
156- import pandas as pd
157-
158126 df = pd .DataFrame (
159127 {
160128 "a" : [10 , 1 , 2 , 3 , 7 , 8 , 9 ],
@@ -180,10 +148,9 @@ def test_with_filter(tmpdir):
180148 pd .testing .assert_frame_equal (expect , result )
181149
182150
151+ @pytest .mark .filterwarnings ("ignore:.*Not enough data.*" )
183152@FASTPARQUET_MARK
184153def test_multiple (tmpdir ):
185- import pandas as pd
186-
187154 df = pd .DataFrame (
188155 {
189156 "a" : [10 , 1 , 2 , 3 , 7 , 8 , 9 ],
@@ -238,3 +205,19 @@ def test_multiple(tmpdir):
238205 dfs = [pd .read_parquet (f , engine = "fastparquet" , columns = ["a" ]) for f in ofs ]
239206 result = pd .concat (dfs ).reset_index (drop = True )
240207 assert expect .equals (result )
208+
209+
@pytest.mark.parametrize("n", [100, 10_000, 1_000_000])
def test_nested(n, tmpdir, engine):
    # Write a parquet file containing a flat column and a struct column,
    # then check that `open_parquet_file` can serve a read of a single
    # nested field ("nested.a") through the given engine.
    pa = pytest.importorskip("pyarrow")
    path = os.path.join(str(tmpdir), "test.parquet")

    # Fixed random values for the struct fields so the read-back can be
    # compared exactly against what was written.
    a_val = random.random()
    b_val = random.random()
    flat_col = pa.array([random.random() for _ in range(n)])
    struct_col = pa.array([{"a": a_val, "b": b_val} for _ in range(n)])
    pq.write_table(pa.table({"flat": flat_col, "nested": struct_col}), path)

    with open_parquet_file(path, columns=["nested.a"], engine=engine) as f:
        result = pd.read_parquet(f, engine=engine, columns=["nested.a"])

    # The selected sub-field's column name differs by engine: "a" for
    # pyarrow, the full dotted path "nested.a" otherwise.
    col_name = "a" if engine == "pyarrow" else "nested.a"
    assert (result[col_name] == a_val).all()
0 commit comments