Skip to content

Commit 2cbf122

Browse files
authored
GH-47172: [Python] Add a utility function to create Arrow table instead of pandas df (#47199)
### Rationale for this change

Resolves #47172.

### What changes are included in this PR?

Add a utility function to create an Arrow table instead of a pandas DataFrame.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

No.

* GitHub Issue: #47172

Authored-by: egolearner <[email protected]>
Signed-off-by: Rok Mihevc <[email protected]>
1 parent 5101641 commit 2cbf122

File tree

4 files changed

+34
-32
lines changed

4 files changed

+34
-32
lines changed

python/pyarrow/tests/parquet/common.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,11 +95,9 @@ def _range_integers(size, dtype):
9595
return pa.array(np.arange(size, dtype=dtype))
9696

9797

98-
def _test_dataframe(size=10000, seed=0):
99-
import pandas as pd
100-
98+
def _test_dict(size=10000, seed=0):
10199
np.random.seed(seed)
102-
df = pd.DataFrame({
100+
return {
103101
'uint8': _random_integers(size, np.uint8),
104102
'uint16': _random_integers(size, np.uint16),
105103
'uint32': _random_integers(size, np.uint32),
@@ -114,13 +112,23 @@ def _test_dataframe(size=10000, seed=0):
114112
'strings': [util.rands(10) for i in range(size)],
115113
'all_none': [None] * size,
116114
'all_none_category': [None] * size
117-
})
115+
}
116+
117+
118+
def _test_dataframe(size=10000, seed=0):
119+
import pandas as pd
120+
121+
df = pd.DataFrame(_test_dict(size, seed))
118122

119123
# TODO(PARQUET-1015)
120124
# df['all_none_category'] = df['all_none_category'].astype('category')
121125
return df
122126

123127

128+
def _test_table(size=10000, seed=0):
129+
return pa.Table.from_pydict(_test_dict(size, seed))
130+
131+
124132
def make_sample_file(table_or_df):
125133
import pyarrow.parquet as pq
126134

python/pyarrow/tests/parquet/test_basic.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
from pyarrow import fs
2929
from pyarrow.tests import util
3030
from pyarrow.tests.parquet.common import (_check_roundtrip, _roundtrip_table,
31-
_test_dataframe)
31+
_test_table)
3232

3333
try:
3434
import pyarrow.parquet as pq
@@ -76,20 +76,18 @@ def test_set_data_page_size():
7676
_check_roundtrip(t, data_page_size=target_page_size)
7777

7878

79-
@pytest.mark.pandas
79+
@pytest.mark.numpy
8080
def test_set_write_batch_size():
81-
df = _test_dataframe(100)
82-
table = pa.Table.from_pandas(df, preserve_index=False)
81+
table = _test_table(100)
8382

8483
_check_roundtrip(
8584
table, data_page_size=10, write_batch_size=1, version='2.4'
8685
)
8786

8887

89-
@pytest.mark.pandas
88+
@pytest.mark.numpy
9089
def test_set_dictionary_pagesize_limit():
91-
df = _test_dataframe(100)
92-
table = pa.Table.from_pandas(df, preserve_index=False)
90+
table = _test_table(100)
9391

9492
_check_roundtrip(table, dictionary_pagesize_limit=1,
9593
data_page_size=10, version='2.4')

python/pyarrow/tests/parquet/test_dataset.py

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
try:
3939
import pyarrow.parquet as pq
4040
from pyarrow.tests.parquet.common import (
41-
_read_table, _test_dataframe, _write_table)
41+
_read_table, _test_dataframe, _test_table, _write_table)
4242
except ImportError:
4343
pq = None
4444

@@ -742,30 +742,28 @@ def test_dataset_read_pandas(tempdir):
742742
tm.assert_frame_equal(result.reindex(columns=expected.columns), expected)
743743

744744

745-
@pytest.mark.pandas
745+
@pytest.mark.numpy
746746
def test_dataset_memory_map(tempdir):
747747
# ARROW-2627: Check that we can use ParquetDataset with memory-mapping
748748
dirpath = tempdir / guid()
749749
dirpath.mkdir()
750750

751-
df = _test_dataframe(10, seed=0)
751+
table = _test_table(10, seed=0)
752752
path = dirpath / '0.parquet'
753-
table = pa.Table.from_pandas(df)
754753
_write_table(table, path, version='2.6')
755754

756755
dataset = pq.ParquetDataset(
757756
dirpath, memory_map=True)
758757
assert dataset.read().equals(table)
759758

760759

761-
@pytest.mark.pandas
760+
@pytest.mark.numpy
762761
def test_dataset_enable_buffered_stream(tempdir):
763762
dirpath = tempdir / guid()
764763
dirpath.mkdir()
765764

766-
df = _test_dataframe(10, seed=0)
765+
table = _test_table(10, seed=0)
767766
path = dirpath / '0.parquet'
768-
table = pa.Table.from_pandas(df)
769767
_write_table(table, path, version='2.6')
770768

771769
with pytest.raises(ValueError):
@@ -778,14 +776,13 @@ def test_dataset_enable_buffered_stream(tempdir):
778776
assert dataset.read().equals(table)
779777

780778

781-
@pytest.mark.pandas
779+
@pytest.mark.numpy
782780
def test_dataset_enable_pre_buffer(tempdir):
783781
dirpath = tempdir / guid()
784782
dirpath.mkdir()
785783

786-
df = _test_dataframe(10, seed=0)
784+
table = _test_table(10, seed=0)
787785
path = dirpath / '0.parquet'
788-
table = pa.Table.from_pandas(df)
789786
_write_table(table, path, version='2.6')
790787

791788
for pre_buffer in (True, False):
@@ -800,10 +797,10 @@ def _make_example_multifile_dataset(base_path, nfiles=10, file_nrows=5):
800797
test_data = []
801798
paths = []
802799
for i in range(nfiles):
803-
df = _test_dataframe(file_nrows, seed=i)
800+
table = _test_table(file_nrows, seed=i)
804801
path = base_path / f'{i}.parquet'
805802

806-
test_data.append(_write_table(df, path))
803+
test_data.append(_write_table(table, path))
807804
paths.append(path)
808805
return paths
809806

@@ -813,7 +810,7 @@ def _assert_dataset_paths(dataset, paths):
813810
assert set(paths) == set(dataset.files)
814811

815812

816-
@pytest.mark.pandas
813+
@pytest.mark.numpy
817814
@pytest.mark.parametrize('dir_prefix', ['_', '.'])
818815
def test_ignore_private_directories(tempdir, dir_prefix):
819816
dirpath = tempdir / guid()
@@ -830,7 +827,7 @@ def test_ignore_private_directories(tempdir, dir_prefix):
830827
_assert_dataset_paths(dataset, paths)
831828

832829

833-
@pytest.mark.pandas
830+
@pytest.mark.numpy
834831
def test_ignore_hidden_files_dot(tempdir):
835832
dirpath = tempdir / guid()
836833
dirpath.mkdir()
@@ -849,7 +846,7 @@ def test_ignore_hidden_files_dot(tempdir):
849846
_assert_dataset_paths(dataset, paths)
850847

851848

852-
@pytest.mark.pandas
849+
@pytest.mark.numpy
853850
def test_ignore_hidden_files_underscore(tempdir):
854851
dirpath = tempdir / guid()
855852
dirpath.mkdir()
@@ -868,7 +865,7 @@ def test_ignore_hidden_files_underscore(tempdir):
868865
_assert_dataset_paths(dataset, paths)
869866

870867

871-
@pytest.mark.pandas
868+
@pytest.mark.numpy
872869
@pytest.mark.parametrize('dir_prefix', ['_', '.'])
873870
def test_ignore_no_private_directories_in_base_path(tempdir, dir_prefix):
874871
# ARROW-8427 - don't ignore explicitly listed files if parent directory

python/pyarrow/tests/parquet/test_parquet_writer.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
try:
2424
import pyarrow.parquet as pq
2525
from pyarrow.tests.parquet.common import (_read_table, _test_dataframe,
26-
_range_integers)
26+
_test_table, _range_integers)
2727
except ImportError:
2828
pq = None
2929

@@ -314,10 +314,9 @@ def test_parquet_writer_filesystem_s3fs(s3_example_s3fs):
314314
tm.assert_frame_equal(result, df)
315315

316316

317-
@pytest.mark.pandas
317+
@pytest.mark.numpy
318318
def test_parquet_writer_filesystem_buffer_raises():
319-
df = _test_dataframe(100)
320-
table = pa.Table.from_pandas(df, preserve_index=False)
319+
table = _test_table(100)
321320
filesystem = fs.LocalFileSystem()
322321

323322
# Should raise ValueError when filesystem is passed with file-like object

0 commit comments

Comments (0)