Skip to content

Commit 2cbf122

Browse files
authored
GH-47172: [Python] Add a utility function to create Arrow table instead of pandas df (#47199)
### Rationale for this change

Resolves #47172.

### What changes are included in this PR?

Add a utility function to create an Arrow table instead of a pandas DataFrame.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

No.

* GitHub Issue: #47172

Authored-by: egolearner <[email protected]>
Signed-off-by: Rok Mihevc <[email protected]>
1 parent 5101641 commit 2cbf122

File tree

4 files changed

+34
-32
lines changed

4 files changed

+34
-32
lines changed

python/pyarrow/tests/parquet/common.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,11 +95,9 @@ def _range_integers(size, dtype):
9595
return pa.array(np.arange(size, dtype=dtype))
9696

9797

98-
def _test_dataframe(size=10000, seed=0):
99-
import pandas as pd
100-
98+
def _test_dict(size=10000, seed=0):
10199
np.random.seed(seed)
102-
df = pd.DataFrame({
100+
return {
103101
'uint8': _random_integers(size, np.uint8),
104102
'uint16': _random_integers(size, np.uint16),
105103
'uint32': _random_integers(size, np.uint32),
@@ -114,13 +112,23 @@ def _test_dataframe(size=10000, seed=0):
114112
'strings': [util.rands(10) for i in range(size)],
115113
'all_none': [None] * size,
116114
'all_none_category': [None] * size
117-
})
115+
}
116+
117+
118+
def _test_dataframe(size=10000, seed=0):
119+
import pandas as pd
120+
121+
df = pd.DataFrame(_test_dict(size, seed))
118122

119123
# TODO(PARQUET-1015)
120124
# df['all_none_category'] = df['all_none_category'].astype('category')
121125
return df
122126

123127

128+
def _test_table(size=10000, seed=0):
129+
return pa.Table.from_pydict(_test_dict(size, seed))
130+
131+
124132
def make_sample_file(table_or_df):
125133
import pyarrow.parquet as pq
126134

python/pyarrow/tests/parquet/test_basic.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
from pyarrow import fs
2929
from pyarrow.tests import util
3030
from pyarrow.tests.parquet.common import (_check_roundtrip, _roundtrip_table,
31-
_test_dataframe)
31+
_test_table)
3232

3333
try:
3434
import pyarrow.parquet as pq
@@ -76,20 +76,18 @@ def test_set_data_page_size():
7676
_check_roundtrip(t, data_page_size=target_page_size)
7777

7878

79-
@pytest.mark.pandas
79+
@pytest.mark.numpy
8080
def test_set_write_batch_size():
81-
df = _test_dataframe(100)
82-
table = pa.Table.from_pandas(df, preserve_index=False)
81+
table = _test_table(100)
8382

8483
_check_roundtrip(
8584
table, data_page_size=10, write_batch_size=1, version='2.4'
8685
)
8786

8887

89-
@pytest.mark.pandas
88+
@pytest.mark.numpy
9089
def test_set_dictionary_pagesize_limit():
91-
df = _test_dataframe(100)
92-
table = pa.Table.from_pandas(df, preserve_index=False)
90+
table = _test_table(100)
9391

9492
_check_roundtrip(table, dictionary_pagesize_limit=1,
9593
data_page_size=10, version='2.4')

python/pyarrow/tests/parquet/test_dataset.py

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
try:
3939
import pyarrow.parquet as pq
4040
from pyarrow.tests.parquet.common import (
41-
_read_table, _test_dataframe, _write_table)
41+
_read_table, _test_dataframe, _test_table, _write_table)
4242
except ImportError:
4343
pq = None
4444

@@ -742,30 +742,28 @@ def test_dataset_read_pandas(tempdir):
742742
tm.assert_frame_equal(result.reindex(columns=expected.columns), expected)
743743

744744

745-
@pytest.mark.pandas
745+
@pytest.mark.numpy
746746
def test_dataset_memory_map(tempdir):
747747
# ARROW-2627: Check that we can use ParquetDataset with memory-mapping
748748
dirpath = tempdir / guid()
749749
dirpath.mkdir()
750750

751-
df = _test_dataframe(10, seed=0)
751+
table = _test_table(10, seed=0)
752752
path = dirpath / '0.parquet'
753-
table = pa.Table.from_pandas(df)
754753
_write_table(table, path, version='2.6')
755754

756755
dataset = pq.ParquetDataset(
757756
dirpath, memory_map=True)
758757
assert dataset.read().equals(table)
759758

760759

761-
@pytest.mark.pandas
760+
@pytest.mark.numpy
762761
def test_dataset_enable_buffered_stream(tempdir):
763762
dirpath = tempdir / guid()
764763
dirpath.mkdir()
765764

766-
df = _test_dataframe(10, seed=0)
765+
table = _test_table(10, seed=0)
767766
path = dirpath / '0.parquet'
768-
table = pa.Table.from_pandas(df)
769767
_write_table(table, path, version='2.6')
770768

771769
with pytest.raises(ValueError):
@@ -778,14 +776,13 @@ def test_dataset_enable_buffered_stream(tempdir):
778776
assert dataset.read().equals(table)
779777

780778

781-
@pytest.mark.pandas
779+
@pytest.mark.numpy
782780
def test_dataset_enable_pre_buffer(tempdir):
783781
dirpath = tempdir / guid()
784782
dirpath.mkdir()
785783

786-
df = _test_dataframe(10, seed=0)
784+
table = _test_table(10, seed=0)
787785
path = dirpath / '0.parquet'
788-
table = pa.Table.from_pandas(df)
789786
_write_table(table, path, version='2.6')
790787

791788
for pre_buffer in (True, False):
@@ -800,10 +797,10 @@ def _make_example_multifile_dataset(base_path, nfiles=10, file_nrows=5):
800797
test_data = []
801798
paths = []
802799
for i in range(nfiles):
803-
df = _test_dataframe(file_nrows, seed=i)
800+
table = _test_table(file_nrows, seed=i)
804801
path = base_path / f'{i}.parquet'
805802

806-
test_data.append(_write_table(df, path))
803+
test_data.append(_write_table(table, path))
807804
paths.append(path)
808805
return paths
809806

@@ -813,7 +810,7 @@ def _assert_dataset_paths(dataset, paths):
813810
assert set(paths) == set(dataset.files)
814811

815812

816-
@pytest.mark.pandas
813+
@pytest.mark.numpy
817814
@pytest.mark.parametrize('dir_prefix', ['_', '.'])
818815
def test_ignore_private_directories(tempdir, dir_prefix):
819816
dirpath = tempdir / guid()
@@ -830,7 +827,7 @@ def test_ignore_private_directories(tempdir, dir_prefix):
830827
_assert_dataset_paths(dataset, paths)
831828

832829

833-
@pytest.mark.pandas
830+
@pytest.mark.numpy
834831
def test_ignore_hidden_files_dot(tempdir):
835832
dirpath = tempdir / guid()
836833
dirpath.mkdir()
@@ -849,7 +846,7 @@ def test_ignore_hidden_files_dot(tempdir):
849846
_assert_dataset_paths(dataset, paths)
850847

851848

852-
@pytest.mark.pandas
849+
@pytest.mark.numpy
853850
def test_ignore_hidden_files_underscore(tempdir):
854851
dirpath = tempdir / guid()
855852
dirpath.mkdir()
@@ -868,7 +865,7 @@ def test_ignore_hidden_files_underscore(tempdir):
868865
_assert_dataset_paths(dataset, paths)
869866

870867

871-
@pytest.mark.pandas
868+
@pytest.mark.numpy
872869
@pytest.mark.parametrize('dir_prefix', ['_', '.'])
873870
def test_ignore_no_private_directories_in_base_path(tempdir, dir_prefix):
874871
# ARROW-8427 - don't ignore explicitly listed files if parent directory

python/pyarrow/tests/parquet/test_parquet_writer.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
try:
2424
import pyarrow.parquet as pq
2525
from pyarrow.tests.parquet.common import (_read_table, _test_dataframe,
26-
_range_integers)
26+
_test_table, _range_integers)
2727
except ImportError:
2828
pq = None
2929

@@ -314,10 +314,9 @@ def test_parquet_writer_filesystem_s3fs(s3_example_s3fs):
314314
tm.assert_frame_equal(result, df)
315315

316316

317-
@pytest.mark.pandas
317+
@pytest.mark.numpy
318318
def test_parquet_writer_filesystem_buffer_raises():
319-
df = _test_dataframe(100)
320-
table = pa.Table.from_pandas(df, preserve_index=False)
319+
table = _test_table(100)
321320
filesystem = fs.LocalFileSystem()
322321

323322
# Should raise ValueError when filesystem is passed with file-like object

0 commit comments

Comments (0)