Skip to content

Commit d91b738

Browse files
add a test to capture the bug
1 parent 766e2ed commit d91b738

File tree

1 file changed

+48
-0
lines changed

1 file changed

+48
-0
lines changed

python/datafusion/tests/test_context.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,54 @@ def test_dataset_filter(ctx, capfd):
343343
assert result[0].column(1) == pa.array([-3])
344344

345345

346+
def test_dataset_count(ctx):
    """Regression test: counting rows of a registered pyarrow dataset.

    `datafusion-python` issue: https://github.com/apache/datafusion-python/issues/800
    Probably related to:
    - Support RecordBatch with zero columns but non zero row count
      (https://github.com/apache/arrow-rs/issues/1536)
      * PR: https://github.com/apache/arrow-rs/pull/1552
    - https://github.com/apache/arrow-rs/issues/1783
    """
    batch = pa.RecordBatch.from_arrays(
        [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
        names=["a", "b"],
    )
    dataset = ds.dataset([batch])
    ctx.register_dataset("t", dataset)

    # The bug occurs in both the dataframe and SQL APIs.
    df = ctx.table("t")
    assert df.count() == 3

    count = ctx.sql("SELECT COUNT(*) FROM t")

    # Plan produced by count.explain(verbose=False), kept for reference —
    # the COUNT(*) scan uses an empty projection (projection=[]), which is
    # exactly the zero-column / non-zero-row-count case referenced above:
    # | logical_plan  | Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]]        |
    # |               |   TableScan: t projection=[]                                         |
    # | physical_plan | AggregateExec: mode=Final, gby=[], aggr=[count(*)]                   |
    # |               |   CoalescePartitionsExec                                             |
    # |               |     AggregateExec: mode=Partial, gby=[], aggr=[count(*)]             |
    # |               |       RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 |
    # |               |         DatasetExec: number_of_fragments=1, projection=[]            |

    count = count.collect()
    assert count[0].column(0) == pa.array([3])
392+
393+
346394
def test_pyarrow_predicate_pushdown_is_null(ctx, capfd):
347395
"""Ensure that pyarrow filter gets pushed down for `IsNull`"""
348396
# create a RecordBatch and register it as a pyarrow.dataset.Dataset

0 commit comments

Comments
 (0)