tests

andygrove · andygrove · commit ebe403a0cd6e · 2024-12-14T08:56:04.000-07:00
diff --git a/src/query_stage.rs b/src/query_stage.rs
@@ -16,7 +16,7 @@
 // under the License.
 
 use crate::context::serialize_execution_plan;
-use crate::shuffle::{ShuffleCodec, ShuffleReaderExec};
+use crate::shuffle::{ShuffleCodec, ShuffleReaderExec, ShuffleWriterExec};
 use datafusion::error::Result;
 use datafusion::physical_plan::{ExecutionPlan, ExecutionPlanProperties, Partitioning};
 use datafusion::prelude::SessionContext;
@@ -99,7 +99,15 @@ impl QueryStage {
     /// Get the input partition count. This is the same as the number of concurrent tasks
     /// when we schedule this query stage for execution
     pub fn get_input_partition_count(&self) -> usize {
-        self.plan.output_partitioning().partition_count()
+        if self.plan.as_any().is::<ShuffleWriterExec>() {
+            // Most query stages represent a shuffle write; the stage's input is the
+            // writer's child plan, so count that child's partitions.
+            self.plan.children()[0].output_partitioning().partition_count()
+        } else {
+            // Probably the final query stage: no shuffle writer wraps the plan, so use
+            // the plan's own output partitioning.
+            self.plan.output_partitioning().partition_count()
+        }
     }
 
     pub fn get_output_partition_count(&self) -> usize {
diff --git a/tests/test_context.py b/tests/test_context.py
@@ -23,9 +23,21 @@ def test_basic_query_succeed():
     df_ctx = SessionContext()
     ctx = DatafusionRayContext(df_ctx)
     df_ctx.register_csv("tips", "examples/tips.csv", has_header=True)
+    # TODO why does this return a single batch and not a list of batches?
     record_batch = ctx.sql("SELECT * FROM tips")
     assert record_batch.num_rows == 244
 
+def test_aggregate():
+    """The grouped average query must yield four rows in total across all returned batches."""
+    session = SessionContext()
+    ray_ctx = DatafusionRayContext(session)
+    session.register_csv("tips", "examples/tips.csv", has_header=True)
+    batches = ray_ctx.sql("select sex, smoker, avg(tip/total_bill) as tip_pct from tips group by sex, smoker")
+    assert isinstance(batches, list)
+    # TODO: find out why many of the returned batches are empty; until then the
+    # row count is checked on the total rather than per batch.
+    total_rows = sum(batch.num_rows for batch in batches)
+    assert total_rows == 4
 
 def test_no_result_query():
     df_ctx = SessionContext()