Skip to content

Commit 1737973

Browse files
committed
refactor: Update PyDataFrame methods to consistently use display_config for DataFrame creation
1 parent cbc4759 commit 1737973

File tree

1 file changed

+25
-29
lines changed

1 file changed

+25
-29
lines changed

src/dataframe.rs

Lines changed: 25 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -127,10 +127,8 @@ impl PyDataFrame {
127127
self.display_config.max_table_rows_in_repr,
128128
self.display_config.max_table_bytes,
129129
),
130-
let (batches, has_more) = wait_for_future(
131-
py,
132-
self.display_config.min_table_rows, self.display_config.max_table_rows_in_repr, self.display_config.max_table_bytes),
133130
)?;
131+
134132
if batches.is_empty() {
135133
// This should not be reached, but do it for safety since we index into the vector below
136134
return Ok("No data to display".to_string());
@@ -281,7 +279,7 @@ impl PyDataFrame {
281279
fn describe(&self, py: Python) -> PyDataFusionResult<Self> {
282280
let df = self.df.as_ref().clone();
283281
let stat_df = wait_for_future(py, df.describe())?;
284-
Ok(Self::new(stat_df))
282+
Ok(Self::new(stat_df, self.display_config.as_ref().clone()))
285283
}
286284

287285
/// Returns the schema from the logical plan
@@ -311,31 +309,31 @@ impl PyDataFrame {
311309
fn select_columns(&self, args: Vec<PyBackedStr>) -> PyDataFusionResult<Self> {
312310
let args = args.iter().map(|s| s.as_ref()).collect::<Vec<&str>>();
313311
let df = self.df.as_ref().clone().select_columns(&args)?;
314-
Ok(Self::new(df))
312+
Ok(Self::new(df, self.display_config.as_ref().clone()))
315313
}
316314

317315
#[pyo3(signature = (*args))]
318316
fn select(&self, args: Vec<PyExpr>) -> PyDataFusionResult<Self> {
319317
let expr = args.into_iter().map(|e| e.into()).collect();
320318
let df = self.df.as_ref().clone().select(expr)?;
321-
Ok(Self::new(df))
319+
Ok(Self::new(df, self.display_config.as_ref().clone()))
322320
}
323321

324322
#[pyo3(signature = (*args))]
325323
fn drop(&self, args: Vec<PyBackedStr>) -> PyDataFusionResult<Self> {
326324
let cols = args.iter().map(|s| s.as_ref()).collect::<Vec<&str>>();
327325
let df = self.df.as_ref().clone().drop_columns(&cols)?;
328-
Ok(Self::new(df))
326+
Ok(Self::new(df, self.display_config.as_ref().clone()))
329327
}
330328

331329
fn filter(&self, predicate: PyExpr) -> PyDataFusionResult<Self> {
332330
let df = self.df.as_ref().clone().filter(predicate.into())?;
333-
Ok(Self::new(df))
331+
Ok(Self::new(df, self.display_config.as_ref().clone()))
334332
}
335333

336334
fn with_column(&self, name: &str, expr: PyExpr) -> PyDataFusionResult<Self> {
337335
let df = self.df.as_ref().clone().with_column(name, expr.into())?;
338-
Ok(Self::new(df))
336+
Ok(Self::new(df, self.display_config.as_ref().clone()))
339337
}
340338

341339
fn with_columns(&self, exprs: Vec<PyExpr>) -> PyDataFusionResult<Self> {
@@ -345,7 +343,7 @@ impl PyDataFrame {
345343
let name = format!("{}", expr.schema_name());
346344
df = df.with_column(name.as_str(), expr)?
347345
}
348-
Ok(Self::new(df))
346+
Ok(Self::new(df, self.display_config.as_ref().clone()))
349347
}
350348

351349
/// Rename one column by applying a new projection. This is a no-op if the column to be
@@ -356,27 +354,27 @@ impl PyDataFrame {
356354
.as_ref()
357355
.clone()
358356
.with_column_renamed(old_name, new_name)?;
359-
Ok(Self::new(df))
357+
Ok(Self::new(df, self.display_config.as_ref().clone()))
360358
}
361359

362360
fn aggregate(&self, group_by: Vec<PyExpr>, aggs: Vec<PyExpr>) -> PyDataFusionResult<Self> {
363361
let group_by = group_by.into_iter().map(|e| e.into()).collect();
364362
let aggs = aggs.into_iter().map(|e| e.into()).collect();
365363
let df = self.df.as_ref().clone().aggregate(group_by, aggs)?;
366-
Ok(Self::new(df))
364+
Ok(Self::new(df, self.display_config.as_ref().clone()))
367365
}
368366

369367
#[pyo3(signature = (*exprs))]
370368
fn sort(&self, exprs: Vec<PySortExpr>) -> PyDataFusionResult<Self> {
371369
let exprs = to_sort_expressions(exprs);
372370
let df = self.df.as_ref().clone().sort(exprs)?;
373-
Ok(Self::new(df))
371+
Ok(Self::new(df, self.display_config.as_ref().clone()))
374372
}
375373

376374
#[pyo3(signature = (count, offset=0))]
377375
fn limit(&self, count: usize, offset: usize) -> PyDataFusionResult<Self> {
378376
let df = self.df.as_ref().clone().limit(offset, Some(count))?;
379-
Ok(Self::new(df))
377+
Ok(Self::new(df, self.display_config.as_ref().clone()))
380378
}
381379

382380
/// Executes the plan, returning a list of `RecordBatch`es.
@@ -393,7 +391,7 @@ impl PyDataFrame {
393391
/// Cache DataFrame.
394392
fn cache(&self, py: Python) -> PyDataFusionResult<Self> {
395393
let df = wait_for_future(py, self.df.as_ref().clone().cache())?;
396-
Ok(Self::new(df))
394+
Ok(Self::new(df, self.display_config.as_ref().clone()))
397395
}
398396

399397
/// Executes this DataFrame and collects all results into a vector of vector of RecordBatch
@@ -418,7 +416,7 @@ impl PyDataFrame {
418416
/// Filter out duplicate rows
419417
fn distinct(&self) -> PyDataFusionResult<Self> {
420418
let df = self.df.as_ref().clone().distinct()?;
421-
Ok(Self::new(df))
419+
Ok(Self::new(df, self.display_config.as_ref().clone()))
422420
}
423421

424422
fn join(
@@ -452,7 +450,7 @@ impl PyDataFrame {
452450
&right_keys,
453451
None,
454452
)?;
455-
Ok(Self::new(df))
453+
Ok(Self::new(df, self.display_config.as_ref().clone()))
456454
}
457455

458456
fn join_on(
@@ -481,7 +479,7 @@ impl PyDataFrame {
481479
.as_ref()
482480
.clone()
483481
.join_on(right.df.as_ref().clone(), join_type, exprs)?;
484-
Ok(Self::new(df))
482+
Ok(Self::new(df, self.display_config.as_ref().clone()))
485483
}
486484

487485
/// Print the query plan
@@ -514,7 +512,7 @@ impl PyDataFrame {
514512
.as_ref()
515513
.clone()
516514
.repartition(Partitioning::RoundRobinBatch(num))?;
517-
Ok(Self::new(new_df))
515+
Ok(Self::new(new_df, self.display_config.as_ref().clone()))
518516
}
519517

520518
/// Repartition a `DataFrame` based on a logical partitioning scheme.
@@ -526,7 +524,7 @@ impl PyDataFrame {
526524
.as_ref()
527525
.clone()
528526
.repartition(Partitioning::Hash(expr, num))?;
529-
Ok(Self::new(new_df))
527+
Ok(Self::new(new_df, self.display_config.as_ref().clone()))
530528
}
531529

532530
/// Calculate the union of two `DataFrame`s, preserving duplicate rows. The
@@ -542,7 +540,7 @@ impl PyDataFrame {
542540
self.df.as_ref().clone().union(py_df.df.as_ref().clone())?
543541
};
544542

545-
Ok(Self::new(new_df))
543+
Ok(Self::new(new_df, self.display_config.as_ref().clone()))
546544
}
547545

548546
/// Calculate the distinct union of two `DataFrame`s. The
@@ -553,7 +551,7 @@ impl PyDataFrame {
553551
.as_ref()
554552
.clone()
555553
.union_distinct(py_df.df.as_ref().clone())?;
556-
Ok(Self::new(new_df))
554+
Ok(Self::new(new_df, self.display_config.as_ref().clone()))
557555
}
558556

559557
#[pyo3(signature = (column, preserve_nulls=true))]
@@ -566,7 +564,7 @@ impl PyDataFrame {
566564
.as_ref()
567565
.clone()
568566
.unnest_columns_with_options(&[column], unnest_options)?;
569-
Ok(Self::new(df))
567+
Ok(Self::new(df, self.display_config.as_ref().clone()))
570568
}
571569

572570
#[pyo3(signature = (columns, preserve_nulls=true))]
@@ -584,7 +582,7 @@ impl PyDataFrame {
584582
.as_ref()
585583
.clone()
586584
.unnest_columns_with_options(&cols, unnest_options)?;
587-
Ok(Self::new(df))
585+
Ok(Self::new(df, self.display_config.as_ref().clone()))
588586
}
589587

590588
/// Calculate the intersection of two `DataFrame`s. The two `DataFrame`s must have exactly the same schema
@@ -594,13 +592,13 @@ impl PyDataFrame {
594592
.as_ref()
595593
.clone()
596594
.intersect(py_df.df.as_ref().clone())?;
597-
Ok(Self::new(new_df))
595+
Ok(Self::new(new_df, self.display_config.as_ref().clone()))
598596
}
599597

600598
/// Calculate the exception of two `DataFrame`s. The two `DataFrame`s must have exactly the same schema
601599
fn except_all(&self, py_df: PyDataFrame) -> PyDataFusionResult<Self> {
602600
let new_df = self.df.as_ref().clone().except(py_df.df.as_ref().clone())?;
603-
Ok(Self::new(new_df. self.display_config))
601+
Ok(Self::new(new_df, self.display_config.as_ref().clone()))
604602
}
605603

606604
/// Write a `DataFrame` to a CSV file.
@@ -907,9 +905,7 @@ async fn collect_record_batches_to_display(
907905
let mut record_batches = Vec::default();
908906
let mut has_more = false;
909907

910-
while (size_estimate_so_far < max_bytes && rows_so_far < max_rows)
911-
|| rows_so_far < min_rows
912-
{
908+
while (size_estimate_so_far < max_bytes && rows_so_far < max_rows) || rows_so_far < min_rows {
913909
let mut rb = match stream.next().await {
914910
None => {
915911
break;

0 commit comments

Comments
 (0)