Skip to content

Commit 7fa2c7c

Browse files
committed
Amend PyDataFrame to use display_config instead of constants
1 parent 065fa40 commit 7fa2c7c

File tree

1 file changed

+26
-13
lines changed

1 file changed

+26
-13
lines changed

src/dataframe.rs

Lines changed: 26 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -73,9 +73,6 @@ impl PyTableProvider {
7373
PyTable::new(table_provider)
7474
}
7575
}
76-
const MAX_TABLE_BYTES_TO_DISPLAY: usize = 2 * 1024 * 1024; // 2 MB
77-
const MIN_TABLE_ROWS_TO_DISPLAY: usize = 20;
78-
const MAX_LENGTH_CELL_WITHOUT_MINIMIZE: usize = 25;
7976

8077
/// A PyDataFrame is a representation of a logical plan and an API to compose statements.
8178
/// Use it to build a plan and `.collect()` to execute the plan and collect the result.
@@ -84,15 +81,15 @@ const MAX_LENGTH_CELL_WITHOUT_MINIMIZE: usize = 25;
8481
#[derive(Clone)]
8582
pub struct PyDataFrame {
8683
df: Arc<DataFrame>,
87-
display_config: PyDataframeDisplayConfig,
84+
display_config: Arc<PyDataframeDisplayConfig>,
8885
}
8986

9087
impl PyDataFrame {
9188
/// creates a new PyDataFrame
9289
pub fn new(df: DataFrame, display_config: PyDataframeDisplayConfig) -> Self {
9390
Self {
9491
df: Arc::new(df),
95-
display_config,
92+
display_config: Arc::new(display_config),
9693
}
9794
}
9895
}
@@ -121,9 +118,23 @@ impl PyDataFrame {
121118
}
122119

123120
fn __repr__(&self, py: Python) -> PyDataFusionResult<String> {
121+
// Get display configuration values
122+
let min_rows = self.display_config.min_table_rows;
123+
let max_rows = self.display_config.max_table_rows_in_repr;
124+
let max_bytes = self.display_config.max_table_bytes;
125+
126+
// Collect record batches for display
127+
let (batches, has_more) = wait_for_future(
128+
py,
129+
collect_record_batches_to_display(
130+
self.df.as_ref().clone(),
131+
self.display_config.min_table_rows,
132+
self.display_config.max_table_rows_in_repr,
133+
self.display_config.max_table_bytes,
134+
),
124135
let (batches, has_more) = wait_for_future(
125136
py,
126-
collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10),
137+
self.display_config.min_table_rows, self.display_config.max_table_rows_in_repr, self.display_config.max_table_bytes),
127138
)?;
128139
if batches.is_empty() {
129140
// This should not be reached, but do it for safety since we index into the vector below
@@ -146,8 +157,9 @@ impl PyDataFrame {
146157
py,
147158
collect_record_batches_to_display(
148159
self.df.as_ref().clone(),
149-
MIN_TABLE_ROWS_TO_DISPLAY,
150-
usize::MAX,
160+
self.display_config.min_table_rows,
161+
self.display_config.max_table_rows_in_repr,
162+
self.display_config.max_table_bytes,
151163
),
152164
)?;
153165
if batches.is_empty() {
@@ -223,8 +235,8 @@ impl PyDataFrame {
223235
for (col, formatter) in batch_formatter.iter().enumerate() {
224236
let cell_data = formatter.value(batch_row).to_string();
225237
// From testing, primitive data types do not typically get larger than 21 characters
226-
if cell_data.len() > MAX_LENGTH_CELL_WITHOUT_MINIMIZE {
227-
let short_cell_data = &cell_data[0..MAX_LENGTH_CELL_WITHOUT_MINIMIZE];
238+
if cell_data.len() > self.display_config.max_cell_length {
239+
let short_cell_data = &cell_data[0..self.display_config.max_cell_length];
228240
cells.push(format!("
229241
<td style='border: 1px solid black; padding: 8px; text-align: left; white-space: nowrap;'>
230242
<div class=\"expandable-container\">
@@ -891,6 +903,7 @@ async fn collect_record_batches_to_display(
891903
df: DataFrame,
892904
min_rows: usize,
893905
max_rows: usize,
906+
max_bytes: usize,
894907
) -> Result<(Vec<RecordBatch>, bool), DataFusionError> {
895908
let partitioned_stream = df.execute_stream_partitioned().await?;
896909
let mut stream = futures::stream::iter(partitioned_stream).flatten();
@@ -899,7 +912,7 @@ async fn collect_record_batches_to_display(
899912
let mut record_batches = Vec::default();
900913
let mut has_more = false;
901914

902-
while (size_estimate_so_far < MAX_TABLE_BYTES_TO_DISPLAY && rows_so_far < max_rows)
915+
while (size_estimate_so_far < max_bytes && rows_so_far < max_rows)
903916
|| rows_so_far < min_rows
904917
{
905918
let mut rb = match stream.next().await {
@@ -914,8 +927,8 @@ async fn collect_record_batches_to_display(
914927
if rows_in_rb > 0 {
915928
size_estimate_so_far += rb.get_array_memory_size();
916929

917-
if size_estimate_so_far > MAX_TABLE_BYTES_TO_DISPLAY {
918-
let ratio = MAX_TABLE_BYTES_TO_DISPLAY as f32 / size_estimate_so_far as f32;
930+
if size_estimate_so_far > max_bytes {
931+
let ratio = max_bytes as f32 / size_estimate_so_far as f32;
919932
let total_rows = rows_in_rb + rows_so_far;
920933

921934
let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize;

0 commit comments

Comments (0)