
Commit ae7470e

Add unwraps and debug prints to dataset_exec.rs
This is a temporary commit to help track down the bug.
1 parent d91b738 commit ae7470e
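
A note on the pattern: everywhere below, the change swaps error propagation for panics. `.map_err(|err| ...)?` hands a failure up the stack wrapped in a DataFusionError, while `.unwrap()` / `.expect(...)` aborts at the failing call with a backtrace, which is what you want when hunting for the exact call site that fails. A minimal sketch of the two styles (the `fallible` helper is hypothetical, not from this file):

use datafusion::error::DataFusionError;

// Hypothetical fallible call, used only to illustrate the two styles.
fn fallible() -> Result<u64, std::io::Error> {
    Ok(42)
}

// Production style: wrap the error and propagate it to the caller.
// (dataset_exec.rs appears to alias this type as `InnerDataFusionError`.)
fn production() -> Result<u64, DataFusionError> {
    fallible().map_err(|err| DataFusionError::External(Box::new(err)))
}

// Debugging style (this commit): panic at the failing call site so the
// backtrace points straight at it.
fn debugging() -> u64 {
    fallible().expect("fallible call failed")
}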

File tree: 1 file changed (+45 −19 lines)


src/dataset_exec.rs

Lines changed: 45 additions & 19 deletions
@@ -53,13 +53,25 @@ impl Iterator for PyArrowBatchesAdapter {
 
     fn next(&mut self) -> Option<Self::Item> {
         Python::with_gil(|py| {
+            println!("getting next pyarrow batch");
             let mut batches = self.batches.clone().into_bound(py);
-            Some(
-                batches
-                    .next()?
-                    .and_then(|batch| Ok(batch.extract::<PyArrowType<_>>()?.0))
-                    .map_err(|err| ArrowError::ExternalError(Box::new(err))),
-            )
+
+            let next_batch = batches.next().expect("no next batch").expect("failed to get next batch");
+
+            // NOTE: This is where the failure actually occurs.
+            // It occurs because `from_pyarrow_bound` uses the default `RecordBatchOptions` which does *not* allow a batch with no columns.
+            // See https://github.com/apache/arrow-rs/pull/1552 for more details.
+            let extracted = next_batch.extract::<PyArrowType<_>>().expect("failed to extract batch");
+            Some(Ok(extracted.0))
+
+            // Some(Ok(
+            //     batches
+            //         .next()
+            //         .unwrap()
+            //         .and_then(|batch| Ok(batch.extract::<PyArrowType<_>>().unwrap().0))
+            //         .unwrap()
+            //         // .map_err(|err| ArrowError::ExternalError(Box::new(err))),
+            // ))
         })
     }
 }
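
For context on the NOTE in the hunk above: constructing an Arrow RecordBatch with zero columns fails under the default options because the row count cannot be inferred from any column; the linked arrow-rs PR added an explicit row_count option to RecordBatchOptions for exactly this case. A sketch of the distinction (assuming a recent arrow crate; this is not part of the commit):

use std::sync::Arc;
use arrow::datatypes::Schema;
use arrow::record_batch::{RecordBatch, RecordBatchOptions};

fn main() {
    let schema = Arc::new(Schema::empty());

    // Default construction rejects a batch with no columns, since the
    // number of rows cannot be inferred.
    assert!(RecordBatch::try_new(schema.clone(), vec![]).is_err());

    // Supplying an explicit row count makes the same batch valid.
    let options = RecordBatchOptions::new().with_row_count(Some(3));
    let batch = RecordBatch::try_new_with_options(schema, vec![], &options).unwrap();
    assert_eq!(batch.num_rows(), 3);
}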
@@ -83,6 +95,7 @@ impl DatasetExec {
         projection: Option<Vec<usize>>,
         filters: &[Expr],
     ) -> Result<Self, DataFusionError> {
+        println!("initiating new DatasetExec");
         let columns: Option<Result<Vec<String>, DataFusionError>> = projection.map(|p| {
             p.iter()
                 .map(|index| {
@@ -138,7 +151,7 @@ impl DatasetExec {
             Partitioning::UnknownPartitioning(fragments.len()),
             ExecutionMode::Bounded,
         );
-
+        println!("initiating new DatasetExec: done");
         Ok(DatasetExec {
             dataset: dataset.clone().unbind(),
             schema,
@@ -184,45 +197,58 @@ impl ExecutionPlan for DatasetExec {
         partition: usize,
         context: Arc<TaskContext>,
     ) -> DFResult<SendableRecordBatchStream> {
+        println!("executing DatasetExec");
         let batch_size = context.session_config().batch_size();
         Python::with_gil(|py| {
             let dataset = self.dataset.bind(py);
             let fragments = self.fragments.bind(py);
             let fragment = fragments
                 .get_item(partition)
-                .map_err(|err| InnerDataFusionError::External(Box::new(err)))?;
+                .unwrap();
+            // .map_err(|err| InnerDataFusionError::External(Box::new(err)))?;
 
             // We need to pass the dataset schema to unify the fragment and dataset schema per PyArrow docs
             let dataset_schema = dataset
                 .getattr("schema")
-                .map_err(|err| InnerDataFusionError::External(Box::new(err)))?;
+                .unwrap();
+            // .map_err(|err| InnerDataFusionError::External(Box::new(err)))?;
+            println!("dataset_schema: {:?}", dataset_schema);
             let kwargs = PyDict::new_bound(py);
             kwargs
-                .set_item("columns", self.columns.clone())
-                .map_err(|err| InnerDataFusionError::External(Box::new(err)))?;
+                .set_item("columns", self.columns.clone()).unwrap();
+            // .map_err(|err| InnerDataFusionError::External(Box::new(err)))?;
             kwargs
                 .set_item(
                     "filter",
                     self.filter_expr.as_ref().map(|expr| expr.clone_ref(py)),
-                )
-                .map_err(|err| InnerDataFusionError::External(Box::new(err)))?;
+                ).unwrap();
+            // .map_err(|err| InnerDataFusionError::External(Box::new(err)))?;
             kwargs
                 .set_item("batch_size", batch_size)
-                .map_err(|err| InnerDataFusionError::External(Box::new(err)))?;
+                .unwrap();
+            // .map_err(|err| InnerDataFusionError::External(Box::new(err)))?;
             let scanner = fragment
                 .call_method("scanner", (dataset_schema,), Some(&kwargs))
-                .map_err(|err| InnerDataFusionError::External(Box::new(err)))?;
+                .unwrap();
+            // .map_err(|err| InnerDataFusionError::External(Box::new(err)))?;
             let schema: SchemaRef = Arc::new(
                 scanner
                     .getattr("projected_schema")
-                    .and_then(|schema| Ok(schema.extract::<PyArrowType<_>>()?.0))
-                    .map_err(|err| InnerDataFusionError::External(Box::new(err)))?,
+                    .and_then(|schema| {
+                        let pyarrow_schema = schema.extract::<PyArrowType<_>>().unwrap().0;
+                        println!("pyarrow_schema: {:?}", pyarrow_schema);
+                        Ok(pyarrow_schema)
+                    })
+                    .unwrap(),
+                // .map_err(|err| InnerDataFusionError::External(Box::new(err)))?,
             );
             let record_batches: Bound<'_, PyIterator> = scanner
                 .call_method0("to_batches")
-                .map_err(|err| InnerDataFusionError::External(Box::new(err)))?
+                .unwrap()
+            // .map_err(|err| InnerDataFusionError::External(Box::new(err)))?
                 .iter()
-                .map_err(|err| InnerDataFusionError::External(Box::new(err)))?;
+                .unwrap();
+            // .map_err(|err| InnerDataFusionError::External(Box::new(err)))?;
 
             let record_batches = PyArrowBatchesAdapter {
                 batches: record_batches.into(),
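
The panic traced by this commit fires inside the `extract::<PyArrowType<_>>()` step shown in the first hunk: the PyArrowType wrapper drives the pyarrow-to-Rust conversion (via `from_pyarrow_bound`), and per the NOTE that conversion uses the default RecordBatchOptions and so rejects a zero-column batch. A distilled sketch of that conversion step (assuming the arrow crate's `pyarrow` feature and pyo3; not from this file):

use arrow::pyarrow::PyArrowType;
use arrow::record_batch::RecordBatch;
use pyo3::prelude::*;

// Extract one pyarrow.RecordBatch from a Python object into a Rust
// RecordBatch. A batch with no columns makes this return Err, which
// the adapter above turns into a panic via expect().
fn extract_batch(py_batch: &Bound<'_, PyAny>) -> PyResult<RecordBatch> {
    Ok(py_batch.extract::<PyArrowType<RecordBatch>>()?.0)
}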
