|
15 | 15 | // specific language governing permissions and limitations |
16 | 16 | // under the License. |
17 | 17 |
|
18 | | -use arrow::array::AsArray; |
| 18 | +use arrow::array::{AsArray, PrimitiveArray}; |
19 | 19 | use arrow::datatypes::{Float64Type, Int32Type}; |
20 | 20 | use datafusion::error::Result; |
21 | 21 | use datafusion::prelude::*; |
| 22 | +use datafusion_common::assert_batches_eq; |
22 | 23 | use futures::StreamExt; |
23 | 24 |
|
24 | | -/// This example shows that it is possible to convert query results into Rust structs . |
| 25 | +/// This example shows how to convert query results into Rust structs by using |
| 26 | +/// the Arrow APIs to convert the results into Rust native types. |
| 27 | +/// |
| 28 | +/// This is a bit tricky initially as the results are returned as columns stored |
| 29 | +/// as [ArrayRef] |
| 30 | +/// |
| 31 | +/// [ArrayRef]: arrow::array::ArrayRef |
25 | 32 | #[tokio::main] |
26 | 33 | async fn main() -> Result<()> { |
27 | | - let data_list = Data::new().await?; |
28 | | - println!("{data_list:#?}"); |
29 | | - Ok(()) |
30 | | -} |
| 34 | + // Run a query that returns two columns of data |
| 35 | + let ctx = SessionContext::new(); |
| 36 | + let testdata = datafusion::test_util::parquet_test_data(); |
| 37 | + ctx.register_parquet( |
| 38 | + "alltypes_plain", |
| 39 | + &format!("{testdata}/alltypes_plain.parquet"), |
| 40 | + ParquetReadOptions::default(), |
| 41 | + ) |
| 42 | + .await?; |
| 43 | + let df = ctx |
| 44 | + .sql("SELECT int_col, double_col FROM alltypes_plain") |
| 45 | + .await?; |
31 | 46 |
|
32 | | -#[derive(Debug)] |
33 | | -struct Data { |
34 | | - #[allow(dead_code)] |
35 | | - int_col: i32, |
36 | | - #[allow(dead_code)] |
37 | | - double_col: f64, |
38 | | -} |
| 47 | + // print out the results showing we have an int32 and a float64 column |
| 48 | + let results = df.clone().collect().await?; |
| 49 | + assert_batches_eq!( |
| 50 | + [ |
| 51 | + "+---------+------------+", |
| 52 | + "| int_col | double_col |", |
| 53 | + "+---------+------------+", |
| 54 | + "| 0 | 0.0 |", |
| 55 | + "| 1 | 10.1 |", |
| 56 | + "| 0 | 0.0 |", |
| 57 | + "| 1 | 10.1 |", |
| 58 | + "| 0 | 0.0 |", |
| 59 | + "| 1 | 10.1 |", |
| 60 | + "| 0 | 0.0 |", |
| 61 | + "| 1 | 10.1 |", |
| 62 | + "+---------+------------+", |
| 63 | + ], |
| 64 | + &results |
| 65 | + ); |
39 | 66 |
|
40 | | -impl Data { |
41 | | - pub async fn new() -> Result<Vec<Self>> { |
42 | | - // this group is almost the same as the one you find it in parquet_sql.rs |
43 | | - let ctx = SessionContext::new(); |
| 67 | + // We will now convert the query results into a Rust struct |
| 68 | + let mut stream = df.execute_stream().await?; |
| 69 | + let mut list = vec![]; |
44 | 70 |
|
45 | | - let testdata = datafusion::test_util::parquet_test_data(); |
| 71 | + // DataFusion produces data in chunks called `RecordBatch`es which are |
| 72 | + // typically 8000 rows each. This loop processes each `RecordBatch` as it is |
| 73 | + // produced by the query plan and adds it to the list |
| 74 | + while let Some(b) = stream.next().await.transpose()? { |
| 75 | + // Each `RecordBatch` has one or more columns. Each column is stored as |
| 76 | + // an `ArrayRef`. To interact with data using Rust native types we need to |
| 77 | + // convert these `ArrayRef`s into concrete array types using APIs from |
| 78 | + // the arrow crate. |
46 | 79 |
|
47 | | - ctx.register_parquet( |
48 | | - "alltypes_plain", |
49 | | - &format!("{testdata}/alltypes_plain.parquet"), |
50 | | - ParquetReadOptions::default(), |
51 | | - ) |
52 | | - .await?; |
| 80 | + // In this case, we know that each batch has two columns of the Arrow |
| 81 | + // types Int32 and Float64, so first we cast the two columns to the |
| 82 | + // appropriate Arrow PrimitiveArray (this is a fast / zero-copy cast): |
| 83 | + let int_col: &PrimitiveArray<Int32Type> = b.column(0).as_primitive(); |
| 84 | + let float_col: &PrimitiveArray<Float64Type> = b.column(1).as_primitive(); |
53 | 85 |
|
54 | | - let df = ctx |
55 | | - .sql("SELECT int_col, double_col FROM alltypes_plain") |
56 | | - .await?; |
| 86 | + // With PrimitiveArrays, we can access the values as native Rust |
| 87 | + // types i32 and f64, and form the desired `Data` structs |
| 88 | + for (i, f) in int_col.values().iter().zip(float_col.values()) { |
| 89 | + list.push(Data { |
| 90 | + int_col: *i, |
| 91 | + double_col: *f, |
| 92 | + }) |
| 93 | + } |
| 94 | + } |
57 | 95 |
|
58 | | - df.clone().show().await?; |
| 96 | + // Finally, we have the results in the list of Rust structs |
| 97 | + let res = format!("{list:#?}"); |
| 98 | + assert_eq!( |
| 99 | + res, |
| 100 | + r#"[ |
| 101 | + Data { |
| 102 | + int_col: 0, |
| 103 | + double_col: 0.0, |
| 104 | + }, |
| 105 | + Data { |
| 106 | + int_col: 1, |
| 107 | + double_col: 10.1, |
| 108 | + }, |
| 109 | + Data { |
| 110 | + int_col: 0, |
| 111 | + double_col: 0.0, |
| 112 | + }, |
| 113 | + Data { |
| 114 | + int_col: 1, |
| 115 | + double_col: 10.1, |
| 116 | + }, |
| 117 | + Data { |
| 118 | + int_col: 0, |
| 119 | + double_col: 0.0, |
| 120 | + }, |
| 121 | + Data { |
| 122 | + int_col: 1, |
| 123 | + double_col: 10.1, |
| 124 | + }, |
| 125 | + Data { |
| 126 | + int_col: 0, |
| 127 | + double_col: 0.0, |
| 128 | + }, |
| 129 | + Data { |
| 130 | + int_col: 1, |
| 131 | + double_col: 10.1, |
| 132 | + }, |
| 133 | +]"# |
| 134 | + ); |
59 | 135 |
|
60 | | - let mut stream = df.execute_stream().await?; |
61 | | - let mut list = vec![]; |
62 | | - while let Some(b) = stream.next().await.transpose()? { |
63 | | - let int_col = b.column(0).as_primitive::<Int32Type>(); |
64 | | - let float_col = b.column(1).as_primitive::<Float64Type>(); |
| 136 | + // Use the fields in the struct to avoid clippy complaints |
| 137 | + let int_sum = list.iter().fold(0, |acc, x| acc + x.int_col); |
| 138 | + let double_sum = list.iter().fold(0.0, |acc, x| acc + x.double_col); |
| 139 | + assert_eq!(int_sum, 4); |
| 140 | + assert_eq!(double_sum, 40.4); |
65 | 141 |
|
66 | | - for (i, f) in int_col.values().iter().zip(float_col.values()) { |
67 | | - list.push(Data { |
68 | | - int_col: *i, |
69 | | - double_col: *f, |
70 | | - }) |
71 | | - } |
72 | | - } |
| 142 | + Ok(()) |
| 143 | +} |
73 | 144 |
|
74 | | - Ok(list) |
75 | | - } |
| 145 | +/// This is the target struct where we want the query results. |
| 146 | +#[derive(Debug)] |
| 147 | +struct Data { |
| 148 | + int_col: i32, |
| 149 | + double_col: f64, |
76 | 150 | } |
0 commit comments