Skip to content

Commit f71c9f7

Browse files
committed
feat: Python None/null support
This allows people to write methods that don't return anything, like:

```python
import requests

def log(x: int) -> None:
    requests.post("http://localhost:1234/", data={"x": x})
```
1 parent 32c230e commit f71c9f7

File tree

8 files changed

+371
-32
lines changed

8 files changed

+371
-32
lines changed

guests/python/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ Types are mapped to/from [Apache Arrow] as follows:
6363
| [`datetime`] | [`Timestamp`] w/ [`Microsecond`] and NO timezone |
6464
| [`float`] | [`Float64`] |
6565
| [`int`] | [`Int64`] |
66+
| [`None`] | [`Null`] |
6667
| [`str`] | [`Utf8`] |
6768
| [`time`] | [`Time64`] w/ [`Microsecond`] and NO timezone |
6869
| [`timedelta`]| [`Duration`] |
@@ -206,6 +207,8 @@ There is NO I/O available that escapes the sandbox. The [Python Standard Library
206207
[`functools.cache`]: https://docs.python.org/3/library/functools.html#functools.cache
207208
[`int`]: https://docs.python.org/3/library/stdtypes.html#numeric-types-int-float-complex
208209
[`Int64`]: https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html#variant.Int64
210+
[`None`]: https://docs.python.org/3/library/constants.html#None
211+
[`Null`]: https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html#variant.Null
209212
[`time`]: https://docs.python.org/3/library/datetime.html#datetime.time
210213
[`Time64`]: https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html#variant.Time64
211214
[`timedelta`]: https://docs.python.org/3/library/datetime.html#datetime.timedelta

guests/python/src/conversion.rs

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use std::{ops::ControlFlow, sync::Arc};
44
use arrow::{
55
array::{
66
Array, ArrayRef, BinaryBuilder, BooleanBuilder, Date32Builder, DurationMicrosecondBuilder,
7-
Float64Builder, Int64Builder, StringBuilder, Time64MicrosecondBuilder,
7+
Float64Builder, Int64Builder, NullBuilder, StringBuilder, Time64MicrosecondBuilder,
88
TimestampMicrosecondBuilder,
99
},
1010
datatypes::{DataType, TimeUnit},
@@ -13,8 +13,8 @@ use chrono::{DateTime, Datelike, NaiveDate, TimeZone, Timelike, Utc};
1313
use datafusion_common::{
1414
cast::{
1515
as_binary_array, as_boolean_array, as_date32_array, as_duration_microsecond_array,
16-
as_float64_array, as_int64_array, as_string_array, as_time64_microsecond_array,
17-
as_timestamp_microsecond_array,
16+
as_float64_array, as_int64_array, as_null_array, as_string_array,
17+
as_time64_microsecond_array, as_timestamp_microsecond_array,
1818
},
1919
error::Result as DataFusionResult,
2020
exec_datafusion_err, exec_err,
@@ -53,6 +53,7 @@ impl PythonType {
5353
Self::DateTime => DataType::Timestamp(TimeUnit::Microsecond, None),
5454
Self::Float => DataType::Float64,
5555
Self::Int => DataType::Int64,
56+
Self::None => DataType::Null,
5657
Self::Str => DataType::Utf8,
5758
Self::Bytes => DataType::Binary,
5859
Self::Date => DataType::Date32,
@@ -166,6 +167,16 @@ impl PythonType {
166167

167168
Ok(Box::new(it))
168169
}
170+
Self::None => {
171+
let array = as_null_array(array)?;
172+
173+
let val = PyNone::get(py)
174+
.into_bound_py_any(py)
175+
.map_err(|e| exec_datafusion_err!("cannot build Python None value: {e}"))?;
176+
let it = std::iter::repeat_n(Some(val), array.len()).map(Ok);
177+
178+
Ok(Box::new(it))
179+
}
169180
Self::Str => {
170181
let array = as_string_array(array)?;
171182

@@ -294,6 +305,7 @@ impl PythonType {
294305
Self::DateTime => Box::new(TimestampMicrosecondBuilder::with_capacity(num_rows)),
295306
Self::Float => Box::new(Float64Builder::with_capacity(num_rows)),
296307
Self::Int => Box::new(Int64Builder::with_capacity(num_rows)),
308+
Self::None => Box::new(NullBuilder::new()),
297309
Self::Str => Box::new(StringBuilder::with_capacity(num_rows, 1024)),
298310
Self::Bytes => Box::new(BinaryBuilder::with_capacity(num_rows, 1024)),
299311
Self::Date => Box::new(Date32Builder::with_capacity(num_rows)),
@@ -467,6 +479,24 @@ impl<'py> ArrayBuilder<'py> for Int64Builder {
467479
}
468480
}
469481

482+
impl<'py> ArrayBuilder<'py> for NullBuilder {
483+
fn push(&mut self, val: Bound<'py, PyAny>) -> DataFusionResult<()> {
484+
val.cast_exact::<PyNone>().map_err(|_| {
485+
exec_datafusion_err!("expected `None` but got {}", py_representation(&val))
486+
})?;
487+
self.append_empty_value();
488+
Ok(())
489+
}
490+
491+
fn skip(&mut self) {
492+
self.append_empty_value();
493+
}
494+
495+
fn finish(&mut self) -> ArrayRef {
496+
Arc::new(self.finish())
497+
}
498+
}
499+
470500
impl<'py> ArrayBuilder<'py> for StringBuilder {
471501
fn push(&mut self, val: Bound<'py, PyAny>) -> DataFusionResult<()> {
472502
let val: &str = val.extract().map_err(|_| {

guests/python/src/inspect.rs

Lines changed: 27 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
//! Inspection of Python code to extract [signature](crate::signature) information.
2-
use std::ffi::CString;
2+
use std::{collections::HashSet, ffi::CString};
33

44
use datafusion_common::{DataFusionError, error::Result as DataFusionResult};
55
use pyo3::{
@@ -35,6 +35,10 @@ impl<'a, 'py> FromPyObject<'a, 'py> for PythonType {
3535
let type_time = mod_datetime.getattr(intern!(py, "time"))?;
3636
let type_timedelta = mod_datetime.getattr(intern!(py, "timedelta"))?;
3737

38+
// https://docs.python.org/3/library/types.html
39+
let mod_types = py.import(intern!(py, "types"))?;
40+
let type_none = mod_types.getattr(intern!(py, "NoneType"))?;
41+
3842
if ob.is(type_bool) {
3943
Ok(Self::Bool)
4044
} else if ob.is(type_bytes) {
@@ -45,6 +49,8 @@ impl<'a, 'py> FromPyObject<'a, 'py> for PythonType {
4549
Ok(Self::DateTime)
4650
} else if ob.is(type_float) {
4751
Ok(Self::Float)
52+
} else if ob.is(&type_none) || ob.is_instance(&type_none).unwrap_or_default() {
53+
Ok(Self::None)
4854
} else if ob.is(type_int) {
4955
Ok(Self::Int)
5056
} else if ob.is(type_str) {
@@ -85,38 +91,34 @@ impl<'a, 'py> FromPyObject<'a, 'py> for PythonNullableType {
8591
// https://docs.python.org/3/library/types.html
8692
let mod_types = py.import(intern!(py, "types"))?;
8793
let type_union = mod_types.getattr(intern!(py, "UnionType"))?;
88-
let type_none = mod_types.getattr(intern!(py, "NoneType"))?;
8994

9095
if ob.is_instance(&type_union)? {
9196
let args = ob.getattr(intern!(py, "__args__"))?;
9297

93-
let n_args = args.len()?;
94-
if n_args != 2 {
98+
let mut args = args
99+
.try_iter()?
100+
.map(|arg| {
101+
let arg = arg?;
102+
arg.extract::<PythonType>()
103+
})
104+
.collect::<PyResult<HashSet<_>>>()?;
105+
106+
let nullable = args.len() > 1 && args.remove(&PythonType::None);
107+
108+
if args.len() != 1 {
95109
return Err(PyErr::new::<PyTypeError, _>(format!(
96-
"only unions of length 2 are supported, got {n_args}"
110+
"only unions of form `T | None` are suppored, but got a union of {} distinct none-NULL types",
111+
args.len()
97112
)));
98113
}
99-
let (arg1, arg2): (Bound<'py, PyAny>, Bound<'py, PyAny>) = args.extract()?;
100-
101-
let inner_type = if arg1.is(&type_none) {
102-
arg2
103-
} else if arg2.is(&type_none) {
104-
arg1
105-
} else {
106-
return Err(PyErr::new::<PyTypeError, _>(
107-
"only unions with None are supported",
108-
));
109-
};
110-
111-
Ok(Self {
112-
t: inner_type.extract()?,
113-
nullable: true,
114-
})
114+
let t = args.into_iter().next().expect("just checked length");
115+
116+
Ok(Self { t, nullable })
115117
} else {
116-
Ok(Self {
117-
t: ob.extract()?,
118-
nullable: false,
119-
})
118+
let t = ob.extract()?;
119+
let nullable = t == PythonType::None;
120+
121+
Ok(Self { t, nullable })
120122
}
121123
}
122124
}

guests/python/src/signature.rs

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use pyo3::{Py, PyAny};
88
/// # Naming
99
/// Since Python and Arrow use different names for the same type, we have to settle on some consistency. We chose to
1010
/// use the Python name in CamelCase style, so Python's `datetime` will become `DateTime`.
11-
#[derive(Debug)]
11+
#[derive(Debug, PartialEq, Eq, Hash)]
1212
pub(crate) enum PythonType {
1313
/// Boolean.
1414
///
@@ -82,6 +82,18 @@ pub(crate) enum PythonType {
8282
/// We map this to [`Int64`](arrow::datatypes::DataType::Int64).
8383
Int,
8484

85+
/// None/Null.
86+
///
87+
/// # Python
88+
/// The type is called `None`, documentation can be found here:
89+
///
90+
/// - <https://docs.python.org/3/library/constants.html#None>
91+
/// - <https://docs.python.org/3/library/types.html#types.NoneType>
92+
///
93+
/// # Arrow
94+
/// We map this to [`Null`](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html#variant.Null).
95+
None,
96+
8597
/// String.
8698
///
8799
/// # Python

host/tests/integration_tests/python/inspection/errors.rs

Lines changed: 78 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def add_one(x: int | str) -> int:
8080
@r"
8181
scalar_udfs
8282
caused by
83-
Error during planning: TypeError: only unions with None are supported
83+
Error during planning: TypeError: only unions of form `T | None` are suppored, but got a union of 2 distinct none-NULL types
8484
8585
The above exception was the direct cause of the following exception:
8686
@@ -94,7 +94,7 @@ def add_one(x: int | str) -> int:
9494
}
9595

9696
#[tokio::test(flavor = "multi_thread")]
97-
async fn test_union_type_3() {
97+
async fn test_union_type_2_and_none() {
9898
const CODE: &str = "
9999
def add_one(x: int | str | None) -> int:
100100
return x + 1
@@ -105,7 +105,82 @@ def add_one(x: int | str | None) -> int:
105105
@r"
106106
scalar_udfs
107107
caused by
108-
Error during planning: TypeError: only unions of length 2 are supported, got 3
108+
Error during planning: TypeError: only unions of form `T | None` are suppored, but got a union of 2 distinct none-NULL types
109+
110+
The above exception was the direct cause of the following exception:
111+
112+
TypeError: inspect parameter 1
113+
114+
The above exception was the direct cause of the following exception:
115+
116+
TypeError: inspect type of `add_one`
117+
",
118+
);
119+
}
120+
121+
#[tokio::test(flavor = "multi_thread")]
122+
async fn test_union_type_2_identical() {
123+
const CODE: &str = "
124+
def add_one(x: int | str | int) -> int:
125+
return x + 1
126+
";
127+
128+
insta::assert_snapshot!(
129+
err(CODE).await,
130+
@r"
131+
scalar_udfs
132+
caused by
133+
Error during planning: TypeError: only unions of form `T | None` are suppored, but got a union of 2 distinct none-NULL types
134+
135+
The above exception was the direct cause of the following exception:
136+
137+
TypeError: inspect parameter 1
138+
139+
The above exception was the direct cause of the following exception:
140+
141+
TypeError: inspect type of `add_one`
142+
",
143+
);
144+
}
145+
146+
#[tokio::test(flavor = "multi_thread")]
147+
async fn test_union_type_2_identical_and_none() {
148+
const CODE: &str = "
149+
def add_one(x: int | None | str | int) -> int:
150+
return x + 1
151+
";
152+
153+
insta::assert_snapshot!(
154+
err(CODE).await,
155+
@r"
156+
scalar_udfs
157+
caused by
158+
Error during planning: TypeError: only unions of form `T | None` are suppored, but got a union of 2 distinct none-NULL types
159+
160+
The above exception was the direct cause of the following exception:
161+
162+
TypeError: inspect parameter 1
163+
164+
The above exception was the direct cause of the following exception:
165+
166+
TypeError: inspect type of `add_one`
167+
",
168+
);
169+
}
170+
171+
#[tokio::test(flavor = "multi_thread")]
172+
async fn test_union_type_3() {
173+
const CODE: &str = "
174+
def add_one(x: int | str | float) -> int:
175+
return x + 1
176+
";
177+
178+
insta::assert_snapshot!(
179+
err(CODE).await,
180+
@r"
181+
scalar_udfs
182+
caused by
183+
Error during planning: TypeError: only unions of form `T | None` are suppored, but got a union of 3 distinct none-NULL types
109184
110185
The above exception was the direct cause of the following exception:
111186

host/tests/integration_tests/python/types/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ mod date;
44
mod datetime;
55
mod float;
66
mod int;
7+
mod none;
78
mod str;
89
mod time;
910
mod timedelta;
11+
mod union;

0 commit comments

Comments
 (0)