Skip to content

Commit 89fe31f

Browse files
committed
Support input/output to/from polars via python (not native rust)
Signed-off-by: Tim Paine <[email protected]>
1 parent e9b6cfa commit 89fe31f

File tree

5 files changed

+396
-1
lines changed

5 files changed

+396
-1
lines changed
Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
# ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
2+
# ┃ ██████ ██████ ██████ █ █ █ █ █ █▄ ▀███ █ ┃
3+
# ┃ ▄▄▄▄▄█ █▄▄▄▄▄ ▄▄▄▄▄█ ▀▀▀▀▀█▀▀▀▀▀ █ ▀▀▀▀▀█ ████████▌▐███ ███▄ ▀█ █ ▀▀▀▀▀ ┃
4+
# ┃ █▀▀▀▀▀ █▀▀▀▀▀ █▀██▀▀ ▄▄▄▄▄ █ ▄▄▄▄▄█ ▄▄▄▄▄█ ████████▌▐███ █████▄ █ ▄▄▄▄▄ ┃
5+
# ┃ █ ██████ █ ▀█▄ █ ██████ █ ███▌▐███ ███████▄ █ ┃
6+
# ┣━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫
7+
# ┃ Copyright (c) 2017, the Perspective Authors. ┃
8+
# ┃ ╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌ ┃
9+
# ┃ This file is part of the Perspective library, distributed under the terms ┃
10+
# ┃ of the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0). ┃
11+
# ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛
12+
13+
from datetime import date, datetime
14+
import numpy as np
15+
import polars as pl
16+
from pytest import mark
17+
import perspective as psp
18+
19+
client = psp.Server().new_local_client()
20+
Table = client.table
21+
22+
23+
def arrow_bytes_to_polars(view):
24+
import pyarrow
25+
26+
with pyarrow.ipc.open_stream(pyarrow.BufferReader(view.to_arrow())) as reader:
27+
return pl.from_dataframe(reader.read_pandas())
28+
29+
30+
class TestTablePolars(object):
31+
def test_empty_table(self):
32+
tbl = Table([])
33+
assert tbl.size() == 0
34+
assert tbl.schema() == {}
35+
36+
def test_table_dataframe(self):
37+
d = [{"a": 1, "b": 2}, {"a": 3, "b": 4}]
38+
data = pl.DataFrame(d)
39+
tbl = Table(data)
40+
assert tbl.size() == 2
41+
assert tbl.schema() == {"a": "integer", "b": "integer"}
42+
assert tbl.view().to_records() == [
43+
{"a": 1, "b": 2},
44+
{"a": 3, "b": 4},
45+
]
46+
47+
def test_table_dataframe_column_order(self):
48+
d = [{"a": 1, "b": 2, "c": 3, "d": 4}, {"a": 3, "b": 4, "c": 5, "d": 6}]
49+
data = pl.DataFrame(d).select(["b", "c", "a", "d"])
50+
tbl = Table(data)
51+
assert tbl.size() == 2
52+
assert tbl.columns() == ["b", "c", "a", "d"]
53+
54+
def test_table_dataframe_selective_column_order(self):
55+
d = [{"a": 1, "b": 2, "c": 3, "d": 4}, {"a": 3, "b": 4, "c": 5, "d": 6}]
56+
data = pl.DataFrame(d).select(["b", "c", "a"])
57+
tbl = Table(data)
58+
assert tbl.size() == 2
59+
assert tbl.columns() == ["b", "c", "a"]
60+
61+
@mark.skip(reason="Not supported, polars converts to fixed_size_binary")
62+
def test_table_dataframe_does_not_mutate(self):
63+
# make sure we don't mutate the dataframe that a user passes in
64+
data = pl.DataFrame(
65+
{
66+
"a": np.array([None, 1, None, 2], dtype=object),
67+
"b": np.array([1.5, None, 2.5, None], dtype=object),
68+
}
69+
)
70+
assert data["a"].to_list() == [None, 1, None, 2]
71+
assert data["b"].to_list() == [1.5, None, 2.5, None]
72+
73+
tbl = Table(data)
74+
assert tbl.size() == 4
75+
assert tbl.schema() == {"a": "integer", "b": "float"}
76+
77+
assert data["a"].to_list() == [None, 1, None, 2]
78+
assert data["b"].to_list() == [1.5, None, 2.5, None]
79+
80+
def test_table_polars_from_schema_int(self):
81+
data = [None, 1, None, 2, None, 3, 4]
82+
df = pl.DataFrame({"a": data})
83+
table = Table({"a": "integer"})
84+
table.update(df)
85+
assert table.view().to_columns()["a"] == data
86+
87+
def test_table_polars_from_schema_bool(self):
88+
data = [True, False, True, False]
89+
df = pl.DataFrame({"a": data})
90+
table = Table({"a": "boolean"})
91+
table.update(df)
92+
assert table.view().to_columns()["a"] == data
93+
94+
def test_table_polars_from_schema_float(self):
95+
data = [None, 1.5, None, 2.5, None, 3.5, 4.5]
96+
df = pl.DataFrame({"a": data})
97+
table = Table({"a": "float"})
98+
table.update(df)
99+
assert table.view().to_columns()["a"] == data
100+
101+
def test_table_polars_from_schema_float_all_nan(self):
102+
data = [np.nan, np.nan, np.nan, np.nan]
103+
df = pl.DataFrame({"a": data})
104+
table = Table({"a": "float"})
105+
table.update(df)
106+
assert table.view().to_columns()["a"] == [None, None, None, None]
107+
108+
def test_table_polars_from_schema_float_to_int(self):
109+
data = [None, 1.5, None, 2.5, None, 3.5, 4.5]
110+
df = pl.DataFrame({"a": data})
111+
table = Table({"a": "integer"})
112+
table.update(df)
113+
# truncates decimal
114+
assert table.view().to_columns()["a"] == [None, 1, None, 2, None, 3, 4]
115+
116+
def test_table_polars_from_schema_int_to_float(self):
117+
data = [None, 1, None, 2, None, 3, 4]
118+
df = pl.DataFrame({"a": data})
119+
table = Table({"a": "float"})
120+
table.update(df)
121+
assert table.view().to_columns()["a"] == [None, 1.0, None, 2.0, None, 3.0, 4.0]
122+
123+
def test_table_polars_from_schema_date(self, util):
124+
data = [date(2019, 8, 15), None, date(2019, 8, 16)]
125+
df = pl.DataFrame({"a": data})
126+
table = Table({"a": "date"})
127+
table.update(df)
128+
assert table.view().to_columns()["a"] == [
129+
util.to_timestamp(datetime(2019, 8, 15)),
130+
None,
131+
util.to_timestamp(datetime(2019, 8, 16)),
132+
]
133+
134+
def test_table_polars_from_schema_str(self):
135+
data = ["a", None, "b", None, "c"]
136+
df = pl.DataFrame({"a": data})
137+
table = Table({"a": "string"})
138+
table.update(df)
139+
assert table.view().to_columns()["a"] == data
140+
141+
def test_table_polars_none(self):
142+
data = [None, None, None]
143+
df = pl.DataFrame({"a": data})
144+
table = Table(df)
145+
assert table.view().to_columns()["a"] == data
146+
147+
def test_table_polars_symmetric_table(self):
148+
# make sure that updates are symmetric to table creation
149+
df = pl.DataFrame({"a": [1, 2, 3, 4], "b": [1.5, 2.5, 3.5, 4.5]})
150+
t1 = Table(df)
151+
t2 = Table({"a": "integer", "b": "float"})
152+
t2.update(df)
153+
assert t1.view().to_columns() == {
154+
"a": [1, 2, 3, 4],
155+
"b": [1.5, 2.5, 3.5, 4.5],
156+
}
157+
158+
def test_table_polars_symmetric_stacked_updates(self):
159+
# make sure that updates are symmetric to table creation
160+
df = pl.DataFrame({"a": [1, 2, 3, 4], "b": [1.5, 2.5, 3.5, 4.5]})
161+
162+
t1 = Table(df)
163+
t1.update(df)
164+
165+
t2 = Table({"a": "integer", "b": "float"})
166+
t2.update(df)
167+
t2.update(df)
168+
169+
assert t1.view().to_columns() == {
170+
"a": [1, 2, 3, 4, 1, 2, 3, 4],
171+
"b": [1.5, 2.5, 3.5, 4.5, 1.5, 2.5, 3.5, 4.5],
172+
}
173+
174+
@mark.skip(reason="Not supported, polars doesnt like input")
175+
def test_table_polars_transitive(self):
176+
# serialized output -> table -> serialized output
177+
records = {
178+
"a": [1, 2, 3, 4],
179+
"b": [1.5, 2.5, 3.5, 4.5],
180+
"c": [np.nan, np.nan, "abc", np.nan],
181+
"d": [None, True, None, False],
182+
"e": [
183+
float("nan"),
184+
datetime(2019, 7, 11, 12, 30),
185+
float("nan"),
186+
datetime(2019, 7, 11, 12, 30),
187+
],
188+
}
189+
190+
df = pl.DataFrame(records, strict=False)
191+
t1 = Table(df)
192+
out1 = arrow_bytes_to_polars(t1.view(columns=["a", "b", "c", "d", "e"]))
193+
t2 = Table(out1)
194+
assert t1.schema() == t2.schema()
195+
out2 = t2.view().to_columns()
196+
assert t1.view().to_columns() == out2
197+
198+
# dtype=object should have correct inferred types
199+
200+
@mark.skip(reason="Not supported, polars converts to fixed_size_binary")
201+
def test_table_polars_object_to_int(self):
202+
df = pl.DataFrame({"a": np.array([1, 2, None, 2, None, 3, 4], dtype=object)})
203+
table = Table(df)
204+
assert table.schema() == {"a": "integer"}
205+
assert table.view().to_columns()["a"] == [1, 2, None, 2, None, 3, 4]
206+
207+
@mark.skip(reason="Not supported, polars converts to fixed_size_binary")
208+
def test_table_polars_object_to_float(self):
209+
df = pl.DataFrame({"a": np.array([None, 1, None, 2, None, 3, 4], dtype=object)})
210+
table = Table(df)
211+
assert table.schema() == {"a": "integer"}
212+
assert table.view().to_columns()["a"] == [None, 1.0, None, 2.0, None, 3.0, 4.0]
213+
214+
@mark.skip(reason="Not supported, polars converts to fixed_size_binary")
215+
def test_table_polars_object_to_bool(self):
216+
df = pl.DataFrame(
217+
{"a": np.array([True, False, True, False, True, False], dtype=object)}
218+
)
219+
table = Table(df)
220+
assert table.schema() == {"a": "boolean"}
221+
assert table.view().to_columns()["a"] == [True, False, True, False, True, False]
222+
223+
224+
@mark.skip(reason="Not supported, polars converts to fixed_size_binary")
225+
def test_table_polars_object_to_datetime(self):
226+
df = pl.DataFrame(
227+
{
228+
"a": np.array(
229+
[
230+
datetime(2019, 7, 11, 1, 2, 3),
231+
datetime(2019, 7, 12, 1, 2, 3),
232+
None,
233+
],
234+
dtype=object,
235+
)
236+
}
237+
)
238+
table = Table(df)
239+
assert table.schema() == {"a": "datetime"}
240+
assert table.view().to_columns()["a"] == [
241+
datetime(2019, 7, 11, 1, 2, 3),
242+
datetime(2019, 7, 12, 1, 2, 3),
243+
None,
244+
]
245+
246+
def test_table_polars_object_to_str(self):
247+
df = pl.DataFrame({"a": np.array(["abc", "def", None, "ghi"], dtype=object)})
248+
table = Table(df)
249+
assert table.schema() == {"a": "string"}
250+
assert table.view().to_columns()["a"] == ["abc", "def", None, "ghi"]

rust/perspective-python/src/client/client_sync.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,11 @@ impl View {
350350
self.0.to_dataframe(window).py_block_on(py)
351351
}
352352

353+
#[pyo3(signature = (**window))]
354+
pub fn to_polars(&self, py: Python<'_>, window: Option<Py<PyDict>>) -> PyResult<Py<PyAny>> {
355+
self.0.to_polars(window).py_block_on(py)
356+
}
357+
353358
#[doc = crate::inherit_docs!("view/to_arrow.md")]
354359
#[pyo3(signature = (**window))]
355360
pub fn to_arrow(&self, py: Python<'_>, window: Option<Py<PyDict>>) -> PyResult<Py<PyBytes>> {

rust/perspective-python/src/client/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,6 @@
1212

1313
pub mod client_sync;
1414
mod pandas;
15+
mod polars;
1516
mod pyarrow;
1617
pub mod python;
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
// ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
2+
// ┃ ██████ ██████ ██████ █ █ █ █ █ █▄ ▀███ █ ┃
3+
// ┃ ▄▄▄▄▄█ █▄▄▄▄▄ ▄▄▄▄▄█ ▀▀▀▀▀█▀▀▀▀▀ █ ▀▀▀▀▀█ ████████▌▐███ ███▄ ▀█ █ ▀▀▀▀▀ ┃
4+
// ┃ █▀▀▀▀▀ █▀▀▀▀▀ █▀██▀▀ ▄▄▄▄▄ █ ▄▄▄▄▄█ ▄▄▄▄▄█ ████████▌▐███ █████▄ █ ▄▄▄▄▄ ┃
5+
// ┃ █ ██████ █ ▀█▄ █ ██████ █ ███▌▐███ ███████▄ █ ┃
6+
// ┣━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫
7+
// ┃ Copyright (c) 2017, the Perspective Authors. ┃
8+
// ┃ ╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌ ┃
9+
// ┃ This file is part of the Perspective library, distributed under the terms ┃
10+
// ┃ of the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0). ┃
11+
// ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛
12+
13+
use pyo3::exceptions::PyValueError;
14+
use pyo3::prelude::*;
15+
use pyo3::types::{PyAny, PyBytes, PyList};
16+
17+
use super::pyarrow;
18+
19+
fn get_polars_df_cls(py: Python<'_>) -> PyResult<Option<Bound<'_, PyAny>>> {
20+
let sys = PyModule::import_bound(py, "sys")?;
21+
if sys.getattr("modules")?.contains("polars")? {
22+
let polars = PyModule::import_bound(py, "polars")?;
23+
Ok(Some(
24+
polars.getattr("DataFrame")?.to_object(py).into_bound(py),
25+
))
26+
} else {
27+
Ok(None)
28+
}
29+
}
30+
31+
fn get_polars_lf_cls(py: Python<'_>) -> PyResult<Option<Bound<'_, PyAny>>> {
32+
let sys = PyModule::import_bound(py, "sys")?;
33+
if sys.getattr("modules")?.contains("polars")? {
34+
let polars = PyModule::import_bound(py, "polars")?;
35+
Ok(Some(
36+
polars.getattr("LazyFrame")?.to_object(py).into_bound(py),
37+
))
38+
} else {
39+
Ok(None)
40+
}
41+
}
42+
43+
pub fn is_polars_df(py: Python, df: &Bound<'_, PyAny>) -> PyResult<bool> {
44+
if let Some(df_class) = get_polars_df_cls(py)? {
45+
df.is_instance(&df_class)
46+
} else {
47+
Ok(false)
48+
}
49+
}
50+
51+
pub fn is_polars_lf(py: Python, df: &Bound<'_, PyAny>) -> PyResult<bool> {
52+
if let Some(df_class) = get_polars_lf_cls(py)? {
53+
df.is_instance(&df_class)
54+
} else {
55+
Ok(false)
56+
}
57+
}
58+
59+
// ipc_bytes = self.to_arrow()
60+
// table = pa.ipc.open_stream(ipc_bytes).read_all()
61+
// x = pd.DataFrame(table.to_pandas())
62+
// print("AAA", x)
63+
// return x
64+
65+
pub fn arrow_to_polars(py: Python<'_>, arrow: &[u8]) -> PyResult<Py<PyAny>> {
66+
let polars = PyModule::import_bound(py, "polars")?;
67+
let bytes = PyBytes::new_bound(py, arrow);
68+
Ok(polars
69+
.getattr("read_ipc_stream")?
70+
.call1((bytes,))?
71+
.call0()?
72+
.as_unbound()
73+
.clone())
74+
}
75+
76+
pub fn polars_to_arrow_bytes<'py>(
77+
py: Python<'py>,
78+
df: &Bound<'py, PyAny>,
79+
) -> PyResult<Bound<'py, PyBytes>> {
80+
let df_class = get_polars_df_cls(py)?
81+
.ok_or_else(|| PyValueError::new_err("Failed to import polars.DataFrame"))?;
82+
let lf_class = get_polars_lf_cls(py)?
83+
.ok_or_else(|| PyValueError::new_err("Failed to import polars.LazyFrame"))?;
84+
85+
if !df.is_instance(&df_class)? && !df.is_instance(&lf_class)? {
86+
return Err(PyValueError::new_err("Input is not a polars.DataFrame or polars.LazyFrame"));
87+
}
88+
89+
let is_lazyframe = df.is_instance(&lf_class)?;
90+
91+
// let kwargs = PyDict::new_bound(py);
92+
// kwargs.set_item("preserve_index", true)?;
93+
94+
let table = if is_lazyframe {
95+
df.call_method0("collect")?.call_method0("to_arrow")?
96+
} else {
97+
df.call_method0("to_arrow")?
98+
};
99+
100+
// rename from __index_level_0__ to index
101+
let old_names: Vec<String> = table.getattr("column_names")?.extract()?;
102+
let mut new_names: Vec<String> = old_names
103+
.into_iter()
104+
.map(|e| {
105+
if e == "__index_level_0__" {
106+
"index".to_string()
107+
} else {
108+
e
109+
}
110+
})
111+
.collect();
112+
113+
let names = PyList::new_bound(py, new_names.clone());
114+
let table = table.call_method1("rename_columns", (names,))?;
115+
116+
// move the index column to be the first column.
117+
if new_names[new_names.len() - 1] == "index" {
118+
new_names.rotate_right(1);
119+
let order = PyList::new_bound(py, new_names);
120+
let table = table.call_method1("select", (order,))?;
121+
pyarrow::to_arrow_bytes(py, &table)
122+
} else {
123+
pyarrow::to_arrow_bytes(py, &table)
124+
}
125+
}

0 commit comments

Comments
 (0)