Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions daft/expressions/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1960,6 +1960,16 @@ def rstrip(self) -> Expression:

return rstrip(self)

def trim(self) -> Expression:
    """Remove leading and trailing whitespace from a UTF-8 string expression.

    This is the method form of `daft.functions.trim`; it forwards `self` to that
    function and returns whatever it returns.

    Tip: See Also
        [`daft.functions.trim`](https://docs.daft.ai/en/stable/api/functions/trim/)
    """
    # Imported at call time rather than at module import time, as the original does
    # (presumably to avoid a circular import with `daft.functions` -- confirm).
    from daft.functions import trim as _trim_fn

    return _trim_fn(self)

def reverse(self) -> Expression:
"""Reverse a UTF-8 string.

Expand Down
4 changes: 4 additions & 0 deletions daft/functions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,8 @@
upper,
lstrip,
rstrip,
strip,
trim,
reverse,
capitalize,
to_camel_case,
Expand Down Expand Up @@ -424,6 +426,7 @@
"stddev",
"strftime",
"string_agg",
"strip",
"substr",
"sum",
"tan",
Expand All @@ -450,6 +453,7 @@
"total_minutes",
"total_nanoseconds",
"total_seconds",
"trim",
"try_compress",
"try_decode",
"try_decompress",
Expand Down
71 changes: 71 additions & 0 deletions daft/functions/str.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,77 @@ def rstrip(expr: Expression) -> Expression:
return Expression._call_builtin_scalar_fn("rstrip", expr)


def trim(expr: Expression) -> Expression:
    """Remove whitespace from both ends of a string expression.

    Args:
        expr: The string expression to trim.

    Returns:
        Expression: a String expression which is `expr` with leading and trailing whitespace removed

    Examples:
        >>> import daft
        >>> from daft.functions import trim
        >>> df = daft.from_pydict({"x": ["foo", "bar", " baz "]})
        >>> df = df.select(trim(df["x"]))
        >>> df.show()
        ╭────────╮
        │ x      │
        │ ---    │
        │ String │
        ╞════════╡
        │ foo    │
        ├╌╌╌╌╌╌╌╌┤
        │ bar    │
        ├╌╌╌╌╌╌╌╌┤
        │ baz    │
        ╰────────╯
        <BLANKLINE>
        (Showing first 3 of 3 rows)

    """
    # Name under which the built-in scalar function is looked up.
    builtin_fn_name = "trim"
    return Expression._call_builtin_scalar_fn(builtin_fn_name, expr)


def strip(expr: Expression, mode: Literal["left", "right", "both"] = "both") -> Expression:
    """Remove whitespace from one or both ends of a string expression.

    Dispatches to `lstrip`, `rstrip`, or `trim` depending on `mode`.

    Args:
        expr: The expression to strip whitespace from.
        mode: Which side(s) to strip: "left", "right", or "both". Defaults to "both".

    Returns:
        Expression: a String expression which is `expr` with whitespace stripped according to the mode

    Raises:
        ValueError: If `mode` is not one of "left", "right", or "both".

    Examples:
        >>> import daft
        >>> from daft.functions import strip
        >>> df = daft.from_pydict({"x": ["foo", "bar", " baz "]})
        >>> df = df.select(strip(df["x"], mode="both"))
        >>> df.show()
        ╭────────╮
        │ x      │
        │ ---    │
        │ String │
        ╞════════╡
        │ foo    │
        ├╌╌╌╌╌╌╌╌┤
        │ bar    │
        ├╌╌╌╌╌╌╌╌┤
        │ baz    │
        ╰────────╯
        <BLANKLINE>
        (Showing first 3 of 3 rows)

    """
    # Guard clauses: each recognised mode returns immediately.
    if mode == "left":
        return lstrip(expr)
    if mode == "right":
        return rstrip(expr)
    if mode == "both":
        return trim(expr)

    # Anything else is rejected at runtime; the Literal annotation alone does not enforce it.
    raise ValueError(f"Invalid mode: {mode}. Must be one of 'left', 'right', or 'both'.")


def reverse(expr: Expression) -> Expression:
"""Reverse a UTF-8 string.

Expand Down
3 changes: 3 additions & 0 deletions daft/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -927,6 +927,9 @@ def lstrip(self) -> Series:
def rstrip(self) -> Series:
return self._eval_expressions("rstrip")

def trim(self) -> Series:
    """Return the result of evaluating the ``"trim"`` expression on this Series.

    Delegates entirely to ``self._eval_expressions("trim")``.
    """
    trimmed = self._eval_expressions("trim")
    return trimmed

def reverse(self) -> Series:
return self._eval_expressions("reverse")

Expand Down
3 changes: 3 additions & 0 deletions src/daft-functions-utf8/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ mod startswith;
mod substr;
mod to_date;
mod to_datetime;
mod trim;
mod upper;
pub(crate) mod utils;

Expand Down Expand Up @@ -60,6 +61,7 @@ pub use startswith::*;
pub use substr::*;
pub use to_date::*;
pub use to_datetime::*;
pub use trim::*;
pub use upper::*;

pub struct Utf8Functions;
Expand Down Expand Up @@ -93,6 +95,7 @@ impl daft_dsl::functions::FunctionModule for Utf8Functions {
parent.add_fn(Right);
parent.add_fn(RPad);
parent.add_fn(RStrip);
parent.add_fn(Trim);
parent.add_fn(SnakeCase);
parent.add_fn(Split);
parent.add_fn(StartsWith);
Expand Down
52 changes: 52 additions & 0 deletions src/daft-functions-utf8/src/trim.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
use common_error::DaftResult;
use daft_core::{
prelude::{DataType, Field, Schema},
series::{IntoSeries, Series},
};
use daft_dsl::{
ExprRef,
functions::{FunctionArgs, ScalarUDF, scalar::ScalarFn},
};
use serde::{Deserialize, Serialize};

use crate::utils::{Utf8ArrayUtils, unary_utf8_evaluate, unary_utf8_to_field};

/// Scalar function registered under the name `"trim"`.
///
/// Removes leading and trailing whitespace from each string value.
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub struct Trim;

#[typetag::serde]
impl ScalarUDF for Trim {
    /// Name used to identify this function.
    fn name(&self) -> &'static str {
        "trim"
    }

    /// Evaluates the function: calls `trim()` on every value of the UTF-8 input
    /// and converts the resulting array back into a `Series`.
    ///
    /// Argument handling is delegated to `unary_utf8_evaluate`.
    fn call(
        &self,
        inputs: FunctionArgs<Series>,
        _ctx: &daft_dsl::functions::scalar::EvalContext,
    ) -> DaftResult<Series> {
        unary_utf8_evaluate(inputs, |series| {
            series.with_utf8_array(|utf8| {
                utf8.unary_broadcasted_op(|text| text.trim().into())
                    .map(IntoSeries::into_series)
            })
        })
    }

    /// Resolves the output field via `unary_utf8_to_field`, requesting `DataType::Utf8`.
    fn get_return_field(
        &self,
        inputs: FunctionArgs<ExprRef>,
        schema: &Schema,
    ) -> DaftResult<Field> {
        let fn_name = self.name();
        unary_utf8_to_field(inputs, schema, fn_name, DataType::Utf8)
    }

    /// Short human-readable description of the function.
    fn docstring(&self) -> &'static str {
        "Removes leading and trailing whitespace from the string"
    }
}

#[must_use]
pub fn trim(input: ExprRef) -> ExprRef {
ScalarFn::builtin(Trim {}, vec![input]).into()
}
10 changes: 10 additions & 0 deletions tests/recordbatch/utf8/test_trim.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from __future__ import annotations

from daft.expressions import col
from daft.recordbatch import MicroPartition


def test_utf8_trim():
    """`trim` strips whitespace from both ends only and passes nulls through."""
    inputs = [
        "\ta\t",
        None,
        "\nb\n",
        "\vc\t",
        "\td ",
        "\ne",
        "f\n",
        "g",
        # Edge cases: empty string, whitespace-only string, interior whitespace kept.
        "",
        " \t\n",
        " h i ",
    ]
    expected = ["a", None, "b", "c", "d", "e", "f", "g", "", "", "h i"]

    table = MicroPartition.from_pydict({"col": inputs})
    result = table.eval_expression_list([col("col").trim()])
    assert result.to_pydict() == {"col": expected}
Loading