Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,499 changes: 764 additions & 735 deletions doc/user/data/sql_funcs.yml

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions src/expr/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ pub use linear::{
};
pub use relation::func::order_aggregate_datums as order_aggregate_datums_exported_for_benchmarking;
pub use relation::func::{
AggregateFunc, AnalyzedRegex, CaptureGroupDesc, LagLeadType, NaiveOneByOneAggr, OneByOneAggr,
TableFunc,
AggregateFunc, AnalyzedRegex, AnalyzedRegexOpts, CaptureGroupDesc, LagLeadType,
NaiveOneByOneAggr, OneByOneAggr, TableFunc,
};
pub use relation::join_input_mapper::JoinInputMapper;
pub use relation::{
Expand Down
1 change: 1 addition & 0 deletions src/expr/src/relation.proto
Original file line number Diff line number Diff line change
Expand Up @@ -201,5 +201,6 @@ message ProtoTableFunc {
google.protobuf.Empty acl_explode = 16;
google.protobuf.Empty mz_acl_explode = 17;
mz_repr.relation_and_scalar.ProtoScalarType unnest_map = 18;
mz_expr.relation.func.ProtoAnalyzedRegex regexp_matches = 19;
}
}
6 changes: 6 additions & 0 deletions src/expr/src/relation/func.proto
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,13 @@ message ProtoCaptureGroupDesc {
bool nullable = 3;
}

// Wire representation of the Rust `AnalyzedRegexOpts` struct: the flags that
// accompany an analyzed regular expression.
message ProtoAnalyzedRegexOpts {
// Mirrors `AnalyzedRegexOpts::case_insensitive`.
bool case_insensitive = 1;
// Mirrors `AnalyzedRegexOpts::global`.
bool global = 2;
}

// Wire representation of the Rust `AnalyzedRegex` tuple struct.
message ProtoAnalyzedRegex {
// The regular expression itself.
mz_repr.adt.regex.ProtoRegex regex = 1;
// One description per capture group of the regex.
repeated ProtoCaptureGroupDesc groups = 2;
// Options the regex was analyzed with. The Rust `from_proto` conversion
// rejects a message in which this field is unset.
ProtoAnalyzedRegexOpts opts = 3;
}
109 changes: 104 additions & 5 deletions src/expr/src/relation/func.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@
use std::cmp::{max, min};
use std::iter::Sum;
use std::ops::Deref;
use std::str::FromStr;
use std::{fmt, iter};

use chrono::{DateTime, NaiveDateTime, NaiveTime, Utc};
use dec::OrderedDecimal;
use itertools::Itertools;
use itertools::{Either, Itertools};
use mz_lowertest::MzReflect;
use mz_ore::cast::CastFrom;

Expand Down Expand Up @@ -3134,6 +3135,30 @@ fn regexp_extract(a: Datum, r: &AnalyzedRegex) -> Option<(Row, Diff)> {
Some((Row::pack(datums), 1))
}

/// Evaluates the `regexp_matches` table function against a single input string.
///
/// Every match of the analyzed regex `r` in `a` produces one output row. The
/// row's columns are that match's capture groups, in order, each converted to a
/// `Datum` from an `Option<&str>` (a group that did not participate in the
/// match is converted from `None`). Each row is emitted with a diff of `1`.
///
/// If the `global` option of `r` is set, all matches are returned; otherwise
/// at most the first match is returned.
///
/// `a` is unwrapped with `Datum::unwrap_str`, so callers must pass a string
/// datum.
fn regexp_matches<'a, 'r: 'a>(
    a: Datum<'a>,
    r: &'r AnalyzedRegex,
) -> impl Iterator<Item = (Row, Diff)> + 'a {
    let pattern = r.inner();
    let haystack = a.unwrap_str();

    let rows = pattern.captures_iter(haystack).map(move |caps| {
        // Capture 0 is the whole match; only the explicit groups become columns.
        let columns = caps
            .iter()
            .skip(1)
            .map(|group| Datum::from(group.map(|g| g.as_str())));

        (Row::pack(columns), 1)
    });

    // `Either` lets both branches share one concrete iterator type.
    let first_only = !r.opts().global;
    if first_only {
        Either::Right(rows.take(1))
    } else {
        Either::Left(rows)
    }
}

fn generate_series<N>(
start: N,
stop: N,
Expand Down Expand Up @@ -3459,33 +3484,87 @@ impl RustType<ProtoCaptureGroupDesc> for CaptureGroupDesc {
}
}

/// Options that accompany an [`AnalyzedRegex`].
///
/// Can be parsed from a string of single-character flags via the `FromStr`
/// impl, where `i` sets `case_insensitive` and `g` sets `global`. The derived
/// `Default` leaves both flags unset.
#[derive(
Arbitrary,
Clone,
Debug,
Eq,
PartialEq,
Ord,
PartialOrd,
Serialize,
Deserialize,
Hash,
MzReflect,
Default,
)]
pub struct AnalyzedRegexOpts {
/// Passed to `ReprRegex::new` when the regex is compiled by `AnalyzedRegex::new`.
pub case_insensitive: bool,
/// When set, `regexp_matches` yields every match; otherwise only the first one.
pub global: bool,
}

impl FromStr for AnalyzedRegexOpts {
    type Err = EvalError;

    /// Parses a string of single-character regex flags.
    ///
    /// Recognized flags are `i` (case-insensitive) and `g` (global); a flag may
    /// appear more than once, and the empty string yields the default options.
    ///
    /// # Errors
    ///
    /// Returns `EvalError::InvalidRegexFlag` carrying the first character that
    /// is not a recognized flag.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        s.chars().try_fold(Self::default(), |mut parsed, flag| {
            match flag {
                'i' => parsed.case_insensitive = true,
                'g' => parsed.global = true,
                unknown => return Err(EvalError::InvalidRegexFlag(unknown)),
            }
            Ok(parsed)
        })
    }
}

/// Field-for-field conversion between `AnalyzedRegexOpts` and its protobuf form.
impl RustType<ProtoAnalyzedRegexOpts> for AnalyzedRegexOpts {
    /// Copies both flags into a new `ProtoAnalyzedRegexOpts`.
    fn into_proto(&self) -> ProtoAnalyzedRegexOpts {
        let &Self {
            case_insensitive,
            global,
        } = self;
        ProtoAnalyzedRegexOpts {
            case_insensitive,
            global,
        }
    }

    /// Copies both flags out of `proto`; this conversion never fails.
    fn from_proto(proto: ProtoAnalyzedRegexOpts) -> Result<Self, TryFromProtoError> {
        let ProtoAnalyzedRegexOpts {
            case_insensitive,
            global,
        } = proto;
        Ok(Self {
            case_insensitive,
            global,
        })
    }
}

/// A regular expression bundled with a description of each of its capture
/// groups and the options it was analyzed with.
///
/// Fields, in order: the regex, the capture group descriptions, the options.
#[derive(
Arbitrary, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize, Hash, MzReflect,
)]
pub struct AnalyzedRegex(
// The regex; property tests generate it with `mz_repr::adt::regex::any_regex()`.
#[proptest(strategy = "mz_repr::adt::regex::any_regex()")] ReprRegex,
// One entry per capture group of the regex.
Vec<CaptureGroupDesc>,
// Options supplied when the regex was constructed.
AnalyzedRegexOpts,
);

/// Conversion between `AnalyzedRegex` and its protobuf form.
impl RustType<ProtoAnalyzedRegex> for AnalyzedRegex {
    /// Encodes the regex, its capture group descriptions, and its options.
    fn into_proto(&self) -> ProtoAnalyzedRegex {
        let Self(regex, groups, opts) = self;
        ProtoAnalyzedRegex {
            regex: Some(regex.into_proto()),
            groups: groups.into_proto(),
            opts: Some(opts.into_proto()),
        }
    }

    /// Decodes an `AnalyzedRegex`.
    ///
    /// # Errors
    ///
    /// Fails if the `regex` or `opts` field is unset, or if any component
    /// fails its own conversion.
    fn from_proto(proto: ProtoAnalyzedRegex) -> Result<Self, TryFromProtoError> {
        let ProtoAnalyzedRegex {
            regex,
            groups,
            opts,
        } = proto;
        let regex = regex.into_rust_if_some("ProtoAnalyzedRegex::regex")?;
        let groups = groups.into_rust()?;
        let opts = opts.into_rust_if_some("ProtoAnalyzedRegex::opts")?;
        Ok(AnalyzedRegex(regex, groups, opts))
    }
}

impl AnalyzedRegex {
pub fn new(s: &str) -> Result<Self, regex::Error> {
let r = ReprRegex::new(s, false)?;
pub fn new(s: &str, opts: AnalyzedRegexOpts) -> Result<Self, regex::Error> {
let r = ReprRegex::new(s, opts.case_insensitive)?;
// TODO(benesch): remove potentially dangerous usage of `as`.
#[allow(clippy::as_conversions)]
let descs: Vec<_> = r
Expand All @@ -3504,7 +3583,7 @@ impl AnalyzedRegex {
nullable: true,
})
.collect();
Ok(Self(r, descs))
Ok(Self(r, descs, opts))
}
pub fn capture_groups_len(&self) -> usize {
self.1.len()
Expand All @@ -3515,6 +3594,9 @@ impl AnalyzedRegex {
pub fn inner(&self) -> &Regex {
&(self.0).regex
}
pub fn opts(&self) -> &AnalyzedRegexOpts {
&self.2
}
}

pub fn csv_extract(a: Datum, n_cols: usize) -> impl Iterator<Item = (Row, Diff)> + '_ {
Expand Down Expand Up @@ -3639,6 +3721,7 @@ pub enum TableFunc {
name: String,
relation: RelationType,
},
RegexpMatches(AnalyzedRegex),
}

impl RustType<ProtoTableFunc> for TableFunc {
Expand Down Expand Up @@ -3673,6 +3756,7 @@ impl RustType<ProtoTableFunc> for TableFunc {
relation: Some(relation.into_proto()),
})
}
TableFunc::RegexpMatches(x) => Kind::RegexpMatches(x.into_proto()),
}),
}
}
Expand Down Expand Up @@ -3717,6 +3801,7 @@ impl RustType<ProtoTableFunc> for TableFunc {
.relation
.into_rust_if_some("ProtoTabletizedScalar::relation")?,
},
Kind::RegexpMatches(x) => TableFunc::RegexpMatches(x.into_rust()?),
})
}
}
Expand Down Expand Up @@ -3796,6 +3881,7 @@ impl TableFunc {
let r = Row::pack_slice(datums);
Ok(Box::new(std::iter::once((r, 1))))
}
TableFunc::RegexpMatches(a) => Ok(Box::new(regexp_matches(datums[0], a))),
}
}

Expand Down Expand Up @@ -3924,6 +4010,15 @@ impl TableFunc {
TableFunc::TabletizedScalar { relation, .. } => {
return relation.clone();
}
TableFunc::RegexpMatches(a) => {
let column_types = a
.capture_groups_iter()
.map(|cg| ScalarType::String.nullable(cg.nullable))
.collect();
let keys = vec![];

(column_types, keys)
}
};

if !keys.is_empty() {
Expand Down Expand Up @@ -3953,6 +4048,7 @@ impl TableFunc {
TableFunc::UnnestMap { .. } => 2,
TableFunc::Wrap { width, .. } => *width,
TableFunc::TabletizedScalar { relation, .. } => relation.column_types.len(),
TableFunc::RegexpMatches(a) => a.capture_groups_len(),
}
}

Expand All @@ -3973,7 +4069,8 @@ impl TableFunc {
| TableFunc::Repeat
| TableFunc::UnnestArray { .. }
| TableFunc::UnnestList { .. }
| TableFunc::UnnestMap { .. } => true,
| TableFunc::UnnestMap { .. }
| TableFunc::RegexpMatches(_) => true,
TableFunc::Wrap { .. } => false,
TableFunc::TabletizedScalar { .. } => false,
}
Expand Down Expand Up @@ -4002,6 +4099,7 @@ impl TableFunc {
TableFunc::UnnestMap { .. } => true,
TableFunc::Wrap { .. } => true,
TableFunc::TabletizedScalar { .. } => true,
TableFunc::RegexpMatches(_) => true,
}
}
}
Expand All @@ -4027,6 +4125,7 @@ impl fmt::Display for TableFunc {
TableFunc::UnnestMap { .. } => f.write_str("unnest_map"),
TableFunc::Wrap { width, .. } => write!(f, "wrap{}", width),
TableFunc::TabletizedScalar { name, .. } => f.write_str(name),
TableFunc::RegexpMatches(a) => write!(f, "regexp_matches({:?}, _)", a.0),
}
}
}
Expand Down
54 changes: 53 additions & 1 deletion src/sql/src/func.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
use std::cell::RefCell;
use std::collections::BTreeMap;
use std::fmt;
use std::str::FromStr;
use std::sync::LazyLock;

use itertools::Itertools;
Expand Down Expand Up @@ -3444,6 +3445,57 @@ pub static PG_CATALOG_BUILTINS: LazyLock<BTreeMap<&'static str, Func>> = LazyLoc
params!(String, String, String, String) => VariadicFunc::RegexpReplace => String, 2285;
// TODO: PostgreSQL supports additional five and six argument forms of this function which
// allow controlling where to start the replacement and how many replacements to make.
},
"regexp_matches" => Table {
// Two-argument form: regexp_matches(haystack, regex), planned with default regex options.
params!(String, String) => Operation::variadic(move |_ecx, exprs| {
// The regex must be a string literal because it is analyzed here, at planning time.
let regex = match exprs[1].clone().into_literal_string() {
None => sql_bail!("regexp_matches requires a string literal as its second argument"),
Some(regex) => mz_expr::AnalyzedRegex::new(&regex, mz_expr::AnalyzedRegexOpts::default()).map_err(|e| sql_err!("analyzing regex: {}", e))?,
};
// One output column per capture group: the group's name if it has one,
// otherwise `column<index>`.
let column_names = regex
.capture_groups_iter()
.map(|cg| {
cg.name.clone().unwrap_or_else(|| format!("column{}", cg.index)).into()
})
.collect::<Vec<_>>();
// A regex without capture groups would produce no columns, so it is rejected.
if column_names.is_empty(){
sql_bail!("regexp_matches must specify at least one capture group");
}
// The analyzed regex is embedded in the table function, so only the haystack
// expression is passed as a runtime argument.
Ok(TableFuncPlan {
expr: HirRelationExpr::CallTable {
func: TableFunc::RegexpMatches(regex),
exprs: vec![exprs[0].clone()],
},
column_names,
})
}) => ReturnType::set_of(String.into()), 2763;
// Three-argument form: regexp_matches(haystack, regex, flags).
params!(String, String, String) => Operation::variadic(move |_ecx, exprs| {
// The flags must be a string literal because they are parsed here, at planning time.
let flags = match exprs[2].clone().into_literal_string() {
None => sql_bail!("regexp_matches requires a string literal as its third argument"),
Some(flags) => flags,
};
// Flags are parsed before the regex so the regex can be analyzed with them.
let opts = mz_expr::AnalyzedRegexOpts::from_str(&flags).map_err(|e| sql_err!("parsing regex flags: {}", e))?;
// The regex must likewise be a string literal.
let regex = match exprs[1].clone().into_literal_string() {
None => sql_bail!("regexp_matches requires a string literal as its second argument"),
Some(regex) => mz_expr::AnalyzedRegex::new(&regex, opts).map_err(|e| sql_err!("analyzing regex: {}", e))?,
};
// One output column per capture group: the group's name if it has one,
// otherwise `column<index>`.
let column_names = regex
.capture_groups_iter()
.map(|cg| {
cg.name.clone().unwrap_or_else(|| format!("column{}", cg.index)).into()
})
.collect::<Vec<_>>();
// A regex without capture groups would produce no columns, so it is rejected.
if column_names.is_empty(){
sql_bail!("regexp_matches must specify at least one capture group");
}
// The analyzed regex (including its options) is embedded in the table function,
// so only the haystack expression is passed as a runtime argument.
Ok(TableFuncPlan {
expr: HirRelationExpr::CallTable {
func: TableFunc::RegexpMatches(regex),
exprs: vec![exprs[0].clone()],
},
column_names,
})
}) => ReturnType::set_of(String.into()), 2764;
}
};

Expand Down Expand Up @@ -3821,7 +3873,7 @@ pub static MZ_CATALOG_BUILTINS: LazyLock<BTreeMap<&'static str, Func>> = LazyLoc
params!(String, String) => Operation::binary(move |_ecx, regex, haystack| {
let regex = match regex.into_literal_string() {
None => sql_bail!("regexp_extract requires a string literal as its first argument"),
Some(regex) => mz_expr::AnalyzedRegex::new(&regex).map_err(|e| sql_err!("analyzing regex: {}", e))?,
Some(regex) => mz_expr::AnalyzedRegex::new(&regex, mz_expr::AnalyzedRegexOpts::default()).map_err(|e| sql_err!("analyzing regex: {}", e))?,
};
let column_names = regex
.capture_groups_iter()
Expand Down
Loading