Skip to content

Commit e300ed1

Browse files
authored
chore: implement StrPadOp, StrFindOp, StrExtractOp, StrRepeatOp, RegexReplaceStrOp and ReplaceStrOp compilers (#2015)
1 parent 6559877 commit e300ed1

File tree

13 files changed

+306
-8
lines changed

13 files changed

+306
-8
lines changed

bigframes/core/compile/sqlglot/expressions/unary_compiler.py

Lines changed: 97 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -177,14 +177,96 @@ def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression:
177177
)
178178

179179

180+
@UNARY_OP_REGISTRATION.register(ops.StrContainsOp)
def _(op: ops.StrContainsOp, expr: TypedExpr) -> sge.Expression:
    """Compile a literal (non-regex) substring containment test.

    ``op.pat`` must be matched verbatim. A ``LIKE '%pat%'`` predicate would
    interpret any ``%``, ``_`` or backslash inside ``op.pat`` as a wildcard or
    escape, so the test is expressed through ``INSTR`` instead.

    Args:
        op: The contains operation; ``op.pat`` is the literal substring.
        expr: The typed string operand to search in.

    Returns:
        A boolean expression that is true when ``op.pat`` occurs in the
        operand.
    """
    # INSTR yields a 1-based position, or 0 when the substring is absent.
    position = sge.func("INSTR", expr.expr, sge.convert(op.pat))
    return sge.GT(this=position, expression=sge.convert(0))
183+
184+
180185
@UNARY_OP_REGISTRATION.register(ops.StrContainsRegexOp)
def _(op: ops.StrContainsRegexOp, expr: TypedExpr) -> sge.Expression:
    """Build a ``RegexpLike`` predicate testing the operand against ``op.pat``."""
    regex = sge.convert(op.pat)
    return sge.RegexpLike(this=expr.expr, expression=regex)
183188

184189

185-
@UNARY_OP_REGISTRATION.register(ops.StrContainsOp)
186-
def _(op: ops.StrContainsOp, expr: TypedExpr) -> sge.Expression:
187-
return sge.Like(this=expr.expr, expression=sge.convert(f"%{op.pat}%"))
190+
@UNARY_OP_REGISTRATION.register(ops.StrExtractOp)
def _(op: ops.StrExtractOp, expr: TypedExpr) -> sge.Expression:
    """Build a ``RegexpExtract`` node for ``op.pat``, passing ``op.n`` as the group."""
    pattern = sge.convert(op.pat)
    group_index = sge.convert(op.n)
    return sge.RegexpExtract(
        this=expr.expr,
        expression=pattern,
        group=group_index,
    )
195+
196+
197+
@UNARY_OP_REGISTRATION.register(ops.StrFindOp)
def _(op: ops.StrFindOp, expr: TypedExpr) -> sge.Expression:
    """Compile a 0-based substring search that yields -1 on a miss.

    Args:
        op: The find operation. ``op.substr`` is the text to look for;
            ``op.start`` and ``op.end`` are optional 0-based bounds of the
            search window (``None`` means unbounded).
        expr: The typed string operand to search in.

    Returns:
        An integer expression holding the 0-based index of the first match
        within the *whole* operand, or -1 when there is no match.
    """
    # NOTE(review): negative ``start``/``end`` (Python-style "from the end")
    # are not translated here — confirm whether callers can pass them.
    offset = op.start or 0
    # INSTR and SUBSTR are 1-based, so shift the 0-based start by one.
    first_position = sge.convert(offset + 1)

    if op.end is None:
        # INSTR reports positions relative to the whole string (0 on a miss),
        # so subtracting one gives the 0-based index, or -1.
        return sge.func(
            "INSTR",
            expr.expr,
            sge.convert(op.substr),
            first_position,
        ) - sge.convert(1)

    # BigQuery's INSTR doesn't support `end`, so restrict the search window
    # with SUBSTR instead.
    window = sge.Substring(
        this=expr.expr,
        start=first_position,
        length=sge.convert(op.end - offset),
    )
    hit = sge.func("INSTR", window, sge.convert(op.substr))

    if offset == 0:
        # The window starts at the beginning of the string, so positions
        # inside it are already positions in the whole string.
        return hit - sge.convert(1)

    # `hit` is relative to the window: add the skipped characters back for a
    # match, but keep a miss (INSTR == 0) as -1 instead of shifting it.
    return sge.If(
        this=sge.EQ(this=hit, expression=sge.convert(0)),
        true=sge.convert(-1),
        false=hit.copy() + sge.convert(offset - 1),
    )
219+
220+
221+
@UNARY_OP_REGISTRATION.register(ops.StrLstripOp)
def _(op: ops.StrLstripOp, expr: TypedExpr) -> sge.Expression:
    """Build a left-sided ``Trim`` node that strips ``op.to_strip`` from the operand."""
    strip_chars = sge.convert(op.to_strip)
    return sge.Trim(
        this=expr.expr,
        expression=strip_chars,
        side="LEFT",
    )
224+
225+
226+
@UNARY_OP_REGISTRATION.register(ops.StrPadOp)
def _(op: ops.StrPadOp, expr: TypedExpr) -> sge.Expression:
    """Compile padding of a string to at least ``op.length`` characters.

    Args:
        op: The pad operation. ``op.length`` is the requested width,
            ``op.fillchar`` the fill text, and ``op.side`` one of
            ``"left"``, ``"right"``; any other value is treated as "both".
        expr: The typed string operand to pad.

    Returns:
        A string expression built from ``LPAD``/``RPAD`` calls.
    """
    # LPAD/RPAD truncate when the requested length is shorter than the input,
    # so never ask for less than the current length.
    target_length = sge.func(
        "GREATEST", sge.Length(this=expr.expr), sge.convert(op.length)
    )

    if op.side == "left":
        return sge.func(
            "LPAD", expr.expr, target_length, sge.convert(op.fillchar)
        )
    if op.side == "right":
        return sge.func(
            "RPAD", expr.expr, target_length, sge.convert(op.fillchar)
        )

    # side == both: pad the left half first, then fill the rest on the right.
    # SAFE_DIVIDE yields a float and a bare CAST to INT64 rounds halfway
    # values away from zero, so FLOOR is applied first to get true integer
    # division; an odd leftover character therefore goes on the right.
    # NOTE(review): confirm this split matches the convention callers expect.
    left_fill = sge.Cast(
        this=sge.Floor(
            this=sge.func(
                "SAFE_DIVIDE",
                sge.Sub(
                    this=target_length,
                    expression=sge.Length(this=expr.expr),
                ),
                sge.convert(2),
            )
        ),
        to="INT64",
    )
    left_target = left_fill + sge.Length(this=expr.expr)
    left_padded = sge.func(
        "LPAD", expr.expr, left_target, sge.convert(op.fillchar)
    )
    return sge.func(
        "RPAD", left_padded, target_length, sge.convert(op.fillchar)
    )
265+
266+
267+
@UNARY_OP_REGISTRATION.register(ops.StrRepeatOp)
def _(op: ops.StrRepeatOp, expr: TypedExpr) -> sge.Expression:
    """Build a ``Repeat`` node repeating the operand ``op.repeats`` times."""
    repeat_count = sge.convert(op.repeats)
    return sge.Repeat(this=expr.expr, times=repeat_count)
188270

189271

190272
@UNARY_OP_REGISTRATION.register(ops.date_op)
@@ -444,11 +526,6 @@ def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression:
444526
return sge.Extract(this=sge.Identifier(this="MONTH"), expression=expr.expr)
445527

446528

447-
@UNARY_OP_REGISTRATION.register(ops.StrLstripOp)
448-
def _(op: ops.StrLstripOp, expr: TypedExpr) -> sge.Expression:
449-
return sge.Trim(this=expr.expr, expression=sge.convert(op.to_strip), side="LEFT")
450-
451-
452529
@UNARY_OP_REGISTRATION.register(ops.neg_op)
def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression:
    """Wrap the operand in a ``Neg`` (arithmetic negation) node."""
    operand = expr.expr
    return sge.Neg(this=operand)
@@ -484,6 +561,18 @@ def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression:
484561
return sge.Extract(this=sge.Identifier(this="QUARTER"), expression=expr.expr)
485562

486563

564+
@UNARY_OP_REGISTRATION.register(ops.ReplaceStrOp)
def _(op: ops.ReplaceStrOp, expr: TypedExpr) -> sge.Expression:
    """Build a ``REPLACE`` call substituting ``op.pat`` with ``op.repl``."""
    needle = sge.convert(op.pat)
    replacement = sge.convert(op.repl)
    return sge.func("REPLACE", expr.expr, needle, replacement)
567+
568+
569+
@UNARY_OP_REGISTRATION.register(ops.RegexReplaceStrOp)
def _(op: ops.RegexReplaceStrOp, expr: TypedExpr) -> sge.Expression:
    """Build a ``REGEXP_REPLACE`` call substituting matches of ``op.pat`` with ``op.repl``."""
    pattern = sge.convert(op.pat)
    replacement = sge.convert(op.repl)
    return sge.func("REGEXP_REPLACE", expr.expr, pattern, replacement)
574+
575+
487576
@UNARY_OP_REGISTRATION.register(ops.reverse_op)
def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression:
    """Build a ``REVERSE`` call over the operand."""
    operand = expr.expr
    return sge.func("REVERSE", operand)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`string_col` AS `bfcol_0`
4+
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
5+
), `bfcte_1` AS (
6+
SELECT
7+
*,
8+
REGEXP_REPLACE(`bfcol_0`, 'e', 'a') AS `bfcol_1`
9+
FROM `bfcte_0`
10+
)
11+
SELECT
12+
`bfcol_1` AS `string_col`
13+
FROM `bfcte_1`
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`string_col` AS `bfcol_0`
4+
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
5+
), `bfcte_1` AS (
6+
SELECT
7+
*,
8+
REPLACE(`bfcol_0`, 'e', 'a') AS `bfcol_1`
9+
FROM `bfcte_0`
10+
)
11+
SELECT
12+
`bfcol_1` AS `string_col`
13+
FROM `bfcte_1`
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`string_col` AS `bfcol_0`
4+
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
5+
), `bfcte_1` AS (
6+
SELECT
7+
*,
8+
REGEXP_EXTRACT(`bfcol_0`, '([a-z]*)') AS `bfcol_1`
9+
FROM `bfcte_0`
10+
)
11+
SELECT
12+
`bfcol_1` AS `string_col`
13+
FROM `bfcte_1`
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`string_col` AS `bfcol_0`
4+
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
5+
), `bfcte_1` AS (
6+
SELECT
7+
*,
8+
INSTR(`bfcol_0`, 'e', 1) - 1 AS `bfcol_1`
9+
FROM `bfcte_0`
10+
)
11+
SELECT
12+
`bfcol_1` AS `string_col`
13+
FROM `bfcte_1`
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`string_col` AS `bfcol_0`
4+
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
5+
), `bfcte_1` AS (
6+
SELECT
7+
*,
8+
INSTR(SUBSTRING(`bfcol_0`, 1, 5), 'e') - 1 AS `bfcol_1`
9+
FROM `bfcte_0`
10+
)
11+
SELECT
12+
`bfcol_1` AS `string_col`
13+
FROM `bfcte_1`
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`string_col` AS `bfcol_0`
4+
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
5+
), `bfcte_1` AS (
6+
SELECT
7+
*,
8+
INSTR(`bfcol_0`, 'e', 3) - 1 AS `bfcol_1`
9+
FROM `bfcte_0`
10+
)
11+
SELECT
12+
`bfcol_1` AS `string_col`
13+
FROM `bfcte_1`
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`string_col` AS `bfcol_0`
4+
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
5+
), `bfcte_1` AS (
6+
SELECT
7+
*,
8+
INSTR(SUBSTRING(`bfcol_0`, 3, 3), 'e') - 1 AS `bfcol_1`
9+
FROM `bfcte_0`
10+
)
11+
SELECT
12+
`bfcol_1` AS `string_col`
13+
FROM `bfcte_1`
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`string_col` AS `bfcol_0`
4+
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
5+
), `bfcte_1` AS (
6+
SELECT
7+
*,
8+
RPAD(
9+
LPAD(
10+
`bfcol_0`,
11+
CAST(SAFE_DIVIDE(GREATEST(LENGTH(`bfcol_0`), 10) - LENGTH(`bfcol_0`), 2) AS INT64) + LENGTH(`bfcol_0`),
12+
'-'
13+
),
14+
GREATEST(LENGTH(`bfcol_0`), 10),
15+
'-'
16+
) AS `bfcol_1`
17+
FROM `bfcte_0`
18+
)
19+
SELECT
20+
`bfcol_1` AS `string_col`
21+
FROM `bfcte_1`
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`string_col` AS `bfcol_0`
4+
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
5+
), `bfcte_1` AS (
6+
SELECT
7+
*,
8+
LPAD(`bfcol_0`, GREATEST(LENGTH(`bfcol_0`), 10), '-') AS `bfcol_1`
9+
FROM `bfcte_0`
10+
)
11+
SELECT
12+
`bfcol_1` AS `string_col`
13+
FROM `bfcte_1`

0 commit comments

Comments
 (0)