Skip to content

Commit 4727d26

Browse files
SNOW-2455523: Add support for scalar string and binary functions - part 3 (#3947)
1 parent 294d96f commit 4727d26

File tree

3 files changed

+243
-0
lines changed

3 files changed

+243
-0
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,13 @@
2121
- `sha1_binary`
2222
- `sha2_binary`
2323
- `soundex_p123`
24+
- `strtok`
25+
- `try_base64_decode_binary`
26+
- `try_base64_decode_string`
27+
- `try_hex_decode_binary`
28+
- `try_hex_decode_string`
29+
- `unicode`
30+
- `uuid_string`
2431

2532
#### Improvements
2633

docs/source/snowpark/functions.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,11 @@ Functions
506506
st_geographyfromwkt
507507
st_geometryfromwkb
508508
st_geometryfromwkt
509+
strtok
510+
try_base64_decode_binary
511+
try_base64_decode_string
512+
try_hex_decode_binary
513+
try_hex_decode_string
509514
try_to_geography
510515
try_to_geometry
511516
substr
@@ -551,6 +556,8 @@ Functions
551556
udf
552557
udtf
553558
unbase64
559+
unicode
560+
uuid_string
554561
uniform
555562
unix_timestamp
556563
upper

src/snowflake/snowpark/_functions/scalar_functions.py

Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4230,3 +4230,232 @@ def soundex_p123(varchar_expr: ColumnOrName, _emit_ast: bool = True) -> Column:
42304230
"""
42314231
c = _to_col_if_str(varchar_expr, "soundex_p123")
42324232
return builtin("soundex_p123", _emit_ast=_emit_ast)(c)
4233+
4234+
4235+
@publicapi
def strtok(
    string: ColumnOrName,
    delimiter: ColumnOrName = None,
    part_nr: ColumnOrName = None,
    _emit_ast: bool = True,
) -> Column:
    """
    Tokenizes a string with the given set of delimiters and returns the requested part.

    Args:
        string (ColumnOrName): The string to be tokenized.
        delimiter (ColumnOrName, optional): A set of delimiters. Each character in the delimiter string is treated as a delimiter. If not specified, defaults to a single space character.
        part_nr (ColumnOrName, optional): The requested part number (1-based). If not specified, the first part is returned.

    Returns:
        Column: The requested part of the tokenized string.

    Note:
        A plain ``str`` passed for any argument is interpreted as a column name, not as a
        literal value; wrap literal delimiters and part numbers in ``lit``.

    Examples::
        >>> from snowflake.snowpark.functions import col, lit
        >>> df = session.create_dataframe([["a.b.c"]], schema=["string_col"])
        >>> df.select(strtok(col("string_col")).alias("result")).collect()
        [Row(RESULT='a.b.c')]
        >>> df.select(strtok(col("string_col"), lit(".")).alias("result")).collect()
        [Row(RESULT='a')]
        >>> df.select(strtok(col("string_col"), lit("."), lit(2)).alias("result")).collect()
        [Row(RESULT='b')]
        >>> df2 = session.create_dataframe([["user@snowflake.com"]], schema=["string_col"])
        >>> df2.select(strtok(col("string_col"), lit("@."), lit(1)).alias("result")).collect()
        [Row(RESULT='user')]
        >>> df2.select(strtok(col("string_col"), lit("@."), lit(3)).alias("result")).collect()
        [Row(RESULT='com')]
    """
    args = [_to_col_if_str(string, "strtok")]

    if part_nr is not None:
        # Arguments are positional, so a delimiter has to be sent whenever a
        # part number is; fall back to a single-space literal when the caller
        # supplied only part_nr.
        args.append(
            _to_col_if_str(delimiter, "strtok") if delimiter is not None else lit(" ")
        )
        args.append(_to_col_if_str(part_nr, "strtok"))
    elif delimiter is not None:
        args.append(_to_col_if_str(delimiter, "strtok"))

    return builtin("strtok", _emit_ast=_emit_ast)(*args)
4283+
4284+
4285+
@publicapi
def try_base64_decode_binary(
    input_expr: ColumnOrName, alphabet: ColumnOrName = None, _emit_ast: bool = True
) -> Column:
    """
    Converts a base64-encoded string into binary data, yielding None instead of failing when the input is not valid base64.

    Args:
        input_expr (ColumnOrName): Column (or column name) holding the base64-encoded string.
        alphabet (ColumnOrName, optional): Column (or column name) holding the base64 alphabet used for decoding. When omitted, the standard base64 alphabet is used.

    Returns:
        Column: The decoded binary data, or None for rows whose input cannot be decoded.

    Examples::
        >>> from snowflake.snowpark.functions import base64_encode
        >>> df = session.create_dataframe(["HELP", "TEST"], schema=["input"])
        >>> df.select(try_base64_decode_binary(base64_encode(df["input"])).alias("result")).collect()
        [Row(RESULT=bytearray(b'HELP')), Row(RESULT=bytearray(b'TEST'))]

        >>> df2 = session.create_dataframe(["SEVMUA==", "VEVTVA=="], schema=["encoded"])
        >>> df2.select(try_base64_decode_binary(df2["encoded"]).alias("result")).collect()
        [Row(RESULT=bytearray(b'HELP')), Row(RESULT=bytearray(b'TEST'))]

        >>> df3 = session.create_dataframe(["invalid_base64!"], schema=["bad_input"])
        >>> df3.select(try_base64_decode_binary(df3["bad_input"]).alias("result")).collect()
        [Row(RESULT=None)]

        >>> df4 = session.create_dataframe(["SEVMUA=="], schema=["encoded"])
        >>> df4.select(try_base64_decode_binary(df4["encoded"], lit("$")).alias("result")).collect()
        [Row(RESULT=bytearray(b'HELP'))]
    """
    # Collect the positional SQL arguments; the alphabet is only forwarded
    # when the caller actually supplied one.
    operands = [_to_col_if_str(input_expr, "try_base64_decode_binary")]
    if alphabet is not None:
        operands.append(_to_col_if_str(alphabet, "try_base64_decode_binary"))
    return builtin("try_base64_decode_binary", _emit_ast=_emit_ast)(*operands)
4326+
4327+
4328+
@publicapi
def try_base64_decode_string(
    input_expr: ColumnOrName, alphabet: ColumnOrName = None, _emit_ast: bool = True
) -> Column:
    """
    Converts a base64-encoded string back into a string, yielding NULL instead of raising an error when the input is not valid base64.

    Args:
        input_expr (ColumnOrName): Column (or column name) holding the base64-encoded string.
        alphabet (ColumnOrName, optional): Column (or column name) holding the base64 alphabet used for decoding. When omitted, the standard base64 alphabet is used.

    Returns:
        Column: The decoded string, or NULL for rows whose input is not valid base64.

    Examples::
        >>> df = session.create_dataframe([["SEVMTE8="]], schema=["encoded"])
        >>> df.select(try_base64_decode_string(df["encoded"]).alias('result')).collect()
        [Row(RESULT='HELLO')]

        >>> df = session.create_dataframe([["invalid_base64"]], schema=["encoded"])
        >>> df.select(try_base64_decode_string(df["encoded"]).alias('result')).collect()
        [Row(RESULT=None)]

        >>> df = session.create_dataframe([["SEVMTE8="]], schema=["encoded"])
        >>> df.select(try_base64_decode_string(df["encoded"], lit('$')).alias('result')).collect()
        [Row(RESULT='HELLO')]
    """
    encoded_col = _to_col_if_str(input_expr, "try_base64_decode_string")

    # Single-argument form: no custom alphabet requested.
    if alphabet is None:
        return builtin("try_base64_decode_string", _emit_ast=_emit_ast)(encoded_col)

    custom_alphabet = _to_col_if_str(alphabet, "try_base64_decode_string")
    return builtin("try_base64_decode_string", _emit_ast=_emit_ast)(
        encoded_col, custom_alphabet
    )
4361+
4362+
4363+
@publicapi
def try_hex_decode_binary(input_expr: ColumnOrName, _emit_ast: bool = True) -> Column:
    """
    Converts a hex-encoded string into binary data, yielding None when the input is not a valid hex string.

    Args:
        input_expr (ColumnOrName): Column (or column name) holding the hex-encoded string.

    Returns:
        Column: The decoded binary data (collected as bytearray), or None if the input is invalid.

    Examples::
        >>> from snowflake.snowpark.functions import col
        >>> df = session.create_dataframe([["41426162"], ["48656C6C6F"], ["576F726C64"]], schema=["hex_string"])
        >>> df.select(try_hex_decode_binary(col("hex_string")).alias("decoded_binary")).collect()
        [Row(DECODED_BINARY=bytearray(b'ABab')), Row(DECODED_BINARY=bytearray(b'Hello')), Row(DECODED_BINARY=bytearray(b'World'))]
    """
    hex_col = _to_col_if_str(input_expr, "try_hex_decode_binary")
    decode = builtin("try_hex_decode_binary", _emit_ast=_emit_ast)
    return decode(hex_col)
4382+
4383+
4384+
@publicapi
def try_hex_decode_string(input_expr: ColumnOrName, _emit_ast: bool = True) -> Column:
    """
    Converts a hex-encoded string back into the string it encodes, yielding None when the input is not a valid hex string.

    Args:
        input_expr (ColumnOrName): Column (or column name) holding the hex-encoded string.

    Returns:
        Column: The decoded string, or None for rows whose input is not valid hex.

    Examples::
        >>> df = session.create_dataframe([["41614262"], ["127"], ["invalid_hex"]], schema=["hex_input"])
        >>> df.select(try_hex_decode_string(df["hex_input"]).alias("decoded")).collect()
        [Row(DECODED='AaBb'), Row(DECODED=None), Row(DECODED=None)]
    """
    hex_col = _to_col_if_str(input_expr, "try_hex_decode_string")
    decode = builtin("try_hex_decode_string", _emit_ast=_emit_ast)
    return decode(hex_col)
4402+
4403+
4404+
@publicapi
def unicode(input_str: ColumnOrName, _emit_ast: bool = True) -> Column:
    """
    Returns the Unicode code point of the first character of a string.

    Args:
        input_str (ColumnOrName): Column (or column name) holding the string whose first character is inspected.

    Returns:
        Column: The Unicode code point of the first character; 0 for an empty string.

    Examples::
        >>> from snowflake.snowpark.functions import col
        >>> df = session.create_dataframe([['a'], ['❄'], ['cde'], ['']], schema=["input_str"])
        >>> df.select(unicode(col("input_str")).alias("unicode_result")).collect()
        [Row(UNICODE_RESULT=97), Row(UNICODE_RESULT=10052), Row(UNICODE_RESULT=99), Row(UNICODE_RESULT=0)]
    """
    text_col = _to_col_if_str(input_str, "unicode")
    code_point_of = builtin("unicode", _emit_ast=_emit_ast)
    return code_point_of(text_col)
4423+
4424+
4425+
@publicapi
def uuid_string(
    uuid: ColumnOrName = None, name: ColumnOrName = None, _emit_ast: bool = True
) -> Column:
    """
    Returns a universally unique identifier (UUID) as a string.

    Called without arguments, a random UUID is generated. Called with both ``uuid``
    and ``name``, a name-based UUID is derived from them. The two arguments must be
    supplied together: passing only one of them is an error.

    Args:
        uuid (ColumnOrName, optional): The namespace UUID as a string. Must be accompanied by ``name``.
        name (ColumnOrName, optional): The name to use for UUID generation. Must be accompanied by ``uuid``.

    Returns:
        Column: The UUID string.

    Raises:
        TypeError: If exactly one of ``uuid`` and ``name`` is provided.

    Examples::
        >>> from snowflake.snowpark.functions import col

        >>> df = session.create_dataframe(
        ...     [["fe971b24-9572-4005-b22f-351e9c09274d", "foo"]], schema=["uuid", "name"]
        ... )

        >>> df.select(uuid_string().alias("random_uuid")).collect()
        [Row(RANDOM_UUID='...')]

        >>> result = df.select(
        ...     uuid_string(col("uuid"), col("name")).alias("NAMED_UUID")
        ... ).collect()
        >>> expected_uuid = "dc0b6f65-fca6-5b4b-9d37-ccc3fde1f3e2"
        >>> result[0]["NAMED_UUID"] == expected_uuid
        True
    """
    if uuid is None and name is None:
        return builtin("uuid_string", _emit_ast=_emit_ast)()

    # Fail fast with an actionable message instead of forwarding None to the
    # column-conversion helper, whose error would not mention the pairing rule.
    if uuid is None or name is None:
        raise TypeError(
            "uuid_string() requires 'uuid' and 'name' to be provided together, "
            "or both to be omitted."
        )

    uuid_col = _to_col_if_str(uuid, "uuid_string")
    name_col = _to_col_if_str(name, "uuid_string")
    return builtin("uuid_string", _emit_ast=_emit_ast)(uuid_col, name_col)

0 commit comments

Comments
 (0)