Skip to content

Commit 1bbf28a

Browse files
Add support for scalar string and binary functions - part 3
1 parent 30fa5f3 commit 1bbf28a

File tree

3 files changed

+239
-0
lines changed

3 files changed

+239
-0
lines changed

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,14 @@
5757
- `st_geometryfromwkt`
5858
- `try_to_geography`
5959
- `try_to_geometry`
60+
- String & binary functions:
61+
- `strtok`
62+
- `try_base64_decode_binary`
63+
- `try_base64_decode_string`
64+
- `try_hex_decode_binary`
65+
- `try_hex_decode_string`
66+
- `unicode`
67+
- `uuid_string`
6068
- Added a parameter to enable and disable automatic column name aliasing for `interval_day_time_from_parts` and `interval_year_month_from_parts` functions.
6169

6270
#### Bug Fixes

docs/source/snowpark/functions.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,11 @@ Functions
495495
st_geographyfromwkt
496496
st_geometryfromwkb
497497
st_geometryfromwkt
498+
strtok
499+
try_base64_decode_binary
500+
try_base64_decode_string
501+
try_hex_decode_binary
502+
try_hex_decode_string
498503
try_to_geography
499504
try_to_geometry
500505
substr
@@ -540,6 +545,8 @@ Functions
540545
udf
541546
udtf
542547
unbase64
548+
unicode
549+
uuid_string
543550
uniform
544551
unix_timestamp
545552
upper

src/snowflake/snowpark/_functions/scalar_functions.py

Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3971,3 +3971,227 @@ def try_to_geometry(
39713971
return builtin("try_to_geometry", _emit_ast=_emit_ast)(c, allow_invalid_col)
39723972
else:
39733973
return builtin("try_to_geometry", _emit_ast=_emit_ast)(c)
3974+
3975+
3976+
@publicapi
def strtok(
    string: ColumnOrName,
    delimiter: ColumnOrName = None,
    part_nr: ColumnOrName = None,
    _emit_ast: bool = True,
) -> Column:
    """
    Tokenizes a string with the given set of delimiters and returns the requested part.

    Args:
        string (ColumnOrName): The string to be tokenized.
        delimiter (ColumnOrName, optional): A set of delimiters. Each character in the
            delimiter string is treated as a delimiter. If not specified, defaults to a
            single space character.
        part_nr (ColumnOrName, optional): The requested part number (1-based). If not
            specified, the first part is returned; when the string contains none of the
            delimiters, that first part is the whole string.

    Returns:
        Column: The requested part of the tokenized string.

    Examples::
        >>> from snowflake.snowpark.functions import col, lit
        >>> df = session.create_dataframe([["a.b.c"]], schema=["string_col"])
        >>> df.select(strtok(col("string_col")).alias("result")).collect()
        [Row(RESULT='a.b.c')]
        >>> df.select(strtok(col("string_col"), lit(".")).alias("result")).collect()
        [Row(RESULT='a')]
        >>> df.select(strtok(col("string_col"), lit("."), lit(2)).alias("result")).collect()
        [Row(RESULT='b')]
        >>> df2 = session.create_dataframe([["user@example.com"]], schema=["string_col"])
        >>> df2.select(strtok(col("string_col"), lit("@."), lit(1)).alias("result")).collect()
        [Row(RESULT='user')]
        >>> df2.select(strtok(col("string_col"), lit("@."), lit(3)).alias("result")).collect()
        [Row(RESULT='com')]
    """
    string_col = _to_col_if_str(string, "strtok")

    if part_nr is None:
        # Only forward the arguments the caller actually supplied so the
        # server-side defaults apply to the rest.
        if delimiter is None:
            return builtin("strtok", _emit_ast=_emit_ast)(string_col)
        delimiter_col = _to_col_if_str(delimiter, "strtok")
        return builtin("strtok", _emit_ast=_emit_ast)(string_col, delimiter_col)

    # STRTOK arguments are positional: a part number cannot be sent without a
    # delimiter, so the documented default (a single space) is made explicit.
    delimiter_col = (
        lit(" ") if delimiter is None else _to_col_if_str(delimiter, "strtok")
    )
    part_nr_col = _to_col_if_str(part_nr, "strtok")
    return builtin("strtok", _emit_ast=_emit_ast)(
        string_col, delimiter_col, part_nr_col
    )
4024+
4025+
4026+
@publicapi
def try_base64_decode_binary(
    input_expr: ColumnOrName, alphabet: ColumnOrName = None, _emit_ast: bool = True
) -> Column:
    """
    Converts a base64-encoded string into binary data, yielding None instead of an
    error when the input is not valid base64.

    Args:
        input_expr (ColumnOrName): The base64-encoded string to decode.
        alphabet (ColumnOrName, optional): The base64 alphabet to decode with. When
            omitted, the standard base64 alphabet is used.

    Returns:
        Column: A column holding the decoded binary data, or None for invalid input.

    Examples::
        >>> from snowflake.snowpark.functions import base64_encode
        >>> df = session.create_dataframe(["HELP", "TEST"], schema=["input"])
        >>> df.select(try_base64_decode_binary(base64_encode(df["input"]))).collect()
        [Row(TRY_BASE64_DECODE_BINARY(BASE64_ENCODE("INPUT"))=bytearray(b'HELP')), Row(TRY_BASE64_DECODE_BINARY(BASE64_ENCODE("INPUT"))=bytearray(b'TEST'))]

        >>> df2 = session.create_dataframe(["SEVMUA==", "VEVTVA=="], schema=["encoded"])
        >>> df2.select(try_base64_decode_binary(df2["encoded"])).collect()
        [Row(TRY_BASE64_DECODE_BINARY("ENCODED")=bytearray(b'HELP')), Row(TRY_BASE64_DECODE_BINARY("ENCODED")=bytearray(b'TEST'))]

        >>> df3 = session.create_dataframe(["invalid_base64!"], schema=["bad_input"])
        >>> df3.select(try_base64_decode_binary(df3["bad_input"])).collect()
        [Row(TRY_BASE64_DECODE_BINARY("BAD_INPUT")=None)]
    """
    sql_name = "try_base64_decode_binary"
    encoded = _to_col_if_str(input_expr, sql_name)

    # Without a custom alphabet the single-argument form is emitted.
    if alphabet is None:
        return builtin(sql_name, _emit_ast=_emit_ast)(encoded)

    custom_alphabet = _to_col_if_str(alphabet, sql_name)
    return builtin(sql_name, _emit_ast=_emit_ast)(encoded, custom_alphabet)
4063+
4064+
4065+
@publicapi
def try_base64_decode_string(
    input_expr: ColumnOrName, alphabet: ColumnOrName = None, _emit_ast: bool = True
) -> Column:
    """
    Converts a base64-encoded string back into a string. Input that is not valid
    base64 produces NULL rather than an error.

    Args:
        input_expr (ColumnOrName): The base64-encoded string to decode.
        alphabet (ColumnOrName, optional): The base64 alphabet to decode with. When
            omitted, the standard base64 alphabet is used.

    Returns:
        Column: The decoded string, or NULL when the input is not valid base64.

    Examples::
        >>> df = session.create_dataframe([["SEVMTE8="]], schema=["encoded"])
        >>> df.select(try_base64_decode_string(df["encoded"]).alias('result')).collect()
        [Row(RESULT='HELLO')]

        >>> df = session.create_dataframe([["invalid_base64"]], schema=["encoded"])
        >>> df.select(try_base64_decode_string(df["encoded"]).alias('result')).collect()
        [Row(RESULT=None)]

        >>> df = session.create_dataframe([["SEVMTE8="]], schema=["encoded"])
        >>> df.select(try_base64_decode_string(df["encoded"], lit('$')).alias('result')).collect()
        [Row(RESULT='HELLO')]
    """
    sql_name = "try_base64_decode_string"
    encoded = _to_col_if_str(input_expr, sql_name)

    # Without a custom alphabet the single-argument form is emitted.
    if alphabet is None:
        return builtin(sql_name, _emit_ast=_emit_ast)(encoded)

    custom_alphabet = _to_col_if_str(alphabet, sql_name)
    return builtin(sql_name, _emit_ast=_emit_ast)(encoded, custom_alphabet)
4098+
4099+
4100+
@publicapi
def try_hex_decode_binary(input_expr: ColumnOrName, _emit_ast: bool = True) -> Column:
    """
    Converts a hex-encoded string into binary data, yielding None when the input is
    not a valid hex string.

    Args:
        input_expr (ColumnOrName): The hex-encoded string to turn into binary data.

    Returns:
        Column: The decoded binary data as bytearray, or None for invalid input.

    Examples::
        >>> from snowflake.snowpark.functions import col
        >>> df = session.create_dataframe([["41426162"], ["48656C6C6F"], ["576F726C64"]], schema=["hex_string"])
        >>> df.select(try_hex_decode_binary(col("hex_string")).alias("decoded_binary")).collect()
        [Row(DECODED_BINARY=bytearray(b'ABab')), Row(DECODED_BINARY=bytearray(b'Hello')), Row(DECODED_BINARY=bytearray(b'World'))]
    """
    sql_name = "try_hex_decode_binary"
    hex_col = _to_col_if_str(input_expr, sql_name)
    return builtin(sql_name, _emit_ast=_emit_ast)(hex_col)
4119+
4120+
4121+
@publicapi
def try_hex_decode_string(input_expr: ColumnOrName, _emit_ast: bool = True) -> Column:
    """
    Converts a hex-encoded string back into the string it encodes, yielding None when
    the input is not a valid hex string.

    Args:
        input_expr (ColumnOrName): The hex-encoded string to decode.

    Returns:
        Column: The decoded string, or None when the input is not valid hex.

    Examples::
        >>> df = session.create_dataframe([["41614262"], ["127"], ["invalid_hex"]], schema=["hex_input"])
        >>> df.select(try_hex_decode_string(df["hex_input"]).alias("decoded")).collect()
        [Row(DECODED='AaBb'), Row(DECODED=None), Row(DECODED=None)]
    """
    sql_name = "try_hex_decode_string"
    hex_col = _to_col_if_str(input_expr, sql_name)
    return builtin(sql_name, _emit_ast=_emit_ast)(hex_col)
4139+
4140+
4141+
@publicapi
def unicode(input_str: ColumnOrName, _emit_ast: bool = True) -> Column:
    """
    Gives the Unicode code point of the first character of a string.

    Args:
        input_str (ColumnOrName): The string column (or column name) whose first
            character is inspected.

    Returns:
        Column: The Unicode code point of the first character; 0 for empty strings.

    Examples::
        >>> from snowflake.snowpark.functions import col
        >>> df = session.create_dataframe([['a'], ['❄'], ['cde'], ['']], schema=["input_str"])
        >>> df.select(unicode(col("input_str")).alias("unicode_result")).collect()
        [Row(UNICODE_RESULT=97), Row(UNICODE_RESULT=10052), Row(UNICODE_RESULT=99), Row(UNICODE_RESULT=0)]
    """
    sql_name = "unicode"
    text_col = _to_col_if_str(input_str, sql_name)
    return builtin(sql_name, _emit_ast=_emit_ast)(text_col)
4160+
4161+
4162+
@publicapi
def uuid_string(
    uuid: ColumnOrName = None, name: ColumnOrName = None, _emit_ast: bool = True
) -> Column:
    """
    Returns a universally unique identifier (UUID) as a string.

    Args:
        uuid (ColumnOrName, optional): The namespace UUID as a string. If provided, generates a UUID based on this namespace.
        name (ColumnOrName, optional): The name to use for UUID generation. Used in combination with uuid parameter.

    Returns:
        Column: The UUID string.

    Examples::
        >>> df = session.create_dataframe([["test"]], schema=["a"])
        >>> df.select(uuid_string().alias("random_uuid")).collect()  # doctest: +SKIP
        [Row(RANDOM_UUID='...')]

        >>> df.select(uuid_string("fe971b24-9572-4005-b22f-351e9c09274d", "foo").alias("named_uuid")).collect()  # doctest: +SKIP
        [Row(NAMED_UUID='...')]

        >>> df.select(uuid_string("fe971b24-9572-4005-b22f-351e9c09274d").alias("uuid_with_namespace")).collect()  # doctest: +SKIP
        [Row(UUID_WITH_NAMESPACE='...')]

        >>> df.select(uuid_string(name="foo").alias("uuid_with_name")).collect()  # doctest: +SKIP
        [Row(UUID_WITH_NAME='...')]
    """
    # Forward only the arguments the caller supplied, in (uuid, name) order.
    # The values are passed to the builtin unchanged. Building one argument
    # list and returning a single call guarantees every combination yields a
    # Column; previously the name-only path built the call but never returned
    # it, so callers received None.
    # NOTE(review): Snowflake documents UUID_STRING with either zero or two
    # arguments - confirm the single-argument forms are accepted server-side.
    supplied_args = [arg for arg in (uuid, name) if arg is not None]
    return builtin("uuid_string", _emit_ast=_emit_ast)(*supplied_args)

0 commit comments

Comments
 (0)