Skip to content

Commit 8da868b

Browse files
Merge branch 'refs/heads/main' into feature/aherrera/SNOW-2500535-ConditionalAndNumeric
# Conflicts: # CHANGELOG.md # src/snowflake/snowpark/_functions/scalar_functions.py
2 parents 64866df + db1b20f commit 8da868b

File tree

9 files changed

+434
-27
lines changed

9 files changed

+434
-27
lines changed

CHANGELOG.md

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,18 +10,30 @@
1010
- Added support for `Session.udf_profiler`.
1111
- Added support for `functions.ai_translate`.
1212
- Added support for the following functions in `functions.py`:
13-
- Conditional expressions:
14-
- `booland_agg`
15-
- `boolxor_agg`
16-
- `regr_valy`
17-
- `zeroifnull`
13+
- String and Binary functions:
14+
- `base64_decode_binary`
15+
- `compress`
16+
- `decompress_binary`
17+
- `decompress_string`
18+
- `md5_binary`
19+
- `md5_number_lower64`
20+
- `md5_number_upper64`
21+
- `sha1_binary`
22+
- `sha2_binary`
23+
- `soundex_p123`
24+
25+
- Conditional expressions:
26+
- `booland_agg`
27+
- `boolxor_agg`
28+
- `regr_valy`
29+
- `zeroifnull`
1830

19-
- Numeric expressions:
20-
- `cot`
21-
- `mod`
22-
- `pi`
23-
- `square`
24-
- `width_bucket`
31+
- Numeric expressions:
32+
- `cot`
33+
- `mod`
34+
- `pi`
35+
- `square`
36+
- `width_bucket`
2537

2638
#### Improvements
2739

@@ -121,7 +133,11 @@
121133
- `expanding.std`
122134
- `expanding.var`
123135
- `expanding.sem`
136+
- `cumsum`
137+
- `cummin`
138+
- `cummax`
124139
- Make faster pandas disabled by default (opt-in instead of opt-out).
140+
- Improve performance of `drop_duplicates` by avoiding joins when `keep!=False` in faster pandas.
125141

126142
## 1.42.0 (2025-10-28)
127143

docs/source/snowpark/functions.rst

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ Functions
9898
avg
9999
base64
100100
base64_decode_string
101+
base64_decode_binary
101102
base64_encode
102103
bit_length
103104
bitand
@@ -141,6 +142,7 @@ Functions
141142
collect_list
142143
collect_set
143144
column
145+
compress
144146
concat
145147
concat_ws
146148
contains
@@ -193,6 +195,8 @@ Functions
193195
dayofweek
194196
dayofyear
195197
decode
198+
decompress_binary
199+
decompress_string
196200
degrees
197201
dense_rank
198202
desc
@@ -349,6 +353,9 @@ Functions
349353
md5
350354
mean
351355
median
356+
md5_binary
357+
md5_number_lower64
358+
md5_number_upper64
352359
min
353360
min_by
354361
minute
@@ -423,7 +430,9 @@ Functions
423430
sequence
424431
service
425432
sha1
433+
sha1_binary
426434
sha2
435+
sha2_binary
427436
sin
428437
sinh
429438
size
@@ -432,6 +441,7 @@ Functions
432441
snowflake_cortex_summarize
433442
sort_array
434443
soundex
444+
soundex_p123
435445
split
436446
sproc
437447
sql_expr

src/snowflake/snowpark/_functions/scalar_functions.py

Lines changed: 259 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3973,6 +3973,265 @@ def try_to_geometry(
39733973
return builtin("try_to_geometry", _emit_ast=_emit_ast)(c)
39743974

39753975

3976+
@publicapi
def base64_decode_binary(
    input_expr: ColumnOrName,
    alphabet: Optional[ColumnOrName] = None,
    _emit_ast: bool = True,
) -> Column:
    """
    Decodes a base64-encoded string into a binary value.

    Args:
        input_expr (ColumnOrName): The base64-encoded string to decode.
        alphabet (ColumnOrName, optional): The base64 alphabet used for decoding.
            When omitted, the argument is not passed and the standard base64
            alphabet applies.

    Returns:
        Column: The decoded data as a binary value.

    Examples::
        >>> from snowflake.snowpark.functions import col, lit
        >>> df = session.create_dataframe(["SEVMUA=="], schema=["input"])
        >>> df.select(base64_decode_binary(col("input")).alias("result")).collect()
        [Row(RESULT=bytearray(b'HELP'))]

        >>> df.select(base64_decode_binary(col('input'), lit('$')).alias('result')).collect()
        [Row(RESULT=bytearray(b'HELP'))]
    """
    from snowflake.snowpark.functions import builtin

    fn_name = "base64_decode_binary"
    call_args = [_to_col_if_str(input_expr, fn_name)]
    # Forward the alphabet only when the caller supplied one, so the
    # one-argument form of the SQL function is used otherwise.
    if alphabet is not None:
        call_args.append(_to_col_if_str(alphabet, fn_name))
    return builtin(fn_name, _emit_ast=_emit_ast)(*call_args)
4012+
4013+
4014+
@publicapi
def compress(
    input_val: ColumnOrName, method: ColumnOrName, _emit_ast: bool = True
) -> Column:
    """
    Compresses the input with the given compression method.

    Args:
        input_val (ColumnOrName): The value to compress.
        method (ColumnOrName): The compression method (e.g., "SNAPPY").

    Returns:
        Column: The compressed data as a binary value.

    Example::
        >>> from snowflake.snowpark.functions import lit
        >>> df = session.create_dataframe([['Snowflake'], ['Hello World']], schema=["input"])
        >>> df.select(compress(df["input"], lit("SNAPPY")).alias("compressed")).collect()
        [Row(COMPRESSED=bytearray(b'\\t Snowflake')), Row(COMPRESSED=bytearray(b'\\x0b(Hello World'))]
    """
    # Both arguments get the same str -> Column conversion, in call order.
    operands = [_to_col_if_str(arg, "compress") for arg in (input_val, method)]
    return builtin("compress", _emit_ast=_emit_ast)(*operands)
4036+
4037+
4038+
@publicapi
def decompress_binary(
    input_data: ColumnOrName, method: ColumnOrName, _emit_ast: bool = True
) -> Column:
    """
    Decompresses binary data using the specified compression method.

    Args:
        input_data (ColumnOrName): The compressed binary data to decompress.
        method (ColumnOrName): The compression method that was originally used
            to compress the data (e.g., "SNAPPY").

    Returns:
        Column: The decompressed data as a binary value.

    Examples::
        >>> from snowflake.snowpark.functions import to_binary, lit
        >>> df = session.create_dataframe([['0920536E6F77666C616B65']], schema=["compressed_hex"])
        >>> df.select(decompress_binary(to_binary(df["compressed_hex"], 'HEX'), lit("SNAPPY")).alias("decompressed")).collect()
        [Row(DECOMPRESSED=bytearray(b'Snowflake'))]
    """
    input_col = _to_col_if_str(input_data, "decompress_binary")
    method_col = _to_col_if_str(method, "decompress_binary")
    return builtin("decompress_binary", _emit_ast=_emit_ast)(input_col, method_col)
4062+
4063+
4064+
@publicapi
def decompress_string(
    input_data: ColumnOrName, method: ColumnOrName, _emit_ast: bool = True
) -> Column:
    """
    Decompresses a BINARY value using the specified compression method and returns the result as a string.

    Args:
        input_data (ColumnOrName): The compressed binary data to decompress.
        method (ColumnOrName): The compression method originally used to compress
            the data. Snowflake documents 'SNAPPY', 'ZLIB', 'ZSTD' and 'BZ2'.

    Returns:
        Column: The decompressed string.

    Example::

        >>> from snowflake.snowpark.functions import to_binary
        >>> df = session.create_dataframe([['0920536E6F77666C616B65', 'SNAPPY']], schema=["compressed_hex", "method"])
        >>> df.select(decompress_string(to_binary(df["compressed_hex"], 'HEX'), df["method"]).alias("decompressed")).collect()
        [Row(DECOMPRESSED='Snowflake')]
    """
    input_col = _to_col_if_str(input_data, "decompress_string")
    method_col = _to_col_if_str(method, "decompress_string")
    return builtin("decompress_string", _emit_ast=_emit_ast)(input_col, method_col)
4088+
4089+
4090+
@publicapi
def md5_binary(msg: ColumnOrName, _emit_ast: bool = True) -> Column:
    """
    Computes the MD5 hash of the input message and returns it as a binary value.

    Args:
        msg (ColumnOrName): The message to hash.

    Returns:
        Column: The MD5 hash as a binary value (bytearray).

    Examples::
        >>> from snowflake.snowpark import Row
        >>> from snowflake.snowpark.functions import col
        >>> df = session.create_dataframe([["Snowflake"], ["test"], [""]], schema=["msg"])
        >>> result = df.select(md5_binary(col("msg")).alias("md5_result")).collect()

        >>> expected = [
        ...     Row(MD5_RESULT=bytearray(b'\\xed\\xf1C\\x90u\\xa8:D\\x7f\\xb8\\xb60\\xdd\\xc9\\xc8\\xde')),  # "Snowflake"
        ...     Row(MD5_RESULT=bytearray(b"\\t\\x8fk\\xcdF!\\xd3s\\xca\\xdeN\\x83&'\\xb4\\xf6")),  # "test"
        ...     Row(MD5_RESULT=bytearray(b'\\xd4\\x1d\\x8c\\xd9\\x8f\\x00\\xb2\\x04\\xe9\\x80\\t\\x98\\xec\\xf8B~'))  # "" (empty)
        ... ]

        >>> assert result == expected
    """
    fn_name = "md5_binary"
    msg_col = _to_col_if_str(msg, fn_name)
    md5_fn = builtin(fn_name, _emit_ast=_emit_ast)
    return md5_fn(msg_col)
4117+
4118+
4119+
@publicapi
def md5_number_lower64(msg: ColumnOrName, _emit_ast: bool = True) -> Column:
    """
    Computes the MD5 hash of the input message and returns its lower 64 bits as a number.

    Args:
        msg (ColumnOrName): The message to hash.

    Returns:
        Column: A 64-bit number taken from the lower 64 bits of the MD5 hash.

    Examples::
        >>> from snowflake.snowpark.functions import col
        >>> df = session.create_dataframe([["Snowflake"], ["test"], ["hello"]], schema=["msg"])
        >>> df.select(md5_number_lower64(col("msg")).alias("result")).collect()
        [Row(RESULT=9203306159527282910), Row(RESULT=14618207765679027446), Row(RESULT=13362634815750784402)]
    """
    fn_name = "md5_number_lower64"
    msg_col = _to_col_if_str(msg, fn_name)
    lower64_fn = builtin(fn_name, _emit_ast=_emit_ast)
    return lower64_fn(msg_col)
4138+
4139+
4140+
@publicapi
def md5_number_upper64(msg: ColumnOrName, _emit_ast: bool = True) -> Column:
    """
    Computes the MD5 hash of the input message and returns its upper 64 bits as a number.

    Args:
        msg (ColumnOrName): The message to hash.

    Returns:
        Column: A number taken from the upper 64 bits of the MD5 hash.

    Examples::
        >>> from snowflake.snowpark.functions import col
        >>> df = session.create_dataframe([["Snowflake"], ["test"], ["hello"]], schema=["msg"])
        >>> df.select(md5_number_upper64(col("msg")).alias("result")).collect()
        [Row(RESULT=17145559544104499780), Row(RESULT=688887797400064883), Row(RESULT=6719722671305337462)]
    """
    fn_name = "md5_number_upper64"
    msg_col = _to_col_if_str(msg, fn_name)
    upper64_fn = builtin(fn_name, _emit_ast=_emit_ast)
    return upper64_fn(msg_col)
4159+
4160+
4161+
@publicapi
def sha1_binary(msg: ColumnOrName, _emit_ast: bool = True) -> Column:
    """
    Computes the SHA-1 hash of the input message and returns it as a binary value.

    Args:
        msg (ColumnOrName): The message to hash.

    Returns:
        Column: The SHA-1 hash as a binary value.

    Examples::
        >>> from snowflake.snowpark.functions import col
        >>> df = session.create_dataframe([["Snowflake"], ["test"], ["hello"]], schema=["msg"])
        >>> df.select(sha1_binary(col("msg")).alias("sha1_result")).collect()
        [Row(SHA1_RESULT=bytearray(b'\\xfd\\xa7k\\x0b\\xcc\\x1e\\x87\\xcf%\\x9b\\x1d\\x1e2q\\xd7oY\\x0f\\xb5\\xdd')), Row(SHA1_RESULT=bytearray(b'\\xa9J\\x8f\\xe5\\xcc\\xb1\\x9b\\xa6\\x1cL\\x08s\\xd3\\x91\\xe9\\x87\\x98/\\xbb\\xd3')), Row(SHA1_RESULT=bytearray(b'\\xaa\\xf4\\xc6\\x1d\\xdc\\xc5\\xe8\\xa2\\xda\\xbe\\xde\\x0f;H,\\xd9\\xae\\xa9CM'))]
    """
    fn_name = "sha1_binary"
    msg_col = _to_col_if_str(msg, fn_name)
    sha1_fn = builtin(fn_name, _emit_ast=_emit_ast)
    return sha1_fn(msg_col)
4180+
4181+
4182+
@publicapi
def sha2_binary(
    msg: ColumnOrName,
    digest_size: Optional[ColumnOrName] = None,
    _emit_ast: bool = True,
) -> Column:
    """
    Returns a binary SHA-2 hash of the input message. The digest size determines the hash algorithm used.

    Args:
        msg (ColumnOrName): The input message to hash.
        digest_size (ColumnOrName, optional): The digest size in bits. Valid values are 224, 256, 384, and 512.
            Defaults to 256 if not specified. A plain string is treated as a column name,
            so pass a literal size as ``lit(224)``.

    Returns:
        Column: A binary representation of the SHA-2 hash.

    Examples::
        >>> from snowflake.snowpark.functions import col, lit
        >>> df = session.create_dataframe([["Snowflake"], ["test"], ["hello"]], schema=["msg"])
        >>> df.select(sha2_binary(col("msg")).alias("result")).collect()
        [Row(RESULT=bytearray(b'\\x1d\\xbdY\\xf6a\\xd6\\x8b\\x90rO!\\x08C\\x96\\xb8eIqs\\xe4\\xd2qOM\\x91\\xcf\\x05\\xfa_\\xc5\\xe1\\x8d')), Row(RESULT=bytearray(b'\\x9f\\x86\\xd0\\x81\\x88L}e\\x9a/\\xea\\xa0\\xc5Z\\xd0\\x15\\xa3\\xbfO\\x1b+\\x0b\\x82,\\xd1]l\\x15\\xb0\\xf0\\n\\x08')), Row(RESULT=bytearray(b',\\xf2M\\xba_\\xb0\\xa3\\x0e&\\xe8;*\\xc5\\xb9\\xe2\\x9e\\x1b\\x16\\x1e\\\\\\x1f\\xa7B^s\\x043b\\x93\\x8b\\x98$'))]
        >>> df.select(sha2_binary(col("msg"), lit(224)).alias("result")).collect()
        [Row(RESULT=bytearray(b'bg\\xd3\\xd7\\xa5\\x99)\\xe6\\x86M\\xd4\\xb77\\xd9\\x8e>\\xf8V\\x9d\\x9f\\x88\\xa7FfG\\x83\\x852')), Row(RESULT=bytearray(b'\\x90\\xa3\\xed\\x9e2\\xb2\\xaa\\xf4\\xc6\\x1cA\\x0e\\xb9%Ba\\x19\\xe1\\xa9\\xdcS\\xd4(j\\xde\\x99\\xa8\\t')), Row(RESULT=bytearray(b'\\xea\\t\\xae\\x9c\\xc6v\\x8cP\\xfc\\xee\\x90>\\xd0TUn[\\xfc\\x83G\\x90\\x7f\\x12Y\\x8a\\xa2A\\x93'))]
    """
    c = _to_col_if_str(msg, "sha2_binary")
    # Omit the size argument entirely when it was not given so the
    # server-side default applies.
    if digest_size is None:
        return builtin("sha2_binary", _emit_ast=_emit_ast)(c)
    d = _to_col_if_str(digest_size, "sha2_binary")
    return builtin("sha2_binary", _emit_ast=_emit_ast)(c, d)
4210+
4211+
4212+
@publicapi
def soundex_p123(varchar_expr: ColumnOrName, _emit_ast: bool = True) -> Column:
    """
    Returns a phonetic (Soundex) representation of a string, so that names or words
    that sound similar map to the same code. This is useful for fuzzy matching and searching.

    Unlike plain ``soundex``, the code number of the second letter is retained when the
    first and second letters share the same Soundex number (e.g. "Pfister" yields ``P123``).

    Args:
        varchar_expr (ColumnOrName): The string expression to encode.

    Returns:
        Column: The Soundex P123 encoded string.

    Examples::
        >>> from snowflake.snowpark.functions import col
        >>> df = session.create_dataframe([["Pfister"], ["Lloyd"], ["Smith"], ["Johnson"]], schema=["name"])
        >>> df.select(soundex_p123(col("name")).alias("soundex_result")).collect()
        [Row(SOUNDEX_RESULT='P123'), Row(SOUNDEX_RESULT='L430'), Row(SOUNDEX_RESULT='S530'), Row(SOUNDEX_RESULT='J525')]
    """
    fn_name = "soundex_p123"
    text_col = _to_col_if_str(varchar_expr, fn_name)
    soundex_fn = builtin(fn_name, _emit_ast=_emit_ast)
    return soundex_fn(text_col)
4233+
4234+
39764235
@publicapi
39774236
def booland_agg(expr: ColumnOrName, _emit_ast: bool = True) -> Column:
39784237
"""

0 commit comments

Comments
 (0)