Skip to content

Commit 7798054

Browse files
Chris Boumalhabcboumalh
authored andcommitted
[SPARK-54179][SQL][FOLLOW-UP] Add Dataframe API support for Tuple sketches
### What changes were proposed in this pull request? This PR adds DataFrame API support for Tuple sketch functions that were previously only available in Spark SQL. The following functions are now accessible through the DataFrame API: - `tuple_sketch_agg_*` - Aggregation functions for creating tuple sketches - `tuple_union_agg_*` - Union aggregation functions - `tuple_intersection_agg_*` - Intersection aggregation functions - `tuple_sketch_estimate_*` - Estimation functions for tuple sketches - `tuple_sketch_summary_*` - Summary functions for tuple sketches - `tuple_sketch_theta_*` - Theta extraction functions - `tuple_union_*` - Union operations - `tuple_intersection_*` - Intersection operations - `tuple_difference_*` - Difference operations This is a follow-up to SPARK-54179 that completes the DataFrame API parity for tuple sketch operations. ### Why are the changes needed? This PR expands the sketch feature's adoption in DataFrame cases. ### Does this PR introduce _any_ user-facing change? yes, it provides the DataFrame API for Tuple sketches. ``` * tuple_sketch_agg_* * tuple_union_agg_* * tuple_intersection_agg_* * tuple_sketch_estimate_* * tuple_sketch_summary_* * tuple_sketch_theta_* * tuple_union_* * tuple_intersection_* * tuple_difference_* ``` ### How was this patch tested? Added tests to DataFrameAggregateSuite.scala ### Was this patch authored or co-authored using generative AI tooling? Generated-by: Claude Sonnet 4.5 Closes #54041 from cboumalh/cboumalh-tuple-dataframe-followup. Lead-authored-by: Chris Boumalhab <cboumalh@amazon.com> Co-authored-by: Chris Boumalhab <84485659+cboumalh@users.noreply.github.com> Signed-off-by: Daniel Tenedorio <daniel.tenedorio@databricks.com>
1 parent 6112a0b commit 7798054

File tree

8 files changed

+3679
-7
lines changed

8 files changed

+3679
-7
lines changed

docs/sql-ref-sketch-aggregates.md

Lines changed: 403 additions & 7 deletions
Large diffs are not rendered by default.

python/docs/source/reference/pyspark.sql/functions.rst

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -501,6 +501,12 @@ Aggregate Functions
501501
theta_intersection_agg
502502
theta_sketch_agg
503503
theta_union_agg
504+
tuple_intersection_agg_double
505+
tuple_intersection_agg_integer
506+
tuple_sketch_agg_double
507+
tuple_sketch_agg_integer
508+
tuple_union_agg_double
509+
tuple_union_agg_integer
504510
try_avg
505511
try_sum
506512
var_pop
@@ -665,6 +671,18 @@ Misc Functions
665671
theta_intersection
666672
theta_sketch_estimate
667673
theta_union
674+
tuple_difference_double
675+
tuple_difference_integer
676+
tuple_intersection_double
677+
tuple_intersection_integer
678+
tuple_sketch_estimate_double
679+
tuple_sketch_estimate_integer
680+
tuple_sketch_summary_double
681+
tuple_sketch_summary_integer
682+
tuple_sketch_theta_double
683+
tuple_sketch_theta_integer
684+
tuple_union_double
685+
tuple_union_integer
668686
try_aes_decrypt
669687
try_reflect
670688
typeof

python/pyspark/sql/connect/functions/builtin.py

Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4575,6 +4575,96 @@ def theta_intersection_agg(
45754575
theta_intersection_agg.__doc__ = pysparkfuncs.theta_intersection_agg.__doc__
45764576

45774577

4578+
def tuple_sketch_agg_double(
4579+
key: "ColumnOrName",
4580+
summary: "ColumnOrName",
4581+
lgNomEntries: Optional[Union[int, Column]] = None,
4582+
mode: Optional[Union[str, Column]] = None,
4583+
) -> Column:
4584+
fn = "tuple_sketch_agg_double"
4585+
_lgNomEntries = lit(12) if lgNomEntries is None else lit(lgNomEntries)
4586+
_mode = lit("sum") if mode is None else lit(mode)
4587+
4588+
return _invoke_function_over_columns(fn, key, summary, _lgNomEntries, _mode)
4589+
4590+
4591+
tuple_sketch_agg_double.__doc__ = pysparkfuncs.tuple_sketch_agg_double.__doc__
4592+
4593+
4594+
def tuple_sketch_agg_integer(
4595+
key: "ColumnOrName",
4596+
summary: "ColumnOrName",
4597+
lgNomEntries: Optional[Union[int, Column]] = None,
4598+
mode: Optional[Union[str, Column]] = None,
4599+
) -> Column:
4600+
fn = "tuple_sketch_agg_integer"
4601+
_lgNomEntries = lit(12) if lgNomEntries is None else lit(lgNomEntries)
4602+
_mode = lit("sum") if mode is None else lit(mode)
4603+
4604+
return _invoke_function_over_columns(fn, key, summary, _lgNomEntries, _mode)
4605+
4606+
4607+
tuple_sketch_agg_integer.__doc__ = pysparkfuncs.tuple_sketch_agg_integer.__doc__
4608+
4609+
4610+
def tuple_union_agg_double(
4611+
col: "ColumnOrName",
4612+
lgNomEntries: Optional[Union[int, Column]] = None,
4613+
mode: Optional[Union[str, Column]] = None,
4614+
) -> Column:
4615+
fn = "tuple_union_agg_double"
4616+
_lgNomEntries = lit(12) if lgNomEntries is None else lit(lgNomEntries)
4617+
_mode = lit("sum") if mode is None else lit(mode)
4618+
4619+
return _invoke_function_over_columns(fn, col, _lgNomEntries, _mode)
4620+
4621+
4622+
tuple_union_agg_double.__doc__ = pysparkfuncs.tuple_union_agg_double.__doc__
4623+
4624+
4625+
def tuple_union_agg_integer(
4626+
col: "ColumnOrName",
4627+
lgNomEntries: Optional[Union[int, Column]] = None,
4628+
mode: Optional[Union[str, Column]] = None,
4629+
) -> Column:
4630+
fn = "tuple_union_agg_integer"
4631+
_lgNomEntries = lit(12) if lgNomEntries is None else lit(lgNomEntries)
4632+
_mode = lit("sum") if mode is None else lit(mode)
4633+
4634+
return _invoke_function_over_columns(fn, col, _lgNomEntries, _mode)
4635+
4636+
4637+
tuple_union_agg_integer.__doc__ = pysparkfuncs.tuple_union_agg_integer.__doc__
4638+
4639+
4640+
def tuple_intersection_agg_double(
4641+
col: "ColumnOrName",
4642+
mode: Optional[Union[str, Column]] = None,
4643+
) -> Column:
4644+
fn = "tuple_intersection_agg_double"
4645+
if mode is None:
4646+
return _invoke_function_over_columns(fn, col)
4647+
else:
4648+
return _invoke_function_over_columns(fn, col, lit(mode))
4649+
4650+
4651+
tuple_intersection_agg_double.__doc__ = pysparkfuncs.tuple_intersection_agg_double.__doc__
4652+
4653+
4654+
def tuple_intersection_agg_integer(
4655+
col: "ColumnOrName",
4656+
mode: Optional[Union[str, Column]] = None,
4657+
) -> Column:
4658+
fn = "tuple_intersection_agg_integer"
4659+
if mode is None:
4660+
return _invoke_function_over_columns(fn, col)
4661+
else:
4662+
return _invoke_function_over_columns(fn, col, lit(mode))
4663+
4664+
4665+
tuple_intersection_agg_integer.__doc__ = pysparkfuncs.tuple_intersection_agg_integer.__doc__
4666+
4667+
45784668
def kll_sketch_agg_bigint(
45794669
col: "ColumnOrName",
45804670
k: Optional[Union[int, Column]] = None,
@@ -4816,6 +4906,140 @@ def theta_difference(col1: "ColumnOrName", col2: "ColumnOrName") -> Column:
48164906
theta_difference.__doc__ = pysparkfuncs.theta_difference.__doc__
48174907

48184908

4909+
def tuple_sketch_estimate_double(col: "ColumnOrName") -> Column:
4910+
fn = "tuple_sketch_estimate_double"
4911+
return _invoke_function_over_columns(fn, col)
4912+
4913+
4914+
tuple_sketch_estimate_double.__doc__ = pysparkfuncs.tuple_sketch_estimate_double.__doc__
4915+
4916+
4917+
def tuple_sketch_estimate_integer(col: "ColumnOrName") -> Column:
4918+
fn = "tuple_sketch_estimate_integer"
4919+
return _invoke_function_over_columns(fn, col)
4920+
4921+
4922+
tuple_sketch_estimate_integer.__doc__ = pysparkfuncs.tuple_sketch_estimate_integer.__doc__
4923+
4924+
4925+
def tuple_sketch_summary_double(
4926+
col: "ColumnOrName",
4927+
mode: Optional[Union[str, Column]] = None,
4928+
) -> Column:
4929+
fn = "tuple_sketch_summary_double"
4930+
if mode is None:
4931+
return _invoke_function_over_columns(fn, col)
4932+
else:
4933+
return _invoke_function_over_columns(fn, col, lit(mode))
4934+
4935+
4936+
tuple_sketch_summary_double.__doc__ = pysparkfuncs.tuple_sketch_summary_double.__doc__
4937+
4938+
4939+
def tuple_sketch_summary_integer(
4940+
col: "ColumnOrName",
4941+
mode: Optional[Union[str, Column]] = None,
4942+
) -> Column:
4943+
fn = "tuple_sketch_summary_integer"
4944+
if mode is None:
4945+
return _invoke_function_over_columns(fn, col)
4946+
else:
4947+
return _invoke_function_over_columns(fn, col, lit(mode))
4948+
4949+
4950+
tuple_sketch_summary_integer.__doc__ = pysparkfuncs.tuple_sketch_summary_integer.__doc__
4951+
4952+
4953+
def tuple_sketch_theta_double(col: "ColumnOrName") -> Column:
4954+
return _invoke_function_over_columns("tuple_sketch_theta_double", col)
4955+
4956+
4957+
tuple_sketch_theta_double.__doc__ = pysparkfuncs.tuple_sketch_theta_double.__doc__
4958+
4959+
4960+
def tuple_sketch_theta_integer(col: "ColumnOrName") -> Column:
4961+
return _invoke_function_over_columns("tuple_sketch_theta_integer", col)
4962+
4963+
4964+
tuple_sketch_theta_integer.__doc__ = pysparkfuncs.tuple_sketch_theta_integer.__doc__
4965+
4966+
4967+
def tuple_union_double(
4968+
col1: "ColumnOrName",
4969+
col2: "ColumnOrName",
4970+
lgNomEntries: Optional[Union[int, Column]] = None,
4971+
mode: Optional[Union[str, Column]] = None,
4972+
) -> Column:
4973+
fn = "tuple_union_double"
4974+
_lgNomEntries = lit(12) if lgNomEntries is None else lit(lgNomEntries)
4975+
_mode = lit("sum") if mode is None else lit(mode)
4976+
4977+
return _invoke_function_over_columns(fn, col1, col2, _lgNomEntries, _mode)
4978+
4979+
4980+
tuple_union_double.__doc__ = pysparkfuncs.tuple_union_double.__doc__
4981+
4982+
4983+
def tuple_union_integer(
4984+
col1: "ColumnOrName",
4985+
col2: "ColumnOrName",
4986+
lgNomEntries: Optional[Union[int, Column]] = None,
4987+
mode: Optional[Union[str, Column]] = None,
4988+
) -> Column:
4989+
fn = "tuple_union_integer"
4990+
_lgNomEntries = lit(12) if lgNomEntries is None else lit(lgNomEntries)
4991+
_mode = lit("sum") if mode is None else lit(mode)
4992+
4993+
return _invoke_function_over_columns(fn, col1, col2, _lgNomEntries, _mode)
4994+
4995+
4996+
tuple_union_integer.__doc__ = pysparkfuncs.tuple_union_integer.__doc__
4997+
4998+
4999+
def tuple_intersection_double(
5000+
col1: "ColumnOrName",
5001+
col2: "ColumnOrName",
5002+
mode: Optional[Union[str, Column]] = None,
5003+
) -> Column:
5004+
fn = "tuple_intersection_double"
5005+
if mode is None:
5006+
return _invoke_function_over_columns(fn, col1, col2)
5007+
else:
5008+
return _invoke_function_over_columns(fn, col1, col2, lit(mode))
5009+
5010+
5011+
tuple_intersection_double.__doc__ = pysparkfuncs.tuple_intersection_double.__doc__
5012+
5013+
5014+
def tuple_intersection_integer(
5015+
col1: "ColumnOrName",
5016+
col2: "ColumnOrName",
5017+
mode: Optional[Union[str, Column]] = None,
5018+
) -> Column:
5019+
fn = "tuple_intersection_integer"
5020+
if mode is None:
5021+
return _invoke_function_over_columns(fn, col1, col2)
5022+
else:
5023+
return _invoke_function_over_columns(fn, col1, col2, lit(mode))
5024+
5025+
5026+
tuple_intersection_integer.__doc__ = pysparkfuncs.tuple_intersection_integer.__doc__
5027+
5028+
5029+
def tuple_difference_double(col1: "ColumnOrName", col2: "ColumnOrName") -> Column:
5030+
return _invoke_function_over_columns("tuple_difference_double", col1, col2)
5031+
5032+
5033+
tuple_difference_double.__doc__ = pysparkfuncs.tuple_difference_double.__doc__
5034+
5035+
5036+
def tuple_difference_integer(col1: "ColumnOrName", col2: "ColumnOrName") -> Column:
5037+
return _invoke_function_over_columns("tuple_difference_integer", col1, col2)
5038+
5039+
5040+
tuple_difference_integer.__doc__ = pysparkfuncs.tuple_difference_integer.__doc__
5041+
5042+
48195043
# Predicates Function
48205044

48215045

python/pyspark/sql/functions/__init__.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,12 @@
421421
"theta_intersection_agg",
422422
"theta_sketch_agg",
423423
"theta_union_agg",
424+
"tuple_intersection_agg_double",
425+
"tuple_intersection_agg_integer",
426+
"tuple_sketch_agg_double",
427+
"tuple_sketch_agg_integer",
428+
"tuple_union_agg_double",
429+
"tuple_union_agg_integer",
424430
"try_avg",
425431
"try_sum",
426432
"var_pop",
@@ -531,6 +537,18 @@
531537
"theta_intersection",
532538
"theta_sketch_estimate",
533539
"theta_union",
540+
"tuple_difference_double",
541+
"tuple_difference_integer",
542+
"tuple_intersection_double",
543+
"tuple_intersection_integer",
544+
"tuple_sketch_estimate_double",
545+
"tuple_sketch_estimate_integer",
546+
"tuple_sketch_summary_double",
547+
"tuple_sketch_summary_integer",
548+
"tuple_sketch_theta_double",
549+
"tuple_sketch_theta_integer",
550+
"tuple_union_double",
551+
"tuple_union_integer",
534552
"try_aes_decrypt",
535553
"try_reflect",
536554
"typeof",

0 commit comments

Comments
 (0)