@@ -74,6 +74,15 @@ def create_abfss_path(
7474 return path
7575
7676
def create_abfss_path_from_path(
    lakehouse_id: UUID, workspace_id: UUID, file_path: str
) -> str:
    """
    Create an abfss URI for a file within a lakehouse.

    Parameters
    ----------
    lakehouse_id : uuid.UUID
        ID of the lakehouse containing the file.
    workspace_id : uuid.UUID
        ID of the workspace containing the lakehouse.
    file_path : str
        Path of the file relative to the lakehouse root.

    Returns
    -------
    str
        The abfss URI for the file.
    """

    # Resolve the default filesystem endpoint (from the fabric context
    # setting "fs.defaultFS") once; it forms the host part of the URI.
    fs_endpoint = _get_default_file_path()

    return f"abfss://{workspace_id}@{fs_endpoint}/{lakehouse_id}/{file_path}"
84+
85+
7786def _get_default_file_path () -> str :
7887
7988 default_file_storage = _get_fabric_context_setting (name = "fs.defaultFS" )
@@ -1547,31 +1556,10 @@ def _get_column_aggregate(
15471556 path = create_abfss_path (lakehouse_id , workspace_id , table_name , schema_name )
15481557 df = _read_delta_table (path )
15491558
1550- if isinstance (column_name , str ):
1551- result = _get_aggregate (
1552- df = df ,
1553- column_name = column_name ,
1554- function = function ,
1555- default_value = default_value ,
1556- )
1557- elif isinstance (column_name , list ):
1558- result = {}
1559- for col in column_name :
1560- result [col ] = _get_aggregate (
1561- df = df ,
1562- column_name = col ,
1563- function = function ,
1564- default_value = default_value ,
1565- )
1566- else :
1567- raise TypeError ("column_name must be a string or a list of strings." )
1568-
1569- return result
1570-
1571-
1572- def _get_aggregate (df , column_name , function , default_value : int = 0 ) -> int :
1559+ function = function .lower ()
15731560
1574- function = function .upper ()
1561+ if isinstance (column_name , str ):
1562+ column_name = [column_name ]
15751563
15761564 if _pure_python_notebook ():
15771565 import polars as pl
@@ -1581,36 +1569,76 @@ def _get_aggregate(df, column_name, function, default_value: int = 0) -> int:
15811569
15821570 df = pl .from_pandas (df )
15831571
1584- # Perform aggregation
1585- if "DISTINCT" in function :
1586- if isinstance (df [column_name ].dtype , pl .Decimal ):
1587- result = df [column_name ].cast (pl .Float64 ).n_unique ()
1572+ def get_expr (col ):
1573+ col_dtype = df .schema [col ]
1574+
1575+ if "approx" in function :
1576+ return pl .col (col ).unique ().count ().alias (col )
1577+ elif "distinct" in function :
1578+ if col_dtype == pl .Decimal :
1579+ return pl .col (col ).cast (pl .Float64 ).n_unique ().alias (col )
1580+ else :
1581+ return pl .col (col ).n_unique ().alias (col )
1582+ elif function == "sum" :
1583+ return pl .col (col ).sum ().alias (col )
1584+ elif function == "min" :
1585+ return pl .col (col ).min ().alias (col )
1586+ elif function == "max" :
1587+ return pl .col (col ).max ().alias (col )
1588+ elif function == "count" :
1589+ return pl .col (col ).count ().alias (col )
1590+ elif function in {"avg" , "mean" }:
1591+ return pl .col (col ).mean ().alias (col )
15881592 else :
1589- result = df [column_name ].n_unique ()
1590- elif "APPROX" in function :
1591- result = df [column_name ].unique ().shape [0 ]
1592- else :
1593- try :
1594- result = getattr (df [column_name ], function .lower ())()
1595- except AttributeError :
15961593 raise ValueError (f"Unsupported function: { function } " )
15971594
1598- return result if result is not None else default_value
1595+ exprs = [get_expr (col ) for col in column_name ]
1596+ aggs = df .select (exprs ).to_dict (as_series = False )
1597+
1598+ if len (column_name ) == 1 :
1599+ result = aggs [column_name [0 ]][0 ] or default_value
1600+ else :
1601+ result = {col : aggs [col ][0 ] for col in column_name }
15991602 else :
1600- from pyspark .sql .functions import approx_count_distinct
1601- from pyspark .sql import functions as F
1603+ from pyspark .sql .functions import (
1604+ count ,
1605+ sum ,
1606+ min ,
1607+ max ,
1608+ avg ,
1609+ approx_count_distinct ,
1610+ countDistinct ,
1611+ )
16021612
1603- if isinstance (df , pd .DataFrame ):
1604- df = _create_spark_dataframe (df )
1613+ result = None
1614+ if "approx" in function :
1615+ spark_func = approx_count_distinct
1616+ elif "distinct" in function :
1617+ spark_func = countDistinct
1618+ elif function == "count" :
1619+ spark_func = count
1620+ elif function == "sum" :
1621+ spark_func = sum
1622+ elif function == "min" :
1623+ spark_func = min
1624+ elif function == "max" :
1625+ spark_func = max
1626+ elif function == "avg" :
1627+ spark_func = avg
1628+ else :
1629+ raise ValueError (f"Unsupported function: { function } " )
1630+
1631+ agg_exprs = []
1632+ for col in column_name :
1633+ agg_exprs .append (spark_func (col ).alias (col ))
16051634
1606- if "DISTINCT" in function :
1607- result = df .select (F .count_distinct (F .col (column_name )))
1608- elif "APPROX" in function :
1609- result = df .select (approx_count_distinct (column_name ))
1635+ aggs = df .agg (* agg_exprs ).collect ()[0 ]
1636+ if len (column_name ) == 1 :
1637+ result = aggs [0 ] or default_value
16101638 else :
1611- result = df . selectExpr ( f" { function } ( { column_name } )" )
1639+ result = { col : aggs [ col ] for col in column_name }
16121640
1613- return result . collect ()[ 0 ][ 0 ] or default_value
1641+ return result
16141642
16151643
16161644def _create_spark_dataframe (df : pd .DataFrame ):
@@ -2222,3 +2250,23 @@ def _xml_to_dict(element):
22222250 element .text .strip () if element .text and element .text .strip () else None
22232251 )
22242252 return data
2253+
2254+
def file_exists(file_path: str) -> bool:
    """
    Check if a file exists in the given path.

    Parameters
    ----------
    file_path : str
        The path to the file.

    Returns
    -------
    bool
        True if the file exists, False otherwise.
    """

    import notebookutils

    # fs.ls raises when the path does not exist, so the previous
    # `len(notebookutils.fs.ls(file_path)) > 0` could never return False
    # (it raised instead) and reported False for an existing empty
    # directory. fs.exists is the documented non-raising existence check.
    return notebookutils.fs.exists(file_path)
0 commit comments