Skip to content

Commit 041e624

Browse files
Implementing dataframe profiler (#3504)
1 parent 88c85ec commit 041e624

18 files changed

+1263
-31
lines changed

CHANGELOG.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
- Added support for the following functions in `functions.py`:
1818
- `ai_embed`
1919
- `try_parse_json`
20+
- Added a dataframe profiler. To use, you can call get_execution_profile() on your desired dataframe. This profiler reports the queries executed to evaluate a dataframe, and statistics about each of the query operators.
2021

2122
#### Bug Fixes
2223

@@ -64,8 +65,7 @@
6465
- Added debuggability improvements to eagerly validate dataframe schema metadata. Enable it using `snowflake.snowpark.context.configure_development_features()`.
6566
- Added a new function `snowflake.snowpark.dataframe.map_in_pandas` that allows users map a function across a dataframe. The mapping function takes an iterator of pandas dataframes as input and provides one as output.
6667
- Added a TTL cache to describe queries. Repeated queries in a 15-second interval will use the cached value rather than re-querying Snowflake.
67-
- Added a parameter `fetch_with_process` to `DataFrameReader.dbapi` (PrPr) to enable multiprocessing for parallel data fetching in
68-
local ingestion. By default, local ingestion uses multithreading. Multiprocessing may improve performance for CPU-bound tasks like Parquet file generation.
68+
- Added a parameter `fetch_with_process` to `DataFrameReader.dbapi` (PrPr) to enable multiprocessing for parallel data fetching in local ingestion. By default, local ingestion uses multithreading. Multiprocessing may improve performance for CPU-bound tasks like Parquet file generation.
6969
- Added a new function `snowflake.snowpark.functions.model` that allows users to call methods of a model.
7070

7171
#### Improvements

src/snowflake/snowpark/_internal/debug_utils.py

Lines changed: 229 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,15 @@
55
from functools import cached_property
66
import os
77
import sys
8-
from typing import Dict, List, Optional
8+
from typing import Dict, List, Optional, Set, Tuple
99
import itertools
1010
import re
1111
from typing import TYPE_CHECKING
12+
import snowflake.snowpark
1213
from snowflake.snowpark._internal.ast.batch import get_dependent_bind_ids
1314
from snowflake.snowpark._internal.ast.utils import __STRING_INTERNING_MAP__
1415
import snowflake.snowpark._internal.proto.generated.ast_pb2 as proto
16+
from ast import literal_eval
1517
from snowflake.snowpark._internal.ast.utils import extract_src_from_expr
1618

1719
if TYPE_CHECKING:
@@ -220,6 +222,32 @@ def _format_source_location(src: Optional[proto.SrcPosition]) -> str:
220222
return lines_info
221223

222224

225+
def _extract_source_locations_from_plan(plan: "SnowflakePlan") -> List[str]:
    """
    Collect the unique Python source locations recorded on a SnowflakePlan.

    Walks the plan's dataframe AST ids, resolves each to its bind statement in
    the session's AST batch cache, and formats the statement's source position.

    Args:
        plan: The SnowflakePlan object to extract source locations from.

    Returns:
        List of unique source location strings (e.g., "file.py: line 42"),
        in first-seen order.
    """
    ordered_locations: List[str] = []
    seen: Set[str] = set()

    if plan.df_ast_ids is None:
        return ordered_locations

    for ast_id in plan.df_ast_ids:
        bind_stmt = plan.session._ast_batch._bind_stmt_cache.get(ast_id)
        if bind_stmt is None:
            continue
        location = _format_source_location(extract_src_from_expr(bind_stmt.bind.expr))
        # Deduplicate while preserving the order locations were first seen.
        if location and location not in seen:
            seen.add(location)
            ordered_locations.append(location)

    return ordered_locations
250+
223251
def get_python_source_from_sql_error(top_plan: "SnowflakePlan", error_msg: str) -> str:
224252
"""
225253
Extract SQL error line number and map it back to Python source code. We use the
@@ -249,17 +277,8 @@ def get_python_source_from_sql_error(top_plan: "SnowflakePlan", error_msg: str)
249277
)
250278

251279
plan = get_plan_from_line_numbers(top_plan, sql_line_number)
252-
source_locations = []
253-
found_locations = set()
254-
if plan.df_ast_ids is not None:
255-
for ast_id in plan.df_ast_ids:
256-
bind_stmt = plan.session._ast_batch._bind_stmt_cache.get(ast_id)
257-
if bind_stmt is not None:
258-
src = extract_src_from_expr(bind_stmt.bind.expr)
259-
location = _format_source_location(src)
260-
if location != "" and location not in found_locations:
261-
found_locations.add(location)
262-
source_locations.append(location)
280+
source_locations = _extract_source_locations_from_plan(plan)
281+
263282
if source_locations:
264283
if len(source_locations) == 1:
265284
return f"\nSQL compilation error corresponds to Python source at {source_locations[0]}.\n"
@@ -434,3 +453,201 @@ def sql_contains_object_creation(sql_query: str, target_object: str) -> bool:
434453
return f"\nObject '{object_name}' was first referenced at {location}.\n"
435454

436455
return ""
456+
457+
458+
class QueryProfiler:
    """
    A class for profiling Snowflake queries and analyzing operator statistics.

    It can generate tree visualizations and output tables of operator
    statistics, writing either to stdout or to a user-supplied output file.
    """

    def __init__(
        self, session: "snowflake.snowpark.Session", output_file: Optional[str] = None
    ) -> None:
        """
        Args:
            session: Snowpark session whose connection is used to run the
                operator-stats query.
            output_file: Optional path of a file to write profiling output to.
                Opened in append mode so repeated runs accumulate in one file.
                When omitted, output goes to stdout. Callers should invoke
                ``close()`` when done with a file-backed profiler.
        """
        self.session = session
        if output_file:
            self.file_handle = open(output_file, "a", encoding="utf-8")
        else:
            self.file_handle = None

    def _get_node_info(self, row: Dict) -> Dict:
        """Normalize one raw operator-stats row into a plain dict.

        Missing/NULL numeric fields default to 0 and missing text fields to
        "N/A" so downstream formatting never has to handle ``None``.
        """
        parent_operators = row.get("PARENT_OPERATORS")
        parent_operators = (
            str(parent_operators) if parent_operators is not None else None
        )
        node_info = {
            "id": row.get("OPERATOR_ID") or 0,
            "parent_operators": parent_operators,
            "type": row.get("OPERATOR_TYPE") or "N/A",
            "input_rows": row.get("INPUT_ROWS") or 0,
            "output_rows": row.get("OUTPUT_ROWS") or 0,
            "row_multiple": row.get("ROW_MULTIPLE") or 0,
            "exec_time": row.get("OVERALL_PERCENTAGE") or 0,
            "attributes": row.get("OPERATOR_ATTRIBUTES") or "N/A",
        }
        return node_info

    def build_operator_tree(self, operators_data: List[Dict]) -> Tuple[Dict, Dict, Set]:
        """
        Build a tree structure from raw operator data for query profiling.

        Args:
            operators_data (List[Dict]): A list of dictionaries containing operator statistics.
                The keys include operator id, operator type, parent operators, input rows,
                output rows, row multiple, overall percentage, and operator attributes.

        Returns:
            Tuple[Dict, Dict, Set]: A tuple containing:
                - nodes (Dict[int, Dict]): Dictionary mapping operator IDs to node information
                - children (Dict[int, List[int]]): Dictionary mapping operator IDs to lists of child operator IDs
                - root_nodes (Set[int]): Set of operator IDs that are root nodes (have no parents)
        """
        nodes = {}
        children = {}
        root_nodes = set()
        for row in operators_data:
            node_info = self._get_node_info(row)

            nodes[node_info["id"]] = node_info
            children[node_info["id"]] = []

            if node_info["parent_operators"] is None:
                root_nodes.add(node_info["id"])
            else:
                # parent_operators arrives as a string like "[1, 2, 3]";
                # literal_eval safely parses it into a list of ints.
                parent_ids = literal_eval(node_info["parent_operators"])
                for parent_id in parent_ids:
                    # A parent may be referenced before (or without) its own
                    # row appearing; create its child list on demand.
                    if parent_id not in children:
                        children[parent_id] = []
                    children[parent_id].append(node_info["id"])

        return nodes, children, root_nodes

    def _write_output(self, message: str) -> None:
        """Helper function to write output to either console or file."""
        if self.file_handle:
            self.file_handle.write(message + "\n")
        else:
            sys.stdout.write(message + "\n")

    def close(self) -> None:
        """Close the output file handle if one was opened (idempotent)."""
        if self.file_handle:
            self.file_handle.close()
            # Drop the handle so a second close() is a no-op and any later
            # writes fall back to stdout instead of hitting a closed file.
            self.file_handle = None

    def print_operator_tree(
        self,
        nodes: Dict[int, Dict],
        children: Dict[int, List[int]],
        node_id: int,
        prefix: str = "",
        is_last: bool = True,
    ) -> None:
        """
        Print a visual tree representation of query operators with their statistics.

        Args:
            nodes (Dict[int, Dict]): Dictionary mapping operator IDs to node information.
            children (Dict[int, List[int]]): Dictionary mapping operator IDs to lists of child operator IDs.
            node_id (int): The ID of the current operator node to print.
            prefix (str, optional): String prefix for tree formatting (used for indentation).
                Defaults to "".
            is_last (bool, optional): Whether this node is the last child of its parent.
                Used for proper tree connector formatting. Defaults to True.

        Returns:
            None: This function writes output to a file or prints and doesn't return a value.
        """
        node = nodes[node_id]

        connector = "└── " if is_last else "├── "

        node_info = (
            f"[{node['id']}] {node['type']} "
            f"(In: {node['input_rows']:,}, Out: {node['output_rows']:,}, "
            f"Mult: {node['row_multiple']:.2f}, Time: {node['exec_time']:.2f}%)"
        )

        self._write_output(f"{prefix}{connector}{node_info}")

        # Continue the vertical rail only while more siblings follow.
        extension = "    " if is_last else "│   "
        new_prefix = prefix + extension

        child_list = children.get(node_id, [])
        for i, child_id in enumerate(child_list):
            is_last_child = i == len(child_list) - 1
            self.print_operator_tree(
                nodes, children, child_id, new_prefix, is_last_child
            )

    def _fetch_operator_stats(self, query_id: str) -> List[Dict]:
        """Run GET_QUERY_OPERATOR_STATS for ``query_id`` and return rows as dicts.

        Returns:
            One dict per operator, keyed by upper-case column name.
        """
        # NOTE(review): query_id is interpolated directly into the SQL text.
        # Query IDs normally come from Snowflake itself, but callers must not
        # pass untrusted strings here; consider a server-side bind parameter.
        stats_query = f"""
        SELECT
            operator_id,
            operator_type,
            operator_attributes,
            operator_statistics:input_rows::number as input_rows,
            operator_statistics:output_rows::number as output_rows,
            CASE
                WHEN operator_statistics:input_rows::number > 0
                THEN operator_statistics:output_rows::number / operator_statistics:input_rows::number
                ELSE NULL
            END as row_multiple,
            execution_time_breakdown:overall_percentage::number as overall_percentage
        FROM TABLE(get_query_operator_stats('{query_id}'))
        ORDER BY step_id, operator_id
        """
        stats_connection = self.session._conn._conn.cursor()
        try:
            stats_connection.execute(stats_query)
            raw_results = stats_connection.fetchall()
            column_names = [desc[0] for desc in stats_connection.description]
        finally:
            # Always release the cursor, even when the query raises.
            stats_connection.close()
        return [dict(zip(column_names, row)) for row in raw_results]

    def profile_query(
        self,
        query_id: str,
    ) -> None:
        """
        Profile a query and save the results to a file.

        Args:
            query_id: The query ID to profile

        Returns:
            None - output either to the console or to the file specified by output_file
        """
        stats_result = self._fetch_operator_stats(query_id)

        nodes, children, root_nodes = self.build_operator_tree(stats_result)

        self._write_output(f"\n=== Analyzing Query {query_id} ===")
        self._write_output(f"\n{'='*80}")
        self._write_output("QUERY OPERATOR TREE")
        self._write_output(f"{'='*80}")

        root_list = sorted(list(root_nodes))
        for i, root_id in enumerate(root_list):
            is_last_root = i == len(root_list) - 1
            self.print_operator_tree(nodes, children, root_id, "", is_last_root)

        self._write_output(f"\n{'='*160}")
        self._write_output("DETAILED OPERATOR STATISTICS")
        self._write_output(f"{'='*160}")
        self._write_output(
            f"{'Operator':<15} {'Type':<15} {'Input Rows':<12} {'Output Rows':<12} {'Row Multiple':<12} {'Overall %':<12} {'Attributes':<50}",
        )
        self._write_output(f"{'='*160}")

        for row in stats_result:
            node_info = self._get_node_info(row)
            # Collapse newlines and runs of whitespace so attributes fit on one
            # table row. (The previous chained `.replace(" ", " ")` was a no-op.)
            operator_attrs = " ".join(str(node_info["attributes"]).split())

            self._write_output(
                f"{node_info['id']:<15} {node_info['type']:<15} {node_info['input_rows']:<12} {node_info['output_rows']:<12} {node_info['row_multiple']:<12.2f} {node_info['exec_time']:<12} {operator_attrs:<50}",
            )

        self._write_output(f"{'='*160}")

src/snowflake/snowpark/_internal/server_connection.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -434,7 +434,10 @@ def execute_and_notify_query_listener(
434434
notify_kwargs = {}
435435
if DATAFRAME_AST_PARAMETER in kwargs and is_ast_enabled():
436436
notify_kwargs["dataframeAst"] = kwargs[DATAFRAME_AST_PARAMETER]
437-
437+
if "_statement_params" in kwargs and kwargs["_statement_params"]:
438+
statement_params = kwargs["_statement_params"]
439+
if "_PLAN_UUID" in statement_params:
440+
notify_kwargs["dataframe_uuid"] = statement_params["_PLAN_UUID"]
438441
try:
439442
results_cursor = self._cursor.execute(query, **kwargs)
440443
except Exception as ex:
@@ -456,14 +459,23 @@ def execute_and_notify_query_listener(
456459
def execute_async_and_notify_query_listener(
    self, query: str, **kwargs: Any
) -> Dict[str, Any]:
    """Submit *query* asynchronously and notify registered query listeners.

    When statement params carry a ``_PLAN_UUID``, it is forwarded to listeners
    as ``dataframe_uuid`` on both the success and error notifications.
    """
    notify_kwargs = {}
    statement_params = kwargs.get("_statement_params")
    if statement_params and "_PLAN_UUID" in statement_params:
        notify_kwargs["dataframe_uuid"] = statement_params["_PLAN_UUID"]

    try:
        results_cursor = self._cursor.execute_async(query, **kwargs)
    except Error as err:
        # Listeners still hear about failed submissions.
        self.notify_query_listeners(
            QueryRecord(err.sfqid, err.query), is_error=True, **notify_kwargs
        )
        raise err

    self.notify_query_listeners(
        QueryRecord(results_cursor["queryId"], query), **notify_kwargs
    )
    return results_cursor
468480

469481
def execute_and_get_sfqid(

src/snowflake/snowpark/_internal/utils.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2029,16 +2029,19 @@ def find_interval_containing_line(intervals, line_number):
20292029
return -1
20302030

20312031
# traverse the plan tree to find the plan that contains the line number
2032-
stack = [(plan_node, line_number)]
2032+
stack = [(plan_node, line_number, None)]
20332033
while stack:
2034-
node, line_number = stack.pop()
2034+
node, line_number, df_ast_ids = stack.pop()
20352035
if isinstance(node, Selectable):
20362036
node = node.get_snowflake_plan(skip_schema_query=False)
2037+
if node.df_ast_ids is not None:
2038+
df_ast_ids = node.df_ast_ids
20372039
query_line_intervals = node.queries[-1].query_line_intervals
20382040
idx = find_interval_containing_line(query_line_intervals, line_number)
20392041
if idx >= 0:
20402042
uuid = query_line_intervals[idx].uuid
20412043
if node.uuid == uuid:
2044+
node.df_ast_ids = df_ast_ids
20422045
return node
20432046
else:
20442047
for child in node.children_plan_nodes:
@@ -2047,6 +2050,7 @@ def find_interval_containing_line(intervals, line_number):
20472050
(
20482051
child,
20492052
line_number - query_line_intervals[idx].start,
2053+
df_ast_ids,
20502054
)
20512055
)
20522056
break

0 commit comments

Comments
 (0)