|
5 | 5 | from functools import cached_property |
6 | 6 | import os |
7 | 7 | import sys |
8 | | -from typing import Dict, List, Optional |
| 8 | +from typing import Dict, List, Optional, Set, Tuple |
9 | 9 | import itertools |
10 | 10 | import re |
11 | 11 | from typing import TYPE_CHECKING |
| 12 | +import snowflake.snowpark |
12 | 13 | from snowflake.snowpark._internal.ast.batch import get_dependent_bind_ids |
13 | 14 | from snowflake.snowpark._internal.ast.utils import __STRING_INTERNING_MAP__ |
14 | 15 | import snowflake.snowpark._internal.proto.generated.ast_pb2 as proto |
| 16 | +from ast import literal_eval |
15 | 17 | from snowflake.snowpark._internal.ast.utils import extract_src_from_expr |
16 | 18 |
|
17 | 19 | if TYPE_CHECKING: |
@@ -220,6 +222,32 @@ def _format_source_location(src: Optional[proto.SrcPosition]) -> str: |
220 | 222 | return lines_info |
221 | 223 |
|
222 | 224 |
|
def _extract_source_locations_from_plan(plan: "SnowflakePlan") -> List[str]:
    """
    Collect the unique Python source locations recorded on a SnowflakePlan's AST IDs.

    Args:
        plan: The SnowflakePlan object to extract source locations from

    Returns:
        List of unique source location strings (e.g., "file.py: line 42"),
        in first-seen order.
    """
    if plan.df_ast_ids is None:
        return []

    # Hoist the bind-statement cache lookup out of the loop.
    bind_cache = plan.session._ast_batch._bind_stmt_cache
    seen = set()
    locations = []
    for ast_id in plan.df_ast_ids:
        stmt = bind_cache.get(ast_id)
        if stmt is None:
            continue
        location = _format_source_location(extract_src_from_expr(stmt.bind.expr))
        # Keep only non-empty, not-yet-seen locations, preserving order.
        if location and location not in seen:
            seen.add(location)
            locations.append(location)
    return locations
| 250 | + |
223 | 251 | def get_python_source_from_sql_error(top_plan: "SnowflakePlan", error_msg: str) -> str: |
224 | 252 | """ |
225 | 253 | Extract SQL error line number and map it back to Python source code. We use the |
@@ -249,17 +277,8 @@ def get_python_source_from_sql_error(top_plan: "SnowflakePlan", error_msg: str) |
249 | 277 | ) |
250 | 278 |
|
251 | 279 | plan = get_plan_from_line_numbers(top_plan, sql_line_number) |
252 | | - source_locations = [] |
253 | | - found_locations = set() |
254 | | - if plan.df_ast_ids is not None: |
255 | | - for ast_id in plan.df_ast_ids: |
256 | | - bind_stmt = plan.session._ast_batch._bind_stmt_cache.get(ast_id) |
257 | | - if bind_stmt is not None: |
258 | | - src = extract_src_from_expr(bind_stmt.bind.expr) |
259 | | - location = _format_source_location(src) |
260 | | - if location != "" and location not in found_locations: |
261 | | - found_locations.add(location) |
262 | | - source_locations.append(location) |
| 280 | + source_locations = _extract_source_locations_from_plan(plan) |
| 281 | + |
263 | 282 | if source_locations: |
264 | 283 | if len(source_locations) == 1: |
265 | 284 | return f"\nSQL compilation error corresponds to Python source at {source_locations[0]}.\n" |
@@ -434,3 +453,201 @@ def sql_contains_object_creation(sql_query: str, target_object: str) -> bool: |
434 | 453 | return f"\nObject '{object_name}' was first referenced at {location}.\n" |
435 | 454 |
|
436 | 455 | return "" |
| 456 | + |
| 457 | + |
class QueryProfiler:
    """
    A class for profiling Snowflake queries and analyzing operator statistics.
    It can generate tree visualizations and output tables of operator statistics.

    Can be used as a context manager so the output file handle is always
    closed::

        with QueryProfiler(session, "profile.txt") as profiler:
            profiler.profile_query(query_id)
    """

    def __init__(
        self, session: "snowflake.snowpark.Session", output_file: Optional[str] = None
    ) -> None:
        """
        Args:
            session: Session whose connection is used to run profiling queries.
            output_file: Optional path. When given, output is appended to this
                file; otherwise output goes to stdout.
        """
        self.session = session
        # Append mode so repeated profiling runs accumulate in one file.
        self.file_handle = (
            open(output_file, "a", encoding="utf-8") if output_file else None
        )

    def __enter__(self) -> "QueryProfiler":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        # Ensure the output file handle is released even on error.
        self.close()

    def _get_node_info(self, row: Dict) -> Dict:
        """Normalize one raw operator-stats row into a plain node-info dict.

        Missing/NULL numeric fields are coerced to 0 and missing text fields
        to "N/A" so downstream formatting never sees None.
        """
        parent_operators = row.get("PARENT_OPERATORS")
        # Keep None as a "no parents" marker; otherwise store the string form
        # (e.g. "[1, 2]") for later literal_eval parsing.
        parent_operators = (
            str(parent_operators) if parent_operators is not None else None
        )
        return {
            "id": row.get("OPERATOR_ID") or 0,
            "parent_operators": parent_operators,
            "type": row.get("OPERATOR_TYPE") or "N/A",
            "input_rows": row.get("INPUT_ROWS") or 0,
            "output_rows": row.get("OUTPUT_ROWS") or 0,
            "row_multiple": row.get("ROW_MULTIPLE") or 0,
            "exec_time": row.get("OVERALL_PERCENTAGE") or 0,
            "attributes": row.get("OPERATOR_ATTRIBUTES") or "N/A",
        }

    def build_operator_tree(self, operators_data: List[Dict]) -> Tuple[Dict, Dict, Set]:
        """
        Build a tree structure from raw operator data for query profiling.

        Args:
            operators_data (List[Dict]): A list of dictionaries containing operator statistics.
                The keys include operator id, operator type, parent operators, input rows, output rows,
                row multiple, overall percentage, and operator attributes.

        Returns:
            Tuple[Dict, Dict, Set]: A tuple containing:
                - nodes (Dict[int, Dict]): Dictionary mapping operator IDs to node information
                - children (Dict[int, List[int]]): Dictionary mapping operator IDs to lists of child operator IDs
                - root_nodes (Set[int]): Set of operator IDs that are root nodes (have no parents)
        """
        nodes = {}
        children = {}
        root_nodes = set()
        for row in operators_data:
            node_info = self._get_node_info(row)

            nodes[node_info["id"]] = node_info
            children[node_info["id"]] = []

            if node_info["parent_operators"] is None:
                root_nodes.add(node_info["id"])
            else:
                # parent_operators is a string like "[1, 2, 3]"; literal_eval
                # safely parses it into a list of ints (no arbitrary eval).
                parent_ids = literal_eval(node_info["parent_operators"])
                for parent_id in parent_ids:
                    # A parent may appear before its own row has been seen.
                    if parent_id not in children:
                        children[parent_id] = []
                    children[parent_id].append(node_info["id"])

        return nodes, children, root_nodes

    def _write_output(self, message: str) -> None:
        """Helper function to write output to either console or file."""
        if self.file_handle:
            self.file_handle.write(message + "\n")
        else:
            sys.stdout.write(message + "\n")

    def close(self) -> None:
        """Close the file handle if it exists."""
        if self.file_handle:
            self.file_handle.close()

    def print_operator_tree(
        self,
        nodes: Dict[int, Dict],
        children: Dict[int, List[int]],
        node_id: int,
        prefix: str = "",
        is_last: bool = True,
    ) -> None:
        """
        Print a visual tree representation of query operators with their statistics.

        Args:
            nodes (Dict[int, Dict]): Dictionary mapping operator IDs to node information.
            children (Dict[int, List[int]]): Dictionary mapping operator IDs to lists of child operator IDs.
            node_id (int): The ID of the current operator node to print.
            prefix (str, optional): String prefix for tree formatting (used for indentation).
                Defaults to "".
            is_last (bool, optional): Whether this node is the last child of its parent.
                Used for proper tree connector formatting. Defaults to True.

        Returns:
            None: This function writes output to a file or prints and doesn't return a value.
        """
        node = nodes[node_id]

        connector = "└── " if is_last else "├── "

        node_info = (
            f"[{node['id']}] {node['type']} "
            f"(In: {node['input_rows']:,}, Out: {node['output_rows']:,}, "
            f"Mult: {node['row_multiple']:.2f}, Time: {node['exec_time']:.2f}%)"
        )

        self._write_output(f"{prefix}{connector}{node_info}")

        # Last children get blank continuation; others keep the vertical bar.
        extension = "    " if is_last else "│   "
        new_prefix = prefix + extension

        child_list = children.get(node_id, [])
        for i, child_id in enumerate(child_list):
            is_last_child = i == len(child_list) - 1
            self.print_operator_tree(
                nodes, children, child_id, new_prefix, is_last_child
            )

    def profile_query(
        self,
        query_id: str,
    ) -> None:
        """
        Profile a query and save the results to a file.

        Args:
            query_id: The query ID to profile

        Returns:
            None - output either to the console or to the file specified by output_file
        """
        # query_id is interpolated into the SQL text below; escape single
        # quotes so a malformed/untrusted ID cannot break out of the literal.
        safe_query_id = query_id.replace("'", "''")
        stats_query = f"""
        SELECT
            operator_id,
            operator_type,
            operator_attributes,
            operator_statistics:input_rows::number as input_rows,
            operator_statistics:output_rows::number as output_rows,
            CASE
                WHEN operator_statistics:input_rows::number > 0
                THEN operator_statistics:output_rows::number / operator_statistics:input_rows::number
                ELSE NULL
            END as row_multiple,
            execution_time_breakdown:overall_percentage::number as overall_percentage
        FROM TABLE(get_query_operator_stats('{safe_query_id}'))
        ORDER BY step_id, operator_id
        """
        # Close the raw cursor even if the query fails (was leaked before).
        stats_connection = self.session._conn._conn.cursor()
        try:
            stats_connection.execute(stats_query)
            raw_results = stats_connection.fetchall()
            column_names = [desc[0] for desc in stats_connection.description]
        finally:
            stats_connection.close()

        stats_result = [dict(zip(column_names, row)) for row in raw_results]

        nodes, children, root_nodes = self.build_operator_tree(stats_result)

        self._write_output(f"\n=== Analyzing Query {query_id} ===")
        self._write_output(f"\n{'='*80}")
        self._write_output("QUERY OPERATOR TREE")
        self._write_output(f"{'='*80}")

        root_list = sorted(list(root_nodes))
        for i, root_id in enumerate(root_list):
            is_last_root = i == len(root_list) - 1
            self.print_operator_tree(nodes, children, root_id, "", is_last_root)

        self._write_output(f"\n{'='*160}")
        self._write_output("DETAILED OPERATOR STATISTICS")
        self._write_output(f"{'='*160}")
        self._write_output(
            f"{'Operator':<15} {'Type':<15} {'Input Rows':<12} {'Output Rows':<12} {'Row Multiple':<12} {'Overall %':<12} {'Attributes':<50}",
        )
        self._write_output(f"{'='*160}")

        for row in stats_result:
            node_info = self._get_node_info(row)
            # Flatten multi-line attributes so each operator stays on one row.
            operator_attrs = (
                node_info["attributes"].replace("\n", " ").replace("  ", " ")
            )

            self._write_output(
                f"{node_info['id']:<15} {node_info['type']:<15} {node_info['input_rows']:<12} {node_info['output_rows']:<12} {node_info['row_multiple']:<12.2f} {node_info['exec_time']:<12} {operator_attrs:<50}",
            )

        self._write_output(f"{'='*160}")
0 commit comments