
Commit 01b8f33

SNOW-1877318 combine telemetry usages (#2855)
1 parent 7b79ae0 commit 01b8f33

6 files changed: +94 -73 lines changed

src/snowflake/snowpark/_internal/compiler/large_query_breakdown.py (44 additions, 26 deletions)

@@ -4,7 +4,7 @@

 import logging
 from collections import defaultdict
-from typing import Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple

 from snowflake.snowpark._internal.analyzer.analyzer_utils import (
     drop_table_if_exists_statement,
@@ -46,7 +46,7 @@
 from snowflake.snowpark._internal.compiler.query_generator import QueryGenerator
 from snowflake.snowpark._internal.compiler.telemetry_constants import (
     CompilationStageTelemetryField,
-    InvalidNodesInBreakdownCategory,
+    NodeBreakdownCategory,
     SkipLargeQueryBreakdownCategory,
 )
 from snowflake.snowpark._internal.compiler.utils import (
@@ -255,35 +255,56 @@ def _try_to_breakdown_plan(self, root: TreeNode) -> List[LogicalPlan]:
             return [root]

         plans = []
-        final_partition_breakdown_summary = {}
+        self._current_breakdown_summary: Dict[str, Any] = {
+            CompilationStageTelemetryField.NUM_PARTITIONS_MADE.value: 0,
+            CompilationStageTelemetryField.NUM_PIPELINE_BREAKER_USED.value: 0,
+            CompilationStageTelemetryField.NUM_RELAXED_BREAKER_USED.value: 0,
+        }
         while complexity_score > self.complexity_score_upper_bound:
             child, validity_statistics = self._find_node_to_breakdown(root)
+            self._update_current_breakdown_summary(validity_statistics)
+
             if child is None:
-                final_partition_breakdown_summary = {
-                    k.value: validity_statistics.get(k, 0)
-                    for k in InvalidNodesInBreakdownCategory
-                }
                 _logger.debug(
                     f"Could not find a valid node for partitioning. "
-                    f"Skipping with root {complexity_score=} {final_partition_breakdown_summary=}"
+                    f"Skipping with root {complexity_score=} {self._current_breakdown_summary=}"
                 )
                 break

             partition = self._get_partitioned_plan(root, child)
             plans.append(partition)
             complexity_score = get_complexity_score(root)

-        final_partition_breakdown_summary[
-            CompilationStageTelemetryField.NUM_PARTITIONS_MADE.value
-        ] = len(plans)
-        self._breakdown_summary.append(final_partition_breakdown_summary)
-
+        self._breakdown_summary.append(self._current_breakdown_summary)
         plans.append(root)
         return plans

+    def _update_current_breakdown_summary(
+        self, validity_statistics: Dict[NodeBreakdownCategory, int]
+    ) -> None:
+        """Method to update the breakdown summary based on the validity statistics of the current root."""
+        if validity_statistics.get(NodeBreakdownCategory.VALID_NODE, 0) > 0:
+            self._current_breakdown_summary[
+                CompilationStageTelemetryField.NUM_PARTITIONS_MADE.value
+            ] += 1
+            self._current_breakdown_summary[
+                CompilationStageTelemetryField.NUM_PIPELINE_BREAKER_USED.value
+            ] += 1
+        elif validity_statistics.get(NodeBreakdownCategory.VALID_NODE_RELAXED, 0) > 0:
+            self._current_breakdown_summary[
+                CompilationStageTelemetryField.NUM_PARTITIONS_MADE.value
+            ] += 1
+            self._current_breakdown_summary[
+                CompilationStageTelemetryField.NUM_RELAXED_BREAKER_USED.value
+            ] += 1
+        else:  # no valid nodes found
+            self._current_breakdown_summary[
+                CompilationStageTelemetryField.FAILED_PARTITION_SUMMARY.value
+            ] = {k.value: validity_statistics.get(k, 0) for k in NodeBreakdownCategory}
+
     def _find_node_to_breakdown(
         self, root: TreeNode
-    ) -> Tuple[Optional[TreeNode], Dict[InvalidNodesInBreakdownCategory, int]]:
+    ) -> Tuple[Optional[TreeNode], Dict[NodeBreakdownCategory, int]]:
         """This method traverses the plan tree and partitions the plan based if a valid partition node
         if found. The steps involved are:
@@ -307,7 +328,7 @@ def _find_node_to_breakdown(
                     validity_status, score = self._is_node_valid_to_breakdown(
                         child, root
                     )
-                    if validity_status == InvalidNodesInBreakdownCategory.VALID_NODE:
+                    if validity_status == NodeBreakdownCategory.VALID_NODE:
                         # If the score for valid node is higher than the last candidate,
                         # update the candidate node and score.
                         if score > candidate_score:
@@ -317,10 +338,7 @@ def _find_node_to_breakdown(
                         # don't traverse subtrees if parent is a valid candidate
                         next_level.append(child)

-                    if (
-                        validity_status
-                        == InvalidNodesInBreakdownCategory.VALID_NODE_RELAXED
-                    ):
+                    if validity_status == NodeBreakdownCategory.VALID_NODE_RELAXED:
                         # Update the relaxed candidate node and score.
                         if score > relaxed_candidate_score:
                             relaxed_candidate_score = score
@@ -370,7 +388,7 @@ def _get_partitioned_plan(self, root: TreeNode, child: TreeNode) -> SnowflakePla

     def _is_node_valid_to_breakdown(
         self, node: TreeNode, root: TreeNode
-    ) -> Tuple[InvalidNodesInBreakdownCategory, int]:
+    ) -> Tuple[NodeBreakdownCategory, int]:
         """Method to check if a node is valid to breakdown based on complexity score and node type.

         Returns:
@@ -381,29 +399,29 @@ def _is_node_valid_to_breakdown(
         """
         score = get_complexity_score(node)
         is_valid = True
-        validity_status = InvalidNodesInBreakdownCategory.VALID_NODE
+        validity_status = NodeBreakdownCategory.VALID_NODE

         # check score bounds
         if score < self.complexity_score_lower_bound:
             is_valid = False
-            validity_status = InvalidNodesInBreakdownCategory.SCORE_BELOW_LOWER_BOUND
+            validity_status = NodeBreakdownCategory.SCORE_BELOW_LOWER_BOUND

         if score > self.complexity_score_upper_bound:
             is_valid = False
-            validity_status = InvalidNodesInBreakdownCategory.SCORE_ABOVE_UPPER_BOUND
+            validity_status = NodeBreakdownCategory.SCORE_ABOVE_UPPER_BOUND

         # check pipeline breaker condition
         if is_valid and not self._is_node_pipeline_breaker(node):
             if self._is_relaxed_pipeline_breaker(node):
-                validity_status = InvalidNodesInBreakdownCategory.VALID_NODE_RELAXED
+                validity_status = NodeBreakdownCategory.VALID_NODE_RELAXED
             else:
                 is_valid = False
-                validity_status = InvalidNodesInBreakdownCategory.NON_PIPELINE_BREAKER
+                validity_status = NodeBreakdownCategory.NON_PIPELINE_BREAKER

         # check external CTE ref condition
         if is_valid and self._contains_external_cte_ref(node, root):
             is_valid = False
-            validity_status = InvalidNodesInBreakdownCategory.EXTERNAL_CTE_REF
+            validity_status = NodeBreakdownCategory.EXTERNAL_CTE_REF

         if is_valid:
             _logger.debug(
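For reference, the bookkeeping added above can be exercised in isolation. The sketch below is a standalone paraphrase of _update_current_breakdown_summary, not the class method itself: it uses the literal key strings in place of CompilationStageTelemetryField, and the value strings for VALID_NODE, VALID_NODE_RELAXED, and EXTERNAL_CTE_REF are taken from the test expectations later in this commit.

from enum import Enum
from typing import Any, Dict


class NodeBreakdownCategory(Enum):
    SCORE_BELOW_LOWER_BOUND = "num_nodes_below_lower_bound"
    SCORE_ABOVE_UPPER_BOUND = "num_nodes_above_upper_bound"
    NON_PIPELINE_BREAKER = "num_non_pipeline_breaker_nodes"
    EXTERNAL_CTE_REF = "num_external_cte_ref_nodes"
    VALID_NODE = "num_valid_nodes"
    VALID_NODE_RELAXED = "num_valid_nodes_relaxed"


def update_breakdown_summary(
    summary: Dict[str, Any], stats: Dict[NodeBreakdownCategory, int]
) -> None:
    """Fold one partition attempt's validity statistics into the running summary."""
    if stats.get(NodeBreakdownCategory.VALID_NODE, 0) > 0:
        # A strict pipeline breaker was found and used for this partition.
        summary["num_partitions_made"] += 1
        summary["num_pipeline_breaker_used"] += 1
    elif stats.get(NodeBreakdownCategory.VALID_NODE_RELAXED, 0) > 0:
        # Only a relaxed pipeline breaker was available.
        summary["num_partitions_made"] += 1
        summary["num_relaxed_breaker_used"] += 1
    else:
        # No usable node: record why every candidate node was rejected.
        summary["failed_partition_summary"] = {
            k.value: stats.get(k, 0) for k in NodeBreakdownCategory
        }


summary = {
    "num_partitions_made": 0,
    "num_pipeline_breaker_used": 0,
    "num_relaxed_breaker_used": 0,
}
update_breakdown_summary(summary, {NodeBreakdownCategory.VALID_NODE: 1})
update_breakdown_summary(summary, {NodeBreakdownCategory.SCORE_BELOW_LOWER_BOUND: 3})
assert summary["num_partitions_made"] == 1
assert summary["failed_partition_summary"]["num_nodes_below_lower_bound"] == 3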

src/snowflake/snowpark/_internal/compiler/plan_compiler.py (4 additions, 4 deletions)

@@ -125,7 +125,7 @@ def compile(self) -> Dict[PlanQueryType, List[Query]]:
                 plot_plan_if_enabled(plan, f"cte_optimized_plan_{i}")

             # Large query breakdown
-            breakdown_failure_summary, skipped_summary = {}, {}
+            breakdown_summary, skipped_summary = {}, {}
             if session.large_query_breakdown_enabled:
                 large_query_breakdown = LargeQueryBreakdown(
                     session,
@@ -135,7 +135,7 @@ def compile(self) -> Dict[PlanQueryType, List[Query]]:
                 )
                 breakdown_result = large_query_breakdown.apply()
                 logical_plans = breakdown_result.logical_plans
-                breakdown_failure_summary = breakdown_result.breakdown_summary
+                breakdown_summary = breakdown_result.breakdown_summary
                 skipped_summary = breakdown_result.skipped_summary

             large_query_breakdown_end_time = time.time()
@@ -166,8 +166,8 @@ def compile(self) -> Dict[PlanQueryType, List[Query]]:
                 CompilationStageTelemetryField.COMPLEXITY_SCORE_BEFORE_COMPILATION.value: complexity_score_before_compilation,
                 CompilationStageTelemetryField.COMPLEXITY_SCORE_AFTER_CTE_OPTIMIZATION.value: complexity_scores_after_cte,
                 CompilationStageTelemetryField.COMPLEXITY_SCORE_AFTER_LARGE_QUERY_BREAKDOWN.value: complexity_scores_after_large_query_breakdown,
-                CompilationStageTelemetryField.BREAKDOWN_FAILURE_SUMMARY.value: breakdown_failure_summary,
-                CompilationStageTelemetryField.TYPE_LARGE_QUERY_BREAKDOWN_OPTIMIZATION_SKIPPED.value: skipped_summary,
+                CompilationStageTelemetryField.BREAKDOWN_SUMMARY.value: breakdown_summary,
+                CompilationStageTelemetryField.LARGE_QUERY_BREAKDOWN_OPTIMIZATION_SKIPPED.value: skipped_summary,
             }
             # add the extra optimization status
             summary_value.update(extra_optimization_status)
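The renamed keys feed into the compilation stage summary sent with the telemetry event. A rough sketch of the resulting payload with a single successful partition; the key strings come from CompilationStageTelemetryField, while the numeric scores are illustrative only:

summary_value = {
    "complexity_score_before_compilation": 120000,
    "complexity_score_after_cte_optimization": [95000],
    "complexity_score_after_large_query_breakdown": [60000, 40000],
    # previously reported as "breakdown_failure_summary"
    "breakdown_summary": [
        {
            "num_partitions_made": 1,
            "num_pipeline_breaker_used": 1,
            "num_relaxed_breaker_used": 0,
        }
    ],
    # previously reported under "snowpark_large_query_breakdown_optimization_skipped"
    "query_breakdown_optimization_skipped_reason": {},
}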

src/snowflake/snowpark/_internal/compiler/telemetry_constants.py (8 additions, 6 deletions)

@@ -19,11 +19,7 @@ class CompilationStageTelemetryField(Enum):
     QUERY_PLAN_COMPLEXITY = "query_plan_complexity"

     # types
-    TYPE_LARGE_QUERY_BREAKDOWN_OPTIMIZATION_SKIPPED = (
-        "snowpark_large_query_breakdown_optimization_skipped"
-    )
     TYPE_COMPILATION_STAGE_STATISTICS = "snowpark_compilation_stage_statistics"
-    TYPE_COMPILATION_STAGE_FAILED = "snowpark_compilation_stage_failed"
     TYPE_LARGE_QUERY_BREAKDOWN_UPDATE_COMPLEXITY_BOUNDS = (
         "snowpark_large_query_breakdown_update_complexity_bounds"
     )
@@ -37,22 +33,28 @@
     TIME_TAKEN_FOR_DEEP_COPY_PLAN = "time_taken_for_deep_copy_plan_sec"
     TIME_TAKEN_FOR_CTE_OPTIMIZATION = "time_taken_for_cte_optimization_sec"
     TIME_TAKEN_FOR_LARGE_QUERY_BREAKDOWN = "time_taken_for_large_query_breakdown_sec"
+    LARGE_QUERY_BREAKDOWN_OPTIMIZATION_SKIPPED = (
+        "query_breakdown_optimization_skipped_reason"
+    )

     # keys for repeated subquery elimination
     CTE_NODE_CREATED = "cte_node_created"

     # keys for large query breakdown
-    BREAKDOWN_FAILURE_SUMMARY = "breakdown_failure_summary"
+    BREAKDOWN_SUMMARY = "breakdown_summary"
     COMPLEXITY_SCORE_AFTER_CTE_OPTIMIZATION = "complexity_score_after_cte_optimization"
     COMPLEXITY_SCORE_AFTER_LARGE_QUERY_BREAKDOWN = (
         "complexity_score_after_large_query_breakdown"
     )
     COMPLEXITY_SCORE_BEFORE_COMPILATION = "complexity_score_before_compilation"
     COMPLEXITY_SCORE_BOUNDS = "complexity_score_bounds"
     NUM_PARTITIONS_MADE = "num_partitions_made"
+    NUM_PIPELINE_BREAKER_USED = "num_pipeline_breaker_used"
+    NUM_RELAXED_BREAKER_USED = "num_relaxed_breaker_used"
+    FAILED_PARTITION_SUMMARY = "failed_partition_summary"


-class InvalidNodesInBreakdownCategory(Enum):
+class NodeBreakdownCategory(Enum):
     SCORE_BELOW_LOWER_BOUND = "num_nodes_below_lower_bound"
     SCORE_ABOVE_UPPER_BOUND = "num_nodes_above_upper_bound"
     NON_PIPELINE_BREAKER = "num_non_pipeline_breaker_nodes"
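Put together, one entry of the breakdown_summary list now nests the per-category rejection counts under failed_partition_summary when no valid node is found. A sketch of such an entry, using the key strings above with counts borrowed from the test expectations later in this commit:

failed_entry = {
    "num_partitions_made": 0,
    "num_pipeline_breaker_used": 0,
    "num_relaxed_breaker_used": 0,
    "failed_partition_summary": {
        "num_nodes_below_lower_bound": 28,
        "num_nodes_above_upper_bound": 1,
        "num_non_pipeline_breaker_nodes": 0,
        "num_external_cte_ref_nodes": 6,
        "num_valid_nodes": 0,
        "num_valid_nodes_relaxed": 0,
    },
}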

src/snowflake/snowpark/_internal/telemetry.py (2 additions, 5 deletions)

@@ -87,9 +87,6 @@ class TelemetryField(Enum):
     NUM_TEMP_TABLES_CLEANED = "num_temp_tables_cleaned"
     NUM_TEMP_TABLES_CREATED = "num_temp_tables_created"
     TEMP_TABLE_CLEANER_ENABLED = "temp_table_cleaner_enabled"
-    TYPE_TEMP_TABLE_CLEANUP_ABNORMAL_EXCEPTION = (
-        "snowpark_temp_table_cleanup_abnormal_exception"
-    )
     TEMP_TABLE_CLEANUP_ABNORMAL_EXCEPTION_TABLE_NAME = (
         "temp_table_cleanup_abnormal_exception_table_name"
     )
@@ -487,7 +484,7 @@ def send_query_compilation_stage_failed_telemetry(
     ) -> None:
         message = {
             **self._create_basic_telemetry_data(
-                CompilationStageTelemetryField.TYPE_COMPILATION_STAGE_FAILED.value
+                CompilationStageTelemetryField.TYPE_COMPILATION_STAGE_STATISTICS.value
             ),
             TelemetryField.KEY_DATA.value: {
                 TelemetryField.SESSION_ID.value: session_id,
@@ -526,7 +523,7 @@ def send_temp_table_cleanup_abnormal_exception_telemetry(
     ) -> None:
         message = {
             **self._create_basic_telemetry_data(
-                TelemetryField.TYPE_TEMP_TABLE_CLEANUP_ABNORMAL_EXCEPTION.value
+                TelemetryField.TYPE_TEMP_TABLE_CLEANUP.value
            ),
             TelemetryField.KEY_DATA.value: {
                 TelemetryField.SESSION_ID.value: session_id,
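For downstream consumers, the effect of this file's change is that compilation-stage failures no longer arrive under a dedicated event type. A hedged illustration; the message dicts and the "succeeded" field are made up for the example, and only the event-type string is taken from this diff:

compilation_stage_event_type = "snowpark_compilation_stage_statistics"
messages = [
    {"type": compilation_stage_event_type, "data": {"succeeded": True}},
    {"type": compilation_stage_event_type, "data": {"succeeded": False}},
]
# Filtering on the single event type now captures both outcomes.
compilation_events = [m for m in messages if m["type"] == compilation_stage_event_type]
assert len(compilation_events) == 2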

tests/integ/test_large_query_breakdown.py (35 additions, 31 deletions)

@@ -96,7 +96,14 @@ def check_result_with_and_without_breakdown(session, df):
 def check_summary_breakdown_value(patch_send, expected_summary):
     _, kwargs = patch_send.call_args
     summary_value = kwargs["compilation_stage_summary"]
-    assert summary_value["breakdown_failure_summary"] == expected_summary
+    assert summary_value["breakdown_summary"] == expected_summary
+
+
+def check_optimization_skipped_reason(patch_send, expected_reason):
+    summary_value = patch_send.call_args[1]["compilation_stage_summary"]
+    assert (
+        summary_value["query_breakdown_optimization_skipped_reason"] == expected_reason
+    )


 def test_no_pipeline_breaker_nodes(session):
@@ -134,6 +141,8 @@ def test_no_pipeline_breaker_nodes(session):
     expected_summary = [
         {
             "num_partitions_made": 1,
+            "num_pipeline_breaker_used": 0,
+            "num_relaxed_breaker_used": 1,
         }
     ]
     check_summary_breakdown_value(patch_send, expected_summary)
@@ -174,13 +183,17 @@ def test_large_query_breakdown_external_cte_ref(session):
     patch_send.assert_called_once()
     expected_summary = [
         {
-            "num_external_cte_ref_nodes": 6 if sql_simplifier_enabled else 2,
-            "num_non_pipeline_breaker_nodes": 0 if sql_simplifier_enabled else 2,
-            "num_nodes_below_lower_bound": 28,
-            "num_nodes_above_upper_bound": 1 if sql_simplifier_enabled else 0,
-            "num_valid_nodes": 0,
-            "num_valid_nodes_relaxed": 0,
+            "failed_partition_summary": {
+                "num_external_cte_ref_nodes": 6 if sql_simplifier_enabled else 2,
+                "num_non_pipeline_breaker_nodes": 0 if sql_simplifier_enabled else 2,
+                "num_nodes_below_lower_bound": 28,
+                "num_nodes_above_upper_bound": 1 if sql_simplifier_enabled else 0,
+                "num_valid_nodes": 0,
+                "num_valid_nodes_relaxed": 0,
+            },
             "num_partitions_made": 0,
+            "num_pipeline_breaker_used": 0,
+            "num_relaxed_breaker_used": 0,
         }
     ]
     check_summary_breakdown_value(patch_send, expected_summary)
@@ -213,14 +226,12 @@ def test_breakdown_at_with_query_node(session):

 def test_large_query_breakdown_with_cte_optimization(session):
     """Test large query breakdown works with cte optimized plan"""
-    if not session.cte_optimization_enabled:
-        pytest.skip("CTE optimization is not enabled")
+    session._cte_optimization_enabled = True

     if not session.sql_simplifier_enabled:
         # the complexity bounds are updated since nested selected calculation is not supported
         # when sql simplifier disabled
         set_bounds(session, 60, 90)
-        session._cte_optimization_enabled = True
     df0 = session.sql("select 2 as b, 32 as c")
     df1 = session.sql("select 1 as a, 2 as b").filter(col("a") == 1)
     df1 = df1.join(df0, on=["b"], how="inner")
@@ -231,7 +242,7 @@ def test_large_query_breakdown_with_cte_optimization(session):
         df2 = df2.with_column("a", col("a") + i + col("a"))
         df3 = df3.with_column("b", col("b") + i + col("b"))

-    df2 = df2.group_by("a").agg(sum_distinct(col("b")).alias("b"))
+    df2 = df2.select("b", "a")
    df3 = df3.group_by("b").agg(sum_distinct(col("a")).alias("a"))

     df4 = df2.union_all(df3).filter(col("a") > 2).with_column("a", col("a") + 1)
@@ -256,14 +267,15 @@ def test_large_query_breakdown_with_cte_optimization(session):
     assert len(queries["post_actions"]) == 1
     assert queries["post_actions"][0].startswith("DROP TABLE If EXISTS")

-    patch_send.assert_called_once()
-    _, kwargs = patch_send.call_args
-    summary_value = kwargs["compilation_stage_summary"]
-    assert summary_value["breakdown_failure_summary"] == [
+    expected_summary = [
         {
             "num_partitions_made": 1,
+            "num_pipeline_breaker_used": 1,
+            "num_relaxed_breaker_used": 0,
         }
     ]
+    check_summary_breakdown_value(patch_send, expected_summary)
+    patch_send.assert_called_once()


 def test_save_as_table(session, large_query_df):
@@ -547,10 +559,7 @@ def test_optimization_skipped_with_transaction(session, large_query_df, caplog):
     ) as patch_send:
         large_query_df.collect()

-        summary_value = patch_send.call_args[1]["compilation_stage_summary"]
-        assert summary_value["snowpark_large_query_breakdown_optimization_skipped"] == {
-            "active transaction": 1,
-        }
+        check_optimization_skipped_reason(patch_send, {"active transaction": 1})

     assert len(history.queries) == 2, history.queries
     assert history.queries[0].sql_text == "SELECT CURRENT_TRANSACTION()"
@@ -582,10 +591,9 @@ def test_optimization_skipped_with_views_and_dynamic_tables(session, caplog):
                 "Skipping large query breakdown optimization for view/dynamic table plan"
                 in caplog.text
             )
-            summary_value = patch_send.call_args[1]["compilation_stage_summary"]
-            assert summary_value["snowpark_large_query_breakdown_optimization_skipped"] == {
-                "view or dynamic table command": 1,
-            }
+            check_optimization_skipped_reason(
+                patch_send, {"view or dynamic table command": 1}
+            )

         with caplog.at_level(logging.DEBUG):
             with patch.object(
@@ -598,10 +606,9 @@ def test_optimization_skipped_with_views_and_dynamic_tables(session, caplog):
                 in caplog.text
             )
             patch_send.assert_called_once()
-            summary_value = patch_send.call_args[1]["compilation_stage_summary"]
-            assert summary_value["snowpark_large_query_breakdown_optimization_skipped"] == {
-                "view or dynamic table command": 1,
-            }
+            check_optimization_skipped_reason(
+                patch_send, {"view or dynamic table command": 1}
+            )
     finally:
         Utils.drop_dynamic_table(session, table_name)
         Utils.drop_view(session, view_name)
@@ -656,10 +663,7 @@ def test_optimization_skipped_with_no_active_db_or_schema(
                 in caplog.text
             )
             patch_send.assert_called_once()
-            summary_value = patch_send.call_args[1]["compilation_stage_summary"]
-            assert summary_value["snowpark_large_query_breakdown_optimization_skipped"] == {
-                f"no active {db_or_schema}": 1,
-            }
+            check_optimization_skipped_reason(patch_send, {f"no active {db_or_schema}": 1})


 def test_async_job_with_large_query_breakdown(large_query_df):
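A minimal, self-contained illustration of what the two assertion helpers at the top of this test file inspect, assuming a patch_send mock that captured a call carrying the compilation_stage_summary keyword; everything except the dict keys shown in this diff is illustrative:

from unittest.mock import MagicMock

patch_send = MagicMock()
patch_send(
    compilation_stage_summary={
        "breakdown_summary": [
            {
                "num_partitions_made": 1,
                "num_pipeline_breaker_used": 1,
                "num_relaxed_breaker_used": 0,
            }
        ],
        "query_breakdown_optimization_skipped_reason": {},
    }
)

# Mirrors check_summary_breakdown_value / check_optimization_skipped_reason.
summary_value = patch_send.call_args[1]["compilation_stage_summary"]
assert summary_value["breakdown_summary"][0]["num_partitions_made"] == 1
assert summary_value["query_breakdown_optimization_skipped_reason"] == {}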
