SNOW-2230553 - Reduce Telemetry Overhead when running w/ a Pandas Engine (#3610)

sfc-gh-jkew · graphite-app[bot] · web-flow · commit 42d10859d598 · 2025-08-27T13:04:04.000-07:00
Co-authored-by: graphite-app[bot] <96075541+graphite-app[bot]@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -50,6 +50,7 @@
 #### Improvements
 - Set the default transfer limit in hybrid execution for data leaving Snowflake to 100k, which can be overridden with the SnowflakePandasTransferThreshold environment variable. This configuration is appropriate for scenarios with two available engines, "Pandas" and "Snowflake" on relational workloads.
 - Improve import error message by adding '--upgrade' to 'pip install "snowflake-snowpark-python[modin]"' in the error message.
+- Reduce the telemetry messages from the modin client by pre-aggregating into 5-second windows and keeping only a narrow band of metrics that are useful for tracking hybrid execution and native pandas performance.
 
 #### Dependency Updates
 
diff --git a/src/snowflake/snowpark/modin/config/__init__.py b/src/snowflake/snowpark/modin/config/__init__.py
@@ -55,6 +55,8 @@
     RayRedisPassword,
     ReadSqlEngine,
     SnowflakePandasTransferThreshold,
+    SnowflakeModinTelemetryFlushInterval,
+    SnowflakeModinTelemetryEnabled,
     StorageFormat,
     TestDatasetSize,
     TestReadFromPostgres,
@@ -118,4 +120,6 @@
     # Plugin settings
     "DocModule",
     "SnowflakePandasTransferThreshold",
+    "SnowflakeModinTelemetryFlushInterval",
+    "SnowflakeModinTelemetryEnabled",
 ]
diff --git a/src/snowflake/snowpark/modin/config/envvars.py b/src/snowflake/snowpark/modin/config/envvars.py
@@ -78,6 +78,34 @@ def get_help(cls) -> str:
         return help
 
 
+class SnowflakeModinTelemetryFlushInterval(EnvironmentVariable, type=int):
+    """
+    Minimum number of seconds between flushes to Snowflake of the
+    telemetry aggregated from metrics emitted by the client modin layer.
+    """
+
+    varname = "SNOWFLAKE_MODIN_TELEMETRY_FLUSH_INTERVAL"
+    default = 5  # seconds
+
+
+modin_config.SnowflakeModinTelemetryFlushInterval = SnowflakeModinTelemetryFlushInterval
+
+
+class SnowflakeModinTelemetryEnabled(EnvironmentVariable, type=bool):
+    """
+    Enable or disable telemetry sent to Snowflake from the modin
+    client. This only covers telemetry sent through modin metrics
+    events, not the Snowpark telemetry generated by lazily evaluated
+    queries on the Snowflake backend.
+    """
+
+    varname = "SNOWFLAKE_MODIN_TELEMETRY_ENABLED"
+    default = True  # telemetry is on unless explicitly disabled
+
+
+modin_config.SnowflakeModinTelemetryEnabled = SnowflakeModinTelemetryEnabled
+
+
 class SnowflakePandasTransferThreshold(EnvironmentVariable, type=int):
     """
     Targeted max number of dataframe rows which should be transferred from
diff --git a/src/snowflake/snowpark/modin/plugin/_internal/__init__.py b/src/snowflake/snowpark/modin/plugin/_internal/__init__.py
@@ -1,3 +1,7 @@
 #
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #
+from snowflake.snowpark.modin.plugin._internal.telemetry import ModinTelemetrySender
+
+# exported to allow for mocking of the telemetry in a consistent way
+__all__ = ["ModinTelemetrySender"]
diff --git a/src/snowflake/snowpark/modin/plugin/_internal/telemetry.py b/src/snowflake/snowpark/modin/plugin/_internal/telemetry.py
@@ -6,6 +6,7 @@
 import functools
 import inspect
 import re
+import time
 from contextlib import nullcontext
 from enum import Enum, unique
 from typing import Any, Callable, Optional, TypeVar, Union, cast
@@ -15,6 +16,10 @@
 from modin.config import MetricsMode
 from typing_extensions import ParamSpec
 
+from snowflake.snowpark.modin.config.envvars import (
+    SnowflakeModinTelemetryEnabled,
+    SnowflakeModinTelemetryFlushInterval,
+)
 import snowflake.snowpark.session
 from snowflake.connector.telemetry import TelemetryField as PCTelemetryField
 from snowflake.snowpark._internal.telemetry import TelemetryField, safe_telemetry
@@ -62,6 +67,19 @@ class PropertyMethodType(Enum):
     FDEL = "delete"
 
 
+class ModinTelemetrySender:
+    """
+    Indirection around the telemetry client's send() so tests can mock it.
+    """
+
+    @classmethod
+    def _send_telemetry(cls, session: Session, message: dict) -> None:
+        """
+        Forward ``message`` to the telemetry client of ``session``'s connection.
+        """
+        return session._conn._telemetry_client.send(message)
+
+
 @safe_telemetry
 def _send_modin_api_telemetry(
     session: Session, event: str, value: Union[int, float], aggregatable: bool
@@ -94,7 +112,7 @@ def _send_modin_api_telemetry(
         TelemetryField.KEY_DATA.value: data,
         PCTelemetryField.KEY_SOURCE.value: "modin",
     }
-    session._conn._telemetry_client.send(message)
+    ModinTelemetrySender()._send_telemetry(session, message)
 
 
 @safe_telemetry
@@ -146,7 +164,7 @@ def _send_snowpark_pandas_telemetry_helper(
         TelemetryField.KEY_DATA.value: data,
         PCTelemetryField.KEY_SOURCE.value: "SnowparkPandas",
     }
-    session._conn._telemetry_client.send(message)
+    ModinTelemetrySender()._send_telemetry(session, message)
 
 
 def _not_equal_to_default(arg_val: Any, default_val: Any) -> bool:
@@ -644,28 +662,101 @@ def __new__(
         return type.__new__(cls, name, bases, attrs)
 
 
+_modin_event_log: list = []  # buffered [metric, value] pairs awaiting the next flush
+_last_modin_metric_flush: float = 0  # time.time() of the last flush; 0 = never flushed
+_modin_metric_flush_interval = 0
+
+MODIN_SWITCH_DECISION_METRIC_PREFIXES = (
+    "modin.hybrid.merge.decision",
+    "modin.hybrid.auto.decision",
+)
+MODIN_PERFORMANCE_METRIC_PREFIXES = ("modin.query-compiler",)
+
+
+def _check_and_reset_metric_flush_time() -> bool:
+    """
+    Return True and restart the clock when more than the configured flush
+    interval has passed since the last flush; otherwise return False so the
+    caller keeps aggregating metrics.
+    """
+    global _last_modin_metric_flush
+    global _modin_metric_flush_interval  # NOTE(review): never read or assigned in this function
+
+    # Re-read the setting on every call to support a changing flush interval
+    current_flush_interval = SnowflakeModinTelemetryFlushInterval.get()
+    current_time = time.time()
+    if current_time > _last_modin_metric_flush + current_flush_interval:
+        _last_modin_metric_flush = current_time
+        return True
+
+    return False
+
+
+def _flush_modin_metrics() -> None:
+    """
+    Flush the collected modin metrics through the normal telemetry channel.
+    Aggregate all metrics with the same name into simple statistics. Set
+    the aggregatable field to True only for the count statistic.
+
+    This will output metrics of the form:
+      modin.query-compiler.snowflakequerycompiler.value_counts.stat.mean
+      modin.query-compiler.snowflakequerycompiler.value_counts.stat.median
+      modin.query-compiler.snowflakequerycompiler.value_counts.stat.count
+      modin.hybrid.auto.decision.Pandas.stat.count
+      modin.hybrid.auto.decision.Snowflake.stat.mean
+      ...
+    """
+    global _modin_event_log
+    try:
+        summary_stat_names = ["count", "median", "mean"]
+        processing_df = native_pd.DataFrame(
+            _modin_event_log, columns=["metric", "value"]
+        )
+        summary_stats = processing_df.groupby("metric").agg(summary_stat_names)
+        session = snowflake.snowpark.session._get_active_session()
+        for row in summary_stats.iterrows():  # row is (metric name, per-stat values)
+            for stat in summary_stats:  # column labels are ("value", <stat name>) tuples
+                stat_specific_metric = f"{row[0]}.stat.{stat[1]}"
+
+                _send_modin_api_telemetry(
+                    session=session,
+                    event=stat_specific_metric,
+                    value=row[1][stat],
+                    aggregatable=stat == ("value", "count"),
+                )
+    except Exception:
+        pass  # best-effort: a failed flush just drops this window's metrics
+    _modin_event_log = []  # start a new window whether or not the send succeeded
+
+
 def modin_telemetry_watcher(metric_name: str, metric_value: Union[int, float]) -> None:
     """
     Telemetry hook that collects modin telemetry events of interest for
     transmission to Snowflake.
     """
-    useful_metrics = (
-        "modin.hybrid.merge.decision",
-        "modin.pandas-api",
-        "modin.query-compiler",
-        "modin.hybrid.auto.decision",
-    )
-    if metric_name.startswith(useful_metrics):
-        try:
-            session = snowflake.snowpark.session._get_active_session()
-            _send_modin_api_telemetry(
-                session=session,
-                event=metric_name,
-                value=metric_value,
-                aggregatable=False,
-            )
-        except Exception:
-            pass
+    simplified_metric = metric_name
+
+    metric_valid = False
+    # ignore dunder/internal methods, and names too short to carry a method segment
+    if metric_name.startswith(MODIN_PERFORMANCE_METRIC_PREFIXES):
+        parts = metric_name.split(".")
+        if len(parts) < 4 or parts[3].startswith("_"):  # guard: parts[3] may not exist
+            return
+        metric_valid = True
+
+    if metric_name.startswith(MODIN_SWITCH_DECISION_METRIC_PREFIXES):
+        # keep only the first five dotted segments, stripping off the groups
+        simplified_metric = ".".join(metric_name.split(".")[0:5])
+        metric_valid = True
+
+    if not metric_valid:
+        return
+
+    _modin_event_log.append([simplified_metric, metric_value])
+    # We will lose telemetry at the tail end of the process, but
+    # that's OK - this telemetry is meant to be lossy
+    if _check_and_reset_metric_flush_time():
+        _flush_modin_metrics()
 
 
 hybrid_switch_log = native_pd.DataFrame({})
@@ -736,5 +827,6 @@ def hybrid_describe_telemetry_watcher(
 
 def connect_modin_telemetry() -> None:
     MetricsMode.enable()
-    add_metric_handler(modin_telemetry_watcher)
+    if SnowflakeModinTelemetryEnabled.get():  # setting is checked here, at registration time
+        add_metric_handler(modin_telemetry_watcher)
     add_metric_handler(hybrid_describe_telemetry_watcher)
diff --git a/tests/integ/modin/test_telemetry.py b/tests/integ/modin/test_telemetry.py

@@ -1,3 +1,7 @@
 #
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #
+from snowflake.snowpark.modin.plugin._internal.telemetry import ModinTelemetrySender
+
+# exported to allow for mocking of the telemetry in a consistent way
+__all__ = ["ModinTelemetrySender"]