Skip to content

Commit 80d722e

Browse files
SNOW-1805840: Augment telemetry with method_call_count (#2804)
<!--- Please answer these questions before creating your pull request. Thanks! ---> 1. Which Jira issue is this PR addressing? Make sure that there is an accompanying issue to your PR. <!--- In this section, please add a Snowflake Jira issue number. Note that if a corresponding GitHub issue exists, you should still include the Snowflake Jira issue number. For example, for GitHub issue #1400, you should add "SNOW-1335071" here. ---> Fixes SNOW-1805840 2. Fill out the following pre-review checklist: - [x] I am adding a new automated test(s) to verify correctness of my new code - [ ] If this test skips Local Testing mode, I'm requesting review from @snowflakedb/local-testing - [ ] I am adding new logging messages - [ ] I am adding a new telemetry message - [ ] I am adding new credentials - [ ] I am adding a new dependency - [ ] If this is a new feature/behavior, I'm adding the Local Testing parity changes. - [x] I acknowledge that I have ensured my changes to be thread-safe. Follow the link for more information: [Thread-safe Developer Guidelines](https://github.com/snowflakedb/snowpark-python/blob/main/CONTRIBUTING.md#thread-safe-development) 3. Please describe how your code solves the related issue. Adding method_call_count which is the # of times a pandas API method has been called. See more info in the interchange protocol design doc here: https://docs.google.com/document/d/1EfqQwejVbF5_36hnOP-ap0t3NaCWmDz62iAcR0PtX20/edit?tab=t.0#heading=h.4uu48icmuq7z --------- Signed-off-by: Labanya Mukhopadhyay <[email protected]>
1 parent 9ab8318 commit 80d722e

File tree

5 files changed

+141
-2
lines changed

5 files changed

+141
-2
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,9 @@
6262
- Updated integration testing for `session.lineage.trace` to exclude deleted objects
6363
- Added documentation for `DataFrame.map`.
6464
- Improve performance of `DataFrame.apply` by mapping numpy functions to snowpark functions if possible.
65-
- Added documentation on the extent of Snowpark pandas interoperability with scikit-learn
65+
- Added documentation on the extent of Snowpark pandas interoperability with scikit-learn.
6666
- Infer return type of functions in `Series.map`, `Series.apply` and `DataFrame.map` if type-hint is not provided.
67+
- Added `call_count` to telemetry that counts method calls including interchange protocol calls.
6768

6869
## 1.26.0 (2024-12-05)
6970

src/snowflake/snowpark/modin/plugin/_internal/telemetry.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ class SnowparkPandasTelemetryField(Enum):
3737
ARGS = "argument"
3838
# fallback flag
3939
IS_FALLBACK = "is_fallback"
40+
# number of times a method has been called on the same query compiler
41+
CALL_COUNT = "call_count"
4042

4143

4244
# Size to which an argument is truncated after being converted to str. The size can be adjusted later based on analysis and needs.
@@ -59,6 +61,7 @@ def _send_snowpark_pandas_telemetry_helper(
5961
func_name: str,
6062
query_history: Optional[QueryHistory],
6163
api_calls: Union[str, list[dict[str, Any]]],
64+
method_call_count: str,
6265
) -> None:
6366
"""
6467
A helper function that sends Snowpark pandas API telemetry data.
@@ -72,6 +75,7 @@ def _send_snowpark_pandas_telemetry_helper(
7275
query_history: The query history context manager to record queries that are pushed down to the Snowflake
7376
database in the session.
7477
api_calls: Optional list of Snowpark pandas API calls made during the function execution.
78+
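method_call_count: Number of times the method has been called on the same query compiler, or None if no count is available (the call_count field is then omitted from the telemetry data).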
7579
7680
Returns:
7781
None
@@ -80,6 +84,11 @@ def _send_snowpark_pandas_telemetry_helper(
8084
TelemetryField.KEY_FUNC_NAME.value: func_name,
8185
TelemetryField.KEY_CATEGORY.value: SnowparkPandasTelemetryField.FUNC_CATEGORY_SNOWPARK_PANDAS.value,
8286
TelemetryField.KEY_ERROR_MSG.value: error_msg,
87+
**(
88+
{SnowparkPandasTelemetryField.CALL_COUNT.value: method_call_count}
89+
if method_call_count is not None
90+
else {}
91+
),
8392
}
8493
if len(api_calls) > 0:
8594
data[TelemetryField.KEY_API_CALLS.value] = api_calls
@@ -275,6 +284,7 @@ def _telemetry_helper(
275284
# Moving the existing api calls out first avoids generating duplicates.
276285
existing_api_calls = []
277286
need_to_restore_args0_api_calls = False
287+
method_call_count = None
278288

279289
# If the decorated func is a class method or a standalone function, we need to get an active session:
280290
if is_standalone_function or (len(args) > 0 and isinstance(args[0], type)):
@@ -296,6 +306,11 @@ def _telemetry_helper(
296306
need_to_restore_args0_api_calls = True
297307
session = args[0]._query_compiler._modin_frame.ordered_dataframe.session
298308
class_prefix = args[0].__class__.__name__
309+
func_name = _gen_func_name(
310+
class_prefix, func, property_name, property_method_type
311+
)
312+
args[0]._query_compiler._method_call_counts[func_name] += 1
313+
method_call_count = args[0]._query_compiler._method_call_counts[func_name]
299314
except (TypeError, IndexError, AttributeError):
300315
# TypeError: args might not support indexing; IndexError: args is empty; AttributeError: args[0] might not
301316
# have _query_compiler attribute.
@@ -338,6 +353,7 @@ def _telemetry_helper(
338353
func_name=func_name,
339354
query_history=query_history,
340355
api_calls=existing_api_calls + [curr_api_call],
356+
method_call_count=method_call_count,
341357
)
342358
raise e
343359

@@ -372,6 +388,7 @@ def _telemetry_helper(
372388
func_name=func_name,
373389
query_history=query_history,
374390
api_calls=existing_api_calls + [curr_api_call],
391+
method_call_count=method_call_count,
375392
)
376393
if need_to_restore_args0_api_calls:
377394
args[0]._query_compiler.snowpark_pandas_api_calls = existing_api_calls

src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import json
1212
import logging
1313
import re
14+
from collections import Counter
1415
import typing
1516
import uuid
1617
from collections.abc import Hashable, Iterable, Mapping, Sequence
@@ -531,9 +532,11 @@ def __init__(self, frame: InternalFrame) -> None:
531532
), "frame is None or not a InternalFrame"
532533
self._modin_frame = frame
533534
# self.snowpark_pandas_api_calls is a list of lazy Snowpark pandas telemetry api calls
534-
# Copying and modifying self.snowpark_pandas_api_calls is taken care of in telemetry decorators
535+
# Copying and modifying self.snowpark_pandas_api_calls and self._method_call_counts
536+
# is taken care of in telemetry decorators
535537
self.snowpark_pandas_api_calls: list = []
536538
self._attrs: dict[Any, Any] = {}
539+
self._method_call_counts: Counter[str] = Counter[str]()
537540

538541
def _raise_not_implemented_error_for_timedelta(
539542
self, frame: InternalFrame = None

tests/integ/modin/test_telemetry.py

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ def test_snowpark_pandas_telemetry_method_decorator(test_table_name):
144144
"sfqids",
145145
"func_name",
146146
"error_msg",
147+
"call_count",
147148
}
148149
assert data["category"] == "snowpark_pandas"
149150
assert data["api_calls"] == df1_expected_api_calls + [
@@ -179,6 +180,7 @@ def test_send_snowpark_pandas_telemetry_helper(send_mock):
179180
func_name="test_send_func",
180181
query_history=None,
181182
api_calls=[],
183+
method_call_count=None,
182184
)
183185
send_mock.assert_called_with(
184186
{
@@ -560,6 +562,119 @@ def test_telemetry_repr():
560562
]
561563

562564

565+
@sql_count_checker(query_count=6, join_count=4)
566+
def test_telemetry_interchange_call_count():
567+
s = pd.DataFrame([1, 2, 3, 4])
568+
t = pd.DataFrame([5])
569+
s.__dataframe__()
570+
s.__dataframe__()
571+
t.__dataframe__()
572+
573+
s.iloc[0, 0] = 7
574+
s.__dataframe__()
575+
s.__dataframe__()
576+
t.__dataframe__()
577+
578+
def _get_data(call):
579+
try:
580+
return call.to_dict()["message"][TelemetryField.KEY_DATA.value]
581+
except Exception:
582+
return None
583+
584+
telemetry_data = [
585+
_get_data(call)
586+
for call in pd.session._conn._telemetry_client.telemetry._log_batch
587+
if _get_data(call) is not None
588+
and "func_name" in _get_data(call)
589+
and _get_data(call)["func_name"] == "DataFrame.__dataframe__"
590+
]
591+
assert len(telemetry_data) == 6
592+
# s calls __dataframe__() for the first time.
593+
assert telemetry_data[0]["call_count"] == 1
594+
# s calls __dataframe__() for the second time.
595+
assert telemetry_data[1]["call_count"] == 2
596+
# t calls __dataframe__() for the first time.
597+
assert telemetry_data[2]["call_count"] == 1
598+
# the new version of s calls __dataframe__() for the first time.
599+
assert telemetry_data[3]["call_count"] == 1
600+
# the new version of s calls __dataframe__() for the second time.
601+
assert telemetry_data[4]["call_count"] == 2
602+
# t calls __dataframe__() for the second time.
603+
assert telemetry_data[5]["call_count"] == 2
604+
605+
606+
@sql_count_checker(query_count=4)
607+
def test_telemetry_func_call_count():
608+
s = pd.DataFrame([1, 2, np.nan, 4])
609+
t = pd.DataFrame([5])
610+
611+
s.__repr__()
612+
s.__repr__()
613+
s.__repr__()
614+
615+
t.__repr__()
616+
617+
def _get_data(call):
618+
try:
619+
return call.to_dict()["message"][TelemetryField.KEY_DATA.value]
620+
except Exception:
621+
return None
622+
623+
telemetry_data = [
624+
_get_data(call)
625+
for call in pd.session._conn._telemetry_client.telemetry._log_batch
626+
if _get_data(call) is not None
627+
and "func_name" in _get_data(call)
628+
and _get_data(call)["func_name"] == "DataFrame.__repr__"
629+
]
630+
631+
# second to last call from telemetry data
632+
# s called __repr__() 3 times.
633+
assert telemetry_data[-2]["call_count"] == 3
634+
635+
# last call from telemetry data
636+
# t called __repr__() 1 time.
637+
assert telemetry_data[-1]["call_count"] == 1
638+
639+
640+
@sql_count_checker(query_count=3)
641+
def test_telemetry_multiple_func_call_count():
642+
s = pd.DataFrame([1, 2, np.nan, 4])
643+
644+
s.__repr__()
645+
s.__repr__()
646+
s.__dataframe__()
647+
648+
def _get_data(call):
649+
try:
650+
return call.to_dict()["message"][TelemetryField.KEY_DATA.value]
651+
except Exception:
652+
return None
653+
654+
repr_telemetry_data = [
655+
_get_data(call)
656+
for call in pd.session._conn._telemetry_client.telemetry._log_batch
657+
if _get_data(call) is not None
658+
and "func_name" in _get_data(call)
659+
and _get_data(call)["func_name"] == "DataFrame.__repr__"
660+
]
661+
dataframe_telemetry_data = [
662+
_get_data(call)
663+
for call in pd.session._conn._telemetry_client.telemetry._log_batch
664+
if _get_data(call) is not None
665+
and "func_name" in _get_data(call)
666+
and _get_data(call)["func_name"] == "DataFrame.__dataframe__"
667+
]
668+
669+
# last call from telemetry data
670+
# s called __repr__() 2 times.
671+
assert repr_telemetry_data[-1]["call_count"] == 2
672+
673+
# last call from telemetry data
674+
# s called __dataframe__() 1 time.
675+
assert dataframe_telemetry_data[-1]["call_count"] == 1
676+
677+
563678
@sql_count_checker(query_count=0)
564679
def test_telemetry_copy():
565680
# copy() is defined in upstream Modin's BasePandasDataset class, and not overridden by any

tests/unit/modin/test_telemetry.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ def snowpark_pandas_error_test_helper(
5555
query_history=ANY,
5656
telemetry_type=telemetry_type,
5757
error_msg=error_msg,
58+
method_call_count=ANY,
5859
)
5960

6061

@@ -116,6 +117,7 @@ def raise_real_type_error(_):
116117
query_history=ANY,
117118
telemetry_type="snowpark_pandas_type_error",
118119
error_msg=None,
120+
method_call_count=ANY,
119121
)
120122
assert len(mock_arg2._query_compiler.snowpark_pandas_api_calls) == 0
121123

@@ -134,6 +136,7 @@ def raise_real_type_error(_):
134136
query_history=ANY,
135137
telemetry_type="snowpark_pandas_type_error",
136138
error_msg=None,
139+
method_call_count=ANY,
137140
)
138141

139142

0 commit comments

Comments
 (0)