Skip to content

Commit a05ffa2

Browse files
Genesis929 and tswast authored
chore: unimplemented API tracking (#1269)
* chore: unimplemented API tracking * fix * fix * fix * update docstring * update docstring * update tests * update skip condition * update skip condition * update skip condition * update error * update logic, remove args for missing method * update tests * Update bigframes/core/log_adapter.py --------- Co-authored-by: Tim Sweña (Swast) <[email protected]>
1 parent a687050 commit a05ffa2

File tree

4 files changed

+193
-2
lines changed

4 files changed

+193
-2
lines changed

bigframes/core/log_adapter.py

Lines changed: 93 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,21 @@
1313
# limitations under the License.
1414

1515
import functools
16+
import inspect
1617
import threading
1718
from typing import List
1819

20+
from google.cloud import bigquery
21+
import pandas
22+
1923
_lock = threading.Lock()
2024

2125
# The limit is 64 (https://cloud.google.com/bigquery/docs/labels-intro#requirements),
2226
# but leave a few spare for internal labels to be added.
2327
# See internal issue 386825477.
2428
MAX_LABELS_COUNT = 64 - 8
29+
PANDAS_API_TRACKING_TASK = "pandas_api_tracking"
30+
PANDAS_PARAM_TRACKING_TASK = "pandas_param_tracking"
2531

2632
_api_methods: List = []
2733
_excluded_methods = ["__setattr__", "__getattr__"]
@@ -30,6 +36,75 @@
3036
_call_stack: List = []
3137

3238

39+
def submit_pandas_labels(
    bq_client: bigquery.Client,
    class_name: str,
    method_name: str,
    args=(),
    kwargs=None,
    task: str = PANDAS_API_TRACKING_TASK,
):
    """
    Submits usage of API to BigQuery using a simulated failed query.

    This function is designed to capture and log details about the usage of pandas methods,
    including class and method names, the count of positional arguments, and any keyword
    arguments that match the method's signature. To avoid incurring costs, it simulates a
    query execution using a query with syntax errors.

    Nothing is submitted when ``class_name`` is not an attribute of the ``pandas``
    module, when ``method_name`` is not an attribute of that pandas class, or when
    ``task`` is ``PANDAS_PARAM_TRACKING_TASK`` and none of the supplied keyword
    arguments is a parameter of the pandas method.

    Args:
        bq_client (bigquery.Client): The client used to interact with BigQuery.
        class_name (str): The name of the pandas class being used.
        method_name (str): The name of the method being invoked.
        args (tuple): The positional arguments passed to the method. Only their
            count is recorded.
        kwargs (dict): The keyword arguments passed to the method. Only the names
            that are parameters of the pandas method are recorded; values are
            never logged. ``None`` is treated as "no keyword arguments".
        task (str): The specific task type for the logging event:
            - ``PANDAS_API_TRACKING_TASK``: Indicates that the unimplemented feature
              is a method.
            - ``PANDAS_PARAM_TRACKING_TASK``: Indicates that the unimplemented feature
              is a parameter of a method.
    """
    labels_dict = {
        "task": task,
        "class_name": class_name.lower(),
        "method_name": method_name.lower(),
        "args_count": len(args),
    }

    # A private sentinel distinguishes "attribute missing" from an attribute
    # whose value happens to be None.
    missing = object()

    pandas_cls = getattr(pandas, class_name, missing)
    if pandas_cls is missing:
        return

    method = getattr(pandas_cls, method_name, missing)
    if method is missing:
        return

    # Number of keyword-argument names recorded as "kwargs_<n>" labels.
    tracked_kwargs = 0
    if kwargs:
        # Record only the keyword arguments that are parameters of the pandas
        # method, and stop once the maximum label count has been reached.
        try:
            pandas_params = inspect.signature(method).parameters
        except (TypeError, ValueError):
            # The pandas attribute is not introspectable (for example a property
            # or a callable without signature metadata). Usage tracking is
            # best-effort, so treat it as having no known parameters.
            pandas_params = {}

        for key in kwargs:
            if len(labels_dict) >= MAX_LABELS_COUNT:
                break
            if key in pandas_params:
                labels_dict[f"kwargs_{tracked_kwargs}"] = key.lower()
                tracked_kwargs += 1

    # If this log is for tracking unimplemented parameters and no keyword argument
    # matching a pandas parameter was recorded, skip logging.
    if task == PANDAS_PARAM_TRACKING_TASK and tracked_kwargs == 0:
        return

    # Run a query with syntax error to avoid cost.
    query = "SELECT COUNT(x FROM data_table—"
    job_config = bigquery.QueryJobConfig(labels=labels_dict)
    bq_client.query(query, job_config=job_config)
106+
107+
33108
def class_logger(decorated_cls):
34109
"""Decorator that adds logging functionality to each method of the class."""
35110
for attr_name, attr_value in decorated_cls.__dict__.items():
@@ -46,7 +121,7 @@ def method_logger(method, decorated_cls):
46121
"""Decorator that adds logging functionality to a method."""
47122

48123
@functools.wraps(method)
49-
def wrapper(*args, **kwargs):
124+
def wrapper(self, *args, **kwargs):
50125
class_name = decorated_cls.__name__ # Access decorated class name
51126
api_method_name = str(method.__name__)
52127
full_method_name = f"{class_name.lower()}-{api_method_name}"
@@ -58,7 +133,23 @@ def wrapper(*args, **kwargs):
58133
_call_stack.append(full_method_name)
59134

60135
try:
61-
return method(*args, **kwargs)
136+
return method(self, *args, **kwargs)
137+
except (NotImplementedError, TypeError) as e:
138+
# Log method parameters that are implemented in pandas but either missing (TypeError)
139+
# or not fully supported (NotImplementedError) in BigFrames.
140+
# Logging is currently supported only when we can access the bqclient through
141+
# self._block.expr.session.bqclient. Also, to avoid generating multiple queries
142+
# because of internal calls, we log only when the method is directly invoked.
143+
if hasattr(self, "_block") and len(_call_stack) == 1:
144+
submit_pandas_labels(
145+
self._block.expr.session.bqclient,
146+
class_name,
147+
api_method_name,
148+
args,
149+
kwargs,
150+
task=PANDAS_PARAM_TRACKING_TASK,
151+
)
152+
raise e
62153
finally:
63154
_call_stack.pop()
64155

bigframes/dataframe.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -645,6 +645,9 @@ def __getattr__(self, key: str):
645645
return self.__getitem__(key)
646646

647647
if hasattr(pandas.DataFrame, key):
648+
log_adapter.submit_pandas_labels(
649+
self._block.expr.session.bqclient, self.__class__.__name__, key
650+
)
648651
raise AttributeError(
649652
textwrap.dedent(
650653
f"""

bigframes/series.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1299,6 +1299,9 @@ def __getattr__(self, key: str):
12991299
if key == "_block":
13001300
raise AttributeError(key)
13011301
elif hasattr(pandas.Series, key):
1302+
log_adapter.submit_pandas_labels(
1303+
self._block.expr.session.bqclient, self.__class__.__name__, key
1304+
)
13021305
raise AttributeError(
13031306
textwrap.dedent(
13041307
f"""

tests/unit/core/test_log_adapter.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
from unittest import mock
16+
17+
from google.cloud import bigquery
1518
import pytest
1619

1720
from bigframes.core import log_adapter
@@ -22,6 +25,12 @@
2225
MAX_LABELS_COUNT = 56
2326

2427

28+
@pytest.fixture
def mock_bqclient():
    """Provide an autospec'd stand-in for ``bigquery.Client``."""
    return mock.create_autospec(spec=bigquery.Client)
32+
33+
2534
@pytest.fixture
2635
def test_instance():
2736
# Create a simple class for testing
@@ -61,3 +70,88 @@ def test_get_and_reset_api_methods(test_instance):
6170
previous_methods = log_adapter.get_and_reset_api_methods()
6271
assert previous_methods is not None
6372
assert log_adapter._api_methods == []
73+
74+
75+
@pytest.mark.parametrize(
    ("class_name", "method_name", "args", "kwargs", "task", "expected_labels"),
    (
        # API tracking: positional args are counted and only the kwarg that is
        # a pandas parameter ("rule") is recorded.
        (
            "DataFrame",
            "resample",
            ["a", "b", "c"],
            {"aa": "bb", "rule": "1s"},
            log_adapter.PANDAS_API_TRACKING_TASK,
            {
                "task": log_adapter.PANDAS_API_TRACKING_TASK,
                "class_name": "dataframe",
                "method_name": "resample",
                "args_count": 3,
                "kwargs_0": "rule",
            },
        ),
        # Param tracking with one kwarg that is a pandas parameter.
        (
            "Series",
            "resample",
            [],
            {"aa": "bb", "rule": "1s"},
            log_adapter.PANDAS_PARAM_TRACKING_TASK,
            {
                "task": log_adapter.PANDAS_PARAM_TRACKING_TASK,
                "class_name": "series",
                "method_name": "resample",
                "args_count": 0,
                "kwargs_0": "rule",
            },
        ),
        # API tracking where no kwarg is a pandas parameter: still submitted.
        (
            "DataFrame",
            "resample",
            [],
            {"aa": "bb"},
            log_adapter.PANDAS_API_TRACKING_TASK,
            {
                "task": log_adapter.PANDAS_API_TRACKING_TASK,
                "class_name": "dataframe",
                "method_name": "resample",
                "args_count": 0,
            },
        ),
        # API tracking with no args and no kwargs at all.
        (
            "DataFrame",
            "resample",
            [],
            {},
            log_adapter.PANDAS_API_TRACKING_TASK,
            {
                "task": log_adapter.PANDAS_API_TRACKING_TASK,
                "class_name": "dataframe",
                "method_name": "resample",
                "args_count": 0,
            },
        ),
    ),
)
def test_submit_pandas_labels(
    mock_bqclient, class_name, method_name, args, kwargs, task, expected_labels
):
    """The labels attached to the submitted query job match the expected dict."""
    log_adapter.submit_pandas_labels(
        mock_bqclient, class_name, method_name, args, kwargs, task
    )

    mock_bqclient.query.assert_called_once()

    # Exactly one call was made, so the most recent call is the only one.
    _, submitted_kwargs = mock_bqclient.query.call_args
    submitted_labels = submitted_kwargs["job_config"].labels
    assert submitted_labels == expected_labels
146+
147+
148+
def test_submit_pandas_labels_without_valid_params_for_param_logging(mock_bqclient):
    """Param tracking without any kwargs must not issue a query."""
    param_task = log_adapter.PANDAS_PARAM_TRACKING_TASK

    log_adapter.submit_pandas_labels(
        mock_bqclient, "Series", "resample", task=param_task
    )

    # For param tracking task without kwargs, we won't submit labels
    mock_bqclient.query.assert_not_called()

0 commit comments

Comments
 (0)