Skip to content

Commit 68f1d22

Browse files
authored
fix: Address the packages issue for bigframes function (#1991)
1 parent fad5722 commit 68f1d22

File tree

3 files changed

+171
-8
lines changed

3 files changed

+171
-8
lines changed

bigframes/exceptions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,8 +113,8 @@ class FunctionConflictTypeHintWarning(UserWarning):
113113

114114
class FunctionPackageVersionWarning(PreviewWarning):
115115
"""
116-
Managed UDF package versions for Numpy, Pandas, and Pyarrow may not
117-
precisely match users' local environment or the exact versions specified.
116+
Warns that package versions in remote function or managed function may not
117+
match local or specified versions, which might cause unexpected behavior.
118118
"""
119119

120120

bigframes/functions/_utils.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import google.api_core.exceptions
2626
from google.cloud import bigquery, functions_v2
2727
import numpy
28+
from packaging.requirements import Requirement
2829
import pandas
2930
import pyarrow
3031

@@ -63,6 +64,16 @@ def get_remote_function_locations(bq_location):
6364
return bq_location, cloud_function_region
6465

6566

67+
def _package_existed(package_requirements: list[str], package: str) -> bool:
68+
"""Checks if a package (regardless of version) exists in a given list."""
69+
if not package_requirements:
70+
return False
71+
72+
return Requirement(package).name in {
73+
Requirement(req).name for req in package_requirements
74+
}
75+
76+
6677
def get_updated_package_requirements(
6778
package_requirements=None,
6879
is_row_processor=False,
@@ -96,13 +107,16 @@ def get_updated_package_requirements(
96107
requirements.append(f"pyarrow=={pyarrow.__version__}")
97108
requirements.append(f"numpy=={numpy.__version__}")
98109

99-
# TODO(b/435023957): Fix the issue of potential duplicate package versions
100-
# when `package_requirements` also contains `pandas/pyarrow/numpy`.
101-
if package_requirements:
102-
requirements.extend(package_requirements)
110+
if not requirements:
111+
return package_requirements
112+
113+
if not package_requirements:
114+
package_requirements = []
115+
for package in requirements:
116+
if not _package_existed(package_requirements, package):
117+
package_requirements.append(package)
103118

104-
requirements = sorted(requirements)
105-
return requirements
119+
return sorted(package_requirements)
106120

107121

108122
def clean_up_by_session_id(

tests/unit/functions/test_utils.py

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from unittest.mock import patch
16+
17+
from bigframes.functions._utils import (
18+
_package_existed,
19+
get_updated_package_requirements,
20+
)
21+
22+
23+
def test_get_updated_package_requirements_no_extra_package():
24+
"""Tests with no extra package."""
25+
result = get_updated_package_requirements(capture_references=False)
26+
assert result is None
27+
28+
initial_packages = ["xgboost"]
29+
result = get_updated_package_requirements(
30+
initial_packages, capture_references=False
31+
)
32+
assert result == initial_packages
33+
34+
35+
@patch("bigframes.functions._utils.numpy.__version__", "1.24.4")
36+
@patch("bigframes.functions._utils.pyarrow.__version__", "14.0.1")
37+
@patch("bigframes.functions._utils.pandas.__version__", "2.0.3")
38+
@patch("bigframes.functions._utils.cloudpickle.__version__", "2.2.1")
39+
def test_get_updated_package_requirements_is_row_processor_with_versions():
40+
"""Tests with is_row_processor=True and specific versions."""
41+
expected = [
42+
"cloudpickle==2.2.1",
43+
"numpy==1.24.4",
44+
"pandas==2.0.3",
45+
"pyarrow==14.0.1",
46+
]
47+
result = get_updated_package_requirements(is_row_processor=True)
48+
assert result == expected
49+
50+
51+
@patch("bigframes.functions._utils.warnings.warn")
52+
@patch("bigframes.functions._utils.cloudpickle.__version__", "2.2.1")
53+
def test_get_updated_package_requirements_ignore_version(mock_warn):
54+
"""
55+
Tests with is_row_processor=True and ignore_package_version=True.
56+
Should add packages without versions and raise a warning.
57+
"""
58+
expected = ["cloudpickle==2.2.1", "numpy", "pandas", "pyarrow"]
59+
result = get_updated_package_requirements(
60+
is_row_processor=True, ignore_package_version=True
61+
)
62+
assert result == expected
63+
# Verify that a warning was issued.
64+
mock_warn.assert_called_once()
65+
66+
67+
@patch("bigframes.functions._utils.numpy.__version__", "1.24.4")
68+
@patch("bigframes.functions._utils.pyarrow.__version__", "14.0.1")
69+
@patch("bigframes.functions._utils.pandas.__version__", "2.0.3")
70+
def test_get_updated_package_requirements_capture_references_false():
71+
"""
72+
Tests with capture_references=False.
73+
Should not add cloudpickle but should add others if requested.
74+
"""
75+
# Case 1: Only capture_references=False.
76+
result_1 = get_updated_package_requirements(capture_references=False)
77+
assert result_1 is None
78+
79+
# Case 2: capture_references=False but is_row_processor=True.
80+
expected_2 = ["numpy==1.24.4", "pandas==2.0.3", "pyarrow==14.0.1"]
81+
result_2 = get_updated_package_requirements(
82+
is_row_processor=True, capture_references=False
83+
)
84+
assert result_2 == expected_2
85+
86+
87+
@patch("bigframes.functions._utils.numpy.__version__", "1.24.4")
88+
@patch("bigframes.functions._utils.pyarrow.__version__", "14.0.1")
89+
@patch("bigframes.functions._utils.pandas.__version__", "2.0.3")
90+
@patch("bigframes.functions._utils.cloudpickle.__version__", "2.2.1")
91+
def test_get_updated_package_requirements_non_overlapping_packages():
92+
"""Tests providing an initial list of packages that do not overlap."""
93+
initial_packages = ["scikit-learn==1.3.0", "xgboost"]
94+
expected = [
95+
"cloudpickle==2.2.1",
96+
"numpy==1.24.4",
97+
"pandas==2.0.3",
98+
"pyarrow==14.0.1",
99+
"scikit-learn==1.3.0",
100+
"xgboost",
101+
]
102+
result = get_updated_package_requirements(
103+
package_requirements=initial_packages, is_row_processor=True
104+
)
105+
assert result == expected
106+
107+
108+
@patch("bigframes.functions._utils.numpy.__version__", "1.24.4")
109+
@patch("bigframes.functions._utils.pyarrow.__version__", "14.0.1")
110+
@patch("bigframes.functions._utils.pandas.__version__", "2.0.3")
111+
@patch("bigframes.functions._utils.cloudpickle.__version__", "2.2.1")
112+
def test_get_updated_package_requirements_overlapping_packages():
113+
"""Tests that packages are not added if they already exist."""
114+
# The function should respect the pre-existing pandas version.
115+
initial_packages = ["pandas==1.5.3", "numpy"]
116+
expected = [
117+
"cloudpickle==2.2.1",
118+
"numpy",
119+
"pandas==1.5.3",
120+
"pyarrow==14.0.1",
121+
]
122+
result = get_updated_package_requirements(
123+
package_requirements=initial_packages, is_row_processor=True
124+
)
125+
assert result == expected
126+
127+
128+
@patch("bigframes.functions._utils.cloudpickle.__version__", "2.2.1")
129+
def test_get_updated_package_requirements_with_existing_cloudpickle():
130+
"""Tests that cloudpickle is not added if it already exists."""
131+
initial_packages = ["cloudpickle==2.0.0"]
132+
expected = ["cloudpickle==2.0.0"]
133+
result = get_updated_package_requirements(package_requirements=initial_packages)
134+
assert result == expected
135+
136+
137+
def test_package_existed_helper():
138+
"""Tests the _package_existed helper function directly."""
139+
reqs = ["pandas==1.0", "numpy", "scikit-learn>=1.2.0"]
140+
# Exact match
141+
assert _package_existed(reqs, "pandas==1.0")
142+
# Different version
143+
assert _package_existed(reqs, "pandas==2.0")
144+
# No version specified
145+
assert _package_existed(reqs, "numpy")
146+
# Not in list
147+
assert not _package_existed(reqs, "xgboost")
148+
# Empty list
149+
assert not _package_existed([], "pandas")

0 commit comments

Comments
 (0)