Skip to content

Commit cbf2d42

Browse files
authored
fix: reduce redundant remote_function deployments (#856)
* fix: reduce redundant `remote_function` deployments
* do filename override in the naming rather than pickling
* update documentation
* update documentation
1 parent d0ab9cc commit cbf2d42

File tree

4 files changed

+161
-134
lines changed

4 files changed

+161
-134
lines changed

bigframes/functions/remote_function.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,23 @@ def get_remote_function_locations(bq_location):
167167

168168
def _get_hash(def_, package_requirements=None):
169169
"Get hash (32 digits alphanumeric) of a function."
170-
def_repr = cloudpickle.dumps(def_, protocol=_pickle_protocol_version)
170+
# There is a known cell-id sensitivity of the cloudpickle serialization in
171+
# notebooks https://github.com/cloudpipe/cloudpickle/issues/538. Because of
172+
# this, if a cell contains a udf decorated with @remote_function, a unique
173+
# cloudpickle code is generated every time the cell is run, creating new
174+
# cloud artifacts every time. This is slow and wasteful.
175+
# A workaround of the same can be achieved by replacing the filename in the
176+
# code object to a static value
177+
# https://github.com/cloudpipe/cloudpickle/issues/120#issuecomment-338510661.
178+
#
179+
# To respect the user code/environment let's make this modification on a
180+
# copy of the udf, not on the original udf itself.
181+
def_copy = cloudpickle.loads(cloudpickle.dumps(def_))
182+
def_copy.__code__ = def_copy.__code__.replace(
183+
co_filename="bigframes_place_holder_filename"
184+
)
185+
186+
def_repr = cloudpickle.dumps(def_copy, protocol=_pickle_protocol_version)
171187
if package_requirements:
172188
for p in sorted(package_requirements):
173189
def_repr += p.encode()
@@ -877,11 +893,16 @@ def remote_function(
877893
dynamically using the `bigquery_connection_client` assuming the user has necessary
878894
priviliges. The PROJECT_ID should be the same as the BigQuery connection project.
879895
reuse (bool, Optional):
880-
Reuse the remote function if is already exists.
881-
`True` by default, which results in reusing an existing remote
896+
Reuse the remote function if already exists.
897+
`True` by default, which will result in reusing an existing remote
882898
function and corresponding cloud function (if any) that was
883899
previously created for the same udf.
884-
Setting it to `False` forces the creation of a unique remote function.
900+
Please note that for an unnamed (i.e. created without an explicit
901+
`name` argument) remote function, the BigQuery DataFrames
902+
session id is attached in the cloud artifacts names. So for the
903+
effective reuse across the sessions it is recommended to create
904+
the remote function with an explicit `name`.
905+
Setting it to `False` would force creating a unique remote function.
885906
If the required remote function does not exist then it would be
886907
created irrespective of this param.
887908
name (str, Optional):

bigframes/functions/remote_function_template.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -215,23 +215,23 @@ def udf_http_row_processor(request):
215215

216216

217217
def generate_udf_code(def_, directory):
218-
"""Generate serialized bytecode using cloudpickle given a udf."""
218+
"""Generate serialized code using cloudpickle given a udf."""
219219
udf_code_file_name = "udf.py"
220-
udf_bytecode_file_name = "udf.cloudpickle"
220+
udf_pickle_file_name = "udf.cloudpickle"
221221

222222
# original code, only for debugging purpose
223223
udf_code = textwrap.dedent(inspect.getsource(def_))
224224
udf_code_file_path = os.path.join(directory, udf_code_file_name)
225225
with open(udf_code_file_path, "w") as f:
226226
f.write(udf_code)
227227

228-
# serialized bytecode
229-
udf_bytecode_file_path = os.path.join(directory, udf_bytecode_file_name)
228+
# serialized udf
229+
udf_pickle_file_path = os.path.join(directory, udf_pickle_file_name)
230230
# TODO(b/345433300): try io.BytesIO to avoid writing to the file system
231-
with open(udf_bytecode_file_path, "wb") as f:
231+
with open(udf_pickle_file_path, "wb") as f:
232232
cloudpickle.dump(def_, f, protocol=_pickle_protocol_version)
233233

234-
return udf_code_file_name, udf_bytecode_file_name
234+
return udf_code_file_name, udf_pickle_file_name
235235

236236

237237
def generate_cloud_function_main_code(
@@ -252,15 +252,15 @@ def generate_cloud_function_main_code(
252252
"""
253253

254254
# Pickle the udf with all its dependencies
255-
udf_code_file, udf_bytecode_file = generate_udf_code(def_, directory)
255+
udf_code_file, udf_pickle_file = generate_udf_code(def_, directory)
256256

257257
code_blocks = [
258258
f"""\
259259
import cloudpickle
260260
261261
# original udf code is in {udf_code_file}
262-
# serialized udf code is in {udf_bytecode_file}
263-
with open("{udf_bytecode_file}", "rb") as f:
262+
# serialized udf code is in {udf_pickle_file}
263+
with open("{udf_pickle_file}", "rb") as f:
264264
udf = cloudpickle.load(f)
265265
266266
input_types = {repr(input_types)}

bigframes/session/__init__.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1629,15 +1629,21 @@ def remote_function(
16291629
`True` by default, which will result in reusing an existing remote
16301630
function and corresponding cloud function (if any) that was
16311631
previously created for the same udf.
1632+
Please note that for an unnamed (i.e. created without an explicit
1633+
`name` argument) remote function, the BigQuery DataFrames
1634+
session id is attached in the cloud artifacts names. So for the
1635+
effective reuse across the sessions it is recommended to create
1636+
the remote function with an explicit `name`.
16321637
Setting it to `False` would force creating a unique remote function.
16331638
If the required remote function does not exist then it would be
16341639
created irrespective of this param.
16351640
name (str, Optional):
1636-
Explicit name of the persisted BigQuery remote function. Use it with
1637-
caution, because two users working in the same project and dataset
1638-
could overwrite each other's remote functions if they use the same
1639-
persistent name. When an explicit name is provided, any session
1640-
specific clean up (``bigframes.session.Session.close``/
1641+
Explicit name of the persisted BigQuery remote function. Use it
1642+
with caution, because more than one users working in the same
1643+
project and dataset could overwrite each other's remote
1644+
functions if they use the same persistent name. When an explicit
1645+
name is provided, any session specific clean up (
1646+
``bigframes.session.Session.close``/
16411647
``bigframes.pandas.close_session``/
16421648
``bigframes.pandas.reset_session``/
16431649
``bigframes.pandas.clean_up_by_session_id``) does not clean up

0 commit comments

Comments
 (0)