@@ -224,6 +224,54 @@ def display_single_url(read_url: str, content_type: str):
224
224
for _ , row in df .iterrows ():
225
225
display_single_url (row ["read_url" ], row ["content_type" ])
226
226
227
+ def _resolve_connection (self , connection : Optional [str ] = None ) -> str :
228
+ """Resovle the BigQuery connection.
229
+
230
+ .. note::
231
+ BigFrames Blob is still under experiments. It may not work and
232
+ subject to change in the future.
233
+
234
+ Args:
235
+ connection (str or None, default None): BQ connection used for
236
+ function internet transactions, and the output blob if "dst" is
237
+ str. If None, uses default connection of the session.
238
+
239
+ Returns:
240
+ str: the resolved BigQuery connection string in the format:
241
+ "project.location.connection_id".
242
+
243
+ Raises:
244
+ ValueError: If the connection cannot be resolved to a valid string.
245
+ """
246
+ connection = connection or self ._block .session ._bq_connection
247
+ return clients .resolve_full_bq_connection_name (
248
+ connection ,
249
+ default_project = self ._block .session ._project ,
250
+ default_location = self ._block .session ._location ,
251
+ )
252
+
253
+ def _get_runtime_json_str (
254
+ self , mode : str = "R" , with_metadata : bool = False
255
+ ) -> bigframes .series .Series :
256
+ """Get the runtime and apply the ToJSONSTring transformation.
257
+
258
+ .. note::
259
+ BigFrames Blob is still under experiments. It may not work and
260
+ subject to change in the future.
261
+
262
+ Args:
263
+ mode(str or str, default "R"): the mode for accessing the runtime.
264
+ Default to "R". Possible values are "R" (read-only) and
265
+ "RW" (read-write)
266
+ with_metadata (bool, default False): whether to include metadata
267
+ in the JOSN string. Default to False.
268
+
269
+ Returns:
270
+ str: the runtime object in the JSON string.
271
+ """
272
+ runtime = self ._get_runtime (mode = mode , with_metadata = with_metadata )
273
+ return runtime ._apply_unary_op (ops .ToJSONString ())
274
+
227
275
def image_blur (
228
276
self ,
229
277
ksize : tuple [int , int ],
@@ -246,12 +294,7 @@ def image_blur(
246
294
"""
247
295
import bigframes .blob ._functions as blob_func
248
296
249
- connection = connection or self ._block .session ._bq_connection
250
- connection = clients .resolve_full_bq_connection_name (
251
- connection ,
252
- default_project = self ._block .session ._project ,
253
- default_location = self ._block .session ._location ,
254
- )
297
+ connection = self ._resolve_connection (connection )
255
298
256
299
if isinstance (dst , str ):
257
300
dst = os .path .join (dst , "" )
@@ -268,11 +311,8 @@ def image_blur(
268
311
connection = connection ,
269
312
).udf ()
270
313
271
- src_rt = self ._get_runtime (mode = "R" )
272
- dst_rt = dst .blob ._get_runtime (mode = "RW" )
273
-
274
- src_rt = src_rt ._apply_unary_op (ops .ToJSONString ())
275
- dst_rt = dst_rt ._apply_unary_op (ops .ToJSONString ())
314
+ src_rt = self ._get_runtime_json_str (mode = "R" )
315
+ dst_rt = dst .blob ._get_runtime_json_str (mode = "RW" )
276
316
277
317
df = src_rt .to_frame ().join (dst_rt .to_frame (), how = "outer" )
278
318
df ["ksize_x" ], df ["ksize_y" ] = ksize
@@ -281,3 +321,93 @@ def image_blur(
281
321
res .cache () # to execute the udf
282
322
283
323
return dst
324
+
325
+ def pdf_extract (
326
+ self , * , connection : Optional [str ] = None
327
+ ) -> bigframes .series .Series :
328
+ """Extracts and chunks text from PDF URLs and saves the text as
329
+ arrays of string.
330
+
331
+ .. note::
332
+ BigFrames Blob is still under experiments. It may not work and
333
+ subject to change in the future.
334
+
335
+ Args:
336
+ connection (str or None, default None): BQ connection used for
337
+ function internet transactions, and the output blob if "dst"
338
+ is str. If None, uses default connection of the session.
339
+
340
+ Returns:
341
+ bigframes.series.Series: conatins all text from a pdf file
342
+ """
343
+
344
+ import bigframes .blob ._functions as blob_func
345
+
346
+ connection = self ._resolve_connection (connection )
347
+
348
+ pdf_chunk_udf = blob_func .TransformFunction (
349
+ blob_func .pdf_extract_def ,
350
+ session = self ._block .session ,
351
+ connection = connection ,
352
+ ).udf ()
353
+
354
+ src_rt = self ._get_runtime_json_str (mode = "R" )
355
+ res = src_rt .apply (pdf_chunk_udf )
356
+ return res
357
+
358
+ def pdf_chunk (
359
+ self ,
360
+ * ,
361
+ connection : Optional [str ] = None ,
362
+ chunk_size : int = 1000 ,
363
+ overlap_size : int = 200 ,
364
+ ) -> bigframes .series .Series :
365
+ """Extracts and chunks text from PDF URLs and saves the text as
366
+ arrays of strings.
367
+
368
+ .. note::
369
+ BigFrames Blob is still under experiments. It may not work and
370
+ subject to change in the future.
371
+
372
+ Args:
373
+ connection (str or None, default None): BQ connection used for
374
+ function internet transactions, and the output blob if "dst"
375
+ is str. If None, uses default connection of the session.
376
+ chunk_size (int, default 1000): the desired size of each text chunk
377
+ (number of characters).
378
+ overlap_size (int, default 200): the number of overlapping characters
379
+ between consective chunks. The helps to ensure context is
380
+ perserved across chunk boundaries.
381
+
382
+ Returns:
383
+ bigframe.series.Series of array[str], where each string is a
384
+ chunk of text extracted from PDF.
385
+ """
386
+
387
+ import bigframes .bigquery as bbq
388
+ import bigframes .blob ._functions as blob_func
389
+
390
+ connection = self ._resolve_connection (connection )
391
+
392
+ if chunk_size <= 0 :
393
+ raise ValueError ("chunk_size must be a positive integer." )
394
+ if overlap_size < 0 :
395
+ raise ValueError ("overlap_size must be a non-negative integer." )
396
+ if overlap_size >= chunk_size :
397
+ raise ValueError ("overlap_size must be smaller than chunk_size." )
398
+
399
+ pdf_chunk_udf = blob_func .TransformFunction (
400
+ blob_func .pdf_chunk_def ,
401
+ session = self ._block .session ,
402
+ connection = connection ,
403
+ ).udf ()
404
+
405
+ src_rt = self ._get_runtime_json_str (mode = "R" )
406
+ df = src_rt .to_frame ()
407
+ df ["chunk_size" ] = chunk_size
408
+ df ["overlap_size" ] = overlap_size
409
+
410
+ res = df .apply (pdf_chunk_udf , axis = 1 )
411
+
412
+ res_array = bbq .json_extract_string_array (res )
413
+ return res_array
0 commit comments