Skip to content

Commit 0bd5e1b

Browse files
authored
fix: Correct pypdf dependency specifier for remote PDF functions (#1980)
* fix: Correct pypdf dependency specifier for remote PDF functions * specfy a version for pypdf as well * testcase change * specify a version for cryptography
1 parent 3446950 commit 0bd5e1b

File tree

2 files changed

+68
-67
lines changed

2 files changed

+68
-67
lines changed

bigframes/blob/_functions.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -473,7 +473,9 @@ def pdf_extract_func(src_obj_ref_rt: str) -> str:
473473
return result_json
474474

475475

476-
pdf_extract_def = FunctionDef(pdf_extract_func, ["pypdf", "requests", "pypdf[crypto]"])
476+
pdf_extract_def = FunctionDef(
477+
pdf_extract_func, ["pypdf>=5.3.1,<6.0.0", "requests", "cryptography==43.0.3"]
478+
)
477479

478480

479481
# Extracts text from a PDF url and chunks it simultaneously
@@ -527,4 +529,6 @@ def pdf_chunk_func(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> s
527529
return result_json
528530

529531

530-
pdf_chunk_def = FunctionDef(pdf_chunk_func, ["pypdf", "requests", "pypdf[crypto]"])
532+
pdf_chunk_def = FunctionDef(
533+
pdf_chunk_func, ["pypdf>=5.3.1,<6.0.0", "requests", "cryptography==43.0.3"]
534+
)

tests/system/large/blob/test_function.py

Lines changed: 62 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -302,37 +302,16 @@ def test_blob_image_normalize_to_bq(images_mm_df: bpd.DataFrame, bq_connection:
302302

303303

304304
@pytest.mark.parametrize(
305-
"verbose, expected",
305+
"verbose",
306306
[
307-
(
308-
True,
309-
pd.Series(
310-
[
311-
{"status": "File has not been decrypted", "content": ""},
312-
{
313-
"status": "",
314-
"content": "Sample PDF This is a testing file. Some dummy messages are used for testing purposes. ",
315-
},
316-
]
317-
),
318-
),
319-
(
320-
False,
321-
pd.Series(
322-
[
323-
"",
324-
"Sample PDF This is a testing file. Some dummy messages are used for testing purposes. ",
325-
],
326-
name="pdf",
327-
),
328-
),
307+
(True),
308+
(False),
329309
],
330310
)
331311
def test_blob_pdf_extract(
332312
pdf_mm_df: bpd.DataFrame,
333313
verbose: bool,
334314
bq_connection: str,
335-
expected: pd.Series,
336315
):
337316
actual = (
338317
pdf_mm_df["pdf"]
@@ -341,49 +320,44 @@ def test_blob_pdf_extract(
341320
.to_pandas()
342321
)
343322

344-
pd.testing.assert_series_equal(
345-
actual,
346-
expected,
347-
check_dtype=False,
348-
check_index=False,
323+
# check relative length
324+
expected_text = "Sample PDF This is a testing file. Some dummy messages are used for testing purposes."
325+
expected_len = len(expected_text)
326+
327+
actual_text = ""
328+
if verbose:
329+
# The first entry is for a file that doesn't exist, so we check the second one
330+
successful_results = actual[actual.apply(lambda x: x["status"] == "")]
331+
actual_text = successful_results.apply(lambda x: x["content"]).iloc[0]
332+
else:
333+
actual_text = actual[actual != ""].iloc[0]
334+
actual_len = len(actual_text)
335+
336+
relative_length_tolerance = 0.25
337+
min_acceptable_len = expected_len * (1 - relative_length_tolerance)
338+
max_acceptable_len = expected_len * (1 + relative_length_tolerance)
339+
assert min_acceptable_len <= actual_len <= max_acceptable_len, (
340+
f"Item (verbose={verbose}): Extracted text length {actual_len} is outside the acceptable range "
341+
f"[{min_acceptable_len:.0f}, {max_acceptable_len:.0f}]. "
342+
f"Expected reference length was {expected_len}. "
349343
)
350344

345+
# check for major keywords
346+
major_keywords = ["Sample", "PDF", "testing", "dummy", "messages"]
347+
for keyword in major_keywords:
348+
assert (
349+
keyword.lower() in actual_text.lower()
350+
), f"Item (verbose={verbose}): Expected keyword '{keyword}' not found in extracted text. "
351+
351352

352353
@pytest.mark.parametrize(
353-
"verbose, expected",
354+
"verbose",
354355
[
355-
(
356-
True,
357-
pd.Series(
358-
[
359-
{"status": "File has not been decrypted", "content": []},
360-
{
361-
"status": "",
362-
"content": [
363-
"Sample PDF This is a testing file. Some ",
364-
"dummy messages are used for testing ",
365-
"purposes. ",
366-
],
367-
},
368-
]
369-
),
370-
),
371-
(
372-
False,
373-
pd.Series(
374-
[
375-
pd.NA,
376-
"Sample PDF This is a testing file. Some ",
377-
"dummy messages are used for testing ",
378-
"purposes. ",
379-
],
380-
),
381-
),
356+
(True),
357+
(False),
382358
],
383359
)
384-
def test_blob_pdf_chunk(
385-
pdf_mm_df: bpd.DataFrame, verbose: bool, bq_connection: str, expected: pd.Series
386-
):
360+
def test_blob_pdf_chunk(pdf_mm_df: bpd.DataFrame, verbose: bool, bq_connection: str):
387361
actual = (
388362
pdf_mm_df["pdf"]
389363
.blob.pdf_chunk(
@@ -397,13 +371,36 @@ def test_blob_pdf_chunk(
397371
.to_pandas()
398372
)
399373

400-
pd.testing.assert_series_equal(
401-
actual,
402-
expected,
403-
check_dtype=False,
404-
check_index=False,
374+
# check relative length
375+
expected_text = "Sample PDF This is a testing file. Some dummy messages are used for testing purposes."
376+
expected_len = len(expected_text)
377+
378+
actual_text = ""
379+
if verbose:
380+
# The first entry is for a file that doesn't exist, so we check the second one
381+
successful_results = actual[actual.apply(lambda x: x["status"] == "")]
382+
actual_text = "".join(successful_results.apply(lambda x: x["content"]).iloc[0])
383+
else:
384+
# First entry is NA
385+
actual_text = "".join(actual.dropna())
386+
actual_len = len(actual_text)
387+
388+
relative_length_tolerance = 0.25
389+
min_acceptable_len = expected_len * (1 - relative_length_tolerance)
390+
max_acceptable_len = expected_len * (1 + relative_length_tolerance)
391+
assert min_acceptable_len <= actual_len <= max_acceptable_len, (
392+
f"Item (verbose={verbose}): Extracted text length {actual_len} is outside the acceptable range "
393+
f"[{min_acceptable_len:.0f}, {max_acceptable_len:.0f}]. "
394+
f"Expected reference length was {expected_len}. "
405395
)
406396

397+
# check for major keywords
398+
major_keywords = ["Sample", "PDF", "testing", "dummy", "messages"]
399+
for keyword in major_keywords:
400+
assert (
401+
keyword.lower() in actual_text.lower()
402+
), f"Item (verbose={verbose}): Expected keyword '{keyword}' not found in extracted text. "
403+
407404

408405
@pytest.mark.parametrize(
409406
"model_name, verbose",

0 commit comments

Comments
 (0)